sched_ule.c revision 127850
1109864Sjeff/*- 2113357Sjeff * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3109864Sjeff * All rights reserved. 4109864Sjeff * 5109864Sjeff * Redistribution and use in source and binary forms, with or without 6109864Sjeff * modification, are permitted provided that the following conditions 7109864Sjeff * are met: 8109864Sjeff * 1. Redistributions of source code must retain the above copyright 9109864Sjeff * notice unmodified, this list of conditions, and the following 10109864Sjeff * disclaimer. 11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright 12109864Sjeff * notice, this list of conditions and the following disclaimer in the 13109864Sjeff * documentation and/or other materials provided with the distribution. 14109864Sjeff * 15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25109864Sjeff */ 26109864Sjeff 27116182Sobrien#include <sys/cdefs.h> 28116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 127850 2004-04-04 19:12:56Z jeff $"); 29116182Sobrien 30109864Sjeff#include <sys/param.h> 31109864Sjeff#include <sys/systm.h> 32109864Sjeff#include <sys/kernel.h> 33109864Sjeff#include <sys/ktr.h> 34109864Sjeff#include <sys/lock.h> 35109864Sjeff#include <sys/mutex.h> 36109864Sjeff#include <sys/proc.h> 37112966Sjeff#include <sys/resource.h> 38122038Sjeff#include <sys/resourcevar.h> 39109864Sjeff#include <sys/sched.h> 40109864Sjeff#include <sys/smp.h> 41109864Sjeff#include <sys/sx.h> 42109864Sjeff#include <sys/sysctl.h> 43109864Sjeff#include <sys/sysproto.h> 44109864Sjeff#include <sys/vmmeter.h> 45109864Sjeff#ifdef DDB 46109864Sjeff#include <ddb/ddb.h> 47109864Sjeff#endif 48109864Sjeff#ifdef KTRACE 49109864Sjeff#include <sys/uio.h> 50109864Sjeff#include <sys/ktrace.h> 51109864Sjeff#endif 52109864Sjeff 53109864Sjeff#include <machine/cpu.h> 54121790Sjeff#include <machine/smp.h> 55109864Sjeff 56113357Sjeff#define KTR_ULE KTR_NFS 57113357Sjeff 58109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 59109864Sjeff/* XXX This is bogus compatability crap for ps */ 60109864Sjeffstatic fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 61109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 62109864Sjeff 63109864Sjeffstatic void sched_setup(void *dummy); 64109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 65109864Sjeff 66113357Sjeffstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); 67113357Sjeff 68113357Sjeffstatic int slice_min = 1; 69113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 70113357Sjeff 71116365Sjeffstatic int slice_max = 10; 72113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 73113357Sjeff 74111857Sjeffint realstathz; 75113357Sjeffint tickincr = 1; 76111857Sjeff 77116069Sjeff#ifdef SMP 78123487Sjeff/* Callouts to handle load balancing SMP systems. */ 79116069Sjeffstatic struct callout kseq_lb_callout; 80123487Sjeffstatic struct callout kseq_group_callout; 81116069Sjeff#endif 82116069Sjeff 83109864Sjeff/* 84109864Sjeff * These datastructures are allocated within their parent datastructure but 85109864Sjeff * are scheduler specific. 86109864Sjeff */ 87109864Sjeff 88109864Sjeffstruct ke_sched { 89109864Sjeff int ske_slice; 90109864Sjeff struct runq *ske_runq; 91109864Sjeff /* The following variables are only used for pctcpu calculation */ 92109864Sjeff int ske_ltick; /* Last tick that we were running on */ 93109864Sjeff int ske_ftick; /* First tick that we were running on */ 94109864Sjeff int ske_ticks; /* Tick count */ 95113357Sjeff /* CPU that we have affinity for. */ 96110260Sjeff u_char ske_cpu; 97109864Sjeff}; 98109864Sjeff#define ke_slice ke_sched->ske_slice 99109864Sjeff#define ke_runq ke_sched->ske_runq 100109864Sjeff#define ke_ltick ke_sched->ske_ltick 101109864Sjeff#define ke_ftick ke_sched->ske_ftick 102109864Sjeff#define ke_ticks ke_sched->ske_ticks 103110260Sjeff#define ke_cpu ke_sched->ske_cpu 104121790Sjeff#define ke_assign ke_procq.tqe_next 105109864Sjeff 106121790Sjeff#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */ 107122158Sjeff#define KEF_BOUND KEF_SCHED1 /* KSE can not migrate. */ 108121790Sjeff 109109864Sjeffstruct kg_sched { 110110645Sjeff int skg_slptime; /* Number of ticks we vol. slept */ 111110645Sjeff int skg_runtime; /* Number of ticks we were running */ 112109864Sjeff}; 113109864Sjeff#define kg_slptime kg_sched->skg_slptime 114110645Sjeff#define kg_runtime kg_sched->skg_runtime 115109864Sjeff 116109864Sjeffstruct td_sched { 117109864Sjeff int std_slptime; 118109864Sjeff}; 119109864Sjeff#define td_slptime td_sched->std_slptime 120109864Sjeff 121110267Sjeffstruct td_sched td_sched; 122109864Sjeffstruct ke_sched ke_sched; 123109864Sjeffstruct kg_sched kg_sched; 124109864Sjeff 125109864Sjeffstruct ke_sched *kse0_sched = &ke_sched; 126109864Sjeffstruct kg_sched *ksegrp0_sched = &kg_sched; 127109864Sjeffstruct p_sched *proc0_sched = NULL; 128109864Sjeffstruct td_sched *thread0_sched = &td_sched; 129109864Sjeff 130109864Sjeff/* 131116642Sjeff * The priority is primarily determined by the interactivity score. Thus, we 132116642Sjeff * give lower(better) priorities to kse groups that use less CPU. The nice 133116642Sjeff * value is then directly added to this to allow nice to have some effect 134116642Sjeff * on latency. 135111857Sjeff * 136111857Sjeff * PRI_RANGE: Total priority range for timeshare threads. 137116642Sjeff * PRI_NRESV: Number of nice values. 138111857Sjeff * PRI_BASE: The start of the dynamic range. 139109864Sjeff */ 140111857Sjeff#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 141121869Sjeff#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 142121869Sjeff#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 143116642Sjeff#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 144113357Sjeff#define SCHED_PRI_INTERACT(score) \ 145116642Sjeff ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 146109864Sjeff 147109864Sjeff/* 148111857Sjeff * These determine the interactivity of a process. 149109864Sjeff * 150110645Sjeff * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 151110645Sjeff * before throttling back. 152121868Sjeff * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 153116365Sjeff * INTERACT_MAX: Maximum interactivity value. Smaller is better. 154111857Sjeff * INTERACT_THRESH: Threshhold for placement on the current runq. 155109864Sjeff */ 156121126Sjeff#define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 157121868Sjeff#define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 158116365Sjeff#define SCHED_INTERACT_MAX (100) 159116365Sjeff#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 160121126Sjeff#define SCHED_INTERACT_THRESH (30) 161111857Sjeff 162109864Sjeff/* 163109864Sjeff * These parameters and macros determine the size of the time slice that is 164109864Sjeff * granted to each thread. 165109864Sjeff * 166109864Sjeff * SLICE_MIN: Minimum time slice granted, in units of ticks. 167109864Sjeff * SLICE_MAX: Maximum time slice granted. 168109864Sjeff * SLICE_RANGE: Range of available time slices scaled by hz. 169112966Sjeff * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 170112966Sjeff * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 171121871Sjeff * SLICE_NTHRESH: The nice cutoff point for slice assignment. 172109864Sjeff */ 173113357Sjeff#define SCHED_SLICE_MIN (slice_min) 174113357Sjeff#define SCHED_SLICE_MAX (slice_max) 175125299Sjeff#define SCHED_SLICE_INTERACTIVE (slice_max) 176121871Sjeff#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 177111857Sjeff#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 178109864Sjeff#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 179112966Sjeff#define SCHED_SLICE_NICE(nice) \ 180121871Sjeff (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 181109864Sjeff 182109864Sjeff/* 183109864Sjeff * This macro determines whether or not the kse belongs on the current or 184109864Sjeff * next run queue. 185109864Sjeff */ 186113357Sjeff#define SCHED_INTERACTIVE(kg) \ 187113357Sjeff (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 188113417Sjeff#define SCHED_CURR(kg, ke) \ 189127278Sobrien (ke->ke_thread->td_priority < kg->kg_user_pri || \ 190121107Sjeff SCHED_INTERACTIVE(kg)) 191109864Sjeff 192109864Sjeff/* 193109864Sjeff * Cpu percentage computation macros and defines. 194109864Sjeff * 195109864Sjeff * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 196109864Sjeff * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 197109864Sjeff */ 198109864Sjeff 199112971Sjeff#define SCHED_CPU_TIME 10 200109864Sjeff#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 201109864Sjeff 202109864Sjeff/* 203113357Sjeff * kseq - per processor runqs and statistics. 204109864Sjeff */ 205109864Sjeffstruct kseq { 206113357Sjeff struct runq ksq_idle; /* Queue of IDLE threads. */ 207113357Sjeff struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 208113357Sjeff struct runq *ksq_next; /* Next timeshare queue. */ 209113357Sjeff struct runq *ksq_curr; /* Current queue. */ 210121896Sjeff int ksq_load_timeshare; /* Load for timeshare. */ 211113357Sjeff int ksq_load; /* Aggregate load. */ 212121869Sjeff short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 213113357Sjeff short ksq_nicemin; /* Least nice. */ 214110267Sjeff#ifdef SMP 215123433Sjeff int ksq_transferable; 216123433Sjeff LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 217123433Sjeff struct kseq_group *ksq_group; /* Our processor group. */ 218123433Sjeff volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 219125289Sjeff#else 220125289Sjeff int ksq_sysload; /* For loadavg, !ITHD load. */ 221110267Sjeff#endif 222109864Sjeff}; 223109864Sjeff 224123433Sjeff#ifdef SMP 225109864Sjeff/* 226123433Sjeff * kseq groups are groups of processors which can cheaply share threads. When 227123433Sjeff * one processor in the group goes idle it will check the runqs of the other 228123433Sjeff * processors in its group prior to halting and waiting for an interrupt. 229123433Sjeff * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 230123433Sjeff * In a numa environment we'd want an idle bitmap per group and a two tiered 231123433Sjeff * load balancer. 232123433Sjeff */ 233123433Sjeffstruct kseq_group { 234123433Sjeff int ksg_cpus; /* Count of CPUs in this kseq group. */ 235127498Smarcel cpumask_t ksg_cpumask; /* Mask of cpus in this group. */ 236127498Smarcel cpumask_t ksg_idlemask; /* Idle cpus in this group. */ 237127498Smarcel cpumask_t ksg_mask; /* Bit mask for first cpu. */ 238123487Sjeff int ksg_load; /* Total load of this group. */ 239123433Sjeff int ksg_transferable; /* Transferable load of this group. */ 240123433Sjeff LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 241123433Sjeff}; 242123433Sjeff#endif 243123433Sjeff 244123433Sjeff/* 245109864Sjeff * One kse queue per processor. 246109864Sjeff */ 247110028Sjeff#ifdef SMP 248127498Smarcelstatic cpumask_t kseq_idle; 249123487Sjeffstatic int ksg_maxid; 250121790Sjeffstatic struct kseq kseq_cpu[MAXCPU]; 251123433Sjeffstatic struct kseq_group kseq_groups[MAXCPU]; 252123433Sjeff#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 253123433Sjeff#define KSEQ_CPU(x) (&kseq_cpu[(x)]) 254123487Sjeff#define KSEQ_ID(x) ((x) - kseq_cpu) 255123487Sjeff#define KSEQ_GROUP(x) (&kseq_groups[(x)]) 256123433Sjeff#else /* !SMP */ 257121790Sjeffstatic struct kseq kseq_cpu; 258110028Sjeff#define KSEQ_SELF() (&kseq_cpu) 259110028Sjeff#define KSEQ_CPU(x) (&kseq_cpu) 260110028Sjeff#endif 261109864Sjeff 262112966Sjeffstatic void sched_slice(struct kse *ke); 263113357Sjeffstatic void sched_priority(struct ksegrp *kg); 264111857Sjeffstatic int sched_interact_score(struct ksegrp *kg); 265116463Sjeffstatic void sched_interact_update(struct ksegrp *kg); 266121868Sjeffstatic void sched_interact_fork(struct ksegrp *kg); 267121790Sjeffstatic void sched_pctcpu_update(struct kse *ke); 268109864Sjeff 269110267Sjeff/* Operations on per processor queues */ 270121790Sjeffstatic struct kse * kseq_choose(struct kseq *kseq); 271110028Sjeffstatic void kseq_setup(struct kseq *kseq); 272122744Sjeffstatic void kseq_load_add(struct kseq *kseq, struct kse *ke); 273122744Sjeffstatic void kseq_load_rem(struct kseq *kseq, struct kse *ke); 274122744Sjeffstatic __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke); 275122744Sjeffstatic __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 276113357Sjeffstatic void kseq_nice_add(struct kseq *kseq, int nice); 277113357Sjeffstatic void kseq_nice_rem(struct kseq *kseq, int nice); 278113660Sjeffvoid kseq_print(int cpu); 279110267Sjeff#ifdef SMP 280123433Sjeffstatic int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 281121790Sjeffstatic struct kse *runq_steal(struct runq *rq); 282122744Sjeffstatic void sched_balance(void *arg); 283123487Sjeffstatic void sched_balance_group(struct kseq_group *ksg); 284123487Sjeffstatic void sched_balance_pair(struct kseq *high, struct kseq *low); 285121790Sjeffstatic void kseq_move(struct kseq *from, int cpu); 286123433Sjeffstatic int kseq_idled(struct kseq *kseq); 287121790Sjeffstatic void kseq_notify(struct kse *ke, int cpu); 288121790Sjeffstatic void kseq_assign(struct kseq *); 289123433Sjeffstatic struct kse *kseq_steal(struct kseq *kseq, int stealidle); 290123693Sjeff/* 291123693Sjeff * On P4 Xeons the round-robin interrupt delivery is broken. As a result of 292123693Sjeff * this, we can't pin interrupts to the cpu that they were delivered to, 293123693Sjeff * otherwise all ithreads only run on CPU 0. 294123693Sjeff */ 295123693Sjeff#ifdef __i386__ 296122038Sjeff#define KSE_CAN_MIGRATE(ke, class) \ 297123693Sjeff ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 298123693Sjeff#else /* !__i386__ */ 299123693Sjeff#define KSE_CAN_MIGRATE(ke, class) \ 300122158Sjeff ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ 301122165Sjeff ((ke)->ke_flags & KEF_BOUND) == 0) 302123693Sjeff#endif /* !__i386__ */ 303121790Sjeff#endif 304110028Sjeff 305113357Sjeffvoid 306113660Sjeffkseq_print(int cpu) 307110267Sjeff{ 308113660Sjeff struct kseq *kseq; 309113357Sjeff int i; 310112994Sjeff 311113660Sjeff kseq = KSEQ_CPU(cpu); 312112994Sjeff 313113357Sjeff printf("kseq:\n"); 314113357Sjeff printf("\tload: %d\n", kseq->ksq_load); 315122744Sjeff printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 316121896Sjeff#ifdef SMP 317123433Sjeff printf("\tload transferable: %d\n", kseq->ksq_transferable); 318121896Sjeff#endif 319113357Sjeff printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 320113357Sjeff printf("\tnice counts:\n"); 321121869Sjeff for (i = 0; i < SCHED_PRI_NRESV; i++) 322113357Sjeff if (kseq->ksq_nice[i]) 323113357Sjeff printf("\t\t%d = %d\n", 324113357Sjeff i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 325113357Sjeff} 326112994Sjeff 327122744Sjeffstatic __inline void 328122744Sjeffkseq_runq_add(struct kseq *kseq, struct kse *ke) 329122744Sjeff{ 330122744Sjeff#ifdef SMP 331123433Sjeff if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 332123433Sjeff kseq->ksq_transferable++; 333123433Sjeff kseq->ksq_group->ksg_transferable++; 334123433Sjeff } 335122744Sjeff#endif 336122744Sjeff runq_add(ke->ke_runq, ke); 337122744Sjeff} 338122744Sjeff 339122744Sjeffstatic __inline void 340122744Sjeffkseq_runq_rem(struct kseq *kseq, struct kse *ke) 341122744Sjeff{ 342122744Sjeff#ifdef SMP 343123433Sjeff if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 344123433Sjeff kseq->ksq_transferable--; 345123433Sjeff kseq->ksq_group->ksg_transferable--; 346123433Sjeff } 347122744Sjeff#endif 348122744Sjeff runq_remove(ke->ke_runq, ke); 349122744Sjeff} 350122744Sjeff 351113357Sjeffstatic void 352122744Sjeffkseq_load_add(struct kseq *kseq, struct kse *ke) 353113357Sjeff{ 354121896Sjeff int class; 355115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 356121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 357121896Sjeff if (class == PRI_TIMESHARE) 358121896Sjeff kseq->ksq_load_timeshare++; 359113357Sjeff kseq->ksq_load++; 360125289Sjeff if (class != PRI_ITHD) 361123487Sjeff#ifdef SMP 362123487Sjeff kseq->ksq_group->ksg_load++; 363125289Sjeff#else 364125289Sjeff kseq->ksq_sysload++; 365123487Sjeff#endif 366113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 367122744Sjeff CTR6(KTR_ULE, 368122744Sjeff "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 369122744Sjeff ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 370122744Sjeff ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); 371113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 372113357Sjeff kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); 373110267Sjeff} 374113357Sjeff 375112994Sjeffstatic void 376122744Sjeffkseq_load_rem(struct kseq *kseq, struct kse *ke) 377110267Sjeff{ 378121896Sjeff int class; 379115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 380121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 381121896Sjeff if (class == PRI_TIMESHARE) 382121896Sjeff kseq->ksq_load_timeshare--; 383125289Sjeff if (class != PRI_ITHD) 384123487Sjeff#ifdef SMP 385123487Sjeff kseq->ksq_group->ksg_load--; 386125289Sjeff#else 387125289Sjeff kseq->ksq_sysload--; 388123487Sjeff#endif 389113357Sjeff kseq->ksq_load--; 390113357Sjeff ke->ke_runq = NULL; 391113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 392113357Sjeff kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); 393110267Sjeff} 394110267Sjeff 395113357Sjeffstatic void 396113357Sjeffkseq_nice_add(struct kseq *kseq, int nice) 397110267Sjeff{ 398115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 399113357Sjeff /* Normalize to zero. */ 400113357Sjeff kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 401121896Sjeff if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 402113357Sjeff kseq->ksq_nicemin = nice; 403110267Sjeff} 404110267Sjeff 405113357Sjeffstatic void 406113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice) 407110267Sjeff{ 408113357Sjeff int n; 409113357Sjeff 410115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 411113357Sjeff /* Normalize to zero. */ 412113357Sjeff n = nice + SCHED_PRI_NHALF; 413113357Sjeff kseq->ksq_nice[n]--; 414113357Sjeff KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 415113357Sjeff 416113357Sjeff /* 417113357Sjeff * If this wasn't the smallest nice value or there are more in 418113357Sjeff * this bucket we can just return. Otherwise we have to recalculate 419113357Sjeff * the smallest nice. 420113357Sjeff */ 421113357Sjeff if (nice != kseq->ksq_nicemin || 422113357Sjeff kseq->ksq_nice[n] != 0 || 423121896Sjeff kseq->ksq_load_timeshare == 0) 424113357Sjeff return; 425113357Sjeff 426121869Sjeff for (; n < SCHED_PRI_NRESV; n++) 427113357Sjeff if (kseq->ksq_nice[n]) { 428113357Sjeff kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 429113357Sjeff return; 430113357Sjeff } 431110267Sjeff} 432110267Sjeff 433113357Sjeff#ifdef SMP 434116069Sjeff/* 435122744Sjeff * sched_balance is a simple CPU load balancing algorithm. It operates by 436116069Sjeff * finding the least loaded and most loaded cpu and equalizing their load 437116069Sjeff * by migrating some processes. 438116069Sjeff * 439116069Sjeff * Dealing only with two CPUs at a time has two advantages. Firstly, most 440116069Sjeff * installations will only have 2 cpus. Secondly, load balancing too much at 441116069Sjeff * once can have an unpleasant effect on the system. The scheduler rarely has 442116069Sjeff * enough information to make perfect decisions. So this algorithm chooses 443116069Sjeff * algorithm simplicity and more gradual effects on load in larger systems. 444116069Sjeff * 445116069Sjeff * It could be improved by considering the priorities and slices assigned to 446116069Sjeff * each task prior to balancing them. There are many pathological cases with 447116069Sjeff * any approach and so the semi random algorithm below may work as well as any. 448116069Sjeff * 449116069Sjeff */ 450121790Sjeffstatic void 451122744Sjeffsched_balance(void *arg) 452116069Sjeff{ 453123487Sjeff struct kseq_group *high; 454123487Sjeff struct kseq_group *low; 455123487Sjeff struct kseq_group *ksg; 456123487Sjeff int timo; 457123487Sjeff int cnt; 458123487Sjeff int i; 459123487Sjeff 460123487Sjeff mtx_lock_spin(&sched_lock); 461123487Sjeff if (smp_started == 0) 462123487Sjeff goto out; 463123487Sjeff low = high = NULL; 464123487Sjeff i = random() % (ksg_maxid + 1); 465123487Sjeff for (cnt = 0; cnt <= ksg_maxid; cnt++) { 466123487Sjeff ksg = KSEQ_GROUP(i); 467123487Sjeff /* 468123487Sjeff * Find the CPU with the highest load that has some 469123487Sjeff * threads to transfer. 470123487Sjeff */ 471123487Sjeff if ((high == NULL || ksg->ksg_load > high->ksg_load) 472123487Sjeff && ksg->ksg_transferable) 473123487Sjeff high = ksg; 474123487Sjeff if (low == NULL || ksg->ksg_load < low->ksg_load) 475123487Sjeff low = ksg; 476123487Sjeff if (++i > ksg_maxid) 477123487Sjeff i = 0; 478123487Sjeff } 479123487Sjeff if (low != NULL && high != NULL && high != low) 480123487Sjeff sched_balance_pair(LIST_FIRST(&high->ksg_members), 481123487Sjeff LIST_FIRST(&low->ksg_members)); 482123487Sjeffout: 483123487Sjeff mtx_unlock_spin(&sched_lock); 484123487Sjeff timo = random() % (hz * 2); 485123487Sjeff callout_reset(&kseq_lb_callout, timo, sched_balance, NULL); 486123487Sjeff} 487123487Sjeff 488123487Sjeffstatic void 489123487Sjeffsched_balance_groups(void *arg) 490123487Sjeff{ 491123487Sjeff int timo; 492123487Sjeff int i; 493123487Sjeff 494123487Sjeff mtx_lock_spin(&sched_lock); 495123487Sjeff if (smp_started) 496123487Sjeff for (i = 0; i <= ksg_maxid; i++) 497123487Sjeff sched_balance_group(KSEQ_GROUP(i)); 498123487Sjeff mtx_unlock_spin(&sched_lock); 499123487Sjeff timo = random() % (hz * 2); 500123487Sjeff callout_reset(&kseq_group_callout, timo, sched_balance_groups, NULL); 501123487Sjeff} 502123487Sjeff 503123487Sjeffstatic void 504123487Sjeffsched_balance_group(struct kseq_group *ksg) 505123487Sjeff{ 506116069Sjeff struct kseq *kseq; 507123487Sjeff struct kseq *high; 508123487Sjeff struct kseq *low; 509123487Sjeff int load; 510123487Sjeff 511123487Sjeff if (ksg->ksg_transferable == 0) 512123487Sjeff return; 513123487Sjeff low = NULL; 514123487Sjeff high = NULL; 515123487Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 516123487Sjeff load = kseq->ksq_load; 517123487Sjeff if (kseq == KSEQ_CPU(0)) 518123487Sjeff load--; 519123487Sjeff if (high == NULL || load > high->ksq_load) 520123487Sjeff high = kseq; 521123487Sjeff if (low == NULL || load < low->ksq_load) 522123487Sjeff low = kseq; 523123487Sjeff } 524123487Sjeff if (high != NULL && low != NULL && high != low) 525123487Sjeff sched_balance_pair(high, low); 526123487Sjeff} 527123487Sjeff 528123487Sjeffstatic void 529123487Sjeffsched_balance_pair(struct kseq *high, struct kseq *low) 530123487Sjeff{ 531123433Sjeff int transferable; 532116069Sjeff int high_load; 533116069Sjeff int low_load; 534116069Sjeff int move; 535116069Sjeff int diff; 536116069Sjeff int i; 537116069Sjeff 538116069Sjeff /* 539123433Sjeff * If we're transfering within a group we have to use this specific 540123433Sjeff * kseq's transferable count, otherwise we can steal from other members 541123433Sjeff * of the group. 542123433Sjeff */ 543123487Sjeff if (high->ksq_group == low->ksq_group) { 544123487Sjeff transferable = high->ksq_transferable; 545123487Sjeff high_load = high->ksq_load; 546123487Sjeff low_load = low->ksq_load; 547123487Sjeff /* 548123487Sjeff * XXX If we encounter cpu 0 we must remember to reduce it's 549123487Sjeff * load by 1 to reflect the swi that is running the callout. 550123487Sjeff * At some point we should really fix load balancing of the 551123487Sjeff * swi and then this wont matter. 552123487Sjeff */ 553123487Sjeff if (high == KSEQ_CPU(0)) 554123487Sjeff high_load--; 555123487Sjeff if (low == KSEQ_CPU(0)) 556123487Sjeff low_load--; 557123487Sjeff } else { 558123487Sjeff transferable = high->ksq_group->ksg_transferable; 559123487Sjeff high_load = high->ksq_group->ksg_load; 560123487Sjeff low_load = low->ksq_group->ksg_load; 561123487Sjeff } 562123433Sjeff if (transferable == 0) 563123487Sjeff return; 564123433Sjeff /* 565122744Sjeff * Determine what the imbalance is and then adjust that to how many 566123433Sjeff * kses we actually have to give up (transferable). 567122744Sjeff */ 568123487Sjeff diff = high_load - low_load; 569116069Sjeff move = diff / 2; 570116069Sjeff if (diff & 0x1) 571116069Sjeff move++; 572123433Sjeff move = min(move, transferable); 573116069Sjeff for (i = 0; i < move; i++) 574123487Sjeff kseq_move(high, KSEQ_ID(low)); 575116069Sjeff return; 576116069Sjeff} 577116069Sjeff 578121790Sjeffstatic void 579116069Sjeffkseq_move(struct kseq *from, int cpu) 580116069Sjeff{ 581123433Sjeff struct kseq *kseq; 582123433Sjeff struct kseq *to; 583116069Sjeff struct kse *ke; 584116069Sjeff 585123433Sjeff kseq = from; 586123433Sjeff to = KSEQ_CPU(cpu); 587123433Sjeff ke = kseq_steal(kseq, 1); 588123433Sjeff if (ke == NULL) { 589123433Sjeff struct kseq_group *ksg; 590123433Sjeff 591123433Sjeff ksg = kseq->ksq_group; 592123433Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 593123433Sjeff if (kseq == from || kseq->ksq_transferable == 0) 594123433Sjeff continue; 595123433Sjeff ke = kseq_steal(kseq, 1); 596123433Sjeff break; 597123433Sjeff } 598123433Sjeff if (ke == NULL) 599123433Sjeff panic("kseq_move: No KSEs available with a " 600123433Sjeff "transferable count of %d\n", 601123433Sjeff ksg->ksg_transferable); 602123433Sjeff } 603123433Sjeff if (kseq == to) 604123433Sjeff return; 605116069Sjeff ke->ke_state = KES_THREAD; 606123433Sjeff kseq_runq_rem(kseq, ke); 607123433Sjeff kseq_load_rem(kseq, ke); 608121923Sjeff kseq_notify(ke, cpu); 609116069Sjeff} 610110267Sjeff 611123433Sjeffstatic int 612123433Sjeffkseq_idled(struct kseq *kseq) 613121790Sjeff{ 614123433Sjeff struct kseq_group *ksg; 615123433Sjeff struct kseq *steal; 616123433Sjeff struct kse *ke; 617123433Sjeff 618123433Sjeff ksg = kseq->ksq_group; 619123433Sjeff /* 620123433Sjeff * If we're in a cpu group, try and steal kses from another cpu in 621123433Sjeff * the group before idling. 622123433Sjeff */ 623123433Sjeff if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 624123433Sjeff LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 625123433Sjeff if (steal == kseq || steal->ksq_transferable == 0) 626123433Sjeff continue; 627123433Sjeff ke = kseq_steal(steal, 0); 628123433Sjeff if (ke == NULL) 629123433Sjeff continue; 630123433Sjeff ke->ke_state = KES_THREAD; 631123433Sjeff kseq_runq_rem(steal, ke); 632123433Sjeff kseq_load_rem(steal, ke); 633123433Sjeff ke->ke_cpu = PCPU_GET(cpuid); 634123433Sjeff sched_add(ke->ke_thread); 635123433Sjeff return (0); 636123433Sjeff } 637123433Sjeff } 638123433Sjeff /* 639123433Sjeff * We only set the idled bit when all of the cpus in the group are 640123433Sjeff * idle. Otherwise we could get into a situation where a KSE bounces 641123433Sjeff * back and forth between two idle cores on seperate physical CPUs. 642123433Sjeff */ 643123433Sjeff ksg->ksg_idlemask |= PCPU_GET(cpumask); 644123433Sjeff if (ksg->ksg_idlemask != ksg->ksg_cpumask) 645123433Sjeff return (1); 646123433Sjeff atomic_set_int(&kseq_idle, ksg->ksg_mask); 647123433Sjeff return (1); 648121790Sjeff} 649121790Sjeff 650121790Sjeffstatic void 651121790Sjeffkseq_assign(struct kseq *kseq) 652121790Sjeff{ 653121790Sjeff struct kse *nke; 654121790Sjeff struct kse *ke; 655121790Sjeff 656121790Sjeff do { 657122848Sjeff (volatile struct kse *)ke = kseq->ksq_assigned; 658121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 659121790Sjeff for (; ke != NULL; ke = nke) { 660121790Sjeff nke = ke->ke_assign; 661121790Sjeff ke->ke_flags &= ~KEF_ASSIGNED; 662121790Sjeff sched_add(ke->ke_thread); 663121790Sjeff } 664121790Sjeff} 665121790Sjeff 666121790Sjeffstatic void 667121790Sjeffkseq_notify(struct kse *ke, int cpu) 668121790Sjeff{ 669121790Sjeff struct kseq *kseq; 670121790Sjeff struct thread *td; 671121790Sjeff struct pcpu *pcpu; 672121790Sjeff 673123529Sjeff ke->ke_cpu = cpu; 674121790Sjeff ke->ke_flags |= KEF_ASSIGNED; 675121790Sjeff 676121790Sjeff kseq = KSEQ_CPU(cpu); 677121790Sjeff 678121790Sjeff /* 679121790Sjeff * Place a KSE on another cpu's queue and force a resched. 680121790Sjeff */ 681121790Sjeff do { 682122848Sjeff (volatile struct kse *)ke->ke_assign = kseq->ksq_assigned; 683121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 684121790Sjeff pcpu = pcpu_find(cpu); 685121790Sjeff td = pcpu->pc_curthread; 686121790Sjeff if (ke->ke_thread->td_priority < td->td_priority || 687121790Sjeff td == pcpu->pc_idlethread) { 688121790Sjeff td->td_flags |= TDF_NEEDRESCHED; 689121790Sjeff ipi_selected(1 << cpu, IPI_AST); 690121790Sjeff } 691121790Sjeff} 692121790Sjeff 693121790Sjeffstatic struct kse * 694121790Sjeffrunq_steal(struct runq *rq) 695121790Sjeff{ 696121790Sjeff struct rqhead *rqh; 697121790Sjeff struct rqbits *rqb; 698121790Sjeff struct kse *ke; 699121790Sjeff int word; 700121790Sjeff int bit; 701121790Sjeff 702121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 703121790Sjeff rqb = &rq->rq_status; 704121790Sjeff for (word = 0; word < RQB_LEN; word++) { 705121790Sjeff if (rqb->rqb_bits[word] == 0) 706121790Sjeff continue; 707121790Sjeff for (bit = 0; bit < RQB_BPW; bit++) { 708123231Speter if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 709121790Sjeff continue; 710121790Sjeff rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 711121790Sjeff TAILQ_FOREACH(ke, rqh, ke_procq) { 712121896Sjeff if (KSE_CAN_MIGRATE(ke, 713121896Sjeff PRI_BASE(ke->ke_ksegrp->kg_pri_class))) 714121790Sjeff return (ke); 715121790Sjeff } 716121790Sjeff } 717121790Sjeff } 718121790Sjeff return (NULL); 719121790Sjeff} 720121790Sjeff 721121790Sjeffstatic struct kse * 722123433Sjeffkseq_steal(struct kseq *kseq, int stealidle) 723121790Sjeff{ 724121790Sjeff struct kse *ke; 725121790Sjeff 726123433Sjeff /* 727123433Sjeff * Steal from next first to try to get a non-interactive task that 728123433Sjeff * may not have run for a while. 729123433Sjeff */ 730123433Sjeff if ((ke = runq_steal(kseq->ksq_next)) != NULL) 731123433Sjeff return (ke); 732121790Sjeff if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 733121790Sjeff return (ke); 734123433Sjeff if (stealidle) 735123433Sjeff return (runq_steal(&kseq->ksq_idle)); 736123433Sjeff return (NULL); 737121790Sjeff} 738123433Sjeff 739123433Sjeffint 740123433Sjeffkseq_transfer(struct kseq *kseq, struct kse *ke, int class) 741123433Sjeff{ 742123433Sjeff struct kseq_group *ksg; 743123433Sjeff int cpu; 744123433Sjeff 745123685Sjeff if (smp_started == 0) 746123685Sjeff return (0); 747123433Sjeff cpu = 0; 748123433Sjeff ksg = kseq->ksq_group; 749123433Sjeff 750123433Sjeff /* 751123685Sjeff * If there are any idle groups, give them our extra load. The 752123685Sjeff * threshold at which we start to reassign kses has a large impact 753123685Sjeff * on the overall performance of the system. Tuned too high and 754123685Sjeff * some CPUs may idle. Too low and there will be excess migration 755123685Sjeff * and context swiches. 756123685Sjeff */ 757123694Sjeff if (ksg->ksg_load > (ksg->ksg_cpus * 2) && kseq_idle) { 758123433Sjeff /* 759123433Sjeff * Multiple cpus could find this bit simultaneously 760123433Sjeff * but the race shouldn't be terrible. 761123433Sjeff */ 762123433Sjeff cpu = ffs(kseq_idle); 763123433Sjeff if (cpu) 764123433Sjeff atomic_clear_int(&kseq_idle, 1 << (cpu - 1)); 765123433Sjeff } 766123433Sjeff /* 767123433Sjeff * If another cpu in this group has idled, assign a thread over 768123433Sjeff * to them after checking to see if there are idled groups. 769123433Sjeff */ 770123433Sjeff if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) { 771123433Sjeff cpu = ffs(ksg->ksg_idlemask); 772123433Sjeff if (cpu) 773123433Sjeff ksg->ksg_idlemask &= ~(1 << (cpu - 1)); 774123433Sjeff } 775123433Sjeff /* 776123433Sjeff * Now that we've found an idle CPU, migrate the thread. 777123433Sjeff */ 778123433Sjeff if (cpu) { 779123433Sjeff cpu--; 780123433Sjeff ke->ke_runq = NULL; 781123433Sjeff kseq_notify(ke, cpu); 782123433Sjeff return (1); 783123433Sjeff } 784123433Sjeff return (0); 785123433Sjeff} 786123433Sjeff 787121790Sjeff#endif /* SMP */ 788121790Sjeff 789117326Sjeff/* 790121790Sjeff * Pick the highest priority task we have and return it. 791117326Sjeff */ 792117326Sjeff 793121790Sjeffstatic struct kse * 794121790Sjeffkseq_choose(struct kseq *kseq) 795110267Sjeff{ 796110267Sjeff struct kse *ke; 797110267Sjeff struct runq *swap; 798110267Sjeff 799115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 800113357Sjeff swap = NULL; 801112994Sjeff 802113357Sjeff for (;;) { 803113357Sjeff ke = runq_choose(kseq->ksq_curr); 804113357Sjeff if (ke == NULL) { 805113357Sjeff /* 806113357Sjeff * We already swaped once and didn't get anywhere. 807113357Sjeff */ 808113357Sjeff if (swap) 809113357Sjeff break; 810113357Sjeff swap = kseq->ksq_curr; 811113357Sjeff kseq->ksq_curr = kseq->ksq_next; 812113357Sjeff kseq->ksq_next = swap; 813113357Sjeff continue; 814113357Sjeff } 815113357Sjeff /* 816113357Sjeff * If we encounter a slice of 0 the kse is in a 817113357Sjeff * TIMESHARE kse group and its nice was too far out 818113357Sjeff * of the range that receives slices. 819113357Sjeff */ 820121790Sjeff if (ke->ke_slice == 0) { 821113357Sjeff runq_remove(ke->ke_runq, ke); 822113357Sjeff sched_slice(ke); 823113357Sjeff ke->ke_runq = kseq->ksq_next; 824113357Sjeff runq_add(ke->ke_runq, ke); 825113357Sjeff continue; 826113357Sjeff } 827113357Sjeff return (ke); 828110267Sjeff } 829110267Sjeff 830113357Sjeff return (runq_choose(&kseq->ksq_idle)); 831110267Sjeff} 832110267Sjeff 833109864Sjeffstatic void 834110028Sjeffkseq_setup(struct kseq *kseq) 835110028Sjeff{ 836113357Sjeff runq_init(&kseq->ksq_timeshare[0]); 837113357Sjeff runq_init(&kseq->ksq_timeshare[1]); 838112994Sjeff runq_init(&kseq->ksq_idle); 839113357Sjeff kseq->ksq_curr = &kseq->ksq_timeshare[0]; 840113357Sjeff kseq->ksq_next = &kseq->ksq_timeshare[1]; 841113660Sjeff kseq->ksq_load = 0; 842121896Sjeff kseq->ksq_load_timeshare = 0; 843110028Sjeff} 844110028Sjeff 845110028Sjeffstatic void 846109864Sjeffsched_setup(void *dummy) 847109864Sjeff{ 848117313Sjeff#ifdef SMP 849123487Sjeff int balance_groups; 850109864Sjeff int i; 851117313Sjeff#endif 852109864Sjeff 853116946Sjeff slice_min = (hz/100); /* 10ms */ 854116946Sjeff slice_max = (hz/7); /* ~140ms */ 855111857Sjeff 856117237Sjeff#ifdef SMP 857123487Sjeff balance_groups = 0; 858123433Sjeff /* 859123433Sjeff * Initialize the kseqs. 860123433Sjeff */ 861123433Sjeff for (i = 0; i < MAXCPU; i++) { 862123433Sjeff struct kseq *ksq; 863123433Sjeff 864123433Sjeff ksq = &kseq_cpu[i]; 865123433Sjeff ksq->ksq_assigned = NULL; 866123433Sjeff kseq_setup(&kseq_cpu[i]); 867123433Sjeff } 868117237Sjeff if (smp_topology == NULL) { 869123433Sjeff struct kseq_group *ksg; 870123433Sjeff struct kseq *ksq; 871123433Sjeff 872117237Sjeff for (i = 0; i < MAXCPU; i++) { 873123433Sjeff ksq = &kseq_cpu[i]; 874123433Sjeff ksg = &kseq_groups[i]; 875123433Sjeff /* 876123433Sjeff * Setup a kse group with one member. 877123433Sjeff */ 878123433Sjeff ksq->ksq_transferable = 0; 879123433Sjeff ksq->ksq_group = ksg; 880123433Sjeff ksg->ksg_cpus = 1; 881123433Sjeff ksg->ksg_idlemask = 0; 882123433Sjeff ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 883123487Sjeff ksg->ksg_load = 0; 884123433Sjeff ksg->ksg_transferable = 0; 885123433Sjeff LIST_INIT(&ksg->ksg_members); 886123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 887117237Sjeff } 888117237Sjeff } else { 889123433Sjeff struct kseq_group *ksg; 890123433Sjeff struct cpu_group *cg; 891117237Sjeff int j; 892113357Sjeff 893117237Sjeff for (i = 0; i < smp_topology->ct_count; i++) { 894117237Sjeff cg = &smp_topology->ct_group[i]; 895123433Sjeff ksg = &kseq_groups[i]; 896123433Sjeff /* 897123433Sjeff * Initialize the group. 898123433Sjeff */ 899123433Sjeff ksg->ksg_idlemask = 0; 900123487Sjeff ksg->ksg_load = 0; 901123433Sjeff ksg->ksg_transferable = 0; 902123433Sjeff ksg->ksg_cpus = cg->cg_count; 903123433Sjeff ksg->ksg_cpumask = cg->cg_mask; 904123433Sjeff LIST_INIT(&ksg->ksg_members); 905123433Sjeff /* 906123433Sjeff * Find all of the group members and add them. 907123433Sjeff */ 908123433Sjeff for (j = 0; j < MAXCPU; j++) { 909123433Sjeff if ((cg->cg_mask & (1 << j)) != 0) { 910123433Sjeff if (ksg->ksg_mask == 0) 911123433Sjeff ksg->ksg_mask = 1 << j; 912123433Sjeff kseq_cpu[j].ksq_transferable = 0; 913123433Sjeff kseq_cpu[j].ksq_group = ksg; 914123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, 915123433Sjeff &kseq_cpu[j], ksq_siblings); 916123433Sjeff } 917123433Sjeff } 918123487Sjeff if (ksg->ksg_cpus > 1) 919123487Sjeff balance_groups = 1; 920117237Sjeff } 921123487Sjeff ksg_maxid = smp_topology->ct_count - 1; 922117237Sjeff } 923119137Ssam callout_init(&kseq_lb_callout, CALLOUT_MPSAFE); 924123487Sjeff callout_init(&kseq_group_callout, CALLOUT_MPSAFE); 925122744Sjeff sched_balance(NULL); 926123487Sjeff /* 927123487Sjeff * Stagger the group and global load balancer so they do not 928123487Sjeff * interfere with each other. 929123487Sjeff */ 930123487Sjeff if (balance_groups) 931123487Sjeff callout_reset(&kseq_group_callout, hz / 2, 932123487Sjeff sched_balance_groups, NULL); 933117237Sjeff#else 934117237Sjeff kseq_setup(KSEQ_SELF()); 935116069Sjeff#endif 936117237Sjeff mtx_lock_spin(&sched_lock); 937122744Sjeff kseq_load_add(KSEQ_SELF(), &kse0); 938117237Sjeff mtx_unlock_spin(&sched_lock); 939109864Sjeff} 940109864Sjeff 941109864Sjeff/* 942109864Sjeff * Scale the scheduling priority according to the "interactivity" of this 943109864Sjeff * process. 944109864Sjeff */ 945113357Sjeffstatic void 946109864Sjeffsched_priority(struct ksegrp *kg) 947109864Sjeff{ 948109864Sjeff int pri; 949109864Sjeff 950109864Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 951113357Sjeff return; 952109864Sjeff 953113357Sjeff pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 954111857Sjeff pri += SCHED_PRI_BASE; 955109864Sjeff pri += kg->kg_nice; 956109864Sjeff 957109864Sjeff if (pri > PRI_MAX_TIMESHARE) 958109864Sjeff pri = PRI_MAX_TIMESHARE; 959109864Sjeff else if (pri < PRI_MIN_TIMESHARE) 960109864Sjeff pri = PRI_MIN_TIMESHARE; 961109864Sjeff 962109864Sjeff kg->kg_user_pri = pri; 963109864Sjeff 964113357Sjeff return; 965109864Sjeff} 966109864Sjeff 967109864Sjeff/* 968112966Sjeff * Calculate a time slice based on the properties of the kseg and the runq 969112994Sjeff * that we're on. This is only for PRI_TIMESHARE ksegrps. 970109864Sjeff */ 971112966Sjeffstatic void 972112966Sjeffsched_slice(struct kse *ke) 973109864Sjeff{ 974113357Sjeff struct kseq *kseq; 975112966Sjeff struct ksegrp *kg; 976109864Sjeff 977112966Sjeff kg = ke->ke_ksegrp; 978113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 979109864Sjeff 980112966Sjeff /* 981112966Sjeff * Rationale: 982112966Sjeff * KSEs in interactive ksegs get the minimum slice so that we 983112966Sjeff * quickly notice if it abuses its advantage. 984112966Sjeff * 985112966Sjeff * KSEs in non-interactive ksegs are assigned a slice that is 986112966Sjeff * based on the ksegs nice value relative to the least nice kseg 987112966Sjeff * on the run queue for this cpu. 988112966Sjeff * 989112966Sjeff * If the KSE is less nice than all others it gets the maximum 990112966Sjeff * slice and other KSEs will adjust their slice relative to 991112966Sjeff * this when they first expire. 992112966Sjeff * 993112966Sjeff * There is 20 point window that starts relative to the least 994112966Sjeff * nice kse on the run queue. Slice size is determined by 995112966Sjeff * the kse distance from the last nice ksegrp. 996112966Sjeff * 997121871Sjeff * If the kse is outside of the window it will get no slice 998121871Sjeff * and will be reevaluated each time it is selected on the 999121871Sjeff * run queue. The exception to this is nice 0 ksegs when 1000121871Sjeff * a nice -20 is running. They are always granted a minimum 1001121871Sjeff * slice. 1002112966Sjeff */ 1003113357Sjeff if (!SCHED_INTERACTIVE(kg)) { 1004112966Sjeff int nice; 1005112966Sjeff 1006113357Sjeff nice = kg->kg_nice + (0 - kseq->ksq_nicemin); 1007121896Sjeff if (kseq->ksq_load_timeshare == 0 || 1008113357Sjeff kg->kg_nice < kseq->ksq_nicemin) 1009112966Sjeff ke->ke_slice = SCHED_SLICE_MAX; 1010121871Sjeff else if (nice <= SCHED_SLICE_NTHRESH) 1011112966Sjeff ke->ke_slice = SCHED_SLICE_NICE(nice); 1012121871Sjeff else if (kg->kg_nice == 0) 1013121871Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1014112966Sjeff else 1015112966Sjeff ke->ke_slice = 0; 1016112966Sjeff } else 1017123684Sjeff ke->ke_slice = SCHED_SLICE_INTERACTIVE; 1018112966Sjeff 1019113357Sjeff CTR6(KTR_ULE, 1020113357Sjeff "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", 1021113357Sjeff ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, 1022121896Sjeff kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg)); 1023113357Sjeff 1024112966Sjeff return; 1025109864Sjeff} 1026109864Sjeff 1027121868Sjeff/* 1028121868Sjeff * This routine enforces a maximum limit on the amount of scheduling history 1029121868Sjeff * kept. It is called after either the slptime or runtime is adjusted. 1030121868Sjeff * This routine will not operate correctly when slp or run times have been 1031121868Sjeff * adjusted to more than double their maximum. 1032121868Sjeff */ 1033116463Sjeffstatic void 1034116463Sjeffsched_interact_update(struct ksegrp *kg) 1035116463Sjeff{ 1036121868Sjeff int sum; 1037121605Sjeff 1038121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1039121868Sjeff if (sum < SCHED_SLP_RUN_MAX) 1040121868Sjeff return; 1041121868Sjeff /* 1042121868Sjeff * If we have exceeded by more than 1/5th then the algorithm below 1043121868Sjeff * will not bring us back into range. Dividing by two here forces 1044121868Sjeff * us into the range of [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1045121868Sjeff */ 1046127850Sjeff if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1047121868Sjeff kg->kg_runtime /= 2; 1048121868Sjeff kg->kg_slptime /= 2; 1049121868Sjeff return; 1050116463Sjeff } 1051121868Sjeff kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1052121868Sjeff kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1053116463Sjeff} 1054116463Sjeff 1055121868Sjeffstatic void 1056121868Sjeffsched_interact_fork(struct ksegrp *kg) 1057121868Sjeff{ 1058121868Sjeff int ratio; 1059121868Sjeff int sum; 1060121868Sjeff 1061121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1062121868Sjeff if (sum > SCHED_SLP_RUN_FORK) { 1063121868Sjeff ratio = sum / SCHED_SLP_RUN_FORK; 1064121868Sjeff kg->kg_runtime /= ratio; 1065121868Sjeff kg->kg_slptime /= ratio; 1066121868Sjeff } 1067121868Sjeff} 1068121868Sjeff 1069111857Sjeffstatic int 1070111857Sjeffsched_interact_score(struct ksegrp *kg) 1071111857Sjeff{ 1072116365Sjeff int div; 1073111857Sjeff 1074111857Sjeff if (kg->kg_runtime > kg->kg_slptime) { 1075116365Sjeff div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1076116365Sjeff return (SCHED_INTERACT_HALF + 1077116365Sjeff (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1078116365Sjeff } if (kg->kg_slptime > kg->kg_runtime) { 1079116365Sjeff div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1080116365Sjeff return (kg->kg_runtime / div); 1081111857Sjeff } 1082111857Sjeff 1083116365Sjeff /* 1084116365Sjeff * This can happen if slptime and runtime are 0. 1085116365Sjeff */ 1086116365Sjeff return (0); 1087111857Sjeff 1088111857Sjeff} 1089111857Sjeff 1090113357Sjeff/* 1091113357Sjeff * This is only somewhat accurate since given many processes of the same 1092113357Sjeff * priority they will switch when their slices run out, which will be 1093113357Sjeff * at most SCHED_SLICE_MAX. 1094113357Sjeff */ 1095109864Sjeffint 1096109864Sjeffsched_rr_interval(void) 1097109864Sjeff{ 1098109864Sjeff return (SCHED_SLICE_MAX); 1099109864Sjeff} 1100109864Sjeff 1101121790Sjeffstatic void 1102109864Sjeffsched_pctcpu_update(struct kse *ke) 1103109864Sjeff{ 1104109864Sjeff /* 1105109864Sjeff * Adjust counters and watermark for pctcpu calc. 1106116365Sjeff */ 1107120272Sjeff if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1108120272Sjeff /* 1109120272Sjeff * Shift the tick count out so that the divide doesn't 1110120272Sjeff * round away our results. 1111120272Sjeff */ 1112120272Sjeff ke->ke_ticks <<= 10; 1113120272Sjeff ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1114120272Sjeff SCHED_CPU_TICKS; 1115120272Sjeff ke->ke_ticks >>= 10; 1116120272Sjeff } else 1117120272Sjeff ke->ke_ticks = 0; 1118109864Sjeff ke->ke_ltick = ticks; 1119109864Sjeff ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1120109864Sjeff} 1121109864Sjeff 1122109864Sjeffvoid 1123109864Sjeffsched_prio(struct thread *td, u_char prio) 1124109864Sjeff{ 1125121605Sjeff struct kse *ke; 1126109864Sjeff 1127121605Sjeff ke = td->td_kse; 1128109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1129109864Sjeff if (TD_ON_RUNQ(td)) { 1130121605Sjeff /* 1131121605Sjeff * If the priority has been elevated due to priority 1132121605Sjeff * propagation, we may have to move ourselves to a new 1133121605Sjeff * queue. We still call adjustrunqueue below in case kse 1134121605Sjeff * needs to fix things up. 1135121605Sjeff */ 1136121872Sjeff if (prio < td->td_priority && ke && 1137121872Sjeff (ke->ke_flags & KEF_ASSIGNED) == 0 && 1138121790Sjeff ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { 1139121605Sjeff runq_remove(ke->ke_runq, ke); 1140121605Sjeff ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; 1141121605Sjeff runq_add(ke->ke_runq, ke); 1142121605Sjeff } 1143119488Sdavidxu adjustrunqueue(td, prio); 1144121605Sjeff } else 1145119488Sdavidxu td->td_priority = prio; 1146109864Sjeff} 1147109864Sjeff 1148109864Sjeffvoid 1149121128Sjeffsched_switch(struct thread *td) 1150109864Sjeff{ 1151121128Sjeff struct thread *newtd; 1152109864Sjeff struct kse *ke; 1153109864Sjeff 1154109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1155109864Sjeff 1156109864Sjeff ke = td->td_kse; 1157109864Sjeff 1158109864Sjeff td->td_last_kse = ke; 1159113339Sjulian td->td_lastcpu = td->td_oncpu; 1160113339Sjulian td->td_oncpu = NOCPU; 1161111032Sjulian td->td_flags &= ~TDF_NEEDRESCHED; 1162109864Sjeff 1163123434Sjeff /* 1164123434Sjeff * If the KSE has been assigned it may be in the process of switching 1165123434Sjeff * to the new cpu. This is the case in sched_bind(). 1166123434Sjeff */ 1167123434Sjeff if ((ke->ke_flags & KEF_ASSIGNED) == 0) { 1168123434Sjeff if (TD_IS_RUNNING(td)) { 1169127278Sobrien kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1170127278Sobrien setrunqueue(td); 1171123434Sjeff } else { 1172125289Sjeff if (ke->ke_runq) { 1173123434Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1174125289Sjeff } else if ((td->td_flags & TDF_IDLETD) == 0) 1175125289Sjeff backtrace(); 1176123434Sjeff /* 1177123434Sjeff * We will not be on the run queue. So we must be 1178123434Sjeff * sleeping or similar. 1179123434Sjeff */ 1180123434Sjeff if (td->td_proc->p_flag & P_SA) 1181123434Sjeff kse_reassign(ke); 1182123434Sjeff } 1183121146Sjeff } 1184121128Sjeff newtd = choosethread(); 1185121128Sjeff if (td != newtd) 1186121128Sjeff cpu_switch(td, newtd); 1187121128Sjeff sched_lock.mtx_lock = (uintptr_t)td; 1188109864Sjeff 1189113339Sjulian td->td_oncpu = PCPU_GET(cpuid); 1190109864Sjeff} 1191109864Sjeff 1192109864Sjeffvoid 1193109864Sjeffsched_nice(struct ksegrp *kg, int nice) 1194109864Sjeff{ 1195113357Sjeff struct kse *ke; 1196109864Sjeff struct thread *td; 1197113357Sjeff struct kseq *kseq; 1198109864Sjeff 1199113873Sjhb PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); 1200113873Sjhb mtx_assert(&sched_lock, MA_OWNED); 1201113357Sjeff /* 1202113357Sjeff * We need to adjust the nice counts for running KSEs. 1203113357Sjeff */ 1204113357Sjeff if (kg->kg_pri_class == PRI_TIMESHARE) 1205113357Sjeff FOREACH_KSE_IN_GROUP(kg, ke) { 1206116500Sjeff if (ke->ke_runq == NULL) 1207113357Sjeff continue; 1208113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1209113357Sjeff kseq_nice_rem(kseq, kg->kg_nice); 1210113357Sjeff kseq_nice_add(kseq, nice); 1211113357Sjeff } 1212109864Sjeff kg->kg_nice = nice; 1213109864Sjeff sched_priority(kg); 1214113357Sjeff FOREACH_THREAD_IN_GROUP(kg, td) 1215111032Sjulian td->td_flags |= TDF_NEEDRESCHED; 1216109864Sjeff} 1217109864Sjeff 1218109864Sjeffvoid 1219126326Sjhbsched_sleep(struct thread *td) 1220109864Sjeff{ 1221109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1222109864Sjeff 1223109864Sjeff td->td_slptime = ticks; 1224126326Sjhb td->td_base_pri = td->td_priority; 1225109864Sjeff 1226113357Sjeff CTR2(KTR_ULE, "sleep kse %p (tick: %d)", 1227113357Sjeff td->td_kse, td->td_slptime); 1228109864Sjeff} 1229109864Sjeff 1230109864Sjeffvoid 1231109864Sjeffsched_wakeup(struct thread *td) 1232109864Sjeff{ 1233109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1234109864Sjeff 1235109864Sjeff /* 1236109864Sjeff * Let the kseg know how long we slept for. This is because process 1237109864Sjeff * interactivity behavior is modeled in the kseg. 1238109864Sjeff */ 1239111788Sjeff if (td->td_slptime) { 1240111788Sjeff struct ksegrp *kg; 1241113357Sjeff int hzticks; 1242109864Sjeff 1243111788Sjeff kg = td->td_ksegrp; 1244121868Sjeff hzticks = (ticks - td->td_slptime) << 10; 1245121868Sjeff if (hzticks >= SCHED_SLP_RUN_MAX) { 1246121868Sjeff kg->kg_slptime = SCHED_SLP_RUN_MAX; 1247121868Sjeff kg->kg_runtime = 1; 1248121868Sjeff } else { 1249121868Sjeff kg->kg_slptime += hzticks; 1250121868Sjeff sched_interact_update(kg); 1251121868Sjeff } 1252111788Sjeff sched_priority(kg); 1253116463Sjeff if (td->td_kse) 1254116463Sjeff sched_slice(td->td_kse); 1255113357Sjeff CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", 1256113357Sjeff td->td_kse, hzticks); 1257111788Sjeff td->td_slptime = 0; 1258109864Sjeff } 1259109864Sjeff setrunqueue(td); 1260109864Sjeff} 1261109864Sjeff 1262109864Sjeff/* 1263109864Sjeff * Penalize the parent for creating a new child and initialize the child's 1264109864Sjeff * priority. 1265109864Sjeff */ 1266109864Sjeffvoid 1267113357Sjeffsched_fork(struct proc *p, struct proc *p1) 1268109864Sjeff{ 1269109864Sjeff 1270109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1271109864Sjeff 1272113357Sjeff sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); 1273113357Sjeff sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); 1274113357Sjeff sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); 1275113357Sjeff} 1276113357Sjeff 1277113357Sjeffvoid 1278113357Sjeffsched_fork_kse(struct kse *ke, struct kse *child) 1279113357Sjeff{ 1280113923Sjhb 1281116365Sjeff child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 1282122847Sjeff child->ke_cpu = ke->ke_cpu; 1283113357Sjeff child->ke_runq = NULL; 1284113357Sjeff 1285121051Sjeff /* Grab our parents cpu estimation information. */ 1286121051Sjeff child->ke_ticks = ke->ke_ticks; 1287121051Sjeff child->ke_ltick = ke->ke_ltick; 1288121051Sjeff child->ke_ftick = ke->ke_ftick; 1289113357Sjeff} 1290113357Sjeff 1291113357Sjeffvoid 1292113357Sjeffsched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) 1293113357Sjeff{ 1294113923Sjhb PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); 1295116365Sjeff 1296121868Sjeff child->kg_slptime = kg->kg_slptime; 1297121868Sjeff child->kg_runtime = kg->kg_runtime; 1298121868Sjeff child->kg_user_pri = kg->kg_user_pri; 1299121868Sjeff child->kg_nice = kg->kg_nice; 1300121868Sjeff sched_interact_fork(child); 1301116463Sjeff kg->kg_runtime += tickincr << 10; 1302116463Sjeff sched_interact_update(kg); 1303113357Sjeff 1304121868Sjeff CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)", 1305121868Sjeff kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime, 1306121868Sjeff child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime); 1307113357Sjeff} 1308109864Sjeff 1309113357Sjeffvoid 1310113357Sjeffsched_fork_thread(struct thread *td, struct thread *child) 1311113357Sjeff{ 1312113357Sjeff} 1313113357Sjeff 1314113357Sjeffvoid 1315113357Sjeffsched_class(struct ksegrp *kg, int class) 1316113357Sjeff{ 1317113357Sjeff struct kseq *kseq; 1318113357Sjeff struct kse *ke; 1319121896Sjeff int nclass; 1320121896Sjeff int oclass; 1321113357Sjeff 1322113923Sjhb mtx_assert(&sched_lock, MA_OWNED); 1323113357Sjeff if (kg->kg_pri_class == class) 1324113357Sjeff return; 1325113357Sjeff 1326121896Sjeff nclass = PRI_BASE(class); 1327121896Sjeff oclass = PRI_BASE(kg->kg_pri_class); 1328113357Sjeff FOREACH_KSE_IN_GROUP(kg, ke) { 1329113357Sjeff if (ke->ke_state != KES_ONRUNQ && 1330113357Sjeff ke->ke_state != KES_THREAD) 1331113357Sjeff continue; 1332113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1333113357Sjeff 1334121896Sjeff#ifdef SMP 1335122744Sjeff /* 1336122744Sjeff * On SMP if we're on the RUNQ we must adjust the transferable 1337122744Sjeff * count because could be changing to or from an interrupt 1338122744Sjeff * class. 1339122744Sjeff */ 1340122744Sjeff if (ke->ke_state == KES_ONRUNQ) { 1341123433Sjeff if (KSE_CAN_MIGRATE(ke, oclass)) { 1342123433Sjeff kseq->ksq_transferable--; 1343123433Sjeff kseq->ksq_group->ksg_transferable--; 1344123433Sjeff } 1345123433Sjeff if (KSE_CAN_MIGRATE(ke, nclass)) { 1346123433Sjeff kseq->ksq_transferable++; 1347123433Sjeff kseq->ksq_group->ksg_transferable++; 1348123433Sjeff } 1349122744Sjeff } 1350121896Sjeff#endif 1351122744Sjeff if (oclass == PRI_TIMESHARE) { 1352121896Sjeff kseq->ksq_load_timeshare--; 1353122744Sjeff kseq_nice_rem(kseq, kg->kg_nice); 1354122744Sjeff } 1355122744Sjeff if (nclass == PRI_TIMESHARE) { 1356121896Sjeff kseq->ksq_load_timeshare++; 1357113357Sjeff kseq_nice_add(kseq, kg->kg_nice); 1358122744Sjeff } 1359109970Sjeff } 1360109970Sjeff 1361113357Sjeff kg->kg_pri_class = class; 1362109864Sjeff} 1363109864Sjeff 1364109864Sjeff/* 1365109864Sjeff * Return some of the child's priority and interactivity to the parent. 1366109864Sjeff */ 1367109864Sjeffvoid 1368113357Sjeffsched_exit(struct proc *p, struct proc *child) 1369109864Sjeff{ 1370109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1371113372Sjeff sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); 1372116365Sjeff sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); 1373109864Sjeff} 1374109864Sjeff 1375109864Sjeffvoid 1376113372Sjeffsched_exit_kse(struct kse *ke, struct kse *child) 1377113372Sjeff{ 1378122744Sjeff kseq_load_rem(KSEQ_CPU(child->ke_cpu), child); 1379113372Sjeff} 1380113372Sjeff 1381113372Sjeffvoid 1382113372Sjeffsched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) 1383113372Sjeff{ 1384116463Sjeff /* kg->kg_slptime += child->kg_slptime; */ 1385116365Sjeff kg->kg_runtime += child->kg_runtime; 1386116463Sjeff sched_interact_update(kg); 1387113372Sjeff} 1388113372Sjeff 1389113372Sjeffvoid 1390113372Sjeffsched_exit_thread(struct thread *td, struct thread *child) 1391113372Sjeff{ 1392113372Sjeff} 1393113372Sjeff 1394113372Sjeffvoid 1395121127Sjeffsched_clock(struct thread *td) 1396109864Sjeff{ 1397113357Sjeff struct kseq *kseq; 1398113357Sjeff struct ksegrp *kg; 1399121127Sjeff struct kse *ke; 1400109864Sjeff 1401113357Sjeff /* 1402113357Sjeff * sched_setup() apparently happens prior to stathz being set. We 1403113357Sjeff * need to resolve the timers earlier in the boot so we can avoid 1404113357Sjeff * calculating this here. 1405113357Sjeff */ 1406113357Sjeff if (realstathz == 0) { 1407113357Sjeff realstathz = stathz ? stathz : hz; 1408113357Sjeff tickincr = hz / realstathz; 1409113357Sjeff /* 1410113357Sjeff * XXX This does not work for values of stathz that are much 1411113357Sjeff * larger than hz. 1412113357Sjeff */ 1413113357Sjeff if (tickincr == 0) 1414113357Sjeff tickincr = 1; 1415113357Sjeff } 1416109864Sjeff 1417121127Sjeff ke = td->td_kse; 1418113357Sjeff kg = ke->ke_ksegrp; 1419109864Sjeff 1420110028Sjeff mtx_assert(&sched_lock, MA_OWNED); 1421110028Sjeff /* Adjust ticks for pctcpu */ 1422111793Sjeff ke->ke_ticks++; 1423109971Sjeff ke->ke_ltick = ticks; 1424112994Sjeff 1425109971Sjeff /* Go up to one second beyond our max and then trim back down */ 1426109971Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1427109971Sjeff sched_pctcpu_update(ke); 1428109971Sjeff 1429114496Sjulian if (td->td_flags & TDF_IDLETD) 1430109864Sjeff return; 1431110028Sjeff 1432113357Sjeff CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", 1433113357Sjeff ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); 1434110028Sjeff /* 1435113357Sjeff * We only do slicing code for TIMESHARE ksegrps. 1436113357Sjeff */ 1437113357Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 1438113357Sjeff return; 1439113357Sjeff /* 1440110645Sjeff * We used a tick charge it to the ksegrp so that we can compute our 1441113357Sjeff * interactivity. 1442109864Sjeff */ 1443113357Sjeff kg->kg_runtime += tickincr << 10; 1444116463Sjeff sched_interact_update(kg); 1445110645Sjeff 1446109864Sjeff /* 1447109864Sjeff * We used up one time slice. 1448109864Sjeff */ 1449122847Sjeff if (--ke->ke_slice > 0) 1450113357Sjeff return; 1451109864Sjeff /* 1452113357Sjeff * We're out of time, recompute priorities and requeue. 1453109864Sjeff */ 1454122847Sjeff kseq = KSEQ_SELF(); 1455122744Sjeff kseq_load_rem(kseq, ke); 1456113357Sjeff sched_priority(kg); 1457113357Sjeff sched_slice(ke); 1458113357Sjeff if (SCHED_CURR(kg, ke)) 1459113357Sjeff ke->ke_runq = kseq->ksq_curr; 1460113357Sjeff else 1461113357Sjeff ke->ke_runq = kseq->ksq_next; 1462122744Sjeff kseq_load_add(kseq, ke); 1463113357Sjeff td->td_flags |= TDF_NEEDRESCHED; 1464109864Sjeff} 1465109864Sjeff 1466109864Sjeffint 1467109864Sjeffsched_runnable(void) 1468109864Sjeff{ 1469109864Sjeff struct kseq *kseq; 1470115998Sjeff int load; 1471109864Sjeff 1472115998Sjeff load = 1; 1473115998Sjeff 1474110028Sjeff kseq = KSEQ_SELF(); 1475121790Sjeff#ifdef SMP 1476122094Sjeff if (kseq->ksq_assigned) { 1477122094Sjeff mtx_lock_spin(&sched_lock); 1478121790Sjeff kseq_assign(kseq); 1479122094Sjeff mtx_unlock_spin(&sched_lock); 1480122094Sjeff } 1481121790Sjeff#endif 1482121605Sjeff if ((curthread->td_flags & TDF_IDLETD) != 0) { 1483121605Sjeff if (kseq->ksq_load > 0) 1484121605Sjeff goto out; 1485121605Sjeff } else 1486121605Sjeff if (kseq->ksq_load - 1 > 0) 1487121605Sjeff goto out; 1488115998Sjeff load = 0; 1489115998Sjeffout: 1490115998Sjeff return (load); 1491109864Sjeff} 1492109864Sjeff 1493109864Sjeffvoid 1494109864Sjeffsched_userret(struct thread *td) 1495109864Sjeff{ 1496109864Sjeff struct ksegrp *kg; 1497121605Sjeff 1498121605Sjeff kg = td->td_ksegrp; 1499109864Sjeff 1500109864Sjeff if (td->td_priority != kg->kg_user_pri) { 1501109864Sjeff mtx_lock_spin(&sched_lock); 1502109864Sjeff td->td_priority = kg->kg_user_pri; 1503109864Sjeff mtx_unlock_spin(&sched_lock); 1504109864Sjeff } 1505109864Sjeff} 1506109864Sjeff 1507109864Sjeffstruct kse * 1508109970Sjeffsched_choose(void) 1509109970Sjeff{ 1510110028Sjeff struct kseq *kseq; 1511109970Sjeff struct kse *ke; 1512109970Sjeff 1513115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 1514121790Sjeff kseq = KSEQ_SELF(); 1515113357Sjeff#ifdef SMP 1516123433Sjeffrestart: 1517121790Sjeff if (kseq->ksq_assigned) 1518121790Sjeff kseq_assign(kseq); 1519113357Sjeff#endif 1520121790Sjeff ke = kseq_choose(kseq); 1521109864Sjeff if (ke) { 1522121790Sjeff#ifdef SMP 1523121790Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1524123433Sjeff if (kseq_idled(kseq) == 0) 1525123433Sjeff goto restart; 1526121790Sjeff#endif 1527122744Sjeff kseq_runq_rem(kseq, ke); 1528109864Sjeff ke->ke_state = KES_THREAD; 1529112966Sjeff 1530113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1531113357Sjeff CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1532113357Sjeff ke, ke->ke_runq, ke->ke_slice, 1533113357Sjeff ke->ke_thread->td_priority); 1534113357Sjeff } 1535113357Sjeff return (ke); 1536109864Sjeff } 1537109970Sjeff#ifdef SMP 1538123433Sjeff if (kseq_idled(kseq) == 0) 1539123433Sjeff goto restart; 1540109970Sjeff#endif 1541113357Sjeff return (NULL); 1542109864Sjeff} 1543109864Sjeff 1544109864Sjeffvoid 1545121127Sjeffsched_add(struct thread *td) 1546109864Sjeff{ 1547110267Sjeff struct kseq *kseq; 1548113357Sjeff struct ksegrp *kg; 1549121127Sjeff struct kse *ke; 1550121790Sjeff int class; 1551109864Sjeff 1552121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 1553121127Sjeff ke = td->td_kse; 1554121127Sjeff kg = td->td_ksegrp; 1555121790Sjeff if (ke->ke_flags & KEF_ASSIGNED) 1556121790Sjeff return; 1557121790Sjeff kseq = KSEQ_SELF(); 1558124958Sjeff KASSERT((ke->ke_thread != NULL), 1559124958Sjeff ("sched_add: No thread on KSE")); 1560109864Sjeff KASSERT((ke->ke_thread->td_kse != NULL), 1561110267Sjeff ("sched_add: No KSE on thread")); 1562109864Sjeff KASSERT(ke->ke_state != KES_ONRUNQ, 1563110267Sjeff ("sched_add: kse %p (%s) already in run queue", ke, 1564109864Sjeff ke->ke_proc->p_comm)); 1565109864Sjeff KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1566110267Sjeff ("sched_add: process swapped out")); 1567113387Sjeff KASSERT(ke->ke_runq == NULL, 1568113387Sjeff ("sched_add: KSE %p is still assigned to a run queue", ke)); 1569109864Sjeff 1570121790Sjeff class = PRI_BASE(kg->kg_pri_class); 1571121790Sjeff switch (class) { 1572112994Sjeff case PRI_ITHD: 1573112994Sjeff case PRI_REALTIME: 1574113357Sjeff ke->ke_runq = kseq->ksq_curr; 1575113357Sjeff ke->ke_slice = SCHED_SLICE_MAX; 1576113660Sjeff ke->ke_cpu = PCPU_GET(cpuid); 1577112994Sjeff break; 1578112994Sjeff case PRI_TIMESHARE: 1579113387Sjeff if (SCHED_CURR(kg, ke)) 1580113387Sjeff ke->ke_runq = kseq->ksq_curr; 1581113387Sjeff else 1582113387Sjeff ke->ke_runq = kseq->ksq_next; 1583113357Sjeff break; 1584112994Sjeff case PRI_IDLE: 1585113357Sjeff /* 1586113357Sjeff * This is for priority prop. 1587113357Sjeff */ 1588121605Sjeff if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1589113357Sjeff ke->ke_runq = kseq->ksq_curr; 1590113357Sjeff else 1591113357Sjeff ke->ke_runq = &kseq->ksq_idle; 1592113357Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1593112994Sjeff break; 1594113357Sjeff default: 1595121868Sjeff panic("Unknown pri class."); 1596113357Sjeff break; 1597112994Sjeff } 1598121790Sjeff#ifdef SMP 1599123433Sjeff if (ke->ke_cpu != PCPU_GET(cpuid)) { 1600123529Sjeff ke->ke_runq = NULL; 1601123433Sjeff kseq_notify(ke, ke->ke_cpu); 1602123433Sjeff return; 1603123433Sjeff } 1604121790Sjeff /* 1605123685Sjeff * If we had been idle, clear our bit in the group and potentially 1606123685Sjeff * the global bitmap. If not, see if we should transfer this thread. 1607121790Sjeff */ 1608123433Sjeff if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1609123433Sjeff (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1610121790Sjeff /* 1611123433Sjeff * Check to see if our group is unidling, and if so, remove it 1612123433Sjeff * from the global idle mask. 1613121790Sjeff */ 1614123433Sjeff if (kseq->ksq_group->ksg_idlemask == 1615123433Sjeff kseq->ksq_group->ksg_cpumask) 1616123433Sjeff atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1617123433Sjeff /* 1618123433Sjeff * Now remove ourselves from the group specific idle mask. 1619123433Sjeff */ 1620123433Sjeff kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1621123685Sjeff } else if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class)) 1622123685Sjeff if (kseq_transfer(kseq, ke, class)) 1623123685Sjeff return; 1624121790Sjeff#endif 1625121790Sjeff if (td->td_priority < curthread->td_priority) 1626121790Sjeff curthread->td_flags |= TDF_NEEDRESCHED; 1627121790Sjeff 1628109864Sjeff ke->ke_ksegrp->kg_runq_kses++; 1629109864Sjeff ke->ke_state = KES_ONRUNQ; 1630109864Sjeff 1631122744Sjeff kseq_runq_add(kseq, ke); 1632122744Sjeff kseq_load_add(kseq, ke); 1633109864Sjeff} 1634109864Sjeff 1635109864Sjeffvoid 1636121127Sjeffsched_rem(struct thread *td) 1637109864Sjeff{ 1638113357Sjeff struct kseq *kseq; 1639121127Sjeff struct kse *ke; 1640113357Sjeff 1641121127Sjeff ke = td->td_kse; 1642121790Sjeff /* 1643121790Sjeff * It is safe to just return here because sched_rem() is only ever 1644121790Sjeff * used in places where we're immediately going to add the 1645121790Sjeff * kse back on again. In that case it'll be added with the correct 1646121790Sjeff * thread and priority when the caller drops the sched_lock. 1647121790Sjeff */ 1648121790Sjeff if (ke->ke_flags & KEF_ASSIGNED) 1649121790Sjeff return; 1650109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1651124958Sjeff KASSERT((ke->ke_state == KES_ONRUNQ), 1652124958Sjeff ("sched_rem: KSE not on run queue")); 1653109864Sjeff 1654109864Sjeff ke->ke_state = KES_THREAD; 1655109864Sjeff ke->ke_ksegrp->kg_runq_kses--; 1656113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1657122744Sjeff kseq_runq_rem(kseq, ke); 1658122744Sjeff kseq_load_rem(kseq, ke); 1659109864Sjeff} 1660109864Sjeff 1661109864Sjefffixpt_t 1662121127Sjeffsched_pctcpu(struct thread *td) 1663109864Sjeff{ 1664109864Sjeff fixpt_t pctcpu; 1665121127Sjeff struct kse *ke; 1666109864Sjeff 1667109864Sjeff pctcpu = 0; 1668121127Sjeff ke = td->td_kse; 1669121290Sjeff if (ke == NULL) 1670121290Sjeff return (0); 1671109864Sjeff 1672115998Sjeff mtx_lock_spin(&sched_lock); 1673109864Sjeff if (ke->ke_ticks) { 1674109864Sjeff int rtick; 1675109864Sjeff 1676116365Sjeff /* 1677116365Sjeff * Don't update more frequently than twice a second. Allowing 1678116365Sjeff * this causes the cpu usage to decay away too quickly due to 1679116365Sjeff * rounding errors. 1680116365Sjeff */ 1681123435Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || 1682123435Sjeff ke->ke_ltick < (ticks - (hz / 2))) 1683116365Sjeff sched_pctcpu_update(ke); 1684109864Sjeff /* How many rtick per second ? */ 1685116365Sjeff rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1686110226Sscottl pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1687109864Sjeff } 1688109864Sjeff 1689109864Sjeff ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1690113865Sjhb mtx_unlock_spin(&sched_lock); 1691109864Sjeff 1692109864Sjeff return (pctcpu); 1693109864Sjeff} 1694109864Sjeff 1695122038Sjeffvoid 1696122038Sjeffsched_bind(struct thread *td, int cpu) 1697122038Sjeff{ 1698122038Sjeff struct kse *ke; 1699122038Sjeff 1700122038Sjeff mtx_assert(&sched_lock, MA_OWNED); 1701122038Sjeff ke = td->td_kse; 1702122038Sjeff ke->ke_flags |= KEF_BOUND; 1703123433Sjeff#ifdef SMP 1704123433Sjeff if (PCPU_GET(cpuid) == cpu) 1705122038Sjeff return; 1706122038Sjeff /* sched_rem without the runq_remove */ 1707122038Sjeff ke->ke_state = KES_THREAD; 1708122038Sjeff ke->ke_ksegrp->kg_runq_kses--; 1709122744Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1710122038Sjeff kseq_notify(ke, cpu); 1711122038Sjeff /* When we return from mi_switch we'll be on the correct cpu. */ 1712124944Sjeff mi_switch(SW_VOL); 1713122038Sjeff#endif 1714122038Sjeff} 1715122038Sjeff 1716122038Sjeffvoid 1717122038Sjeffsched_unbind(struct thread *td) 1718122038Sjeff{ 1719122038Sjeff mtx_assert(&sched_lock, MA_OWNED); 1720122038Sjeff td->td_kse->ke_flags &= ~KEF_BOUND; 1721122038Sjeff} 1722122038Sjeff 1723109864Sjeffint 1724125289Sjeffsched_load(void) 1725125289Sjeff{ 1726125289Sjeff#ifdef SMP 1727125289Sjeff int total; 1728125289Sjeff int i; 1729125289Sjeff 1730125289Sjeff total = 0; 1731125289Sjeff for (i = 0; i <= ksg_maxid; i++) 1732125289Sjeff total += KSEQ_GROUP(i)->ksg_load; 1733125289Sjeff return (total); 1734125289Sjeff#else 1735125289Sjeff return (KSEQ_SELF()->ksq_sysload); 1736125289Sjeff#endif 1737125289Sjeff} 1738125289Sjeff 1739125289Sjeffint 1740109864Sjeffsched_sizeof_kse(void) 1741109864Sjeff{ 1742109864Sjeff return (sizeof(struct kse) + sizeof(struct ke_sched)); 1743109864Sjeff} 1744109864Sjeff 1745109864Sjeffint 1746109864Sjeffsched_sizeof_ksegrp(void) 1747109864Sjeff{ 1748109864Sjeff return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1749109864Sjeff} 1750109864Sjeff 1751109864Sjeffint 1752109864Sjeffsched_sizeof_proc(void) 1753109864Sjeff{ 1754109864Sjeff return (sizeof(struct proc)); 1755109864Sjeff} 1756109864Sjeff 1757109864Sjeffint 1758109864Sjeffsched_sizeof_thread(void) 1759109864Sjeff{ 1760109864Sjeff return (sizeof(struct thread) + sizeof(struct td_sched)); 1761109864Sjeff} 1762