sched_ule.c revision 125289
1109864Sjeff/*- 2113357Sjeff * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3109864Sjeff * All rights reserved. 4109864Sjeff * 5109864Sjeff * Redistribution and use in source and binary forms, with or without 6109864Sjeff * modification, are permitted provided that the following conditions 7109864Sjeff * are met: 8109864Sjeff * 1. Redistributions of source code must retain the above copyright 9109864Sjeff * notice unmodified, this list of conditions, and the following 10109864Sjeff * disclaimer. 11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright 12109864Sjeff * notice, this list of conditions and the following disclaimer in the 13109864Sjeff * documentation and/or other materials provided with the distribution. 14109864Sjeff * 15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25109864Sjeff */ 26109864Sjeff 27116182Sobrien#include <sys/cdefs.h> 28116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 125289 2004-02-01 02:48:36Z jeff $"); 29116182Sobrien 30109864Sjeff#include <sys/param.h> 31109864Sjeff#include <sys/systm.h> 32109864Sjeff#include <sys/kernel.h> 33109864Sjeff#include <sys/ktr.h> 34109864Sjeff#include <sys/lock.h> 35109864Sjeff#include <sys/mutex.h> 36109864Sjeff#include <sys/proc.h> 37112966Sjeff#include <sys/resource.h> 38122038Sjeff#include <sys/resourcevar.h> 39109864Sjeff#include <sys/sched.h> 40109864Sjeff#include <sys/smp.h> 41109864Sjeff#include <sys/sx.h> 42109864Sjeff#include <sys/sysctl.h> 43109864Sjeff#include <sys/sysproto.h> 44109864Sjeff#include <sys/vmmeter.h> 45109864Sjeff#ifdef DDB 46109864Sjeff#include <ddb/ddb.h> 47109864Sjeff#endif 48109864Sjeff#ifdef KTRACE 49109864Sjeff#include <sys/uio.h> 50109864Sjeff#include <sys/ktrace.h> 51109864Sjeff#endif 52109864Sjeff 53109864Sjeff#include <machine/cpu.h> 54121790Sjeff#include <machine/smp.h> 55109864Sjeff 56113357Sjeff#define KTR_ULE KTR_NFS 57113357Sjeff 58109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 59109864Sjeff/* XXX This is bogus compatability crap for ps */ 60109864Sjeffstatic fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 61109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 62109864Sjeff 63109864Sjeffstatic void sched_setup(void *dummy); 64109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 65109864Sjeff 66113357Sjeffstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); 67113357Sjeff 68113357Sjeffstatic int slice_min = 1; 69113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 70113357Sjeff 71116365Sjeffstatic int slice_max = 10; 72113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 73113357Sjeff 74111857Sjeffint realstathz; 75113357Sjeffint tickincr = 
1; 76111857Sjeff 77116069Sjeff#ifdef SMP 78123487Sjeff/* Callouts to handle load balancing SMP systems. */ 79116069Sjeffstatic struct callout kseq_lb_callout; 80123487Sjeffstatic struct callout kseq_group_callout; 81116069Sjeff#endif 82116069Sjeff 83109864Sjeff/* 84109864Sjeff * These datastructures are allocated within their parent datastructure but 85109864Sjeff * are scheduler specific. 86109864Sjeff */ 87109864Sjeff 88109864Sjeffstruct ke_sched { 89109864Sjeff int ske_slice; 90109864Sjeff struct runq *ske_runq; 91109864Sjeff /* The following variables are only used for pctcpu calculation */ 92109864Sjeff int ske_ltick; /* Last tick that we were running on */ 93109864Sjeff int ske_ftick; /* First tick that we were running on */ 94109864Sjeff int ske_ticks; /* Tick count */ 95113357Sjeff /* CPU that we have affinity for. */ 96110260Sjeff u_char ske_cpu; 97109864Sjeff}; 98109864Sjeff#define ke_slice ke_sched->ske_slice 99109864Sjeff#define ke_runq ke_sched->ske_runq 100109864Sjeff#define ke_ltick ke_sched->ske_ltick 101109864Sjeff#define ke_ftick ke_sched->ske_ftick 102109864Sjeff#define ke_ticks ke_sched->ske_ticks 103110260Sjeff#define ke_cpu ke_sched->ske_cpu 104121790Sjeff#define ke_assign ke_procq.tqe_next 105109864Sjeff 106121790Sjeff#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */ 107122158Sjeff#define KEF_BOUND KEF_SCHED1 /* KSE can not migrate. */ 108121790Sjeff 109109864Sjeffstruct kg_sched { 110110645Sjeff int skg_slptime; /* Number of ticks we vol. 
slept */ 111110645Sjeff int skg_runtime; /* Number of ticks we were running */ 112109864Sjeff}; 113109864Sjeff#define kg_slptime kg_sched->skg_slptime 114110645Sjeff#define kg_runtime kg_sched->skg_runtime 115109864Sjeff 116109864Sjeffstruct td_sched { 117109864Sjeff int std_slptime; 118109864Sjeff}; 119109864Sjeff#define td_slptime td_sched->std_slptime 120109864Sjeff 121110267Sjeffstruct td_sched td_sched; 122109864Sjeffstruct ke_sched ke_sched; 123109864Sjeffstruct kg_sched kg_sched; 124109864Sjeff 125109864Sjeffstruct ke_sched *kse0_sched = &ke_sched; 126109864Sjeffstruct kg_sched *ksegrp0_sched = &kg_sched; 127109864Sjeffstruct p_sched *proc0_sched = NULL; 128109864Sjeffstruct td_sched *thread0_sched = &td_sched; 129109864Sjeff 130109864Sjeff/* 131116642Sjeff * The priority is primarily determined by the interactivity score. Thus, we 132116642Sjeff * give lower(better) priorities to kse groups that use less CPU. The nice 133116642Sjeff * value is then directly added to this to allow nice to have some effect 134116642Sjeff * on latency. 135111857Sjeff * 136111857Sjeff * PRI_RANGE: Total priority range for timeshare threads. 137116642Sjeff * PRI_NRESV: Number of nice values. 138111857Sjeff * PRI_BASE: The start of the dynamic range. 139109864Sjeff */ 140111857Sjeff#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 141121869Sjeff#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 142121869Sjeff#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 143116642Sjeff#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 144113357Sjeff#define SCHED_PRI_INTERACT(score) \ 145116642Sjeff ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 146109864Sjeff 147109864Sjeff/* 148111857Sjeff * These determine the interactivity of a process. 149109864Sjeff * 150110645Sjeff * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 151110645Sjeff * before throttling back. 152121868Sjeff * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 
153116365Sjeff * INTERACT_MAX: Maximum interactivity value. Smaller is better. 154111857Sjeff * INTERACT_THRESH: Threshhold for placement on the current runq. 155109864Sjeff */ 156121126Sjeff#define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 157121868Sjeff#define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 158116365Sjeff#define SCHED_INTERACT_MAX (100) 159116365Sjeff#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 160121126Sjeff#define SCHED_INTERACT_THRESH (30) 161111857Sjeff 162109864Sjeff/* 163109864Sjeff * These parameters and macros determine the size of the time slice that is 164109864Sjeff * granted to each thread. 165109864Sjeff * 166109864Sjeff * SLICE_MIN: Minimum time slice granted, in units of ticks. 167109864Sjeff * SLICE_MAX: Maximum time slice granted. 168109864Sjeff * SLICE_RANGE: Range of available time slices scaled by hz. 169112966Sjeff * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 170112966Sjeff * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 171121871Sjeff * SLICE_NTHRESH: The nice cutoff point for slice assignment. 172109864Sjeff */ 173113357Sjeff#define SCHED_SLICE_MIN (slice_min) 174113357Sjeff#define SCHED_SLICE_MAX (slice_max) 175123684Sjeff#define SCHED_SLICE_INTERACTIVE (slice_min * 4) 176121871Sjeff#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 177111857Sjeff#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 178109864Sjeff#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 179112966Sjeff#define SCHED_SLICE_NICE(nice) \ 180121871Sjeff (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 181109864Sjeff 182109864Sjeff/* 183109864Sjeff * This macro determines whether or not the kse belongs on the current or 184109864Sjeff * next run queue. 
185109864Sjeff */ 186113357Sjeff#define SCHED_INTERACTIVE(kg) \ 187113357Sjeff (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 188113417Sjeff#define SCHED_CURR(kg, ke) \ 189121107Sjeff (ke->ke_thread->td_priority != kg->kg_user_pri || \ 190121107Sjeff SCHED_INTERACTIVE(kg)) 191109864Sjeff 192109864Sjeff/* 193109864Sjeff * Cpu percentage computation macros and defines. 194109864Sjeff * 195109864Sjeff * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 196109864Sjeff * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 197109864Sjeff */ 198109864Sjeff 199112971Sjeff#define SCHED_CPU_TIME 10 200109864Sjeff#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 201109864Sjeff 202109864Sjeff/* 203113357Sjeff * kseq - per processor runqs and statistics. 204109864Sjeff */ 205109864Sjeffstruct kseq { 206113357Sjeff struct runq ksq_idle; /* Queue of IDLE threads. */ 207113357Sjeff struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 208113357Sjeff struct runq *ksq_next; /* Next timeshare queue. */ 209113357Sjeff struct runq *ksq_curr; /* Current queue. */ 210121896Sjeff int ksq_load_timeshare; /* Load for timeshare. */ 211113357Sjeff int ksq_load; /* Aggregate load. */ 212121869Sjeff short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 213113357Sjeff short ksq_nicemin; /* Least nice. */ 214110267Sjeff#ifdef SMP 215123433Sjeff int ksq_transferable; 216123433Sjeff LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 217123433Sjeff struct kseq_group *ksq_group; /* Our processor group. */ 218123433Sjeff volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 219125289Sjeff#else 220125289Sjeff int ksq_sysload; /* For loadavg, !ITHD load. */ 221110267Sjeff#endif 222109864Sjeff}; 223109864Sjeff 224123433Sjeff#ifdef SMP 225109864Sjeff/* 226123433Sjeff * kseq groups are groups of processors which can cheaply share threads. 
When 227123433Sjeff * one processor in the group goes idle it will check the runqs of the other 228123433Sjeff * processors in its group prior to halting and waiting for an interrupt. 229123433Sjeff * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 230123433Sjeff * In a numa environment we'd want an idle bitmap per group and a two tiered 231123433Sjeff * load balancer. 232123433Sjeff */ 233123433Sjeffstruct kseq_group { 234123433Sjeff int ksg_cpus; /* Count of CPUs in this kseq group. */ 235123433Sjeff int ksg_cpumask; /* Mask of cpus in this group. */ 236123433Sjeff int ksg_idlemask; /* Idle cpus in this group. */ 237123433Sjeff int ksg_mask; /* Bit mask for first cpu. */ 238123487Sjeff int ksg_load; /* Total load of this group. */ 239123433Sjeff int ksg_transferable; /* Transferable load of this group. */ 240123433Sjeff LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 241123433Sjeff}; 242123433Sjeff#endif 243123433Sjeff 244123433Sjeff/* 245109864Sjeff * One kse queue per processor. 
246109864Sjeff */ 247110028Sjeff#ifdef SMP 248121790Sjeffstatic int kseq_idle; 249123487Sjeffstatic int ksg_maxid; 250121790Sjeffstatic struct kseq kseq_cpu[MAXCPU]; 251123433Sjeffstatic struct kseq_group kseq_groups[MAXCPU]; 252123433Sjeff#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 253123433Sjeff#define KSEQ_CPU(x) (&kseq_cpu[(x)]) 254123487Sjeff#define KSEQ_ID(x) ((x) - kseq_cpu) 255123487Sjeff#define KSEQ_GROUP(x) (&kseq_groups[(x)]) 256123433Sjeff#else /* !SMP */ 257121790Sjeffstatic struct kseq kseq_cpu; 258110028Sjeff#define KSEQ_SELF() (&kseq_cpu) 259110028Sjeff#define KSEQ_CPU(x) (&kseq_cpu) 260110028Sjeff#endif 261109864Sjeff 262112966Sjeffstatic void sched_slice(struct kse *ke); 263113357Sjeffstatic void sched_priority(struct ksegrp *kg); 264111857Sjeffstatic int sched_interact_score(struct ksegrp *kg); 265116463Sjeffstatic void sched_interact_update(struct ksegrp *kg); 266121868Sjeffstatic void sched_interact_fork(struct ksegrp *kg); 267121790Sjeffstatic void sched_pctcpu_update(struct kse *ke); 268109864Sjeff 269110267Sjeff/* Operations on per processor queues */ 270121790Sjeffstatic struct kse * kseq_choose(struct kseq *kseq); 271110028Sjeffstatic void kseq_setup(struct kseq *kseq); 272122744Sjeffstatic void kseq_load_add(struct kseq *kseq, struct kse *ke); 273122744Sjeffstatic void kseq_load_rem(struct kseq *kseq, struct kse *ke); 274122744Sjeffstatic __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke); 275122744Sjeffstatic __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 276113357Sjeffstatic void kseq_nice_add(struct kseq *kseq, int nice); 277113357Sjeffstatic void kseq_nice_rem(struct kseq *kseq, int nice); 278113660Sjeffvoid kseq_print(int cpu); 279110267Sjeff#ifdef SMP 280123433Sjeffstatic int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 281121790Sjeffstatic struct kse *runq_steal(struct runq *rq); 282122744Sjeffstatic void sched_balance(void *arg); 283123487Sjeffstatic void sched_balance_group(struct 
kseq_group *ksg); 284123487Sjeffstatic void sched_balance_pair(struct kseq *high, struct kseq *low); 285121790Sjeffstatic void kseq_move(struct kseq *from, int cpu); 286123433Sjeffstatic int kseq_idled(struct kseq *kseq); 287121790Sjeffstatic void kseq_notify(struct kse *ke, int cpu); 288121790Sjeffstatic void kseq_assign(struct kseq *); 289123433Sjeffstatic struct kse *kseq_steal(struct kseq *kseq, int stealidle); 290123693Sjeff/* 291123693Sjeff * On P4 Xeons the round-robin interrupt delivery is broken. As a result of 292123693Sjeff * this, we can't pin interrupts to the cpu that they were delivered to, 293123693Sjeff * otherwise all ithreads only run on CPU 0. 294123693Sjeff */ 295123693Sjeff#ifdef __i386__ 296122038Sjeff#define KSE_CAN_MIGRATE(ke, class) \ 297123693Sjeff ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 298123693Sjeff#else /* !__i386__ */ 299123693Sjeff#define KSE_CAN_MIGRATE(ke, class) \ 300122158Sjeff ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ 301122165Sjeff ((ke)->ke_flags & KEF_BOUND) == 0) 302123693Sjeff#endif /* !__i386__ */ 303121790Sjeff#endif 304110028Sjeff 305113357Sjeffvoid 306113660Sjeffkseq_print(int cpu) 307110267Sjeff{ 308113660Sjeff struct kseq *kseq; 309113357Sjeff int i; 310112994Sjeff 311113660Sjeff kseq = KSEQ_CPU(cpu); 312112994Sjeff 313113357Sjeff printf("kseq:\n"); 314113357Sjeff printf("\tload: %d\n", kseq->ksq_load); 315122744Sjeff printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 316121896Sjeff#ifdef SMP 317123433Sjeff printf("\tload transferable: %d\n", kseq->ksq_transferable); 318121896Sjeff#endif 319113357Sjeff printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 320113357Sjeff printf("\tnice counts:\n"); 321121869Sjeff for (i = 0; i < SCHED_PRI_NRESV; i++) 322113357Sjeff if (kseq->ksq_nice[i]) 323113357Sjeff printf("\t\t%d = %d\n", 324113357Sjeff i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 325113357Sjeff} 326112994Sjeff 327122744Sjeffstatic __inline void 
328122744Sjeffkseq_runq_add(struct kseq *kseq, struct kse *ke) 329122744Sjeff{ 330122744Sjeff#ifdef SMP 331123433Sjeff if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 332123433Sjeff kseq->ksq_transferable++; 333123433Sjeff kseq->ksq_group->ksg_transferable++; 334123433Sjeff } 335122744Sjeff#endif 336122744Sjeff runq_add(ke->ke_runq, ke); 337122744Sjeff} 338122744Sjeff 339122744Sjeffstatic __inline void 340122744Sjeffkseq_runq_rem(struct kseq *kseq, struct kse *ke) 341122744Sjeff{ 342122744Sjeff#ifdef SMP 343123433Sjeff if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 344123433Sjeff kseq->ksq_transferable--; 345123433Sjeff kseq->ksq_group->ksg_transferable--; 346123433Sjeff } 347122744Sjeff#endif 348122744Sjeff runq_remove(ke->ke_runq, ke); 349122744Sjeff} 350122744Sjeff 351113357Sjeffstatic void 352122744Sjeffkseq_load_add(struct kseq *kseq, struct kse *ke) 353113357Sjeff{ 354121896Sjeff int class; 355115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 356121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 357121896Sjeff if (class == PRI_TIMESHARE) 358121896Sjeff kseq->ksq_load_timeshare++; 359113357Sjeff kseq->ksq_load++; 360125289Sjeff if (class != PRI_ITHD) 361123487Sjeff#ifdef SMP 362123487Sjeff kseq->ksq_group->ksg_load++; 363125289Sjeff#else 364125289Sjeff kseq->ksq_sysload++; 365123487Sjeff#endif 366113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 367122744Sjeff CTR6(KTR_ULE, 368122744Sjeff "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 369122744Sjeff ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 370122744Sjeff ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); 371113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 372113357Sjeff kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); 373110267Sjeff} 374113357Sjeff 375112994Sjeffstatic void 376122744Sjeffkseq_load_rem(struct kseq *kseq, struct kse *ke) 377110267Sjeff{ 378121896Sjeff int class; 379115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 
380121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 381121896Sjeff if (class == PRI_TIMESHARE) 382121896Sjeff kseq->ksq_load_timeshare--; 383125289Sjeff if (class != PRI_ITHD) 384123487Sjeff#ifdef SMP 385123487Sjeff kseq->ksq_group->ksg_load--; 386125289Sjeff#else 387125289Sjeff kseq->ksq_sysload--; 388123487Sjeff#endif 389113357Sjeff kseq->ksq_load--; 390113357Sjeff ke->ke_runq = NULL; 391113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 392113357Sjeff kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); 393110267Sjeff} 394110267Sjeff 395113357Sjeffstatic void 396113357Sjeffkseq_nice_add(struct kseq *kseq, int nice) 397110267Sjeff{ 398115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 399113357Sjeff /* Normalize to zero. */ 400113357Sjeff kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 401121896Sjeff if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 402113357Sjeff kseq->ksq_nicemin = nice; 403110267Sjeff} 404110267Sjeff 405113357Sjeffstatic void 406113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice) 407110267Sjeff{ 408113357Sjeff int n; 409113357Sjeff 410115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 411113357Sjeff /* Normalize to zero. */ 412113357Sjeff n = nice + SCHED_PRI_NHALF; 413113357Sjeff kseq->ksq_nice[n]--; 414113357Sjeff KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 415113357Sjeff 416113357Sjeff /* 417113357Sjeff * If this wasn't the smallest nice value or there are more in 418113357Sjeff * this bucket we can just return. Otherwise we have to recalculate 419113357Sjeff * the smallest nice. 
420113357Sjeff */ 421113357Sjeff if (nice != kseq->ksq_nicemin || 422113357Sjeff kseq->ksq_nice[n] != 0 || 423121896Sjeff kseq->ksq_load_timeshare == 0) 424113357Sjeff return; 425113357Sjeff 426121869Sjeff for (; n < SCHED_PRI_NRESV; n++) 427113357Sjeff if (kseq->ksq_nice[n]) { 428113357Sjeff kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 429113357Sjeff return; 430113357Sjeff } 431110267Sjeff} 432110267Sjeff 433113357Sjeff#ifdef SMP 434116069Sjeff/* 435122744Sjeff * sched_balance is a simple CPU load balancing algorithm. It operates by 436116069Sjeff * finding the least loaded and most loaded cpu and equalizing their load 437116069Sjeff * by migrating some processes. 438116069Sjeff * 439116069Sjeff * Dealing only with two CPUs at a time has two advantages. Firstly, most 440116069Sjeff * installations will only have 2 cpus. Secondly, load balancing too much at 441116069Sjeff * once can have an unpleasant effect on the system. The scheduler rarely has 442116069Sjeff * enough information to make perfect decisions. So this algorithm chooses 443116069Sjeff * algorithm simplicity and more gradual effects on load in larger systems. 444116069Sjeff * 445116069Sjeff * It could be improved by considering the priorities and slices assigned to 446116069Sjeff * each task prior to balancing them. There are many pathological cases with 447116069Sjeff * any approach and so the semi random algorithm below may work as well as any. 
448116069Sjeff * 449116069Sjeff */ 450121790Sjeffstatic void 451122744Sjeffsched_balance(void *arg) 452116069Sjeff{ 453123487Sjeff struct kseq_group *high; 454123487Sjeff struct kseq_group *low; 455123487Sjeff struct kseq_group *ksg; 456123487Sjeff int timo; 457123487Sjeff int cnt; 458123487Sjeff int i; 459123487Sjeff 460123487Sjeff mtx_lock_spin(&sched_lock); 461123487Sjeff if (smp_started == 0) 462123487Sjeff goto out; 463123487Sjeff low = high = NULL; 464123487Sjeff i = random() % (ksg_maxid + 1); 465123487Sjeff for (cnt = 0; cnt <= ksg_maxid; cnt++) { 466123487Sjeff ksg = KSEQ_GROUP(i); 467123487Sjeff /* 468123487Sjeff * Find the CPU with the highest load that has some 469123487Sjeff * threads to transfer. 470123487Sjeff */ 471123487Sjeff if ((high == NULL || ksg->ksg_load > high->ksg_load) 472123487Sjeff && ksg->ksg_transferable) 473123487Sjeff high = ksg; 474123487Sjeff if (low == NULL || ksg->ksg_load < low->ksg_load) 475123487Sjeff low = ksg; 476123487Sjeff if (++i > ksg_maxid) 477123487Sjeff i = 0; 478123487Sjeff } 479123487Sjeff if (low != NULL && high != NULL && high != low) 480123487Sjeff sched_balance_pair(LIST_FIRST(&high->ksg_members), 481123487Sjeff LIST_FIRST(&low->ksg_members)); 482123487Sjeffout: 483123487Sjeff mtx_unlock_spin(&sched_lock); 484123487Sjeff timo = random() % (hz * 2); 485123487Sjeff callout_reset(&kseq_lb_callout, timo, sched_balance, NULL); 486123487Sjeff} 487123487Sjeff 488123487Sjeffstatic void 489123487Sjeffsched_balance_groups(void *arg) 490123487Sjeff{ 491123487Sjeff int timo; 492123487Sjeff int i; 493123487Sjeff 494123487Sjeff mtx_lock_spin(&sched_lock); 495123487Sjeff if (smp_started) 496123487Sjeff for (i = 0; i <= ksg_maxid; i++) 497123487Sjeff sched_balance_group(KSEQ_GROUP(i)); 498123487Sjeff mtx_unlock_spin(&sched_lock); 499123487Sjeff timo = random() % (hz * 2); 500123487Sjeff callout_reset(&kseq_group_callout, timo, sched_balance_groups, NULL); 501123487Sjeff} 502123487Sjeff 503123487Sjeffstatic void 
504123487Sjeffsched_balance_group(struct kseq_group *ksg) 505123487Sjeff{ 506116069Sjeff struct kseq *kseq; 507123487Sjeff struct kseq *high; 508123487Sjeff struct kseq *low; 509123487Sjeff int load; 510123487Sjeff 511123487Sjeff if (ksg->ksg_transferable == 0) 512123487Sjeff return; 513123487Sjeff low = NULL; 514123487Sjeff high = NULL; 515123487Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 516123487Sjeff load = kseq->ksq_load; 517123487Sjeff if (kseq == KSEQ_CPU(0)) 518123487Sjeff load--; 519123487Sjeff if (high == NULL || load > high->ksq_load) 520123487Sjeff high = kseq; 521123487Sjeff if (low == NULL || load < low->ksq_load) 522123487Sjeff low = kseq; 523123487Sjeff } 524123487Sjeff if (high != NULL && low != NULL && high != low) 525123487Sjeff sched_balance_pair(high, low); 526123487Sjeff} 527123487Sjeff 528123487Sjeffstatic void 529123487Sjeffsched_balance_pair(struct kseq *high, struct kseq *low) 530123487Sjeff{ 531123433Sjeff int transferable; 532116069Sjeff int high_load; 533116069Sjeff int low_load; 534116069Sjeff int move; 535116069Sjeff int diff; 536116069Sjeff int i; 537116069Sjeff 538116069Sjeff /* 539123433Sjeff * If we're transfering within a group we have to use this specific 540123433Sjeff * kseq's transferable count, otherwise we can steal from other members 541123433Sjeff * of the group. 542123433Sjeff */ 543123487Sjeff if (high->ksq_group == low->ksq_group) { 544123487Sjeff transferable = high->ksq_transferable; 545123487Sjeff high_load = high->ksq_load; 546123487Sjeff low_load = low->ksq_load; 547123487Sjeff /* 548123487Sjeff * XXX If we encounter cpu 0 we must remember to reduce it's 549123487Sjeff * load by 1 to reflect the swi that is running the callout. 550123487Sjeff * At some point we should really fix load balancing of the 551123487Sjeff * swi and then this wont matter. 
552123487Sjeff */ 553123487Sjeff if (high == KSEQ_CPU(0)) 554123487Sjeff high_load--; 555123487Sjeff if (low == KSEQ_CPU(0)) 556123487Sjeff low_load--; 557123487Sjeff } else { 558123487Sjeff transferable = high->ksq_group->ksg_transferable; 559123487Sjeff high_load = high->ksq_group->ksg_load; 560123487Sjeff low_load = low->ksq_group->ksg_load; 561123487Sjeff } 562123433Sjeff if (transferable == 0) 563123487Sjeff return; 564123433Sjeff /* 565122744Sjeff * Determine what the imbalance is and then adjust that to how many 566123433Sjeff * kses we actually have to give up (transferable). 567122744Sjeff */ 568123487Sjeff diff = high_load - low_load; 569116069Sjeff move = diff / 2; 570116069Sjeff if (diff & 0x1) 571116069Sjeff move++; 572123433Sjeff move = min(move, transferable); 573116069Sjeff for (i = 0; i < move; i++) 574123487Sjeff kseq_move(high, KSEQ_ID(low)); 575116069Sjeff return; 576116069Sjeff} 577116069Sjeff 578121790Sjeffstatic void 579116069Sjeffkseq_move(struct kseq *from, int cpu) 580116069Sjeff{ 581123433Sjeff struct kseq *kseq; 582123433Sjeff struct kseq *to; 583116069Sjeff struct kse *ke; 584116069Sjeff 585123433Sjeff kseq = from; 586123433Sjeff to = KSEQ_CPU(cpu); 587123433Sjeff ke = kseq_steal(kseq, 1); 588123433Sjeff if (ke == NULL) { 589123433Sjeff struct kseq_group *ksg; 590123433Sjeff 591123433Sjeff ksg = kseq->ksq_group; 592123433Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 593123433Sjeff if (kseq == from || kseq->ksq_transferable == 0) 594123433Sjeff continue; 595123433Sjeff ke = kseq_steal(kseq, 1); 596123433Sjeff break; 597123433Sjeff } 598123433Sjeff if (ke == NULL) 599123433Sjeff panic("kseq_move: No KSEs available with a " 600123433Sjeff "transferable count of %d\n", 601123433Sjeff ksg->ksg_transferable); 602123433Sjeff } 603123433Sjeff if (kseq == to) 604123433Sjeff return; 605116069Sjeff ke->ke_state = KES_THREAD; 606123433Sjeff kseq_runq_rem(kseq, ke); 607123433Sjeff kseq_load_rem(kseq, ke); 608121923Sjeff kseq_notify(ke, 
cpu); 609116069Sjeff} 610110267Sjeff 611123433Sjeffstatic int 612123433Sjeffkseq_idled(struct kseq *kseq) 613121790Sjeff{ 614123433Sjeff struct kseq_group *ksg; 615123433Sjeff struct kseq *steal; 616123433Sjeff struct kse *ke; 617123433Sjeff 618123433Sjeff ksg = kseq->ksq_group; 619123433Sjeff /* 620123433Sjeff * If we're in a cpu group, try and steal kses from another cpu in 621123433Sjeff * the group before idling. 622123433Sjeff */ 623123433Sjeff if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 624123433Sjeff LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 625123433Sjeff if (steal == kseq || steal->ksq_transferable == 0) 626123433Sjeff continue; 627123433Sjeff ke = kseq_steal(steal, 0); 628123433Sjeff if (ke == NULL) 629123433Sjeff continue; 630123433Sjeff ke->ke_state = KES_THREAD; 631123433Sjeff kseq_runq_rem(steal, ke); 632123433Sjeff kseq_load_rem(steal, ke); 633123433Sjeff ke->ke_cpu = PCPU_GET(cpuid); 634123433Sjeff sched_add(ke->ke_thread); 635123433Sjeff return (0); 636123433Sjeff } 637123433Sjeff } 638123433Sjeff /* 639123433Sjeff * We only set the idled bit when all of the cpus in the group are 640123433Sjeff * idle. Otherwise we could get into a situation where a KSE bounces 641123433Sjeff * back and forth between two idle cores on seperate physical CPUs. 
642123433Sjeff */ 643123433Sjeff ksg->ksg_idlemask |= PCPU_GET(cpumask); 644123433Sjeff if (ksg->ksg_idlemask != ksg->ksg_cpumask) 645123433Sjeff return (1); 646123433Sjeff atomic_set_int(&kseq_idle, ksg->ksg_mask); 647123433Sjeff return (1); 648121790Sjeff} 649121790Sjeff 650121790Sjeffstatic void 651121790Sjeffkseq_assign(struct kseq *kseq) 652121790Sjeff{ 653121790Sjeff struct kse *nke; 654121790Sjeff struct kse *ke; 655121790Sjeff 656121790Sjeff do { 657122848Sjeff (volatile struct kse *)ke = kseq->ksq_assigned; 658121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 659121790Sjeff for (; ke != NULL; ke = nke) { 660121790Sjeff nke = ke->ke_assign; 661121790Sjeff ke->ke_flags &= ~KEF_ASSIGNED; 662121790Sjeff sched_add(ke->ke_thread); 663121790Sjeff } 664121790Sjeff} 665121790Sjeff 666121790Sjeffstatic void 667121790Sjeffkseq_notify(struct kse *ke, int cpu) 668121790Sjeff{ 669121790Sjeff struct kseq *kseq; 670121790Sjeff struct thread *td; 671121790Sjeff struct pcpu *pcpu; 672121790Sjeff 673123529Sjeff ke->ke_cpu = cpu; 674121790Sjeff ke->ke_flags |= KEF_ASSIGNED; 675121790Sjeff 676121790Sjeff kseq = KSEQ_CPU(cpu); 677121790Sjeff 678121790Sjeff /* 679121790Sjeff * Place a KSE on another cpu's queue and force a resched. 
680121790Sjeff */ 681121790Sjeff do { 682122848Sjeff (volatile struct kse *)ke->ke_assign = kseq->ksq_assigned; 683121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 684121790Sjeff pcpu = pcpu_find(cpu); 685121790Sjeff td = pcpu->pc_curthread; 686121790Sjeff if (ke->ke_thread->td_priority < td->td_priority || 687121790Sjeff td == pcpu->pc_idlethread) { 688121790Sjeff td->td_flags |= TDF_NEEDRESCHED; 689121790Sjeff ipi_selected(1 << cpu, IPI_AST); 690121790Sjeff } 691121790Sjeff} 692121790Sjeff 693121790Sjeffstatic struct kse * 694121790Sjeffrunq_steal(struct runq *rq) 695121790Sjeff{ 696121790Sjeff struct rqhead *rqh; 697121790Sjeff struct rqbits *rqb; 698121790Sjeff struct kse *ke; 699121790Sjeff int word; 700121790Sjeff int bit; 701121790Sjeff 702121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 703121790Sjeff rqb = &rq->rq_status; 704121790Sjeff for (word = 0; word < RQB_LEN; word++) { 705121790Sjeff if (rqb->rqb_bits[word] == 0) 706121790Sjeff continue; 707121790Sjeff for (bit = 0; bit < RQB_BPW; bit++) { 708123231Speter if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 709121790Sjeff continue; 710121790Sjeff rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 711121790Sjeff TAILQ_FOREACH(ke, rqh, ke_procq) { 712121896Sjeff if (KSE_CAN_MIGRATE(ke, 713121896Sjeff PRI_BASE(ke->ke_ksegrp->kg_pri_class))) 714121790Sjeff return (ke); 715121790Sjeff } 716121790Sjeff } 717121790Sjeff } 718121790Sjeff return (NULL); 719121790Sjeff} 720121790Sjeff 721121790Sjeffstatic struct kse * 722123433Sjeffkseq_steal(struct kseq *kseq, int stealidle) 723121790Sjeff{ 724121790Sjeff struct kse *ke; 725121790Sjeff 726123433Sjeff /* 727123433Sjeff * Steal from next first to try to get a non-interactive task that 728123433Sjeff * may not have run for a while. 
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

/*
 * Try to push a kse off of this kseq: first to a cpu in some idle group,
 * then to an idle cpu within our own group.  Returns 1 if the kse was
 * handed off via kseq_notify() (ke_runq is cleared), 0 if the caller
 * should keep it locally.
 */
int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	ksg = kseq->ksq_group;

	/*
	 * If there are any idle groups, give them our extra load.  The
	 * threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	if (ksg->ksg_load > (ksg->ksg_cpus * 2) && kseq_idle) {
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			ksg->ksg_idlemask &= ~(1 << (cpu - 1));
	}
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	if (cpu) {
		cpu--;		/* ffs() is 1-based; convert to cpu id. */
		ke->ke_runq = NULL;
		kseq_notify(ke, cpu);
		return (1);
	}
	return (0);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			/* Recompute the slice and demote to the next queue. */
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	/* Both timeshare queues are empty; fall back to the idle queue. */
	return (runq_choose(&kseq->ksq_idle));
}

/*
 * Initialize a per-cpu queue structure: two timeshare run queues that are
 * swapped as they drain, plus an idle-class queue.
 */
static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

/*
 * Boot-time scheduler initialization (SI_SUB_RUN_QUEUE SYSINIT).  Sets the
 * slice bounds, builds the per-cpu queues and cpu groups (from smp_topology
 * when available, else one single-cpu group per cpu), and kicks off the
 * load-balancer callouts.
 */
static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kse group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					/* ksg_mask holds the first member. */
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	callout_init(&kseq_group_callout, CALLOUT_MPSAFE);
	sched_balance(NULL);
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	if (balance_groups)
		callout_reset(&kseq_group_callout, hz / 2,
		    sched_balance_groups, NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	/* Account for the bootstrap kse on the boot cpu's queue. */
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	/* Only timeshare groups have a computed user priority. */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	/* Interactivity score maps to a base priority, offset by nice. */
	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	/* Clamp into the timeshare priority band. */
	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the last nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		/* Distance from the nicest (most negative) kseg on this cpu. */
		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
1032121868Sjeff */ 1033116463Sjeffstatic void 1034116463Sjeffsched_interact_update(struct ksegrp *kg) 1035116463Sjeff{ 1036121868Sjeff int sum; 1037121605Sjeff 1038121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1039121868Sjeff if (sum < SCHED_SLP_RUN_MAX) 1040121868Sjeff return; 1041121868Sjeff /* 1042121868Sjeff * If we have exceeded by more than 1/5th then the algorithm below 1043121868Sjeff * will not bring us back into range. Dividing by two here forces 1044121868Sjeff * us into the range of [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1045121868Sjeff */ 1046121868Sjeff if (sum > (SCHED_INTERACT_MAX / 5) * 6) { 1047121868Sjeff kg->kg_runtime /= 2; 1048121868Sjeff kg->kg_slptime /= 2; 1049121868Sjeff return; 1050116463Sjeff } 1051121868Sjeff kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1052121868Sjeff kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1053116463Sjeff} 1054116463Sjeff 1055121868Sjeffstatic void 1056121868Sjeffsched_interact_fork(struct ksegrp *kg) 1057121868Sjeff{ 1058121868Sjeff int ratio; 1059121868Sjeff int sum; 1060121868Sjeff 1061121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1062121868Sjeff if (sum > SCHED_SLP_RUN_FORK) { 1063121868Sjeff ratio = sum / SCHED_SLP_RUN_FORK; 1064121868Sjeff kg->kg_runtime /= ratio; 1065121868Sjeff kg->kg_slptime /= ratio; 1066121868Sjeff } 1067121868Sjeff} 1068121868Sjeff 1069111857Sjeffstatic int 1070111857Sjeffsched_interact_score(struct ksegrp *kg) 1071111857Sjeff{ 1072116365Sjeff int div; 1073111857Sjeff 1074111857Sjeff if (kg->kg_runtime > kg->kg_slptime) { 1075116365Sjeff div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1076116365Sjeff return (SCHED_INTERACT_HALF + 1077116365Sjeff (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1078116365Sjeff } if (kg->kg_slptime > kg->kg_runtime) { 1079116365Sjeff div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1080116365Sjeff return (kg->kg_runtime / div); 1081111857Sjeff } 1082111857Sjeff 1083116365Sjeff /* 1084116365Sjeff * This can happen if slptime and 
runtime are 0. 1085116365Sjeff */ 1086116365Sjeff return (0); 1087111857Sjeff 1088111857Sjeff} 1089111857Sjeff 1090113357Sjeff/* 1091113357Sjeff * This is only somewhat accurate since given many processes of the same 1092113357Sjeff * priority they will switch when their slices run out, which will be 1093113357Sjeff * at most SCHED_SLICE_MAX. 1094113357Sjeff */ 1095109864Sjeffint 1096109864Sjeffsched_rr_interval(void) 1097109864Sjeff{ 1098109864Sjeff return (SCHED_SLICE_MAX); 1099109864Sjeff} 1100109864Sjeff 1101121790Sjeffstatic void 1102109864Sjeffsched_pctcpu_update(struct kse *ke) 1103109864Sjeff{ 1104109864Sjeff /* 1105109864Sjeff * Adjust counters and watermark for pctcpu calc. 1106116365Sjeff */ 1107120272Sjeff if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1108120272Sjeff /* 1109120272Sjeff * Shift the tick count out so that the divide doesn't 1110120272Sjeff * round away our results. 1111120272Sjeff */ 1112120272Sjeff ke->ke_ticks <<= 10; 1113120272Sjeff ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1114120272Sjeff SCHED_CPU_TICKS; 1115120272Sjeff ke->ke_ticks >>= 10; 1116120272Sjeff } else 1117120272Sjeff ke->ke_ticks = 0; 1118109864Sjeff ke->ke_ltick = ticks; 1119109864Sjeff ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1120109864Sjeff} 1121109864Sjeff 1122109864Sjeffvoid 1123109864Sjeffsched_prio(struct thread *td, u_char prio) 1124109864Sjeff{ 1125121605Sjeff struct kse *ke; 1126109864Sjeff 1127121605Sjeff ke = td->td_kse; 1128109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1129109864Sjeff if (TD_ON_RUNQ(td)) { 1130121605Sjeff /* 1131121605Sjeff * If the priority has been elevated due to priority 1132121605Sjeff * propagation, we may have to move ourselves to a new 1133121605Sjeff * queue. We still call adjustrunqueue below in case kse 1134121605Sjeff * needs to fix things up. 
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			/* Move the elevated kse onto the current queue. */
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

/*
 * Switch away from the current thread: requeue or detach it as
 * appropriate, then pick and switch to the next thread.
 */
void
sched_switch(struct thread *td)
{
	struct thread *newtd;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (TD_IS_RUNNING(td)) {
			if (td->td_proc->p_flag & P_SA) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
				setrunqueue(td);
			} else
				kseq_runq_add(KSEQ_SELF(), ke);
		} else {
			if (ke->ke_runq) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			} else if ((td->td_flags & TDF_IDLETD) == 0)
				/* Diagnostic: non-idle thread with no runq. */
				backtrace();
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	/* Reclaim sched_lock ownership after returning from the switch. */
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

/*
 * Change a ksegrp's nice value, fixing up the per-queue nice accounting
 * for any of its kses that are currently on a run queue.
 */
void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Record when the thread went to sleep so sched_wakeup() can credit the
 * sleep time to the interactivity history.
 */
void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		/* Sleep time in <<10 fixed-point ticks. */
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information.
 */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	/* Inherit the parent's history, then scale it back for the child. */
	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
	sched_interact_fork(child);
	/* Charge the parent one tick for the fork. */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

/*
 * Move a ksegrp to a new scheduling class, fixing up the load, nice, and
 * (on SMP) transferable accounting for any kses on a run queue.
 */
void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because it could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	/* Drop the exiting kse's contribution to its cpu's load. */
	kseq_load_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

/*
 * Called from statclock for each stat tick the thread accumulates:
 * maintains pctcpu data, charges runtime to the ksegrp's interactivity
 * history, and forces a reschedule when the slice is used up.
 */
void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
1456109864Sjeff */ 1457122847Sjeff kseq = KSEQ_SELF(); 1458122744Sjeff kseq_load_rem(kseq, ke); 1459113357Sjeff sched_priority(kg); 1460113357Sjeff sched_slice(ke); 1461113357Sjeff if (SCHED_CURR(kg, ke)) 1462113357Sjeff ke->ke_runq = kseq->ksq_curr; 1463113357Sjeff else 1464113357Sjeff ke->ke_runq = kseq->ksq_next; 1465122744Sjeff kseq_load_add(kseq, ke); 1466113357Sjeff td->td_flags |= TDF_NEEDRESCHED; 1467109864Sjeff} 1468109864Sjeff 1469109864Sjeffint 1470109864Sjeffsched_runnable(void) 1471109864Sjeff{ 1472109864Sjeff struct kseq *kseq; 1473115998Sjeff int load; 1474109864Sjeff 1475115998Sjeff load = 1; 1476115998Sjeff 1477110028Sjeff kseq = KSEQ_SELF(); 1478121790Sjeff#ifdef SMP 1479122094Sjeff if (kseq->ksq_assigned) { 1480122094Sjeff mtx_lock_spin(&sched_lock); 1481121790Sjeff kseq_assign(kseq); 1482122094Sjeff mtx_unlock_spin(&sched_lock); 1483122094Sjeff } 1484121790Sjeff#endif 1485121605Sjeff if ((curthread->td_flags & TDF_IDLETD) != 0) { 1486121605Sjeff if (kseq->ksq_load > 0) 1487121605Sjeff goto out; 1488121605Sjeff } else 1489121605Sjeff if (kseq->ksq_load - 1 > 0) 1490121605Sjeff goto out; 1491115998Sjeff load = 0; 1492115998Sjeffout: 1493115998Sjeff return (load); 1494109864Sjeff} 1495109864Sjeff 1496109864Sjeffvoid 1497109864Sjeffsched_userret(struct thread *td) 1498109864Sjeff{ 1499109864Sjeff struct ksegrp *kg; 1500121605Sjeff 1501121605Sjeff kg = td->td_ksegrp; 1502109864Sjeff 1503109864Sjeff if (td->td_priority != kg->kg_user_pri) { 1504109864Sjeff mtx_lock_spin(&sched_lock); 1505109864Sjeff td->td_priority = kg->kg_user_pri; 1506109864Sjeff mtx_unlock_spin(&sched_lock); 1507109864Sjeff } 1508109864Sjeff} 1509109864Sjeff 1510109864Sjeffstruct kse * 1511109970Sjeffsched_choose(void) 1512109970Sjeff{ 1513110028Sjeff struct kseq *kseq; 1514109970Sjeff struct kse *ke; 1515109970Sjeff 1516115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 1517121790Sjeff kseq = KSEQ_SELF(); 1518113357Sjeff#ifdef SMP 1519123433Sjeffrestart: 1520121790Sjeff if 
(kseq->ksq_assigned) 1521121790Sjeff kseq_assign(kseq); 1522113357Sjeff#endif 1523121790Sjeff ke = kseq_choose(kseq); 1524109864Sjeff if (ke) { 1525121790Sjeff#ifdef SMP 1526121790Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1527123433Sjeff if (kseq_idled(kseq) == 0) 1528123433Sjeff goto restart; 1529121790Sjeff#endif 1530122744Sjeff kseq_runq_rem(kseq, ke); 1531109864Sjeff ke->ke_state = KES_THREAD; 1532112966Sjeff 1533113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1534113357Sjeff CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1535113357Sjeff ke, ke->ke_runq, ke->ke_slice, 1536113357Sjeff ke->ke_thread->td_priority); 1537113357Sjeff } 1538113357Sjeff return (ke); 1539109864Sjeff } 1540109970Sjeff#ifdef SMP 1541123433Sjeff if (kseq_idled(kseq) == 0) 1542123433Sjeff goto restart; 1543109970Sjeff#endif 1544113357Sjeff return (NULL); 1545109864Sjeff} 1546109864Sjeff 1547109864Sjeffvoid 1548121127Sjeffsched_add(struct thread *td) 1549109864Sjeff{ 1550110267Sjeff struct kseq *kseq; 1551113357Sjeff struct ksegrp *kg; 1552121127Sjeff struct kse *ke; 1553121790Sjeff int class; 1554109864Sjeff 1555121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 1556121127Sjeff ke = td->td_kse; 1557121127Sjeff kg = td->td_ksegrp; 1558121790Sjeff if (ke->ke_flags & KEF_ASSIGNED) 1559121790Sjeff return; 1560121790Sjeff kseq = KSEQ_SELF(); 1561124958Sjeff KASSERT((ke->ke_thread != NULL), 1562124958Sjeff ("sched_add: No thread on KSE")); 1563109864Sjeff KASSERT((ke->ke_thread->td_kse != NULL), 1564110267Sjeff ("sched_add: No KSE on thread")); 1565109864Sjeff KASSERT(ke->ke_state != KES_ONRUNQ, 1566110267Sjeff ("sched_add: kse %p (%s) already in run queue", ke, 1567109864Sjeff ke->ke_proc->p_comm)); 1568109864Sjeff KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1569110267Sjeff ("sched_add: process swapped out")); 1570113387Sjeff KASSERT(ke->ke_runq == NULL, 1571113387Sjeff ("sched_add: KSE %p is still assigned to a run queue", ke)); 1572109864Sjeff 1573121790Sjeff class 
= PRI_BASE(kg->kg_pri_class); 1574121790Sjeff switch (class) { 1575112994Sjeff case PRI_ITHD: 1576112994Sjeff case PRI_REALTIME: 1577113357Sjeff ke->ke_runq = kseq->ksq_curr; 1578113357Sjeff ke->ke_slice = SCHED_SLICE_MAX; 1579113660Sjeff ke->ke_cpu = PCPU_GET(cpuid); 1580112994Sjeff break; 1581112994Sjeff case PRI_TIMESHARE: 1582113387Sjeff if (SCHED_CURR(kg, ke)) 1583113387Sjeff ke->ke_runq = kseq->ksq_curr; 1584113387Sjeff else 1585113387Sjeff ke->ke_runq = kseq->ksq_next; 1586113357Sjeff break; 1587112994Sjeff case PRI_IDLE: 1588113357Sjeff /* 1589113357Sjeff * This is for priority prop. 1590113357Sjeff */ 1591121605Sjeff if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1592113357Sjeff ke->ke_runq = kseq->ksq_curr; 1593113357Sjeff else 1594113357Sjeff ke->ke_runq = &kseq->ksq_idle; 1595113357Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1596112994Sjeff break; 1597113357Sjeff default: 1598121868Sjeff panic("Unknown pri class."); 1599113357Sjeff break; 1600112994Sjeff } 1601121790Sjeff#ifdef SMP 1602123433Sjeff if (ke->ke_cpu != PCPU_GET(cpuid)) { 1603123529Sjeff ke->ke_runq = NULL; 1604123433Sjeff kseq_notify(ke, ke->ke_cpu); 1605123433Sjeff return; 1606123433Sjeff } 1607121790Sjeff /* 1608123685Sjeff * If we had been idle, clear our bit in the group and potentially 1609123685Sjeff * the global bitmap. If not, see if we should transfer this thread. 1610121790Sjeff */ 1611123433Sjeff if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1612123433Sjeff (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1613121790Sjeff /* 1614123433Sjeff * Check to see if our group is unidling, and if so, remove it 1615123433Sjeff * from the global idle mask. 1616121790Sjeff */ 1617123433Sjeff if (kseq->ksq_group->ksg_idlemask == 1618123433Sjeff kseq->ksq_group->ksg_cpumask) 1619123433Sjeff atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1620123433Sjeff /* 1621123433Sjeff * Now remove ourselves from the group specific idle mask. 
1622123433Sjeff */ 1623123433Sjeff kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1624123685Sjeff } else if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class)) 1625123685Sjeff if (kseq_transfer(kseq, ke, class)) 1626123685Sjeff return; 1627121790Sjeff#endif 1628121790Sjeff if (td->td_priority < curthread->td_priority) 1629121790Sjeff curthread->td_flags |= TDF_NEEDRESCHED; 1630121790Sjeff 1631109864Sjeff ke->ke_ksegrp->kg_runq_kses++; 1632109864Sjeff ke->ke_state = KES_ONRUNQ; 1633109864Sjeff 1634122744Sjeff kseq_runq_add(kseq, ke); 1635122744Sjeff kseq_load_add(kseq, ke); 1636109864Sjeff} 1637109864Sjeff 1638109864Sjeffvoid 1639121127Sjeffsched_rem(struct thread *td) 1640109864Sjeff{ 1641113357Sjeff struct kseq *kseq; 1642121127Sjeff struct kse *ke; 1643113357Sjeff 1644121127Sjeff ke = td->td_kse; 1645121790Sjeff /* 1646121790Sjeff * It is safe to just return here because sched_rem() is only ever 1647121790Sjeff * used in places where we're immediately going to add the 1648121790Sjeff * kse back on again. In that case it'll be added with the correct 1649121790Sjeff * thread and priority when the caller drops the sched_lock. 
1650121790Sjeff */ 1651121790Sjeff if (ke->ke_flags & KEF_ASSIGNED) 1652121790Sjeff return; 1653109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1654124958Sjeff KASSERT((ke->ke_state == KES_ONRUNQ), 1655124958Sjeff ("sched_rem: KSE not on run queue")); 1656109864Sjeff 1657109864Sjeff ke->ke_state = KES_THREAD; 1658109864Sjeff ke->ke_ksegrp->kg_runq_kses--; 1659113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1660122744Sjeff kseq_runq_rem(kseq, ke); 1661122744Sjeff kseq_load_rem(kseq, ke); 1662109864Sjeff} 1663109864Sjeff 1664109864Sjefffixpt_t 1665121127Sjeffsched_pctcpu(struct thread *td) 1666109864Sjeff{ 1667109864Sjeff fixpt_t pctcpu; 1668121127Sjeff struct kse *ke; 1669109864Sjeff 1670109864Sjeff pctcpu = 0; 1671121127Sjeff ke = td->td_kse; 1672121290Sjeff if (ke == NULL) 1673121290Sjeff return (0); 1674109864Sjeff 1675115998Sjeff mtx_lock_spin(&sched_lock); 1676109864Sjeff if (ke->ke_ticks) { 1677109864Sjeff int rtick; 1678109864Sjeff 1679116365Sjeff /* 1680116365Sjeff * Don't update more frequently than twice a second. Allowing 1681116365Sjeff * this causes the cpu usage to decay away too quickly due to 1682116365Sjeff * rounding errors. 1683116365Sjeff */ 1684123435Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || 1685123435Sjeff ke->ke_ltick < (ticks - (hz / 2))) 1686116365Sjeff sched_pctcpu_update(ke); 1687109864Sjeff /* How many rtick per second ? 
*/ 1688116365Sjeff rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1689110226Sscottl pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1690109864Sjeff } 1691109864Sjeff 1692109864Sjeff ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1693113865Sjhb mtx_unlock_spin(&sched_lock); 1694109864Sjeff 1695109864Sjeff return (pctcpu); 1696109864Sjeff} 1697109864Sjeff 1698122038Sjeffvoid 1699122038Sjeffsched_bind(struct thread *td, int cpu) 1700122038Sjeff{ 1701122038Sjeff struct kse *ke; 1702122038Sjeff 1703122038Sjeff mtx_assert(&sched_lock, MA_OWNED); 1704122038Sjeff ke = td->td_kse; 1705122038Sjeff ke->ke_flags |= KEF_BOUND; 1706123433Sjeff#ifdef SMP 1707123433Sjeff if (PCPU_GET(cpuid) == cpu) 1708122038Sjeff return; 1709122038Sjeff /* sched_rem without the runq_remove */ 1710122038Sjeff ke->ke_state = KES_THREAD; 1711122038Sjeff ke->ke_ksegrp->kg_runq_kses--; 1712122744Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1713122038Sjeff kseq_notify(ke, cpu); 1714122038Sjeff /* When we return from mi_switch we'll be on the correct cpu. 
*/ 1715124944Sjeff mi_switch(SW_VOL); 1716122038Sjeff#endif 1717122038Sjeff} 1718122038Sjeff 1719122038Sjeffvoid 1720122038Sjeffsched_unbind(struct thread *td) 1721122038Sjeff{ 1722122038Sjeff mtx_assert(&sched_lock, MA_OWNED); 1723122038Sjeff td->td_kse->ke_flags &= ~KEF_BOUND; 1724122038Sjeff} 1725122038Sjeff 1726109864Sjeffint 1727125289Sjeffsched_load(void) 1728125289Sjeff{ 1729125289Sjeff#ifdef SMP 1730125289Sjeff int total; 1731125289Sjeff int i; 1732125289Sjeff 1733125289Sjeff total = 0; 1734125289Sjeff for (i = 0; i <= ksg_maxid; i++) 1735125289Sjeff total += KSEQ_GROUP(i)->ksg_load; 1736125289Sjeff return (total); 1737125289Sjeff#else 1738125289Sjeff return (KSEQ_SELF()->ksq_sysload); 1739125289Sjeff#endif 1740125289Sjeff} 1741125289Sjeff 1742125289Sjeffint 1743109864Sjeffsched_sizeof_kse(void) 1744109864Sjeff{ 1745109864Sjeff return (sizeof(struct kse) + sizeof(struct ke_sched)); 1746109864Sjeff} 1747109864Sjeff 1748109864Sjeffint 1749109864Sjeffsched_sizeof_ksegrp(void) 1750109864Sjeff{ 1751109864Sjeff return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1752109864Sjeff} 1753109864Sjeff 1754109864Sjeffint 1755109864Sjeffsched_sizeof_proc(void) 1756109864Sjeff{ 1757109864Sjeff return (sizeof(struct proc)); 1758109864Sjeff} 1759109864Sjeff 1760109864Sjeffint 1761109864Sjeffsched_sizeof_thread(void) 1762109864Sjeff{ 1763109864Sjeff return (sizeof(struct thread) + sizeof(struct td_sched)); 1764109864Sjeff} 1765