sched_ule.c revision 123433 (FreeBSD ULE scheduler, head/sys/kern/sched_ule.c)
1109864Sjeff/*- 2113357Sjeff * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3109864Sjeff * All rights reserved. 4109864Sjeff * 5109864Sjeff * Redistribution and use in source and binary forms, with or without 6109864Sjeff * modification, are permitted provided that the following conditions 7109864Sjeff * are met: 8109864Sjeff * 1. Redistributions of source code must retain the above copyright 9109864Sjeff * notice unmodified, this list of conditions, and the following 10109864Sjeff * disclaimer. 11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright 12109864Sjeff * notice, this list of conditions and the following disclaimer in the 13109864Sjeff * documentation and/or other materials provided with the distribution. 14109864Sjeff * 15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25109864Sjeff */ 26109864Sjeff 27116182Sobrien#include <sys/cdefs.h> 28116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 123433 2003-12-11 03:57:10Z jeff $"); 29116182Sobrien 30109864Sjeff#include <sys/param.h> 31109864Sjeff#include <sys/systm.h> 32109864Sjeff#include <sys/kernel.h> 33109864Sjeff#include <sys/ktr.h> 34109864Sjeff#include <sys/lock.h> 35109864Sjeff#include <sys/mutex.h> 36109864Sjeff#include <sys/proc.h> 37112966Sjeff#include <sys/resource.h> 38122038Sjeff#include <sys/resourcevar.h> 39109864Sjeff#include <sys/sched.h> 40109864Sjeff#include <sys/smp.h> 41109864Sjeff#include <sys/sx.h> 42109864Sjeff#include <sys/sysctl.h> 43109864Sjeff#include <sys/sysproto.h> 44109864Sjeff#include <sys/vmmeter.h> 45109864Sjeff#ifdef DDB 46109864Sjeff#include <ddb/ddb.h> 47109864Sjeff#endif 48109864Sjeff#ifdef KTRACE 49109864Sjeff#include <sys/uio.h> 50109864Sjeff#include <sys/ktrace.h> 51109864Sjeff#endif 52109864Sjeff 53109864Sjeff#include <machine/cpu.h> 54121790Sjeff#include <machine/smp.h> 55109864Sjeff 56113357Sjeff#define KTR_ULE KTR_NFS 57113357Sjeff 58109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 59109864Sjeff/* XXX This is bogus compatability crap for ps */ 60109864Sjeffstatic fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 61109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 62109864Sjeff 63109864Sjeffstatic void sched_setup(void *dummy); 64109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 65109864Sjeff 66113357Sjeffstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); 67113357Sjeff 68113357Sjeffstatic int sched_strict; 69113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); 70113357Sjeff 71113357Sjeffstatic int slice_min = 1; 72113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, 
CTLFLAG_RW, &slice_min, 0, ""); 73113357Sjeff 74116365Sjeffstatic int slice_max = 10; 75113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 76113357Sjeff 77111857Sjeffint realstathz; 78113357Sjeffint tickincr = 1; 79111857Sjeff 80116069Sjeff#ifdef SMP 81116069Sjeff/* Callout to handle load balancing SMP systems. */ 82116069Sjeffstatic struct callout kseq_lb_callout; 83116069Sjeff#endif 84116069Sjeff 85109864Sjeff/* 86109864Sjeff * These datastructures are allocated within their parent datastructure but 87109864Sjeff * are scheduler specific. 88109864Sjeff */ 89109864Sjeff 90109864Sjeffstruct ke_sched { 91109864Sjeff int ske_slice; 92109864Sjeff struct runq *ske_runq; 93109864Sjeff /* The following variables are only used for pctcpu calculation */ 94109864Sjeff int ske_ltick; /* Last tick that we were running on */ 95109864Sjeff int ske_ftick; /* First tick that we were running on */ 96109864Sjeff int ske_ticks; /* Tick count */ 97113357Sjeff /* CPU that we have affinity for. */ 98110260Sjeff u_char ske_cpu; 99109864Sjeff}; 100109864Sjeff#define ke_slice ke_sched->ske_slice 101109864Sjeff#define ke_runq ke_sched->ske_runq 102109864Sjeff#define ke_ltick ke_sched->ske_ltick 103109864Sjeff#define ke_ftick ke_sched->ske_ftick 104109864Sjeff#define ke_ticks ke_sched->ske_ticks 105110260Sjeff#define ke_cpu ke_sched->ske_cpu 106121790Sjeff#define ke_assign ke_procq.tqe_next 107109864Sjeff 108121790Sjeff#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */ 109122158Sjeff#define KEF_BOUND KEF_SCHED1 /* KSE can not migrate. */ 110121790Sjeff 111109864Sjeffstruct kg_sched { 112110645Sjeff int skg_slptime; /* Number of ticks we vol. slept */ 113110645Sjeff int skg_runtime; /* Number of ticks we were running */ 114109864Sjeff}; 115109864Sjeff#define kg_slptime kg_sched->skg_slptime 116110645Sjeff#define kg_runtime kg_sched->skg_runtime 117109864Sjeff 118109864Sjeffstruct td_sched { 119109864Sjeff int std_slptime; 120109864Sjeff}; 121109864Sjeff#define td_slptime td_sched->std_slptime 122109864Sjeff 123110267Sjeffstruct td_sched td_sched; 124109864Sjeffstruct ke_sched ke_sched; 125109864Sjeffstruct kg_sched kg_sched; 126109864Sjeff 127109864Sjeffstruct ke_sched *kse0_sched = &ke_sched; 128109864Sjeffstruct kg_sched *ksegrp0_sched = &kg_sched; 129109864Sjeffstruct p_sched *proc0_sched = NULL; 130109864Sjeffstruct td_sched *thread0_sched = &td_sched; 131109864Sjeff 132109864Sjeff/* 133116642Sjeff * The priority is primarily determined by the interactivity score. Thus, we 134116642Sjeff * give lower(better) priorities to kse groups that use less CPU. The nice 135116642Sjeff * value is then directly added to this to allow nice to have some effect 136116642Sjeff * on latency. 137111857Sjeff * 138111857Sjeff * PRI_RANGE: Total priority range for timeshare threads. 139116642Sjeff * PRI_NRESV: Number of nice values. 140111857Sjeff * PRI_BASE: The start of the dynamic range. 141109864Sjeff */ 142111857Sjeff#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 143121869Sjeff#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 144121869Sjeff#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 145116642Sjeff#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 146113357Sjeff#define SCHED_PRI_INTERACT(score) \ 147116642Sjeff ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 148109864Sjeff 149109864Sjeff/* 150111857Sjeff * These determine the interactivity of a process. 
151109864Sjeff * 152110645Sjeff * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 153110645Sjeff * before throttling back. 154121868Sjeff * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 155116365Sjeff * INTERACT_MAX: Maximum interactivity value. Smaller is better. 156111857Sjeff * INTERACT_THRESH: Threshhold for placement on the current runq. 157109864Sjeff */ 158121126Sjeff#define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 159121868Sjeff#define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 160116365Sjeff#define SCHED_INTERACT_MAX (100) 161116365Sjeff#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 162121126Sjeff#define SCHED_INTERACT_THRESH (30) 163111857Sjeff 164109864Sjeff/* 165109864Sjeff * These parameters and macros determine the size of the time slice that is 166109864Sjeff * granted to each thread. 167109864Sjeff * 168109864Sjeff * SLICE_MIN: Minimum time slice granted, in units of ticks. 169109864Sjeff * SLICE_MAX: Maximum time slice granted. 170109864Sjeff * SLICE_RANGE: Range of available time slices scaled by hz. 171112966Sjeff * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 172112966Sjeff * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 173121871Sjeff * SLICE_NTHRESH: The nice cutoff point for slice assignment. 174109864Sjeff */ 175113357Sjeff#define SCHED_SLICE_MIN (slice_min) 176113357Sjeff#define SCHED_SLICE_MAX (slice_max) 177121871Sjeff#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 178111857Sjeff#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 179109864Sjeff#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 180112966Sjeff#define SCHED_SLICE_NICE(nice) \ 181121871Sjeff (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 182109864Sjeff 183109864Sjeff/* 184109864Sjeff * This macro determines whether or not the kse belongs on the current or 185109864Sjeff * next run queue. 186109864Sjeff */ 187113357Sjeff#define SCHED_INTERACTIVE(kg) \ 188113357Sjeff (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 189113417Sjeff#define SCHED_CURR(kg, ke) \ 190121107Sjeff (ke->ke_thread->td_priority != kg->kg_user_pri || \ 191121107Sjeff SCHED_INTERACTIVE(kg)) 192109864Sjeff 193109864Sjeff/* 194109864Sjeff * Cpu percentage computation macros and defines. 195109864Sjeff * 196109864Sjeff * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 197109864Sjeff * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 198109864Sjeff */ 199109864Sjeff 200112971Sjeff#define SCHED_CPU_TIME 10 201109864Sjeff#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 202109864Sjeff 203109864Sjeff/* 204113357Sjeff * kseq - per processor runqs and statistics. 205109864Sjeff */ 206109864Sjeffstruct kseq { 207113357Sjeff struct runq ksq_idle; /* Queue of IDLE threads. */ 208113357Sjeff struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 209113357Sjeff struct runq *ksq_next; /* Next timeshare queue. */ 210113357Sjeff struct runq *ksq_curr; /* Current queue. */ 211121896Sjeff int ksq_load_timeshare; /* Load for timeshare. */ 212113357Sjeff int ksq_load; /* Aggregate load. */ 213121869Sjeff short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 214113357Sjeff short ksq_nicemin; /* Least nice. */ 215110267Sjeff#ifdef SMP 216123433Sjeff int ksq_transferable; 217123433Sjeff LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 218123433Sjeff struct kseq_group *ksq_group; /* Our processor group. 
*/ 219123433Sjeff volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 220110267Sjeff#endif 221109864Sjeff}; 222109864Sjeff 223123433Sjeff#ifdef SMP 224109864Sjeff/* 225123433Sjeff * kseq groups are groups of processors which can cheaply share threads. When 226123433Sjeff * one processor in the group goes idle it will check the runqs of the other 227123433Sjeff * processors in its group prior to halting and waiting for an interrupt. 228123433Sjeff * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 229123433Sjeff * In a numa environment we'd want an idle bitmap per group and a two tiered 230123433Sjeff * load balancer. 231123433Sjeff */ 232123433Sjeffstruct kseq_group { 233123433Sjeff int ksg_cpus; /* Count of CPUs in this kseq group. */ 234123433Sjeff int ksg_cpumask; /* Mask of cpus in this group. */ 235123433Sjeff int ksg_idlemask; /* Idle cpus in this group. */ 236123433Sjeff int ksg_mask; /* Bit mask for first cpu. */ 237123433Sjeff int ksg_transferable; /* Transferable load of this group. */ 238123433Sjeff LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 239123433Sjeff}; 240123433Sjeff#endif 241123433Sjeff 242123433Sjeff/* 243109864Sjeff * One kse queue per processor. 244109864Sjeff */ 245110028Sjeff#ifdef SMP 246121790Sjeffstatic int kseq_idle; 247121790Sjeffstatic struct kseq kseq_cpu[MAXCPU]; 248123433Sjeffstatic struct kseq_group kseq_groups[MAXCPU]; 249123433Sjeff#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 250123433Sjeff#define KSEQ_CPU(x) (&kseq_cpu[(x)]) 251123433Sjeff#else /* !SMP */ 252121790Sjeffstatic struct kseq kseq_cpu; 253110028Sjeff#define KSEQ_SELF() (&kseq_cpu) 254110028Sjeff#define KSEQ_CPU(x) (&kseq_cpu) 255110028Sjeff#endif 256109864Sjeff 257112966Sjeffstatic void sched_slice(struct kse *ke); 258113357Sjeffstatic void sched_priority(struct ksegrp *kg); 259111857Sjeffstatic int sched_interact_score(struct ksegrp *kg); 260116463Sjeffstatic void sched_interact_update(struct ksegrp *kg); 261121868Sjeffstatic void sched_interact_fork(struct ksegrp *kg); 262121790Sjeffstatic void sched_pctcpu_update(struct kse *ke); 263109864Sjeff 264110267Sjeff/* Operations on per processor queues */ 265121790Sjeffstatic struct kse * kseq_choose(struct kseq *kseq); 266110028Sjeffstatic void kseq_setup(struct kseq *kseq); 267122744Sjeffstatic void kseq_load_add(struct kseq *kseq, struct kse *ke); 268122744Sjeffstatic void kseq_load_rem(struct kseq *kseq, struct kse *ke); 269122744Sjeffstatic __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke); 270122744Sjeffstatic __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 271113357Sjeffstatic void kseq_nice_add(struct kseq *kseq, int nice); 272113357Sjeffstatic void kseq_nice_rem(struct kseq *kseq, int nice); 273113660Sjeffvoid kseq_print(int cpu); 274110267Sjeff#ifdef SMP 275123433Sjeffstatic int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 276121790Sjeffstatic struct kse *runq_steal(struct runq *rq); 277122744Sjeffstatic void sched_balance(void *arg); 278121790Sjeffstatic void kseq_move(struct kseq *from, int cpu); 279123433Sjeffstatic int kseq_idled(struct kseq *kseq); 280121790Sjeffstatic void kseq_notify(struct kse *ke, int cpu); 281121790Sjeffstatic void kseq_assign(struct kseq *); 282123433Sjeffstatic struct kse *kseq_steal(struct kseq *kseq, int stealidle); 283122038Sjeff#define KSE_CAN_MIGRATE(ke, class) \ 284122158Sjeff ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ 285122165Sjeff ((ke)->ke_flags & KEF_BOUND) == 0) 
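/*
 * Note: KSE_CAN_MIGRATE() also gates the ksq_transferable and
 * ksg_transferable accounting in kseq_runq_add()/kseq_runq_rem() below,
 * so interrupt threads and pinned or bound threads are never offered to
 * sched_balance() or to idle stealing.
 */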
286121790Sjeff#endif 287110028Sjeff 288113357Sjeffvoid 289113660Sjeffkseq_print(int cpu) 290110267Sjeff{ 291113660Sjeff struct kseq *kseq; 292113357Sjeff int i; 293112994Sjeff 294113660Sjeff kseq = KSEQ_CPU(cpu); 295112994Sjeff 296113357Sjeff printf("kseq:\n"); 297113357Sjeff printf("\tload: %d\n", kseq->ksq_load); 298122744Sjeff printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 299121896Sjeff#ifdef SMP 300123433Sjeff printf("\tload transferable: %d\n", kseq->ksq_transferable); 301121896Sjeff#endif 302113357Sjeff printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 303113357Sjeff printf("\tnice counts:\n"); 304121869Sjeff for (i = 0; i < SCHED_PRI_NRESV; i++) 305113357Sjeff if (kseq->ksq_nice[i]) 306113357Sjeff printf("\t\t%d = %d\n", 307113357Sjeff i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 308113357Sjeff} 309112994Sjeff 310122744Sjeffstatic __inline void 311122744Sjeffkseq_runq_add(struct kseq *kseq, struct kse *ke) 312122744Sjeff{ 313122744Sjeff#ifdef SMP 314123433Sjeff if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 315123433Sjeff kseq->ksq_transferable++; 316123433Sjeff kseq->ksq_group->ksg_transferable++; 317123433Sjeff } 318122744Sjeff#endif 319122744Sjeff runq_add(ke->ke_runq, ke); 320122744Sjeff} 321122744Sjeff 322122744Sjeffstatic __inline void 323122744Sjeffkseq_runq_rem(struct kseq *kseq, struct kse *ke) 324122744Sjeff{ 325122744Sjeff#ifdef SMP 326123433Sjeff if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 327123433Sjeff kseq->ksq_transferable--; 328123433Sjeff kseq->ksq_group->ksg_transferable--; 329123433Sjeff } 330122744Sjeff#endif 331122744Sjeff runq_remove(ke->ke_runq, ke); 332122744Sjeff} 333122744Sjeff 334113357Sjeffstatic void 335122744Sjeffkseq_load_add(struct kseq *kseq, struct kse *ke) 336113357Sjeff{ 337121896Sjeff int class; 338115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 339121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 340121896Sjeff if (class == PRI_TIMESHARE) 341121896Sjeff kseq->ksq_load_timeshare++; 342113357Sjeff kseq->ksq_load++; 343113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 344122744Sjeff CTR6(KTR_ULE, 345122744Sjeff "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 346122744Sjeff ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 347122744Sjeff ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); 348113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 349113357Sjeff kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); 350110267Sjeff} 351113357Sjeff 352112994Sjeffstatic void 353122744Sjeffkseq_load_rem(struct kseq *kseq, struct kse *ke) 354110267Sjeff{ 355121896Sjeff int class; 356115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 357121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 358121896Sjeff if (class == PRI_TIMESHARE) 359121896Sjeff kseq->ksq_load_timeshare--; 360113357Sjeff kseq->ksq_load--; 361113357Sjeff ke->ke_runq = NULL; 362113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 363113357Sjeff kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); 364110267Sjeff} 365110267Sjeff 366113357Sjeffstatic void 367113357Sjeffkseq_nice_add(struct kseq *kseq, int nice) 368110267Sjeff{ 369115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 370113357Sjeff /* Normalize to zero. 
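	 * (Illustrative: with PRIO_MIN/PRIO_MAX of -20/20, nice -20 lands
	 * in bucket 0, nice 0 in bucket SCHED_PRI_NHALF and nice +20 in
	 * bucket SCHED_PRI_NRESV - 1.)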
*/ 371113357Sjeff kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 372121896Sjeff if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 373113357Sjeff kseq->ksq_nicemin = nice; 374110267Sjeff} 375110267Sjeff 376113357Sjeffstatic void 377113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice) 378110267Sjeff{ 379113357Sjeff int n; 380113357Sjeff 381115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 382113357Sjeff /* Normalize to zero. */ 383113357Sjeff n = nice + SCHED_PRI_NHALF; 384113357Sjeff kseq->ksq_nice[n]--; 385113357Sjeff KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 386113357Sjeff 387113357Sjeff /* 388113357Sjeff * If this wasn't the smallest nice value or there are more in 389113357Sjeff * this bucket we can just return. Otherwise we have to recalculate 390113357Sjeff * the smallest nice. 391113357Sjeff */ 392113357Sjeff if (nice != kseq->ksq_nicemin || 393113357Sjeff kseq->ksq_nice[n] != 0 || 394121896Sjeff kseq->ksq_load_timeshare == 0) 395113357Sjeff return; 396113357Sjeff 397121869Sjeff for (; n < SCHED_PRI_NRESV; n++) 398113357Sjeff if (kseq->ksq_nice[n]) { 399113357Sjeff kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 400113357Sjeff return; 401113357Sjeff } 402110267Sjeff} 403110267Sjeff 404113357Sjeff#ifdef SMP 405116069Sjeff/* 406122744Sjeff * sched_balance is a simple CPU load balancing algorithm. It operates by 407116069Sjeff * finding the least loaded and most loaded cpu and equalizing their load 408116069Sjeff * by migrating some processes. 409116069Sjeff * 410116069Sjeff * Dealing only with two CPUs at a time has two advantages. Firstly, most 411116069Sjeff * installations will only have 2 cpus. Secondly, load balancing too much at 412116069Sjeff * once can have an unpleasant effect on the system. The scheduler rarely has 413116069Sjeff * enough information to make perfect decisions. So this algorithm chooses 414116069Sjeff * algorithm simplicity and more gradual effects on load in larger systems. 415116069Sjeff * 416116069Sjeff * It could be improved by considering the priorities and slices assigned to 417116069Sjeff * each task prior to balancing them. There are many pathological cases with 418116069Sjeff * any approach and so the semi random algorithm below may work as well as any. 419116069Sjeff * 420116069Sjeff */ 421121790Sjeffstatic void 422122744Sjeffsched_balance(void *arg) 423116069Sjeff{ 424116069Sjeff struct kseq *kseq; 425123433Sjeff int transferable; 426116069Sjeff int high_load; 427116069Sjeff int low_load; 428116069Sjeff int high_cpu; 429116069Sjeff int low_cpu; 430116069Sjeff int move; 431116069Sjeff int diff; 432116069Sjeff int i; 433116069Sjeff 434116069Sjeff high_cpu = 0; 435116069Sjeff low_cpu = 0; 436116069Sjeff high_load = 0; 437116069Sjeff low_load = -1; 438116069Sjeff 439116069Sjeff mtx_lock_spin(&sched_lock); 440116962Sjeff if (smp_started == 0) 441116962Sjeff goto out; 442116962Sjeff 443123126Sjhb for (i = 0; i <= mp_maxid; i++) { 444116970Sjeff if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 445116069Sjeff continue; 446116069Sjeff kseq = KSEQ_CPU(i); 447123433Sjeff /* 448123433Sjeff * Find the CPU with the highest load that has some threads 449123433Sjeff * to transfer. 
450123433Sjeff */ 451123433Sjeff if (kseq->ksq_load > high_load && 452123433Sjeff kseq->ksq_group->ksg_transferable) { 453123433Sjeff high_load = kseq->ksq_load; 454116069Sjeff high_cpu = i; 455116069Sjeff } 456116069Sjeff if (low_load == -1 || kseq->ksq_load < low_load) { 457116069Sjeff low_load = kseq->ksq_load; 458116069Sjeff low_cpu = i; 459116069Sjeff } 460116069Sjeff } 461117237Sjeff kseq = KSEQ_CPU(high_cpu); 462116069Sjeff /* 463116069Sjeff * Nothing to do. 464116069Sjeff */ 465123433Sjeff if (low_load >= high_load) 466116069Sjeff goto out; 467122744Sjeff /* 468123433Sjeff * If we're transfering within a group we have to use this specific 469123433Sjeff * kseq's transferable count, otherwise we can steal from other members 470123433Sjeff * of the group. 471123433Sjeff */ 472123433Sjeff if (kseq->ksq_group == KSEQ_CPU(low_cpu)->ksq_group) 473123433Sjeff transferable = kseq->ksq_transferable; 474123433Sjeff else 475123433Sjeff transferable = kseq->ksq_group->ksg_transferable; 476123433Sjeff if (transferable == 0) 477123433Sjeff goto out; 478123433Sjeff /* 479122744Sjeff * Determine what the imbalance is and then adjust that to how many 480123433Sjeff * kses we actually have to give up (transferable). 481122744Sjeff */ 482122744Sjeff diff = kseq->ksq_load - low_load; 483116069Sjeff move = diff / 2; 484116069Sjeff if (diff & 0x1) 485116069Sjeff move++; 486123433Sjeff move = min(move, transferable); 487116069Sjeff for (i = 0; i < move; i++) 488117237Sjeff kseq_move(kseq, low_cpu); 489116069Sjeffout: 490116069Sjeff mtx_unlock_spin(&sched_lock); 491122744Sjeff callout_reset(&kseq_lb_callout, hz, sched_balance, NULL); 492116069Sjeff 493116069Sjeff return; 494116069Sjeff} 495116069Sjeff 496121790Sjeffstatic void 497116069Sjeffkseq_move(struct kseq *from, int cpu) 498116069Sjeff{ 499123433Sjeff struct kseq *kseq; 500123433Sjeff struct kseq *to; 501116069Sjeff struct kse *ke; 502116069Sjeff 503123433Sjeff kseq = from; 504123433Sjeff to = KSEQ_CPU(cpu); 505123433Sjeff ke = kseq_steal(kseq, 1); 506123433Sjeff if (ke == NULL) { 507123433Sjeff struct kseq_group *ksg; 508123433Sjeff 509123433Sjeff ksg = kseq->ksq_group; 510123433Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 511123433Sjeff if (kseq == from || kseq->ksq_transferable == 0) 512123433Sjeff continue; 513123433Sjeff ke = kseq_steal(kseq, 1); 514123433Sjeff break; 515123433Sjeff } 516123433Sjeff if (ke == NULL) 517123433Sjeff panic("kseq_move: No KSEs available with a " 518123433Sjeff "transferable count of %d\n", 519123433Sjeff ksg->ksg_transferable); 520123433Sjeff } 521123433Sjeff if (kseq == to) 522123433Sjeff return; 523116069Sjeff ke->ke_state = KES_THREAD; 524123433Sjeff kseq_runq_rem(kseq, ke); 525123433Sjeff kseq_load_rem(kseq, ke); 526116069Sjeff 527116069Sjeff ke->ke_cpu = cpu; 528121923Sjeff kseq_notify(ke, cpu); 529116069Sjeff} 530110267Sjeff 531123433Sjeffstatic int 532123433Sjeffkseq_idled(struct kseq *kseq) 533121790Sjeff{ 534123433Sjeff struct kseq_group *ksg; 535123433Sjeff struct kseq *steal; 536123433Sjeff struct kse *ke; 537123433Sjeff 538123433Sjeff ksg = kseq->ksq_group; 539123433Sjeff /* 540123433Sjeff * If we're in a cpu group, try and steal kses from another cpu in 541123433Sjeff * the group before idling. 
542123433Sjeff */ 543123433Sjeff if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 544123433Sjeff LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 545123433Sjeff if (steal == kseq || steal->ksq_transferable == 0) 546123433Sjeff continue; 547123433Sjeff ke = kseq_steal(steal, 0); 548123433Sjeff if (ke == NULL) 549123433Sjeff continue; 550123433Sjeff ke->ke_state = KES_THREAD; 551123433Sjeff kseq_runq_rem(steal, ke); 552123433Sjeff kseq_load_rem(steal, ke); 553123433Sjeff ke->ke_cpu = PCPU_GET(cpuid); 554123433Sjeff sched_add(ke->ke_thread); 555123433Sjeff return (0); 556123433Sjeff } 557123433Sjeff } 558123433Sjeff /* 559123433Sjeff * We only set the idled bit when all of the cpus in the group are 560123433Sjeff * idle. Otherwise we could get into a situation where a KSE bounces 561123433Sjeff * back and forth between two idle cores on seperate physical CPUs. 562123433Sjeff */ 563123433Sjeff ksg->ksg_idlemask |= PCPU_GET(cpumask); 564123433Sjeff if (ksg->ksg_idlemask != ksg->ksg_cpumask) 565123433Sjeff return (1); 566123433Sjeff atomic_set_int(&kseq_idle, ksg->ksg_mask); 567123433Sjeff return (1); 568121790Sjeff} 569121790Sjeff 570121790Sjeffstatic void 571121790Sjeffkseq_assign(struct kseq *kseq) 572121790Sjeff{ 573121790Sjeff struct kse *nke; 574121790Sjeff struct kse *ke; 575121790Sjeff 576121790Sjeff do { 577122848Sjeff (volatile struct kse *)ke = kseq->ksq_assigned; 578121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 579121790Sjeff for (; ke != NULL; ke = nke) { 580121790Sjeff nke = ke->ke_assign; 581121790Sjeff ke->ke_flags &= ~KEF_ASSIGNED; 582121790Sjeff sched_add(ke->ke_thread); 583121790Sjeff } 584121790Sjeff} 585121790Sjeff 586121790Sjeffstatic void 587121790Sjeffkseq_notify(struct kse *ke, int cpu) 588121790Sjeff{ 589121790Sjeff struct kseq *kseq; 590121790Sjeff struct thread *td; 591121790Sjeff struct pcpu *pcpu; 592121790Sjeff 593121790Sjeff ke->ke_flags |= KEF_ASSIGNED; 594121790Sjeff 595121790Sjeff kseq = KSEQ_CPU(cpu); 596121790Sjeff 597121790Sjeff /* 598121790Sjeff * Place a KSE on another cpu's queue and force a resched. 
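	 * The "queue" here is the lock-free ksq_assigned list: KSEs are
	 * pushed LIFO at the head, linked through ke_assign, and the remote
	 * cpu later drains the whole chain at once in kseq_assign().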
599121790Sjeff */ 600121790Sjeff do { 601122848Sjeff (volatile struct kse *)ke->ke_assign = kseq->ksq_assigned; 602121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 603121790Sjeff pcpu = pcpu_find(cpu); 604121790Sjeff td = pcpu->pc_curthread; 605121790Sjeff if (ke->ke_thread->td_priority < td->td_priority || 606121790Sjeff td == pcpu->pc_idlethread) { 607121790Sjeff td->td_flags |= TDF_NEEDRESCHED; 608121790Sjeff ipi_selected(1 << cpu, IPI_AST); 609121790Sjeff } 610121790Sjeff} 611121790Sjeff 612121790Sjeffstatic struct kse * 613121790Sjeffrunq_steal(struct runq *rq) 614121790Sjeff{ 615121790Sjeff struct rqhead *rqh; 616121790Sjeff struct rqbits *rqb; 617121790Sjeff struct kse *ke; 618121790Sjeff int word; 619121790Sjeff int bit; 620121790Sjeff 621121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 622121790Sjeff rqb = &rq->rq_status; 623121790Sjeff for (word = 0; word < RQB_LEN; word++) { 624121790Sjeff if (rqb->rqb_bits[word] == 0) 625121790Sjeff continue; 626121790Sjeff for (bit = 0; bit < RQB_BPW; bit++) { 627123231Speter if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 628121790Sjeff continue; 629121790Sjeff rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 630121790Sjeff TAILQ_FOREACH(ke, rqh, ke_procq) { 631121896Sjeff if (KSE_CAN_MIGRATE(ke, 632121896Sjeff PRI_BASE(ke->ke_ksegrp->kg_pri_class))) 633121790Sjeff return (ke); 634121790Sjeff } 635121790Sjeff } 636121790Sjeff } 637121790Sjeff return (NULL); 638121790Sjeff} 639121790Sjeff 640121790Sjeffstatic struct kse * 641123433Sjeffkseq_steal(struct kseq *kseq, int stealidle) 642121790Sjeff{ 643121790Sjeff struct kse *ke; 644121790Sjeff 645123433Sjeff /* 646123433Sjeff * Steal from next first to try to get a non-interactive task that 647123433Sjeff * may not have run for a while. 648123433Sjeff */ 649123433Sjeff if ((ke = runq_steal(kseq->ksq_next)) != NULL) 650123433Sjeff return (ke); 651121790Sjeff if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 652121790Sjeff return (ke); 653123433Sjeff if (stealidle) 654123433Sjeff return (runq_steal(&kseq->ksq_idle)); 655123433Sjeff return (NULL); 656121790Sjeff} 657123433Sjeff 658123433Sjeffint 659123433Sjeffkseq_transfer(struct kseq *kseq, struct kse *ke, int class) 660123433Sjeff{ 661123433Sjeff struct kseq_group *ksg; 662123433Sjeff int cpu; 663123433Sjeff 664123433Sjeff cpu = 0; 665123433Sjeff ksg = kseq->ksq_group; 666123433Sjeff 667123433Sjeff /* 668123433Sjeff * XXX This ksg_transferable might work better if we were checking 669123433Sjeff * against a global group load. As it is now, this prevents us from 670123433Sjeff * transfering a thread from a group that is potentially bogged down 671123433Sjeff * with non transferable load. 672123433Sjeff */ 673123433Sjeff if (ksg->ksg_transferable > ksg->ksg_cpus && kseq_idle) { 674123433Sjeff /* 675123433Sjeff * Multiple cpus could find this bit simultaneously 676123433Sjeff * but the race shouldn't be terrible. 677123433Sjeff */ 678123433Sjeff cpu = ffs(kseq_idle); 679123433Sjeff if (cpu) 680123433Sjeff atomic_clear_int(&kseq_idle, 1 << (cpu - 1)); 681123433Sjeff } 682123433Sjeff /* 683123433Sjeff * If another cpu in this group has idled, assign a thread over 684123433Sjeff * to them after checking to see if there are idled groups. 
685123433Sjeff */ 686123433Sjeff if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) { 687123433Sjeff cpu = ffs(ksg->ksg_idlemask); 688123433Sjeff if (cpu) 689123433Sjeff ksg->ksg_idlemask &= ~(1 << (cpu - 1)); 690123433Sjeff } 691123433Sjeff /* 692123433Sjeff * Now that we've found an idle CPU, migrate the thread. 693123433Sjeff */ 694123433Sjeff if (cpu) { 695123433Sjeff cpu--; 696123433Sjeff ke->ke_cpu = cpu; 697123433Sjeff ke->ke_runq = NULL; 698123433Sjeff kseq_notify(ke, cpu); 699123433Sjeff return (1); 700123433Sjeff } 701123433Sjeff return (0); 702123433Sjeff} 703123433Sjeff 704121790Sjeff#endif /* SMP */ 705121790Sjeff 706117326Sjeff/* 707121790Sjeff * Pick the highest priority task we have and return it. 708117326Sjeff */ 709117326Sjeff 710121790Sjeffstatic struct kse * 711121790Sjeffkseq_choose(struct kseq *kseq) 712110267Sjeff{ 713110267Sjeff struct kse *ke; 714110267Sjeff struct runq *swap; 715110267Sjeff 716115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 717113357Sjeff swap = NULL; 718112994Sjeff 719113357Sjeff for (;;) { 720113357Sjeff ke = runq_choose(kseq->ksq_curr); 721113357Sjeff if (ke == NULL) { 722113357Sjeff /* 723113357Sjeff * We already swaped once and didn't get anywhere. 724113357Sjeff */ 725113357Sjeff if (swap) 726113357Sjeff break; 727113357Sjeff swap = kseq->ksq_curr; 728113357Sjeff kseq->ksq_curr = kseq->ksq_next; 729113357Sjeff kseq->ksq_next = swap; 730113357Sjeff continue; 731113357Sjeff } 732113357Sjeff /* 733113357Sjeff * If we encounter a slice of 0 the kse is in a 734113357Sjeff * TIMESHARE kse group and its nice was too far out 735113357Sjeff * of the range that receives slices. 736113357Sjeff */ 737121790Sjeff if (ke->ke_slice == 0) { 738113357Sjeff runq_remove(ke->ke_runq, ke); 739113357Sjeff sched_slice(ke); 740113357Sjeff ke->ke_runq = kseq->ksq_next; 741113357Sjeff runq_add(ke->ke_runq, ke); 742113357Sjeff continue; 743113357Sjeff } 744113357Sjeff return (ke); 745110267Sjeff } 746110267Sjeff 747113357Sjeff return (runq_choose(&kseq->ksq_idle)); 748110267Sjeff} 749110267Sjeff 750109864Sjeffstatic void 751110028Sjeffkseq_setup(struct kseq *kseq) 752110028Sjeff{ 753113357Sjeff runq_init(&kseq->ksq_timeshare[0]); 754113357Sjeff runq_init(&kseq->ksq_timeshare[1]); 755112994Sjeff runq_init(&kseq->ksq_idle); 756113357Sjeff kseq->ksq_curr = &kseq->ksq_timeshare[0]; 757113357Sjeff kseq->ksq_next = &kseq->ksq_timeshare[1]; 758113660Sjeff kseq->ksq_load = 0; 759121896Sjeff kseq->ksq_load_timeshare = 0; 760110028Sjeff} 761110028Sjeff 762110028Sjeffstatic void 763109864Sjeffsched_setup(void *dummy) 764109864Sjeff{ 765117313Sjeff#ifdef SMP 766109864Sjeff int i; 767117313Sjeff#endif 768109864Sjeff 769116946Sjeff slice_min = (hz/100); /* 10ms */ 770116946Sjeff slice_max = (hz/7); /* ~140ms */ 771111857Sjeff 772117237Sjeff#ifdef SMP 773123433Sjeff /* 774123433Sjeff * Initialize the kseqs. 775123433Sjeff */ 776123433Sjeff for (i = 0; i < MAXCPU; i++) { 777123433Sjeff struct kseq *ksq; 778123433Sjeff 779123433Sjeff ksq = &kseq_cpu[i]; 780123433Sjeff ksq->ksq_assigned = NULL; 781123433Sjeff kseq_setup(&kseq_cpu[i]); 782123433Sjeff } 783117237Sjeff if (smp_topology == NULL) { 784123433Sjeff struct kseq_group *ksg; 785123433Sjeff struct kseq *ksq; 786123433Sjeff 787117237Sjeff for (i = 0; i < MAXCPU; i++) { 788123433Sjeff ksq = &kseq_cpu[i]; 789123433Sjeff ksg = &kseq_groups[i]; 790123433Sjeff /* 791123433Sjeff * Setup a kse group with one member. 
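		 * With one cpu per group the intra-group stealing in
		 * kseq_idled() never applies, so idle cpus rely on the
		 * global kseq_idle mask and on sched_balance().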
792123433Sjeff */ 793123433Sjeff ksq->ksq_transferable = 0; 794123433Sjeff ksq->ksq_group = ksg; 795123433Sjeff ksg->ksg_cpus = 1; 796123433Sjeff ksg->ksg_idlemask = 0; 797123433Sjeff ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 798123433Sjeff ksg->ksg_transferable = 0; 799123433Sjeff LIST_INIT(&ksg->ksg_members); 800123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 801117237Sjeff } 802117237Sjeff } else { 803123433Sjeff struct kseq_group *ksg; 804123433Sjeff struct cpu_group *cg; 805117237Sjeff int j; 806113357Sjeff 807117237Sjeff for (i = 0; i < smp_topology->ct_count; i++) { 808117237Sjeff cg = &smp_topology->ct_group[i]; 809123433Sjeff ksg = &kseq_groups[i]; 810123433Sjeff /* 811123433Sjeff * Initialize the group. 812123433Sjeff */ 813123433Sjeff ksg->ksg_idlemask = 0; 814123433Sjeff ksg->ksg_transferable = 0; 815123433Sjeff ksg->ksg_cpus = cg->cg_count; 816123433Sjeff ksg->ksg_cpumask = cg->cg_mask; 817123433Sjeff LIST_INIT(&ksg->ksg_members); 818123433Sjeff /* 819123433Sjeff * Find all of the group members and add them. 820123433Sjeff */ 821123433Sjeff for (j = 0; j < MAXCPU; j++) { 822123433Sjeff if ((cg->cg_mask & (1 << j)) != 0) { 823123433Sjeff if (ksg->ksg_mask == 0) 824123433Sjeff ksg->ksg_mask = 1 << j; 825123433Sjeff kseq_cpu[j].ksq_transferable = 0; 826123433Sjeff kseq_cpu[j].ksq_group = ksg; 827123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, 828123433Sjeff &kseq_cpu[j], ksq_siblings); 829123433Sjeff } 830123433Sjeff } 831117237Sjeff } 832117237Sjeff } 833119137Ssam callout_init(&kseq_lb_callout, CALLOUT_MPSAFE); 834122744Sjeff sched_balance(NULL); 835117237Sjeff#else 836117237Sjeff kseq_setup(KSEQ_SELF()); 837116069Sjeff#endif 838117237Sjeff mtx_lock_spin(&sched_lock); 839122744Sjeff kseq_load_add(KSEQ_SELF(), &kse0); 840117237Sjeff mtx_unlock_spin(&sched_lock); 841109864Sjeff} 842109864Sjeff 843109864Sjeff/* 844109864Sjeff * Scale the scheduling priority according to the "interactivity" of this 845109864Sjeff * process. 846109864Sjeff */ 847113357Sjeffstatic void 848109864Sjeffsched_priority(struct ksegrp *kg) 849109864Sjeff{ 850109864Sjeff int pri; 851109864Sjeff 852109864Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 853113357Sjeff return; 854109864Sjeff 855113357Sjeff pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 856111857Sjeff pri += SCHED_PRI_BASE; 857109864Sjeff pri += kg->kg_nice; 858109864Sjeff 859109864Sjeff if (pri > PRI_MAX_TIMESHARE) 860109864Sjeff pri = PRI_MAX_TIMESHARE; 861109864Sjeff else if (pri < PRI_MIN_TIMESHARE) 862109864Sjeff pri = PRI_MIN_TIMESHARE; 863109864Sjeff 864109864Sjeff kg->kg_user_pri = pri; 865109864Sjeff 866113357Sjeff return; 867109864Sjeff} 868109864Sjeff 869109864Sjeff/* 870112966Sjeff * Calculate a time slice based on the properties of the kseg and the runq 871112994Sjeff * that we're on. This is only for PRI_TIMESHARE ksegrps. 872109864Sjeff */ 873112966Sjeffstatic void 874112966Sjeffsched_slice(struct kse *ke) 875109864Sjeff{ 876113357Sjeff struct kseq *kseq; 877112966Sjeff struct ksegrp *kg; 878109864Sjeff 879112966Sjeff kg = ke->ke_ksegrp; 880113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 881109864Sjeff 882112966Sjeff /* 883112966Sjeff * Rationale: 884112966Sjeff * KSEs in interactive ksegs get the minimum slice so that we 885112966Sjeff * quickly notice if it abuses its advantage. 886112966Sjeff * 887112966Sjeff * KSEs in non-interactive ksegs are assigned a slice that is 888112966Sjeff * based on the ksegs nice value relative to the least nice kseg 889112966Sjeff * on the run queue for this cpu. 
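	 *
	 * Illustrative numbers (assuming hz = 1000, so sched_setup() picks
	 * slice_min = 10 and slice_max = 142): SCHED_SLICE_RANGE is 133 and
	 * SCHED_SLICE_NTHRESH is 19.  With the least nice kseg at -5, a
	 * kseg at nice 0 is 5 points away and gets 142 - (5 * 133) / 19 =
	 * 107 ticks; a kseg at nice +15 is 20 points away, outside the
	 * window, and gets a slice of 0.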
890112966Sjeff * 891112966Sjeff * If the KSE is less nice than all others it gets the maximum 892112966Sjeff * slice and other KSEs will adjust their slice relative to 893112966Sjeff * this when they first expire. 894112966Sjeff * 895112966Sjeff * There is 20 point window that starts relative to the least 896112966Sjeff * nice kse on the run queue. Slice size is determined by 897112966Sjeff * the kse distance from the last nice ksegrp. 898112966Sjeff * 899121871Sjeff * If the kse is outside of the window it will get no slice 900121871Sjeff * and will be reevaluated each time it is selected on the 901121871Sjeff * run queue. The exception to this is nice 0 ksegs when 902121871Sjeff * a nice -20 is running. They are always granted a minimum 903121871Sjeff * slice. 904112966Sjeff */ 905113357Sjeff if (!SCHED_INTERACTIVE(kg)) { 906112966Sjeff int nice; 907112966Sjeff 908113357Sjeff nice = kg->kg_nice + (0 - kseq->ksq_nicemin); 909121896Sjeff if (kseq->ksq_load_timeshare == 0 || 910113357Sjeff kg->kg_nice < kseq->ksq_nicemin) 911112966Sjeff ke->ke_slice = SCHED_SLICE_MAX; 912121871Sjeff else if (nice <= SCHED_SLICE_NTHRESH) 913112966Sjeff ke->ke_slice = SCHED_SLICE_NICE(nice); 914121871Sjeff else if (kg->kg_nice == 0) 915121871Sjeff ke->ke_slice = SCHED_SLICE_MIN; 916112966Sjeff else 917112966Sjeff ke->ke_slice = 0; 918112966Sjeff } else 919112966Sjeff ke->ke_slice = SCHED_SLICE_MIN; 920112966Sjeff 921113357Sjeff CTR6(KTR_ULE, 922113357Sjeff "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", 923113357Sjeff ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, 924121896Sjeff kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg)); 925113357Sjeff 926112966Sjeff return; 927109864Sjeff} 928109864Sjeff 929121868Sjeff/* 930121868Sjeff * This routine enforces a maximum limit on the amount of scheduling history 931121868Sjeff * kept. It is called after either the slptime or runtime is adjusted. 932121868Sjeff * This routine will not operate correctly when slp or run times have been 933121868Sjeff * adjusted to more than double their maximum. 934121868Sjeff */ 935116463Sjeffstatic void 936116463Sjeffsched_interact_update(struct ksegrp *kg) 937116463Sjeff{ 938121868Sjeff int sum; 939121605Sjeff 940121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 941121868Sjeff if (sum < SCHED_SLP_RUN_MAX) 942121868Sjeff return; 943121868Sjeff /* 944121868Sjeff * If we have exceeded by more than 1/5th then the algorithm below 945121868Sjeff * will not bring us back into range. 
Dividing by two here forces 946121868Sjeff * us into the range of [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 947121868Sjeff */ 948121868Sjeff if (sum > (SCHED_INTERACT_MAX / 5) * 6) { 949121868Sjeff kg->kg_runtime /= 2; 950121868Sjeff kg->kg_slptime /= 2; 951121868Sjeff return; 952116463Sjeff } 953121868Sjeff kg->kg_runtime = (kg->kg_runtime / 5) * 4; 954121868Sjeff kg->kg_slptime = (kg->kg_slptime / 5) * 4; 955116463Sjeff} 956116463Sjeff 957121868Sjeffstatic void 958121868Sjeffsched_interact_fork(struct ksegrp *kg) 959121868Sjeff{ 960121868Sjeff int ratio; 961121868Sjeff int sum; 962121868Sjeff 963121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 964121868Sjeff if (sum > SCHED_SLP_RUN_FORK) { 965121868Sjeff ratio = sum / SCHED_SLP_RUN_FORK; 966121868Sjeff kg->kg_runtime /= ratio; 967121868Sjeff kg->kg_slptime /= ratio; 968121868Sjeff } 969121868Sjeff} 970121868Sjeff 971111857Sjeffstatic int 972111857Sjeffsched_interact_score(struct ksegrp *kg) 973111857Sjeff{ 974116365Sjeff int div; 975111857Sjeff 976111857Sjeff if (kg->kg_runtime > kg->kg_slptime) { 977116365Sjeff div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 978116365Sjeff return (SCHED_INTERACT_HALF + 979116365Sjeff (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 980116365Sjeff } if (kg->kg_slptime > kg->kg_runtime) { 981116365Sjeff div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 982116365Sjeff return (kg->kg_runtime / div); 983111857Sjeff } 984111857Sjeff 985116365Sjeff /* 986116365Sjeff * This can happen if slptime and runtime are 0. 987116365Sjeff */ 988116365Sjeff return (0); 989111857Sjeff 990111857Sjeff} 991111857Sjeff 992113357Sjeff/* 993113357Sjeff * This is only somewhat accurate since given many processes of the same 994113357Sjeff * priority they will switch when their slices run out, which will be 995113357Sjeff * at most SCHED_SLICE_MAX. 996113357Sjeff */ 997109864Sjeffint 998109864Sjeffsched_rr_interval(void) 999109864Sjeff{ 1000109864Sjeff return (SCHED_SLICE_MAX); 1001109864Sjeff} 1002109864Sjeff 1003121790Sjeffstatic void 1004109864Sjeffsched_pctcpu_update(struct kse *ke) 1005109864Sjeff{ 1006109864Sjeff /* 1007109864Sjeff * Adjust counters and watermark for pctcpu calc. 1008116365Sjeff */ 1009120272Sjeff if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1010120272Sjeff /* 1011120272Sjeff * Shift the tick count out so that the divide doesn't 1012120272Sjeff * round away our results. 1013120272Sjeff */ 1014120272Sjeff ke->ke_ticks <<= 10; 1015120272Sjeff ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1016120272Sjeff SCHED_CPU_TICKS; 1017120272Sjeff ke->ke_ticks >>= 10; 1018120272Sjeff } else 1019120272Sjeff ke->ke_ticks = 0; 1020109864Sjeff ke->ke_ltick = ticks; 1021109864Sjeff ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1022109864Sjeff} 1023109864Sjeff 1024109864Sjeffvoid 1025109864Sjeffsched_prio(struct thread *td, u_char prio) 1026109864Sjeff{ 1027121605Sjeff struct kse *ke; 1028109864Sjeff 1029121605Sjeff ke = td->td_kse; 1030109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1031109864Sjeff if (TD_ON_RUNQ(td)) { 1032121605Sjeff /* 1033121605Sjeff * If the priority has been elevated due to priority 1034121605Sjeff * propagation, we may have to move ourselves to a new 1035121605Sjeff * queue. We still call adjustrunqueue below in case kse 1036121605Sjeff * needs to fix things up. 
1037121605Sjeff */ 1038121872Sjeff if (prio < td->td_priority && ke && 1039121872Sjeff (ke->ke_flags & KEF_ASSIGNED) == 0 && 1040121790Sjeff ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { 1041121605Sjeff runq_remove(ke->ke_runq, ke); 1042121605Sjeff ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; 1043121605Sjeff runq_add(ke->ke_runq, ke); 1044121605Sjeff } 1045119488Sdavidxu adjustrunqueue(td, prio); 1046121605Sjeff } else 1047119488Sdavidxu td->td_priority = prio; 1048109864Sjeff} 1049109864Sjeff 1050109864Sjeffvoid 1051121128Sjeffsched_switch(struct thread *td) 1052109864Sjeff{ 1053121128Sjeff struct thread *newtd; 1054109864Sjeff struct kse *ke; 1055109864Sjeff 1056109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1057109864Sjeff 1058109864Sjeff ke = td->td_kse; 1059109864Sjeff 1060109864Sjeff td->td_last_kse = ke; 1061113339Sjulian td->td_lastcpu = td->td_oncpu; 1062113339Sjulian td->td_oncpu = NOCPU; 1063111032Sjulian td->td_flags &= ~TDF_NEEDRESCHED; 1064109864Sjeff 1065109864Sjeff if (TD_IS_RUNNING(td)) { 1066119488Sdavidxu if (td->td_proc->p_flag & P_SA) { 1067122744Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1068119488Sdavidxu setrunqueue(td); 1069123433Sjeff } else 1070122744Sjeff kseq_runq_add(KSEQ_SELF(), ke); 1071121146Sjeff } else { 1072121146Sjeff if (ke->ke_runq) 1073122744Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1074121146Sjeff /* 1075121146Sjeff * We will not be on the run queue. So we must be 1076121146Sjeff * sleeping or similar. 1077121146Sjeff */ 1078121146Sjeff if (td->td_proc->p_flag & P_SA) 1079121146Sjeff kse_reassign(ke); 1080121146Sjeff } 1081121128Sjeff newtd = choosethread(); 1082121128Sjeff if (td != newtd) 1083121128Sjeff cpu_switch(td, newtd); 1084121128Sjeff sched_lock.mtx_lock = (uintptr_t)td; 1085109864Sjeff 1086113339Sjulian td->td_oncpu = PCPU_GET(cpuid); 1087109864Sjeff} 1088109864Sjeff 1089109864Sjeffvoid 1090109864Sjeffsched_nice(struct ksegrp *kg, int nice) 1091109864Sjeff{ 1092113357Sjeff struct kse *ke; 1093109864Sjeff struct thread *td; 1094113357Sjeff struct kseq *kseq; 1095109864Sjeff 1096113873Sjhb PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); 1097113873Sjhb mtx_assert(&sched_lock, MA_OWNED); 1098113357Sjeff /* 1099113357Sjeff * We need to adjust the nice counts for running KSEs. 1100113357Sjeff */ 1101113357Sjeff if (kg->kg_pri_class == PRI_TIMESHARE) 1102113357Sjeff FOREACH_KSE_IN_GROUP(kg, ke) { 1103116500Sjeff if (ke->ke_runq == NULL) 1104113357Sjeff continue; 1105113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1106113357Sjeff kseq_nice_rem(kseq, kg->kg_nice); 1107113357Sjeff kseq_nice_add(kseq, nice); 1108113357Sjeff } 1109109864Sjeff kg->kg_nice = nice; 1110109864Sjeff sched_priority(kg); 1111113357Sjeff FOREACH_THREAD_IN_GROUP(kg, td) 1112111032Sjulian td->td_flags |= TDF_NEEDRESCHED; 1113109864Sjeff} 1114109864Sjeff 1115109864Sjeffvoid 1116109864Sjeffsched_sleep(struct thread *td, u_char prio) 1117109864Sjeff{ 1118109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1119109864Sjeff 1120109864Sjeff td->td_slptime = ticks; 1121109864Sjeff td->td_priority = prio; 1122109864Sjeff 1123113357Sjeff CTR2(KTR_ULE, "sleep kse %p (tick: %d)", 1124113357Sjeff td->td_kse, td->td_slptime); 1125109864Sjeff} 1126109864Sjeff 1127109864Sjeffvoid 1128109864Sjeffsched_wakeup(struct thread *td) 1129109864Sjeff{ 1130109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1131109864Sjeff 1132109864Sjeff /* 1133109864Sjeff * Let the kseg know how long we slept for. This is because process 1134109864Sjeff * interactivity behavior is modeled in the kseg. 
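	 *
	 * As a rough illustration of sched_interact_score(): a kseg that
	 * has slept about three ticks for every tick it has run scores
	 * roughly 50/3 ~= 16, below SCHED_INTERACT_THRESH (30), so it is
	 * considered interactive; one that runs three times as long as it
	 * sleeps scores roughly 100 - 50/3 ~= 84.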
1135109864Sjeff */ 1136111788Sjeff if (td->td_slptime) { 1137111788Sjeff struct ksegrp *kg; 1138113357Sjeff int hzticks; 1139109864Sjeff 1140111788Sjeff kg = td->td_ksegrp; 1141121868Sjeff hzticks = (ticks - td->td_slptime) << 10; 1142121868Sjeff if (hzticks >= SCHED_SLP_RUN_MAX) { 1143121868Sjeff kg->kg_slptime = SCHED_SLP_RUN_MAX; 1144121868Sjeff kg->kg_runtime = 1; 1145121868Sjeff } else { 1146121868Sjeff kg->kg_slptime += hzticks; 1147121868Sjeff sched_interact_update(kg); 1148121868Sjeff } 1149111788Sjeff sched_priority(kg); 1150116463Sjeff if (td->td_kse) 1151116463Sjeff sched_slice(td->td_kse); 1152113357Sjeff CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", 1153113357Sjeff td->td_kse, hzticks); 1154111788Sjeff td->td_slptime = 0; 1155109864Sjeff } 1156109864Sjeff setrunqueue(td); 1157109864Sjeff} 1158109864Sjeff 1159109864Sjeff/* 1160109864Sjeff * Penalize the parent for creating a new child and initialize the child's 1161109864Sjeff * priority. 1162109864Sjeff */ 1163109864Sjeffvoid 1164113357Sjeffsched_fork(struct proc *p, struct proc *p1) 1165109864Sjeff{ 1166109864Sjeff 1167109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1168109864Sjeff 1169113357Sjeff sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); 1170113357Sjeff sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); 1171113357Sjeff sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); 1172113357Sjeff} 1173113357Sjeff 1174113357Sjeffvoid 1175113357Sjeffsched_fork_kse(struct kse *ke, struct kse *child) 1176113357Sjeff{ 1177113923Sjhb 1178116365Sjeff child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 1179122847Sjeff child->ke_cpu = ke->ke_cpu; 1180113357Sjeff child->ke_runq = NULL; 1181113357Sjeff 1182121051Sjeff /* Grab our parents cpu estimation information. 
*/ 1183121051Sjeff child->ke_ticks = ke->ke_ticks; 1184121051Sjeff child->ke_ltick = ke->ke_ltick; 1185121051Sjeff child->ke_ftick = ke->ke_ftick; 1186113357Sjeff} 1187113357Sjeff 1188113357Sjeffvoid 1189113357Sjeffsched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) 1190113357Sjeff{ 1191113923Sjhb PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); 1192116365Sjeff 1193121868Sjeff child->kg_slptime = kg->kg_slptime; 1194121868Sjeff child->kg_runtime = kg->kg_runtime; 1195121868Sjeff child->kg_user_pri = kg->kg_user_pri; 1196121868Sjeff child->kg_nice = kg->kg_nice; 1197121868Sjeff sched_interact_fork(child); 1198116463Sjeff kg->kg_runtime += tickincr << 10; 1199116463Sjeff sched_interact_update(kg); 1200113357Sjeff 1201121868Sjeff CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)", 1202121868Sjeff kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime, 1203121868Sjeff child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime); 1204113357Sjeff} 1205109864Sjeff 1206113357Sjeffvoid 1207113357Sjeffsched_fork_thread(struct thread *td, struct thread *child) 1208113357Sjeff{ 1209113357Sjeff} 1210113357Sjeff 1211113357Sjeffvoid 1212113357Sjeffsched_class(struct ksegrp *kg, int class) 1213113357Sjeff{ 1214113357Sjeff struct kseq *kseq; 1215113357Sjeff struct kse *ke; 1216121896Sjeff int nclass; 1217121896Sjeff int oclass; 1218113357Sjeff 1219113923Sjhb mtx_assert(&sched_lock, MA_OWNED); 1220113357Sjeff if (kg->kg_pri_class == class) 1221113357Sjeff return; 1222113357Sjeff 1223121896Sjeff nclass = PRI_BASE(class); 1224121896Sjeff oclass = PRI_BASE(kg->kg_pri_class); 1225113357Sjeff FOREACH_KSE_IN_GROUP(kg, ke) { 1226113357Sjeff if (ke->ke_state != KES_ONRUNQ && 1227113357Sjeff ke->ke_state != KES_THREAD) 1228113357Sjeff continue; 1229113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1230113357Sjeff 1231121896Sjeff#ifdef SMP 1232122744Sjeff /* 1233122744Sjeff * On SMP if we're on the RUNQ we must adjust the transferable 1234122744Sjeff * count because could be changing to or from an interrupt 1235122744Sjeff * class. 1236122744Sjeff */ 1237122744Sjeff if (ke->ke_state == KES_ONRUNQ) { 1238123433Sjeff if (KSE_CAN_MIGRATE(ke, oclass)) { 1239123433Sjeff kseq->ksq_transferable--; 1240123433Sjeff kseq->ksq_group->ksg_transferable--; 1241123433Sjeff } 1242123433Sjeff if (KSE_CAN_MIGRATE(ke, nclass)) { 1243123433Sjeff kseq->ksq_transferable++; 1244123433Sjeff kseq->ksq_group->ksg_transferable++; 1245123433Sjeff } 1246122744Sjeff } 1247121896Sjeff#endif 1248122744Sjeff if (oclass == PRI_TIMESHARE) { 1249121896Sjeff kseq->ksq_load_timeshare--; 1250122744Sjeff kseq_nice_rem(kseq, kg->kg_nice); 1251122744Sjeff } 1252122744Sjeff if (nclass == PRI_TIMESHARE) { 1253121896Sjeff kseq->ksq_load_timeshare++; 1254113357Sjeff kseq_nice_add(kseq, kg->kg_nice); 1255122744Sjeff } 1256109970Sjeff } 1257109970Sjeff 1258113357Sjeff kg->kg_pri_class = class; 1259109864Sjeff} 1260109864Sjeff 1261109864Sjeff/* 1262109864Sjeff * Return some of the child's priority and interactivity to the parent. 
1263109864Sjeff */ 1264109864Sjeffvoid 1265113357Sjeffsched_exit(struct proc *p, struct proc *child) 1266109864Sjeff{ 1267109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1268113372Sjeff sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); 1269116365Sjeff sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); 1270109864Sjeff} 1271109864Sjeff 1272109864Sjeffvoid 1273113372Sjeffsched_exit_kse(struct kse *ke, struct kse *child) 1274113372Sjeff{ 1275122744Sjeff kseq_load_rem(KSEQ_CPU(child->ke_cpu), child); 1276113372Sjeff} 1277113372Sjeff 1278113372Sjeffvoid 1279113372Sjeffsched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) 1280113372Sjeff{ 1281116463Sjeff /* kg->kg_slptime += child->kg_slptime; */ 1282116365Sjeff kg->kg_runtime += child->kg_runtime; 1283116463Sjeff sched_interact_update(kg); 1284113372Sjeff} 1285113372Sjeff 1286113372Sjeffvoid 1287113372Sjeffsched_exit_thread(struct thread *td, struct thread *child) 1288113372Sjeff{ 1289113372Sjeff} 1290113372Sjeff 1291113372Sjeffvoid 1292121127Sjeffsched_clock(struct thread *td) 1293109864Sjeff{ 1294113357Sjeff struct kseq *kseq; 1295113357Sjeff struct ksegrp *kg; 1296121127Sjeff struct kse *ke; 1297109864Sjeff 1298113357Sjeff /* 1299113357Sjeff * sched_setup() apparently happens prior to stathz being set. We 1300113357Sjeff * need to resolve the timers earlier in the boot so we can avoid 1301113357Sjeff * calculating this here. 1302113357Sjeff */ 1303113357Sjeff if (realstathz == 0) { 1304113357Sjeff realstathz = stathz ? stathz : hz; 1305113357Sjeff tickincr = hz / realstathz; 1306113357Sjeff /* 1307113357Sjeff * XXX This does not work for values of stathz that are much 1308113357Sjeff * larger than hz. 1309113357Sjeff */ 1310113357Sjeff if (tickincr == 0) 1311113357Sjeff tickincr = 1; 1312113357Sjeff } 1313109864Sjeff 1314121127Sjeff ke = td->td_kse; 1315113357Sjeff kg = ke->ke_ksegrp; 1316109864Sjeff 1317110028Sjeff mtx_assert(&sched_lock, MA_OWNED); 1318110028Sjeff KASSERT((td != NULL), ("schedclock: null thread pointer")); 1319110028Sjeff 1320110028Sjeff /* Adjust ticks for pctcpu */ 1321111793Sjeff ke->ke_ticks++; 1322109971Sjeff ke->ke_ltick = ticks; 1323112994Sjeff 1324109971Sjeff /* Go up to one second beyond our max and then trim back down */ 1325109971Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1326109971Sjeff sched_pctcpu_update(ke); 1327109971Sjeff 1328114496Sjulian if (td->td_flags & TDF_IDLETD) 1329109864Sjeff return; 1330110028Sjeff 1331113357Sjeff CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", 1332113357Sjeff ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); 1333110028Sjeff /* 1334113357Sjeff * We only do slicing code for TIMESHARE ksegrps. 1335113357Sjeff */ 1336113357Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 1337113357Sjeff return; 1338113357Sjeff /* 1339110645Sjeff * We used a tick charge it to the ksegrp so that we can compute our 1340113357Sjeff * interactivity. 1341109864Sjeff */ 1342113357Sjeff kg->kg_runtime += tickincr << 10; 1343116463Sjeff sched_interact_update(kg); 1344110645Sjeff 1345109864Sjeff /* 1346109864Sjeff * We used up one time slice. 1347109864Sjeff */ 1348122847Sjeff if (--ke->ke_slice > 0) 1349113357Sjeff return; 1350109864Sjeff /* 1351113357Sjeff * We're out of time, recompute priorities and requeue. 
1352109864Sjeff */ 1353122847Sjeff kseq = KSEQ_SELF(); 1354122744Sjeff kseq_load_rem(kseq, ke); 1355113357Sjeff sched_priority(kg); 1356113357Sjeff sched_slice(ke); 1357113357Sjeff if (SCHED_CURR(kg, ke)) 1358113357Sjeff ke->ke_runq = kseq->ksq_curr; 1359113357Sjeff else 1360113357Sjeff ke->ke_runq = kseq->ksq_next; 1361122744Sjeff kseq_load_add(kseq, ke); 1362113357Sjeff td->td_flags |= TDF_NEEDRESCHED; 1363109864Sjeff} 1364109864Sjeff 1365109864Sjeffint 1366109864Sjeffsched_runnable(void) 1367109864Sjeff{ 1368109864Sjeff struct kseq *kseq; 1369115998Sjeff int load; 1370109864Sjeff 1371115998Sjeff load = 1; 1372115998Sjeff 1373110028Sjeff kseq = KSEQ_SELF(); 1374121790Sjeff#ifdef SMP 1375122094Sjeff if (kseq->ksq_assigned) { 1376122094Sjeff mtx_lock_spin(&sched_lock); 1377121790Sjeff kseq_assign(kseq); 1378122094Sjeff mtx_unlock_spin(&sched_lock); 1379122094Sjeff } 1380121790Sjeff#endif 1381121605Sjeff if ((curthread->td_flags & TDF_IDLETD) != 0) { 1382121605Sjeff if (kseq->ksq_load > 0) 1383121605Sjeff goto out; 1384121605Sjeff } else 1385121605Sjeff if (kseq->ksq_load - 1 > 0) 1386121605Sjeff goto out; 1387115998Sjeff load = 0; 1388115998Sjeffout: 1389115998Sjeff return (load); 1390109864Sjeff} 1391109864Sjeff 1392109864Sjeffvoid 1393109864Sjeffsched_userret(struct thread *td) 1394109864Sjeff{ 1395109864Sjeff struct ksegrp *kg; 1396121605Sjeff 1397121605Sjeff kg = td->td_ksegrp; 1398109864Sjeff 1399109864Sjeff if (td->td_priority != kg->kg_user_pri) { 1400109864Sjeff mtx_lock_spin(&sched_lock); 1401109864Sjeff td->td_priority = kg->kg_user_pri; 1402109864Sjeff mtx_unlock_spin(&sched_lock); 1403109864Sjeff } 1404109864Sjeff} 1405109864Sjeff 1406109864Sjeffstruct kse * 1407109970Sjeffsched_choose(void) 1408109970Sjeff{ 1409110028Sjeff struct kseq *kseq; 1410109970Sjeff struct kse *ke; 1411109970Sjeff 1412115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 1413121790Sjeff kseq = KSEQ_SELF(); 1414113357Sjeff#ifdef SMP 1415123433Sjeffrestart: 1416121790Sjeff if (kseq->ksq_assigned) 1417121790Sjeff kseq_assign(kseq); 1418113357Sjeff#endif 1419121790Sjeff ke = kseq_choose(kseq); 1420109864Sjeff if (ke) { 1421121790Sjeff#ifdef SMP 1422121790Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1423123433Sjeff if (kseq_idled(kseq) == 0) 1424123433Sjeff goto restart; 1425121790Sjeff#endif 1426122744Sjeff kseq_runq_rem(kseq, ke); 1427109864Sjeff ke->ke_state = KES_THREAD; 1428112966Sjeff 1429113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1430113357Sjeff CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1431113357Sjeff ke, ke->ke_runq, ke->ke_slice, 1432113357Sjeff ke->ke_thread->td_priority); 1433113357Sjeff } 1434113357Sjeff return (ke); 1435109864Sjeff } 1436109970Sjeff#ifdef SMP 1437123433Sjeff if (kseq_idled(kseq) == 0) 1438123433Sjeff goto restart; 1439109970Sjeff#endif 1440113357Sjeff return (NULL); 1441109864Sjeff} 1442109864Sjeff 1443109864Sjeffvoid 1444121127Sjeffsched_add(struct thread *td) 1445109864Sjeff{ 1446110267Sjeff struct kseq *kseq; 1447113357Sjeff struct ksegrp *kg; 1448121127Sjeff struct kse *ke; 1449121790Sjeff int class; 1450109864Sjeff 1451121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 1452121127Sjeff ke = td->td_kse; 1453121127Sjeff kg = td->td_ksegrp; 1454121790Sjeff if (ke->ke_flags & KEF_ASSIGNED) 1455121790Sjeff return; 1456121790Sjeff kseq = KSEQ_SELF(); 1457110267Sjeff KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); 1458109864Sjeff KASSERT((ke->ke_thread->td_kse != NULL), 1459110267Sjeff ("sched_add: No KSE on thread")); 
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	class = PRI_BASE(kg->kg_pri_class);
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority propagation.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	if (ke->ke_cpu != PCPU_GET(cpuid)) {
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If there are any idle groups, give them our extra load.  The
	 * threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class))
		if (kseq_transfer(kseq, ke, class))
			return;
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
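		/*
		 * Once this cpu is carrying load it must no longer be
		 * advertised as idle, or other cpus would keep trying to
		 * push work at it.
		 */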
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	}
#endif
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke);
	kseq_load_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;
	/*
	 * It is safe to just return here because sched_rem() is only ever
	 * used in places where we're immediately going to add the
	 * kse back on again.  In that case it'll be added with the correct
	 * thread and priority when the caller drops the sched_lock.
	 */
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rticks per second? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	ke->ke_cpu = cpu;
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
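	/*
	 * kseq_notify() above has queued the kse for the target cpu; the
	 * switch below is charged as a voluntary context switch and we do
	 * not run again until that cpu picks us up.
	 */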
	td->td_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}
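
/*
 * The fixed-point arithmetic in sched_pctcpu() above is easy to misread.
 * The stand-alone, user-space sketch below (not kernel code; the FSHIFT
 * value and the sample rtick/realstathz numbers are assumptions chosen
 * purely for illustration) evaluates the same expression so the scaling
 * can be checked by hand.  It is kept under #if 0 so it never builds as
 * part of this file.
 */
#if 0
#include <stdio.h>

#define FSHIFT	11			/* assumed; 11 is the sys/param.h default */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	int realstathz = 128;		/* assumed stat clock frequency */
	int rtick = 64;			/* assumed ticks charged in the last second */
	unsigned int pctcpu;

	/* Same expression as sched_pctcpu(): a fraction of FSCALE. */
	pctcpu = (FSCALE * ((FSCALE * rtick) / realstathz)) >> FSHIFT;

	/* 64 of 128 ticks -> pctcpu = 1024 = FSCALE / 2, i.e. 50% cpu. */
	printf("pctcpu = %u / %d (%.1f%%)\n",
	    pctcpu, FSCALE, 100.0 * pctcpu / FSCALE);
	return (0);
}
#endif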