sys/kern/sched_ule.c, revision 139453
1109864Sjeff/*- 2113357Sjeff * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3109864Sjeff * All rights reserved. 4109864Sjeff * 5109864Sjeff * Redistribution and use in source and binary forms, with or without 6109864Sjeff * modification, are permitted provided that the following conditions 7109864Sjeff * are met: 8109864Sjeff * 1. Redistributions of source code must retain the above copyright 9109864Sjeff * notice unmodified, this list of conditions, and the following 10109864Sjeff * disclaimer. 11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright 12109864Sjeff * notice, this list of conditions and the following disclaimer in the 13109864Sjeff * documentation and/or other materials provided with the distribution. 14109864Sjeff * 15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25109864Sjeff */ 26109864Sjeff 27116182Sobrien#include <sys/cdefs.h> 28116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 139453 2004-12-30 20:52:44Z jhb $"); 29116182Sobrien 30134649Sscottl#include <opt_sched.h> 31134649Sscottl 32134791Sjulian#define kse td_sched 33134791Sjulian 34109864Sjeff#include <sys/param.h> 35109864Sjeff#include <sys/systm.h> 36131929Smarcel#include <sys/kdb.h> 37109864Sjeff#include <sys/kernel.h> 38109864Sjeff#include <sys/ktr.h> 39109864Sjeff#include <sys/lock.h> 40109864Sjeff#include <sys/mutex.h> 41109864Sjeff#include <sys/proc.h> 42112966Sjeff#include <sys/resource.h> 43122038Sjeff#include <sys/resourcevar.h> 44109864Sjeff#include <sys/sched.h> 45109864Sjeff#include <sys/smp.h> 46109864Sjeff#include <sys/sx.h> 47109864Sjeff#include <sys/sysctl.h> 48109864Sjeff#include <sys/sysproto.h> 49139453Sjhb#include <sys/turnstile.h> 50109864Sjeff#include <sys/vmmeter.h> 51109864Sjeff#ifdef KTRACE 52109864Sjeff#include <sys/uio.h> 53109864Sjeff#include <sys/ktrace.h> 54109864Sjeff#endif 55109864Sjeff 56109864Sjeff#include <machine/cpu.h> 57121790Sjeff#include <machine/smp.h> 58109864Sjeff 59109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 60109864Sjeff/* XXX This is bogus compatability crap for ps */ 61109864Sjeffstatic fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 62109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 63109864Sjeff 64109864Sjeffstatic void sched_setup(void *dummy); 65109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 66109864Sjeff 67132589Sscottlstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 68113357Sjeff 69132589SscottlSYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 70132589Sscottl "Scheduler name"); 71130881Sscottl 72113357Sjeffstatic int slice_min = 1; 
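/*
 * Note: the static initializers for slice_min and slice_max here are only
 * placeholders; sched_setup() later re-derives them from hz (slice_min =
 * hz/100, roughly 10ms, and slice_max = hz/7, roughly 140ms).  Both remain
 * tunable at run time through the kern.sched.slice_min and
 * kern.sched.slice_max sysctls declared here.
 */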
73113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 74113357Sjeff 75116365Sjeffstatic int slice_max = 10; 76113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 77113357Sjeff 78111857Sjeffint realstathz; 79113357Sjeffint tickincr = 1; 80111857Sjeff 81109864Sjeff/* 82134791Sjulian * The schedulable entity that can be given a context to run. 83134791Sjulian * A process may have several of these. Probably one per processor 84134791Sjulian * but posibly a few more. In this universe they are grouped 85134791Sjulian * with a KSEG that contains the priority and niceness 86134791Sjulian * for the group. 87134791Sjulian */ 88134791Sjulianstruct kse { 89134791Sjulian TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ 90134791Sjulian int ke_flags; /* (j) KEF_* flags. */ 91134791Sjulian struct thread *ke_thread; /* (*) Active associated thread. */ 92134791Sjulian fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ 93134791Sjulian char ke_rqindex; /* (j) Run queue index. */ 94134791Sjulian enum { 95134791Sjulian KES_THREAD = 0x0, /* slaved to thread state */ 96134791Sjulian KES_ONRUNQ 97134791Sjulian } ke_state; /* (j) thread sched specific status. */ 98134791Sjulian int ke_slptime; 99134791Sjulian int ke_slice; 100134791Sjulian struct runq *ke_runq; 101134791Sjulian u_char ke_cpu; /* CPU that we have affinity for. */ 102134791Sjulian /* The following variables are only used for pctcpu calculation */ 103134791Sjulian int ke_ltick; /* Last tick that we were running on */ 104134791Sjulian int ke_ftick; /* First tick that we were running on */ 105134791Sjulian int ke_ticks; /* Tick count */ 106134791Sjulian 107134791Sjulian}; 108134791Sjulian 109134791Sjulian 110134791Sjulian#define td_kse td_sched 111134791Sjulian#define td_slptime td_kse->ke_slptime 112134791Sjulian#define ke_proc ke_thread->td_proc 113134791Sjulian#define ke_ksegrp ke_thread->td_ksegrp 114134791Sjulian 115134791Sjulian/* flags kept in ke_flags */ 116134791Sjulian#define KEF_SCHED0 0x00001 /* For scheduler-specific use. */ 117134791Sjulian#define KEF_SCHED1 0x00002 /* For scheduler-specific use. */ 118134791Sjulian#define KEF_SCHED2 0x00004 /* For scheduler-specific use. */ 119134791Sjulian#define KEF_SCHED3 0x00008 /* For scheduler-specific use. */ 120138842Sjeff#define KEF_SCHED4 0x00010 121138842Sjeff#define KEF_SCHED5 0x00020 122134791Sjulian#define KEF_DIDRUN 0x02000 /* Thread actually ran. */ 123134791Sjulian#define KEF_EXIT 0x04000 /* Thread is being killed. */ 124134791Sjulian 125134791Sjulian/* 126109864Sjeff * These datastructures are allocated within their parent datastructure but 127109864Sjeff * are scheduler specific. 128109864Sjeff */ 129109864Sjeff 130121790Sjeff#define ke_assign ke_procq.tqe_next 131109864Sjeff 132139334Sjeff#define KEF_ASSIGNED 0x0001 /* Thread is being migrated. */ 133139334Sjeff#define KEF_BOUND 0x0002 /* Thread can not migrate. */ 134139334Sjeff#define KEF_XFERABLE 0x0004 /* Thread was added as transferable. */ 135139334Sjeff#define KEF_HOLD 0x0008 /* Thread is temporarily bound. */ 136139334Sjeff#define KEF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */ 137139453Sjhb#define KEF_INTERNAL 0x0020 138121790Sjeff 139109864Sjeffstruct kg_sched { 140134791Sjulian struct thread *skg_last_assigned; /* (j) Last thread assigned to */ 141134791Sjulian /* the system scheduler */ 142110645Sjeff int skg_slptime; /* Number of ticks we vol. 
slept */ 143110645Sjeff int skg_runtime; /* Number of ticks we were running */ 144134791Sjulian int skg_avail_opennings; /* (j) Num unfilled slots in group.*/ 145134791Sjulian int skg_concurrency; /* (j) Num threads requested in group.*/ 146109864Sjeff}; 147134791Sjulian#define kg_last_assigned kg_sched->skg_last_assigned 148134791Sjulian#define kg_avail_opennings kg_sched->skg_avail_opennings 149134791Sjulian#define kg_concurrency kg_sched->skg_concurrency 150134791Sjulian#define kg_runtime kg_sched->skg_runtime 151134791Sjulian#define kg_slptime kg_sched->skg_slptime 152109864Sjeff 153136167Sjulian#define SLOT_RELEASE(kg) \ 154136167Sjuliando { \ 155136167Sjulian kg->kg_avail_opennings++; \ 156136167Sjulian CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \ 157136167Sjulian kg, \ 158136167Sjulian kg->kg_concurrency, \ 159136167Sjulian kg->kg_avail_opennings); \ 160136167Sjulian /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \ 161136167Sjulian ("slots out of whack")); */ \ 162136167Sjulian} while (0) 163109864Sjeff 164136167Sjulian#define SLOT_USE(kg) \ 165136167Sjuliando { \ 166136167Sjulian kg->kg_avail_opennings--; \ 167136167Sjulian CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \ 168136167Sjulian kg, \ 169136167Sjulian kg->kg_concurrency, \ 170136167Sjulian kg->kg_avail_opennings); \ 171136167Sjulian /*KASSERT((kg->kg_avail_opennings >= 0), \ 172136167Sjulian ("slots out of whack"));*/ \ 173136167Sjulian} while (0) 174136167Sjulian 175134791Sjulianstatic struct kse kse0; 176134791Sjulianstatic struct kg_sched kg_sched0; 177109864Sjeff 178109864Sjeff/* 179116642Sjeff * The priority is primarily determined by the interactivity score. Thus, we 180116642Sjeff * give lower(better) priorities to kse groups that use less CPU. The nice 181116642Sjeff * value is then directly added to this to allow nice to have some effect 182116642Sjeff * on latency. 183111857Sjeff * 184111857Sjeff * PRI_RANGE: Total priority range for timeshare threads. 185116642Sjeff * PRI_NRESV: Number of nice values. 186111857Sjeff * PRI_BASE: The start of the dynamic range. 187109864Sjeff */ 188111857Sjeff#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 189121869Sjeff#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 190121869Sjeff#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 191116642Sjeff#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 192113357Sjeff#define SCHED_PRI_INTERACT(score) \ 193116642Sjeff ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 194109864Sjeff 195109864Sjeff/* 196111857Sjeff * These determine the interactivity of a process. 197109864Sjeff * 198110645Sjeff * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 199110645Sjeff * before throttling back. 200121868Sjeff * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 201116365Sjeff * INTERACT_MAX: Maximum interactivity value. Smaller is better. 202111857Sjeff * INTERACT_THRESH: Threshhold for placement on the current runq. 203109864Sjeff */ 204121126Sjeff#define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 205121868Sjeff#define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 206116365Sjeff#define SCHED_INTERACT_MAX (100) 207116365Sjeff#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 208121126Sjeff#define SCHED_INTERACT_THRESH (30) 209111857Sjeff 210109864Sjeff/* 211109864Sjeff * These parameters and macros determine the size of the time slice that is 212109864Sjeff * granted to each thread. 213109864Sjeff * 214109864Sjeff * SLICE_MIN: Minimum time slice granted, in units of ticks. 
215109864Sjeff * SLICE_MAX: Maximum time slice granted. 216109864Sjeff * SLICE_RANGE: Range of available time slices scaled by hz. 217112966Sjeff * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 218112966Sjeff * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 219121871Sjeff * SLICE_NTHRESH: The nice cutoff point for slice assignment. 220109864Sjeff */ 221113357Sjeff#define SCHED_SLICE_MIN (slice_min) 222113357Sjeff#define SCHED_SLICE_MAX (slice_max) 223125299Sjeff#define SCHED_SLICE_INTERACTIVE (slice_max) 224121871Sjeff#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 225111857Sjeff#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 226109864Sjeff#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 227112966Sjeff#define SCHED_SLICE_NICE(nice) \ 228121871Sjeff (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 229109864Sjeff 230109864Sjeff/* 231134791Sjulian * This macro determines whether or not the thread belongs on the current or 232109864Sjeff * next run queue. 233109864Sjeff */ 234113357Sjeff#define SCHED_INTERACTIVE(kg) \ 235113357Sjeff (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 236113417Sjeff#define SCHED_CURR(kg, ke) \ 237139453Sjhb ((ke->ke_thread->td_flags & TDF_BORROWING) || SCHED_INTERACTIVE(kg)) 238109864Sjeff 239109864Sjeff/* 240109864Sjeff * Cpu percentage computation macros and defines. 241109864Sjeff * 242109864Sjeff * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 243109864Sjeff * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 244109864Sjeff */ 245109864Sjeff 246112971Sjeff#define SCHED_CPU_TIME 10 247109864Sjeff#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 248109864Sjeff 249109864Sjeff/* 250113357Sjeff * kseq - per processor runqs and statistics. 251109864Sjeff */ 252109864Sjeffstruct kseq { 253113357Sjeff struct runq ksq_idle; /* Queue of IDLE threads. */ 254113357Sjeff struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 255113357Sjeff struct runq *ksq_next; /* Next timeshare queue. */ 256113357Sjeff struct runq *ksq_curr; /* Current queue. */ 257121896Sjeff int ksq_load_timeshare; /* Load for timeshare. */ 258113357Sjeff int ksq_load; /* Aggregate load. */ 259121869Sjeff short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 260113357Sjeff short ksq_nicemin; /* Least nice. */ 261110267Sjeff#ifdef SMP 262123433Sjeff int ksq_transferable; 263123433Sjeff LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 264123433Sjeff struct kseq_group *ksq_group; /* Our processor group. */ 265123433Sjeff volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 266125289Sjeff#else 267125289Sjeff int ksq_sysload; /* For loadavg, !ITHD load. */ 268110267Sjeff#endif 269109864Sjeff}; 270109864Sjeff 271123433Sjeff#ifdef SMP 272109864Sjeff/* 273123433Sjeff * kseq groups are groups of processors which can cheaply share threads. When 274123433Sjeff * one processor in the group goes idle it will check the runqs of the other 275123433Sjeff * processors in its group prior to halting and waiting for an interrupt. 276123433Sjeff * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 277123433Sjeff * In a numa environment we'd want an idle bitmap per group and a two tiered 278123433Sjeff * load balancer. 279123433Sjeff */ 280123433Sjeffstruct kseq_group { 281123433Sjeff int ksg_cpus; /* Count of CPUs in this kseq group. */ 282127498Smarcel cpumask_t ksg_cpumask; /* Mask of cpus in this group. 
*/ 283127498Smarcel cpumask_t ksg_idlemask; /* Idle cpus in this group. */ 284127498Smarcel cpumask_t ksg_mask; /* Bit mask for first cpu. */ 285123487Sjeff int ksg_load; /* Total load of this group. */ 286123433Sjeff int ksg_transferable; /* Transferable load of this group. */ 287123433Sjeff LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 288123433Sjeff}; 289123433Sjeff#endif 290123433Sjeff 291123433Sjeff/* 292109864Sjeff * One kse queue per processor. 293109864Sjeff */ 294110028Sjeff#ifdef SMP 295127498Smarcelstatic cpumask_t kseq_idle; 296123487Sjeffstatic int ksg_maxid; 297121790Sjeffstatic struct kseq kseq_cpu[MAXCPU]; 298123433Sjeffstatic struct kseq_group kseq_groups[MAXCPU]; 299129982Sjeffstatic int bal_tick; 300129982Sjeffstatic int gbal_tick; 301139334Sjeffstatic int balance_groups; 302129982Sjeff 303123433Sjeff#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 304123433Sjeff#define KSEQ_CPU(x) (&kseq_cpu[(x)]) 305123487Sjeff#define KSEQ_ID(x) ((x) - kseq_cpu) 306123487Sjeff#define KSEQ_GROUP(x) (&kseq_groups[(x)]) 307123433Sjeff#else /* !SMP */ 308121790Sjeffstatic struct kseq kseq_cpu; 309129982Sjeff 310110028Sjeff#define KSEQ_SELF() (&kseq_cpu) 311110028Sjeff#define KSEQ_CPU(x) (&kseq_cpu) 312110028Sjeff#endif 313109864Sjeff 314134791Sjulianstatic void slot_fill(struct ksegrp *kg); 315134791Sjulianstatic struct kse *sched_choose(void); /* XXX Should be thread * */ 316112966Sjeffstatic void sched_slice(struct kse *ke); 317113357Sjeffstatic void sched_priority(struct ksegrp *kg); 318139453Sjhbstatic void sched_thread_priority(struct thread *td, u_char prio); 319111857Sjeffstatic int sched_interact_score(struct ksegrp *kg); 320116463Sjeffstatic void sched_interact_update(struct ksegrp *kg); 321121868Sjeffstatic void sched_interact_fork(struct ksegrp *kg); 322121790Sjeffstatic void sched_pctcpu_update(struct kse *ke); 323109864Sjeff 324110267Sjeff/* Operations on per processor queues */ 325121790Sjeffstatic struct kse * kseq_choose(struct kseq *kseq); 326110028Sjeffstatic void kseq_setup(struct kseq *kseq); 327122744Sjeffstatic void kseq_load_add(struct kseq *kseq, struct kse *ke); 328122744Sjeffstatic void kseq_load_rem(struct kseq *kseq, struct kse *ke); 329139334Sjeffstatic __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke, int); 330122744Sjeffstatic __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 331113357Sjeffstatic void kseq_nice_add(struct kseq *kseq, int nice); 332113357Sjeffstatic void kseq_nice_rem(struct kseq *kseq, int nice); 333113660Sjeffvoid kseq_print(int cpu); 334110267Sjeff#ifdef SMP 335123433Sjeffstatic int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 336121790Sjeffstatic struct kse *runq_steal(struct runq *rq); 337129982Sjeffstatic void sched_balance(void); 338129982Sjeffstatic void sched_balance_groups(void); 339123487Sjeffstatic void sched_balance_group(struct kseq_group *ksg); 340123487Sjeffstatic void sched_balance_pair(struct kseq *high, struct kseq *low); 341121790Sjeffstatic void kseq_move(struct kseq *from, int cpu); 342123433Sjeffstatic int kseq_idled(struct kseq *kseq); 343121790Sjeffstatic void kseq_notify(struct kse *ke, int cpu); 344121790Sjeffstatic void kseq_assign(struct kseq *); 345123433Sjeffstatic struct kse *kseq_steal(struct kseq *kseq, int stealidle); 346139334Sjeff#define KSE_CAN_MIGRATE(ke) \ 347135076Sscottl ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 348121790Sjeff#endif 349110028Sjeff 350113357Sjeffvoid 351113660Sjeffkseq_print(int cpu) 
352110267Sjeff{ 353113660Sjeff struct kseq *kseq; 354113357Sjeff int i; 355112994Sjeff 356113660Sjeff kseq = KSEQ_CPU(cpu); 357112994Sjeff 358113357Sjeff printf("kseq:\n"); 359113357Sjeff printf("\tload: %d\n", kseq->ksq_load); 360122744Sjeff printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 361121896Sjeff#ifdef SMP 362123433Sjeff printf("\tload transferable: %d\n", kseq->ksq_transferable); 363121896Sjeff#endif 364113357Sjeff printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 365113357Sjeff printf("\tnice counts:\n"); 366121869Sjeff for (i = 0; i < SCHED_PRI_NRESV; i++) 367113357Sjeff if (kseq->ksq_nice[i]) 368113357Sjeff printf("\t\t%d = %d\n", 369113357Sjeff i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 370113357Sjeff} 371112994Sjeff 372122744Sjeffstatic __inline void 373139334Sjeffkseq_runq_add(struct kseq *kseq, struct kse *ke, int flags) 374122744Sjeff{ 375122744Sjeff#ifdef SMP 376139334Sjeff if (KSE_CAN_MIGRATE(ke)) { 377123433Sjeff kseq->ksq_transferable++; 378123433Sjeff kseq->ksq_group->ksg_transferable++; 379133427Sjeff ke->ke_flags |= KEF_XFERABLE; 380123433Sjeff } 381122744Sjeff#endif 382139334Sjeff runq_add(ke->ke_runq, ke, flags); 383122744Sjeff} 384122744Sjeff 385122744Sjeffstatic __inline void 386122744Sjeffkseq_runq_rem(struct kseq *kseq, struct kse *ke) 387122744Sjeff{ 388122744Sjeff#ifdef SMP 389133427Sjeff if (ke->ke_flags & KEF_XFERABLE) { 390123433Sjeff kseq->ksq_transferable--; 391123433Sjeff kseq->ksq_group->ksg_transferable--; 392133427Sjeff ke->ke_flags &= ~KEF_XFERABLE; 393123433Sjeff } 394122744Sjeff#endif 395122744Sjeff runq_remove(ke->ke_runq, ke); 396122744Sjeff} 397122744Sjeff 398113357Sjeffstatic void 399122744Sjeffkseq_load_add(struct kseq *kseq, struct kse *ke) 400113357Sjeff{ 401121896Sjeff int class; 402115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 403121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 404121896Sjeff if (class == PRI_TIMESHARE) 405121896Sjeff kseq->ksq_load_timeshare++; 406113357Sjeff kseq->ksq_load++; 407139316Sjeff CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 408128563Sobrien if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 409123487Sjeff#ifdef SMP 410123487Sjeff kseq->ksq_group->ksg_load++; 411125289Sjeff#else 412125289Sjeff kseq->ksq_sysload++; 413123487Sjeff#endif 414113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 415130551Sjulian kseq_nice_add(kseq, ke->ke_proc->p_nice); 416110267Sjeff} 417113357Sjeff 418112994Sjeffstatic void 419122744Sjeffkseq_load_rem(struct kseq *kseq, struct kse *ke) 420110267Sjeff{ 421121896Sjeff int class; 422115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 423121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 424121896Sjeff if (class == PRI_TIMESHARE) 425121896Sjeff kseq->ksq_load_timeshare--; 426128563Sobrien if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 427123487Sjeff#ifdef SMP 428123487Sjeff kseq->ksq_group->ksg_load--; 429125289Sjeff#else 430125289Sjeff kseq->ksq_sysload--; 431123487Sjeff#endif 432113357Sjeff kseq->ksq_load--; 433139316Sjeff CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 434113357Sjeff ke->ke_runq = NULL; 435113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 436130551Sjulian kseq_nice_rem(kseq, ke->ke_proc->p_nice); 437110267Sjeff} 438110267Sjeff 439113357Sjeffstatic void 440113357Sjeffkseq_nice_add(struct kseq *kseq, int nice) 441110267Sjeff{ 442115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 443113357Sjeff /* Normalize to zero. 
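 * Nice values span -20..20 (PRIO_MIN..PRIO_MAX), so adding SCHED_PRI_NHALF
 * (20) maps them onto indices 0..40 of the ksq_nice[] bucket array.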
*/ 444113357Sjeff kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 445121896Sjeff if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 446113357Sjeff kseq->ksq_nicemin = nice; 447110267Sjeff} 448110267Sjeff 449113357Sjeffstatic void 450113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice) 451110267Sjeff{ 452113357Sjeff int n; 453113357Sjeff 454115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 455113357Sjeff /* Normalize to zero. */ 456113357Sjeff n = nice + SCHED_PRI_NHALF; 457113357Sjeff kseq->ksq_nice[n]--; 458113357Sjeff KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 459113357Sjeff 460113357Sjeff /* 461113357Sjeff * If this wasn't the smallest nice value or there are more in 462113357Sjeff * this bucket we can just return. Otherwise we have to recalculate 463113357Sjeff * the smallest nice. 464113357Sjeff */ 465113357Sjeff if (nice != kseq->ksq_nicemin || 466113357Sjeff kseq->ksq_nice[n] != 0 || 467121896Sjeff kseq->ksq_load_timeshare == 0) 468113357Sjeff return; 469113357Sjeff 470121869Sjeff for (; n < SCHED_PRI_NRESV; n++) 471113357Sjeff if (kseq->ksq_nice[n]) { 472113357Sjeff kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 473113357Sjeff return; 474113357Sjeff } 475110267Sjeff} 476110267Sjeff 477113357Sjeff#ifdef SMP 478116069Sjeff/* 479122744Sjeff * sched_balance is a simple CPU load balancing algorithm. It operates by 480116069Sjeff * finding the least loaded and most loaded cpu and equalizing their load 481116069Sjeff * by migrating some processes. 482116069Sjeff * 483116069Sjeff * Dealing only with two CPUs at a time has two advantages. Firstly, most 484116069Sjeff * installations will only have 2 cpus. Secondly, load balancing too much at 485116069Sjeff * once can have an unpleasant effect on the system. The scheduler rarely has 486116069Sjeff * enough information to make perfect decisions. So this algorithm chooses 487116069Sjeff * algorithm simplicity and more gradual effects on load in larger systems. 488116069Sjeff * 489116069Sjeff * It could be improved by considering the priorities and slices assigned to 490116069Sjeff * each task prior to balancing them. There are many pathological cases with 491116069Sjeff * any approach and so the semi random algorithm below may work as well as any. 492116069Sjeff * 493116069Sjeff */ 494121790Sjeffstatic void 495129982Sjeffsched_balance(void) 496116069Sjeff{ 497123487Sjeff struct kseq_group *high; 498123487Sjeff struct kseq_group *low; 499123487Sjeff struct kseq_group *ksg; 500123487Sjeff int cnt; 501123487Sjeff int i; 502123487Sjeff 503139334Sjeff bal_tick = ticks + (random() % (hz * 2)); 504123487Sjeff if (smp_started == 0) 505139334Sjeff return; 506123487Sjeff low = high = NULL; 507123487Sjeff i = random() % (ksg_maxid + 1); 508123487Sjeff for (cnt = 0; cnt <= ksg_maxid; cnt++) { 509123487Sjeff ksg = KSEQ_GROUP(i); 510123487Sjeff /* 511123487Sjeff * Find the CPU with the highest load that has some 512123487Sjeff * threads to transfer. 
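 * The least loaded group is tracked unconditionally; when distinct
 * most- and least-loaded groups are found, a single call to
 * sched_balance_pair() below moves threads between their first members.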
513123487Sjeff */ 514123487Sjeff if ((high == NULL || ksg->ksg_load > high->ksg_load) 515123487Sjeff && ksg->ksg_transferable) 516123487Sjeff high = ksg; 517123487Sjeff if (low == NULL || ksg->ksg_load < low->ksg_load) 518123487Sjeff low = ksg; 519123487Sjeff if (++i > ksg_maxid) 520123487Sjeff i = 0; 521123487Sjeff } 522123487Sjeff if (low != NULL && high != NULL && high != low) 523123487Sjeff sched_balance_pair(LIST_FIRST(&high->ksg_members), 524123487Sjeff LIST_FIRST(&low->ksg_members)); 525123487Sjeff} 526123487Sjeff 527123487Sjeffstatic void 528129982Sjeffsched_balance_groups(void) 529123487Sjeff{ 530123487Sjeff int i; 531123487Sjeff 532139334Sjeff gbal_tick = ticks + (random() % (hz * 2)); 533129982Sjeff mtx_assert(&sched_lock, MA_OWNED); 534123487Sjeff if (smp_started) 535123487Sjeff for (i = 0; i <= ksg_maxid; i++) 536123487Sjeff sched_balance_group(KSEQ_GROUP(i)); 537123487Sjeff} 538123487Sjeff 539123487Sjeffstatic void 540123487Sjeffsched_balance_group(struct kseq_group *ksg) 541123487Sjeff{ 542116069Sjeff struct kseq *kseq; 543123487Sjeff struct kseq *high; 544123487Sjeff struct kseq *low; 545123487Sjeff int load; 546123487Sjeff 547123487Sjeff if (ksg->ksg_transferable == 0) 548123487Sjeff return; 549123487Sjeff low = NULL; 550123487Sjeff high = NULL; 551123487Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 552123487Sjeff load = kseq->ksq_load; 553123487Sjeff if (high == NULL || load > high->ksq_load) 554123487Sjeff high = kseq; 555123487Sjeff if (low == NULL || load < low->ksq_load) 556123487Sjeff low = kseq; 557123487Sjeff } 558123487Sjeff if (high != NULL && low != NULL && high != low) 559123487Sjeff sched_balance_pair(high, low); 560123487Sjeff} 561123487Sjeff 562123487Sjeffstatic void 563123487Sjeffsched_balance_pair(struct kseq *high, struct kseq *low) 564123487Sjeff{ 565123433Sjeff int transferable; 566116069Sjeff int high_load; 567116069Sjeff int low_load; 568116069Sjeff int move; 569116069Sjeff int diff; 570116069Sjeff int i; 571116069Sjeff 572116069Sjeff /* 573123433Sjeff * If we're transfering within a group we have to use this specific 574123433Sjeff * kseq's transferable count, otherwise we can steal from other members 575123433Sjeff * of the group. 576123433Sjeff */ 577123487Sjeff if (high->ksq_group == low->ksq_group) { 578123487Sjeff transferable = high->ksq_transferable; 579123487Sjeff high_load = high->ksq_load; 580123487Sjeff low_load = low->ksq_load; 581123487Sjeff } else { 582123487Sjeff transferable = high->ksq_group->ksg_transferable; 583123487Sjeff high_load = high->ksq_group->ksg_load; 584123487Sjeff low_load = low->ksq_group->ksg_load; 585123487Sjeff } 586123433Sjeff if (transferable == 0) 587123487Sjeff return; 588123433Sjeff /* 589122744Sjeff * Determine what the imbalance is and then adjust that to how many 590123433Sjeff * kses we actually have to give up (transferable). 
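 * For example, loads of 7 and 2 give a diff of 5, so move rounds up to 3
 * and is then clamped to the transferable count.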
591122744Sjeff */ 592123487Sjeff diff = high_load - low_load; 593116069Sjeff move = diff / 2; 594116069Sjeff if (diff & 0x1) 595116069Sjeff move++; 596123433Sjeff move = min(move, transferable); 597116069Sjeff for (i = 0; i < move; i++) 598123487Sjeff kseq_move(high, KSEQ_ID(low)); 599116069Sjeff return; 600116069Sjeff} 601116069Sjeff 602121790Sjeffstatic void 603116069Sjeffkseq_move(struct kseq *from, int cpu) 604116069Sjeff{ 605123433Sjeff struct kseq *kseq; 606123433Sjeff struct kseq *to; 607116069Sjeff struct kse *ke; 608116069Sjeff 609123433Sjeff kseq = from; 610123433Sjeff to = KSEQ_CPU(cpu); 611123433Sjeff ke = kseq_steal(kseq, 1); 612123433Sjeff if (ke == NULL) { 613123433Sjeff struct kseq_group *ksg; 614123433Sjeff 615123433Sjeff ksg = kseq->ksq_group; 616123433Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 617123433Sjeff if (kseq == from || kseq->ksq_transferable == 0) 618123433Sjeff continue; 619123433Sjeff ke = kseq_steal(kseq, 1); 620123433Sjeff break; 621123433Sjeff } 622123433Sjeff if (ke == NULL) 623123433Sjeff panic("kseq_move: No KSEs available with a " 624123433Sjeff "transferable count of %d\n", 625123433Sjeff ksg->ksg_transferable); 626123433Sjeff } 627123433Sjeff if (kseq == to) 628123433Sjeff return; 629116069Sjeff ke->ke_state = KES_THREAD; 630123433Sjeff kseq_runq_rem(kseq, ke); 631123433Sjeff kseq_load_rem(kseq, ke); 632121923Sjeff kseq_notify(ke, cpu); 633116069Sjeff} 634110267Sjeff 635123433Sjeffstatic int 636123433Sjeffkseq_idled(struct kseq *kseq) 637121790Sjeff{ 638123433Sjeff struct kseq_group *ksg; 639123433Sjeff struct kseq *steal; 640123433Sjeff struct kse *ke; 641123433Sjeff 642123433Sjeff ksg = kseq->ksq_group; 643123433Sjeff /* 644123433Sjeff * If we're in a cpu group, try and steal kses from another cpu in 645123433Sjeff * the group before idling. 646123433Sjeff */ 647123433Sjeff if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 648123433Sjeff LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 649123433Sjeff if (steal == kseq || steal->ksq_transferable == 0) 650123433Sjeff continue; 651123433Sjeff ke = kseq_steal(steal, 0); 652123433Sjeff if (ke == NULL) 653123433Sjeff continue; 654123433Sjeff ke->ke_state = KES_THREAD; 655123433Sjeff kseq_runq_rem(steal, ke); 656123433Sjeff kseq_load_rem(steal, ke); 657123433Sjeff ke->ke_cpu = PCPU_GET(cpuid); 658139334Sjeff ke->ke_flags |= KEF_INTERNAL | KEF_HOLD; 659139334Sjeff sched_add(ke->ke_thread, SRQ_YIELDING); 660123433Sjeff return (0); 661123433Sjeff } 662123433Sjeff } 663123433Sjeff /* 664123433Sjeff * We only set the idled bit when all of the cpus in the group are 665123433Sjeff * idle. Otherwise we could get into a situation where a KSE bounces 666123433Sjeff * back and forth between two idle cores on seperate physical CPUs. 
667123433Sjeff */ 668123433Sjeff ksg->ksg_idlemask |= PCPU_GET(cpumask); 669123433Sjeff if (ksg->ksg_idlemask != ksg->ksg_cpumask) 670123433Sjeff return (1); 671123433Sjeff atomic_set_int(&kseq_idle, ksg->ksg_mask); 672123433Sjeff return (1); 673121790Sjeff} 674121790Sjeff 675121790Sjeffstatic void 676121790Sjeffkseq_assign(struct kseq *kseq) 677121790Sjeff{ 678121790Sjeff struct kse *nke; 679121790Sjeff struct kse *ke; 680121790Sjeff 681121790Sjeff do { 682132776Skan *(volatile struct kse **)&ke = kseq->ksq_assigned; 683121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 684121790Sjeff for (; ke != NULL; ke = nke) { 685121790Sjeff nke = ke->ke_assign; 686139334Sjeff kseq->ksq_group->ksg_load--; 687139334Sjeff kseq->ksq_load--; 688121790Sjeff ke->ke_flags &= ~KEF_ASSIGNED; 689139334Sjeff ke->ke_flags |= KEF_INTERNAL | KEF_HOLD; 690139334Sjeff sched_add(ke->ke_thread, SRQ_YIELDING); 691121790Sjeff } 692121790Sjeff} 693121790Sjeff 694121790Sjeffstatic void 695121790Sjeffkseq_notify(struct kse *ke, int cpu) 696121790Sjeff{ 697121790Sjeff struct kseq *kseq; 698121790Sjeff struct thread *td; 699121790Sjeff struct pcpu *pcpu; 700139334Sjeff int class; 701133427Sjeff int prio; 702121790Sjeff 703139334Sjeff kseq = KSEQ_CPU(cpu); 704139334Sjeff /* XXX */ 705139334Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 706139334Sjeff if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 707139334Sjeff (kseq_idle & kseq->ksq_group->ksg_mask)) 708139334Sjeff atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 709139334Sjeff kseq->ksq_group->ksg_load++; 710139334Sjeff kseq->ksq_load++; 711123529Sjeff ke->ke_cpu = cpu; 712121790Sjeff ke->ke_flags |= KEF_ASSIGNED; 713133427Sjeff prio = ke->ke_thread->td_priority; 714121790Sjeff 715121790Sjeff /* 716121790Sjeff * Place a KSE on another cpu's queue and force a resched. 717121790Sjeff */ 718121790Sjeff do { 719132776Skan *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned; 720121790Sjeff } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 721133427Sjeff /* 722133427Sjeff * Without sched_lock we could lose a race where we set NEEDRESCHED 723133427Sjeff * on a thread that is switched out before the IPI is delivered. This 724133427Sjeff * would lead us to miss the resched. This will be a problem once 725133427Sjeff * sched_lock is pushed down. 
726133427Sjeff */ 727121790Sjeff pcpu = pcpu_find(cpu); 728121790Sjeff td = pcpu->pc_curthread; 729121790Sjeff if (ke->ke_thread->td_priority < td->td_priority || 730121790Sjeff td == pcpu->pc_idlethread) { 731121790Sjeff td->td_flags |= TDF_NEEDRESCHED; 732121790Sjeff ipi_selected(1 << cpu, IPI_AST); 733121790Sjeff } 734121790Sjeff} 735121790Sjeff 736121790Sjeffstatic struct kse * 737121790Sjeffrunq_steal(struct runq *rq) 738121790Sjeff{ 739121790Sjeff struct rqhead *rqh; 740121790Sjeff struct rqbits *rqb; 741121790Sjeff struct kse *ke; 742121790Sjeff int word; 743121790Sjeff int bit; 744121790Sjeff 745121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 746121790Sjeff rqb = &rq->rq_status; 747121790Sjeff for (word = 0; word < RQB_LEN; word++) { 748121790Sjeff if (rqb->rqb_bits[word] == 0) 749121790Sjeff continue; 750121790Sjeff for (bit = 0; bit < RQB_BPW; bit++) { 751123231Speter if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 752121790Sjeff continue; 753121790Sjeff rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 754121790Sjeff TAILQ_FOREACH(ke, rqh, ke_procq) { 755139334Sjeff if (KSE_CAN_MIGRATE(ke)) 756121790Sjeff return (ke); 757121790Sjeff } 758121790Sjeff } 759121790Sjeff } 760121790Sjeff return (NULL); 761121790Sjeff} 762121790Sjeff 763121790Sjeffstatic struct kse * 764123433Sjeffkseq_steal(struct kseq *kseq, int stealidle) 765121790Sjeff{ 766121790Sjeff struct kse *ke; 767121790Sjeff 768123433Sjeff /* 769123433Sjeff * Steal from next first to try to get a non-interactive task that 770123433Sjeff * may not have run for a while. 771123433Sjeff */ 772123433Sjeff if ((ke = runq_steal(kseq->ksq_next)) != NULL) 773123433Sjeff return (ke); 774121790Sjeff if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 775121790Sjeff return (ke); 776123433Sjeff if (stealidle) 777123433Sjeff return (runq_steal(&kseq->ksq_idle)); 778123433Sjeff return (NULL); 779121790Sjeff} 780123433Sjeff 781123433Sjeffint 782123433Sjeffkseq_transfer(struct kseq *kseq, struct kse *ke, int class) 783123433Sjeff{ 784139334Sjeff struct kseq_group *nksg; 785123433Sjeff struct kseq_group *ksg; 786139334Sjeff struct kseq *old; 787123433Sjeff int cpu; 788139334Sjeff int idx; 789123433Sjeff 790123685Sjeff if (smp_started == 0) 791123685Sjeff return (0); 792123433Sjeff cpu = 0; 793123433Sjeff /* 794133427Sjeff * If our load exceeds a certain threshold we should attempt to 795133427Sjeff * reassign this thread. The first candidate is the cpu that 796133427Sjeff * originally ran the thread. If it is idle, assign it there, 797133427Sjeff * otherwise, pick an idle cpu. 798133427Sjeff * 799133427Sjeff * The threshold at which we start to reassign kses has a large impact 800123685Sjeff * on the overall performance of the system. Tuned too high and 801123685Sjeff * some CPUs may idle. Too low and there will be excess migration 802128055Scognet * and context switches. 803123685Sjeff */ 804139334Sjeff old = KSEQ_CPU(ke->ke_cpu); 805139334Sjeff nksg = old->ksq_group; 806133427Sjeff ksg = kseq->ksq_group; 807139334Sjeff if (kseq_idle) { 808139334Sjeff if (kseq_idle & nksg->ksg_mask) { 809139334Sjeff cpu = ffs(nksg->ksg_idlemask); 810139334Sjeff if (cpu) { 811139334Sjeff CTR2(KTR_SCHED, 812139334Sjeff "kseq_transfer: %p found old cpu %X " 813139334Sjeff "in idlemask.", ke, cpu); 814133427Sjeff goto migrate; 815139334Sjeff } 816133427Sjeff } 817123433Sjeff /* 818123433Sjeff * Multiple cpus could find this bit simultaneously 819123433Sjeff * but the race shouldn't be terrible. 
820123433Sjeff */ 821123433Sjeff cpu = ffs(kseq_idle); 822139334Sjeff if (cpu) { 823139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p found %X " 824139334Sjeff "in idlemask.", ke, cpu); 825133427Sjeff goto migrate; 826139334Sjeff } 827123433Sjeff } 828139334Sjeff idx = 0; 829139334Sjeff#if 0 830139334Sjeff if (old->ksq_load < kseq->ksq_load) { 831139334Sjeff cpu = ke->ke_cpu + 1; 832139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X " 833139334Sjeff "load less than ours.", ke, cpu); 834139334Sjeff goto migrate; 835139334Sjeff } 836123433Sjeff /* 837139334Sjeff * No new CPU was found, look for one with less load. 838139334Sjeff */ 839139334Sjeff for (idx = 0; idx <= ksg_maxid; idx++) { 840139334Sjeff nksg = KSEQ_GROUP(idx); 841139334Sjeff if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) { 842139334Sjeff cpu = ffs(nksg->ksg_cpumask); 843139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less " 844139334Sjeff "than ours.", ke, cpu); 845139334Sjeff goto migrate; 846139334Sjeff } 847139334Sjeff } 848139334Sjeff#endif 849139334Sjeff /* 850123433Sjeff * If another cpu in this group has idled, assign a thread over 851123433Sjeff * to them after checking to see if there are idled groups. 852123433Sjeff */ 853133427Sjeff if (ksg->ksg_idlemask) { 854123433Sjeff cpu = ffs(ksg->ksg_idlemask); 855139334Sjeff if (cpu) { 856139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in " 857139334Sjeff "group.", ke, cpu); 858133427Sjeff goto migrate; 859139334Sjeff } 860123433Sjeff } 861133427Sjeff return (0); 862133427Sjeffmigrate: 863133427Sjeff /* 864123433Sjeff * Now that we've found an idle CPU, migrate the thread. 865123433Sjeff */ 866133427Sjeff cpu--; 867133427Sjeff ke->ke_runq = NULL; 868133427Sjeff kseq_notify(ke, cpu); 869133427Sjeff 870133427Sjeff return (1); 871123433Sjeff} 872123433Sjeff 873121790Sjeff#endif /* SMP */ 874121790Sjeff 875117326Sjeff/* 876121790Sjeff * Pick the highest priority task we have and return it. 877117326Sjeff */ 878117326Sjeff 879121790Sjeffstatic struct kse * 880121790Sjeffkseq_choose(struct kseq *kseq) 881110267Sjeff{ 882137067Sjeff struct runq *swap; 883110267Sjeff struct kse *ke; 884137067Sjeff int nice; 885110267Sjeff 886115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 887113357Sjeff swap = NULL; 888112994Sjeff 889113357Sjeff for (;;) { 890113357Sjeff ke = runq_choose(kseq->ksq_curr); 891113357Sjeff if (ke == NULL) { 892113357Sjeff /* 893131473Sjhb * We already swapped once and didn't get anywhere. 894113357Sjeff */ 895113357Sjeff if (swap) 896113357Sjeff break; 897113357Sjeff swap = kseq->ksq_curr; 898113357Sjeff kseq->ksq_curr = kseq->ksq_next; 899113357Sjeff kseq->ksq_next = swap; 900113357Sjeff continue; 901113357Sjeff } 902113357Sjeff /* 903113357Sjeff * If we encounter a slice of 0 the kse is in a 904113357Sjeff * TIMESHARE kse group and its nice was too far out 905113357Sjeff * of the range that receives slices. 
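 * For example, if the least nice task on this kseq is at -20, a nice +5
 * task sits 25 nice steps away, past SCHED_SLICE_NTHRESH (19), so it is
 * resliced and requeued on ksq_next instead of being chosen here.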
906113357Sjeff */ 907137067Sjeff nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin); 908138842Sjeff if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH && 909138842Sjeff ke->ke_proc->p_nice != 0)) { 910113357Sjeff runq_remove(ke->ke_runq, ke); 911113357Sjeff sched_slice(ke); 912113357Sjeff ke->ke_runq = kseq->ksq_next; 913136170Sjulian runq_add(ke->ke_runq, ke, 0); 914113357Sjeff continue; 915113357Sjeff } 916113357Sjeff return (ke); 917110267Sjeff } 918110267Sjeff 919113357Sjeff return (runq_choose(&kseq->ksq_idle)); 920110267Sjeff} 921110267Sjeff 922109864Sjeffstatic void 923110028Sjeffkseq_setup(struct kseq *kseq) 924110028Sjeff{ 925113357Sjeff runq_init(&kseq->ksq_timeshare[0]); 926113357Sjeff runq_init(&kseq->ksq_timeshare[1]); 927112994Sjeff runq_init(&kseq->ksq_idle); 928113357Sjeff kseq->ksq_curr = &kseq->ksq_timeshare[0]; 929113357Sjeff kseq->ksq_next = &kseq->ksq_timeshare[1]; 930113660Sjeff kseq->ksq_load = 0; 931121896Sjeff kseq->ksq_load_timeshare = 0; 932110028Sjeff} 933110028Sjeff 934110028Sjeffstatic void 935109864Sjeffsched_setup(void *dummy) 936109864Sjeff{ 937117313Sjeff#ifdef SMP 938109864Sjeff int i; 939117313Sjeff#endif 940109864Sjeff 941116946Sjeff slice_min = (hz/100); /* 10ms */ 942116946Sjeff slice_max = (hz/7); /* ~140ms */ 943111857Sjeff 944117237Sjeff#ifdef SMP 945123487Sjeff balance_groups = 0; 946123433Sjeff /* 947123433Sjeff * Initialize the kseqs. 948123433Sjeff */ 949123433Sjeff for (i = 0; i < MAXCPU; i++) { 950123433Sjeff struct kseq *ksq; 951123433Sjeff 952123433Sjeff ksq = &kseq_cpu[i]; 953123433Sjeff ksq->ksq_assigned = NULL; 954123433Sjeff kseq_setup(&kseq_cpu[i]); 955123433Sjeff } 956117237Sjeff if (smp_topology == NULL) { 957123433Sjeff struct kseq_group *ksg; 958123433Sjeff struct kseq *ksq; 959139334Sjeff int cpus; 960123433Sjeff 961139334Sjeff for (cpus = 0, i = 0; i < MAXCPU; i++) { 962139334Sjeff if (CPU_ABSENT(i)) 963139334Sjeff continue; 964139334Sjeff ksq = &kseq_cpu[cpus]; 965139334Sjeff ksg = &kseq_groups[cpus]; 966123433Sjeff /* 967129982Sjeff * Setup a kseq group with one member. 968123433Sjeff */ 969123433Sjeff ksq->ksq_transferable = 0; 970123433Sjeff ksq->ksq_group = ksg; 971123433Sjeff ksg->ksg_cpus = 1; 972123433Sjeff ksg->ksg_idlemask = 0; 973123433Sjeff ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 974123487Sjeff ksg->ksg_load = 0; 975123433Sjeff ksg->ksg_transferable = 0; 976123433Sjeff LIST_INIT(&ksg->ksg_members); 977123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 978139334Sjeff cpus++; 979117237Sjeff } 980139334Sjeff ksg_maxid = cpus - 1; 981117237Sjeff } else { 982123433Sjeff struct kseq_group *ksg; 983123433Sjeff struct cpu_group *cg; 984117237Sjeff int j; 985113357Sjeff 986117237Sjeff for (i = 0; i < smp_topology->ct_count; i++) { 987117237Sjeff cg = &smp_topology->ct_group[i]; 988123433Sjeff ksg = &kseq_groups[i]; 989123433Sjeff /* 990123433Sjeff * Initialize the group. 991123433Sjeff */ 992123433Sjeff ksg->ksg_idlemask = 0; 993123487Sjeff ksg->ksg_load = 0; 994123433Sjeff ksg->ksg_transferable = 0; 995123433Sjeff ksg->ksg_cpus = cg->cg_count; 996123433Sjeff ksg->ksg_cpumask = cg->cg_mask; 997123433Sjeff LIST_INIT(&ksg->ksg_members); 998123433Sjeff /* 999123433Sjeff * Find all of the group members and add them. 
1000123433Sjeff */ 1001123433Sjeff for (j = 0; j < MAXCPU; j++) { 1002123433Sjeff if ((cg->cg_mask & (1 << j)) != 0) { 1003123433Sjeff if (ksg->ksg_mask == 0) 1004123433Sjeff ksg->ksg_mask = 1 << j; 1005123433Sjeff kseq_cpu[j].ksq_transferable = 0; 1006123433Sjeff kseq_cpu[j].ksq_group = ksg; 1007123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, 1008123433Sjeff &kseq_cpu[j], ksq_siblings); 1009123433Sjeff } 1010123433Sjeff } 1011123487Sjeff if (ksg->ksg_cpus > 1) 1012123487Sjeff balance_groups = 1; 1013117237Sjeff } 1014123487Sjeff ksg_maxid = smp_topology->ct_count - 1; 1015117237Sjeff } 1016123487Sjeff /* 1017123487Sjeff * Stagger the group and global load balancer so they do not 1018123487Sjeff * interfere with each other. 1019123487Sjeff */ 1020129982Sjeff bal_tick = ticks + hz; 1021123487Sjeff if (balance_groups) 1022129982Sjeff gbal_tick = ticks + (hz / 2); 1023117237Sjeff#else 1024117237Sjeff kseq_setup(KSEQ_SELF()); 1025116069Sjeff#endif 1026117237Sjeff mtx_lock_spin(&sched_lock); 1027122744Sjeff kseq_load_add(KSEQ_SELF(), &kse0); 1028117237Sjeff mtx_unlock_spin(&sched_lock); 1029109864Sjeff} 1030109864Sjeff 1031109864Sjeff/* 1032109864Sjeff * Scale the scheduling priority according to the "interactivity" of this 1033109864Sjeff * process. 1034109864Sjeff */ 1035113357Sjeffstatic void 1036109864Sjeffsched_priority(struct ksegrp *kg) 1037109864Sjeff{ 1038109864Sjeff int pri; 1039109864Sjeff 1040109864Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 1041113357Sjeff return; 1042109864Sjeff 1043113357Sjeff pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 1044111857Sjeff pri += SCHED_PRI_BASE; 1045130551Sjulian pri += kg->kg_proc->p_nice; 1046109864Sjeff 1047109864Sjeff if (pri > PRI_MAX_TIMESHARE) 1048109864Sjeff pri = PRI_MAX_TIMESHARE; 1049109864Sjeff else if (pri < PRI_MIN_TIMESHARE) 1050109864Sjeff pri = PRI_MIN_TIMESHARE; 1051109864Sjeff 1052109864Sjeff kg->kg_user_pri = pri; 1053109864Sjeff 1054113357Sjeff return; 1055109864Sjeff} 1056109864Sjeff 1057109864Sjeff/* 1058112966Sjeff * Calculate a time slice based on the properties of the kseg and the runq 1059112994Sjeff * that we're on. This is only for PRI_TIMESHARE ksegrps. 1060109864Sjeff */ 1061112966Sjeffstatic void 1062112966Sjeffsched_slice(struct kse *ke) 1063109864Sjeff{ 1064113357Sjeff struct kseq *kseq; 1065112966Sjeff struct ksegrp *kg; 1066109864Sjeff 1067112966Sjeff kg = ke->ke_ksegrp; 1068113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1069109864Sjeff 1070139453Sjhb if (ke->ke_thread->td_flags & TDF_BORROWING) { 1071138842Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1072138842Sjeff return; 1073138842Sjeff } 1074138842Sjeff 1075112966Sjeff /* 1076112966Sjeff * Rationale: 1077133427Sjeff * KSEs in interactive ksegs get a minimal slice so that we 1078112966Sjeff * quickly notice if it abuses its advantage. 1079112966Sjeff * 1080112966Sjeff * KSEs in non-interactive ksegs are assigned a slice that is 1081112966Sjeff * based on the ksegs nice value relative to the least nice kseg 1082112966Sjeff * on the run queue for this cpu. 1083112966Sjeff * 1084112966Sjeff * If the KSE is less nice than all others it gets the maximum 1085112966Sjeff * slice and other KSEs will adjust their slice relative to 1086112966Sjeff * this when they first expire. 1087112966Sjeff * 1088112966Sjeff * There is 20 point window that starts relative to the least 1089112966Sjeff * nice kse on the run queue. Slice size is determined by 1090112966Sjeff * the kse distance from the last nice ksegrp. 
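 * For example, with the least nice ksegrp at -5, distances 0 through 19
 * (nice -5 through +14) map onto slices from SCHED_SLICE_MAX down toward
 * SCHED_SLICE_MIN via SCHED_SLICE_NICE().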
1091112966Sjeff * 1092121871Sjeff * If the kse is outside of the window it will get no slice 1093121871Sjeff * and will be reevaluated each time it is selected on the 1094121871Sjeff * run queue. The exception to this is nice 0 ksegs when 1095121871Sjeff * a nice -20 is running. They are always granted a minimum 1096121871Sjeff * slice. 1097112966Sjeff */ 1098113357Sjeff if (!SCHED_INTERACTIVE(kg)) { 1099112966Sjeff int nice; 1100112966Sjeff 1101130551Sjulian nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin); 1102121896Sjeff if (kseq->ksq_load_timeshare == 0 || 1103130551Sjulian kg->kg_proc->p_nice < kseq->ksq_nicemin) 1104112966Sjeff ke->ke_slice = SCHED_SLICE_MAX; 1105121871Sjeff else if (nice <= SCHED_SLICE_NTHRESH) 1106112966Sjeff ke->ke_slice = SCHED_SLICE_NICE(nice); 1107130551Sjulian else if (kg->kg_proc->p_nice == 0) 1108121871Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1109112966Sjeff else 1110112966Sjeff ke->ke_slice = 0; 1111112966Sjeff } else 1112123684Sjeff ke->ke_slice = SCHED_SLICE_INTERACTIVE; 1113112966Sjeff 1114112966Sjeff return; 1115109864Sjeff} 1116109864Sjeff 1117121868Sjeff/* 1118121868Sjeff * This routine enforces a maximum limit on the amount of scheduling history 1119121868Sjeff * kept. It is called after either the slptime or runtime is adjusted. 1120121868Sjeff * This routine will not operate correctly when slp or run times have been 1121121868Sjeff * adjusted to more than double their maximum. 1122121868Sjeff */ 1123116463Sjeffstatic void 1124116463Sjeffsched_interact_update(struct ksegrp *kg) 1125116463Sjeff{ 1126121868Sjeff int sum; 1127121605Sjeff 1128121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1129121868Sjeff if (sum < SCHED_SLP_RUN_MAX) 1130121868Sjeff return; 1131121868Sjeff /* 1132121868Sjeff * If we have exceeded by more than 1/5th then the algorithm below 1133121868Sjeff * will not bring us back into range. Dividing by two here forces 1134133427Sjeff * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1135121868Sjeff */ 1136127850Sjeff if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1137121868Sjeff kg->kg_runtime /= 2; 1138121868Sjeff kg->kg_slptime /= 2; 1139121868Sjeff return; 1140116463Sjeff } 1141121868Sjeff kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1142121868Sjeff kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1143116463Sjeff} 1144116463Sjeff 1145121868Sjeffstatic void 1146121868Sjeffsched_interact_fork(struct ksegrp *kg) 1147121868Sjeff{ 1148121868Sjeff int ratio; 1149121868Sjeff int sum; 1150121868Sjeff 1151121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1152121868Sjeff if (sum > SCHED_SLP_RUN_FORK) { 1153121868Sjeff ratio = sum / SCHED_SLP_RUN_FORK; 1154121868Sjeff kg->kg_runtime /= ratio; 1155121868Sjeff kg->kg_slptime /= ratio; 1156121868Sjeff } 1157121868Sjeff} 1158121868Sjeff 1159111857Sjeffstatic int 1160111857Sjeffsched_interact_score(struct ksegrp *kg) 1161111857Sjeff{ 1162116365Sjeff int div; 1163111857Sjeff 1164111857Sjeff if (kg->kg_runtime > kg->kg_slptime) { 1165116365Sjeff div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1166116365Sjeff return (SCHED_INTERACT_HALF + 1167116365Sjeff (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1168116365Sjeff } if (kg->kg_slptime > kg->kg_runtime) { 1169116365Sjeff div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1170116365Sjeff return (kg->kg_runtime / div); 1171111857Sjeff } 1172111857Sjeff 1173116365Sjeff /* 1174116365Sjeff * This can happen if slptime and runtime are 0. 
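 * (Roughly, the score works out to 50 * runtime / slptime when sleep
 * dominates and 100 - 50 * slptime / runtime when running dominates, so a
 * thread that sleeps twice as long as it runs scores about 25, well under
 * SCHED_INTERACT_THRESH.)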
1175116365Sjeff */ 1176116365Sjeff return (0); 1177111857Sjeff 1178111857Sjeff} 1179111857Sjeff 1180113357Sjeff/* 1181134791Sjulian * Very early in the boot some setup of scheduler-specific 1182134791Sjulian * parts of proc0 and of soem scheduler resources needs to be done. 1183134791Sjulian * Called from: 1184134791Sjulian * proc0_init() 1185134791Sjulian */ 1186134791Sjulianvoid 1187134791Sjulianschedinit(void) 1188134791Sjulian{ 1189134791Sjulian /* 1190134791Sjulian * Set up the scheduler specific parts of proc0. 1191134791Sjulian */ 1192136167Sjulian proc0.p_sched = NULL; /* XXX */ 1193134791Sjulian ksegrp0.kg_sched = &kg_sched0; 1194136167Sjulian thread0.td_sched = &kse0; 1195134791Sjulian kse0.ke_thread = &thread0; 1196134791Sjulian kse0.ke_state = KES_THREAD; 1197134791Sjulian kg_sched0.skg_concurrency = 1; 1198134791Sjulian kg_sched0.skg_avail_opennings = 0; /* we are already running */ 1199134791Sjulian} 1200134791Sjulian 1201134791Sjulian/* 1202113357Sjeff * This is only somewhat accurate since given many processes of the same 1203113357Sjeff * priority they will switch when their slices run out, which will be 1204113357Sjeff * at most SCHED_SLICE_MAX. 1205113357Sjeff */ 1206109864Sjeffint 1207109864Sjeffsched_rr_interval(void) 1208109864Sjeff{ 1209109864Sjeff return (SCHED_SLICE_MAX); 1210109864Sjeff} 1211109864Sjeff 1212121790Sjeffstatic void 1213109864Sjeffsched_pctcpu_update(struct kse *ke) 1214109864Sjeff{ 1215109864Sjeff /* 1216109864Sjeff * Adjust counters and watermark for pctcpu calc. 1217116365Sjeff */ 1218120272Sjeff if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1219120272Sjeff /* 1220120272Sjeff * Shift the tick count out so that the divide doesn't 1221120272Sjeff * round away our results. 1222120272Sjeff */ 1223120272Sjeff ke->ke_ticks <<= 10; 1224120272Sjeff ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1225120272Sjeff SCHED_CPU_TICKS; 1226120272Sjeff ke->ke_ticks >>= 10; 1227120272Sjeff } else 1228120272Sjeff ke->ke_ticks = 0; 1229109864Sjeff ke->ke_ltick = ticks; 1230109864Sjeff ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1231109864Sjeff} 1232109864Sjeff 1233109864Sjeffvoid 1234139453Sjhbsched_thread_priority(struct thread *td, u_char prio) 1235109864Sjeff{ 1236121605Sjeff struct kse *ke; 1237109864Sjeff 1238139316Sjeff CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", 1239139316Sjeff td, td->td_proc->p_comm, td->td_priority, prio, curthread, 1240139316Sjeff curthread->td_proc->p_comm); 1241121605Sjeff ke = td->td_kse; 1242109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1243139453Sjhb if (td->td_priority == prio) 1244139453Sjhb return; 1245109864Sjeff if (TD_ON_RUNQ(td)) { 1246121605Sjeff /* 1247121605Sjeff * If the priority has been elevated due to priority 1248121605Sjeff * propagation, we may have to move ourselves to a new 1249121605Sjeff * queue. We still call adjustrunqueue below in case kse 1250121605Sjeff * needs to fix things up. 1251121605Sjeff */ 1252138842Sjeff if (prio < td->td_priority && ke->ke_runq != NULL && 1253121872Sjeff (ke->ke_flags & KEF_ASSIGNED) == 0 && 1254121790Sjeff ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { 1255121605Sjeff runq_remove(ke->ke_runq, ke); 1256121605Sjeff ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; 1257136170Sjulian runq_add(ke->ke_runq, ke, 0); 1258121605Sjeff } 1259133555Sjeff /* 1260133555Sjeff * Hold this kse on this cpu so that sched_prio() doesn't 1261133555Sjeff * cause excessive migration. We only want migration to 1262133555Sjeff * happen as the result of a wakeup. 
1263133555Sjeff */ 1264133555Sjeff ke->ke_flags |= KEF_HOLD; 1265119488Sdavidxu adjustrunqueue(td, prio); 1266139334Sjeff ke->ke_flags &= ~KEF_HOLD; 1267121605Sjeff } else 1268119488Sdavidxu td->td_priority = prio; 1269109864Sjeff} 1270109864Sjeff 1271139453Sjhb/* 1272139453Sjhb * Update a thread's priority when it is lent another thread's 1273139453Sjhb * priority. 1274139453Sjhb */ 1275109864Sjeffvoid 1276139453Sjhbsched_lend_prio(struct thread *td, u_char prio) 1277139453Sjhb{ 1278139453Sjhb 1279139453Sjhb td->td_flags |= TDF_BORROWING; 1280139453Sjhb sched_thread_priority(td, prio); 1281139453Sjhb} 1282139453Sjhb 1283139453Sjhb/* 1284139453Sjhb * Restore a thread's priority when priority propagation is 1285139453Sjhb * over. The prio argument is the minimum priority the thread 1286139453Sjhb * needs to have to satisfy other possible priority lending 1287139453Sjhb * requests. If the thread's regular priority is less 1288139453Sjhb * important than prio, the thread will keep a priority boost 1289139453Sjhb * of prio. 1290139453Sjhb */ 1291139453Sjhbvoid 1292139453Sjhbsched_unlend_prio(struct thread *td, u_char prio) 1293139453Sjhb{ 1294139453Sjhb u_char base_pri; 1295139453Sjhb 1296139453Sjhb if (td->td_base_pri >= PRI_MIN_TIMESHARE && 1297139453Sjhb td->td_base_pri <= PRI_MAX_TIMESHARE) 1298139453Sjhb base_pri = td->td_ksegrp->kg_user_pri; 1299139453Sjhb else 1300139453Sjhb base_pri = td->td_base_pri; 1301139453Sjhb if (prio >= base_pri) { 1302139453Sjhb td->td_flags &= ~ TDF_BORROWING; 1303139453Sjhb sched_thread_priority(td, base_pri); 1304139453Sjhb } else 1305139453Sjhb sched_lend_prio(td, prio); 1306139453Sjhb} 1307139453Sjhb 1308139453Sjhbvoid 1309139453Sjhbsched_prio(struct thread *td, u_char prio) 1310139453Sjhb{ 1311139453Sjhb u_char oldprio; 1312139453Sjhb 1313139453Sjhb /* First, update the base priority. */ 1314139453Sjhb td->td_base_pri = prio; 1315139453Sjhb 1316139453Sjhb /* 1317139453Sjhb * If the therad is borrowing another thread's priority, don't 1318139453Sjhb * ever lower the priority. 1319139453Sjhb */ 1320139453Sjhb if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 1321139453Sjhb return; 1322139453Sjhb 1323139453Sjhb /* Change the real priority. */ 1324139453Sjhb oldprio = td->td_priority; 1325139453Sjhb sched_thread_priority(td, prio); 1326139453Sjhb 1327139453Sjhb /* 1328139453Sjhb * If the thread is on a turnstile, then let the turnstile update 1329139453Sjhb * its state. 1330139453Sjhb */ 1331139453Sjhb if (TD_ON_LOCK(td) && oldprio != prio) 1332139453Sjhb turnstile_adjust(td, oldprio); 1333139453Sjhb} 1334139453Sjhb 1335139453Sjhbvoid 1336135051Sjuliansched_switch(struct thread *td, struct thread *newtd, int flags) 1337109864Sjeff{ 1338139334Sjeff struct kseq *ksq; 1339109864Sjeff struct kse *ke; 1340109864Sjeff 1341109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1342109864Sjeff 1343109864Sjeff ke = td->td_kse; 1344139334Sjeff ksq = KSEQ_SELF(); 1345109864Sjeff 1346133555Sjeff td->td_lastcpu = td->td_oncpu; 1347113339Sjulian td->td_oncpu = NOCPU; 1348132266Sjhb td->td_flags &= ~TDF_NEEDRESCHED; 1349132266Sjhb td->td_pflags &= ~TDP_OWEPREEMPT; 1350109864Sjeff 1351123434Sjeff /* 1352123434Sjeff * If the KSE has been assigned it may be in the process of switching 1353123434Sjeff * to the new cpu. This is the case in sched_bind(). 
1354123434Sjeff */ 1355139334Sjeff if (td == PCPU_GET(idlethread)) { 1356139334Sjeff TD_SET_CAN_RUN(td); 1357139334Sjeff } else if ((ke->ke_flags & KEF_ASSIGNED) == 0) { 1358139334Sjeff /* We are ending our run so make our slot available again */ 1359139334Sjeff SLOT_RELEASE(td->td_ksegrp); 1360139334Sjeff if (ke->ke_runq == NULL) 1361139334Sjeff panic("Thread not on runq."); 1362139334Sjeff kseq_load_rem(ksq, ke); 1363139334Sjeff if (TD_IS_RUNNING(td)) { 1364139334Sjeff /* 1365139334Sjeff * Don't allow the thread to migrate 1366139334Sjeff * from a preemption. 1367139334Sjeff */ 1368139334Sjeff ke->ke_flags |= KEF_HOLD; 1369139334Sjeff setrunqueue(td, (flags & SW_PREEMPT) ? 1370139334Sjeff SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : 1371139334Sjeff SRQ_OURSELF|SRQ_YIELDING); 1372139334Sjeff ke->ke_flags &= ~KEF_HOLD; 1373139334Sjeff } else if ((td->td_proc->p_flag & P_HADTHREADS) && 1374139334Sjeff (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp)) 1375139334Sjeff /* 1376139334Sjeff * We will not be on the run queue. 1377139334Sjeff * So we must be sleeping or similar. 1378139334Sjeff * Don't use the slot if we will need it 1379139334Sjeff * for newtd. 1380139334Sjeff */ 1381139334Sjeff slot_fill(td->td_ksegrp); 1382121146Sjeff } 1383136167Sjulian if (newtd != NULL) { 1384136170Sjulian /* 1385136170Sjulian * If we bring in a thread, 1386136170Sjulian * then account for it as if it had been added to the 1387136170Sjulian * run queue and then chosen. 1388136170Sjulian */ 1389136169Sjulian newtd->td_kse->ke_flags |= KEF_DIDRUN; 1390139334Sjeff newtd->td_kse->ke_runq = ksq->ksq_curr; 1391136167Sjulian SLOT_USE(newtd->td_ksegrp); 1392136173Sjulian TD_SET_RUNNING(newtd); 1393133427Sjeff kseq_load_add(KSEQ_SELF(), newtd->td_kse); 1394136167Sjulian } else 1395131473Sjhb newtd = choosethread(); 1396121128Sjeff if (td != newtd) 1397121128Sjeff cpu_switch(td, newtd); 1398121128Sjeff sched_lock.mtx_lock = (uintptr_t)td; 1399109864Sjeff 1400113339Sjulian td->td_oncpu = PCPU_GET(cpuid); 1401109864Sjeff} 1402109864Sjeff 1403109864Sjeffvoid 1404130551Sjuliansched_nice(struct proc *p, int nice) 1405109864Sjeff{ 1406130551Sjulian struct ksegrp *kg; 1407113357Sjeff struct kse *ke; 1408109864Sjeff struct thread *td; 1409113357Sjeff struct kseq *kseq; 1410109864Sjeff 1411130551Sjulian PROC_LOCK_ASSERT(p, MA_OWNED); 1412113873Sjhb mtx_assert(&sched_lock, MA_OWNED); 1413113357Sjeff /* 1414113357Sjeff * We need to adjust the nice counts for running KSEs. 
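 * Each thread that currently has a run queue assigned is moved from the
 * bucket for the old p_nice to the bucket for the new value on its kseq;
 * user priorities are recomputed and NEEDRESCHED is set further below.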
1415113357Sjeff */ 1416130551Sjulian FOREACH_KSEGRP_IN_PROC(p, kg) { 1417130551Sjulian if (kg->kg_pri_class == PRI_TIMESHARE) { 1418134791Sjulian FOREACH_THREAD_IN_GROUP(kg, td) { 1419134791Sjulian ke = td->td_kse; 1420130551Sjulian if (ke->ke_runq == NULL) 1421130551Sjulian continue; 1422130551Sjulian kseq = KSEQ_CPU(ke->ke_cpu); 1423130551Sjulian kseq_nice_rem(kseq, p->p_nice); 1424130551Sjulian kseq_nice_add(kseq, nice); 1425130551Sjulian } 1426113357Sjeff } 1427130551Sjulian } 1428130551Sjulian p->p_nice = nice; 1429130551Sjulian FOREACH_KSEGRP_IN_PROC(p, kg) { 1430130551Sjulian sched_priority(kg); 1431130551Sjulian FOREACH_THREAD_IN_GROUP(kg, td) 1432130551Sjulian td->td_flags |= TDF_NEEDRESCHED; 1433130551Sjulian } 1434109864Sjeff} 1435109864Sjeff 1436109864Sjeffvoid 1437126326Sjhbsched_sleep(struct thread *td) 1438109864Sjeff{ 1439109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1440109864Sjeff 1441109864Sjeff td->td_slptime = ticks; 1442109864Sjeff} 1443109864Sjeff 1444109864Sjeffvoid 1445109864Sjeffsched_wakeup(struct thread *td) 1446109864Sjeff{ 1447109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1448109864Sjeff 1449109864Sjeff /* 1450109864Sjeff * Let the kseg know how long we slept for. This is because process 1451109864Sjeff * interactivity behavior is modeled in the kseg. 1452109864Sjeff */ 1453111788Sjeff if (td->td_slptime) { 1454111788Sjeff struct ksegrp *kg; 1455113357Sjeff int hzticks; 1456109864Sjeff 1457111788Sjeff kg = td->td_ksegrp; 1458121868Sjeff hzticks = (ticks - td->td_slptime) << 10; 1459121868Sjeff if (hzticks >= SCHED_SLP_RUN_MAX) { 1460121868Sjeff kg->kg_slptime = SCHED_SLP_RUN_MAX; 1461121868Sjeff kg->kg_runtime = 1; 1462121868Sjeff } else { 1463121868Sjeff kg->kg_slptime += hzticks; 1464121868Sjeff sched_interact_update(kg); 1465121868Sjeff } 1466111788Sjeff sched_priority(kg); 1467134791Sjulian sched_slice(td->td_kse); 1468111788Sjeff td->td_slptime = 0; 1469109864Sjeff } 1470134586Sjulian setrunqueue(td, SRQ_BORING); 1471109864Sjeff} 1472109864Sjeff 1473109864Sjeff/* 1474109864Sjeff * Penalize the parent for creating a new child and initialize the child's 1475109864Sjeff * priority. 1476109864Sjeff */ 1477109864Sjeffvoid 1478134791Sjuliansched_fork(struct thread *td, struct thread *childtd) 1479109864Sjeff{ 1480109864Sjeff 1481109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1482109864Sjeff 1483134791Sjulian sched_fork_ksegrp(td, childtd->td_ksegrp); 1484134791Sjulian sched_fork_thread(td, childtd); 1485113357Sjeff} 1486113357Sjeff 1487113357Sjeffvoid 1488132372Sjuliansched_fork_ksegrp(struct thread *td, struct ksegrp *child) 1489113357Sjeff{ 1490132372Sjulian struct ksegrp *kg = td->td_ksegrp; 1491134791Sjulian mtx_assert(&sched_lock, MA_OWNED); 1492116365Sjeff 1493121868Sjeff child->kg_slptime = kg->kg_slptime; 1494121868Sjeff child->kg_runtime = kg->kg_runtime; 1495121868Sjeff child->kg_user_pri = kg->kg_user_pri; 1496121868Sjeff sched_interact_fork(child); 1497116463Sjeff kg->kg_runtime += tickincr << 10; 1498116463Sjeff sched_interact_update(kg); 1499113357Sjeff} 1500109864Sjeff 1501113357Sjeffvoid 1502113357Sjeffsched_fork_thread(struct thread *td, struct thread *child) 1503113357Sjeff{ 1504134791Sjulian struct kse *ke; 1505134791Sjulian struct kse *ke2; 1506134791Sjulian 1507134791Sjulian sched_newthread(child); 1508134791Sjulian ke = td->td_kse; 1509134791Sjulian ke2 = child->td_kse; 1510134791Sjulian ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. 
*/ 1511134791Sjulian ke2->ke_cpu = ke->ke_cpu; 1512134791Sjulian ke2->ke_runq = NULL; 1513134791Sjulian 1514134791Sjulian /* Grab our parent's cpu estimation information. */ 1515134791Sjulian ke2->ke_ticks = ke->ke_ticks; 1516134791Sjulian ke2->ke_ltick = ke->ke_ltick; 1517134791Sjulian ke2->ke_ftick = ke->ke_ftick; 1518113357Sjeff} 1519113357Sjeff 1520113357Sjeffvoid 1521113357Sjeffsched_class(struct ksegrp *kg, int class) 1522113357Sjeff{ 1523113357Sjeff struct kseq *kseq; 1524113357Sjeff struct kse *ke; 1525134791Sjulian struct thread *td; 1526121896Sjeff int nclass; 1527121896Sjeff int oclass; 1528113357Sjeff 1529113923Sjhb mtx_assert(&sched_lock, MA_OWNED); 1530113357Sjeff if (kg->kg_pri_class == class) 1531113357Sjeff return; 1532113357Sjeff 1533121896Sjeff nclass = PRI_BASE(class); 1534121896Sjeff oclass = PRI_BASE(kg->kg_pri_class); 1535134791Sjulian FOREACH_THREAD_IN_GROUP(kg, td) { 1536134791Sjulian ke = td->td_kse; 1537113357Sjeff if (ke->ke_state != KES_ONRUNQ && 1538113357Sjeff ke->ke_state != KES_THREAD) 1539113357Sjeff continue; 1540113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1541113357Sjeff 1542121896Sjeff#ifdef SMP 1543122744Sjeff /* 1544122744Sjeff * On SMP if we're on the RUNQ we must adjust the transferable 1545122744Sjeff * count because we could be changing to or from an interrupt 1546122744Sjeff * class. 1547122744Sjeff */ 1548122744Sjeff if (ke->ke_state == KES_ONRUNQ) { 1549139334Sjeff if (KSE_CAN_MIGRATE(ke)) { 1550123433Sjeff kseq->ksq_transferable--; 1551123433Sjeff kseq->ksq_group->ksg_transferable--; 1552123433Sjeff } 1553139334Sjeff if (KSE_CAN_MIGRATE(ke)) { 1554123433Sjeff kseq->ksq_transferable++; 1555123433Sjeff kseq->ksq_group->ksg_transferable++; 1556123433Sjeff } 1557122744Sjeff } 1558121896Sjeff#endif 1559122744Sjeff if (oclass == PRI_TIMESHARE) { 1560121896Sjeff kseq->ksq_load_timeshare--; 1561130551Sjulian kseq_nice_rem(kseq, kg->kg_proc->p_nice); 1562122744Sjeff } 1563122744Sjeff if (nclass == PRI_TIMESHARE) { 1564121896Sjeff kseq->ksq_load_timeshare++; 1565130551Sjulian kseq_nice_add(kseq, kg->kg_proc->p_nice); 1566122744Sjeff } 1567109970Sjeff } 1568109970Sjeff 1569113357Sjeff kg->kg_pri_class = class; 1570109864Sjeff} 1571109864Sjeff 1572109864Sjeff/* 1573109864Sjeff * Return some of the child's priority and interactivity to the parent.
1574109864Sjeff */ 1575109864Sjeffvoid 1576134791Sjuliansched_exit(struct proc *p, struct thread *childtd) 1577109864Sjeff{ 1578109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1579134791Sjulian sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd); 1580139316Sjeff sched_exit_thread(NULL, childtd); 1581109864Sjeff} 1582109864Sjeff 1583109864Sjeffvoid 1584132372Sjuliansched_exit_ksegrp(struct ksegrp *kg, struct thread *td) 1585113372Sjeff{ 1586132372Sjulian /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */ 1587132372Sjulian kg->kg_runtime += td->td_ksegrp->kg_runtime; 1588116463Sjeff sched_interact_update(kg); 1589113372Sjeff} 1590113372Sjeff 1591113372Sjeffvoid 1592134791Sjuliansched_exit_thread(struct thread *td, struct thread *childtd) 1593113372Sjeff{ 1594139316Sjeff CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", 1595139316Sjeff childtd, childtd->td_proc->p_comm, childtd->td_priority); 1596134791Sjulian kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse); 1597113372Sjeff} 1598113372Sjeff 1599113372Sjeffvoid 1600121127Sjeffsched_clock(struct thread *td) 1601109864Sjeff{ 1602113357Sjeff struct kseq *kseq; 1603113357Sjeff struct ksegrp *kg; 1604121127Sjeff struct kse *ke; 1605109864Sjeff 1606129982Sjeff mtx_assert(&sched_lock, MA_OWNED); 1607133427Sjeff kseq = KSEQ_SELF(); 1608129982Sjeff#ifdef SMP 1609139334Sjeff if (ticks >= bal_tick) 1610129982Sjeff sched_balance(); 1611139334Sjeff if (ticks >= gbal_tick && balance_groups) 1612129982Sjeff sched_balance_groups(); 1613133427Sjeff /* 1614133427Sjeff * We could have been assigned a non real-time thread without an 1615133427Sjeff * IPI. 1616133427Sjeff */ 1617133427Sjeff if (kseq->ksq_assigned) 1618133427Sjeff kseq_assign(kseq); /* Potentially sets NEEDRESCHED */ 1619129982Sjeff#endif 1620113357Sjeff /* 1621113357Sjeff * sched_setup() apparently happens prior to stathz being set. We 1622113357Sjeff * need to resolve the timers earlier in the boot so we can avoid 1623113357Sjeff * calculating this here. 1624113357Sjeff */ 1625113357Sjeff if (realstathz == 0) { 1626113357Sjeff realstathz = stathz ? stathz : hz; 1627113357Sjeff tickincr = hz / realstathz; 1628113357Sjeff /* 1629113357Sjeff * XXX This does not work for values of stathz that are much 1630113357Sjeff * larger than hz. 1631113357Sjeff */ 1632113357Sjeff if (tickincr == 0) 1633113357Sjeff tickincr = 1; 1634113357Sjeff } 1635109864Sjeff 1636121127Sjeff ke = td->td_kse; 1637113357Sjeff kg = ke->ke_ksegrp; 1638109864Sjeff 1639110028Sjeff /* Adjust ticks for pctcpu */ 1640111793Sjeff ke->ke_ticks++; 1641109971Sjeff ke->ke_ltick = ticks; 1642112994Sjeff 1643109971Sjeff /* Go up to one second beyond our max and then trim back down */ 1644109971Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1645109971Sjeff sched_pctcpu_update(ke); 1646109971Sjeff 1647114496Sjulian if (td->td_flags & TDF_IDLETD) 1648109864Sjeff return; 1649110028Sjeff /* 1650113357Sjeff * We only do slicing code for TIMESHARE ksegrps. 1651113357Sjeff */ 1652113357Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 1653113357Sjeff return; 1654113357Sjeff /* 1655110645Sjeff * We used a tick; charge it to the ksegrp so that we can compute our 1656113357Sjeff * interactivity. 1657109864Sjeff */ 1658113357Sjeff kg->kg_runtime += tickincr << 10; 1659116463Sjeff sched_interact_update(kg); 1660110645Sjeff 1661109864Sjeff /* 1662109864Sjeff * We used up one time slice.
1663109864Sjeff */ 1664122847Sjeff if (--ke->ke_slice > 0) 1665113357Sjeff return; 1666109864Sjeff /* 1667113357Sjeff * We're out of time, recompute priorities and requeue. 1668109864Sjeff */ 1669122744Sjeff kseq_load_rem(kseq, ke); 1670113357Sjeff sched_priority(kg); 1671113357Sjeff sched_slice(ke); 1672113357Sjeff if (SCHED_CURR(kg, ke)) 1673113357Sjeff ke->ke_runq = kseq->ksq_curr; 1674113357Sjeff else 1675113357Sjeff ke->ke_runq = kseq->ksq_next; 1676122744Sjeff kseq_load_add(kseq, ke); 1677113357Sjeff td->td_flags |= TDF_NEEDRESCHED; 1678109864Sjeff} 1679109864Sjeff 1680109864Sjeffint 1681109864Sjeffsched_runnable(void) 1682109864Sjeff{ 1683109864Sjeff struct kseq *kseq; 1684115998Sjeff int load; 1685109864Sjeff 1686115998Sjeff load = 1; 1687115998Sjeff 1688110028Sjeff kseq = KSEQ_SELF(); 1689121790Sjeff#ifdef SMP 1690122094Sjeff if (kseq->ksq_assigned) { 1691122094Sjeff mtx_lock_spin(&sched_lock); 1692121790Sjeff kseq_assign(kseq); 1693122094Sjeff mtx_unlock_spin(&sched_lock); 1694122094Sjeff } 1695121790Sjeff#endif 1696121605Sjeff if ((curthread->td_flags & TDF_IDLETD) != 0) { 1697121605Sjeff if (kseq->ksq_load > 0) 1698121605Sjeff goto out; 1699121605Sjeff } else 1700121605Sjeff if (kseq->ksq_load - 1 > 0) 1701121605Sjeff goto out; 1702115998Sjeff load = 0; 1703115998Sjeffout: 1704115998Sjeff return (load); 1705109864Sjeff} 1706109864Sjeff 1707109864Sjeffvoid 1708109864Sjeffsched_userret(struct thread *td) 1709109864Sjeff{ 1710109864Sjeff struct ksegrp *kg; 1711121605Sjeff 1712139453Sjhb KASSERT((td->td_flags & TDF_BORROWING) == 0, 1713139453Sjhb ("thread with borrowed priority returning to userland")); 1714139453Sjhb kg = td->td_ksegrp; 1715139453Sjhb if (td->td_priority != kg->kg_user_pri) { 1716109864Sjeff mtx_lock_spin(&sched_lock); 1717109864Sjeff td->td_priority = kg->kg_user_pri; 1718139453Sjhb td->td_base_pri = kg->kg_user_pri; 1719109864Sjeff mtx_unlock_spin(&sched_lock); 1720109864Sjeff } 1721109864Sjeff} 1722109864Sjeff 1723109864Sjeffstruct kse * 1724109970Sjeffsched_choose(void) 1725109970Sjeff{ 1726110028Sjeff struct kseq *kseq; 1727109970Sjeff struct kse *ke; 1728109970Sjeff 1729115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 1730121790Sjeff kseq = KSEQ_SELF(); 1731113357Sjeff#ifdef SMP 1732123433Sjeffrestart: 1733121790Sjeff if (kseq->ksq_assigned) 1734121790Sjeff kseq_assign(kseq); 1735113357Sjeff#endif 1736121790Sjeff ke = kseq_choose(kseq); 1737109864Sjeff if (ke) { 1738121790Sjeff#ifdef SMP 1739121790Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1740123433Sjeff if (kseq_idled(kseq) == 0) 1741123433Sjeff goto restart; 1742121790Sjeff#endif 1743122744Sjeff kseq_runq_rem(kseq, ke); 1744109864Sjeff ke->ke_state = KES_THREAD; 1745113357Sjeff return (ke); 1746109864Sjeff } 1747109970Sjeff#ifdef SMP 1748123433Sjeff if (kseq_idled(kseq) == 0) 1749123433Sjeff goto restart; 1750109970Sjeff#endif 1751113357Sjeff return (NULL); 1752109864Sjeff} 1753109864Sjeff 1754109864Sjeffvoid 1755134586Sjuliansched_add(struct thread *td, int flags) 1756109864Sjeff{ 1757110267Sjeff struct kseq *kseq; 1758113357Sjeff struct ksegrp *kg; 1759121127Sjeff struct kse *ke; 1760139334Sjeff int preemptive; 1761133427Sjeff int canmigrate; 1762121790Sjeff int class; 1763109864Sjeff 1764139316Sjeff CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", 1765139316Sjeff td, td->td_proc->p_comm, td->td_priority, curthread, 1766139316Sjeff curthread->td_proc->p_comm); 1767121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 1768121127Sjeff ke = td->td_kse; 1769121127Sjeff kg = td->td_ksegrp; 
1770139334Sjeff canmigrate = 1; 1771139334Sjeff preemptive = !(flags & SRQ_YIELDING); 1772139334Sjeff class = PRI_BASE(kg->kg_pri_class); 1773139334Sjeff kseq = KSEQ_SELF(); 1774139334Sjeff if ((ke->ke_flags & KEF_INTERNAL) == 0) 1775139334Sjeff SLOT_USE(td->td_ksegrp); 1776139334Sjeff ke->ke_flags &= ~KEF_INTERNAL; 1777139334Sjeff#ifdef SMP 1778138802Sjeff if (ke->ke_flags & KEF_ASSIGNED) { 1779139334Sjeff if (ke->ke_flags & KEF_REMOVED) 1780138802Sjeff ke->ke_flags &= ~KEF_REMOVED; 1781121790Sjeff return; 1782138802Sjeff } 1783139334Sjeff canmigrate = KSE_CAN_MIGRATE(ke); 1784139334Sjeff#endif 1785109864Sjeff KASSERT(ke->ke_state != KES_ONRUNQ, 1786110267Sjeff ("sched_add: kse %p (%s) already in run queue", ke, 1787109864Sjeff ke->ke_proc->p_comm)); 1788109864Sjeff KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1789110267Sjeff ("sched_add: process swapped out")); 1790113387Sjeff KASSERT(ke->ke_runq == NULL, 1791113387Sjeff ("sched_add: KSE %p is still assigned to a run queue", ke)); 1792121790Sjeff switch (class) { 1793112994Sjeff case PRI_ITHD: 1794112994Sjeff case PRI_REALTIME: 1795113357Sjeff ke->ke_runq = kseq->ksq_curr; 1796113357Sjeff ke->ke_slice = SCHED_SLICE_MAX; 1797139334Sjeff if (canmigrate) 1798139334Sjeff ke->ke_cpu = PCPU_GET(cpuid); 1799112994Sjeff break; 1800112994Sjeff case PRI_TIMESHARE: 1801113387Sjeff if (SCHED_CURR(kg, ke)) 1802113387Sjeff ke->ke_runq = kseq->ksq_curr; 1803113387Sjeff else 1804113387Sjeff ke->ke_runq = kseq->ksq_next; 1805113357Sjeff break; 1806112994Sjeff case PRI_IDLE: 1807113357Sjeff /* 1808113357Sjeff * This is for priority prop. 1809113357Sjeff */ 1810121605Sjeff if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1811113357Sjeff ke->ke_runq = kseq->ksq_curr; 1812113357Sjeff else 1813113357Sjeff ke->ke_runq = &kseq->ksq_idle; 1814113357Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1815112994Sjeff break; 1816113357Sjeff default: 1817121868Sjeff panic("Unknown pri class."); 1818113357Sjeff break; 1819112994Sjeff } 1820121790Sjeff#ifdef SMP 1821133427Sjeff /* 1822133427Sjeff * Don't migrate running threads here. Force the long term balancer 1823133427Sjeff * to do it. 1824133427Sjeff */ 1825133555Sjeff if (ke->ke_flags & KEF_HOLD) { 1826133555Sjeff ke->ke_flags &= ~KEF_HOLD; 1827133427Sjeff canmigrate = 0; 1828133555Sjeff } 1829133427Sjeff /* 1830133427Sjeff * If this thread is pinned or bound, notify the target cpu. 1831133427Sjeff */ 1832133427Sjeff if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) { 1833123529Sjeff ke->ke_runq = NULL; 1834123433Sjeff kseq_notify(ke, ke->ke_cpu); 1835123433Sjeff return; 1836123433Sjeff } 1837121790Sjeff /* 1838123685Sjeff * If we had been idle, clear our bit in the group and potentially 1839123685Sjeff * the global bitmap. If not, see if we should transfer this thread. 1840121790Sjeff */ 1841123433Sjeff if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1842123433Sjeff (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1843121790Sjeff /* 1844123433Sjeff * Check to see if our group is unidling, and if so, remove it 1845123433Sjeff * from the global idle mask. 1846121790Sjeff */ 1847123433Sjeff if (kseq->ksq_group->ksg_idlemask == 1848123433Sjeff kseq->ksq_group->ksg_cpumask) 1849123433Sjeff atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1850123433Sjeff /* 1851123433Sjeff * Now remove ourselves from the group specific idle mask. 
1852123433Sjeff */ 1853123433Sjeff kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1854139334Sjeff } else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD) 1855123685Sjeff if (kseq_transfer(kseq, ke, class)) 1856123685Sjeff return; 1857133427Sjeff ke->ke_cpu = PCPU_GET(cpuid); 1858121790Sjeff#endif 1859133555Sjeff if (td->td_priority < curthread->td_priority && 1860133555Sjeff ke->ke_runq == kseq->ksq_curr) 1861133555Sjeff curthread->td_flags |= TDF_NEEDRESCHED; 1862131839Sjhb if (preemptive && maybe_preempt(td)) 1863131481Sjhb return; 1864109864Sjeff ke->ke_state = KES_ONRUNQ; 1865109864Sjeff 1866139334Sjeff kseq_runq_add(kseq, ke, flags); 1867122744Sjeff kseq_load_add(kseq, ke); 1868109864Sjeff} 1869109864Sjeff 1870109864Sjeffvoid 1871121127Sjeffsched_rem(struct thread *td) 1872109864Sjeff{ 1873113357Sjeff struct kseq *kseq; 1874121127Sjeff struct kse *ke; 1875113357Sjeff 1876139316Sjeff CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", 1877139316Sjeff td, td->td_proc->p_comm, td->td_priority, curthread, 1878139316Sjeff curthread->td_proc->p_comm); 1879139334Sjeff mtx_assert(&sched_lock, MA_OWNED); 1880139334Sjeff ke = td->td_kse; 1881139334Sjeff SLOT_RELEASE(td->td_ksegrp); 1882138802Sjeff if (ke->ke_flags & KEF_ASSIGNED) { 1883138802Sjeff ke->ke_flags |= KEF_REMOVED; 1884121790Sjeff return; 1885138802Sjeff } 1886124958Sjeff KASSERT((ke->ke_state == KES_ONRUNQ), 1887124958Sjeff ("sched_rem: KSE not on run queue")); 1888109864Sjeff 1889109864Sjeff ke->ke_state = KES_THREAD; 1890113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1891122744Sjeff kseq_runq_rem(kseq, ke); 1892122744Sjeff kseq_load_rem(kseq, ke); 1893109864Sjeff} 1894109864Sjeff 1895109864Sjefffixpt_t 1896121127Sjeffsched_pctcpu(struct thread *td) 1897109864Sjeff{ 1898109864Sjeff fixpt_t pctcpu; 1899121127Sjeff struct kse *ke; 1900109864Sjeff 1901109864Sjeff pctcpu = 0; 1902121127Sjeff ke = td->td_kse; 1903121290Sjeff if (ke == NULL) 1904121290Sjeff return (0); 1905109864Sjeff 1906115998Sjeff mtx_lock_spin(&sched_lock); 1907109864Sjeff if (ke->ke_ticks) { 1908109864Sjeff int rtick; 1909109864Sjeff 1910116365Sjeff /* 1911116365Sjeff * Don't update more frequently than twice a second. Allowing 1912116365Sjeff * this causes the cpu usage to decay away too quickly due to 1913116365Sjeff * rounding errors. 1914116365Sjeff */ 1915123435Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || 1916123435Sjeff ke->ke_ltick < (ticks - (hz / 2))) 1917116365Sjeff sched_pctcpu_update(ke); 1918109864Sjeff /* How many rtick per second ? 
*/ 1919116365Sjeff rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1920110226Sscottl pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1921109864Sjeff } 1922109864Sjeff 1923109864Sjeff ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1924113865Sjhb mtx_unlock_spin(&sched_lock); 1925109864Sjeff 1926109864Sjeff return (pctcpu); 1927109864Sjeff} 1928109864Sjeff 1929122038Sjeffvoid 1930122038Sjeffsched_bind(struct thread *td, int cpu) 1931122038Sjeff{ 1932122038Sjeff struct kse *ke; 1933122038Sjeff 1934122038Sjeff mtx_assert(&sched_lock, MA_OWNED); 1935122038Sjeff ke = td->td_kse; 1936122038Sjeff ke->ke_flags |= KEF_BOUND; 1937123433Sjeff#ifdef SMP 1938123433Sjeff if (PCPU_GET(cpuid) == cpu) 1939122038Sjeff return; 1940122038Sjeff /* sched_rem without the runq_remove */ 1941122038Sjeff ke->ke_state = KES_THREAD; 1942122744Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1943122038Sjeff kseq_notify(ke, cpu); 1944122038Sjeff /* When we return from mi_switch we'll be on the correct cpu. */ 1945131527Sphk mi_switch(SW_VOL, NULL); 1946122038Sjeff#endif 1947122038Sjeff} 1948122038Sjeff 1949122038Sjeffvoid 1950122038Sjeffsched_unbind(struct thread *td) 1951122038Sjeff{ 1952122038Sjeff mtx_assert(&sched_lock, MA_OWNED); 1953122038Sjeff td->td_kse->ke_flags &= ~KEF_BOUND; 1954122038Sjeff} 1955122038Sjeff 1956109864Sjeffint 1957125289Sjeffsched_load(void) 1958125289Sjeff{ 1959125289Sjeff#ifdef SMP 1960125289Sjeff int total; 1961125289Sjeff int i; 1962125289Sjeff 1963125289Sjeff total = 0; 1964125289Sjeff for (i = 0; i <= ksg_maxid; i++) 1965125289Sjeff total += KSEQ_GROUP(i)->ksg_load; 1966125289Sjeff return (total); 1967125289Sjeff#else 1968125289Sjeff return (KSEQ_SELF()->ksq_sysload); 1969125289Sjeff#endif 1970125289Sjeff} 1971125289Sjeff 1972125289Sjeffint 1973109864Sjeffsched_sizeof_ksegrp(void) 1974109864Sjeff{ 1975109864Sjeff return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1976109864Sjeff} 1977109864Sjeff 1978109864Sjeffint 1979109864Sjeffsched_sizeof_proc(void) 1980109864Sjeff{ 1981109864Sjeff return (sizeof(struct proc)); 1982109864Sjeff} 1983109864Sjeff 1984109864Sjeffint 1985109864Sjeffsched_sizeof_thread(void) 1986109864Sjeff{ 1987109864Sjeff return (sizeof(struct thread) + sizeof(struct td_sched)); 1988109864Sjeff} 1989134791Sjulian#define KERN_SWITCH_INCLUDE 1 1990134791Sjulian#include "kern/kern_switch.c" 1991
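The lend/unlend contract implemented by sched_lend_prio(), sched_unlend_prio(), and sched_prio() above is easier to follow with concrete numbers. The sequence below is only an illustrative sketch, not code from this revision: the thread, the lock, and the priority values are hypothetical (lower numbers mean stronger priorities, and all values are assumed to lie in the time-sharing range), and the caller is assumed to be the turnstile code that this revision hooks up for priority propagation.

/*
 * Sketch: a time-sharing thread td owns a contested mutex and has
 * kg_user_pri == td_base_pri == 130 (hypothetical values).
 *
 *	sched_lend_prio(td, 80);
 *		A waiter's priority is lent: TDF_BORROWING is set and
 *		td_priority becomes 80.
 *
 *	sched_prio(td, 136);
 *		The base priority changes while borrowing: td_base_pri
 *		becomes 136, but td_priority stays at 80 because 80 is
 *		already more important than 136.
 *
 *	sched_unlend_prio(td, PRI_MAX);
 *		No lender remains.  td_base_pri is a time-sharing priority,
 *		so base_pri resolves to kg_user_pri (130); PRI_MAX >= 130,
 *		so TDF_BORROWING is cleared and the thread drops back to
 *		priority 130.
 */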
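sched_bind() and sched_unbind() are easiest to read next to their expected calling pattern. The fragment below is a hypothetical caller, not code from this file; it only illustrates that sched_lock must be held around both calls and that sched_bind() may context switch in order to migrate onto the target cpu.

/*
 * Hypothetical caller of sched_bind()/sched_unbind():
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_bind(curthread, cpu);	May mi_switch(); on return the
 *					thread is running on 'cpu'.
 *	mtx_unlock_spin(&sched_lock);
 *
 *	... work that must execute on 'cpu' ...
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_unbind(curthread);	Clears KEF_BOUND so migration is
 *					allowed again.
 *	mtx_unlock_spin(&sched_lock);
 */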