sched_ule.c revision 161599
1109864Sjeff/*- 2146955Sjeff * Copyright (c) 2002-2005, Jeffrey Roberson <jeff@freebsd.org> 3109864Sjeff * All rights reserved. 4109864Sjeff * 5109864Sjeff * Redistribution and use in source and binary forms, with or without 6109864Sjeff * modification, are permitted provided that the following conditions 7109864Sjeff * are met: 8109864Sjeff * 1. Redistributions of source code must retain the above copyright 9109864Sjeff * notice unmodified, this list of conditions, and the following 10109864Sjeff * disclaimer. 11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright 12109864Sjeff * notice, this list of conditions and the following disclaimer in the 13109864Sjeff * documentation and/or other materials provided with the distribution. 14109864Sjeff * 15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25109864Sjeff */ 26109864Sjeff 27116182Sobrien#include <sys/cdefs.h> 28116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 161599 2006-08-25 06:12:53Z davidxu $"); 29116182Sobrien 30147565Speter#include "opt_hwpmc_hooks.h" 31147565Speter#include "opt_sched.h" 32134649Sscottl 33134791Sjulian#define kse td_sched 34134791Sjulian 35109864Sjeff#include <sys/param.h> 36109864Sjeff#include <sys/systm.h> 37131929Smarcel#include <sys/kdb.h> 38109864Sjeff#include <sys/kernel.h> 39109864Sjeff#include <sys/ktr.h> 40109864Sjeff#include <sys/lock.h> 41109864Sjeff#include <sys/mutex.h> 42109864Sjeff#include <sys/proc.h> 43112966Sjeff#include <sys/resource.h> 44122038Sjeff#include <sys/resourcevar.h> 45109864Sjeff#include <sys/sched.h> 46109864Sjeff#include <sys/smp.h> 47109864Sjeff#include <sys/sx.h> 48109864Sjeff#include <sys/sysctl.h> 49109864Sjeff#include <sys/sysproto.h> 50139453Sjhb#include <sys/turnstile.h> 51161599Sdavidxu#include <sys/umtx.h> 52109864Sjeff#include <sys/vmmeter.h> 53109864Sjeff#ifdef KTRACE 54109864Sjeff#include <sys/uio.h> 55109864Sjeff#include <sys/ktrace.h> 56109864Sjeff#endif 57109864Sjeff 58145256Sjkoshy#ifdef HWPMC_HOOKS 59145256Sjkoshy#include <sys/pmckern.h> 60145256Sjkoshy#endif 61145256Sjkoshy 62109864Sjeff#include <machine/cpu.h> 63121790Sjeff#include <machine/smp.h> 64109864Sjeff 65109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 66109864Sjeff/* XXX This is bogus compatability crap for ps */ 67109864Sjeffstatic fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 68109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 69109864Sjeff 70109864Sjeffstatic void sched_setup(void *dummy); 71109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 72109864Sjeff 73153533Sdavidxustatic void sched_initticks(void *dummy); 
74153533SdavidxuSYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) 75153533Sdavidxu 76132589Sscottlstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 77113357Sjeff 78132589SscottlSYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 79132589Sscottl "Scheduler name"); 80130881Sscottl 81113357Sjeffstatic int slice_min = 1; 82113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 83113357Sjeff 84116365Sjeffstatic int slice_max = 10; 85113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 86113357Sjeff 87111857Sjeffint realstathz; 88153533Sdavidxuint tickincr = 1 << 10; 89111857Sjeff 90109864Sjeff/* 91146954Sjeff * The following datastructures are allocated within their parent structure 92146954Sjeff * but are scheduler specific. 93134791Sjulian */ 94146954Sjeff/* 95146954Sjeff * The schedulable entity that can be given a context to run. A process may 96146954Sjeff * have several of these. 97146954Sjeff */ 98134791Sjulianstruct kse { 99134791Sjulian TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ 100134791Sjulian int ke_flags; /* (j) KEF_* flags. */ 101134791Sjulian struct thread *ke_thread; /* (*) Active associated thread. */ 102134791Sjulian fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ 103159337Sdavidxu u_char ke_rqindex; /* (j) Run queue index. */ 104134791Sjulian enum { 105134791Sjulian KES_THREAD = 0x0, /* slaved to thread state */ 106134791Sjulian KES_ONRUNQ 107134791Sjulian } ke_state; /* (j) thread sched specific status. */ 108134791Sjulian int ke_slptime; 109134791Sjulian int ke_slice; 110134791Sjulian struct runq *ke_runq; 111134791Sjulian u_char ke_cpu; /* CPU that we have affinity for. */ 112134791Sjulian /* The following variables are only used for pctcpu calculation */ 113134791Sjulian int ke_ltick; /* Last tick that we were running on */ 114134791Sjulian int ke_ftick; /* First tick that we were running on */ 115134791Sjulian int ke_ticks; /* Tick count */ 116134791Sjulian 117134791Sjulian}; 118146954Sjeff#define td_kse td_sched 119134791Sjulian#define td_slptime td_kse->ke_slptime 120134791Sjulian#define ke_proc ke_thread->td_proc 121134791Sjulian#define ke_ksegrp ke_thread->td_ksegrp 122146954Sjeff#define ke_assign ke_procq.tqe_next 123134791Sjulian/* flags kept in ke_flags */ 124139334Sjeff#define KEF_ASSIGNED 0x0001 /* Thread is being migrated. */ 125139334Sjeff#define KEF_BOUND 0x0002 /* Thread can not migrate. */ 126139334Sjeff#define KEF_XFERABLE 0x0004 /* Thread was added as transferable. */ 127139334Sjeff#define KEF_HOLD 0x0008 /* Thread is temporarily bound. */ 128139334Sjeff#define KEF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */ 129146954Sjeff#define KEF_INTERNAL 0x0020 /* Thread added due to migration. */ 130148856Sdavidxu#define KEF_PREEMPTED 0x0040 /* Thread was preempted */ 131146954Sjeff#define KEF_DIDRUN 0x02000 /* Thread actually ran. */ 132146954Sjeff#define KEF_EXIT 0x04000 /* Thread is being killed. */ 133121790Sjeff 134109864Sjeffstruct kg_sched { 135134791Sjulian struct thread *skg_last_assigned; /* (j) Last thread assigned to */ 136134791Sjulian /* the system scheduler */ 137110645Sjeff int skg_slptime; /* Number of ticks we vol. 
slept */ 138110645Sjeff int skg_runtime; /* Number of ticks we were running */ 139134791Sjulian int skg_avail_opennings; /* (j) Num unfilled slots in group.*/ 140134791Sjulian int skg_concurrency; /* (j) Num threads requested in group.*/ 141109864Sjeff}; 142134791Sjulian#define kg_last_assigned kg_sched->skg_last_assigned 143134791Sjulian#define kg_avail_opennings kg_sched->skg_avail_opennings 144134791Sjulian#define kg_concurrency kg_sched->skg_concurrency 145134791Sjulian#define kg_runtime kg_sched->skg_runtime 146134791Sjulian#define kg_slptime kg_sched->skg_slptime 147109864Sjeff 148146954Sjeff#define SLOT_RELEASE(kg) (kg)->kg_avail_opennings++ 149146954Sjeff#define SLOT_USE(kg) (kg)->kg_avail_opennings-- 150109864Sjeff 151134791Sjulianstatic struct kse kse0; 152134791Sjulianstatic struct kg_sched kg_sched0; 153109864Sjeff 154109864Sjeff/* 155116642Sjeff * The priority is primarily determined by the interactivity score. Thus, we 156116642Sjeff * give lower(better) priorities to kse groups that use less CPU. The nice 157116642Sjeff * value is then directly added to this to allow nice to have some effect 158116642Sjeff * on latency. 159111857Sjeff * 160111857Sjeff * PRI_RANGE: Total priority range for timeshare threads. 161116642Sjeff * PRI_NRESV: Number of nice values. 162111857Sjeff * PRI_BASE: The start of the dynamic range. 163109864Sjeff */ 164111857Sjeff#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 165121869Sjeff#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 166121869Sjeff#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 167116642Sjeff#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 168113357Sjeff#define SCHED_PRI_INTERACT(score) \ 169116642Sjeff ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 170109864Sjeff 171109864Sjeff/* 172111857Sjeff * These determine the interactivity of a process. 173109864Sjeff * 174110645Sjeff * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 175110645Sjeff * before throttling back. 176121868Sjeff * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 177116365Sjeff * INTERACT_MAX: Maximum interactivity value. Smaller is better. 178111857Sjeff * INTERACT_THRESH: Threshhold for placement on the current runq. 179109864Sjeff */ 180121126Sjeff#define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 181121868Sjeff#define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 182116365Sjeff#define SCHED_INTERACT_MAX (100) 183116365Sjeff#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 184121126Sjeff#define SCHED_INTERACT_THRESH (30) 185111857Sjeff 186109864Sjeff/* 187109864Sjeff * These parameters and macros determine the size of the time slice that is 188109864Sjeff * granted to each thread. 189109864Sjeff * 190109864Sjeff * SLICE_MIN: Minimum time slice granted, in units of ticks. 191109864Sjeff * SLICE_MAX: Maximum time slice granted. 192109864Sjeff * SLICE_RANGE: Range of available time slices scaled by hz. 193112966Sjeff * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 194112966Sjeff * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 195121871Sjeff * SLICE_NTHRESH: The nice cutoff point for slice assignment. 
196109864Sjeff */ 197113357Sjeff#define SCHED_SLICE_MIN (slice_min) 198113357Sjeff#define SCHED_SLICE_MAX (slice_max) 199125299Sjeff#define SCHED_SLICE_INTERACTIVE (slice_max) 200121871Sjeff#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 201111857Sjeff#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 202109864Sjeff#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 203112966Sjeff#define SCHED_SLICE_NICE(nice) \ 204121871Sjeff (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 205109864Sjeff 206109864Sjeff/* 207134791Sjulian * This macro determines whether or not the thread belongs on the current or 208109864Sjeff * next run queue. 209109864Sjeff */ 210113357Sjeff#define SCHED_INTERACTIVE(kg) \ 211113357Sjeff (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 212113417Sjeff#define SCHED_CURR(kg, ke) \ 213148856Sdavidxu ((ke->ke_thread->td_flags & TDF_BORROWING) || \ 214148856Sdavidxu (ke->ke_flags & KEF_PREEMPTED) || SCHED_INTERACTIVE(kg)) 215109864Sjeff 216109864Sjeff/* 217109864Sjeff * Cpu percentage computation macros and defines. 218109864Sjeff * 219109864Sjeff * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 220109864Sjeff * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 221109864Sjeff */ 222109864Sjeff 223112971Sjeff#define SCHED_CPU_TIME 10 224109864Sjeff#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 225109864Sjeff 226109864Sjeff/* 227113357Sjeff * kseq - per processor runqs and statistics. 228109864Sjeff */ 229109864Sjeffstruct kseq { 230113357Sjeff struct runq ksq_idle; /* Queue of IDLE threads. */ 231113357Sjeff struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 232113357Sjeff struct runq *ksq_next; /* Next timeshare queue. */ 233113357Sjeff struct runq *ksq_curr; /* Current queue. */ 234121896Sjeff int ksq_load_timeshare; /* Load for timeshare. */ 235113357Sjeff int ksq_load; /* Aggregate load. */ 236121869Sjeff short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 237113357Sjeff short ksq_nicemin; /* Least nice. */ 238110267Sjeff#ifdef SMP 239123433Sjeff int ksq_transferable; 240123433Sjeff LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 241123433Sjeff struct kseq_group *ksq_group; /* Our processor group. */ 242123433Sjeff volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 243125289Sjeff#else 244125289Sjeff int ksq_sysload; /* For loadavg, !ITHD load. */ 245110267Sjeff#endif 246109864Sjeff}; 247109864Sjeff 248123433Sjeff#ifdef SMP 249109864Sjeff/* 250123433Sjeff * kseq groups are groups of processors which can cheaply share threads. When 251123433Sjeff * one processor in the group goes idle it will check the runqs of the other 252123433Sjeff * processors in its group prior to halting and waiting for an interrupt. 253123433Sjeff * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 254123433Sjeff * In a numa environment we'd want an idle bitmap per group and a two tiered 255123433Sjeff * load balancer. 256123433Sjeff */ 257123433Sjeffstruct kseq_group { 258123433Sjeff int ksg_cpus; /* Count of CPUs in this kseq group. */ 259127498Smarcel cpumask_t ksg_cpumask; /* Mask of cpus in this group. */ 260127498Smarcel cpumask_t ksg_idlemask; /* Idle cpus in this group. */ 261127498Smarcel cpumask_t ksg_mask; /* Bit mask for first cpu. */ 262123487Sjeff int ksg_load; /* Total load of this group. */ 263123433Sjeff int ksg_transferable; /* Transferable load of this group. 
*/ 264123433Sjeff LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 265123433Sjeff}; 266123433Sjeff#endif 267123433Sjeff 268123433Sjeff/* 269109864Sjeff * One kse queue per processor. 270109864Sjeff */ 271110028Sjeff#ifdef SMP 272127498Smarcelstatic cpumask_t kseq_idle; 273123487Sjeffstatic int ksg_maxid; 274121790Sjeffstatic struct kseq kseq_cpu[MAXCPU]; 275123433Sjeffstatic struct kseq_group kseq_groups[MAXCPU]; 276129982Sjeffstatic int bal_tick; 277129982Sjeffstatic int gbal_tick; 278139334Sjeffstatic int balance_groups; 279129982Sjeff 280123433Sjeff#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 281123433Sjeff#define KSEQ_CPU(x) (&kseq_cpu[(x)]) 282123487Sjeff#define KSEQ_ID(x) ((x) - kseq_cpu) 283123487Sjeff#define KSEQ_GROUP(x) (&kseq_groups[(x)]) 284123433Sjeff#else /* !SMP */ 285121790Sjeffstatic struct kseq kseq_cpu; 286129982Sjeff 287110028Sjeff#define KSEQ_SELF() (&kseq_cpu) 288110028Sjeff#define KSEQ_CPU(x) (&kseq_cpu) 289110028Sjeff#endif 290109864Sjeff 291146954Sjeffstatic void slot_fill(struct ksegrp *); 292134791Sjulianstatic struct kse *sched_choose(void); /* XXX Should be thread * */ 293146954Sjeffstatic void sched_slice(struct kse *); 294146954Sjeffstatic void sched_priority(struct ksegrp *); 295146954Sjeffstatic void sched_thread_priority(struct thread *, u_char); 296146954Sjeffstatic int sched_interact_score(struct ksegrp *); 297146954Sjeffstatic void sched_interact_update(struct ksegrp *); 298146954Sjeffstatic void sched_interact_fork(struct ksegrp *); 299146954Sjeffstatic void sched_pctcpu_update(struct kse *); 300109864Sjeff 301110267Sjeff/* Operations on per processor queues */ 302146954Sjeffstatic struct kse * kseq_choose(struct kseq *); 303146954Sjeffstatic void kseq_setup(struct kseq *); 304146954Sjeffstatic void kseq_load_add(struct kseq *, struct kse *); 305146954Sjeffstatic void kseq_load_rem(struct kseq *, struct kse *); 306146954Sjeffstatic __inline void kseq_runq_add(struct kseq *, struct kse *, int); 307146954Sjeffstatic __inline void kseq_runq_rem(struct kseq *, struct kse *); 308146954Sjeffstatic void kseq_nice_add(struct kseq *, int); 309146954Sjeffstatic void kseq_nice_rem(struct kseq *, int); 310113660Sjeffvoid kseq_print(int cpu); 311110267Sjeff#ifdef SMP 312146954Sjeffstatic int kseq_transfer(struct kseq *, struct kse *, int); 313146954Sjeffstatic struct kse *runq_steal(struct runq *); 314129982Sjeffstatic void sched_balance(void); 315129982Sjeffstatic void sched_balance_groups(void); 316146954Sjeffstatic void sched_balance_group(struct kseq_group *); 317146954Sjeffstatic void sched_balance_pair(struct kseq *, struct kseq *); 318146954Sjeffstatic void kseq_move(struct kseq *, int); 319146954Sjeffstatic int kseq_idled(struct kseq *); 320146954Sjeffstatic void kseq_notify(struct kse *, int); 321121790Sjeffstatic void kseq_assign(struct kseq *); 322146954Sjeffstatic struct kse *kseq_steal(struct kseq *, int); 323139334Sjeff#define KSE_CAN_MIGRATE(ke) \ 324135076Sscottl ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 325121790Sjeff#endif 326110028Sjeff 327113357Sjeffvoid 328113660Sjeffkseq_print(int cpu) 329110267Sjeff{ 330113660Sjeff struct kseq *kseq; 331113357Sjeff int i; 332112994Sjeff 333113660Sjeff kseq = KSEQ_CPU(cpu); 334112994Sjeff 335113357Sjeff printf("kseq:\n"); 336113357Sjeff printf("\tload: %d\n", kseq->ksq_load); 337122744Sjeff printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 338121896Sjeff#ifdef SMP 339123433Sjeff printf("\tload transferable: %d\n", kseq->ksq_transferable); 
340121896Sjeff#endif 341113357Sjeff printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 342113357Sjeff printf("\tnice counts:\n"); 343121869Sjeff for (i = 0; i < SCHED_PRI_NRESV; i++) 344113357Sjeff if (kseq->ksq_nice[i]) 345113357Sjeff printf("\t\t%d = %d\n", 346113357Sjeff i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 347113357Sjeff} 348112994Sjeff 349122744Sjeffstatic __inline void 350139334Sjeffkseq_runq_add(struct kseq *kseq, struct kse *ke, int flags) 351122744Sjeff{ 352122744Sjeff#ifdef SMP 353139334Sjeff if (KSE_CAN_MIGRATE(ke)) { 354123433Sjeff kseq->ksq_transferable++; 355123433Sjeff kseq->ksq_group->ksg_transferable++; 356133427Sjeff ke->ke_flags |= KEF_XFERABLE; 357123433Sjeff } 358122744Sjeff#endif 359148856Sdavidxu if (ke->ke_flags & KEF_PREEMPTED) 360148856Sdavidxu flags |= SRQ_PREEMPTED; 361139334Sjeff runq_add(ke->ke_runq, ke, flags); 362122744Sjeff} 363122744Sjeff 364122744Sjeffstatic __inline void 365122744Sjeffkseq_runq_rem(struct kseq *kseq, struct kse *ke) 366122744Sjeff{ 367122744Sjeff#ifdef SMP 368133427Sjeff if (ke->ke_flags & KEF_XFERABLE) { 369123433Sjeff kseq->ksq_transferable--; 370123433Sjeff kseq->ksq_group->ksg_transferable--; 371133427Sjeff ke->ke_flags &= ~KEF_XFERABLE; 372123433Sjeff } 373122744Sjeff#endif 374122744Sjeff runq_remove(ke->ke_runq, ke); 375122744Sjeff} 376122744Sjeff 377113357Sjeffstatic void 378122744Sjeffkseq_load_add(struct kseq *kseq, struct kse *ke) 379113357Sjeff{ 380121896Sjeff int class; 381115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 382121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 383121896Sjeff if (class == PRI_TIMESHARE) 384121896Sjeff kseq->ksq_load_timeshare++; 385113357Sjeff kseq->ksq_load++; 386139316Sjeff CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 387128563Sobrien if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 388123487Sjeff#ifdef SMP 389123487Sjeff kseq->ksq_group->ksg_load++; 390125289Sjeff#else 391125289Sjeff kseq->ksq_sysload++; 392123487Sjeff#endif 393113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 394130551Sjulian kseq_nice_add(kseq, ke->ke_proc->p_nice); 395110267Sjeff} 396113357Sjeff 397112994Sjeffstatic void 398122744Sjeffkseq_load_rem(struct kseq *kseq, struct kse *ke) 399110267Sjeff{ 400121896Sjeff int class; 401115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 402121896Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 403121896Sjeff if (class == PRI_TIMESHARE) 404121896Sjeff kseq->ksq_load_timeshare--; 405128563Sobrien if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 406123487Sjeff#ifdef SMP 407123487Sjeff kseq->ksq_group->ksg_load--; 408125289Sjeff#else 409125289Sjeff kseq->ksq_sysload--; 410123487Sjeff#endif 411113357Sjeff kseq->ksq_load--; 412139316Sjeff CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 413113357Sjeff ke->ke_runq = NULL; 414113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 415130551Sjulian kseq_nice_rem(kseq, ke->ke_proc->p_nice); 416110267Sjeff} 417110267Sjeff 418113357Sjeffstatic void 419113357Sjeffkseq_nice_add(struct kseq *kseq, int nice) 420110267Sjeff{ 421115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 422113357Sjeff /* Normalize to zero. 
*/ 423113357Sjeff kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 424121896Sjeff if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 425113357Sjeff kseq->ksq_nicemin = nice; 426110267Sjeff} 427110267Sjeff 428113357Sjeffstatic void 429113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice) 430110267Sjeff{ 431113357Sjeff int n; 432113357Sjeff 433115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 434113357Sjeff /* Normalize to zero. */ 435113357Sjeff n = nice + SCHED_PRI_NHALF; 436113357Sjeff kseq->ksq_nice[n]--; 437113357Sjeff KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 438113357Sjeff 439113357Sjeff /* 440113357Sjeff * If this wasn't the smallest nice value or there are more in 441113357Sjeff * this bucket we can just return. Otherwise we have to recalculate 442113357Sjeff * the smallest nice. 443113357Sjeff */ 444113357Sjeff if (nice != kseq->ksq_nicemin || 445113357Sjeff kseq->ksq_nice[n] != 0 || 446121896Sjeff kseq->ksq_load_timeshare == 0) 447113357Sjeff return; 448113357Sjeff 449121869Sjeff for (; n < SCHED_PRI_NRESV; n++) 450113357Sjeff if (kseq->ksq_nice[n]) { 451113357Sjeff kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 452113357Sjeff return; 453113357Sjeff } 454110267Sjeff} 455110267Sjeff 456113357Sjeff#ifdef SMP 457116069Sjeff/* 458122744Sjeff * sched_balance is a simple CPU load balancing algorithm. It operates by 459116069Sjeff * finding the least loaded and most loaded cpu and equalizing their load 460116069Sjeff * by migrating some processes. 461116069Sjeff * 462116069Sjeff * Dealing only with two CPUs at a time has two advantages. Firstly, most 463116069Sjeff * installations will only have 2 cpus. Secondly, load balancing too much at 464116069Sjeff * once can have an unpleasant effect on the system. The scheduler rarely has 465116069Sjeff * enough information to make perfect decisions. So this algorithm chooses 466116069Sjeff * algorithm simplicity and more gradual effects on load in larger systems. 467116069Sjeff * 468116069Sjeff * It could be improved by considering the priorities and slices assigned to 469116069Sjeff * each task prior to balancing them. There are many pathological cases with 470116069Sjeff * any approach and so the semi random algorithm below may work as well as any. 471116069Sjeff * 472116069Sjeff */ 473121790Sjeffstatic void 474129982Sjeffsched_balance(void) 475116069Sjeff{ 476123487Sjeff struct kseq_group *high; 477123487Sjeff struct kseq_group *low; 478123487Sjeff struct kseq_group *ksg; 479123487Sjeff int cnt; 480123487Sjeff int i; 481123487Sjeff 482139334Sjeff bal_tick = ticks + (random() % (hz * 2)); 483123487Sjeff if (smp_started == 0) 484139334Sjeff return; 485123487Sjeff low = high = NULL; 486123487Sjeff i = random() % (ksg_maxid + 1); 487123487Sjeff for (cnt = 0; cnt <= ksg_maxid; cnt++) { 488123487Sjeff ksg = KSEQ_GROUP(i); 489123487Sjeff /* 490123487Sjeff * Find the CPU with the highest load that has some 491123487Sjeff * threads to transfer. 
492123487Sjeff */ 493123487Sjeff if ((high == NULL || ksg->ksg_load > high->ksg_load) 494123487Sjeff && ksg->ksg_transferable) 495123487Sjeff high = ksg; 496123487Sjeff if (low == NULL || ksg->ksg_load < low->ksg_load) 497123487Sjeff low = ksg; 498123487Sjeff if (++i > ksg_maxid) 499123487Sjeff i = 0; 500123487Sjeff } 501123487Sjeff if (low != NULL && high != NULL && high != low) 502123487Sjeff sched_balance_pair(LIST_FIRST(&high->ksg_members), 503123487Sjeff LIST_FIRST(&low->ksg_members)); 504123487Sjeff} 505123487Sjeff 506123487Sjeffstatic void 507129982Sjeffsched_balance_groups(void) 508123487Sjeff{ 509123487Sjeff int i; 510123487Sjeff 511139334Sjeff gbal_tick = ticks + (random() % (hz * 2)); 512129982Sjeff mtx_assert(&sched_lock, MA_OWNED); 513123487Sjeff if (smp_started) 514123487Sjeff for (i = 0; i <= ksg_maxid; i++) 515123487Sjeff sched_balance_group(KSEQ_GROUP(i)); 516123487Sjeff} 517123487Sjeff 518123487Sjeffstatic void 519123487Sjeffsched_balance_group(struct kseq_group *ksg) 520123487Sjeff{ 521116069Sjeff struct kseq *kseq; 522123487Sjeff struct kseq *high; 523123487Sjeff struct kseq *low; 524123487Sjeff int load; 525123487Sjeff 526123487Sjeff if (ksg->ksg_transferable == 0) 527123487Sjeff return; 528123487Sjeff low = NULL; 529123487Sjeff high = NULL; 530123487Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 531123487Sjeff load = kseq->ksq_load; 532123487Sjeff if (high == NULL || load > high->ksq_load) 533123487Sjeff high = kseq; 534123487Sjeff if (low == NULL || load < low->ksq_load) 535123487Sjeff low = kseq; 536123487Sjeff } 537123487Sjeff if (high != NULL && low != NULL && high != low) 538123487Sjeff sched_balance_pair(high, low); 539123487Sjeff} 540123487Sjeff 541123487Sjeffstatic void 542123487Sjeffsched_balance_pair(struct kseq *high, struct kseq *low) 543123487Sjeff{ 544123433Sjeff int transferable; 545116069Sjeff int high_load; 546116069Sjeff int low_load; 547116069Sjeff int move; 548116069Sjeff int diff; 549116069Sjeff int i; 550116069Sjeff 551116069Sjeff /* 552123433Sjeff * If we're transfering within a group we have to use this specific 553123433Sjeff * kseq's transferable count, otherwise we can steal from other members 554123433Sjeff * of the group. 555123433Sjeff */ 556123487Sjeff if (high->ksq_group == low->ksq_group) { 557123487Sjeff transferable = high->ksq_transferable; 558123487Sjeff high_load = high->ksq_load; 559123487Sjeff low_load = low->ksq_load; 560123487Sjeff } else { 561123487Sjeff transferable = high->ksq_group->ksg_transferable; 562123487Sjeff high_load = high->ksq_group->ksg_load; 563123487Sjeff low_load = low->ksq_group->ksg_load; 564123487Sjeff } 565123433Sjeff if (transferable == 0) 566123487Sjeff return; 567123433Sjeff /* 568122744Sjeff * Determine what the imbalance is and then adjust that to how many 569123433Sjeff * kses we actually have to give up (transferable). 
570122744Sjeff */ 571123487Sjeff diff = high_load - low_load; 572116069Sjeff move = diff / 2; 573116069Sjeff if (diff & 0x1) 574116069Sjeff move++; 575123433Sjeff move = min(move, transferable); 576116069Sjeff for (i = 0; i < move; i++) 577123487Sjeff kseq_move(high, KSEQ_ID(low)); 578116069Sjeff return; 579116069Sjeff} 580116069Sjeff 581121790Sjeffstatic void 582116069Sjeffkseq_move(struct kseq *from, int cpu) 583116069Sjeff{ 584123433Sjeff struct kseq *kseq; 585123433Sjeff struct kseq *to; 586116069Sjeff struct kse *ke; 587116069Sjeff 588123433Sjeff kseq = from; 589123433Sjeff to = KSEQ_CPU(cpu); 590123433Sjeff ke = kseq_steal(kseq, 1); 591123433Sjeff if (ke == NULL) { 592123433Sjeff struct kseq_group *ksg; 593123433Sjeff 594123433Sjeff ksg = kseq->ksq_group; 595123433Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 596123433Sjeff if (kseq == from || kseq->ksq_transferable == 0) 597123433Sjeff continue; 598123433Sjeff ke = kseq_steal(kseq, 1); 599123433Sjeff break; 600123433Sjeff } 601123433Sjeff if (ke == NULL) 602123433Sjeff panic("kseq_move: No KSEs available with a " 603123433Sjeff "transferable count of %d\n", 604123433Sjeff ksg->ksg_transferable); 605123433Sjeff } 606123433Sjeff if (kseq == to) 607123433Sjeff return; 608116069Sjeff ke->ke_state = KES_THREAD; 609123433Sjeff kseq_runq_rem(kseq, ke); 610123433Sjeff kseq_load_rem(kseq, ke); 611121923Sjeff kseq_notify(ke, cpu); 612116069Sjeff} 613110267Sjeff 614123433Sjeffstatic int 615123433Sjeffkseq_idled(struct kseq *kseq) 616121790Sjeff{ 617123433Sjeff struct kseq_group *ksg; 618123433Sjeff struct kseq *steal; 619123433Sjeff struct kse *ke; 620123433Sjeff 621123433Sjeff ksg = kseq->ksq_group; 622123433Sjeff /* 623123433Sjeff * If we're in a cpu group, try and steal kses from another cpu in 624123433Sjeff * the group before idling. 625123433Sjeff */ 626123433Sjeff if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 627123433Sjeff LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 628123433Sjeff if (steal == kseq || steal->ksq_transferable == 0) 629123433Sjeff continue; 630123433Sjeff ke = kseq_steal(steal, 0); 631123433Sjeff if (ke == NULL) 632123433Sjeff continue; 633123433Sjeff ke->ke_state = KES_THREAD; 634123433Sjeff kseq_runq_rem(steal, ke); 635123433Sjeff kseq_load_rem(steal, ke); 636123433Sjeff ke->ke_cpu = PCPU_GET(cpuid); 637139334Sjeff ke->ke_flags |= KEF_INTERNAL | KEF_HOLD; 638139334Sjeff sched_add(ke->ke_thread, SRQ_YIELDING); 639123433Sjeff return (0); 640123433Sjeff } 641123433Sjeff } 642123433Sjeff /* 643123433Sjeff * We only set the idled bit when all of the cpus in the group are 644123433Sjeff * idle. Otherwise we could get into a situation where a KSE bounces 645123433Sjeff * back and forth between two idle cores on seperate physical CPUs. 
646123433Sjeff */ 647123433Sjeff ksg->ksg_idlemask |= PCPU_GET(cpumask); 648123433Sjeff if (ksg->ksg_idlemask != ksg->ksg_cpumask) 649123433Sjeff return (1); 650123433Sjeff atomic_set_int(&kseq_idle, ksg->ksg_mask); 651123433Sjeff return (1); 652121790Sjeff} 653121790Sjeff 654121790Sjeffstatic void 655121790Sjeffkseq_assign(struct kseq *kseq) 656121790Sjeff{ 657121790Sjeff struct kse *nke; 658121790Sjeff struct kse *ke; 659121790Sjeff 660121790Sjeff do { 661132776Skan *(volatile struct kse **)&ke = kseq->ksq_assigned; 662148383Sdelphij } while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned, 663148383Sdelphij (uintptr_t)ke, (uintptr_t)NULL)); 664121790Sjeff for (; ke != NULL; ke = nke) { 665121790Sjeff nke = ke->ke_assign; 666139334Sjeff kseq->ksq_group->ksg_load--; 667139334Sjeff kseq->ksq_load--; 668121790Sjeff ke->ke_flags &= ~KEF_ASSIGNED; 669148603Sdavidxu if (ke->ke_flags & KEF_REMOVED) { 670148603Sdavidxu ke->ke_flags &= ~KEF_REMOVED; 671148603Sdavidxu continue; 672148603Sdavidxu } 673139334Sjeff ke->ke_flags |= KEF_INTERNAL | KEF_HOLD; 674139334Sjeff sched_add(ke->ke_thread, SRQ_YIELDING); 675121790Sjeff } 676121790Sjeff} 677121790Sjeff 678121790Sjeffstatic void 679121790Sjeffkseq_notify(struct kse *ke, int cpu) 680121790Sjeff{ 681121790Sjeff struct kseq *kseq; 682121790Sjeff struct thread *td; 683121790Sjeff struct pcpu *pcpu; 684139334Sjeff int class; 685133427Sjeff int prio; 686121790Sjeff 687139334Sjeff kseq = KSEQ_CPU(cpu); 688139334Sjeff /* XXX */ 689139334Sjeff class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 690139334Sjeff if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 691139334Sjeff (kseq_idle & kseq->ksq_group->ksg_mask)) 692139334Sjeff atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 693139334Sjeff kseq->ksq_group->ksg_load++; 694139334Sjeff kseq->ksq_load++; 695123529Sjeff ke->ke_cpu = cpu; 696121790Sjeff ke->ke_flags |= KEF_ASSIGNED; 697133427Sjeff prio = ke->ke_thread->td_priority; 698121790Sjeff 699121790Sjeff /* 700121790Sjeff * Place a KSE on another cpu's queue and force a resched. 701121790Sjeff */ 702121790Sjeff do { 703132776Skan *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned; 704148383Sdelphij } while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned, 705148383Sdelphij (uintptr_t)ke->ke_assign, (uintptr_t)ke)); 706133427Sjeff /* 707133427Sjeff * Without sched_lock we could lose a race where we set NEEDRESCHED 708133427Sjeff * on a thread that is switched out before the IPI is delivered. This 709133427Sjeff * would lead us to miss the resched. This will be a problem once 710133427Sjeff * sched_lock is pushed down. 
711133427Sjeff */ 712121790Sjeff pcpu = pcpu_find(cpu); 713121790Sjeff td = pcpu->pc_curthread; 714121790Sjeff if (ke->ke_thread->td_priority < td->td_priority || 715121790Sjeff td == pcpu->pc_idlethread) { 716121790Sjeff td->td_flags |= TDF_NEEDRESCHED; 717121790Sjeff ipi_selected(1 << cpu, IPI_AST); 718121790Sjeff } 719121790Sjeff} 720121790Sjeff 721121790Sjeffstatic struct kse * 722121790Sjeffrunq_steal(struct runq *rq) 723121790Sjeff{ 724121790Sjeff struct rqhead *rqh; 725121790Sjeff struct rqbits *rqb; 726121790Sjeff struct kse *ke; 727121790Sjeff int word; 728121790Sjeff int bit; 729121790Sjeff 730121790Sjeff mtx_assert(&sched_lock, MA_OWNED); 731121790Sjeff rqb = &rq->rq_status; 732121790Sjeff for (word = 0; word < RQB_LEN; word++) { 733121790Sjeff if (rqb->rqb_bits[word] == 0) 734121790Sjeff continue; 735121790Sjeff for (bit = 0; bit < RQB_BPW; bit++) { 736123231Speter if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 737121790Sjeff continue; 738121790Sjeff rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 739121790Sjeff TAILQ_FOREACH(ke, rqh, ke_procq) { 740139334Sjeff if (KSE_CAN_MIGRATE(ke)) 741121790Sjeff return (ke); 742121790Sjeff } 743121790Sjeff } 744121790Sjeff } 745121790Sjeff return (NULL); 746121790Sjeff} 747121790Sjeff 748121790Sjeffstatic struct kse * 749123433Sjeffkseq_steal(struct kseq *kseq, int stealidle) 750121790Sjeff{ 751121790Sjeff struct kse *ke; 752121790Sjeff 753123433Sjeff /* 754123433Sjeff * Steal from next first to try to get a non-interactive task that 755123433Sjeff * may not have run for a while. 756123433Sjeff */ 757123433Sjeff if ((ke = runq_steal(kseq->ksq_next)) != NULL) 758123433Sjeff return (ke); 759121790Sjeff if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 760121790Sjeff return (ke); 761123433Sjeff if (stealidle) 762123433Sjeff return (runq_steal(&kseq->ksq_idle)); 763123433Sjeff return (NULL); 764121790Sjeff} 765123433Sjeff 766123433Sjeffint 767123433Sjeffkseq_transfer(struct kseq *kseq, struct kse *ke, int class) 768123433Sjeff{ 769139334Sjeff struct kseq_group *nksg; 770123433Sjeff struct kseq_group *ksg; 771139334Sjeff struct kseq *old; 772123433Sjeff int cpu; 773139334Sjeff int idx; 774123433Sjeff 775123685Sjeff if (smp_started == 0) 776123685Sjeff return (0); 777123433Sjeff cpu = 0; 778123433Sjeff /* 779133427Sjeff * If our load exceeds a certain threshold we should attempt to 780133427Sjeff * reassign this thread. The first candidate is the cpu that 781133427Sjeff * originally ran the thread. If it is idle, assign it there, 782133427Sjeff * otherwise, pick an idle cpu. 783133427Sjeff * 784133427Sjeff * The threshold at which we start to reassign kses has a large impact 785123685Sjeff * on the overall performance of the system. Tuned too high and 786123685Sjeff * some CPUs may idle. Too low and there will be excess migration 787128055Scognet * and context switches. 788123685Sjeff */ 789139334Sjeff old = KSEQ_CPU(ke->ke_cpu); 790139334Sjeff nksg = old->ksq_group; 791133427Sjeff ksg = kseq->ksq_group; 792139334Sjeff if (kseq_idle) { 793139334Sjeff if (kseq_idle & nksg->ksg_mask) { 794139334Sjeff cpu = ffs(nksg->ksg_idlemask); 795139334Sjeff if (cpu) { 796139334Sjeff CTR2(KTR_SCHED, 797139334Sjeff "kseq_transfer: %p found old cpu %X " 798139334Sjeff "in idlemask.", ke, cpu); 799133427Sjeff goto migrate; 800139334Sjeff } 801133427Sjeff } 802123433Sjeff /* 803123433Sjeff * Multiple cpus could find this bit simultaneously 804123433Sjeff * but the race shouldn't be terrible. 
805123433Sjeff */ 806123433Sjeff cpu = ffs(kseq_idle); 807139334Sjeff if (cpu) { 808139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p found %X " 809139334Sjeff "in idlemask.", ke, cpu); 810133427Sjeff goto migrate; 811139334Sjeff } 812123433Sjeff } 813139334Sjeff idx = 0; 814139334Sjeff#if 0 815139334Sjeff if (old->ksq_load < kseq->ksq_load) { 816139334Sjeff cpu = ke->ke_cpu + 1; 817139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X " 818139334Sjeff "load less than ours.", ke, cpu); 819139334Sjeff goto migrate; 820139334Sjeff } 821123433Sjeff /* 822139334Sjeff * No new CPU was found, look for one with less load. 823139334Sjeff */ 824139334Sjeff for (idx = 0; idx <= ksg_maxid; idx++) { 825139334Sjeff nksg = KSEQ_GROUP(idx); 826139334Sjeff if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) { 827139334Sjeff cpu = ffs(nksg->ksg_cpumask); 828139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less " 829139334Sjeff "than ours.", ke, cpu); 830139334Sjeff goto migrate; 831139334Sjeff } 832139334Sjeff } 833139334Sjeff#endif 834139334Sjeff /* 835123433Sjeff * If another cpu in this group has idled, assign a thread over 836123433Sjeff * to them after checking to see if there are idled groups. 837123433Sjeff */ 838133427Sjeff if (ksg->ksg_idlemask) { 839123433Sjeff cpu = ffs(ksg->ksg_idlemask); 840139334Sjeff if (cpu) { 841139334Sjeff CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in " 842139334Sjeff "group.", ke, cpu); 843133427Sjeff goto migrate; 844139334Sjeff } 845123433Sjeff } 846133427Sjeff return (0); 847133427Sjeffmigrate: 848133427Sjeff /* 849123433Sjeff * Now that we've found an idle CPU, migrate the thread. 850123433Sjeff */ 851133427Sjeff cpu--; 852133427Sjeff ke->ke_runq = NULL; 853133427Sjeff kseq_notify(ke, cpu); 854133427Sjeff 855133427Sjeff return (1); 856123433Sjeff} 857123433Sjeff 858121790Sjeff#endif /* SMP */ 859121790Sjeff 860117326Sjeff/* 861121790Sjeff * Pick the highest priority task we have and return it. 862117326Sjeff */ 863117326Sjeff 864121790Sjeffstatic struct kse * 865121790Sjeffkseq_choose(struct kseq *kseq) 866110267Sjeff{ 867137067Sjeff struct runq *swap; 868110267Sjeff struct kse *ke; 869137067Sjeff int nice; 870110267Sjeff 871115998Sjeff mtx_assert(&sched_lock, MA_OWNED); 872113357Sjeff swap = NULL; 873112994Sjeff 874113357Sjeff for (;;) { 875113357Sjeff ke = runq_choose(kseq->ksq_curr); 876113357Sjeff if (ke == NULL) { 877113357Sjeff /* 878131473Sjhb * We already swapped once and didn't get anywhere. 879113357Sjeff */ 880113357Sjeff if (swap) 881113357Sjeff break; 882113357Sjeff swap = kseq->ksq_curr; 883113357Sjeff kseq->ksq_curr = kseq->ksq_next; 884113357Sjeff kseq->ksq_next = swap; 885113357Sjeff continue; 886113357Sjeff } 887113357Sjeff /* 888113357Sjeff * If we encounter a slice of 0 the kse is in a 889113357Sjeff * TIMESHARE kse group and its nice was too far out 890113357Sjeff * of the range that receives slices. 
891113357Sjeff */ 892137067Sjeff nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin); 893150442Sdavidxu#if 0 894138842Sjeff if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH && 895138842Sjeff ke->ke_proc->p_nice != 0)) { 896113357Sjeff runq_remove(ke->ke_runq, ke); 897113357Sjeff sched_slice(ke); 898113357Sjeff ke->ke_runq = kseq->ksq_next; 899136170Sjulian runq_add(ke->ke_runq, ke, 0); 900113357Sjeff continue; 901113357Sjeff } 902150442Sdavidxu#endif 903113357Sjeff return (ke); 904110267Sjeff } 905110267Sjeff 906113357Sjeff return (runq_choose(&kseq->ksq_idle)); 907110267Sjeff} 908110267Sjeff 909109864Sjeffstatic void 910110028Sjeffkseq_setup(struct kseq *kseq) 911110028Sjeff{ 912113357Sjeff runq_init(&kseq->ksq_timeshare[0]); 913113357Sjeff runq_init(&kseq->ksq_timeshare[1]); 914112994Sjeff runq_init(&kseq->ksq_idle); 915113357Sjeff kseq->ksq_curr = &kseq->ksq_timeshare[0]; 916113357Sjeff kseq->ksq_next = &kseq->ksq_timeshare[1]; 917113660Sjeff kseq->ksq_load = 0; 918121896Sjeff kseq->ksq_load_timeshare = 0; 919110028Sjeff} 920110028Sjeff 921110028Sjeffstatic void 922109864Sjeffsched_setup(void *dummy) 923109864Sjeff{ 924117313Sjeff#ifdef SMP 925109864Sjeff int i; 926117313Sjeff#endif 927109864Sjeff 928153533Sdavidxu /* 929153533Sdavidxu * To avoid divide-by-zero, we set realstathz a dummy value 930153533Sdavidxu * in case which sched_clock() called before sched_initticks(). 931153533Sdavidxu */ 932153533Sdavidxu realstathz = hz; 933116946Sjeff slice_min = (hz/100); /* 10ms */ 934116946Sjeff slice_max = (hz/7); /* ~140ms */ 935111857Sjeff 936117237Sjeff#ifdef SMP 937123487Sjeff balance_groups = 0; 938123433Sjeff /* 939123433Sjeff * Initialize the kseqs. 940123433Sjeff */ 941123433Sjeff for (i = 0; i < MAXCPU; i++) { 942123433Sjeff struct kseq *ksq; 943123433Sjeff 944123433Sjeff ksq = &kseq_cpu[i]; 945123433Sjeff ksq->ksq_assigned = NULL; 946123433Sjeff kseq_setup(&kseq_cpu[i]); 947123433Sjeff } 948117237Sjeff if (smp_topology == NULL) { 949123433Sjeff struct kseq_group *ksg; 950123433Sjeff struct kseq *ksq; 951139334Sjeff int cpus; 952123433Sjeff 953139334Sjeff for (cpus = 0, i = 0; i < MAXCPU; i++) { 954139334Sjeff if (CPU_ABSENT(i)) 955139334Sjeff continue; 956153749Sdavidxu ksq = &kseq_cpu[i]; 957139334Sjeff ksg = &kseq_groups[cpus]; 958123433Sjeff /* 959129982Sjeff * Setup a kseq group with one member. 960123433Sjeff */ 961123433Sjeff ksq->ksq_transferable = 0; 962123433Sjeff ksq->ksq_group = ksg; 963123433Sjeff ksg->ksg_cpus = 1; 964123433Sjeff ksg->ksg_idlemask = 0; 965123433Sjeff ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 966123487Sjeff ksg->ksg_load = 0; 967123433Sjeff ksg->ksg_transferable = 0; 968123433Sjeff LIST_INIT(&ksg->ksg_members); 969123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 970139334Sjeff cpus++; 971117237Sjeff } 972139334Sjeff ksg_maxid = cpus - 1; 973117237Sjeff } else { 974123433Sjeff struct kseq_group *ksg; 975123433Sjeff struct cpu_group *cg; 976117237Sjeff int j; 977113357Sjeff 978117237Sjeff for (i = 0; i < smp_topology->ct_count; i++) { 979117237Sjeff cg = &smp_topology->ct_group[i]; 980123433Sjeff ksg = &kseq_groups[i]; 981123433Sjeff /* 982123433Sjeff * Initialize the group. 983123433Sjeff */ 984123433Sjeff ksg->ksg_idlemask = 0; 985123487Sjeff ksg->ksg_load = 0; 986123433Sjeff ksg->ksg_transferable = 0; 987123433Sjeff ksg->ksg_cpus = cg->cg_count; 988123433Sjeff ksg->ksg_cpumask = cg->cg_mask; 989123433Sjeff LIST_INIT(&ksg->ksg_members); 990123433Sjeff /* 991123433Sjeff * Find all of the group members and add them. 
992123433Sjeff */ 993123433Sjeff for (j = 0; j < MAXCPU; j++) { 994123433Sjeff if ((cg->cg_mask & (1 << j)) != 0) { 995123433Sjeff if (ksg->ksg_mask == 0) 996123433Sjeff ksg->ksg_mask = 1 << j; 997123433Sjeff kseq_cpu[j].ksq_transferable = 0; 998123433Sjeff kseq_cpu[j].ksq_group = ksg; 999123433Sjeff LIST_INSERT_HEAD(&ksg->ksg_members, 1000123433Sjeff &kseq_cpu[j], ksq_siblings); 1001123433Sjeff } 1002123433Sjeff } 1003123487Sjeff if (ksg->ksg_cpus > 1) 1004123487Sjeff balance_groups = 1; 1005117237Sjeff } 1006123487Sjeff ksg_maxid = smp_topology->ct_count - 1; 1007117237Sjeff } 1008123487Sjeff /* 1009123487Sjeff * Stagger the group and global load balancer so they do not 1010123487Sjeff * interfere with each other. 1011123487Sjeff */ 1012129982Sjeff bal_tick = ticks + hz; 1013123487Sjeff if (balance_groups) 1014129982Sjeff gbal_tick = ticks + (hz / 2); 1015117237Sjeff#else 1016117237Sjeff kseq_setup(KSEQ_SELF()); 1017116069Sjeff#endif 1018117237Sjeff mtx_lock_spin(&sched_lock); 1019122744Sjeff kseq_load_add(KSEQ_SELF(), &kse0); 1020117237Sjeff mtx_unlock_spin(&sched_lock); 1021109864Sjeff} 1022109864Sjeff 1023153533Sdavidxu/* ARGSUSED */ 1024153533Sdavidxustatic void 1025153533Sdavidxusched_initticks(void *dummy) 1026153533Sdavidxu{ 1027153533Sdavidxu mtx_lock_spin(&sched_lock); 1028153533Sdavidxu realstathz = stathz ? stathz : hz; 1029153533Sdavidxu slice_min = (realstathz/100); /* 10ms */ 1030153533Sdavidxu slice_max = (realstathz/7); /* ~140ms */ 1031153533Sdavidxu 1032153533Sdavidxu tickincr = (hz << 10) / realstathz; 1033153533Sdavidxu /* 1034153533Sdavidxu * XXX This does not work for values of stathz that are much 1035153533Sdavidxu * larger than hz. 1036153533Sdavidxu */ 1037153533Sdavidxu if (tickincr == 0) 1038153533Sdavidxu tickincr = 1; 1039153533Sdavidxu mtx_unlock_spin(&sched_lock); 1040153533Sdavidxu} 1041153533Sdavidxu 1042153533Sdavidxu 1043109864Sjeff/* 1044109864Sjeff * Scale the scheduling priority according to the "interactivity" of this 1045109864Sjeff * process. 1046109864Sjeff */ 1047113357Sjeffstatic void 1048109864Sjeffsched_priority(struct ksegrp *kg) 1049109864Sjeff{ 1050109864Sjeff int pri; 1051109864Sjeff 1052109864Sjeff if (kg->kg_pri_class != PRI_TIMESHARE) 1053113357Sjeff return; 1054109864Sjeff 1055113357Sjeff pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 1056111857Sjeff pri += SCHED_PRI_BASE; 1057130551Sjulian pri += kg->kg_proc->p_nice; 1058109864Sjeff 1059109864Sjeff if (pri > PRI_MAX_TIMESHARE) 1060109864Sjeff pri = PRI_MAX_TIMESHARE; 1061109864Sjeff else if (pri < PRI_MIN_TIMESHARE) 1062109864Sjeff pri = PRI_MIN_TIMESHARE; 1063109864Sjeff 1064161599Sdavidxu sched_user_prio(kg, pri); 1065109864Sjeff 1066113357Sjeff return; 1067109864Sjeff} 1068109864Sjeff 1069109864Sjeff/* 1070112966Sjeff * Calculate a time slice based on the properties of the kseg and the runq 1071112994Sjeff * that we're on. This is only for PRI_TIMESHARE ksegrps. 
1072109864Sjeff */ 1073112966Sjeffstatic void 1074112966Sjeffsched_slice(struct kse *ke) 1075109864Sjeff{ 1076113357Sjeff struct kseq *kseq; 1077112966Sjeff struct ksegrp *kg; 1078109864Sjeff 1079112966Sjeff kg = ke->ke_ksegrp; 1080113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu); 1081109864Sjeff 1082139453Sjhb if (ke->ke_thread->td_flags & TDF_BORROWING) { 1083138842Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1084138842Sjeff return; 1085138842Sjeff } 1086138842Sjeff 1087112966Sjeff /* 1088112966Sjeff * Rationale: 1089133427Sjeff * KSEs in interactive ksegs get a minimal slice so that we 1090112966Sjeff * quickly notice if it abuses its advantage. 1091112966Sjeff * 1092112966Sjeff * KSEs in non-interactive ksegs are assigned a slice that is 1093112966Sjeff * based on the ksegs nice value relative to the least nice kseg 1094112966Sjeff * on the run queue for this cpu. 1095112966Sjeff * 1096112966Sjeff * If the KSE is less nice than all others it gets the maximum 1097112966Sjeff * slice and other KSEs will adjust their slice relative to 1098112966Sjeff * this when they first expire. 1099112966Sjeff * 1100112966Sjeff * There is 20 point window that starts relative to the least 1101112966Sjeff * nice kse on the run queue. Slice size is determined by 1102112966Sjeff * the kse distance from the last nice ksegrp. 1103112966Sjeff * 1104121871Sjeff * If the kse is outside of the window it will get no slice 1105121871Sjeff * and will be reevaluated each time it is selected on the 1106121871Sjeff * run queue. The exception to this is nice 0 ksegs when 1107121871Sjeff * a nice -20 is running. They are always granted a minimum 1108121871Sjeff * slice. 1109112966Sjeff */ 1110113357Sjeff if (!SCHED_INTERACTIVE(kg)) { 1111112966Sjeff int nice; 1112112966Sjeff 1113130551Sjulian nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin); 1114121896Sjeff if (kseq->ksq_load_timeshare == 0 || 1115130551Sjulian kg->kg_proc->p_nice < kseq->ksq_nicemin) 1116112966Sjeff ke->ke_slice = SCHED_SLICE_MAX; 1117121871Sjeff else if (nice <= SCHED_SLICE_NTHRESH) 1118112966Sjeff ke->ke_slice = SCHED_SLICE_NICE(nice); 1119130551Sjulian else if (kg->kg_proc->p_nice == 0) 1120121871Sjeff ke->ke_slice = SCHED_SLICE_MIN; 1121112966Sjeff else 1122150442Sdavidxu ke->ke_slice = SCHED_SLICE_MIN; /* 0 */ 1123112966Sjeff } else 1124123684Sjeff ke->ke_slice = SCHED_SLICE_INTERACTIVE; 1125112966Sjeff 1126112966Sjeff return; 1127109864Sjeff} 1128109864Sjeff 1129121868Sjeff/* 1130121868Sjeff * This routine enforces a maximum limit on the amount of scheduling history 1131121868Sjeff * kept. It is called after either the slptime or runtime is adjusted. 1132121868Sjeff * This routine will not operate correctly when slp or run times have been 1133121868Sjeff * adjusted to more than double their maximum. 1134121868Sjeff */ 1135116463Sjeffstatic void 1136116463Sjeffsched_interact_update(struct ksegrp *kg) 1137116463Sjeff{ 1138121868Sjeff int sum; 1139121605Sjeff 1140121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1141121868Sjeff if (sum < SCHED_SLP_RUN_MAX) 1142121868Sjeff return; 1143121868Sjeff /* 1144121868Sjeff * If we have exceeded by more than 1/5th then the algorithm below 1145121868Sjeff * will not bring us back into range. 
Dividing by two here forces 1146133427Sjeff * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1147121868Sjeff */ 1148127850Sjeff if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1149121868Sjeff kg->kg_runtime /= 2; 1150121868Sjeff kg->kg_slptime /= 2; 1151121868Sjeff return; 1152116463Sjeff } 1153121868Sjeff kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1154121868Sjeff kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1155116463Sjeff} 1156116463Sjeff 1157121868Sjeffstatic void 1158121868Sjeffsched_interact_fork(struct ksegrp *kg) 1159121868Sjeff{ 1160121868Sjeff int ratio; 1161121868Sjeff int sum; 1162121868Sjeff 1163121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1164121868Sjeff if (sum > SCHED_SLP_RUN_FORK) { 1165121868Sjeff ratio = sum / SCHED_SLP_RUN_FORK; 1166121868Sjeff kg->kg_runtime /= ratio; 1167121868Sjeff kg->kg_slptime /= ratio; 1168121868Sjeff } 1169121868Sjeff} 1170121868Sjeff 1171111857Sjeffstatic int 1172111857Sjeffsched_interact_score(struct ksegrp *kg) 1173111857Sjeff{ 1174116365Sjeff int div; 1175111857Sjeff 1176111857Sjeff if (kg->kg_runtime > kg->kg_slptime) { 1177116365Sjeff div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1178116365Sjeff return (SCHED_INTERACT_HALF + 1179116365Sjeff (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1180116365Sjeff } if (kg->kg_slptime > kg->kg_runtime) { 1181116365Sjeff div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1182116365Sjeff return (kg->kg_runtime / div); 1183111857Sjeff } 1184111857Sjeff 1185116365Sjeff /* 1186116365Sjeff * This can happen if slptime and runtime are 0. 1187116365Sjeff */ 1188116365Sjeff return (0); 1189111857Sjeff 1190111857Sjeff} 1191111857Sjeff 1192113357Sjeff/* 1193134791Sjulian * Very early in the boot some setup of scheduler-specific 1194134791Sjulian * parts of proc0 and of soem scheduler resources needs to be done. 1195134791Sjulian * Called from: 1196134791Sjulian * proc0_init() 1197134791Sjulian */ 1198134791Sjulianvoid 1199134791Sjulianschedinit(void) 1200134791Sjulian{ 1201134791Sjulian /* 1202134791Sjulian * Set up the scheduler specific parts of proc0. 1203134791Sjulian */ 1204136167Sjulian proc0.p_sched = NULL; /* XXX */ 1205134791Sjulian ksegrp0.kg_sched = &kg_sched0; 1206136167Sjulian thread0.td_sched = &kse0; 1207134791Sjulian kse0.ke_thread = &thread0; 1208134791Sjulian kse0.ke_state = KES_THREAD; 1209134791Sjulian kg_sched0.skg_concurrency = 1; 1210134791Sjulian kg_sched0.skg_avail_opennings = 0; /* we are already running */ 1211134791Sjulian} 1212134791Sjulian 1213134791Sjulian/* 1214113357Sjeff * This is only somewhat accurate since given many processes of the same 1215113357Sjeff * priority they will switch when their slices run out, which will be 1216113357Sjeff * at most SCHED_SLICE_MAX. 1217113357Sjeff */ 1218109864Sjeffint 1219109864Sjeffsched_rr_interval(void) 1220109864Sjeff{ 1221109864Sjeff return (SCHED_SLICE_MAX); 1222109864Sjeff} 1223109864Sjeff 1224121790Sjeffstatic void 1225109864Sjeffsched_pctcpu_update(struct kse *ke) 1226109864Sjeff{ 1227109864Sjeff /* 1228109864Sjeff * Adjust counters and watermark for pctcpu calc. 1229116365Sjeff */ 1230120272Sjeff if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1231120272Sjeff /* 1232120272Sjeff * Shift the tick count out so that the divide doesn't 1233120272Sjeff * round away our results. 
1234120272Sjeff */ 1235120272Sjeff ke->ke_ticks <<= 10; 1236120272Sjeff ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1237120272Sjeff SCHED_CPU_TICKS; 1238120272Sjeff ke->ke_ticks >>= 10; 1239120272Sjeff } else 1240120272Sjeff ke->ke_ticks = 0; 1241109864Sjeff ke->ke_ltick = ticks; 1242109864Sjeff ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1243109864Sjeff} 1244109864Sjeff 1245109864Sjeffvoid 1246139453Sjhbsched_thread_priority(struct thread *td, u_char prio) 1247109864Sjeff{ 1248121605Sjeff struct kse *ke; 1249109864Sjeff 1250139316Sjeff CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", 1251139316Sjeff td, td->td_proc->p_comm, td->td_priority, prio, curthread, 1252139316Sjeff curthread->td_proc->p_comm); 1253121605Sjeff ke = td->td_kse; 1254109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1255139453Sjhb if (td->td_priority == prio) 1256139453Sjhb return; 1257109864Sjeff if (TD_ON_RUNQ(td)) { 1258121605Sjeff /* 1259121605Sjeff * If the priority has been elevated due to priority 1260121605Sjeff * propagation, we may have to move ourselves to a new 1261121605Sjeff * queue. We still call adjustrunqueue below in case kse 1262121605Sjeff * needs to fix things up. 1263121605Sjeff */ 1264138842Sjeff if (prio < td->td_priority && ke->ke_runq != NULL && 1265121872Sjeff (ke->ke_flags & KEF_ASSIGNED) == 0 && 1266121790Sjeff ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { 1267121605Sjeff runq_remove(ke->ke_runq, ke); 1268121605Sjeff ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; 1269136170Sjulian runq_add(ke->ke_runq, ke, 0); 1270121605Sjeff } 1271133555Sjeff /* 1272133555Sjeff * Hold this kse on this cpu so that sched_prio() doesn't 1273133555Sjeff * cause excessive migration. We only want migration to 1274133555Sjeff * happen as the result of a wakeup. 1275133555Sjeff */ 1276133555Sjeff ke->ke_flags |= KEF_HOLD; 1277119488Sdavidxu adjustrunqueue(td, prio); 1278139334Sjeff ke->ke_flags &= ~KEF_HOLD; 1279121605Sjeff } else 1280119488Sdavidxu td->td_priority = prio; 1281109864Sjeff} 1282109864Sjeff 1283139453Sjhb/* 1284139453Sjhb * Update a thread's priority when it is lent another thread's 1285139453Sjhb * priority. 1286139453Sjhb */ 1287109864Sjeffvoid 1288139453Sjhbsched_lend_prio(struct thread *td, u_char prio) 1289139453Sjhb{ 1290139453Sjhb 1291139453Sjhb td->td_flags |= TDF_BORROWING; 1292139453Sjhb sched_thread_priority(td, prio); 1293139453Sjhb} 1294139453Sjhb 1295139453Sjhb/* 1296139453Sjhb * Restore a thread's priority when priority propagation is 1297139453Sjhb * over. The prio argument is the minimum priority the thread 1298139453Sjhb * needs to have to satisfy other possible priority lending 1299139453Sjhb * requests. If the thread's regular priority is less 1300139453Sjhb * important than prio, the thread will keep a priority boost 1301139453Sjhb * of prio. 
1302139453Sjhb */ 1303139453Sjhbvoid 1304139453Sjhbsched_unlend_prio(struct thread *td, u_char prio) 1305139453Sjhb{ 1306139453Sjhb u_char base_pri; 1307139453Sjhb 1308139453Sjhb if (td->td_base_pri >= PRI_MIN_TIMESHARE && 1309139453Sjhb td->td_base_pri <= PRI_MAX_TIMESHARE) 1310139453Sjhb base_pri = td->td_ksegrp->kg_user_pri; 1311139453Sjhb else 1312139453Sjhb base_pri = td->td_base_pri; 1313139453Sjhb if (prio >= base_pri) { 1314139455Sjhb td->td_flags &= ~TDF_BORROWING; 1315139453Sjhb sched_thread_priority(td, base_pri); 1316139453Sjhb } else 1317139453Sjhb sched_lend_prio(td, prio); 1318139453Sjhb} 1319139453Sjhb 1320139453Sjhbvoid 1321139453Sjhbsched_prio(struct thread *td, u_char prio) 1322139453Sjhb{ 1323139453Sjhb u_char oldprio; 1324139453Sjhb 1325139453Sjhb /* First, update the base priority. */ 1326139453Sjhb td->td_base_pri = prio; 1327139453Sjhb 1328139453Sjhb /* 1329139455Sjhb * If the thread is borrowing another thread's priority, don't 1330139453Sjhb * ever lower the priority. 1331139453Sjhb */ 1332139453Sjhb if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 1333139453Sjhb return; 1334139453Sjhb 1335139453Sjhb /* Change the real priority. */ 1336139453Sjhb oldprio = td->td_priority; 1337139453Sjhb sched_thread_priority(td, prio); 1338139453Sjhb 1339139453Sjhb /* 1340139453Sjhb * If the thread is on a turnstile, then let the turnstile update 1341139453Sjhb * its state. 1342139453Sjhb */ 1343139453Sjhb if (TD_ON_LOCK(td) && oldprio != prio) 1344139453Sjhb turnstile_adjust(td, oldprio); 1345139453Sjhb} 1346139455Sjhb 1347139453Sjhbvoid 1348161599Sdavidxusched_user_prio(struct ksegrp *kg, u_char prio) 1349161599Sdavidxu{ 1350161599Sdavidxu struct thread *td; 1351161599Sdavidxu u_char oldprio; 1352161599Sdavidxu 1353161599Sdavidxu kg->kg_base_user_pri = prio; 1354161599Sdavidxu 1355161599Sdavidxu /* XXXKSE only for 1:1 */ 1356161599Sdavidxu 1357161599Sdavidxu td = TAILQ_FIRST(&kg->kg_threads); 1358161599Sdavidxu if (td == NULL) { 1359161599Sdavidxu kg->kg_user_pri = prio; 1360161599Sdavidxu return; 1361161599Sdavidxu } 1362161599Sdavidxu 1363161599Sdavidxu if (td->td_flags & TDF_UBORROWING && kg->kg_user_pri <= prio) 1364161599Sdavidxu return; 1365161599Sdavidxu 1366161599Sdavidxu oldprio = kg->kg_user_pri; 1367161599Sdavidxu kg->kg_user_pri = prio; 1368161599Sdavidxu 1369161599Sdavidxu if (TD_ON_UPILOCK(td) && oldprio != prio) 1370161599Sdavidxu umtx_pi_adjust(td, oldprio); 1371161599Sdavidxu} 1372161599Sdavidxu 1373161599Sdavidxuvoid 1374161599Sdavidxusched_lend_user_prio(struct thread *td, u_char prio) 1375161599Sdavidxu{ 1376161599Sdavidxu u_char oldprio; 1377161599Sdavidxu 1378161599Sdavidxu td->td_flags |= TDF_UBORROWING; 1379161599Sdavidxu 1380161599Sdavidxu oldprio = td->td_ksegrp->kg_user_pri; 1381161599Sdavidxu td->td_ksegrp->kg_user_pri = prio; 1382161599Sdavidxu 1383161599Sdavidxu if (TD_ON_UPILOCK(td) && oldprio != prio) 1384161599Sdavidxu umtx_pi_adjust(td, oldprio); 1385161599Sdavidxu} 1386161599Sdavidxu 1387161599Sdavidxuvoid 1388161599Sdavidxusched_unlend_user_prio(struct thread *td, u_char prio) 1389161599Sdavidxu{ 1390161599Sdavidxu struct ksegrp *kg = td->td_ksegrp; 1391161599Sdavidxu u_char base_pri; 1392161599Sdavidxu 1393161599Sdavidxu base_pri = kg->kg_base_user_pri; 1394161599Sdavidxu if (prio >= base_pri) { 1395161599Sdavidxu td->td_flags &= ~TDF_UBORROWING; 1396161599Sdavidxu sched_user_prio(kg, base_pri); 1397161599Sdavidxu } else 1398161599Sdavidxu sched_lend_user_prio(td, prio); 1399161599Sdavidxu} 1400161599Sdavidxu 
void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct kseq *ksq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;
	ksq = KSEQ_SELF();

	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu. This is the case in sched_bind().
	 */
	if (td == PCPU_GET(idlethread)) {
		TD_SET_CAN_RUN(td);
	} else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		/* We are ending our run so make our slot available again */
		SLOT_RELEASE(td->td_ksegrp);
		kseq_load_rem(ksq, ke);
		if (TD_IS_RUNNING(td)) {
			/*
			 * Don't allow the thread to migrate
			 * from a preemption.
			 */
			ke->ke_flags |= KEF_HOLD;
			setrunqueue(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
			ke->ke_flags &= ~KEF_HOLD;
		} else if ((td->td_proc->p_flag & P_HADTHREADS) &&
		    (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
			/*
			 * We will not be on the run queue.
			 * So we must be sleeping or similar.
			 * Don't use the slot if we will need it
			 * for newtd.
			 */
			slot_fill(td->td_ksegrp);
	}
	if (newtd != NULL) {
		/*
		 * If we bring in a thread, account for it as if it had been
		 * added to the run queue and then chosen.
		 */
		newtd->td_kse->ke_flags |= KEF_DIDRUN;
		newtd->td_kse->ke_runq = ksq->ksq_curr;
		TD_SET_RUNNING(newtd);
		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
		/*
		 * XXX When we preempt, we've already consumed a slot because
		 * we got here through sched_add(). However, newtd can come
		 * from thread_switchout() which can't SLOT_USE() because
		 * the SLOT code is scheduler dependent. We must use the
		 * slot here otherwise.
		 */
		if ((flags & SW_PREEMPT) == 0)
			SLOT_USE(newtd->td_ksegrp);
	} else
		newtd = choosethread();
	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
		cpu_switch(td, newtd);
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	}

	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		if (kg->kg_pri_class == PRI_TIMESHARE) {
			FOREACH_THREAD_IN_GROUP(kg, td) {
				ke = td->td_kse;
				if (ke->ke_runq == NULL)
					continue;
				kseq = KSEQ_CPU(ke->ke_cpu);
				kseq_nice_rem(kseq, p->p_nice);
				kseq_nice_add(kseq, nice);
			}
		}
	}
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		sched_priority(kg);
		FOREACH_THREAD_IN_GROUP(kg, td)
			td->td_flags |= TDF_NEEDRESCHED;
	}
}

void
sched_sleep(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for. This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		sched_slice(td->td_kse);
		td->td_slptime = 0;
	}
	setrunqueue(td, SRQ_BORING);
}

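/*
 * The wakeup accounting above can be exercised in isolation: the time slept
 * is scaled by 2^10 and either clamped to the sleep/run history limit or
 * folded into kg_slptime.  The sketch below (disabled with #if 0) is a
 * user-space model only; MODEL_SLP_RUN_MAX stands in for SCHED_SLP_RUN_MAX
 * and its value here is arbitrary, and the real update also rescales the
 * history via sched_interact_update(), which is not modeled.
 */
#if 0
#include <stdio.h>

#define	MODEL_SLP_RUN_MAX	(500 << 10)	/* assumed limit, scaled ticks */

struct model_kg {
	int	slptime;	/* scaled voluntary sleep time */
	int	runtime;	/* scaled run time */
};

static void
model_wakeup(struct model_kg *kg, int now, int slept_at)
{
	int hzticks;

	hzticks = (now - slept_at) << 10;
	if (hzticks >= MODEL_SLP_RUN_MAX) {
		/* Slept so long that the history is simply reset. */
		kg->slptime = MODEL_SLP_RUN_MAX;
		kg->runtime = 1;
	} else
		kg->slptime += hzticks;
}

int
main(void)
{
	struct model_kg kg = { 0, 4 << 10 };

	model_wakeup(&kg, 1000, 900);	/* slept for 100 ticks */
	printf("slptime %d runtime %d\n", kg.slptime, kg.runtime);
	model_wakeup(&kg, 3000, 1000);	/* slept for 2000 ticks: clamped */
	printf("slptime %d runtime %d\n", kg.slptime, kg.runtime);
	return (0);
}
#endif
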
/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct thread *td, struct thread *childtd)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(td, childtd->td_ksegrp);
	sched_fork_thread(td, childtd);
}

void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
	struct ksegrp *kg = td->td_ksegrp;
	mtx_assert(&sched_lock, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	child->kg_base_user_pri = kg->kg_base_user_pri;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr;
	sched_interact_update(kg);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
	struct kse *ke;
	struct kse *ke2;

	sched_newthread(child);
	ke = td->td_kse;
	ke2 = child->td_kse;
	ke2->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	ke2->ke_cpu = ke->ke_cpu;
	ke2->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	ke2->ke_ticks = ke->ke_ticks;
	ke2->ke_ltick = ke->ke_ltick;
	ke2->ke_ftick = ke->ke_ftick;
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	struct thread *td;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_THREAD_IN_GROUP(kg, td) {
		ke = td->td_kse;
		if ((ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_proc->p_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *childtd)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
	sched_exit_thread(NULL, childtd);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
{
	/* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
	kg->kg_runtime += td->td_ksegrp->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *childtd)
{
	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    childtd, childtd->td_proc->p_comm, childtd->td_priority);
	kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
	if (ticks >= bal_tick)
		sched_balance();
	if (ticks >= gbal_tick && balance_groups)
		sched_balance_groups();
	/*
	 * We could have been assigned a non real-time thread without an
	 * IPI.
	 */
	if (kseq->ksq_assigned)
		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
#endif
	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	kg = td->td_ksegrp;
	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		td->td_base_pri = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;
		ke->ke_flags &= ~KEF_PREEMPTED;
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}

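/*
 * The slice handling in sched_clock() above can be pictured with a small
 * round-robin model (disabled with #if 0): an expired thread gets a fresh
 * slice and is requeued, and a second queue collects expired work until the
 * current queue drains, mirroring the ksq_curr/ksq_next pair.  Everything
 * below is invented for the sketch: the real slice comes from sched_slice(),
 * threads that still qualify via SCHED_CURR() go back on ksq_curr rather
 * than ksq_next, and the switch between the two queues is arranged elsewhere
 * in this file.
 */
#if 0
#include <stdio.h>

#define	MODEL_THREADS	3
#define	MODEL_SLICE	2

struct model_td {
	int	id;
	int	slice;
};

int
main(void)
{
	struct model_td tds[MODEL_THREADS] = {
		{ 0, MODEL_SLICE }, { 1, MODEL_SLICE }, { 2, MODEL_SLICE }
	};
	int curr[MODEL_THREADS], next[MODEL_THREADS];
	int ncurr, nnext, tick, i;

	for (i = 0; i < MODEL_THREADS; i++)
		curr[i] = i;
	ncurr = MODEL_THREADS;
	nnext = 0;
	for (tick = 0; tick < 12; tick++) {
		struct model_td *td = &tds[curr[0]];

		printf("tick %2d: td%d (slice %d)\n", tick, td->id, td->slice);
		if (--td->slice > 0)
			continue;
		/* Slice expired: hand out a new slice and requeue on "next". */
		td->slice = MODEL_SLICE;
		next[nnext++] = curr[0];
		for (i = 1; i < ncurr; i++)
			curr[i - 1] = curr[i];
		if (--ncurr == 0) {
			/* The "current" queue drained: swap the queues. */
			for (i = 0; i < nnext; i++)
				curr[i] = next[i];
			ncurr = nnext;
			nnext = 0;
		}
	}
	return (0);
}
#endif
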
void
sched_add(struct thread *td, int flags)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
	int preemptive;
	int canmigrate;
	int class;

	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	canmigrate = 1;
	preemptive = !(flags & SRQ_YIELDING);
	class = PRI_BASE(kg->kg_pri_class);
	kseq = KSEQ_SELF();
	if ((ke->ke_flags & KEF_INTERNAL) == 0)
		SLOT_USE(td->td_ksegrp);
	ke->ke_flags &= ~KEF_INTERNAL;
#ifdef SMP
	if (ke->ke_flags & KEF_ASSIGNED) {
		if (ke->ke_flags & KEF_REMOVED)
			ke->ke_flags &= ~KEF_REMOVED;
		return;
	}
	canmigrate = KSE_CAN_MIGRATE(ke);
	/*
	 * Don't migrate running threads here. Force the long term balancer
	 * to do it.
	 */
	if (ke->ke_flags & KEF_HOLD) {
		ke->ke_flags &= ~KEF_HOLD;
		canmigrate = 0;
	}
#endif
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));
	if (flags & SRQ_PREEMPTED)
		ke->ke_flags |= KEF_PREEMPTED;
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		if (canmigrate)
			ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	/*
	 * If this thread is pinned or bound, notify the target cpu.
	 */
	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid)) {
		ke->ke_runq = NULL;
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap. If not, see if we should transfer this thread.
	 */
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	} else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
		if (kseq_transfer(kseq, ke, class))
			return;
	ke->ke_cpu = PCPU_GET(cpuid);
#endif
	if (td->td_priority < curthread->td_priority &&
	    ke->ke_runq == kseq->ksq_curr)
		curthread->td_flags |= TDF_NEEDRESCHED;
	if (preemptive && maybe_preempt(td))
		return;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke, flags);
	kseq_load_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	SLOT_RELEASE(td->td_ksegrp);
	ke->ke_flags &= ~KEF_PREEMPTED;
	if (ke->ke_flags & KEF_ASSIGNED) {
		ke->ke_flags |= KEF_REMOVED;
		return;
	}
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}

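/*
 * The two-level idle bitmaps manipulated in sched_add() above can be modeled
 * in a few lines of standalone code (disabled with #if 0): each cpu owns a
 * bit in its group's idle mask and each fully idle group owns a bit in a
 * global idle mask.  The types and names below are invented for the sketch;
 * the kernel uses ksg_idlemask, ksg_cpumask and kseq_idle, and clears the
 * global bit with atomic_clear_int().
 */
#if 0
#include <stdio.h>

struct model_group {
	unsigned int	cpumask;	/* all cpus in this group */
	unsigned int	idlemask;	/* cpus in this group that are idle */
	unsigned int	mask;		/* this group's bit in the global mask */
};

static unsigned int model_global_idle;

static void
model_got_work(struct model_group *g, unsigned int cpumask)
{

	if ((g->idlemask & cpumask) == 0)
		return;			/* this cpu was not idle */
	/* If the whole group was idle it is now unidling. */
	if (g->idlemask == g->cpumask)
		model_global_idle &= ~g->mask;
	g->idlemask &= ~cpumask;
}

int
main(void)
{
	struct model_group g = { 0x3, 0x3, 0x1 };	/* cpus 0 and 1, idle */

	model_global_idle = g.mask;
	model_got_work(&g, 0x1);	/* cpu 0 picks up a thread */
	printf("group idlemask %#x global %#x\n", g.idlemask, model_global_idle);
	return (0);
}
#endif
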
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second. Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rtick per second ?
		 */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	return (td->td_kse->ke_flags & KEF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;
	mtx_lock_spin(&sched_lock);
	if (kg->kg_pri_class == PRI_TIMESHARE)
		sched_prio(td, PRI_MAX_TIMESHARE);
	mi_switch(SW_VOL, NULL);
	mtx_unlock_spin(&sched_lock);
}

int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= ksg_maxid; i++)
		total += KSEQ_GROUP(i)->ksg_load;
	return (total);
#else
	return (KSEQ_SELF()->ksq_sysload);
#endif
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

void
sched_tick(void)
{
}
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
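
/*
 * Trailing illustration of the fixed-point %cpu arithmetic used by
 * sched_pctcpu() above (disabled with #if 0; not part of the scheduler).
 * FSHIFT/FSCALE come from <sys/param.h> in the kernel; the value 11 below
 * is assumed for the sketch, and 'rtick' and 'stathz' are example inputs
 * rather than live scheduler state.
 */
#if 0
#include <stdio.h>

#define	MODEL_FSHIFT	11
#define	MODEL_FSCALE	(1 << MODEL_FSHIFT)

int
main(void)
{
	unsigned int rtick = 64;	/* stathz ticks charged per second */
	unsigned int stathz = 128;	/* assumed stathz value */
	unsigned int pctcpu;

	/* Same shape as the kernel expression: a fraction scaled by FSCALE. */
	pctcpu = (MODEL_FSCALE * ((MODEL_FSCALE * rtick) / stathz)) >> MODEL_FSHIFT;
	printf("pctcpu raw %u (%u%%)\n", pctcpu, (pctcpu * 100) >> MODEL_FSHIFT);
	return (0);
}
#endif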