sched_ule.c revision 121605
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 121605 2003-10-27 06:47:05Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
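/*
 * Usage note (illustrative, not part of the original source): slice_min
 * above and slice_max below are exported under the kern.sched sysctl
 * node, so the slice bounds (in ticks) can be inspected or tuned from
 * userland, e.g.:
 *
 *	# sysctl kern.sched.slice_min
 *	# sysctl kern.sched.slice_max=14
 *
 * Note that both values are recomputed from hz in sched_setup() at boot,
 * which overrides the static initializers seen here.
 */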
static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These data structures are allocated within their parent data structure
 * but are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;	/* Number of ticks we vol. slept */
	int	skg_runtime;	/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus,
 * we give lower (better) priorities to kse groups that use less CPU.  The
 * nice value is then directly added to this to allow nice to have some
 * effect on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
	((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
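/*
 * A worked example of the mapping above (illustrative; the numeric bounds
 * come from sys/priority.h and are assumed here to be PRI_MIN_TIMESHARE
 * == 160 and PRI_MAX_TIMESHARE == 223, giving SCHED_PRI_RANGE == 64):
 * an interactivity score of 50 maps to SCHED_PRI_INTERACT(50) ==
 * 50 * 64 / 100 == 32, so sched_priority() below yields 160 + 32 + nice
 * before clamping.  Smaller scores land nearer PRI_MIN_TIMESHARE and
 * therefore run sooner.
 */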
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(100)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
	(SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_INTERACTIVE(kg)						\
	(sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
	(ke->ke_thread->td_priority != kg->kg_user_pri ||		\
	SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_cpus;	/* Count of CPUs in this kseq. */
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};
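/*
 * A worked example of the nice-bin and slice math above (illustrative;
 * PRIO_TOTAL is assumed to be 40, so SCHED_PRI_NHALF == 20 and
 * SCHED_PRI_NTHRESH == 19):  ksq_nice[] is indexed by nice + 20, so
 * nice -20 lands in bin 0.  With slice_min == 1 and slice_max == 10,
 * SCHED_SLICE_RANGE == 10, and a kse whose nice sits 5 points above
 * ksq_nicemin gets SCHED_SLICE_NICE(5) == 10 - (5 * 10) / 19 == 8
 * ticks; anything 19 or more points above the minimum gets no slice.
 */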
/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
struct kseq	*kseq_idmap[MAXCPU];
#define	KSEQ_SELF()	(kseq_idmap[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(kseq_idmap[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse *kseq_choose(struct kseq *kseq, int steal);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq *kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}
static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}
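/*
 * A worked example of the rescan above (illustrative, with the same
 * assumed PRIO_TOTAL == 40):  if the bins for nice -5 and nice +3 each
 * hold one kse and ksq_nicemin is -5, removing the nice -5 kse empties
 * bin 15, so the loop scans upward and stops at bin 23, resetting
 * ksq_nicemin to 23 - 20 == 3.  The scan is skipped entirely when the
 * departing kse did not hold the minimum nice value.
 */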
#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much
 * at once can have an unpleasant effect on the system.  The scheduler
 * rarely has enough information to make perfect decisions.  So this
 * algorithm chooses simplicity and more gradual effects on load in larger
 * systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases
 * with any approach and so the semi random algorithm below may work as
 * well as any.
 */
void
kseq_balance(void *arg)
{
	struct kseq *kseq;
	int high_load;
	int low_load;
	int high_cpu;
	int low_cpu;
	int move;
	int diff;
	int i;

	high_cpu = 0;
	low_cpu = 0;
	high_load = 0;
	low_load = -1;

	mtx_lock_spin(&sched_lock);
	if (smp_started == 0)
		goto out;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > high_load) {
			high_load = kseq->ksq_load;
			high_cpu = i;
		}
		if (low_load == -1 || kseq->ksq_load < low_load) {
			low_load = kseq->ksq_load;
			low_cpu = i;
		}
	}

	kseq = KSEQ_CPU(high_cpu);

	/*
	 * Nothing to do.
	 */
	if (high_load < kseq->ksq_cpus + 1)
		goto out;

	high_load -= kseq->ksq_cpus;

	if (low_load >= high_load)
		goto out;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;

	for (i = 0; i < move; i++)
		kseq_move(kseq, low_cpu);

out:
	mtx_unlock_spin(&sched_lock);
	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

	return;
}

struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	kseq = KSEQ_CPU(cpu);

	if (load > kseq->ksq_cpus)
		return (kseq);

	return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
	struct kse *ke;

	ke = kseq_choose(from, 1);
	runq_remove(ke->ke_runq, ke);
	ke->ke_state = KES_THREAD;
	kseq_rem(from, ke);

	ke->ke_cpu = cpu;
	sched_add(ke->ke_thread);
}
#endif
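/*
 * A worked example of the balancing pass in kseq_balance() above:  with
 * per-CPU loads of 6 and 1 on single-CPU kseqs, high_load becomes
 * 6 - 1 == 5 after the ksq_cpus adjustment, diff == 5 - 1 == 4 and
 * move == 2, so two kses migrate and the loads settle near 4 and 3.
 * The odd-diff round-up shifts at most one extra kse toward the less
 * loaded CPU.
 */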
/*
 * Pick the highest priority task we have and return it.  If steal is 1 we
 * will return kses that have been denied slices due to their nice being
 * too low.  In the future we should prohibit stealing interrupt threads
 * as well.
 */
struct kse *
kseq_choose(struct kseq *kseq, int steal)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0 && steal == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}
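/*
 * A note on the swap above:  the two timeshare runqs form a double
 * buffer.  Expired kses are requeued on ksq_next unless they are still
 * interactive (see sched_clock()), and only once ksq_curr is empty are
 * the two pointers exchanged, so everything queued on ksq_next runs
 * before anything that expires after the swap.  This bounds how long a
 * nice'd kse that still receives a slice can be starved.
 */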
static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	/* init kseqs */
	/* Create the idmap. */
#ifdef ULE_HTT_EXPERIMENTAL
	if (smp_topology == NULL) {
#else
	if (1) {
#endif
		for (i = 0; i < MAXCPU; i++) {
			kseq_setup(&kseq_cpu[i]);
			kseq_idmap[i] = &kseq_cpu[i];
			kseq_cpu[i].ksq_cpus = 1;
		}
	} else {
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			struct cpu_group *cg;

			cg = &smp_topology->ct_group[i];
			kseq_setup(&kseq_cpu[i]);

			for (j = 0; j < MAXCPU; j++)
				if ((cg->cg_mask & (1 << j)) != 0)
					kseq_idmap[j] = &kseq_cpu[i];
			kseq_cpu[i].ksq_cpus = cg->cg_count;
		}
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	kseq_balance(NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}
/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if they abuse their advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 */

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
	 */
	sched_interact_update(kg);

	return;
}

static void
sched_interact_update(struct ksegrp *kg)
{
	int ratio;

	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		ratio = ((SCHED_SLP_RUN_MAX * 15) / (kg->kg_runtime +
		    kg->kg_slptime));
		kg->kg_runtime = (kg->kg_runtime * ratio) / 16;
		kg->kg_slptime = (kg->kg_slptime * ratio) / 16;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);
}
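/*
 * A worked example of the score above (kg_slptime and kg_runtime are
 * kept in ticks << 10, and SCHED_INTERACT_HALF == 50):  a kseg that
 * slept 4096 units and ran 1024 takes the second branch, div ==
 * max(1, 4096 / 50) == 81, and the score is 1024 / 81 == 12, well under
 * SCHED_INTERACT_THRESH (30), so it is interactive.  Swapping the two
 * values gives 50 + (50 - 12) == 88, a CPU hog.  sched_interact_update()
 * rescales both values to 15/16 of SCHED_SLP_RUN_MAX once their sum
 * exceeds it, so old behavior decays while the ratio is preserved.
 */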
/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	load = 0;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if ((td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
		    prio < td->td_ksegrp->kg_user_pri) ||
		    (td->td_ksegrp->kg_pri_class == PRI_IDLE &&
		    prio < PRI_MIN_IDLE)) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}
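/*
 * A note on the requeue above:  a timeshare or idle thread whose
 * priority was elevated by propagation is moved onto ksq_curr so it
 * runs promptly and can release whatever resource caused the boost;
 * adjustrunqueue() then reinserts it at its new priority.
 */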
void
sched_switch(struct thread *td)
{
	struct thread *newtd;
	u_int sched_nest;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		if (td->td_proc->p_flag & P_SA) {
			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
			setrunqueue(td);
		} else {
			/*
			 * This queue is always correct except for idle threads
			 * which have a higher priority due to priority
			 * propagation.
			 */
			if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) {
				if (td->td_priority < PRI_MIN_IDLE)
					ke->ke_runq = KSEQ_SELF()->ksq_curr;
				else
					ke->ke_runq = &KSEQ_SELF()->ksq_idle;
			}
			runq_add(ke->ke_runq, ke);
			/* setrunqueue(td); */
		}
	} else {
		if (ke->ke_runq)
			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
		/*
		 * We will not be on the run queue.  So we must be
		 * sleeping or similar.
		 */
		if (td->td_proc->p_flag & P_SA)
			kse_reassign(ke);
	}
	sched_nest = sched_lock.mtx_recurse;
	newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_recurse = sched_nest;
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}
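/*
 * A note on the bookkeeping above:  sched_sleep() only records the
 * current tick count in td_slptime; the actual credit happens in
 * sched_wakeup() below, which adds (ticks - td_slptime) << 10 to
 * kg_slptime.  The << 10 keeps sleep credit in the same fixed-point
 * units as the run time charged in sched_clock().
 */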
void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_interact_update(kg);
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */

	child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE;
	child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE;
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}
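/*
 * A worked example of the fork throttle in sched_fork_ksegrp() above:
 * with SCHED_SLP_RUN_THROTTLE == 100, a parent with kg_slptime ==
 * 409600 and kg_runtime == 102400 hands the child just 4096 and 1024.
 * The child inherits the parent's sleep/run ratio, and thus a similar
 * interactivity score, but carries so little history that its own
 * behavior dominates almost immediately.
 */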
void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}
void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * Idle tasks should always resched.
	 */
	if (kg->kg_pri_class == PRI_IDLE) {
		td->td_flags |= TDF_NEEDRESCHED;
		return;
	}
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute
	 * our interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
	kseq = KSEQ_SELF();
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
#ifdef SMP
	/*
	 * For SMP we may steal other processor's KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > kseq->ksq_cpus)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}
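/*
 * A note on the load tests in sched_runnable() above:  a non-idle
 * curthread still contributes to ksq_load (sched_choose() removes a
 * kse from its runq but not from the kseq counters), hence the "- 1"
 * in the non-idle case.
 */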
struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq, 0);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
		 */
		kseq_move(kseq, PCPU_GET(cpuid));
		goto retry;
	}
#endif

	return (NULL);
}
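/*
 * Note that the local lookup above uses kseq_choose(kseq, 0), so
 * zero-slice kses are resliced and requeued rather than run, while the
 * steal path goes through kseq_move(), which calls kseq_choose() with
 * steal == 1 and will migrate even a kse that was denied a slice.
 */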
void
sched_add(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	ke = td->td_kse;
	kg = td->td_ksegrp;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}