/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 121126 2003-10-16 08:17:43Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define KTR_ULE         KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
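
/*
 * Editor's note (a sketch of how these tunables behave): the defaults above
 * are only placeholders; sched_setup() below rescales them against hz,
 * yielding roughly 10ms (hz/100) for slice_min and ~140ms (hz/7) for
 * slice_max.  Both remain adjustable at runtime through the
 * kern.sched.slice_min and kern.sched.slice_max sysctls declared here.
 */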

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These data structures are allocated within their parent data structure but
 * are scheduler specific.
 */

struct ke_sched {
        int             ske_slice;
        struct runq     *ske_runq;
        /* The following variables are only used for pctcpu calculation */
        int             ske_ltick;      /* Last tick that we were running on */
        int             ske_ftick;      /* First tick that we were running on */
        int             ske_ticks;      /* Tick count */
        /* CPU that we have affinity for. */
        u_char          ske_cpu;
};
#define ke_slice        ke_sched->ske_slice
#define ke_runq         ke_sched->ske_runq
#define ke_ltick        ke_sched->ske_ltick
#define ke_ftick        ke_sched->ske_ftick
#define ke_ticks        ke_sched->ske_ticks
#define ke_cpu          ke_sched->ske_cpu

struct kg_sched {
        int     skg_slptime;    /* Number of ticks we voluntarily slept */
        int     skg_runtime;    /* Number of ticks we were running */
};
#define kg_slptime      kg_sched->skg_slptime
#define kg_runtime      kg_sched->skg_runtime

struct td_sched {
        int     std_slptime;
};
#define td_slptime      td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower (better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:   Total priority range for timeshare threads.
 * PRI_NRESV:   Number of nice values.
 * PRI_BASE:    The start of the dynamic range.
 */
#define SCHED_PRI_RANGE         (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define SCHED_PRI_NRESV         PRIO_TOTAL
#define SCHED_PRI_NHALF         (PRIO_TOTAL / 2)
#define SCHED_PRI_NTHRESH       (SCHED_PRI_NHALF - 1)
#define SCHED_PRI_BASE          (PRI_MIN_TIMESHARE)
#define SCHED_PRI_INTERACT(score)                                       \
        ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
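
/*
 * Worked example (editor's illustration, assuming the stock priority ranges
 * from <sys/priority.h>, where PRI_MIN_TIMESHARE is 160 and
 * PRI_MAX_TIMESHARE is 223): SCHED_PRI_RANGE is 64, so an interactivity
 * score of 50 maps to 50 * 64 / 100 = 32 steps above SCHED_PRI_BASE,
 * i.e. priority 192 before the nice value is added in sched_priority().
 */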

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
 *              before throttling back.
 * SLP_RUN_THROTTLE:    Divisor for reducing slp/run time at fork time.
 * INTERACT_MAX:        Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:     Threshold for placement on the current runq.
 */
#define SCHED_SLP_RUN_MAX       ((hz * 5) << 10)
#define SCHED_SLP_RUN_THROTTLE  (100)
#define SCHED_INTERACT_MAX      (100)
#define SCHED_INTERACT_HALF     (SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH   (30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:   Minimum time slice granted, in units of ticks.
 * SLICE_MAX:   Maximum time slice granted.
 * SLICE_RANGE: Range of available time slices scaled by hz.
 * SLICE_SCALE: The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:  Determines the amount of slice granted to a scaled nice.
 */
#define SCHED_SLICE_MIN                 (slice_min)
#define SCHED_SLICE_MAX                 (slice_max)
#define SCHED_SLICE_RANGE               (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define SCHED_SLICE_SCALE(val, max)     (((val) * SCHED_SLICE_RANGE) / (max))
#define SCHED_SLICE_NICE(nice)                                          \
        (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
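
/*
 * Worked example (editor's illustration, assuming hz = 1000, the defaults
 * installed by sched_setup(), and a PRIO_TOTAL of 40, i.e. nice -20..20):
 * slice_min = 10 and slice_max = 142, so SCHED_SLICE_RANGE = 133 and
 * SCHED_PRI_NTHRESH = 19.  A kseg whose nice value sits 5 points above the
 * least nice kseg on the queue receives
 * SCHED_SLICE_NICE(5) = 142 - (5 * 133) / 19 = 107 ticks.
 */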

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define SCHED_INTERACTIVE(kg)                                           \
        (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define SCHED_CURR(kg, ke)                                              \
        (ke->ke_thread->td_priority != kg->kg_user_pri ||               \
            SCHED_INTERACTIVE(kg))
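
/*
 * Editor's note: the td_priority != kg_user_pri half of SCHED_CURR is
 * believed to catch threads running at a lent priority (e.g. via priority
 * propagation); such threads stay on the current queue regardless of their
 * interactivity score.
 */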

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:      Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:     Number of hz ticks to average the cpu usage across.
 */

#define SCHED_CPU_TIME  10
#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define KSEQ_NCLASS     (PRI_IDLE + 1)  /* Number of run classes. */

struct kseq {
        struct runq     ksq_idle;               /* Queue of IDLE threads. */
        struct runq     ksq_timeshare[2];       /* Run queues for !IDLE. */
        struct runq     *ksq_next;              /* Next timeshare queue. */
        struct runq     *ksq_curr;              /* Current queue. */
        int             ksq_loads[KSEQ_NCLASS]; /* Load for each class */
        int             ksq_load;               /* Aggregate load. */
        short           ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
        short           ksq_nicemin;            /* Least nice. */
#ifdef SMP
        int             ksq_cpus;       /* Count of CPUs in this kseq. */
        unsigned int    ksq_rslices;    /* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq     kseq_cpu[MAXCPU];
struct kseq     *kseq_idmap[MAXCPU];
#define KSEQ_SELF()     (kseq_idmap[PCPU_GET(cpuid)])
#define KSEQ_CPU(x)     (kseq_idmap[(x)])
#else
struct kseq     kseq_cpu;
#define KSEQ_SELF()     (&kseq_cpu)
#define KSEQ_CPU(x)     (&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq, int steal);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
        struct kseq *kseq;
        int i;

        kseq = KSEQ_CPU(cpu);

        printf("kseq:\n");
        printf("\tload: %d\n", kseq->ksq_load);
        printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]);
        printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]);
        printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
        printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]);
        printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
        printf("\tnice counts:\n");
        for (i = 0; i < PRIO_TOTAL + 1; i++)
                if (kseq->ksq_nice[i])
                        printf("\t\t%d = %d\n",
                            i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
        mtx_assert(&sched_lock, MA_OWNED);
        kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
        kseq->ksq_load++;
        if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
                CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
                    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
                    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
        if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
                kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
        kseq->ksq_rslices += ke->ke_slice;
#endif
}

static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
        mtx_assert(&sched_lock, MA_OWNED);
        kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
        kseq->ksq_load--;
        ke->ke_runq = NULL;
        if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
                kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
        kseq->ksq_rslices -= ke->ke_slice;
#endif
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
        mtx_assert(&sched_lock, MA_OWNED);
        /* Normalize to zero. */
        kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
        if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
                kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
        int n;

        mtx_assert(&sched_lock, MA_OWNED);
        /* Normalize to zero. */
        n = nice + SCHED_PRI_NHALF;
        kseq->ksq_nice[n]--;
        KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

        /*
         * If this wasn't the smallest nice value or there are more in
         * this bucket we can just return.  Otherwise we have to recalculate
         * the smallest nice.
         */
        if (nice != kseq->ksq_nicemin ||
            kseq->ksq_nice[n] != 0 ||
            kseq->ksq_loads[PRI_TIMESHARE] == 0)
                return;

        for (; n < SCHED_PRI_NRESV + 1; n++)
                if (kseq->ksq_nice[n]) {
                        kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
                        return;
                }
}
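
/*
 * Editor's illustration of the bookkeeping above: with timeshare kses at
 * nice -5, 0 and 0, the bins for -5 and 0 hold counts 1 and 2 and
 * ksq_nicemin is -5.  Removing the nice -5 kse empties its bin, so
 * kseq_nice_rem() scans upward from that bin and settles on 0 as the new
 * minimum.
 */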

#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi-random algorithm below may work as well as any.
 *
 */
void
kseq_balance(void *arg)
{
        struct kseq *kseq;
        int high_load;
        int low_load;
        int high_cpu;
        int low_cpu;
        int move;
        int diff;
        int i;

        high_cpu = 0;
        low_cpu = 0;
        high_load = 0;
        low_load = -1;

        mtx_lock_spin(&sched_lock);
        if (smp_started == 0)
                goto out;

        for (i = 0; i < mp_maxid; i++) {
                if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
                        continue;
                kseq = KSEQ_CPU(i);
                if (kseq->ksq_load > high_load) {
                        high_load = kseq->ksq_load;
                        high_cpu = i;
                }
                if (low_load == -1 || kseq->ksq_load < low_load) {
                        low_load = kseq->ksq_load;
                        low_cpu = i;
                }
        }

        kseq = KSEQ_CPU(high_cpu);

        /*
         * Nothing to do.
         */
        if (high_load < kseq->ksq_cpus + 1)
                goto out;

        high_load -= kseq->ksq_cpus;

        if (low_load >= high_load)
                goto out;

        diff = high_load - low_load;
        move = diff / 2;
        if (diff & 0x1)
                move++;

        for (i = 0; i < move; i++)
                kseq_move(kseq, low_cpu);

out:
        mtx_unlock_spin(&sched_lock);
        callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

        return;
}
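
/*
 * Editor's illustration of the arithmetic above: with single-CPU kseqs
 * (ksq_cpus == 1), a high load of 7 and a low load of 2, the high load is
 * first discounted to 6, diff becomes 4 and move becomes 2, so two kses
 * migrate and the queues are left holding loads of 5 and 4.
 */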

struct kseq *
kseq_load_highest(void)
{
        struct kseq *kseq;
        int load;
        int cpu;
        int i;

        mtx_assert(&sched_lock, MA_OWNED);
        cpu = 0;
        load = 0;

        for (i = 0; i < mp_maxid; i++) {
                if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
                        continue;
                kseq = KSEQ_CPU(i);
                if (kseq->ksq_load > load) {
                        load = kseq->ksq_load;
                        cpu = i;
                }
        }
        kseq = KSEQ_CPU(cpu);

        if (load > kseq->ksq_cpus)
                return (kseq);

        return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
        struct kse *ke;

        ke = kseq_choose(from, 1);
        runq_remove(ke->ke_runq, ke);
        ke->ke_state = KES_THREAD;
        kseq_rem(from, ke);

        ke->ke_cpu = cpu;
        sched_add(ke);
}
#endif

/*
 * Pick the highest priority task we have and return it.  If steal is 1 we
 * will return kses that have been denied slices due to their nice being too
 * low.  In the future we should prohibit stealing interrupt threads as well.
 */
struct kse *
kseq_choose(struct kseq *kseq, int steal)
{
        struct kse *ke;
        struct runq *swap;

        mtx_assert(&sched_lock, MA_OWNED);
        swap = NULL;

        for (;;) {
                ke = runq_choose(kseq->ksq_curr);
                if (ke == NULL) {
                        /*
                         * We already swapped once and didn't get anywhere.
                         */
                        if (swap)
                                break;
                        swap = kseq->ksq_curr;
                        kseq->ksq_curr = kseq->ksq_next;
                        kseq->ksq_next = swap;
                        continue;
                }
                /*
                 * If we encounter a slice of 0 the kse is in a
                 * TIMESHARE kse group and its nice was too far out
                 * of the range that receives slices.
                 */
                if (ke->ke_slice == 0 && steal == 0) {
                        runq_remove(ke->ke_runq, ke);
                        sched_slice(ke);
                        ke->ke_runq = kseq->ksq_next;
                        runq_add(ke->ke_runq, ke);
                        continue;
                }
                return (ke);
        }

        return (runq_choose(&kseq->ksq_idle));
}
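
/*
 * Editor's note: the swap in kseq_choose() is what drives the two-queue
 * epoch scheme set up in kseq_setup() below.  Expired kses are requeued on
 * ksq_next (see sched_clock()), so a batch kseg is delayed by at most one
 * pass through the current queue before the queues flip.
 */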

static void
kseq_setup(struct kseq *kseq)
{
        runq_init(&kseq->ksq_timeshare[0]);
        runq_init(&kseq->ksq_timeshare[1]);
        runq_init(&kseq->ksq_idle);

        kseq->ksq_curr = &kseq->ksq_timeshare[0];
        kseq->ksq_next = &kseq->ksq_timeshare[1];

        kseq->ksq_loads[PRI_ITHD] = 0;
        kseq->ksq_loads[PRI_REALTIME] = 0;
        kseq->ksq_loads[PRI_TIMESHARE] = 0;
        kseq->ksq_loads[PRI_IDLE] = 0;
        kseq->ksq_load = 0;
#ifdef SMP
        kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
        int i;
#endif

        slice_min = (hz/100);   /* 10ms */
        slice_max = (hz/7);     /* ~140ms */

#ifdef SMP
        /* init kseqs */
        /* Create the idmap. */
#ifdef ULE_HTT_EXPERIMENTAL
        if (smp_topology == NULL) {
#else
        if (1) {
#endif
                for (i = 0; i < MAXCPU; i++) {
                        kseq_setup(&kseq_cpu[i]);
                        kseq_idmap[i] = &kseq_cpu[i];
                        kseq_cpu[i].ksq_cpus = 1;
                }
        } else {
                int j;

                for (i = 0; i < smp_topology->ct_count; i++) {
                        struct cpu_group *cg;

                        cg = &smp_topology->ct_group[i];
                        kseq_setup(&kseq_cpu[i]);

                        for (j = 0; j < MAXCPU; j++)
                                if ((cg->cg_mask & (1 << j)) != 0)
                                        kseq_idmap[j] = &kseq_cpu[i];
                        kseq_cpu[i].ksq_cpus = cg->cg_count;
                }
        }
        callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
        kseq_balance(NULL);
#else
        kseq_setup(KSEQ_SELF());
#endif
        mtx_lock_spin(&sched_lock);
        kseq_add(KSEQ_SELF(), &kse0);
        mtx_unlock_spin(&sched_lock);
}
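
/*
 * Editor's note on the setup above: in the default path every CPU gets a
 * private kseq with ksq_cpus = 1.  Under ULE_HTT_EXPERIMENTAL, CPUs that
 * smp_topology groups together (e.g. HTT siblings) share one kseq, and
 * ksq_cpus records the group size so that the balancing and stealing
 * checks can discount the extra logical CPUs.
 */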

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
        int pri;

        if (kg->kg_pri_class != PRI_TIMESHARE)
                return;

        pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
        pri += SCHED_PRI_BASE;
        pri += kg->kg_nice;

        if (pri > PRI_MAX_TIMESHARE)
                pri = PRI_MAX_TIMESHARE;
        else if (pri < PRI_MIN_TIMESHARE)
                pri = PRI_MIN_TIMESHARE;

        kg->kg_user_pri = pri;

        return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
        struct kseq *kseq;
        struct ksegrp *kg;

        kg = ke->ke_ksegrp;
        kseq = KSEQ_CPU(ke->ke_cpu);

        /*
         * Rationale:
         * KSEs in interactive ksegs get the minimum slice so that we
         * quickly notice if one abuses its advantage.
         *
         * KSEs in non-interactive ksegs are assigned a slice that is
         * based on the kseg's nice value relative to the least nice kseg
         * on the run queue for this cpu.
         *
         * If the KSE is less nice than all others it gets the maximum
         * slice and other KSEs will adjust their slice relative to
         * this when they first expire.
         *
         * There is a 20 point window that starts relative to the least
         * nice kse on the run queue.  Slice size is determined by
         * the kse's distance from the least nice ksegrp.
         *
         * If you are outside of the window you will get no slice and
         * you will be reevaluated each time you are selected on the
         * run queue.
         *
         */

        if (!SCHED_INTERACTIVE(kg)) {
                int nice;

                nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
                if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
                    kg->kg_nice < kseq->ksq_nicemin)
                        ke->ke_slice = SCHED_SLICE_MAX;
                else if (nice <= SCHED_PRI_NTHRESH)
                        ke->ke_slice = SCHED_SLICE_NICE(nice);
                else
                        ke->ke_slice = 0;
        } else
                ke->ke_slice = SCHED_SLICE_MIN;

        CTR6(KTR_ULE,
            "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
            ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
            kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

        /*
         * Check to see if we need to scale back the slp and run time
         * in the kg.  This will cause us to forget old interactivity
         * while maintaining the current ratio.
         */
        sched_interact_update(kg);

        return;
}

static void
sched_interact_update(struct ksegrp *kg)
{
        /* XXX Fixme, use a linear algorithm and not a while loop. */
        while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
                kg->kg_runtime = (kg->kg_runtime / 5) * 4;
                kg->kg_slptime = (kg->kg_slptime / 5) * 4;
        }
}

static int
sched_interact_score(struct ksegrp *kg)
{
        int div;

        if (kg->kg_runtime > kg->kg_slptime) {
                div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
                return (SCHED_INTERACT_HALF +
                    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
        }
        if (kg->kg_slptime > kg->kg_runtime) {
                div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
                return (kg->kg_runtime / div);
        }

        /*
         * This can happen if slptime and runtime are 0.
         */
        return (0);
}
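
/*
 * Editor's illustration of the scoring above: a kseg that sleeps three
 * times as long as it runs (say kg_slptime = 300, kg_runtime = 100 in
 * scaled ticks) gets div = 300 / 50 = 6 and a score of 100 / 6 = 16,
 * comfortably under SCHED_INTERACT_THRESH (30).  Invert the ratio
 * (kg_runtime = 300, kg_slptime = 100) and the score becomes
 * 50 + (50 - 100 / 6) = 84, a decidedly batch value.
 */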

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
        return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
        /*
         * Adjust counters and watermark for pctcpu calc.
         */
        if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
                /*
                 * Shift the tick count out so that the divide doesn't
                 * round away our results.
                 */
                ke->ke_ticks <<= 10;
                ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
                            SCHED_CPU_TICKS;
                ke->ke_ticks >>= 10;
        } else
                ke->ke_ticks = 0;
        ke->ke_ltick = ticks;
        ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
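
/*
 * Editor's illustration of the normalization above: with hz = 100,
 * SCHED_CPU_TICKS is 1000.  A kse that accumulated 500 ticks over a
 * 1250-tick window (ticks - ke_ftick) is rescaled to roughly
 * 500 * 1000 / 1250 = 400 ticks, as if the same rate had been sustained
 * over exactly one SCHED_CPU_TICKS window; the << 10 / >> 10 pair simply
 * preserves precision across the integer divide.
 */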

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
        struct kseq *kseq;
        int load;
        int cpu;
        int i;

        mtx_assert(&sched_lock, MA_OWNED);
        if (!smp_started)
                return (0);

        load = 0;
        cpu = 0;

        for (i = 0; i < mp_maxid; i++) {
                if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
                        continue;
                kseq = KSEQ_CPU(i);
                if (kseq->ksq_load < load) {
                        cpu = i;
                        load = kseq->ksq_load;
                }
        }

        CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
        return (cpu);
}
#else
int
sched_pickcpu(void)
{
        return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{

        mtx_assert(&sched_lock, MA_OWNED);
        if (TD_ON_RUNQ(td)) {
                adjustrunqueue(td, prio);
        } else {
                td->td_priority = prio;
        }
}

void
sched_switchout(struct thread *td)
{
        struct kse *ke;

        mtx_assert(&sched_lock, MA_OWNED);

        ke = td->td_kse;

        td->td_last_kse = ke;
        td->td_lastcpu = td->td_oncpu;
        td->td_oncpu = NOCPU;
        td->td_flags &= ~TDF_NEEDRESCHED;

        if (TD_IS_RUNNING(td)) {
                if (td->td_proc->p_flag & P_SA) {
                        kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
                        setrunqueue(td);
                } else {
                        /*
                         * This queue is always correct except for idle threads
                         * which have a higher priority due to priority
                         * propagation.
                         */
                        if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
                            ke->ke_thread->td_priority > PRI_MIN_IDLE)
                                ke->ke_runq = KSEQ_SELF()->ksq_curr;
                        runq_add(ke->ke_runq, ke);
                        /* setrunqueue(td); */
                }
                return;
        }
        if (ke->ke_runq)
                kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
        /*
         * We will not be on the run queue.  So we must be
         * sleeping or similar.
         */
        if (td->td_proc->p_flag & P_SA)
                kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
        /* struct kse *ke = td->td_kse; */
        mtx_assert(&sched_lock, MA_OWNED);

        td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
        struct kse *ke;
        struct thread *td;
        struct kseq *kseq;

        PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
        mtx_assert(&sched_lock, MA_OWNED);
        /*
         * We need to adjust the nice counts for running KSEs.
         */
        if (kg->kg_pri_class == PRI_TIMESHARE)
                FOREACH_KSE_IN_GROUP(kg, ke) {
                        if (ke->ke_runq == NULL)
                                continue;
                        kseq = KSEQ_CPU(ke->ke_cpu);
                        kseq_nice_rem(kseq, kg->kg_nice);
                        kseq_nice_add(kseq, nice);
                }
        kg->kg_nice = nice;
        sched_priority(kg);
        FOREACH_THREAD_IN_GROUP(kg, td)
                td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
        mtx_assert(&sched_lock, MA_OWNED);

        td->td_slptime = ticks;
        td->td_priority = prio;

        CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
            td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
        mtx_assert(&sched_lock, MA_OWNED);

        /*
         * Let the kseg know how long we slept for.  This is because process
         * interactivity behavior is modeled in the kseg.
         */
        if (td->td_slptime) {
                struct ksegrp *kg;
                int hzticks;

                kg = td->td_ksegrp;
                hzticks = ticks - td->td_slptime;
                kg->kg_slptime += hzticks << 10;
                sched_interact_update(kg);
                sched_priority(kg);
                if (td->td_kse)
                        sched_slice(td->td_kse);
                CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
                    td->td_kse, hzticks);
                td->td_slptime = 0;
        }
        setrunqueue(td);
        if (td->td_priority < curthread->td_priority)
                curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

        mtx_assert(&sched_lock, MA_OWNED);

        sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
        sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
        sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

        child->ke_slice = 1;    /* Attempt to quickly learn interactivity. */
        child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
        child->ke_runq = NULL;

        /* Grab our parent's cpu estimation information. */
        child->ke_ticks = ke->ke_ticks;
        child->ke_ltick = ke->ke_ltick;
        child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

        PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
        /* XXX Need something better here */

        child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE;
        child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE;
        kg->kg_runtime += tickincr << 10;
        sched_interact_update(kg);

        child->kg_user_pri = kg->kg_user_pri;
        child->kg_nice = kg->kg_nice;
}
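
/*
 * Editor's note on the division above: SCHED_SLP_RUN_THROTTLE is 100, so a
 * child starts with only 1% of its parent's accumulated sleep and run
 * history.  It therefore has to earn an interactive score on its own rather
 * than inheriting one, while the parent is charged one extra tick of run
 * time for the fork itself.
 */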

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
        struct kseq *kseq;
        struct kse *ke;

        mtx_assert(&sched_lock, MA_OWNED);
        if (kg->kg_pri_class == class)
                return;

        FOREACH_KSE_IN_GROUP(kg, ke) {
                if (ke->ke_state != KES_ONRUNQ &&
                    ke->ke_state != KES_THREAD)
                        continue;
                kseq = KSEQ_CPU(ke->ke_cpu);

                kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
                kseq->ksq_loads[PRI_BASE(class)]++;

                if (kg->kg_pri_class == PRI_TIMESHARE)
                        kseq_nice_rem(kseq, kg->kg_nice);
                else if (class == PRI_TIMESHARE)
                        kseq_nice_add(kseq, kg->kg_nice);
        }

        kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
        /* XXX Need something better here */
        mtx_assert(&sched_lock, MA_OWNED);
        sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
        sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
        kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
        /* kg->kg_slptime += child->kg_slptime; */
        kg->kg_runtime += child->kg_runtime;
        sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}
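
/*
 * Editor's note: only the child's run time is folded back into the parent
 * above; the commented-out slptime line suggests sleep time was deliberately
 * left out, so a parent cannot appear more interactive simply by spawning
 * children that mostly sleep.
 */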

void
sched_clock(struct kse *ke)
{
        struct kseq *kseq;
        struct ksegrp *kg;
        struct thread *td;
#if 0
        struct kse *nke;
#endif

        /*
         * sched_setup() apparently happens prior to stathz being set.  We
         * need to resolve the timers earlier in the boot so we can avoid
         * calculating this here.
         */
        if (realstathz == 0) {
                realstathz = stathz ? stathz : hz;
                tickincr = hz / realstathz;
                /*
                 * XXX This does not work for values of stathz that are much
                 * larger than hz.
                 */
                if (tickincr == 0)
                        tickincr = 1;
        }

        td = ke->ke_thread;
        kg = ke->ke_ksegrp;

        mtx_assert(&sched_lock, MA_OWNED);
        KASSERT((td != NULL), ("schedclock: null thread pointer"));

        /* Adjust ticks for pctcpu */
        ke->ke_ticks++;
        ke->ke_ltick = ticks;

        /* Go up to one second beyond our max and then trim back down */
        if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
                sched_pctcpu_update(ke);

        if (td->td_flags & TDF_IDLETD)
                return;

        CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
            ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

        /*
         * We only do slicing code for TIMESHARE ksegrps.
         */
        if (kg->kg_pri_class != PRI_TIMESHARE)
                return;
        /*
         * Check for a higher priority task on the run queue.  This can happen
         * on SMP if another processor woke up a process on our runq.
         */
        kseq = KSEQ_SELF();
#if 0
        if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq, 0)) != NULL) {
                if (sched_strict &&
                    nke->ke_thread->td_priority < td->td_priority)
                        td->td_flags |= TDF_NEEDRESCHED;
                else if (nke->ke_thread->td_priority <
                    td->td_priority SCHED_PRIO_SLOP)

                if (nke->ke_thread->td_priority < td->td_priority)
                        td->td_flags |= TDF_NEEDRESCHED;
        }
#endif
        /*
         * We used a tick; charge it to the ksegrp so that we can compute our
         * interactivity.
         */
        kg->kg_runtime += tickincr << 10;
        sched_interact_update(kg);

        /*
         * We used up one time slice.
         */
        ke->ke_slice--;
#ifdef SMP
        kseq->ksq_rslices--;
#endif

        if (ke->ke_slice > 0)
                return;
        /*
         * We're out of time, recompute priorities and requeue.
         */
        kseq_rem(kseq, ke);
        sched_priority(kg);
        sched_slice(ke);
        if (SCHED_CURR(kg, ke))
                ke->ke_runq = kseq->ksq_curr;
        else
                ke->ke_runq = kseq->ksq_next;
        kseq_add(kseq, ke);
        td->td_flags |= TDF_NEEDRESCHED;
}
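
/*
 * Editor's illustration of the charging above: with hz = 1000 and
 * stathz = 128, tickincr is 1000 / 128 = 7, so every stat clock tick adds
 * 7 << 10 to kg_runtime; the << 10 scaling matches the units used for
 * kg_slptime in sched_wakeup().
 */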

int
sched_runnable(void)
{
        struct kseq *kseq;
        int load;

        load = 1;

        mtx_lock_spin(&sched_lock);
        kseq = KSEQ_SELF();

        if (kseq->ksq_load)
                goto out;
#ifdef SMP
        /*
         * For SMP we may steal other processors' KSEs.  Just search until we
         * verify that at least one other cpu has a runnable task.
         */
        if (smp_started) {
                int i;

                for (i = 0; i < mp_maxid; i++) {
                        if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
                                continue;
                        kseq = KSEQ_CPU(i);
                        if (kseq->ksq_load > kseq->ksq_cpus)
                                goto out;
                }
        }
#endif
        load = 0;
out:
        mtx_unlock_spin(&sched_lock);
        return (load);
}

void
sched_userret(struct thread *td)
{
        struct ksegrp *kg;
#if 0
        struct kseq *kseq;
        struct kse *ke;
#endif

        kg = td->td_ksegrp;

        if (td->td_priority != kg->kg_user_pri) {
                mtx_lock_spin(&sched_lock);
                td->td_priority = kg->kg_user_pri;
                /*
                 * This optimization is temporarily disabled because it
                 * breaks priority propagation.
                 */
#if 0
                kseq = KSEQ_SELF();
                if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
#ifdef SMP
                    kseq->ksq_load > kseq->ksq_cpus &&
#else
                    kseq->ksq_load > 1 &&
#endif
                    (ke = kseq_choose(kseq, 0)) != NULL &&
                    ke->ke_thread->td_priority < td->td_priority)
#endif
                        curthread->td_flags |= TDF_NEEDRESCHED;
                mtx_unlock_spin(&sched_lock);
        }
}

struct kse *
sched_choose(void)
{
        struct kseq *kseq;
        struct kse *ke;

        mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
        kseq = KSEQ_SELF();
        ke = kseq_choose(kseq, 0);
        if (ke) {
                runq_remove(ke->ke_runq, ke);
                ke->ke_state = KES_THREAD;

                if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
                        CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
                            ke, ke->ke_runq, ke->ke_slice,
                            ke->ke_thread->td_priority);
                }
                return (ke);
        }

#ifdef SMP
        if (smp_started) {
                /*
                 * Find the cpu with the highest load and steal one proc.
                 */
                if ((kseq = kseq_load_highest()) == NULL)
                        return (NULL);

                /*
                 * Remove this kse from this kseq and runq and then requeue
                 * on the current processor.  Then we will dequeue it
                 * normally above.
                 */
                kseq_move(kseq, PCPU_GET(cpuid));
                goto retry;
        }
#endif

        return (NULL);
}

void
sched_add(struct kse *ke)
{
        struct kseq *kseq;
        struct ksegrp *kg;

        mtx_assert(&sched_lock, MA_OWNED);
        KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
        KASSERT((ke->ke_thread->td_kse != NULL),
            ("sched_add: No KSE on thread"));
        KASSERT(ke->ke_state != KES_ONRUNQ,
            ("sched_add: kse %p (%s) already in run queue", ke,
            ke->ke_proc->p_comm));
        KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
            ("sched_add: process swapped out"));
        KASSERT(ke->ke_runq == NULL,
            ("sched_add: KSE %p is still assigned to a run queue", ke));

        kg = ke->ke_ksegrp;

        switch (PRI_BASE(kg->kg_pri_class)) {
        case PRI_ITHD:
        case PRI_REALTIME:
                kseq = KSEQ_SELF();
                ke->ke_runq = kseq->ksq_curr;
                ke->ke_slice = SCHED_SLICE_MAX;
                ke->ke_cpu = PCPU_GET(cpuid);
                break;
        case PRI_TIMESHARE:
                kseq = KSEQ_CPU(ke->ke_cpu);
                if (SCHED_CURR(kg, ke))
                        ke->ke_runq = kseq->ksq_curr;
                else
                        ke->ke_runq = kseq->ksq_next;
                break;
        case PRI_IDLE:
                kseq = KSEQ_CPU(ke->ke_cpu);
                /*
                 * This is for priority prop.
                 */
                if (ke->ke_thread->td_priority > PRI_MIN_IDLE)
                        ke->ke_runq = kseq->ksq_curr;
                else
                        ke->ke_runq = &kseq->ksq_idle;
                ke->ke_slice = SCHED_SLICE_MIN;
                break;
        default:
                panic("Unknown pri class.\n");
                break;
        }

        ke->ke_ksegrp->kg_runq_kses++;
        ke->ke_state = KES_ONRUNQ;

        runq_add(ke->ke_runq, ke);
        kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
        struct kseq *kseq;

        mtx_assert(&sched_lock, MA_OWNED);
        KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

        ke->ke_state = KES_THREAD;
        ke->ke_ksegrp->kg_runq_kses--;
        kseq = KSEQ_CPU(ke->ke_cpu);
        runq_remove(ke->ke_runq, ke);
        kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct kse *ke)
{
        fixpt_t pctcpu;

        pctcpu = 0;

        mtx_lock_spin(&sched_lock);
        if (ke->ke_ticks) {
                int rtick;

                /*
                 * Don't update more frequently than twice a second.  Allowing
                 * this causes the cpu usage to decay away too quickly due to
                 * rounding errors.
                 */
                if (ke->ke_ltick < (ticks - (hz / 2)))
                        sched_pctcpu_update(ke);
                /* How many rtick per second ? */
                rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
                pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
        }

        ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
        mtx_unlock_spin(&sched_lock);

        return (pctcpu);
}

int
sched_sizeof_kse(void)
{
        return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
        return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
        return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
        return (sizeof(struct thread) + sizeof(struct td_sched));
}