sched_ule.c revision 116463
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 116463 2003-06-17 06:39:51Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These data structures are allocated within their parent data structure
 * but are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * This priority range has 20 priorities on either end that are reachable
 * only through nice values.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Reserved priorities for nice.
 * PRI_BASE:	The start of the dynamic range.
 * DYN_RANGE:	Number of priorities that are available in the dynamic
 *		priority range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_MAX)

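/*
 * Worked example for the macros above (illustrative only; assumes the
 * stock 5.x priority layout where PRI_MIN_TIMESHARE = 160,
 * PRI_MAX_TIMESHARE = 223, and PRIO_TOTAL = 40): PRI_RANGE = 64,
 * PRI_NRESV = 40, PRI_NHALF = 20, PRI_BASE = 180, and DYN_RANGE = 24.
 * An interactivity score of 0 then maps to priority 180 + nice and a
 * score of 100 to 204 + nice, so without nice the dynamic range runs
 * from 180 to 204 and the roughly 20 slots on either end of the
 * timeshare range are reachable only through nice values.
 */
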
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 2) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(2)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(20)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determine the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

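/*
 * Worked example for the slice macros (illustrative only; assumes
 * hz = 1000, for which sched_setup() picks slice_min = 10 and
 * slice_max = 100): SLICE_RANGE = 91 and, for a nice value scaled so
 * that the least nice queued ksegrp sits at 0, SCHED_SLICE_NICE(0) = 100,
 * SCHED_SLICE_NICE(9) = 57, and SCHED_SLICE_NICE(19) = 9 ticks.  Anything
 * scaled past SCHED_PRI_NTHRESH falls outside the window and is handed a
 * slice of 0 by sched_slice().
 */
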
/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}

static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

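/*
 * The ksq_nice[] bookkeeping above and below counts queued timeshare KSEs
 * per nice level, normalized so that the most negative nice lands in bin 0
 * (illustrative mapping; assumes the stock PRIO_TOTAL = 40, so
 * SCHED_PRI_NHALF = 20):
 *
 *	nice -20 -> ksq_nice[0]
 *	nice   0 -> ksq_nice[20]
 *	nice +20 -> ksq_nice[40]
 *
 * ksq_nicemin caches the smallest occupied nice level so that sched_slice()
 * can scale slices without rescanning the array on every enqueue.
 */
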
static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much
 * at once can have an unpleasant effect on the system.  The scheduler
 * rarely has enough information to make perfect decisions.  So this
 * algorithm favors simplicity and more gradual effects on load in larger
 * systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases
 * with any approach and so the semi-random algorithm below may work as
 * well as any.
 */
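
/*
 * Illustrative arithmetic for the transfer count computed below: with a
 * most loaded queue of 7 and a least loaded queue of 2, diff = 5 and
 * move = 3 (diff / 2, rounded up), leaving the pair at loads 4 and 5.
 * A balanced pair, or a high load below 2, results in no migrations.
 */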
void
kseq_balance(void *arg)
{
	struct kseq *kseq;
	int high_load;
	int low_load;
	int high_cpu;
	int low_cpu;
	int move;
	int diff;
	int i;

	high_cpu = 0;
	low_cpu = 0;
	high_load = 0;
	low_load = -1;

	mtx_lock_spin(&sched_lock);
	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > high_load) {
			high_load = kseq->ksq_load;
			high_cpu = i;
		}
		if (low_load == -1 || kseq->ksq_load < low_load) {
			low_load = kseq->ksq_load;
			low_cpu = i;
		}
	}

	/*
	 * Nothing to do.
	 */
	if (high_load < 2 || low_load == high_load)
		goto out;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;

	for (i = 0; i < move; i++)
		kseq_move(KSEQ_CPU(high_cpu), low_cpu);

out:
	mtx_unlock_spin(&sched_lock);
	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

	return;
}

struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	if (load > 1)
		return (KSEQ_CPU(cpu));

	return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
	struct kse *ke;

	ke = kseq_choose(from);
	runq_remove(ke->ke_runq, ke);
	ke->ke_state = KES_THREAD;
	kseq_rem(from, ke);

	ke->ke_cpu = cpu;
	sched_add(ke);
}
#endif

struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

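/*
 * Note on the queue swap in kseq_choose() above: KSEs whose slices expire
 * are requeued on ksq_next (see sched_clock()), so rotating ksq_curr and
 * ksq_next once the current queue drains gives every queued KSE a turn
 * before any expired KSE runs again, which bounds starvation for nice or
 * CPU-bound threads.
 */
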
static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
	int i;

	slice_min = (hz / 100);
	slice_max = (hz / 10);

	mtx_lock_spin(&sched_lock);
	/* init kseqs */
	for (i = 0; i < MAXCPU; i++)
		kseq_setup(KSEQ_CPU(i));

	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
#ifdef SMP
	callout_init(&kseq_lb_callout, 1);
	kseq_balance(NULL);
#endif
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

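/*
 * Worked example for sched_priority() (illustrative only; uses the same
 * assumed constants as above, PRI_BASE = 180 and DYN_RANGE = 24): an
 * interactivity score of 50 gives pri = (50 * 24) / 100 + 180 = 192 at
 * nice 0.  Nice shifts the result directly, and only the extremes clamp:
 * a score of 100 at nice +20 would yield 224 and is clamped to
 * PRI_MAX_TIMESHARE (223).
 */
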
/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if one abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 */

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
	 */
	sched_interact_update(kg);

	return;
}

static void
sched_interact_update(struct ksegrp *kg)
{
	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime = (kg->kg_runtime / 5) * 4;
		kg->kg_slptime = (kg->kg_slptime / 5) * 4;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);
}

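/*
 * Worked example for the score above (illustrative integers; both times
 * are kept in ticks << 10, but only their ratio matters): with
 * kg_slptime = 3072 and kg_runtime = 1024 (sleeping three times as long
 * as running), div = 3072 / 50 = 61 and the score is 1024 / 61 = 16,
 * below SCHED_INTERACT_THRESH (20) and therefore interactive.  Inverting
 * the ratio (kg_runtime = 3072, kg_slptime = 1024) gives
 * 50 + (50 - 1024 / 61) = 84, well into the non-interactive half.
 */
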
/*
 * This is only somewhat accurate since, given many processes of the same
 * priority, they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */

	/*
	 * Shift the tick count out so that the divide doesn't round away
	 * our results.
	 */
	ke->ke_ticks <<= 10;
	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
	ke->ke_ticks >>= 10;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

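/*
 * Illustrative numbers for the renormalization above (not from a real
 * trace; assumes hz = 1000, so SCHED_CPU_TICKS = 10000): 2500 ticks
 * recorded over a 12500 tick window become
 * ((2500 << 10) / 12500 * 10000) >> 10, i.e. roughly 2000 ticks (1992
 * after integer truncation), the same 20% utilization renormalized to
 * the standard window.  Without the shift, 2500 / 12500 would truncate
 * straight to 0.
 */
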
#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	/* Start below any real load so the first present cpu seeds the search. */
	load = -1;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (load == -1 || kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	td->td_priority = prio;

	if (TD_ON_RUNQ(td)) {
		rq = ke->ke_runq;

		runq_remove(rq, ke);
		runq_add(rq, ke);
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		/*
		 * This queue is always correct except for idle threads which
		 * have a higher priority due to priority propagation.
		 */
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
		    ke->ke_thread->td_priority > PRI_MIN_IDLE)
			ke->ke_runq = KSEQ_SELF()->ksq_curr;
		runq_add(ke->ke_runq, ke);
		/* setrunqueue(td); */
		return;
	}
	if (ke->ke_runq)
		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
	/*
	 * We will not be on the run queue.  So we must be
	 * sleeping or similar.
	 */
	if (td->td_proc->p_flag & P_SA)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_state != KES_ONRUNQ &&
			    ke->ke_state != KES_THREAD)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_interact_update(kg);
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

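/*
 * Illustrative effect of the wakeup credit above (assumes hz = 1000): a
 * thread that slept for one second gets hzticks = 1000, so kg_slptime
 * grows by 1000 << 10.  For a mostly idle thread this keeps slptime well
 * ahead of runtime, the interactivity score near 0, the assigned user
 * priority near SCHED_PRI_BASE, and SCHED_CURR() true, so it lands on
 * the current queue when it is set runnable.
 */
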
/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
	child->ke_runq = NULL;

	/*
	 * Claim that we've been running for one second for statistical
	 * purposes.
	 */
	child->ke_ticks = 0;
	child->ke_ltick = ticks;
	child->ke_ftick = ticks - hz;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct thread *td;
#if 0
	struct kse *nke;
#endif

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	td = ke->ke_thread;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
	 */
	kseq = KSEQ_SELF();
#if 0
	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
		if (sched_strict &&
		    nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
		else if (nke->ke_thread->td_priority <
		    td->td_priority SCHED_PRIO_SLOP)

		if (nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
	}
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		goto out;
#ifdef SMP
	/*
	 * For SMP we may steal another processor's KSEs.  Just search until
	 * we verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i))
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > 1)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
	struct kseq *kseq;
	struct kse *ke;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		kseq = KSEQ_SELF();
		if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
		    kseq->ksq_load > 1 &&
		    (ke = kseq_choose(kseq)) != NULL &&
		    ke->ke_thread->td_priority < td->td_priority)
			curthread->td_flags |= TDF_NEEDRESCHED;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
		 */
		kseq_move(kseq, PCPU_GET(cpuid));
		goto retry;
	}
#endif

	return (NULL);
}

void
sched_add(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	kg = ke->ke_ksegrp;

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority propagation.
		 */
		if (ke->ke_thread->td_priority > PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
	struct kseq *kseq;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct kse *ke)
{
	fixpt_t pctcpu;

	pctcpu = 0;

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.  (The check must come before ke_ltick is
		 * refreshed below or it can never be true.)
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);

		/* Update to account for time potentially spent sleeping */
		ke->ke_ltick = ticks;

		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}
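/*
 * Illustrative arithmetic for the pctcpu conversion in sched_pctcpu()
 * above (assumes the stock FSHIFT = 11, so FSCALE = 2048, and
 * realstathz = 128): a KSE that ran for half of the averaging window has
 * rtick = 64 run ticks per second, so
 * pctcpu = (2048 * ((2048 * 64) / 128)) >> 11 = 1024 = FSCALE / 2,
 * which consumers of p_pctcpu report as 50% cpu.
 */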