sched_ule.c revision 121127
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 121127 2003-10-16 08:39:15Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These data structures are allocated within their parent data structure
 * but are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;	/* Number of ticks we vol. slept */
	int	skg_runtime;	/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
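
/*
 * A worked example of the interactivity mapping, assuming the stock 5.x
 * priority ranges (PRI_MIN_TIMESHARE 160, PRI_MAX_TIMESHARE 223, so
 * SCHED_PRI_RANGE is 64):
 *
 *	SCHED_PRI_INTERACT(0)   == 0	-> user pri 160 + nice
 *	SCHED_PRI_INTERACT(50)  == 32	-> user pri 192 + nice
 *	SCHED_PRI_INTERACT(100) == 64	-> user pri 224 + nice, which
 *					   sched_priority() clamps back to
 *					   PRI_MAX_TIMESHARE
 */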

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(100)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determine the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority != kg->kg_user_pri ||			\
    SCHED_INTERACTIVE(kg))
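
/*
 * In effect SCHED_CURR() places a kse as follows:
 *
 *	td_priority != kg_user_pri	-> ksq_curr (the thread is running
 *					   at a lent priority, e.g. from
 *					   priority propagation)
 *	score < SCHED_INTERACT_THRESH	-> ksq_curr (interactive)
 *	otherwise			-> ksq_next
 */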

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_cpus;	/* Count of CPUs in this kseq. */
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
struct kseq	*kseq_idmap[MAXCPU];
#define	KSEQ_SELF()	(kseq_idmap[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(kseq_idmap[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq, int steal);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload:           %d\n", kseq->ksq_load);
	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}

static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much
 * at once can have an unpleasant effect on the system.  The scheduler
 * rarely has enough information to make perfect decisions.  So this
 * algorithm chooses simplicity and more gradual effects on load in larger
 * systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
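
/*
 * A worked example of the transfer size computed below: with a high load
 * of 7 and a low load of 2 after the ksq_cpus adjustment, diff is 5 and
 * move rounds up to 3, leaving the pair at loads 4 and 5.  One pass of
 * the callout, which reschedules itself every hz ticks, balances the two
 * queues to within one kse.
 */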
void
kseq_balance(void *arg)
{
	struct kseq *kseq;
	int high_load;
	int low_load;
	int high_cpu;
	int low_cpu;
	int move;
	int diff;
	int i;

	high_cpu = 0;
	low_cpu = 0;
	high_load = 0;
	low_load = -1;

	mtx_lock_spin(&sched_lock);
	if (smp_started == 0)
		goto out;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > high_load) {
			high_load = kseq->ksq_load;
			high_cpu = i;
		}
		if (low_load == -1 || kseq->ksq_load < low_load) {
			low_load = kseq->ksq_load;
			low_cpu = i;
		}
	}

	kseq = KSEQ_CPU(high_cpu);

	/*
	 * Nothing to do.
	 */
	if (high_load < kseq->ksq_cpus + 1)
		goto out;

	high_load -= kseq->ksq_cpus;

	if (low_load >= high_load)
		goto out;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;

	for (i = 0; i < move; i++)
		kseq_move(kseq, low_cpu);

out:
	mtx_unlock_spin(&sched_lock);
	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

	return;
}

struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	kseq = KSEQ_CPU(cpu);

	if (load > kseq->ksq_cpus)
		return (kseq);

	return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
	struct kse *ke;

	ke = kseq_choose(from, 1);
	runq_remove(ke->ke_runq, ke);
	ke->ke_state = KES_THREAD;
	kseq_rem(from, ke);

	ke->ke_cpu = cpu;
	sched_add(ke);
}
#endif

/*
 * Pick the highest priority task we have and return it.  If steal is 1 we
 * will return kses that have been denied slices due to their nice being too
 * low.  In the future we should prohibit stealing interrupt threads as well.
 */
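
/*
 * The loop below treats the two timeshare queues as current/next epochs:
 * kses that expire are requeued on ksq_next (see sched_clock()), and when
 * ksq_curr drains the two pointers are swapped, so a non-interactive kse
 * should not run again until the rest of its epoch has run.  If a second
 * swap still yields nothing, only the idle queue remains to be checked.
 */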
struct kse *
kseq_choose(struct kseq *kseq, int steal)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0 && steal == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	/* init kseqs */
	/* Create the idmap. */
#ifdef ULE_HTT_EXPERIMENTAL
	if (smp_topology == NULL) {
#else
	if (1) {
#endif
		for (i = 0; i < MAXCPU; i++) {
			kseq_setup(&kseq_cpu[i]);
			kseq_idmap[i] = &kseq_cpu[i];
			kseq_cpu[i].ksq_cpus = 1;
		}
	} else {
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			struct cpu_group *cg;

			cg = &smp_topology->ct_group[i];
			kseq_setup(&kseq_cpu[i]);

			for (j = 0; j < MAXCPU; j++)
				if ((cg->cg_mask & (1 << j)) != 0)
					kseq_idmap[j] = &kseq_cpu[i];
			kseq_cpu[i].ksq_cpus = cg->cg_count;
		}
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	kseq_balance(NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 *
	 */
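
	/*
	 * For a concrete feel for the numbers, assume hz = 1000 with the
	 * defaults set in sched_setup() (slice_min = 10, slice_max = 142,
	 * so SCHED_SLICE_RANGE is 133) and PRIO_TOTAL = 40, which makes
	 * SCHED_PRI_NTHRESH 19:
	 *
	 *	distance from nicemin  0	-> 142 tick slice
	 *	distance from nicemin 10	-> 142 - (10 * 133) / 19 = 72
	 *	distance from nicemin 19	-> 142 - 133 = 9
	 *	distance 20 or more		-> 0, outside the window
	 */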

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
	 */
	sched_interact_update(kg);

	return;
}

static void
sched_interact_update(struct ksegrp *kg)
{
	/* XXX Fixme, use a linear algorithm and not a while loop. */
	while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime = (kg->kg_runtime / 5) * 4;
		kg->kg_slptime = (kg->kg_slptime / 5) * 4;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}
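
/*
 * Example scores from sched_interact_score(): a kseg that slept three
 * times as long as it ran scores about 50 / 3 = 16, comfortably below
 * SCHED_INTERACT_THRESH (30); one that ran three times as long as it
 * slept scores about 100 - 16 = 84 and is treated as a CPU hog.  Note
 * that exactly equal, nonzero times also fall through to the final
 * return (0), not only the both-zero case the comment above mentions.
 */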

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	load = -1;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (load == -1 || kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		adjustrunqueue(td, prio);
	} else {
		td->td_priority = prio;
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		if (td->td_proc->p_flag & P_SA) {
			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
			setrunqueue(td);
		} else {
			/*
			 * This queue is always correct except for idle
			 * threads which have a higher priority due to
			 * priority propagation.
			 */
			if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
			    ke->ke_thread->td_priority > PRI_MIN_IDLE)
				ke->ke_runq = KSEQ_SELF()->ksq_curr;
			runq_add(ke->ke_runq, ke);
			/* setrunqueue(td); */
		}
		return;
	}
	if (ke->ke_runq)
		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
	/*
	 * We will not be on the run queue.  So we must be
	 * sleeping or similar.
	 */
	if (td->td_proc->p_flag & P_SA)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_interact_update(kg);
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}
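
/*
 * A note on the << 10 seen above and in sched_clock(): kg_slptime and
 * kg_runtime hold tick counts in fixed point with 10 fractional bits,
 * i.e. units of 1/1024 of a tick.  SCHED_SLP_RUN_MAX ((hz * 5) << 10)
 * is five seconds of ticks in the same units, and the KTR traces shift
 * the values >> 10 to print whole ticks.
 */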

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */

	child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE;
	child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE;
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
#if 0
	struct kse *nke;
#endif

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}
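
	/*
	 * With the common hz = 1000 and stathz = 128, for example, this
	 * yields tickincr = 7, so each stat clock tick is charged as seven
	 * hz ticks of run time below.  The truncation is tolerated; the
	 * tickincr == 0 guard covers stathz values larger than hz.
	 */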

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
	 */
	kseq = KSEQ_SELF();
#if 0
	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq, 0)) != NULL) {
		if (sched_strict &&
		    nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
		else if (nke->ke_thread->td_priority <
		    td->td_priority SCHED_PRIO_SLOP)

		if (nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
	}
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		goto out;
#ifdef SMP
	/*
	 * For SMP we may steal another processor's KSEs.  Just search until
	 * we verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > kseq->ksq_cpus)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
#if 0
	struct kseq *kseq;
	struct kse *ke;
#endif

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		/*
		 * This optimization is temporarily disabled because it
		 * breaks priority propagation.
		 */
#if 0
		kseq = KSEQ_SELF();
		if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
#ifdef SMP
		    kseq->ksq_load > kseq->ksq_cpus &&
#else
		    kseq->ksq_load > 1 &&
#endif
		    (ke = kseq_choose(kseq, 0)) != NULL &&
		    ke->ke_thread->td_priority < td->td_priority)
#endif
			curthread->td_flags |= TDF_NEEDRESCHED;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq, 0);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
		 */
		kseq_move(kseq, PCPU_GET(cpuid));
		goto retry;
	}
#endif

	return (NULL);
}

void
sched_add(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	ke = td->td_kse;
	kg = td->td_ksegrp;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority > PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);

		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}