sched_ule.c revision 139316
1/*- 2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 139316 2004-12-26 00:15:33Z jeff $"); 29 30#include <opt_sched.h> 31 32#define kse td_sched 33 34#include <sys/param.h> 35#include <sys/systm.h> 36#include <sys/kdb.h> 37#include <sys/kernel.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mutex.h> 41#include <sys/proc.h> 42#include <sys/resource.h> 43#include <sys/resourcevar.h> 44#include <sys/sched.h> 45#include <sys/smp.h> 46#include <sys/sx.h> 47#include <sys/sysctl.h> 48#include <sys/sysproto.h> 49#include <sys/vmmeter.h> 50#ifdef KTRACE 51#include <sys/uio.h> 52#include <sys/ktrace.h> 53#endif 54 55#include <machine/cpu.h> 56#include <machine/smp.h> 57 58/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 59/* XXX This is bogus compatability crap for ps */ 60static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 61SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 62 63static void sched_setup(void *dummy); 64SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 65 66static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 67 68SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 69 "Scheduler name"); 70 71static int slice_min = 1; 72SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 73 74static int slice_max = 10; 75SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 76 77int realstathz; 78int tickincr = 1; 79 80/* 81 * The schedulable entity that can be given a context to run. 82 * A process may have several of these. Probably one per processor 83 * but posibly a few more. In this universe they are grouped 84 * with a KSEG that contains the priority and niceness 85 * for the group. 86 */ 87struct kse { 88 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ 89 int ke_flags; /* (j) KEF_* flags. */ 90 struct thread *ke_thread; /* (*) Active associated thread. */ 91 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ 92 char ke_rqindex; /* (j) Run queue index. */ 93 enum { 94 KES_THREAD = 0x0, /* slaved to thread state */ 95 KES_ONRUNQ 96 } ke_state; /* (j) thread sched specific status. */ 97 int ke_slptime; 98 int ke_slice; 99 struct runq *ke_runq; 100 u_char ke_cpu; /* CPU that we have affinity for. */ 101 /* The following variables are only used for pctcpu calculation */ 102 int ke_ltick; /* Last tick that we were running on */ 103 int ke_ftick; /* First tick that we were running on */ 104 int ke_ticks; /* Tick count */ 105 106}; 107 108 109#define td_kse td_sched 110#define td_slptime td_kse->ke_slptime 111#define ke_proc ke_thread->td_proc 112#define ke_ksegrp ke_thread->td_ksegrp 113 114/* flags kept in ke_flags */ 115#define KEF_SCHED0 0x00001 /* For scheduler-specific use. */ 116#define KEF_SCHED1 0x00002 /* For scheduler-specific use. */ 117#define KEF_SCHED2 0x00004 /* For scheduler-specific use. */ 118#define KEF_SCHED3 0x00008 /* For scheduler-specific use. */ 119#define KEF_SCHED4 0x00010 120#define KEF_SCHED5 0x00020 121#define KEF_DIDRUN 0x02000 /* Thread actually ran. */ 122#define KEF_EXIT 0x04000 /* Thread is being killed. */ 123 124/* 125 * These datastructures are allocated within their parent datastructure but 126 * are scheduler specific. 127 */ 128 129#define ke_assign ke_procq.tqe_next 130 131#define KEF_ASSIGNED KEF_SCHED0 /* Thread is being migrated. */ 132#define KEF_BOUND KEF_SCHED1 /* Thread can not migrate. */ 133#define KEF_XFERABLE KEF_SCHED2 /* Thread was added as transferable. */ 134#define KEF_HOLD KEF_SCHED3 /* Thread is temporarily bound. */ 135#define KEF_REMOVED KEF_SCHED4 /* Thread was removed while ASSIGNED */ 136#define KEF_PRIOELEV KEF_SCHED5 /* Thread has had its prio elevated. */ 137 138struct kg_sched { 139 struct thread *skg_last_assigned; /* (j) Last thread assigned to */ 140 /* the system scheduler */ 141 int skg_slptime; /* Number of ticks we vol. slept */ 142 int skg_runtime; /* Number of ticks we were running */ 143 int skg_avail_opennings; /* (j) Num unfilled slots in group.*/ 144 int skg_concurrency; /* (j) Num threads requested in group.*/ 145}; 146#define kg_last_assigned kg_sched->skg_last_assigned 147#define kg_avail_opennings kg_sched->skg_avail_opennings 148#define kg_concurrency kg_sched->skg_concurrency 149#define kg_runtime kg_sched->skg_runtime 150#define kg_slptime kg_sched->skg_slptime 151 152#define SLOT_RELEASE(kg) \ 153do { \ 154 kg->kg_avail_opennings++; \ 155 CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \ 156 kg, \ 157 kg->kg_concurrency, \ 158 kg->kg_avail_opennings); \ 159 /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \ 160 ("slots out of whack")); */ \ 161} while (0) 162 163#define SLOT_USE(kg) \ 164do { \ 165 kg->kg_avail_opennings--; \ 166 CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \ 167 kg, \ 168 kg->kg_concurrency, \ 169 kg->kg_avail_opennings); \ 170 /*KASSERT((kg->kg_avail_opennings >= 0), \ 171 ("slots out of whack"));*/ \ 172} while (0) 173 174static struct kse kse0; 175static struct kg_sched kg_sched0; 176 177/* 178 * The priority is primarily determined by the interactivity score. Thus, we 179 * give lower(better) priorities to kse groups that use less CPU. The nice 180 * value is then directly added to this to allow nice to have some effect 181 * on latency. 182 * 183 * PRI_RANGE: Total priority range for timeshare threads. 184 * PRI_NRESV: Number of nice values. 185 * PRI_BASE: The start of the dynamic range. 186 */ 187#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 188#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 189#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 190#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 191#define SCHED_PRI_INTERACT(score) \ 192 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 193 194/* 195 * These determine the interactivity of a process. 196 * 197 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 198 * before throttling back. 199 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 200 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 201 * INTERACT_THRESH: Threshhold for placement on the current runq. 202 */ 203#define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 204#define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 205#define SCHED_INTERACT_MAX (100) 206#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 207#define SCHED_INTERACT_THRESH (30) 208 209/* 210 * These parameters and macros determine the size of the time slice that is 211 * granted to each thread. 212 * 213 * SLICE_MIN: Minimum time slice granted, in units of ticks. 214 * SLICE_MAX: Maximum time slice granted. 215 * SLICE_RANGE: Range of available time slices scaled by hz. 216 * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 217 * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 218 * SLICE_NTHRESH: The nice cutoff point for slice assignment. 219 */ 220#define SCHED_SLICE_MIN (slice_min) 221#define SCHED_SLICE_MAX (slice_max) 222#define SCHED_SLICE_INTERACTIVE (slice_max) 223#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 224#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 225#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 226#define SCHED_SLICE_NICE(nice) \ 227 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 228 229/* 230 * This macro determines whether or not the thread belongs on the current or 231 * next run queue. 232 */ 233#define SCHED_INTERACTIVE(kg) \ 234 (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 235#define SCHED_CURR(kg, ke) \ 236 ((ke->ke_flags & KEF_PRIOELEV) || SCHED_INTERACTIVE(kg)) 237 238/* 239 * Cpu percentage computation macros and defines. 240 * 241 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 242 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 243 */ 244 245#define SCHED_CPU_TIME 10 246#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 247 248/* 249 * kseq - per processor runqs and statistics. 250 */ 251struct kseq { 252 struct runq ksq_idle; /* Queue of IDLE threads. */ 253 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 254 struct runq *ksq_next; /* Next timeshare queue. */ 255 struct runq *ksq_curr; /* Current queue. */ 256 int ksq_load_timeshare; /* Load for timeshare. */ 257 int ksq_load; /* Aggregate load. */ 258 short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 259 short ksq_nicemin; /* Least nice. */ 260#ifdef SMP 261 int ksq_transferable; 262 LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 263 struct kseq_group *ksq_group; /* Our processor group. */ 264 volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 265#else 266 int ksq_sysload; /* For loadavg, !ITHD load. */ 267#endif 268}; 269 270#ifdef SMP 271/* 272 * kseq groups are groups of processors which can cheaply share threads. When 273 * one processor in the group goes idle it will check the runqs of the other 274 * processors in its group prior to halting and waiting for an interrupt. 275 * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 276 * In a numa environment we'd want an idle bitmap per group and a two tiered 277 * load balancer. 278 */ 279struct kseq_group { 280 int ksg_cpus; /* Count of CPUs in this kseq group. */ 281 cpumask_t ksg_cpumask; /* Mask of cpus in this group. */ 282 cpumask_t ksg_idlemask; /* Idle cpus in this group. */ 283 cpumask_t ksg_mask; /* Bit mask for first cpu. */ 284 int ksg_load; /* Total load of this group. */ 285 int ksg_transferable; /* Transferable load of this group. */ 286 LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 287}; 288#endif 289 290/* 291 * One kse queue per processor. 292 */ 293#ifdef SMP 294static cpumask_t kseq_idle; 295static int ksg_maxid; 296static struct kseq kseq_cpu[MAXCPU]; 297static struct kseq_group kseq_groups[MAXCPU]; 298static int bal_tick; 299static int gbal_tick; 300 301#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 302#define KSEQ_CPU(x) (&kseq_cpu[(x)]) 303#define KSEQ_ID(x) ((x) - kseq_cpu) 304#define KSEQ_GROUP(x) (&kseq_groups[(x)]) 305#else /* !SMP */ 306static struct kseq kseq_cpu; 307 308#define KSEQ_SELF() (&kseq_cpu) 309#define KSEQ_CPU(x) (&kseq_cpu) 310#endif 311 312static void slot_fill(struct ksegrp *kg); 313static struct kse *sched_choose(void); /* XXX Should be thread * */ 314static void sched_add_internal(struct thread *td, int preemptive); 315static void sched_slice(struct kse *ke); 316static void sched_priority(struct ksegrp *kg); 317static int sched_interact_score(struct ksegrp *kg); 318static void sched_interact_update(struct ksegrp *kg); 319static void sched_interact_fork(struct ksegrp *kg); 320static void sched_pctcpu_update(struct kse *ke); 321 322/* Operations on per processor queues */ 323static struct kse * kseq_choose(struct kseq *kseq); 324static void kseq_setup(struct kseq *kseq); 325static void kseq_load_add(struct kseq *kseq, struct kse *ke); 326static void kseq_load_rem(struct kseq *kseq, struct kse *ke); 327static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke); 328static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 329static void kseq_nice_add(struct kseq *kseq, int nice); 330static void kseq_nice_rem(struct kseq *kseq, int nice); 331void kseq_print(int cpu); 332#ifdef SMP 333static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 334static struct kse *runq_steal(struct runq *rq); 335static void sched_balance(void); 336static void sched_balance_groups(void); 337static void sched_balance_group(struct kseq_group *ksg); 338static void sched_balance_pair(struct kseq *high, struct kseq *low); 339static void kseq_move(struct kseq *from, int cpu); 340static int kseq_idled(struct kseq *kseq); 341static void kseq_notify(struct kse *ke, int cpu); 342static void kseq_assign(struct kseq *); 343static struct kse *kseq_steal(struct kseq *kseq, int stealidle); 344/* 345 * On P4 Xeons the round-robin interrupt delivery is broken. As a result of 346 * this, we can't pin interrupts to the cpu that they were delivered to, 347 * otherwise all ithreads only run on CPU 0. 348 */ 349#ifdef __i386__ 350#define KSE_CAN_MIGRATE(ke, class) \ 351 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 352#else /* !__i386__ */ 353#define KSE_CAN_MIGRATE(ke, class) \ 354 ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ 355 ((ke)->ke_flags & KEF_BOUND) == 0) 356#endif /* !__i386__ */ 357#endif 358 359void 360kseq_print(int cpu) 361{ 362 struct kseq *kseq; 363 int i; 364 365 kseq = KSEQ_CPU(cpu); 366 367 printf("kseq:\n"); 368 printf("\tload: %d\n", kseq->ksq_load); 369 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 370#ifdef SMP 371 printf("\tload transferable: %d\n", kseq->ksq_transferable); 372#endif 373 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 374 printf("\tnice counts:\n"); 375 for (i = 0; i < SCHED_PRI_NRESV; i++) 376 if (kseq->ksq_nice[i]) 377 printf("\t\t%d = %d\n", 378 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 379} 380 381static __inline void 382kseq_runq_add(struct kseq *kseq, struct kse *ke) 383{ 384#ifdef SMP 385 if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 386 kseq->ksq_transferable++; 387 kseq->ksq_group->ksg_transferable++; 388 ke->ke_flags |= KEF_XFERABLE; 389 } 390#endif 391 runq_add(ke->ke_runq, ke, 0); 392} 393 394static __inline void 395kseq_runq_rem(struct kseq *kseq, struct kse *ke) 396{ 397#ifdef SMP 398 if (ke->ke_flags & KEF_XFERABLE) { 399 kseq->ksq_transferable--; 400 kseq->ksq_group->ksg_transferable--; 401 ke->ke_flags &= ~KEF_XFERABLE; 402 } 403#endif 404 runq_remove(ke->ke_runq, ke); 405} 406 407static void 408kseq_load_add(struct kseq *kseq, struct kse *ke) 409{ 410 int class; 411 mtx_assert(&sched_lock, MA_OWNED); 412 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 413 if (class == PRI_TIMESHARE) 414 kseq->ksq_load_timeshare++; 415 kseq->ksq_load++; 416 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 417 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 418#ifdef SMP 419 kseq->ksq_group->ksg_load++; 420#else 421 kseq->ksq_sysload++; 422#endif 423 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 424 kseq_nice_add(kseq, ke->ke_proc->p_nice); 425} 426 427static void 428kseq_load_rem(struct kseq *kseq, struct kse *ke) 429{ 430 int class; 431 mtx_assert(&sched_lock, MA_OWNED); 432 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 433 if (class == PRI_TIMESHARE) 434 kseq->ksq_load_timeshare--; 435 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 436#ifdef SMP 437 kseq->ksq_group->ksg_load--; 438#else 439 kseq->ksq_sysload--; 440#endif 441 kseq->ksq_load--; 442 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 443 ke->ke_runq = NULL; 444 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 445 kseq_nice_rem(kseq, ke->ke_proc->p_nice); 446} 447 448static void 449kseq_nice_add(struct kseq *kseq, int nice) 450{ 451 mtx_assert(&sched_lock, MA_OWNED); 452 /* Normalize to zero. */ 453 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 454 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 455 kseq->ksq_nicemin = nice; 456} 457 458static void 459kseq_nice_rem(struct kseq *kseq, int nice) 460{ 461 int n; 462 463 mtx_assert(&sched_lock, MA_OWNED); 464 /* Normalize to zero. */ 465 n = nice + SCHED_PRI_NHALF; 466 kseq->ksq_nice[n]--; 467 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 468 469 /* 470 * If this wasn't the smallest nice value or there are more in 471 * this bucket we can just return. Otherwise we have to recalculate 472 * the smallest nice. 473 */ 474 if (nice != kseq->ksq_nicemin || 475 kseq->ksq_nice[n] != 0 || 476 kseq->ksq_load_timeshare == 0) 477 return; 478 479 for (; n < SCHED_PRI_NRESV; n++) 480 if (kseq->ksq_nice[n]) { 481 kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 482 return; 483 } 484} 485 486#ifdef SMP 487/* 488 * sched_balance is a simple CPU load balancing algorithm. It operates by 489 * finding the least loaded and most loaded cpu and equalizing their load 490 * by migrating some processes. 491 * 492 * Dealing only with two CPUs at a time has two advantages. Firstly, most 493 * installations will only have 2 cpus. Secondly, load balancing too much at 494 * once can have an unpleasant effect on the system. The scheduler rarely has 495 * enough information to make perfect decisions. So this algorithm chooses 496 * algorithm simplicity and more gradual effects on load in larger systems. 497 * 498 * It could be improved by considering the priorities and slices assigned to 499 * each task prior to balancing them. There are many pathological cases with 500 * any approach and so the semi random algorithm below may work as well as any. 501 * 502 */ 503static void 504sched_balance(void) 505{ 506 struct kseq_group *high; 507 struct kseq_group *low; 508 struct kseq_group *ksg; 509 int cnt; 510 int i; 511 512 if (smp_started == 0) 513 goto out; 514 low = high = NULL; 515 i = random() % (ksg_maxid + 1); 516 for (cnt = 0; cnt <= ksg_maxid; cnt++) { 517 ksg = KSEQ_GROUP(i); 518 /* 519 * Find the CPU with the highest load that has some 520 * threads to transfer. 521 */ 522 if ((high == NULL || ksg->ksg_load > high->ksg_load) 523 && ksg->ksg_transferable) 524 high = ksg; 525 if (low == NULL || ksg->ksg_load < low->ksg_load) 526 low = ksg; 527 if (++i > ksg_maxid) 528 i = 0; 529 } 530 if (low != NULL && high != NULL && high != low) 531 sched_balance_pair(LIST_FIRST(&high->ksg_members), 532 LIST_FIRST(&low->ksg_members)); 533out: 534 bal_tick = ticks + (random() % (hz * 2)); 535} 536 537static void 538sched_balance_groups(void) 539{ 540 int i; 541 542 mtx_assert(&sched_lock, MA_OWNED); 543 if (smp_started) 544 for (i = 0; i <= ksg_maxid; i++) 545 sched_balance_group(KSEQ_GROUP(i)); 546 gbal_tick = ticks + (random() % (hz * 2)); 547} 548 549static void 550sched_balance_group(struct kseq_group *ksg) 551{ 552 struct kseq *kseq; 553 struct kseq *high; 554 struct kseq *low; 555 int load; 556 557 if (ksg->ksg_transferable == 0) 558 return; 559 low = NULL; 560 high = NULL; 561 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 562 load = kseq->ksq_load; 563 if (high == NULL || load > high->ksq_load) 564 high = kseq; 565 if (low == NULL || load < low->ksq_load) 566 low = kseq; 567 } 568 if (high != NULL && low != NULL && high != low) 569 sched_balance_pair(high, low); 570} 571 572static void 573sched_balance_pair(struct kseq *high, struct kseq *low) 574{ 575 int transferable; 576 int high_load; 577 int low_load; 578 int move; 579 int diff; 580 int i; 581 582 /* 583 * If we're transfering within a group we have to use this specific 584 * kseq's transferable count, otherwise we can steal from other members 585 * of the group. 586 */ 587 if (high->ksq_group == low->ksq_group) { 588 transferable = high->ksq_transferable; 589 high_load = high->ksq_load; 590 low_load = low->ksq_load; 591 } else { 592 transferable = high->ksq_group->ksg_transferable; 593 high_load = high->ksq_group->ksg_load; 594 low_load = low->ksq_group->ksg_load; 595 } 596 if (transferable == 0) 597 return; 598 /* 599 * Determine what the imbalance is and then adjust that to how many 600 * kses we actually have to give up (transferable). 601 */ 602 diff = high_load - low_load; 603 move = diff / 2; 604 if (diff & 0x1) 605 move++; 606 move = min(move, transferable); 607 for (i = 0; i < move; i++) 608 kseq_move(high, KSEQ_ID(low)); 609 return; 610} 611 612static void 613kseq_move(struct kseq *from, int cpu) 614{ 615 struct kseq *kseq; 616 struct kseq *to; 617 struct kse *ke; 618 619 kseq = from; 620 to = KSEQ_CPU(cpu); 621 ke = kseq_steal(kseq, 1); 622 if (ke == NULL) { 623 struct kseq_group *ksg; 624 625 ksg = kseq->ksq_group; 626 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 627 if (kseq == from || kseq->ksq_transferable == 0) 628 continue; 629 ke = kseq_steal(kseq, 1); 630 break; 631 } 632 if (ke == NULL) 633 panic("kseq_move: No KSEs available with a " 634 "transferable count of %d\n", 635 ksg->ksg_transferable); 636 } 637 if (kseq == to) 638 return; 639 ke->ke_state = KES_THREAD; 640 kseq_runq_rem(kseq, ke); 641 kseq_load_rem(kseq, ke); 642 kseq_notify(ke, cpu); 643} 644 645static int 646kseq_idled(struct kseq *kseq) 647{ 648 struct kseq_group *ksg; 649 struct kseq *steal; 650 struct kse *ke; 651 652 ksg = kseq->ksq_group; 653 /* 654 * If we're in a cpu group, try and steal kses from another cpu in 655 * the group before idling. 656 */ 657 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 658 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 659 if (steal == kseq || steal->ksq_transferable == 0) 660 continue; 661 ke = kseq_steal(steal, 0); 662 if (ke == NULL) 663 continue; 664 ke->ke_state = KES_THREAD; 665 kseq_runq_rem(steal, ke); 666 kseq_load_rem(steal, ke); 667 ke->ke_cpu = PCPU_GET(cpuid); 668 sched_add_internal(ke->ke_thread, 0); 669 return (0); 670 } 671 } 672 /* 673 * We only set the idled bit when all of the cpus in the group are 674 * idle. Otherwise we could get into a situation where a KSE bounces 675 * back and forth between two idle cores on seperate physical CPUs. 676 */ 677 ksg->ksg_idlemask |= PCPU_GET(cpumask); 678 if (ksg->ksg_idlemask != ksg->ksg_cpumask) 679 return (1); 680 atomic_set_int(&kseq_idle, ksg->ksg_mask); 681 return (1); 682} 683 684static void 685kseq_assign(struct kseq *kseq) 686{ 687 struct kse *nke; 688 struct kse *ke; 689 690 do { 691 *(volatile struct kse **)&ke = kseq->ksq_assigned; 692 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 693 for (; ke != NULL; ke = nke) { 694 nke = ke->ke_assign; 695 ke->ke_flags &= ~KEF_ASSIGNED; 696 SLOT_RELEASE(ke->ke_thread->td_ksegrp); 697 sched_add_internal(ke->ke_thread, 0); 698 } 699} 700 701static void 702kseq_notify(struct kse *ke, int cpu) 703{ 704 struct kseq *kseq; 705 struct thread *td; 706 struct pcpu *pcpu; 707 int prio; 708 709 ke->ke_cpu = cpu; 710 ke->ke_flags |= KEF_ASSIGNED; 711 SLOT_USE(ke->ke_thread->td_ksegrp); 712 prio = ke->ke_thread->td_priority; 713 714 kseq = KSEQ_CPU(cpu); 715 716 /* 717 * Place a KSE on another cpu's queue and force a resched. 718 */ 719 do { 720 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned; 721 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 722 /* 723 * Without sched_lock we could lose a race where we set NEEDRESCHED 724 * on a thread that is switched out before the IPI is delivered. This 725 * would lead us to miss the resched. This will be a problem once 726 * sched_lock is pushed down. 727 */ 728 pcpu = pcpu_find(cpu); 729 td = pcpu->pc_curthread; 730 if (ke->ke_thread->td_priority < td->td_priority || 731 td == pcpu->pc_idlethread) { 732 td->td_flags |= TDF_NEEDRESCHED; 733 ipi_selected(1 << cpu, IPI_AST); 734 } 735} 736 737static struct kse * 738runq_steal(struct runq *rq) 739{ 740 struct rqhead *rqh; 741 struct rqbits *rqb; 742 struct kse *ke; 743 int word; 744 int bit; 745 746 mtx_assert(&sched_lock, MA_OWNED); 747 rqb = &rq->rq_status; 748 for (word = 0; word < RQB_LEN; word++) { 749 if (rqb->rqb_bits[word] == 0) 750 continue; 751 for (bit = 0; bit < RQB_BPW; bit++) { 752 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 753 continue; 754 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 755 TAILQ_FOREACH(ke, rqh, ke_procq) { 756 if (KSE_CAN_MIGRATE(ke, 757 PRI_BASE(ke->ke_ksegrp->kg_pri_class))) 758 return (ke); 759 } 760 } 761 } 762 return (NULL); 763} 764 765static struct kse * 766kseq_steal(struct kseq *kseq, int stealidle) 767{ 768 struct kse *ke; 769 770 /* 771 * Steal from next first to try to get a non-interactive task that 772 * may not have run for a while. 773 */ 774 if ((ke = runq_steal(kseq->ksq_next)) != NULL) 775 return (ke); 776 if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 777 return (ke); 778 if (stealidle) 779 return (runq_steal(&kseq->ksq_idle)); 780 return (NULL); 781} 782 783int 784kseq_transfer(struct kseq *kseq, struct kse *ke, int class) 785{ 786 struct kseq_group *ksg; 787 int cpu; 788 789 if (smp_started == 0) 790 return (0); 791 cpu = 0; 792 /* 793 * If our load exceeds a certain threshold we should attempt to 794 * reassign this thread. The first candidate is the cpu that 795 * originally ran the thread. If it is idle, assign it there, 796 * otherwise, pick an idle cpu. 797 * 798 * The threshold at which we start to reassign kses has a large impact 799 * on the overall performance of the system. Tuned too high and 800 * some CPUs may idle. Too low and there will be excess migration 801 * and context switches. 802 */ 803 ksg = kseq->ksq_group; 804 if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) { 805 ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group; 806 if (kseq_idle & ksg->ksg_mask) { 807 cpu = ffs(ksg->ksg_idlemask); 808 if (cpu) 809 goto migrate; 810 } 811 /* 812 * Multiple cpus could find this bit simultaneously 813 * but the race shouldn't be terrible. 814 */ 815 cpu = ffs(kseq_idle); 816 if (cpu) 817 goto migrate; 818 } 819 /* 820 * If another cpu in this group has idled, assign a thread over 821 * to them after checking to see if there are idled groups. 822 */ 823 ksg = kseq->ksq_group; 824 if (ksg->ksg_idlemask) { 825 cpu = ffs(ksg->ksg_idlemask); 826 if (cpu) 827 goto migrate; 828 } 829 /* 830 * No new CPU was found. 831 */ 832 return (0); 833migrate: 834 /* 835 * Now that we've found an idle CPU, migrate the thread. 836 */ 837 cpu--; 838 ke->ke_runq = NULL; 839 kseq_notify(ke, cpu); 840 841 return (1); 842} 843 844#endif /* SMP */ 845 846/* 847 * Pick the highest priority task we have and return it. 848 */ 849 850static struct kse * 851kseq_choose(struct kseq *kseq) 852{ 853 struct runq *swap; 854 struct kse *ke; 855 int nice; 856 857 mtx_assert(&sched_lock, MA_OWNED); 858 swap = NULL; 859 860 for (;;) { 861 ke = runq_choose(kseq->ksq_curr); 862 if (ke == NULL) { 863 /* 864 * We already swapped once and didn't get anywhere. 865 */ 866 if (swap) 867 break; 868 swap = kseq->ksq_curr; 869 kseq->ksq_curr = kseq->ksq_next; 870 kseq->ksq_next = swap; 871 continue; 872 } 873 /* 874 * If we encounter a slice of 0 the kse is in a 875 * TIMESHARE kse group and its nice was too far out 876 * of the range that receives slices. 877 */ 878 nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin); 879 if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH && 880 ke->ke_proc->p_nice != 0)) { 881 runq_remove(ke->ke_runq, ke); 882 sched_slice(ke); 883 ke->ke_runq = kseq->ksq_next; 884 runq_add(ke->ke_runq, ke, 0); 885 continue; 886 } 887 return (ke); 888 } 889 890 return (runq_choose(&kseq->ksq_idle)); 891} 892 893static void 894kseq_setup(struct kseq *kseq) 895{ 896 runq_init(&kseq->ksq_timeshare[0]); 897 runq_init(&kseq->ksq_timeshare[1]); 898 runq_init(&kseq->ksq_idle); 899 kseq->ksq_curr = &kseq->ksq_timeshare[0]; 900 kseq->ksq_next = &kseq->ksq_timeshare[1]; 901 kseq->ksq_load = 0; 902 kseq->ksq_load_timeshare = 0; 903} 904 905static void 906sched_setup(void *dummy) 907{ 908#ifdef SMP 909 int balance_groups; 910 int i; 911#endif 912 913 slice_min = (hz/100); /* 10ms */ 914 slice_max = (hz/7); /* ~140ms */ 915 916#ifdef SMP 917 balance_groups = 0; 918 /* 919 * Initialize the kseqs. 920 */ 921 for (i = 0; i < MAXCPU; i++) { 922 struct kseq *ksq; 923 924 ksq = &kseq_cpu[i]; 925 ksq->ksq_assigned = NULL; 926 kseq_setup(&kseq_cpu[i]); 927 } 928 if (smp_topology == NULL) { 929 struct kseq_group *ksg; 930 struct kseq *ksq; 931 932 for (i = 0; i < MAXCPU; i++) { 933 ksq = &kseq_cpu[i]; 934 ksg = &kseq_groups[i]; 935 /* 936 * Setup a kseq group with one member. 937 */ 938 ksq->ksq_transferable = 0; 939 ksq->ksq_group = ksg; 940 ksg->ksg_cpus = 1; 941 ksg->ksg_idlemask = 0; 942 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 943 ksg->ksg_load = 0; 944 ksg->ksg_transferable = 0; 945 LIST_INIT(&ksg->ksg_members); 946 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 947 } 948 } else { 949 struct kseq_group *ksg; 950 struct cpu_group *cg; 951 int j; 952 953 for (i = 0; i < smp_topology->ct_count; i++) { 954 cg = &smp_topology->ct_group[i]; 955 ksg = &kseq_groups[i]; 956 /* 957 * Initialize the group. 958 */ 959 ksg->ksg_idlemask = 0; 960 ksg->ksg_load = 0; 961 ksg->ksg_transferable = 0; 962 ksg->ksg_cpus = cg->cg_count; 963 ksg->ksg_cpumask = cg->cg_mask; 964 LIST_INIT(&ksg->ksg_members); 965 /* 966 * Find all of the group members and add them. 967 */ 968 for (j = 0; j < MAXCPU; j++) { 969 if ((cg->cg_mask & (1 << j)) != 0) { 970 if (ksg->ksg_mask == 0) 971 ksg->ksg_mask = 1 << j; 972 kseq_cpu[j].ksq_transferable = 0; 973 kseq_cpu[j].ksq_group = ksg; 974 LIST_INSERT_HEAD(&ksg->ksg_members, 975 &kseq_cpu[j], ksq_siblings); 976 } 977 } 978 if (ksg->ksg_cpus > 1) 979 balance_groups = 1; 980 } 981 ksg_maxid = smp_topology->ct_count - 1; 982 } 983 /* 984 * Stagger the group and global load balancer so they do not 985 * interfere with each other. 986 */ 987 bal_tick = ticks + hz; 988 if (balance_groups) 989 gbal_tick = ticks + (hz / 2); 990#else 991 kseq_setup(KSEQ_SELF()); 992#endif 993 mtx_lock_spin(&sched_lock); 994 kseq_load_add(KSEQ_SELF(), &kse0); 995 mtx_unlock_spin(&sched_lock); 996} 997 998/* 999 * Scale the scheduling priority according to the "interactivity" of this 1000 * process. 1001 */ 1002static void 1003sched_priority(struct ksegrp *kg) 1004{ 1005 int pri; 1006 1007 if (kg->kg_pri_class != PRI_TIMESHARE) 1008 return; 1009 1010 pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 1011 pri += SCHED_PRI_BASE; 1012 pri += kg->kg_proc->p_nice; 1013 1014 if (pri > PRI_MAX_TIMESHARE) 1015 pri = PRI_MAX_TIMESHARE; 1016 else if (pri < PRI_MIN_TIMESHARE) 1017 pri = PRI_MIN_TIMESHARE; 1018 1019 kg->kg_user_pri = pri; 1020 1021 return; 1022} 1023 1024/* 1025 * Calculate a time slice based on the properties of the kseg and the runq 1026 * that we're on. This is only for PRI_TIMESHARE ksegrps. 1027 */ 1028static void 1029sched_slice(struct kse *ke) 1030{ 1031 struct kseq *kseq; 1032 struct ksegrp *kg; 1033 1034 kg = ke->ke_ksegrp; 1035 kseq = KSEQ_CPU(ke->ke_cpu); 1036 1037 if (ke->ke_flags & KEF_PRIOELEV) { 1038 ke->ke_slice = SCHED_SLICE_MIN; 1039 return; 1040 } 1041 1042 /* 1043 * Rationale: 1044 * KSEs in interactive ksegs get a minimal slice so that we 1045 * quickly notice if it abuses its advantage. 1046 * 1047 * KSEs in non-interactive ksegs are assigned a slice that is 1048 * based on the ksegs nice value relative to the least nice kseg 1049 * on the run queue for this cpu. 1050 * 1051 * If the KSE is less nice than all others it gets the maximum 1052 * slice and other KSEs will adjust their slice relative to 1053 * this when they first expire. 1054 * 1055 * There is 20 point window that starts relative to the least 1056 * nice kse on the run queue. Slice size is determined by 1057 * the kse distance from the last nice ksegrp. 1058 * 1059 * If the kse is outside of the window it will get no slice 1060 * and will be reevaluated each time it is selected on the 1061 * run queue. The exception to this is nice 0 ksegs when 1062 * a nice -20 is running. They are always granted a minimum 1063 * slice. 1064 */ 1065 if (!SCHED_INTERACTIVE(kg)) { 1066 int nice; 1067 1068 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin); 1069 if (kseq->ksq_load_timeshare == 0 || 1070 kg->kg_proc->p_nice < kseq->ksq_nicemin) 1071 ke->ke_slice = SCHED_SLICE_MAX; 1072 else if (nice <= SCHED_SLICE_NTHRESH) 1073 ke->ke_slice = SCHED_SLICE_NICE(nice); 1074 else if (kg->kg_proc->p_nice == 0) 1075 ke->ke_slice = SCHED_SLICE_MIN; 1076 else 1077 ke->ke_slice = 0; 1078 } else 1079 ke->ke_slice = SCHED_SLICE_INTERACTIVE; 1080 1081 return; 1082} 1083 1084/* 1085 * This routine enforces a maximum limit on the amount of scheduling history 1086 * kept. It is called after either the slptime or runtime is adjusted. 1087 * This routine will not operate correctly when slp or run times have been 1088 * adjusted to more than double their maximum. 1089 */ 1090static void 1091sched_interact_update(struct ksegrp *kg) 1092{ 1093 int sum; 1094 1095 sum = kg->kg_runtime + kg->kg_slptime; 1096 if (sum < SCHED_SLP_RUN_MAX) 1097 return; 1098 /* 1099 * If we have exceeded by more than 1/5th then the algorithm below 1100 * will not bring us back into range. Dividing by two here forces 1101 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1102 */ 1103 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1104 kg->kg_runtime /= 2; 1105 kg->kg_slptime /= 2; 1106 return; 1107 } 1108 kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1109 kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1110} 1111 1112static void 1113sched_interact_fork(struct ksegrp *kg) 1114{ 1115 int ratio; 1116 int sum; 1117 1118 sum = kg->kg_runtime + kg->kg_slptime; 1119 if (sum > SCHED_SLP_RUN_FORK) { 1120 ratio = sum / SCHED_SLP_RUN_FORK; 1121 kg->kg_runtime /= ratio; 1122 kg->kg_slptime /= ratio; 1123 } 1124} 1125 1126static int 1127sched_interact_score(struct ksegrp *kg) 1128{ 1129 int div; 1130 1131 if (kg->kg_runtime > kg->kg_slptime) { 1132 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1133 return (SCHED_INTERACT_HALF + 1134 (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1135 } if (kg->kg_slptime > kg->kg_runtime) { 1136 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1137 return (kg->kg_runtime / div); 1138 } 1139 1140 /* 1141 * This can happen if slptime and runtime are 0. 1142 */ 1143 return (0); 1144 1145} 1146 1147/* 1148 * Very early in the boot some setup of scheduler-specific 1149 * parts of proc0 and of soem scheduler resources needs to be done. 1150 * Called from: 1151 * proc0_init() 1152 */ 1153void 1154schedinit(void) 1155{ 1156 /* 1157 * Set up the scheduler specific parts of proc0. 1158 */ 1159 proc0.p_sched = NULL; /* XXX */ 1160 ksegrp0.kg_sched = &kg_sched0; 1161 thread0.td_sched = &kse0; 1162 kse0.ke_thread = &thread0; 1163 kse0.ke_state = KES_THREAD; 1164 kg_sched0.skg_concurrency = 1; 1165 kg_sched0.skg_avail_opennings = 0; /* we are already running */ 1166} 1167 1168/* 1169 * This is only somewhat accurate since given many processes of the same 1170 * priority they will switch when their slices run out, which will be 1171 * at most SCHED_SLICE_MAX. 1172 */ 1173int 1174sched_rr_interval(void) 1175{ 1176 return (SCHED_SLICE_MAX); 1177} 1178 1179static void 1180sched_pctcpu_update(struct kse *ke) 1181{ 1182 /* 1183 * Adjust counters and watermark for pctcpu calc. 1184 */ 1185 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1186 /* 1187 * Shift the tick count out so that the divide doesn't 1188 * round away our results. 1189 */ 1190 ke->ke_ticks <<= 10; 1191 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1192 SCHED_CPU_TICKS; 1193 ke->ke_ticks >>= 10; 1194 } else 1195 ke->ke_ticks = 0; 1196 ke->ke_ltick = ticks; 1197 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1198} 1199 1200void 1201sched_prio(struct thread *td, u_char prio) 1202{ 1203 struct kse *ke; 1204 1205 CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", 1206 td, td->td_proc->p_comm, td->td_priority, prio, curthread, 1207 curthread->td_proc->p_comm); 1208 ke = td->td_kse; 1209 mtx_assert(&sched_lock, MA_OWNED); 1210 if (TD_ON_RUNQ(td)) { 1211 /* 1212 * If the priority has been elevated due to priority 1213 * propagation, we may have to move ourselves to a new 1214 * queue. We still call adjustrunqueue below in case kse 1215 * needs to fix things up. 1216 */ 1217 if (prio < td->td_priority && ke->ke_runq != NULL && 1218 (ke->ke_flags & KEF_ASSIGNED) == 0 && 1219 ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { 1220 runq_remove(ke->ke_runq, ke); 1221 ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; 1222 runq_add(ke->ke_runq, ke, 0); 1223 } 1224 if (prio < td->td_priority) 1225 ke->ke_flags |= KEF_PRIOELEV; 1226 /* 1227 * Hold this kse on this cpu so that sched_prio() doesn't 1228 * cause excessive migration. We only want migration to 1229 * happen as the result of a wakeup. 1230 */ 1231 ke->ke_flags |= KEF_HOLD; 1232 adjustrunqueue(td, prio); 1233 } else 1234 td->td_priority = prio; 1235} 1236 1237void 1238sched_switch(struct thread *td, struct thread *newtd, int flags) 1239{ 1240 struct kse *ke; 1241 1242 mtx_assert(&sched_lock, MA_OWNED); 1243 1244 ke = td->td_kse; 1245 1246 td->td_lastcpu = td->td_oncpu; 1247 td->td_oncpu = NOCPU; 1248 td->td_flags &= ~TDF_NEEDRESCHED; 1249 td->td_pflags &= ~TDP_OWEPREEMPT; 1250 1251 /* 1252 * If the KSE has been assigned it may be in the process of switching 1253 * to the new cpu. This is the case in sched_bind(). 1254 */ 1255 if ((ke->ke_flags & KEF_ASSIGNED) == 0) { 1256 if (td == PCPU_GET(idlethread)) { 1257 TD_SET_CAN_RUN(td); 1258 } else { 1259 /* We are ending our run so make our slot available again */ 1260 SLOT_RELEASE(td->td_ksegrp); 1261 if (TD_IS_RUNNING(td)) { 1262 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1263 /* 1264 * Don't allow the thread to migrate 1265 * from a preemption. 1266 */ 1267 ke->ke_flags |= KEF_HOLD; 1268 setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING); 1269 } else { 1270 if (ke->ke_runq) { 1271 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1272 } else if ((td->td_flags & TDF_IDLETD) == 0) 1273 kdb_backtrace(); 1274 /* 1275 * We will not be on the run queue. 1276 * So we must be sleeping or similar. 1277 * Don't use the slot if we will need it 1278 * for newtd. 1279 */ 1280 if ((td->td_proc->p_flag & P_HADTHREADS) && 1281 (newtd == NULL || 1282 newtd->td_ksegrp != td->td_ksegrp)) 1283 slot_fill(td->td_ksegrp); 1284 } 1285 } 1286 } 1287 if (newtd != NULL) { 1288 /* 1289 * If we bring in a thread, 1290 * then account for it as if it had been added to the 1291 * run queue and then chosen. 1292 */ 1293 newtd->td_kse->ke_flags |= KEF_DIDRUN; 1294 SLOT_USE(newtd->td_ksegrp); 1295 TD_SET_RUNNING(newtd); 1296 kseq_load_add(KSEQ_SELF(), newtd->td_kse); 1297 } else 1298 newtd = choosethread(); 1299 if (td != newtd) 1300 cpu_switch(td, newtd); 1301 sched_lock.mtx_lock = (uintptr_t)td; 1302 1303 td->td_oncpu = PCPU_GET(cpuid); 1304} 1305 1306void 1307sched_nice(struct proc *p, int nice) 1308{ 1309 struct ksegrp *kg; 1310 struct kse *ke; 1311 struct thread *td; 1312 struct kseq *kseq; 1313 1314 PROC_LOCK_ASSERT(p, MA_OWNED); 1315 mtx_assert(&sched_lock, MA_OWNED); 1316 /* 1317 * We need to adjust the nice counts for running KSEs. 1318 */ 1319 FOREACH_KSEGRP_IN_PROC(p, kg) { 1320 if (kg->kg_pri_class == PRI_TIMESHARE) { 1321 FOREACH_THREAD_IN_GROUP(kg, td) { 1322 ke = td->td_kse; 1323 if (ke->ke_runq == NULL) 1324 continue; 1325 kseq = KSEQ_CPU(ke->ke_cpu); 1326 kseq_nice_rem(kseq, p->p_nice); 1327 kseq_nice_add(kseq, nice); 1328 } 1329 } 1330 } 1331 p->p_nice = nice; 1332 FOREACH_KSEGRP_IN_PROC(p, kg) { 1333 sched_priority(kg); 1334 FOREACH_THREAD_IN_GROUP(kg, td) 1335 td->td_flags |= TDF_NEEDRESCHED; 1336 } 1337} 1338 1339void 1340sched_sleep(struct thread *td) 1341{ 1342 mtx_assert(&sched_lock, MA_OWNED); 1343 1344 td->td_slptime = ticks; 1345 td->td_base_pri = td->td_priority; 1346} 1347 1348void 1349sched_wakeup(struct thread *td) 1350{ 1351 mtx_assert(&sched_lock, MA_OWNED); 1352 1353 /* 1354 * Let the kseg know how long we slept for. This is because process 1355 * interactivity behavior is modeled in the kseg. 1356 */ 1357 if (td->td_slptime) { 1358 struct ksegrp *kg; 1359 int hzticks; 1360 1361 kg = td->td_ksegrp; 1362 hzticks = (ticks - td->td_slptime) << 10; 1363 if (hzticks >= SCHED_SLP_RUN_MAX) { 1364 kg->kg_slptime = SCHED_SLP_RUN_MAX; 1365 kg->kg_runtime = 1; 1366 } else { 1367 kg->kg_slptime += hzticks; 1368 sched_interact_update(kg); 1369 } 1370 sched_priority(kg); 1371 sched_slice(td->td_kse); 1372 td->td_slptime = 0; 1373 } 1374 setrunqueue(td, SRQ_BORING); 1375} 1376 1377/* 1378 * Penalize the parent for creating a new child and initialize the child's 1379 * priority. 1380 */ 1381void 1382sched_fork(struct thread *td, struct thread *childtd) 1383{ 1384 1385 mtx_assert(&sched_lock, MA_OWNED); 1386 1387 sched_fork_ksegrp(td, childtd->td_ksegrp); 1388 sched_fork_thread(td, childtd); 1389} 1390 1391void 1392sched_fork_ksegrp(struct thread *td, struct ksegrp *child) 1393{ 1394 struct ksegrp *kg = td->td_ksegrp; 1395 mtx_assert(&sched_lock, MA_OWNED); 1396 1397 child->kg_slptime = kg->kg_slptime; 1398 child->kg_runtime = kg->kg_runtime; 1399 child->kg_user_pri = kg->kg_user_pri; 1400 sched_interact_fork(child); 1401 kg->kg_runtime += tickincr << 10; 1402 sched_interact_update(kg); 1403} 1404 1405void 1406sched_fork_thread(struct thread *td, struct thread *child) 1407{ 1408 struct kse *ke; 1409 struct kse *ke2; 1410 1411 sched_newthread(child); 1412 ke = td->td_kse; 1413 ke2 = child->td_kse; 1414 ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 1415 ke2->ke_cpu = ke->ke_cpu; 1416 ke2->ke_runq = NULL; 1417 1418 /* Grab our parents cpu estimation information. */ 1419 ke2->ke_ticks = ke->ke_ticks; 1420 ke2->ke_ltick = ke->ke_ltick; 1421 ke2->ke_ftick = ke->ke_ftick; 1422} 1423 1424void 1425sched_class(struct ksegrp *kg, int class) 1426{ 1427 struct kseq *kseq; 1428 struct kse *ke; 1429 struct thread *td; 1430 int nclass; 1431 int oclass; 1432 1433 mtx_assert(&sched_lock, MA_OWNED); 1434 if (kg->kg_pri_class == class) 1435 return; 1436 1437 nclass = PRI_BASE(class); 1438 oclass = PRI_BASE(kg->kg_pri_class); 1439 FOREACH_THREAD_IN_GROUP(kg, td) { 1440 ke = td->td_kse; 1441 if (ke->ke_state != KES_ONRUNQ && 1442 ke->ke_state != KES_THREAD) 1443 continue; 1444 kseq = KSEQ_CPU(ke->ke_cpu); 1445 1446#ifdef SMP 1447 /* 1448 * On SMP if we're on the RUNQ we must adjust the transferable 1449 * count because could be changing to or from an interrupt 1450 * class. 1451 */ 1452 if (ke->ke_state == KES_ONRUNQ) { 1453 if (KSE_CAN_MIGRATE(ke, oclass)) { 1454 kseq->ksq_transferable--; 1455 kseq->ksq_group->ksg_transferable--; 1456 } 1457 if (KSE_CAN_MIGRATE(ke, nclass)) { 1458 kseq->ksq_transferable++; 1459 kseq->ksq_group->ksg_transferable++; 1460 } 1461 } 1462#endif 1463 if (oclass == PRI_TIMESHARE) { 1464 kseq->ksq_load_timeshare--; 1465 kseq_nice_rem(kseq, kg->kg_proc->p_nice); 1466 } 1467 if (nclass == PRI_TIMESHARE) { 1468 kseq->ksq_load_timeshare++; 1469 kseq_nice_add(kseq, kg->kg_proc->p_nice); 1470 } 1471 } 1472 1473 kg->kg_pri_class = class; 1474} 1475 1476/* 1477 * Return some of the child's priority and interactivity to the parent. 1478 */ 1479void 1480sched_exit(struct proc *p, struct thread *childtd) 1481{ 1482 mtx_assert(&sched_lock, MA_OWNED); 1483 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd); 1484 sched_exit_thread(NULL, childtd); 1485} 1486 1487void 1488sched_exit_ksegrp(struct ksegrp *kg, struct thread *td) 1489{ 1490 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */ 1491 kg->kg_runtime += td->td_ksegrp->kg_runtime; 1492 sched_interact_update(kg); 1493} 1494 1495void 1496sched_exit_thread(struct thread *td, struct thread *childtd) 1497{ 1498 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", 1499 childtd, childtd->td_proc->p_comm, childtd->td_priority); 1500 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse); 1501} 1502 1503void 1504sched_clock(struct thread *td) 1505{ 1506 struct kseq *kseq; 1507 struct ksegrp *kg; 1508 struct kse *ke; 1509 1510 mtx_assert(&sched_lock, MA_OWNED); 1511 kseq = KSEQ_SELF(); 1512#ifdef SMP 1513 if (ticks == bal_tick) 1514 sched_balance(); 1515 if (ticks == gbal_tick) 1516 sched_balance_groups(); 1517 /* 1518 * We could have been assigned a non real-time thread without an 1519 * IPI. 1520 */ 1521 if (kseq->ksq_assigned) 1522 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */ 1523#endif 1524 /* 1525 * sched_setup() apparently happens prior to stathz being set. We 1526 * need to resolve the timers earlier in the boot so we can avoid 1527 * calculating this here. 1528 */ 1529 if (realstathz == 0) { 1530 realstathz = stathz ? stathz : hz; 1531 tickincr = hz / realstathz; 1532 /* 1533 * XXX This does not work for values of stathz that are much 1534 * larger than hz. 1535 */ 1536 if (tickincr == 0) 1537 tickincr = 1; 1538 } 1539 1540 ke = td->td_kse; 1541 kg = ke->ke_ksegrp; 1542 1543 /* Adjust ticks for pctcpu */ 1544 ke->ke_ticks++; 1545 ke->ke_ltick = ticks; 1546 1547 /* Go up to one second beyond our max and then trim back down */ 1548 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1549 sched_pctcpu_update(ke); 1550 1551 if (td->td_flags & TDF_IDLETD) 1552 return; 1553 /* 1554 * We only do slicing code for TIMESHARE ksegrps. 1555 */ 1556 if (kg->kg_pri_class != PRI_TIMESHARE) 1557 return; 1558 /* 1559 * We used a tick charge it to the ksegrp so that we can compute our 1560 * interactivity. 1561 */ 1562 kg->kg_runtime += tickincr << 10; 1563 sched_interact_update(kg); 1564 1565 /* 1566 * We used up one time slice. 1567 */ 1568 if (--ke->ke_slice > 0) 1569 return; 1570 /* 1571 * We're out of time, recompute priorities and requeue. 1572 */ 1573 kseq_load_rem(kseq, ke); 1574 sched_priority(kg); 1575 sched_slice(ke); 1576 if (SCHED_CURR(kg, ke)) 1577 ke->ke_runq = kseq->ksq_curr; 1578 else 1579 ke->ke_runq = kseq->ksq_next; 1580 kseq_load_add(kseq, ke); 1581 td->td_flags |= TDF_NEEDRESCHED; 1582} 1583 1584int 1585sched_runnable(void) 1586{ 1587 struct kseq *kseq; 1588 int load; 1589 1590 load = 1; 1591 1592 kseq = KSEQ_SELF(); 1593#ifdef SMP 1594 if (kseq->ksq_assigned) { 1595 mtx_lock_spin(&sched_lock); 1596 kseq_assign(kseq); 1597 mtx_unlock_spin(&sched_lock); 1598 } 1599#endif 1600 if ((curthread->td_flags & TDF_IDLETD) != 0) { 1601 if (kseq->ksq_load > 0) 1602 goto out; 1603 } else 1604 if (kseq->ksq_load - 1 > 0) 1605 goto out; 1606 load = 0; 1607out: 1608 return (load); 1609} 1610 1611void 1612sched_userret(struct thread *td) 1613{ 1614 struct ksegrp *kg; 1615 struct kse *ke; 1616 1617 kg = td->td_ksegrp; 1618 ke = td->td_kse; 1619 1620 if (td->td_priority != kg->kg_user_pri || 1621 ke->ke_flags & KEF_PRIOELEV) { 1622 mtx_lock_spin(&sched_lock); 1623 td->td_priority = kg->kg_user_pri; 1624 if (ke->ke_flags & KEF_PRIOELEV) { 1625 ke->ke_flags &= ~KEF_PRIOELEV; 1626 sched_slice(ke); 1627 if (ke->ke_slice == 0) 1628 mi_switch(SW_INVOL, NULL); 1629 } 1630 mtx_unlock_spin(&sched_lock); 1631 } 1632} 1633 1634struct kse * 1635sched_choose(void) 1636{ 1637 struct kseq *kseq; 1638 struct kse *ke; 1639 1640 mtx_assert(&sched_lock, MA_OWNED); 1641 kseq = KSEQ_SELF(); 1642#ifdef SMP 1643restart: 1644 if (kseq->ksq_assigned) 1645 kseq_assign(kseq); 1646#endif 1647 ke = kseq_choose(kseq); 1648 if (ke) { 1649#ifdef SMP 1650 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1651 if (kseq_idled(kseq) == 0) 1652 goto restart; 1653#endif 1654 kseq_runq_rem(kseq, ke); 1655 ke->ke_state = KES_THREAD; 1656 return (ke); 1657 } 1658#ifdef SMP 1659 if (kseq_idled(kseq) == 0) 1660 goto restart; 1661#endif 1662 return (NULL); 1663} 1664 1665void 1666sched_add(struct thread *td, int flags) 1667{ 1668 1669 /* let jeff work out how to map the flags better */ 1670 /* I'm open to suggestions */ 1671 if (flags & SRQ_YIELDING) 1672 /* 1673 * Preempting during switching can be bad JUJU 1674 * especially for KSE processes 1675 */ 1676 sched_add_internal(td, 0); 1677 else 1678 sched_add_internal(td, 1); 1679} 1680 1681static void 1682sched_add_internal(struct thread *td, int preemptive) 1683{ 1684 struct kseq *kseq; 1685 struct ksegrp *kg; 1686 struct kse *ke; 1687#ifdef SMP 1688 int canmigrate; 1689#endif 1690 int class; 1691 1692 CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", 1693 td, td->td_proc->p_comm, td->td_priority, curthread, 1694 curthread->td_proc->p_comm); 1695 mtx_assert(&sched_lock, MA_OWNED); 1696 ke = td->td_kse; 1697 kg = td->td_ksegrp; 1698 if (ke->ke_flags & KEF_ASSIGNED) { 1699 if (ke->ke_flags & KEF_REMOVED) { 1700 SLOT_USE(ke->ke_ksegrp); 1701 ke->ke_flags &= ~KEF_REMOVED; 1702 } 1703 return; 1704 } 1705 kseq = KSEQ_SELF(); 1706 KASSERT(ke->ke_state != KES_ONRUNQ, 1707 ("sched_add: kse %p (%s) already in run queue", ke, 1708 ke->ke_proc->p_comm)); 1709 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1710 ("sched_add: process swapped out")); 1711 KASSERT(ke->ke_runq == NULL, 1712 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1713 1714 class = PRI_BASE(kg->kg_pri_class); 1715 switch (class) { 1716 case PRI_ITHD: 1717 case PRI_REALTIME: 1718 ke->ke_runq = kseq->ksq_curr; 1719 ke->ke_slice = SCHED_SLICE_MAX; 1720 ke->ke_cpu = PCPU_GET(cpuid); 1721 break; 1722 case PRI_TIMESHARE: 1723 if (SCHED_CURR(kg, ke)) 1724 ke->ke_runq = kseq->ksq_curr; 1725 else 1726 ke->ke_runq = kseq->ksq_next; 1727 break; 1728 case PRI_IDLE: 1729 /* 1730 * This is for priority prop. 1731 */ 1732 if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1733 ke->ke_runq = kseq->ksq_curr; 1734 else 1735 ke->ke_runq = &kseq->ksq_idle; 1736 ke->ke_slice = SCHED_SLICE_MIN; 1737 break; 1738 default: 1739 panic("Unknown pri class."); 1740 break; 1741 } 1742#ifdef SMP 1743 /* 1744 * Don't migrate running threads here. Force the long term balancer 1745 * to do it. 1746 */ 1747 canmigrate = KSE_CAN_MIGRATE(ke, class); 1748 if (ke->ke_flags & KEF_HOLD) { 1749 ke->ke_flags &= ~KEF_HOLD; 1750 canmigrate = 0; 1751 } 1752 /* 1753 * If this thread is pinned or bound, notify the target cpu. 1754 */ 1755 if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) { 1756 ke->ke_runq = NULL; 1757 kseq_notify(ke, ke->ke_cpu); 1758 return; 1759 } 1760 /* 1761 * If we had been idle, clear our bit in the group and potentially 1762 * the global bitmap. If not, see if we should transfer this thread. 1763 */ 1764 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1765 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1766 /* 1767 * Check to see if our group is unidling, and if so, remove it 1768 * from the global idle mask. 1769 */ 1770 if (kseq->ksq_group->ksg_idlemask == 1771 kseq->ksq_group->ksg_cpumask) 1772 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1773 /* 1774 * Now remove ourselves from the group specific idle mask. 1775 */ 1776 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1777 } else if (kseq->ksq_load > 1 && canmigrate) 1778 if (kseq_transfer(kseq, ke, class)) 1779 return; 1780 ke->ke_cpu = PCPU_GET(cpuid); 1781#endif 1782 /* 1783 * XXX With preemption this is not necessary. 1784 */ 1785 if (td->td_priority < curthread->td_priority && 1786 ke->ke_runq == kseq->ksq_curr) 1787 curthread->td_flags |= TDF_NEEDRESCHED; 1788 if (preemptive && maybe_preempt(td)) 1789 return; 1790 SLOT_USE(td->td_ksegrp); 1791 ke->ke_state = KES_ONRUNQ; 1792 1793 kseq_runq_add(kseq, ke); 1794 kseq_load_add(kseq, ke); 1795} 1796 1797void 1798sched_rem(struct thread *td) 1799{ 1800 struct kseq *kseq; 1801 struct kse *ke; 1802 1803 mtx_assert(&sched_lock, MA_OWNED); 1804 ke = td->td_kse; 1805 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", 1806 td, td->td_proc->p_comm, td->td_priority, curthread, 1807 curthread->td_proc->p_comm); 1808 /* 1809 * It is safe to just return here because sched_rem() is only ever 1810 * used in places where we're immediately going to add the 1811 * kse back on again. In that case it'll be added with the correct 1812 * thread and priority when the caller drops the sched_lock. 1813 */ 1814 if (ke->ke_flags & KEF_ASSIGNED) { 1815 SLOT_RELEASE(td->td_ksegrp); 1816 ke->ke_flags |= KEF_REMOVED; 1817 return; 1818 } 1819 KASSERT((ke->ke_state == KES_ONRUNQ), 1820 ("sched_rem: KSE not on run queue")); 1821 1822 SLOT_RELEASE(td->td_ksegrp); 1823 ke->ke_state = KES_THREAD; 1824 kseq = KSEQ_CPU(ke->ke_cpu); 1825 kseq_runq_rem(kseq, ke); 1826 kseq_load_rem(kseq, ke); 1827} 1828 1829fixpt_t 1830sched_pctcpu(struct thread *td) 1831{ 1832 fixpt_t pctcpu; 1833 struct kse *ke; 1834 1835 pctcpu = 0; 1836 ke = td->td_kse; 1837 if (ke == NULL) 1838 return (0); 1839 1840 mtx_lock_spin(&sched_lock); 1841 if (ke->ke_ticks) { 1842 int rtick; 1843 1844 /* 1845 * Don't update more frequently than twice a second. Allowing 1846 * this causes the cpu usage to decay away too quickly due to 1847 * rounding errors. 1848 */ 1849 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || 1850 ke->ke_ltick < (ticks - (hz / 2))) 1851 sched_pctcpu_update(ke); 1852 /* How many rtick per second ? */ 1853 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1854 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1855 } 1856 1857 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1858 mtx_unlock_spin(&sched_lock); 1859 1860 return (pctcpu); 1861} 1862 1863void 1864sched_bind(struct thread *td, int cpu) 1865{ 1866 struct kse *ke; 1867 1868 mtx_assert(&sched_lock, MA_OWNED); 1869 ke = td->td_kse; 1870 ke->ke_flags |= KEF_BOUND; 1871#ifdef SMP 1872 if (PCPU_GET(cpuid) == cpu) 1873 return; 1874 /* sched_rem without the runq_remove */ 1875 ke->ke_state = KES_THREAD; 1876 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1877 kseq_notify(ke, cpu); 1878 /* When we return from mi_switch we'll be on the correct cpu. */ 1879 mi_switch(SW_VOL, NULL); 1880#endif 1881} 1882 1883void 1884sched_unbind(struct thread *td) 1885{ 1886 mtx_assert(&sched_lock, MA_OWNED); 1887 td->td_kse->ke_flags &= ~KEF_BOUND; 1888} 1889 1890int 1891sched_load(void) 1892{ 1893#ifdef SMP 1894 int total; 1895 int i; 1896 1897 total = 0; 1898 for (i = 0; i <= ksg_maxid; i++) 1899 total += KSEQ_GROUP(i)->ksg_load; 1900 return (total); 1901#else 1902 return (KSEQ_SELF()->ksq_sysload); 1903#endif 1904} 1905 1906int 1907sched_sizeof_ksegrp(void) 1908{ 1909 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1910} 1911 1912int 1913sched_sizeof_proc(void) 1914{ 1915 return (sizeof(struct proc)); 1916} 1917 1918int 1919sched_sizeof_thread(void) 1920{ 1921 return (sizeof(struct thread) + sizeof(struct td_sched)); 1922} 1923#define KERN_SWITCH_INCLUDE 1 1924#include "kern/kern_switch.c" 1925