sched_ule.c revision 123529
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 123529 2003-12-14 02:06:29Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callouts to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
static struct callout kseq_group_callout;
#endif

/*
 * These data structures are allocated within their parent data structure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determine the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
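
/*
 * Illustrative worked example for the two macros above, assuming a
 * 64-priority timeshare range and the slice_min = 10 / slice_max = 142
 * values that sched_setup() computes when hz is 1000:
 *
 *	SCHED_PRI_INTERACT(50)	-> 50 * 64 / 100 = 32 priorities above
 *				   PRI_MIN_TIMESHARE, before nice is added.
 *	SCHED_SLICE_NICE(0)	-> 142 ticks for a kse at ksq_nicemin.
 *	SCHED_SLICE_NICE(19)	-> 142 - (19 * 133 / 19) = 9 ticks for a kse
 *				   at the far edge of the nice window.
 */
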
/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority != kg->kg_user_pri ||			\
    SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_transferable;
	LIST_ENTRY(kseq) ksq_siblings;		/* Next in kseq group. */
	struct kseq_group *ksq_group;		/* Our processor group. */
	volatile struct kse *ksq_assigned;	/* assigned by another CPU. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	int	ksg_cpumask;		/* Mask of cpus in this group. */
	int	ksg_idlemask;		/* Idle cpus in this group. */
	int	ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq) ksg_members;	/* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static int kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void *arg);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 &&		\
    ((ke)->ke_flags & KEF_BOUND) == 0)
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
	}
#endif
	runq_add(ke->ke_runq, ke);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
#ifdef SMP
	if (class != PRI_ITHD)
		kseq->ksq_group->ksg_load++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
#ifdef SMP
	if (class != PRI_ITHD)
		kseq->ksq_group->ksg_load--;
#endif
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void *arg)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int timo;
	int cnt;
	int i;

	mtx_lock_spin(&sched_lock);
	if (smp_started == 0)
		goto out;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
out:
	mtx_unlock_spin(&sched_lock);
	timo = random() % (hz * 2);
	callout_reset(&kseq_lb_callout, timo, sched_balance, NULL);
}

static void
sched_balance_groups(void *arg)
{
	int timo;
	int i;

	mtx_lock_spin(&sched_lock);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
	mtx_unlock_spin(&sched_lock);
	timo = random() % (hz * 2);
	callout_reset(&kseq_group_callout, timo, sched_balance_groups, NULL);
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (kseq == KSEQ_CPU(0))
			load--;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
		/*
		 * XXX If we encounter cpu 0 we must remember to reduce its
		 * load by 1 to reflect the swi that is running the callout.
		 * At some point we should really fix load balancing of the
		 * swi and then this won't matter.
		 */
		if (high == KSEQ_CPU(0))
			high_load--;
		if (low == KSEQ_CPU(0))
			low_load--;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
	 */
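	/*
	 * For illustration: with high_load = 7 and low_load = 2, diff is 5
	 * and move rounds up to 3, so up to three kses are pushed toward the
	 * low cpu, capped by the transferable count computed above.
	 */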
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}

static void
kseq_move(struct kseq *from, int cpu)
{
	struct kseq *kseq;
	struct kseq *to;
	struct kse *ke;

	kseq = from;
	to = KSEQ_CPU(cpu);
	ke = kseq_steal(kseq, 1);
	if (ke == NULL) {
		struct kseq_group *ksg;

		ksg = kseq->ksq_group;
		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
			if (kseq == from || kseq->ksq_transferable == 0)
				continue;
			ke = kseq_steal(kseq, 1);
			break;
		}
		if (ke == NULL)
			panic("kseq_move: No KSEs available with a "
			    "transferable count of %d\n",
			    ksg->ksg_transferable);
	}
	if (kseq == to)
		return;
	ke->ke_state = KES_THREAD;
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
	kseq_notify(ke, cpu);
}

static int
kseq_idled(struct kseq *kseq)
{
	struct kseq_group *ksg;
	struct kseq *steal;
	struct kse *ke;

	ksg = kseq->ksq_group;
	/*
	 * If we're in a cpu group, try and steal kses from another cpu in
	 * the group before idling.
	 */
	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
			if (steal == kseq || steal->ksq_transferable == 0)
				continue;
			ke = kseq_steal(steal, 0);
			if (ke == NULL)
				continue;
			ke->ke_state = KES_THREAD;
			kseq_runq_rem(steal, ke);
			kseq_load_rem(steal, ke);
			ke->ke_cpu = PCPU_GET(cpuid);
			sched_add(ke->ke_thread);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a KSE bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	ksg->ksg_idlemask |= PCPU_GET(cpumask);
	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
		return (1);
	atomic_set_int(&kseq_idle, ksg->ksg_mask);
	return (1);
}

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		(volatile struct kse *)ke = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		ke->ke_flags &= ~KEF_ASSIGNED;
		sched_add(ke->ke_thread);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;

	ke->ke_cpu = cpu;
	ke->ke_flags |= KEF_ASSIGNED;

	kseq = KSEQ_CPU(cpu);

	/*
	 * Place a KSE on another cpu's queue and force a resched.
	 */
	do {
		(volatile struct kse *)ke->ke_assign = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke,
				    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
					return (ke);
			}
		}
	}
	return (NULL);
}

static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	cpu = 0;
	ksg = kseq->ksq_group;

	/*
	 * XXX This ksg_transferable might work better if we were checking
	 * against a global group load.  As it is now, this prevents us from
	 * transferring a thread from a group that is potentially bogged down
	 * with non transferable load.
	 */
	if (ksg->ksg_transferable > ksg->ksg_cpus && kseq_idle) {
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			ksg->ksg_idlemask &= ~(1 << (cpu - 1));
	}
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	if (cpu) {
		cpu--;
		ke->ke_runq = NULL;
		kseq_notify(ke, cpu);
		return (1);
	}
	return (0);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kse group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	callout_init(&kseq_group_callout, CALLOUT_MPSAFE);
	sched_balance(NULL);
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	if (balance_groups)
		callout_reset(&kseq_group_callout, hz / 2,
		    sched_balance_groups, NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}
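
/*
 * Illustrative note: with the common hz = 1000 the assignments in
 * sched_setup() above yield slice_min = 10 ticks (10ms) and
 * slice_max = 142 ticks (~140ms); with hz = 100 they collapse to 1 and
 * 14 ticks respectively.
 */
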
/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}
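
/*
 * Illustrative example of the interactivity score computed by
 * sched_interact_score() below: a ksegrp that has slept three times as
 * long as it has run scores roughly 50 / 3 = 16, well under
 * SCHED_INTERACT_THRESH (30), and is treated as interactive; one that
 * runs three times as long as it sleeps scores roughly 100 - 16 = 84 and
 * is queued as a batch job.
 */
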
/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [3/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} else if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

void
sched_switch(struct thread *td)
{
	struct thread *newtd;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (TD_IS_RUNNING(td)) {
			if (td->td_proc->p_flag & P_SA) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
				setrunqueue(td);
			} else
				kseq_runq_add(KSEQ_SELF(), ke);
		} else {
			if (ke->ke_runq)
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}
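
/*
 * Illustrative example of the fork path below: a parent carrying four
 * times SCHED_SLP_RUN_FORK of combined sleep + run history hands its
 * child that history divided by a ratio of 4 via sched_interact_fork(),
 * while the parent itself is charged one extra tick of run time in
 * sched_fork_ksegrp().
 */
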
void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_load_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq = KSEQ_SELF();
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}

void
sched_add(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	kseq = KSEQ_SELF();
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
("sched_add: kse %p (%s) already in run queue", ke, 1549 ke->ke_proc->p_comm)); 1550 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1551 ("sched_add: process swapped out")); 1552 KASSERT(ke->ke_runq == NULL, 1553 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1554 1555 class = PRI_BASE(kg->kg_pri_class); 1556 switch (class) { 1557 case PRI_ITHD: 1558 case PRI_REALTIME: 1559 ke->ke_runq = kseq->ksq_curr; 1560 ke->ke_slice = SCHED_SLICE_MAX; 1561 ke->ke_cpu = PCPU_GET(cpuid); 1562 break; 1563 case PRI_TIMESHARE: 1564 if (SCHED_CURR(kg, ke)) 1565 ke->ke_runq = kseq->ksq_curr; 1566 else 1567 ke->ke_runq = kseq->ksq_next; 1568 break; 1569 case PRI_IDLE: 1570 /* 1571 * This is for priority prop. 1572 */ 1573 if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1574 ke->ke_runq = kseq->ksq_curr; 1575 else 1576 ke->ke_runq = &kseq->ksq_idle; 1577 ke->ke_slice = SCHED_SLICE_MIN; 1578 break; 1579 default: 1580 panic("Unknown pri class."); 1581 break; 1582 } 1583#ifdef SMP 1584 if (ke->ke_cpu != PCPU_GET(cpuid)) { 1585 ke->ke_runq = NULL; 1586 kseq_notify(ke, ke->ke_cpu); 1587 return; 1588 } 1589 /* 1590 * If there are any idle groups, give them our extra load. The 1591 * threshold at which we start to reassign kses has a large impact 1592 * on the overall performance of the system. Tuned too high and 1593 * some CPUs may idle. Too low and there will be excess migration 1594 * and context swiches. 1595 */ 1596 if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class)) 1597 if (kseq_transfer(kseq, ke, class)) 1598 return; 1599 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1600 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1601 /* 1602 * Check to see if our group is unidling, and if so, remove it 1603 * from the global idle mask. 1604 */ 1605 if (kseq->ksq_group->ksg_idlemask == 1606 kseq->ksq_group->ksg_cpumask) 1607 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1608 /* 1609 * Now remove ourselves from the group specific idle mask. 1610 */ 1611 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1612 } 1613#endif 1614 if (td->td_priority < curthread->td_priority) 1615 curthread->td_flags |= TDF_NEEDRESCHED; 1616 1617 ke->ke_ksegrp->kg_runq_kses++; 1618 ke->ke_state = KES_ONRUNQ; 1619 1620 kseq_runq_add(kseq, ke); 1621 kseq_load_add(kseq, ke); 1622} 1623 1624void 1625sched_rem(struct thread *td) 1626{ 1627 struct kseq *kseq; 1628 struct kse *ke; 1629 1630 ke = td->td_kse; 1631 /* 1632 * It is safe to just return here because sched_rem() is only ever 1633 * used in places where we're immediately going to add the 1634 * kse back on again. In that case it'll be added with the correct 1635 * thread and priority when the caller drops the sched_lock. 1636 */ 1637 if (ke->ke_flags & KEF_ASSIGNED) 1638 return; 1639 mtx_assert(&sched_lock, MA_OWNED); 1640 KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); 1641 1642 ke->ke_state = KES_THREAD; 1643 ke->ke_ksegrp->kg_runq_kses--; 1644 kseq = KSEQ_CPU(ke->ke_cpu); 1645 kseq_runq_rem(kseq, ke); 1646 kseq_load_rem(kseq, ke); 1647} 1648 1649fixpt_t 1650sched_pctcpu(struct thread *td) 1651{ 1652 fixpt_t pctcpu; 1653 struct kse *ke; 1654 1655 pctcpu = 0; 1656 ke = td->td_kse; 1657 if (ke == NULL) 1658 return (0); 1659 1660 mtx_lock_spin(&sched_lock); 1661 if (ke->ke_ticks) { 1662 int rtick; 1663 1664 /* 1665 * Don't update more frequently than twice a second. Allowing 1666 * this causes the cpu usage to decay away too quickly due to 1667 * rounding errors. 
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
	td->td_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}