sched_ule.c revision 121107
1/*- 2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 121107 2003-10-15 07:47:06Z jeff $"); 29 30#include <sys/param.h> 31#include <sys/systm.h> 32#include <sys/kernel.h> 33#include <sys/ktr.h> 34#include <sys/lock.h> 35#include <sys/mutex.h> 36#include <sys/proc.h> 37#include <sys/resource.h> 38#include <sys/sched.h> 39#include <sys/smp.h> 40#include <sys/sx.h> 41#include <sys/sysctl.h> 42#include <sys/sysproto.h> 43#include <sys/vmmeter.h> 44#ifdef DDB 45#include <ddb/ddb.h> 46#endif 47#ifdef KTRACE 48#include <sys/uio.h> 49#include <sys/ktrace.h> 50#endif 51 52#include <machine/cpu.h> 53 54#define KTR_ULE KTR_NFS 55 56/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 57/* XXX This is bogus compatability crap for ps */ 58static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 59SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 60 61static void sched_setup(void *dummy); 62SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 63 64static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); 65 66static int sched_strict; 67SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); 68 69static int slice_min = 1; 70SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 71 72static int slice_max = 10; 73SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 74 75int realstathz; 76int tickincr = 1; 77 78#ifdef SMP 79/* Callout to handle load balancing SMP systems. */ 80static struct callout kseq_lb_callout; 81#endif 82 83/* 84 * These datastructures are allocated within their parent datastructure but 85 * are scheduler specific. 86 */ 87 88struct ke_sched { 89 int ske_slice; 90 struct runq *ske_runq; 91 /* The following variables are only used for pctcpu calculation */ 92 int ske_ltick; /* Last tick that we were running on */ 93 int ske_ftick; /* First tick that we were running on */ 94 int ske_ticks; /* Tick count */ 95 /* CPU that we have affinity for. */ 96 u_char ske_cpu; 97}; 98#define ke_slice ke_sched->ske_slice 99#define ke_runq ke_sched->ske_runq 100#define ke_ltick ke_sched->ske_ltick 101#define ke_ftick ke_sched->ske_ftick 102#define ke_ticks ke_sched->ske_ticks 103#define ke_cpu ke_sched->ske_cpu 104 105struct kg_sched { 106 int skg_slptime; /* Number of ticks we vol. slept */ 107 int skg_runtime; /* Number of ticks we were running */ 108}; 109#define kg_slptime kg_sched->skg_slptime 110#define kg_runtime kg_sched->skg_runtime 111 112struct td_sched { 113 int std_slptime; 114}; 115#define td_slptime td_sched->std_slptime 116 117struct td_sched td_sched; 118struct ke_sched ke_sched; 119struct kg_sched kg_sched; 120 121struct ke_sched *kse0_sched = &ke_sched; 122struct kg_sched *ksegrp0_sched = &kg_sched; 123struct p_sched *proc0_sched = NULL; 124struct td_sched *thread0_sched = &td_sched; 125 126/* 127 * The priority is primarily determined by the interactivity score. Thus, we 128 * give lower(better) priorities to kse groups that use less CPU. The nice 129 * value is then directly added to this to allow nice to have some effect 130 * on latency. 131 * 132 * PRI_RANGE: Total priority range for timeshare threads. 133 * PRI_NRESV: Number of nice values. 134 * PRI_BASE: The start of the dynamic range. 135 */ 136#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 137#define SCHED_PRI_NRESV PRIO_TOTAL 138#define SCHED_PRI_NHALF (PRIO_TOTAL / 2) 139#define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) 140#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 141#define SCHED_PRI_INTERACT(score) \ 142 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 143 144/* 145 * These determine the interactivity of a process. 146 * 147 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 148 * before throttling back. 149 * SLP_RUN_THROTTLE: Divisor for reducing slp/run time at fork time. 150 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 151 * INTERACT_THRESH: Threshhold for placement on the current runq. 152 */ 153#define SCHED_SLP_RUN_MAX ((hz * 2) << 10) 154#define SCHED_SLP_RUN_THROTTLE (100) 155#define SCHED_INTERACT_MAX (100) 156#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 157#define SCHED_INTERACT_THRESH (20) 158 159/* 160 * These parameters and macros determine the size of the time slice that is 161 * granted to each thread. 162 * 163 * SLICE_MIN: Minimum time slice granted, in units of ticks. 164 * SLICE_MAX: Maximum time slice granted. 165 * SLICE_RANGE: Range of available time slices scaled by hz. 166 * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 167 * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 168 */ 169#define SCHED_SLICE_MIN (slice_min) 170#define SCHED_SLICE_MAX (slice_max) 171#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 172#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 173#define SCHED_SLICE_NICE(nice) \ 174 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) 175 176/* 177 * This macro determines whether or not the kse belongs on the current or 178 * next run queue. 179 * 180 * XXX nice value should effect how interactive a kg is. 181 */ 182#define SCHED_INTERACTIVE(kg) \ 183 (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 184#define SCHED_CURR(kg, ke) \ 185 (ke->ke_thread->td_priority != kg->kg_user_pri || \ 186 SCHED_INTERACTIVE(kg)) 187 188/* 189 * Cpu percentage computation macros and defines. 190 * 191 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 192 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 193 */ 194 195#define SCHED_CPU_TIME 10 196#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 197 198/* 199 * kseq - per processor runqs and statistics. 200 */ 201 202#define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ 203 204struct kseq { 205 struct runq ksq_idle; /* Queue of IDLE threads. */ 206 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 207 struct runq *ksq_next; /* Next timeshare queue. */ 208 struct runq *ksq_curr; /* Current queue. */ 209 int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ 210 int ksq_load; /* Aggregate load. */ 211 short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ 212 short ksq_nicemin; /* Least nice. */ 213#ifdef SMP 214 int ksq_cpus; /* Count of CPUs in this kseq. */ 215 unsigned int ksq_rslices; /* Slices on run queue */ 216#endif 217}; 218 219/* 220 * One kse queue per processor. 221 */ 222#ifdef SMP 223struct kseq kseq_cpu[MAXCPU]; 224struct kseq *kseq_idmap[MAXCPU]; 225#define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)]) 226#define KSEQ_CPU(x) (kseq_idmap[(x)]) 227#else 228struct kseq kseq_cpu; 229#define KSEQ_SELF() (&kseq_cpu) 230#define KSEQ_CPU(x) (&kseq_cpu) 231#endif 232 233static void sched_slice(struct kse *ke); 234static void sched_priority(struct ksegrp *kg); 235static int sched_interact_score(struct ksegrp *kg); 236static void sched_interact_update(struct ksegrp *kg); 237void sched_pctcpu_update(struct kse *ke); 238int sched_pickcpu(void); 239 240/* Operations on per processor queues */ 241static struct kse * kseq_choose(struct kseq *kseq, int steal); 242static void kseq_setup(struct kseq *kseq); 243static void kseq_add(struct kseq *kseq, struct kse *ke); 244static void kseq_rem(struct kseq *kseq, struct kse *ke); 245static void kseq_nice_add(struct kseq *kseq, int nice); 246static void kseq_nice_rem(struct kseq *kseq, int nice); 247void kseq_print(int cpu); 248#ifdef SMP 249struct kseq * kseq_load_highest(void); 250void kseq_balance(void *arg); 251void kseq_move(struct kseq *from, int cpu); 252#endif 253 254void 255kseq_print(int cpu) 256{ 257 struct kseq *kseq; 258 int i; 259 260 kseq = KSEQ_CPU(cpu); 261 262 printf("kseq:\n"); 263 printf("\tload: %d\n", kseq->ksq_load); 264 printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); 265 printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); 266 printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); 267 printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); 268 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 269 printf("\tnice counts:\n"); 270 for (i = 0; i < PRIO_TOTAL + 1; i++) 271 if (kseq->ksq_nice[i]) 272 printf("\t\t%d = %d\n", 273 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 274} 275 276static void 277kseq_add(struct kseq *kseq, struct kse *ke) 278{ 279 mtx_assert(&sched_lock, MA_OWNED); 280 kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; 281 kseq->ksq_load++; 282 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 283 CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 284 ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 285 ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); 286 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 287 kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); 288#ifdef SMP 289 kseq->ksq_rslices += ke->ke_slice; 290#endif 291} 292 293static void 294kseq_rem(struct kseq *kseq, struct kse *ke) 295{ 296 mtx_assert(&sched_lock, MA_OWNED); 297 kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; 298 kseq->ksq_load--; 299 ke->ke_runq = NULL; 300 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 301 kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); 302#ifdef SMP 303 kseq->ksq_rslices -= ke->ke_slice; 304#endif 305} 306 307static void 308kseq_nice_add(struct kseq *kseq, int nice) 309{ 310 mtx_assert(&sched_lock, MA_OWNED); 311 /* Normalize to zero. */ 312 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 313 if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) 314 kseq->ksq_nicemin = nice; 315} 316 317static void 318kseq_nice_rem(struct kseq *kseq, int nice) 319{ 320 int n; 321 322 mtx_assert(&sched_lock, MA_OWNED); 323 /* Normalize to zero. */ 324 n = nice + SCHED_PRI_NHALF; 325 kseq->ksq_nice[n]--; 326 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 327 328 /* 329 * If this wasn't the smallest nice value or there are more in 330 * this bucket we can just return. Otherwise we have to recalculate 331 * the smallest nice. 332 */ 333 if (nice != kseq->ksq_nicemin || 334 kseq->ksq_nice[n] != 0 || 335 kseq->ksq_loads[PRI_TIMESHARE] == 0) 336 return; 337 338 for (; n < SCHED_PRI_NRESV + 1; n++) 339 if (kseq->ksq_nice[n]) { 340 kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 341 return; 342 } 343} 344 345#ifdef SMP 346/* 347 * kseq_balance is a simple CPU load balancing algorithm. It operates by 348 * finding the least loaded and most loaded cpu and equalizing their load 349 * by migrating some processes. 350 * 351 * Dealing only with two CPUs at a time has two advantages. Firstly, most 352 * installations will only have 2 cpus. Secondly, load balancing too much at 353 * once can have an unpleasant effect on the system. The scheduler rarely has 354 * enough information to make perfect decisions. So this algorithm chooses 355 * algorithm simplicity and more gradual effects on load in larger systems. 356 * 357 * It could be improved by considering the priorities and slices assigned to 358 * each task prior to balancing them. There are many pathological cases with 359 * any approach and so the semi random algorithm below may work as well as any. 360 * 361 */ 362void 363kseq_balance(void *arg) 364{ 365 struct kseq *kseq; 366 int high_load; 367 int low_load; 368 int high_cpu; 369 int low_cpu; 370 int move; 371 int diff; 372 int i; 373 374 high_cpu = 0; 375 low_cpu = 0; 376 high_load = 0; 377 low_load = -1; 378 379 mtx_lock_spin(&sched_lock); 380 if (smp_started == 0) 381 goto out; 382 383 for (i = 0; i < mp_maxid; i++) { 384 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 385 continue; 386 kseq = KSEQ_CPU(i); 387 if (kseq->ksq_load > high_load) { 388 high_load = kseq->ksq_load; 389 high_cpu = i; 390 } 391 if (low_load == -1 || kseq->ksq_load < low_load) { 392 low_load = kseq->ksq_load; 393 low_cpu = i; 394 } 395 } 396 397 kseq = KSEQ_CPU(high_cpu); 398 399 /* 400 * Nothing to do. 401 */ 402 if (high_load < kseq->ksq_cpus + 1) 403 goto out; 404 405 high_load -= kseq->ksq_cpus; 406 407 if (low_load >= high_load) 408 goto out; 409 410 diff = high_load - low_load; 411 move = diff / 2; 412 if (diff & 0x1) 413 move++; 414 415 for (i = 0; i < move; i++) 416 kseq_move(kseq, low_cpu); 417 418out: 419 mtx_unlock_spin(&sched_lock); 420 callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); 421 422 return; 423} 424 425struct kseq * 426kseq_load_highest(void) 427{ 428 struct kseq *kseq; 429 int load; 430 int cpu; 431 int i; 432 433 mtx_assert(&sched_lock, MA_OWNED); 434 cpu = 0; 435 load = 0; 436 437 for (i = 0; i < mp_maxid; i++) { 438 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 439 continue; 440 kseq = KSEQ_CPU(i); 441 if (kseq->ksq_load > load) { 442 load = kseq->ksq_load; 443 cpu = i; 444 } 445 } 446 kseq = KSEQ_CPU(cpu); 447 448 if (load > kseq->ksq_cpus) 449 return (kseq); 450 451 return (NULL); 452} 453 454void 455kseq_move(struct kseq *from, int cpu) 456{ 457 struct kse *ke; 458 459 ke = kseq_choose(from, 1); 460 runq_remove(ke->ke_runq, ke); 461 ke->ke_state = KES_THREAD; 462 kseq_rem(from, ke); 463 464 ke->ke_cpu = cpu; 465 sched_add(ke); 466} 467#endif 468 469/* 470 * Pick the highest priority task we have and return it. If steal is 1 we 471 * will return kses that have been denied slices due to their nice being too 472 * low. In the future we should prohibit stealing interrupt threads as well. 473 */ 474 475struct kse * 476kseq_choose(struct kseq *kseq, int steal) 477{ 478 struct kse *ke; 479 struct runq *swap; 480 481 mtx_assert(&sched_lock, MA_OWNED); 482 swap = NULL; 483 484 for (;;) { 485 ke = runq_choose(kseq->ksq_curr); 486 if (ke == NULL) { 487 /* 488 * We already swaped once and didn't get anywhere. 489 */ 490 if (swap) 491 break; 492 swap = kseq->ksq_curr; 493 kseq->ksq_curr = kseq->ksq_next; 494 kseq->ksq_next = swap; 495 continue; 496 } 497 /* 498 * If we encounter a slice of 0 the kse is in a 499 * TIMESHARE kse group and its nice was too far out 500 * of the range that receives slices. 501 */ 502 if (ke->ke_slice == 0 && steal == 0) { 503 runq_remove(ke->ke_runq, ke); 504 sched_slice(ke); 505 ke->ke_runq = kseq->ksq_next; 506 runq_add(ke->ke_runq, ke); 507 continue; 508 } 509 return (ke); 510 } 511 512 return (runq_choose(&kseq->ksq_idle)); 513} 514 515static void 516kseq_setup(struct kseq *kseq) 517{ 518 runq_init(&kseq->ksq_timeshare[0]); 519 runq_init(&kseq->ksq_timeshare[1]); 520 runq_init(&kseq->ksq_idle); 521 522 kseq->ksq_curr = &kseq->ksq_timeshare[0]; 523 kseq->ksq_next = &kseq->ksq_timeshare[1]; 524 525 kseq->ksq_loads[PRI_ITHD] = 0; 526 kseq->ksq_loads[PRI_REALTIME] = 0; 527 kseq->ksq_loads[PRI_TIMESHARE] = 0; 528 kseq->ksq_loads[PRI_IDLE] = 0; 529 kseq->ksq_load = 0; 530#ifdef SMP 531 kseq->ksq_rslices = 0; 532#endif 533} 534 535static void 536sched_setup(void *dummy) 537{ 538#ifdef SMP 539 int i; 540#endif 541 542 slice_min = (hz/100); /* 10ms */ 543 slice_max = (hz/7); /* ~140ms */ 544 545#ifdef SMP 546 /* init kseqs */ 547 /* Create the idmap. */ 548#ifdef ULE_HTT_EXPERIMENTAL 549 if (smp_topology == NULL) { 550#else 551 if (1) { 552#endif 553 for (i = 0; i < MAXCPU; i++) { 554 kseq_setup(&kseq_cpu[i]); 555 kseq_idmap[i] = &kseq_cpu[i]; 556 kseq_cpu[i].ksq_cpus = 1; 557 } 558 } else { 559 int j; 560 561 for (i = 0; i < smp_topology->ct_count; i++) { 562 struct cpu_group *cg; 563 564 cg = &smp_topology->ct_group[i]; 565 kseq_setup(&kseq_cpu[i]); 566 567 for (j = 0; j < MAXCPU; j++) 568 if ((cg->cg_mask & (1 << j)) != 0) 569 kseq_idmap[j] = &kseq_cpu[i]; 570 kseq_cpu[i].ksq_cpus = cg->cg_count; 571 } 572 } 573 callout_init(&kseq_lb_callout, CALLOUT_MPSAFE); 574 kseq_balance(NULL); 575#else 576 kseq_setup(KSEQ_SELF()); 577#endif 578 mtx_lock_spin(&sched_lock); 579 kseq_add(KSEQ_SELF(), &kse0); 580 mtx_unlock_spin(&sched_lock); 581} 582 583/* 584 * Scale the scheduling priority according to the "interactivity" of this 585 * process. 586 */ 587static void 588sched_priority(struct ksegrp *kg) 589{ 590 int pri; 591 592 if (kg->kg_pri_class != PRI_TIMESHARE) 593 return; 594 595 pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 596 pri += SCHED_PRI_BASE; 597 pri += kg->kg_nice; 598 599 if (pri > PRI_MAX_TIMESHARE) 600 pri = PRI_MAX_TIMESHARE; 601 else if (pri < PRI_MIN_TIMESHARE) 602 pri = PRI_MIN_TIMESHARE; 603 604 kg->kg_user_pri = pri; 605 606 return; 607} 608 609/* 610 * Calculate a time slice based on the properties of the kseg and the runq 611 * that we're on. This is only for PRI_TIMESHARE ksegrps. 612 */ 613static void 614sched_slice(struct kse *ke) 615{ 616 struct kseq *kseq; 617 struct ksegrp *kg; 618 619 kg = ke->ke_ksegrp; 620 kseq = KSEQ_CPU(ke->ke_cpu); 621 622 /* 623 * Rationale: 624 * KSEs in interactive ksegs get the minimum slice so that we 625 * quickly notice if it abuses its advantage. 626 * 627 * KSEs in non-interactive ksegs are assigned a slice that is 628 * based on the ksegs nice value relative to the least nice kseg 629 * on the run queue for this cpu. 630 * 631 * If the KSE is less nice than all others it gets the maximum 632 * slice and other KSEs will adjust their slice relative to 633 * this when they first expire. 634 * 635 * There is 20 point window that starts relative to the least 636 * nice kse on the run queue. Slice size is determined by 637 * the kse distance from the last nice ksegrp. 638 * 639 * If you are outside of the window you will get no slice and 640 * you will be reevaluated each time you are selected on the 641 * run queue. 642 * 643 */ 644 645 if (!SCHED_INTERACTIVE(kg)) { 646 int nice; 647 648 nice = kg->kg_nice + (0 - kseq->ksq_nicemin); 649 if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || 650 kg->kg_nice < kseq->ksq_nicemin) 651 ke->ke_slice = SCHED_SLICE_MAX; 652 else if (nice <= SCHED_PRI_NTHRESH) 653 ke->ke_slice = SCHED_SLICE_NICE(nice); 654 else 655 ke->ke_slice = 0; 656 } else 657 ke->ke_slice = SCHED_SLICE_MIN; 658 659 CTR6(KTR_ULE, 660 "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", 661 ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, 662 kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); 663 664 /* 665 * Check to see if we need to scale back the slp and run time 666 * in the kg. This will cause us to forget old interactivity 667 * while maintaining the current ratio. 668 */ 669 sched_interact_update(kg); 670 671 return; 672} 673 674static void 675sched_interact_update(struct ksegrp *kg) 676{ 677 int ratio; 678 if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { 679 ratio = (SCHED_SLP_RUN_MAX / 680 (kg->kg_runtime + kg->kg_slptime)) * 4; 681 kg->kg_runtime = (kg->kg_runtime * ratio) / 5; 682 kg->kg_slptime = (kg->kg_slptime * ratio) / 5; 683 } 684} 685 686static int 687sched_interact_score(struct ksegrp *kg) 688{ 689 int div; 690 691 if (kg->kg_runtime > kg->kg_slptime) { 692 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 693 return (SCHED_INTERACT_HALF + 694 (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 695 } if (kg->kg_slptime > kg->kg_runtime) { 696 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 697 return (kg->kg_runtime / div); 698 } 699 700 /* 701 * This can happen if slptime and runtime are 0. 702 */ 703 return (0); 704 705} 706 707/* 708 * This is only somewhat accurate since given many processes of the same 709 * priority they will switch when their slices run out, which will be 710 * at most SCHED_SLICE_MAX. 711 */ 712int 713sched_rr_interval(void) 714{ 715 return (SCHED_SLICE_MAX); 716} 717 718void 719sched_pctcpu_update(struct kse *ke) 720{ 721 /* 722 * Adjust counters and watermark for pctcpu calc. 723 */ 724 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 725 /* 726 * Shift the tick count out so that the divide doesn't 727 * round away our results. 728 */ 729 ke->ke_ticks <<= 10; 730 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 731 SCHED_CPU_TICKS; 732 ke->ke_ticks >>= 10; 733 } else 734 ke->ke_ticks = 0; 735 ke->ke_ltick = ticks; 736 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 737} 738 739#ifdef SMP 740/* XXX Should be changed to kseq_load_lowest() */ 741int 742sched_pickcpu(void) 743{ 744 struct kseq *kseq; 745 int load; 746 int cpu; 747 int i; 748 749 mtx_assert(&sched_lock, MA_OWNED); 750 if (!smp_started) 751 return (0); 752 753 load = 0; 754 cpu = 0; 755 756 for (i = 0; i < mp_maxid; i++) { 757 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 758 continue; 759 kseq = KSEQ_CPU(i); 760 if (kseq->ksq_load < load) { 761 cpu = i; 762 load = kseq->ksq_load; 763 } 764 } 765 766 CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); 767 return (cpu); 768} 769#else 770int 771sched_pickcpu(void) 772{ 773 return (0); 774} 775#endif 776 777void 778sched_prio(struct thread *td, u_char prio) 779{ 780 781 mtx_assert(&sched_lock, MA_OWNED); 782 if (TD_ON_RUNQ(td)) { 783 adjustrunqueue(td, prio); 784 } else { 785 td->td_priority = prio; 786 } 787} 788 789void 790sched_switchout(struct thread *td) 791{ 792 struct kse *ke; 793 794 mtx_assert(&sched_lock, MA_OWNED); 795 796 ke = td->td_kse; 797 798 td->td_last_kse = ke; 799 td->td_lastcpu = td->td_oncpu; 800 td->td_oncpu = NOCPU; 801 td->td_flags &= ~TDF_NEEDRESCHED; 802 803 if (TD_IS_RUNNING(td)) { 804 if (td->td_proc->p_flag & P_SA) { 805 kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); 806 setrunqueue(td); 807 } else { 808 /* 809 * This queue is always correct except for idle threads which 810 * have a higher priority due to priority propagation. 811 */ 812 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE && 813 ke->ke_thread->td_priority > PRI_MIN_IDLE) 814 ke->ke_runq = KSEQ_SELF()->ksq_curr; 815 runq_add(ke->ke_runq, ke); 816 /* setrunqueue(td); */ 817 } 818 return; 819 } 820 if (ke->ke_runq) 821 kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); 822 /* 823 * We will not be on the run queue. So we must be 824 * sleeping or similar. 825 */ 826 if (td->td_proc->p_flag & P_SA) 827 kse_reassign(ke); 828} 829 830void 831sched_switchin(struct thread *td) 832{ 833 /* struct kse *ke = td->td_kse; */ 834 mtx_assert(&sched_lock, MA_OWNED); 835 836 td->td_oncpu = PCPU_GET(cpuid); 837} 838 839void 840sched_nice(struct ksegrp *kg, int nice) 841{ 842 struct kse *ke; 843 struct thread *td; 844 struct kseq *kseq; 845 846 PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); 847 mtx_assert(&sched_lock, MA_OWNED); 848 /* 849 * We need to adjust the nice counts for running KSEs. 850 */ 851 if (kg->kg_pri_class == PRI_TIMESHARE) 852 FOREACH_KSE_IN_GROUP(kg, ke) { 853 if (ke->ke_runq == NULL) 854 continue; 855 kseq = KSEQ_CPU(ke->ke_cpu); 856 kseq_nice_rem(kseq, kg->kg_nice); 857 kseq_nice_add(kseq, nice); 858 } 859 kg->kg_nice = nice; 860 sched_priority(kg); 861 FOREACH_THREAD_IN_GROUP(kg, td) 862 td->td_flags |= TDF_NEEDRESCHED; 863} 864 865void 866sched_sleep(struct thread *td, u_char prio) 867{ 868 mtx_assert(&sched_lock, MA_OWNED); 869 870 td->td_slptime = ticks; 871 td->td_priority = prio; 872 873 CTR2(KTR_ULE, "sleep kse %p (tick: %d)", 874 td->td_kse, td->td_slptime); 875} 876 877void 878sched_wakeup(struct thread *td) 879{ 880 mtx_assert(&sched_lock, MA_OWNED); 881 882 /* 883 * Let the kseg know how long we slept for. This is because process 884 * interactivity behavior is modeled in the kseg. 885 */ 886 if (td->td_slptime) { 887 struct ksegrp *kg; 888 int hzticks; 889 890 kg = td->td_ksegrp; 891 hzticks = ticks - td->td_slptime; 892 kg->kg_slptime += hzticks << 10; 893 sched_interact_update(kg); 894 sched_priority(kg); 895 if (td->td_kse) 896 sched_slice(td->td_kse); 897 CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", 898 td->td_kse, hzticks); 899 td->td_slptime = 0; 900 } 901 setrunqueue(td); 902 if (td->td_priority < curthread->td_priority) 903 curthread->td_flags |= TDF_NEEDRESCHED; 904} 905 906/* 907 * Penalize the parent for creating a new child and initialize the child's 908 * priority. 909 */ 910void 911sched_fork(struct proc *p, struct proc *p1) 912{ 913 914 mtx_assert(&sched_lock, MA_OWNED); 915 916 sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); 917 sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); 918 sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); 919} 920 921void 922sched_fork_kse(struct kse *ke, struct kse *child) 923{ 924 925 child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 926 child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ 927 child->ke_runq = NULL; 928 929 /* Grab our parents cpu estimation information. */ 930 child->ke_ticks = ke->ke_ticks; 931 child->ke_ltick = ke->ke_ltick; 932 child->ke_ftick = ke->ke_ftick; 933} 934 935void 936sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) 937{ 938 939 PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); 940 /* XXX Need something better here */ 941 942 child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE; 943 child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE; 944 kg->kg_runtime += tickincr << 10; 945 sched_interact_update(kg); 946 947 child->kg_user_pri = kg->kg_user_pri; 948 child->kg_nice = kg->kg_nice; 949} 950 951void 952sched_fork_thread(struct thread *td, struct thread *child) 953{ 954} 955 956void 957sched_class(struct ksegrp *kg, int class) 958{ 959 struct kseq *kseq; 960 struct kse *ke; 961 962 mtx_assert(&sched_lock, MA_OWNED); 963 if (kg->kg_pri_class == class) 964 return; 965 966 FOREACH_KSE_IN_GROUP(kg, ke) { 967 if (ke->ke_state != KES_ONRUNQ && 968 ke->ke_state != KES_THREAD) 969 continue; 970 kseq = KSEQ_CPU(ke->ke_cpu); 971 972 kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; 973 kseq->ksq_loads[PRI_BASE(class)]++; 974 975 if (kg->kg_pri_class == PRI_TIMESHARE) 976 kseq_nice_rem(kseq, kg->kg_nice); 977 else if (class == PRI_TIMESHARE) 978 kseq_nice_add(kseq, kg->kg_nice); 979 } 980 981 kg->kg_pri_class = class; 982} 983 984/* 985 * Return some of the child's priority and interactivity to the parent. 986 */ 987void 988sched_exit(struct proc *p, struct proc *child) 989{ 990 /* XXX Need something better here */ 991 mtx_assert(&sched_lock, MA_OWNED); 992 sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); 993 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); 994} 995 996void 997sched_exit_kse(struct kse *ke, struct kse *child) 998{ 999 kseq_rem(KSEQ_CPU(child->ke_cpu), child); 1000} 1001 1002void 1003sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) 1004{ 1005 /* kg->kg_slptime += child->kg_slptime; */ 1006 kg->kg_runtime += child->kg_runtime; 1007 sched_interact_update(kg); 1008} 1009 1010void 1011sched_exit_thread(struct thread *td, struct thread *child) 1012{ 1013} 1014 1015void 1016sched_clock(struct kse *ke) 1017{ 1018 struct kseq *kseq; 1019 struct ksegrp *kg; 1020 struct thread *td; 1021#if 0 1022 struct kse *nke; 1023#endif 1024 1025 /* 1026 * sched_setup() apparently happens prior to stathz being set. We 1027 * need to resolve the timers earlier in the boot so we can avoid 1028 * calculating this here. 1029 */ 1030 if (realstathz == 0) { 1031 realstathz = stathz ? stathz : hz; 1032 tickincr = hz / realstathz; 1033 /* 1034 * XXX This does not work for values of stathz that are much 1035 * larger than hz. 1036 */ 1037 if (tickincr == 0) 1038 tickincr = 1; 1039 } 1040 1041 td = ke->ke_thread; 1042 kg = ke->ke_ksegrp; 1043 1044 mtx_assert(&sched_lock, MA_OWNED); 1045 KASSERT((td != NULL), ("schedclock: null thread pointer")); 1046 1047 /* Adjust ticks for pctcpu */ 1048 ke->ke_ticks++; 1049 ke->ke_ltick = ticks; 1050 1051 /* Go up to one second beyond our max and then trim back down */ 1052 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1053 sched_pctcpu_update(ke); 1054 1055 if (td->td_flags & TDF_IDLETD) 1056 return; 1057 1058 CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", 1059 ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); 1060 1061 /* 1062 * We only do slicing code for TIMESHARE ksegrps. 1063 */ 1064 if (kg->kg_pri_class != PRI_TIMESHARE) 1065 return; 1066 /* 1067 * Check for a higher priority task on the run queue. This can happen 1068 * on SMP if another processor woke up a process on our runq. 1069 */ 1070 kseq = KSEQ_SELF(); 1071#if 0 1072 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq, 0)) != NULL) { 1073 if (sched_strict && 1074 nke->ke_thread->td_priority < td->td_priority) 1075 td->td_flags |= TDF_NEEDRESCHED; 1076 else if (nke->ke_thread->td_priority < 1077 td->td_priority SCHED_PRIO_SLOP) 1078 1079 if (nke->ke_thread->td_priority < td->td_priority) 1080 td->td_flags |= TDF_NEEDRESCHED; 1081 } 1082#endif 1083 /* 1084 * We used a tick charge it to the ksegrp so that we can compute our 1085 * interactivity. 1086 */ 1087 kg->kg_runtime += tickincr << 10; 1088 sched_interact_update(kg); 1089 1090 /* 1091 * We used up one time slice. 1092 */ 1093 ke->ke_slice--; 1094#ifdef SMP 1095 kseq->ksq_rslices--; 1096#endif 1097 1098 if (ke->ke_slice > 0) 1099 return; 1100 /* 1101 * We're out of time, recompute priorities and requeue. 1102 */ 1103 kseq_rem(kseq, ke); 1104 sched_priority(kg); 1105 sched_slice(ke); 1106 if (SCHED_CURR(kg, ke)) 1107 ke->ke_runq = kseq->ksq_curr; 1108 else 1109 ke->ke_runq = kseq->ksq_next; 1110 kseq_add(kseq, ke); 1111 td->td_flags |= TDF_NEEDRESCHED; 1112} 1113 1114int 1115sched_runnable(void) 1116{ 1117 struct kseq *kseq; 1118 int load; 1119 1120 load = 1; 1121 1122 mtx_lock_spin(&sched_lock); 1123 kseq = KSEQ_SELF(); 1124 1125 if (kseq->ksq_load) 1126 goto out; 1127#ifdef SMP 1128 /* 1129 * For SMP we may steal other processor's KSEs. Just search until we 1130 * verify that at least on other cpu has a runnable task. 1131 */ 1132 if (smp_started) { 1133 int i; 1134 1135 for (i = 0; i < mp_maxid; i++) { 1136 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 1137 continue; 1138 kseq = KSEQ_CPU(i); 1139 if (kseq->ksq_load > kseq->ksq_cpus) 1140 goto out; 1141 } 1142 } 1143#endif 1144 load = 0; 1145out: 1146 mtx_unlock_spin(&sched_lock); 1147 return (load); 1148} 1149 1150void 1151sched_userret(struct thread *td) 1152{ 1153 struct ksegrp *kg; 1154#if 0 1155 struct kseq *kseq; 1156 struct kse *ke; 1157#endif 1158 1159 kg = td->td_ksegrp; 1160 1161 if (td->td_priority != kg->kg_user_pri) { 1162 mtx_lock_spin(&sched_lock); 1163 td->td_priority = kg->kg_user_pri; 1164 /* 1165 * This optimization is temporarily disabled because it 1166 * breaks priority propagation. 1167 */ 1168#if 0 1169 kseq = KSEQ_SELF(); 1170 if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && 1171#ifdef SMP 1172 kseq->ksq_load > kseq->ksq_cpus && 1173#else 1174 kseq->ksq_load > 1 && 1175#endif 1176 (ke = kseq_choose(kseq, 0)) != NULL && 1177 ke->ke_thread->td_priority < td->td_priority) 1178#endif 1179 curthread->td_flags |= TDF_NEEDRESCHED; 1180 mtx_unlock_spin(&sched_lock); 1181 } 1182} 1183 1184struct kse * 1185sched_choose(void) 1186{ 1187 struct kseq *kseq; 1188 struct kse *ke; 1189 1190 mtx_assert(&sched_lock, MA_OWNED); 1191#ifdef SMP 1192retry: 1193#endif 1194 kseq = KSEQ_SELF(); 1195 ke = kseq_choose(kseq, 0); 1196 if (ke) { 1197 runq_remove(ke->ke_runq, ke); 1198 ke->ke_state = KES_THREAD; 1199 1200 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1201 CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1202 ke, ke->ke_runq, ke->ke_slice, 1203 ke->ke_thread->td_priority); 1204 } 1205 return (ke); 1206 } 1207 1208#ifdef SMP 1209 if (smp_started) { 1210 /* 1211 * Find the cpu with the highest load and steal one proc. 1212 */ 1213 if ((kseq = kseq_load_highest()) == NULL) 1214 return (NULL); 1215 1216 /* 1217 * Remove this kse from this kseq and runq and then requeue 1218 * on the current processor. Then we will dequeue it 1219 * normally above. 1220 */ 1221 kseq_move(kseq, PCPU_GET(cpuid)); 1222 goto retry; 1223 } 1224#endif 1225 1226 return (NULL); 1227} 1228 1229void 1230sched_add(struct kse *ke) 1231{ 1232 struct kseq *kseq; 1233 struct ksegrp *kg; 1234 1235 mtx_assert(&sched_lock, MA_OWNED); 1236 KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); 1237 KASSERT((ke->ke_thread->td_kse != NULL), 1238 ("sched_add: No KSE on thread")); 1239 KASSERT(ke->ke_state != KES_ONRUNQ, 1240 ("sched_add: kse %p (%s) already in run queue", ke, 1241 ke->ke_proc->p_comm)); 1242 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1243 ("sched_add: process swapped out")); 1244 KASSERT(ke->ke_runq == NULL, 1245 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1246 1247 kg = ke->ke_ksegrp; 1248 1249 switch (PRI_BASE(kg->kg_pri_class)) { 1250 case PRI_ITHD: 1251 case PRI_REALTIME: 1252 kseq = KSEQ_SELF(); 1253 ke->ke_runq = kseq->ksq_curr; 1254 ke->ke_slice = SCHED_SLICE_MAX; 1255 ke->ke_cpu = PCPU_GET(cpuid); 1256 break; 1257 case PRI_TIMESHARE: 1258 kseq = KSEQ_CPU(ke->ke_cpu); 1259 if (SCHED_CURR(kg, ke)) 1260 ke->ke_runq = kseq->ksq_curr; 1261 else 1262 ke->ke_runq = kseq->ksq_next; 1263 break; 1264 case PRI_IDLE: 1265 kseq = KSEQ_CPU(ke->ke_cpu); 1266 /* 1267 * This is for priority prop. 1268 */ 1269 if (ke->ke_thread->td_priority > PRI_MIN_IDLE) 1270 ke->ke_runq = kseq->ksq_curr; 1271 else 1272 ke->ke_runq = &kseq->ksq_idle; 1273 ke->ke_slice = SCHED_SLICE_MIN; 1274 break; 1275 default: 1276 panic("Unknown pri class.\n"); 1277 break; 1278 } 1279 1280 ke->ke_ksegrp->kg_runq_kses++; 1281 ke->ke_state = KES_ONRUNQ; 1282 1283 runq_add(ke->ke_runq, ke); 1284 kseq_add(kseq, ke); 1285} 1286 1287void 1288sched_rem(struct kse *ke) 1289{ 1290 struct kseq *kseq; 1291 1292 mtx_assert(&sched_lock, MA_OWNED); 1293 KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); 1294 1295 ke->ke_state = KES_THREAD; 1296 ke->ke_ksegrp->kg_runq_kses--; 1297 kseq = KSEQ_CPU(ke->ke_cpu); 1298 runq_remove(ke->ke_runq, ke); 1299 kseq_rem(kseq, ke); 1300} 1301 1302fixpt_t 1303sched_pctcpu(struct kse *ke) 1304{ 1305 fixpt_t pctcpu; 1306 1307 pctcpu = 0; 1308 1309 mtx_lock_spin(&sched_lock); 1310 if (ke->ke_ticks) { 1311 int rtick; 1312 1313 /* 1314 * Don't update more frequently than twice a second. Allowing 1315 * this causes the cpu usage to decay away too quickly due to 1316 * rounding errors. 1317 */ 1318 if (ke->ke_ltick < (ticks - (hz / 2))) 1319 sched_pctcpu_update(ke); 1320 /* How many rtick per second ? */ 1321 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1322 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1323 } 1324 1325 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1326 mtx_unlock_spin(&sched_lock); 1327 1328 return (pctcpu); 1329} 1330 1331int 1332sched_sizeof_kse(void) 1333{ 1334 return (sizeof(struct kse) + sizeof(struct ke_sched)); 1335} 1336 1337int 1338sched_sizeof_ksegrp(void) 1339{ 1340 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1341} 1342 1343int 1344sched_sizeof_proc(void) 1345{ 1346 return (sizeof(struct proc)); 1347} 1348 1349int 1350sched_sizeof_thread(void) 1351{ 1352 return (sizeof(struct thread) + sizeof(struct td_sched)); 1353} 1354