sched_ule.c revision 117237
1/*- 2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 117237 2003-07-04 19:59:00Z jeff $"); 29 30#include <sys/param.h> 31#include <sys/systm.h> 32#include <sys/kernel.h> 33#include <sys/ktr.h> 34#include <sys/lock.h> 35#include <sys/mutex.h> 36#include <sys/proc.h> 37#include <sys/resource.h> 38#include <sys/sched.h> 39#include <sys/smp.h> 40#include <sys/sx.h> 41#include <sys/sysctl.h> 42#include <sys/sysproto.h> 43#include <sys/vmmeter.h> 44#ifdef DDB 45#include <ddb/ddb.h> 46#endif 47#ifdef KTRACE 48#include <sys/uio.h> 49#include <sys/ktrace.h> 50#endif 51 52#include <machine/cpu.h> 53 54#define KTR_ULE KTR_NFS 55 56/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 57/* XXX This is bogus compatability crap for ps */ 58static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 59SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 60 61static void sched_setup(void *dummy); 62SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 63 64static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); 65 66static int sched_strict; 67SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); 68 69static int slice_min = 1; 70SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 71 72static int slice_max = 10; 73SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 74 75int realstathz; 76int tickincr = 1; 77 78#ifdef SMP 79/* Callout to handle load balancing SMP systems. */ 80static struct callout kseq_lb_callout; 81#endif 82 83/* 84 * These datastructures are allocated within their parent datastructure but 85 * are scheduler specific. 86 */ 87 88struct ke_sched { 89 int ske_slice; 90 struct runq *ske_runq; 91 /* The following variables are only used for pctcpu calculation */ 92 int ske_ltick; /* Last tick that we were running on */ 93 int ske_ftick; /* First tick that we were running on */ 94 int ske_ticks; /* Tick count */ 95 /* CPU that we have affinity for. */ 96 u_char ske_cpu; 97}; 98#define ke_slice ke_sched->ske_slice 99#define ke_runq ke_sched->ske_runq 100#define ke_ltick ke_sched->ske_ltick 101#define ke_ftick ke_sched->ske_ftick 102#define ke_ticks ke_sched->ske_ticks 103#define ke_cpu ke_sched->ske_cpu 104 105struct kg_sched { 106 int skg_slptime; /* Number of ticks we vol. slept */ 107 int skg_runtime; /* Number of ticks we were running */ 108}; 109#define kg_slptime kg_sched->skg_slptime 110#define kg_runtime kg_sched->skg_runtime 111 112struct td_sched { 113 int std_slptime; 114}; 115#define td_slptime td_sched->std_slptime 116 117struct td_sched td_sched; 118struct ke_sched ke_sched; 119struct kg_sched kg_sched; 120 121struct ke_sched *kse0_sched = &ke_sched; 122struct kg_sched *ksegrp0_sched = &kg_sched; 123struct p_sched *proc0_sched = NULL; 124struct td_sched *thread0_sched = &td_sched; 125 126/* 127 * The priority is primarily determined by the interactivity score. Thus, we 128 * give lower(better) priorities to kse groups that use less CPU. The nice 129 * value is then directly added to this to allow nice to have some effect 130 * on latency. 131 * 132 * PRI_RANGE: Total priority range for timeshare threads. 133 * PRI_NRESV: Number of nice values. 134 * PRI_BASE: The start of the dynamic range. 135 */ 136#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 137#define SCHED_PRI_NRESV PRIO_TOTAL 138#define SCHED_PRI_NHALF (PRIO_TOTAL / 2) 139#define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) 140#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 141#define SCHED_PRI_INTERACT(score) \ 142 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 143 144/* 145 * These determine the interactivity of a process. 146 * 147 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 148 * before throttling back. 149 * SLP_RUN_THROTTLE: Divisor for reducing slp/run time at fork time. 150 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 151 * INTERACT_THRESH: Threshhold for placement on the current runq. 152 */ 153#define SCHED_SLP_RUN_MAX ((hz * 2) << 10) 154#define SCHED_SLP_RUN_THROTTLE (100) 155#define SCHED_INTERACT_MAX (100) 156#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 157#define SCHED_INTERACT_THRESH (20) 158 159/* 160 * These parameters and macros determine the size of the time slice that is 161 * granted to each thread. 162 * 163 * SLICE_MIN: Minimum time slice granted, in units of ticks. 164 * SLICE_MAX: Maximum time slice granted. 165 * SLICE_RANGE: Range of available time slices scaled by hz. 166 * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 167 * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 168 */ 169#define SCHED_SLICE_MIN (slice_min) 170#define SCHED_SLICE_MAX (slice_max) 171#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 172#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 173#define SCHED_SLICE_NICE(nice) \ 174 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) 175 176/* 177 * This macro determines whether or not the kse belongs on the current or 178 * next run queue. 179 * 180 * XXX nice value should effect how interactive a kg is. 181 */ 182#define SCHED_INTERACTIVE(kg) \ 183 (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 184#define SCHED_CURR(kg, ke) \ 185 (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg)) 186 187/* 188 * Cpu percentage computation macros and defines. 189 * 190 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 191 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 192 */ 193 194#define SCHED_CPU_TIME 10 195#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 196 197/* 198 * kseq - per processor runqs and statistics. 199 */ 200 201#define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ 202 203struct kseq { 204 struct runq ksq_idle; /* Queue of IDLE threads. */ 205 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 206 struct runq *ksq_next; /* Next timeshare queue. */ 207 struct runq *ksq_curr; /* Current queue. */ 208 int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ 209 int ksq_load; /* Aggregate load. */ 210 short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ 211 short ksq_nicemin; /* Least nice. */ 212#ifdef SMP 213 int ksq_cpus; /* Count of CPUs in this kseq. */ 214 unsigned int ksq_rslices; /* Slices on run queue */ 215#endif 216}; 217 218/* 219 * One kse queue per processor. 220 */ 221#ifdef SMP 222struct kseq kseq_cpu[MAXCPU]; 223struct kseq *kseq_idmap[MAXCPU]; 224#define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)]) 225#define KSEQ_CPU(x) (kseq_idmap[(x)]) 226#else 227struct kseq kseq_cpu; 228#define KSEQ_SELF() (&kseq_cpu) 229#define KSEQ_CPU(x) (&kseq_cpu) 230#endif 231 232static void sched_slice(struct kse *ke); 233static void sched_priority(struct ksegrp *kg); 234static int sched_interact_score(struct ksegrp *kg); 235static void sched_interact_update(struct ksegrp *kg); 236void sched_pctcpu_update(struct kse *ke); 237int sched_pickcpu(void); 238 239/* Operations on per processor queues */ 240static struct kse * kseq_choose(struct kseq *kseq); 241static void kseq_setup(struct kseq *kseq); 242static void kseq_add(struct kseq *kseq, struct kse *ke); 243static void kseq_rem(struct kseq *kseq, struct kse *ke); 244static void kseq_nice_add(struct kseq *kseq, int nice); 245static void kseq_nice_rem(struct kseq *kseq, int nice); 246void kseq_print(int cpu); 247#ifdef SMP 248struct kseq * kseq_load_highest(void); 249void kseq_balance(void *arg); 250void kseq_move(struct kseq *from, int cpu); 251#endif 252 253void 254kseq_print(int cpu) 255{ 256 struct kseq *kseq; 257 int i; 258 259 kseq = KSEQ_CPU(cpu); 260 261 printf("kseq:\n"); 262 printf("\tload: %d\n", kseq->ksq_load); 263 printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); 264 printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); 265 printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); 266 printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); 267 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 268 printf("\tnice counts:\n"); 269 for (i = 0; i < PRIO_TOTAL + 1; i++) 270 if (kseq->ksq_nice[i]) 271 printf("\t\t%d = %d\n", 272 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 273} 274 275static void 276kseq_add(struct kseq *kseq, struct kse *ke) 277{ 278 mtx_assert(&sched_lock, MA_OWNED); 279 kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; 280 kseq->ksq_load++; 281 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 282 CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 283 ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 284 ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); 285 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 286 kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); 287#ifdef SMP 288 kseq->ksq_rslices += ke->ke_slice; 289#endif 290} 291 292static void 293kseq_rem(struct kseq *kseq, struct kse *ke) 294{ 295 mtx_assert(&sched_lock, MA_OWNED); 296 kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; 297 kseq->ksq_load--; 298 ke->ke_runq = NULL; 299 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 300 kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); 301#ifdef SMP 302 kseq->ksq_rslices -= ke->ke_slice; 303#endif 304} 305 306static void 307kseq_nice_add(struct kseq *kseq, int nice) 308{ 309 mtx_assert(&sched_lock, MA_OWNED); 310 /* Normalize to zero. */ 311 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 312 if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) 313 kseq->ksq_nicemin = nice; 314} 315 316static void 317kseq_nice_rem(struct kseq *kseq, int nice) 318{ 319 int n; 320 321 mtx_assert(&sched_lock, MA_OWNED); 322 /* Normalize to zero. */ 323 n = nice + SCHED_PRI_NHALF; 324 kseq->ksq_nice[n]--; 325 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 326 327 /* 328 * If this wasn't the smallest nice value or there are more in 329 * this bucket we can just return. Otherwise we have to recalculate 330 * the smallest nice. 331 */ 332 if (nice != kseq->ksq_nicemin || 333 kseq->ksq_nice[n] != 0 || 334 kseq->ksq_loads[PRI_TIMESHARE] == 0) 335 return; 336 337 for (; n < SCHED_PRI_NRESV + 1; n++) 338 if (kseq->ksq_nice[n]) { 339 kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 340 return; 341 } 342} 343 344#ifdef SMP 345/* 346 * kseq_balance is a simple CPU load balancing algorithm. It operates by 347 * finding the least loaded and most loaded cpu and equalizing their load 348 * by migrating some processes. 349 * 350 * Dealing only with two CPUs at a time has two advantages. Firstly, most 351 * installations will only have 2 cpus. Secondly, load balancing too much at 352 * once can have an unpleasant effect on the system. The scheduler rarely has 353 * enough information to make perfect decisions. So this algorithm chooses 354 * algorithm simplicity and more gradual effects on load in larger systems. 355 * 356 * It could be improved by considering the priorities and slices assigned to 357 * each task prior to balancing them. There are many pathological cases with 358 * any approach and so the semi random algorithm below may work as well as any. 359 * 360 */ 361void 362kseq_balance(void *arg) 363{ 364 struct kseq *kseq; 365 int high_load; 366 int low_load; 367 int high_cpu; 368 int low_cpu; 369 int move; 370 int diff; 371 int i; 372 373 high_cpu = 0; 374 low_cpu = 0; 375 high_load = 0; 376 low_load = -1; 377 378 mtx_lock_spin(&sched_lock); 379 if (smp_started == 0) 380 goto out; 381 382 for (i = 0; i < mp_maxid; i++) { 383 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 384 continue; 385 kseq = KSEQ_CPU(i); 386 if (kseq->ksq_load > high_load) { 387 high_load = kseq->ksq_load; 388 high_cpu = i; 389 } 390 if (low_load == -1 || kseq->ksq_load < low_load) { 391 low_load = kseq->ksq_load; 392 low_cpu = i; 393 } 394 } 395 396 kseq = KSEQ_CPU(high_cpu); 397 398 /* 399 * Nothing to do. 400 */ 401 if (high_load < kseq->ksq_cpus + 1) 402 goto out; 403 404 high_load -= kseq->ksq_cpus; 405 406 if (low_load >= high_load) 407 goto out; 408 409 diff = high_load - low_load; 410 move = diff / 2; 411 if (diff & 0x1) 412 move++; 413 414 for (i = 0; i < move; i++) 415 kseq_move(kseq, low_cpu); 416 417out: 418 mtx_unlock_spin(&sched_lock); 419 callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); 420 421 return; 422} 423 424struct kseq * 425kseq_load_highest(void) 426{ 427 struct kseq *kseq; 428 int load; 429 int cpu; 430 int i; 431 432 mtx_assert(&sched_lock, MA_OWNED); 433 cpu = 0; 434 load = 0; 435 436 for (i = 0; i < mp_maxid; i++) { 437 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 438 continue; 439 kseq = KSEQ_CPU(i); 440 if (kseq->ksq_load > load) { 441 load = kseq->ksq_load; 442 cpu = i; 443 } 444 } 445 kseq = KSEQ_CPU(cpu); 446 447 if (load > kseq->ksq_cpus) 448 return (kseq); 449 450 return (NULL); 451} 452 453void 454kseq_move(struct kseq *from, int cpu) 455{ 456 struct kse *ke; 457 458 ke = kseq_choose(from); 459 runq_remove(ke->ke_runq, ke); 460 ke->ke_state = KES_THREAD; 461 kseq_rem(from, ke); 462 463 ke->ke_cpu = cpu; 464 sched_add(ke); 465} 466#endif 467 468struct kse * 469kseq_choose(struct kseq *kseq) 470{ 471 struct kse *ke; 472 struct runq *swap; 473 474 mtx_assert(&sched_lock, MA_OWNED); 475 swap = NULL; 476 477 for (;;) { 478 ke = runq_choose(kseq->ksq_curr); 479 if (ke == NULL) { 480 /* 481 * We already swaped once and didn't get anywhere. 482 */ 483 if (swap) 484 break; 485 swap = kseq->ksq_curr; 486 kseq->ksq_curr = kseq->ksq_next; 487 kseq->ksq_next = swap; 488 continue; 489 } 490 /* 491 * If we encounter a slice of 0 the kse is in a 492 * TIMESHARE kse group and its nice was too far out 493 * of the range that receives slices. 494 */ 495 if (ke->ke_slice == 0) { 496 runq_remove(ke->ke_runq, ke); 497 sched_slice(ke); 498 ke->ke_runq = kseq->ksq_next; 499 runq_add(ke->ke_runq, ke); 500 continue; 501 } 502 return (ke); 503 } 504 505 return (runq_choose(&kseq->ksq_idle)); 506} 507 508static void 509kseq_setup(struct kseq *kseq) 510{ 511 runq_init(&kseq->ksq_timeshare[0]); 512 runq_init(&kseq->ksq_timeshare[1]); 513 runq_init(&kseq->ksq_idle); 514 515 kseq->ksq_curr = &kseq->ksq_timeshare[0]; 516 kseq->ksq_next = &kseq->ksq_timeshare[1]; 517 518 kseq->ksq_loads[PRI_ITHD] = 0; 519 kseq->ksq_loads[PRI_REALTIME] = 0; 520 kseq->ksq_loads[PRI_TIMESHARE] = 0; 521 kseq->ksq_loads[PRI_IDLE] = 0; 522 kseq->ksq_load = 0; 523#ifdef SMP 524 kseq->ksq_rslices = 0; 525#endif 526} 527 528static void 529sched_setup(void *dummy) 530{ 531 int i; 532 533 slice_min = (hz/100); /* 10ms */ 534 slice_max = (hz/7); /* ~140ms */ 535 536#ifdef SMP 537 /* init kseqs */ 538 /* Create the idmap. */ 539#ifdef ULE_HTT_EXPERIMENTAL 540 if (smp_topology == NULL) { 541#else 542 if (1) { 543#endif 544 for (i = 0; i < MAXCPU; i++) { 545 kseq_setup(&kseq_cpu[i]); 546 kseq_idmap[i] = &kseq_cpu[i]; 547 kseq_cpu[i].ksq_cpus = 1; 548 } 549 } else { 550 int j; 551 552 for (i = 0; i < smp_topology->ct_count; i++) { 553 struct cpu_group *cg; 554 555 cg = &smp_topology->ct_group[i]; 556 kseq_setup(&kseq_cpu[i]); 557 558 for (j = 0; j < MAXCPU; j++) 559 if ((cg->cg_mask & (1 << j)) != 0) 560 kseq_idmap[j] = &kseq_cpu[i]; 561 kseq_cpu[i].ksq_cpus = cg->cg_count; 562 } 563 } 564 callout_init(&kseq_lb_callout, 1); 565 kseq_balance(NULL); 566#else 567 kseq_setup(KSEQ_SELF()); 568#endif 569 mtx_lock_spin(&sched_lock); 570 kseq_add(KSEQ_SELF(), &kse0); 571 mtx_unlock_spin(&sched_lock); 572} 573 574/* 575 * Scale the scheduling priority according to the "interactivity" of this 576 * process. 577 */ 578static void 579sched_priority(struct ksegrp *kg) 580{ 581 int pri; 582 583 if (kg->kg_pri_class != PRI_TIMESHARE) 584 return; 585 586 pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 587 pri += SCHED_PRI_BASE; 588 pri += kg->kg_nice; 589 590 if (pri > PRI_MAX_TIMESHARE) 591 pri = PRI_MAX_TIMESHARE; 592 else if (pri < PRI_MIN_TIMESHARE) 593 pri = PRI_MIN_TIMESHARE; 594 595 kg->kg_user_pri = pri; 596 597 return; 598} 599 600/* 601 * Calculate a time slice based on the properties of the kseg and the runq 602 * that we're on. This is only for PRI_TIMESHARE ksegrps. 603 */ 604static void 605sched_slice(struct kse *ke) 606{ 607 struct kseq *kseq; 608 struct ksegrp *kg; 609 610 kg = ke->ke_ksegrp; 611 kseq = KSEQ_CPU(ke->ke_cpu); 612 613 /* 614 * Rationale: 615 * KSEs in interactive ksegs get the minimum slice so that we 616 * quickly notice if it abuses its advantage. 617 * 618 * KSEs in non-interactive ksegs are assigned a slice that is 619 * based on the ksegs nice value relative to the least nice kseg 620 * on the run queue for this cpu. 621 * 622 * If the KSE is less nice than all others it gets the maximum 623 * slice and other KSEs will adjust their slice relative to 624 * this when they first expire. 625 * 626 * There is 20 point window that starts relative to the least 627 * nice kse on the run queue. Slice size is determined by 628 * the kse distance from the last nice ksegrp. 629 * 630 * If you are outside of the window you will get no slice and 631 * you will be reevaluated each time you are selected on the 632 * run queue. 633 * 634 */ 635 636 if (!SCHED_INTERACTIVE(kg)) { 637 int nice; 638 639 nice = kg->kg_nice + (0 - kseq->ksq_nicemin); 640 if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || 641 kg->kg_nice < kseq->ksq_nicemin) 642 ke->ke_slice = SCHED_SLICE_MAX; 643 else if (nice <= SCHED_PRI_NTHRESH) 644 ke->ke_slice = SCHED_SLICE_NICE(nice); 645 else 646 ke->ke_slice = 0; 647 } else 648 ke->ke_slice = SCHED_SLICE_MIN; 649 650 CTR6(KTR_ULE, 651 "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", 652 ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, 653 kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); 654 655 /* 656 * Check to see if we need to scale back the slp and run time 657 * in the kg. This will cause us to forget old interactivity 658 * while maintaining the current ratio. 659 */ 660 sched_interact_update(kg); 661 662 return; 663} 664 665static void 666sched_interact_update(struct ksegrp *kg) 667{ 668 /* XXX Fixme, use a linear algorithm and not a while loop. */ 669 while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { 670 kg->kg_runtime = (kg->kg_runtime / 5) * 4; 671 kg->kg_slptime = (kg->kg_slptime / 5) * 4; 672 } 673} 674 675static int 676sched_interact_score(struct ksegrp *kg) 677{ 678 int div; 679 680 if (kg->kg_runtime > kg->kg_slptime) { 681 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 682 return (SCHED_INTERACT_HALF + 683 (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 684 } if (kg->kg_slptime > kg->kg_runtime) { 685 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 686 return (kg->kg_runtime / div); 687 } 688 689 /* 690 * This can happen if slptime and runtime are 0. 691 */ 692 return (0); 693 694} 695 696/* 697 * This is only somewhat accurate since given many processes of the same 698 * priority they will switch when their slices run out, which will be 699 * at most SCHED_SLICE_MAX. 700 */ 701int 702sched_rr_interval(void) 703{ 704 return (SCHED_SLICE_MAX); 705} 706 707void 708sched_pctcpu_update(struct kse *ke) 709{ 710 /* 711 * Adjust counters and watermark for pctcpu calc. 712 */ 713 714 /* 715 * Shift the tick count out so that the divide doesn't round away 716 * our results. 717 */ 718 ke->ke_ticks <<= 10; 719 ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) * 720 SCHED_CPU_TICKS; 721 ke->ke_ticks >>= 10; 722 ke->ke_ltick = ticks; 723 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 724} 725 726#ifdef SMP 727/* XXX Should be changed to kseq_load_lowest() */ 728int 729sched_pickcpu(void) 730{ 731 struct kseq *kseq; 732 int load; 733 int cpu; 734 int i; 735 736 mtx_assert(&sched_lock, MA_OWNED); 737 if (!smp_started) 738 return (0); 739 740 load = 0; 741 cpu = 0; 742 743 for (i = 0; i < mp_maxid; i++) { 744 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 745 continue; 746 kseq = KSEQ_CPU(i); 747 if (kseq->ksq_load < load) { 748 cpu = i; 749 load = kseq->ksq_load; 750 } 751 } 752 753 CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); 754 return (cpu); 755} 756#else 757int 758sched_pickcpu(void) 759{ 760 return (0); 761} 762#endif 763 764void 765sched_prio(struct thread *td, u_char prio) 766{ 767 struct kse *ke; 768 struct runq *rq; 769 770 mtx_assert(&sched_lock, MA_OWNED); 771 ke = td->td_kse; 772 td->td_priority = prio; 773 774 if (TD_ON_RUNQ(td)) { 775 rq = ke->ke_runq; 776 777 runq_remove(rq, ke); 778 runq_add(rq, ke); 779 } 780} 781 782void 783sched_switchout(struct thread *td) 784{ 785 struct kse *ke; 786 787 mtx_assert(&sched_lock, MA_OWNED); 788 789 ke = td->td_kse; 790 791 td->td_last_kse = ke; 792 td->td_lastcpu = td->td_oncpu; 793 td->td_oncpu = NOCPU; 794 td->td_flags &= ~TDF_NEEDRESCHED; 795 796 if (TD_IS_RUNNING(td)) { 797 /* 798 * This queue is always correct except for idle threads which 799 * have a higher priority due to priority propagation. 800 */ 801 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE && 802 ke->ke_thread->td_priority > PRI_MIN_IDLE) 803 ke->ke_runq = KSEQ_SELF()->ksq_curr; 804 runq_add(ke->ke_runq, ke); 805 /* setrunqueue(td); */ 806 return; 807 } 808 if (ke->ke_runq) 809 kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); 810 /* 811 * We will not be on the run queue. So we must be 812 * sleeping or similar. 813 */ 814 if (td->td_proc->p_flag & P_SA) 815 kse_reassign(ke); 816} 817 818void 819sched_switchin(struct thread *td) 820{ 821 /* struct kse *ke = td->td_kse; */ 822 mtx_assert(&sched_lock, MA_OWNED); 823 824 td->td_oncpu = PCPU_GET(cpuid); 825} 826 827void 828sched_nice(struct ksegrp *kg, int nice) 829{ 830 struct kse *ke; 831 struct thread *td; 832 struct kseq *kseq; 833 834 PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); 835 mtx_assert(&sched_lock, MA_OWNED); 836 /* 837 * We need to adjust the nice counts for running KSEs. 838 */ 839 if (kg->kg_pri_class == PRI_TIMESHARE) 840 FOREACH_KSE_IN_GROUP(kg, ke) { 841 if (ke->ke_runq == NULL) 842 continue; 843 kseq = KSEQ_CPU(ke->ke_cpu); 844 kseq_nice_rem(kseq, kg->kg_nice); 845 kseq_nice_add(kseq, nice); 846 } 847 kg->kg_nice = nice; 848 sched_priority(kg); 849 FOREACH_THREAD_IN_GROUP(kg, td) 850 td->td_flags |= TDF_NEEDRESCHED; 851} 852 853void 854sched_sleep(struct thread *td, u_char prio) 855{ 856 mtx_assert(&sched_lock, MA_OWNED); 857 858 td->td_slptime = ticks; 859 td->td_priority = prio; 860 861 CTR2(KTR_ULE, "sleep kse %p (tick: %d)", 862 td->td_kse, td->td_slptime); 863} 864 865void 866sched_wakeup(struct thread *td) 867{ 868 mtx_assert(&sched_lock, MA_OWNED); 869 870 /* 871 * Let the kseg know how long we slept for. This is because process 872 * interactivity behavior is modeled in the kseg. 873 */ 874 if (td->td_slptime) { 875 struct ksegrp *kg; 876 int hzticks; 877 878 kg = td->td_ksegrp; 879 hzticks = ticks - td->td_slptime; 880 kg->kg_slptime += hzticks << 10; 881 sched_interact_update(kg); 882 sched_priority(kg); 883 if (td->td_kse) 884 sched_slice(td->td_kse); 885 CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", 886 td->td_kse, hzticks); 887 td->td_slptime = 0; 888 } 889 setrunqueue(td); 890 if (td->td_priority < curthread->td_priority) 891 curthread->td_flags |= TDF_NEEDRESCHED; 892} 893 894/* 895 * Penalize the parent for creating a new child and initialize the child's 896 * priority. 897 */ 898void 899sched_fork(struct proc *p, struct proc *p1) 900{ 901 902 mtx_assert(&sched_lock, MA_OWNED); 903 904 sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); 905 sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); 906 sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); 907} 908 909void 910sched_fork_kse(struct kse *ke, struct kse *child) 911{ 912 913 child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 914 child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ 915 child->ke_runq = NULL; 916 917 /* 918 * Claim that we've been running for one second for statistical 919 * purposes. 920 */ 921 child->ke_ticks = 0; 922 child->ke_ltick = ticks; 923 child->ke_ftick = ticks - hz; 924} 925 926void 927sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) 928{ 929 930 PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); 931 /* XXX Need something better here */ 932 933 child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE; 934 child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE; 935 kg->kg_runtime += tickincr << 10; 936 sched_interact_update(kg); 937 938 child->kg_user_pri = kg->kg_user_pri; 939 child->kg_nice = kg->kg_nice; 940} 941 942void 943sched_fork_thread(struct thread *td, struct thread *child) 944{ 945} 946 947void 948sched_class(struct ksegrp *kg, int class) 949{ 950 struct kseq *kseq; 951 struct kse *ke; 952 953 mtx_assert(&sched_lock, MA_OWNED); 954 if (kg->kg_pri_class == class) 955 return; 956 957 FOREACH_KSE_IN_GROUP(kg, ke) { 958 if (ke->ke_state != KES_ONRUNQ && 959 ke->ke_state != KES_THREAD) 960 continue; 961 kseq = KSEQ_CPU(ke->ke_cpu); 962 963 kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; 964 kseq->ksq_loads[PRI_BASE(class)]++; 965 966 if (kg->kg_pri_class == PRI_TIMESHARE) 967 kseq_nice_rem(kseq, kg->kg_nice); 968 else if (class == PRI_TIMESHARE) 969 kseq_nice_add(kseq, kg->kg_nice); 970 } 971 972 kg->kg_pri_class = class; 973} 974 975/* 976 * Return some of the child's priority and interactivity to the parent. 977 */ 978void 979sched_exit(struct proc *p, struct proc *child) 980{ 981 /* XXX Need something better here */ 982 mtx_assert(&sched_lock, MA_OWNED); 983 sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); 984 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); 985} 986 987void 988sched_exit_kse(struct kse *ke, struct kse *child) 989{ 990 kseq_rem(KSEQ_CPU(child->ke_cpu), child); 991} 992 993void 994sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) 995{ 996 /* kg->kg_slptime += child->kg_slptime; */ 997 kg->kg_runtime += child->kg_runtime; 998 sched_interact_update(kg); 999} 1000 1001void 1002sched_exit_thread(struct thread *td, struct thread *child) 1003{ 1004} 1005 1006void 1007sched_clock(struct kse *ke) 1008{ 1009 struct kseq *kseq; 1010 struct ksegrp *kg; 1011 struct thread *td; 1012#if 0 1013 struct kse *nke; 1014#endif 1015 1016 /* 1017 * sched_setup() apparently happens prior to stathz being set. We 1018 * need to resolve the timers earlier in the boot so we can avoid 1019 * calculating this here. 1020 */ 1021 if (realstathz == 0) { 1022 realstathz = stathz ? stathz : hz; 1023 tickincr = hz / realstathz; 1024 /* 1025 * XXX This does not work for values of stathz that are much 1026 * larger than hz. 1027 */ 1028 if (tickincr == 0) 1029 tickincr = 1; 1030 } 1031 1032 td = ke->ke_thread; 1033 kg = ke->ke_ksegrp; 1034 1035 mtx_assert(&sched_lock, MA_OWNED); 1036 KASSERT((td != NULL), ("schedclock: null thread pointer")); 1037 1038 /* Adjust ticks for pctcpu */ 1039 ke->ke_ticks++; 1040 ke->ke_ltick = ticks; 1041 1042 /* Go up to one second beyond our max and then trim back down */ 1043 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1044 sched_pctcpu_update(ke); 1045 1046 if (td->td_flags & TDF_IDLETD) 1047 return; 1048 1049 CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", 1050 ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); 1051 1052 /* 1053 * We only do slicing code for TIMESHARE ksegrps. 1054 */ 1055 if (kg->kg_pri_class != PRI_TIMESHARE) 1056 return; 1057 /* 1058 * Check for a higher priority task on the run queue. This can happen 1059 * on SMP if another processor woke up a process on our runq. 1060 */ 1061 kseq = KSEQ_SELF(); 1062#if 0 1063 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) { 1064 if (sched_strict && 1065 nke->ke_thread->td_priority < td->td_priority) 1066 td->td_flags |= TDF_NEEDRESCHED; 1067 else if (nke->ke_thread->td_priority < 1068 td->td_priority SCHED_PRIO_SLOP) 1069 1070 if (nke->ke_thread->td_priority < td->td_priority) 1071 td->td_flags |= TDF_NEEDRESCHED; 1072 } 1073#endif 1074 /* 1075 * We used a tick charge it to the ksegrp so that we can compute our 1076 * interactivity. 1077 */ 1078 kg->kg_runtime += tickincr << 10; 1079 sched_interact_update(kg); 1080 1081 /* 1082 * We used up one time slice. 1083 */ 1084 ke->ke_slice--; 1085#ifdef SMP 1086 kseq->ksq_rslices--; 1087#endif 1088 1089 if (ke->ke_slice > 0) 1090 return; 1091 /* 1092 * We're out of time, recompute priorities and requeue. 1093 */ 1094 kseq_rem(kseq, ke); 1095 sched_priority(kg); 1096 sched_slice(ke); 1097 if (SCHED_CURR(kg, ke)) 1098 ke->ke_runq = kseq->ksq_curr; 1099 else 1100 ke->ke_runq = kseq->ksq_next; 1101 kseq_add(kseq, ke); 1102 td->td_flags |= TDF_NEEDRESCHED; 1103} 1104 1105int 1106sched_runnable(void) 1107{ 1108 struct kseq *kseq; 1109 int load; 1110 1111 load = 1; 1112 1113 mtx_lock_spin(&sched_lock); 1114 kseq = KSEQ_SELF(); 1115 1116 if (kseq->ksq_load) 1117 goto out; 1118#ifdef SMP 1119 /* 1120 * For SMP we may steal other processor's KSEs. Just search until we 1121 * verify that at least on other cpu has a runnable task. 1122 */ 1123 if (smp_started) { 1124 int i; 1125 1126 for (i = 0; i < mp_maxid; i++) { 1127 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 1128 continue; 1129 kseq = KSEQ_CPU(i); 1130 if (kseq->ksq_load > kseq->ksq_cpus) 1131 goto out; 1132 } 1133 } 1134#endif 1135 load = 0; 1136out: 1137 mtx_unlock_spin(&sched_lock); 1138 return (load); 1139} 1140 1141void 1142sched_userret(struct thread *td) 1143{ 1144 struct ksegrp *kg; 1145 struct kseq *kseq; 1146 struct kse *ke; 1147 1148 kg = td->td_ksegrp; 1149 1150 if (td->td_priority != kg->kg_user_pri) { 1151 mtx_lock_spin(&sched_lock); 1152 td->td_priority = kg->kg_user_pri; 1153 kseq = KSEQ_SELF(); 1154 if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && 1155#ifdef SMP 1156 kseq->ksq_load > kseq->ksq_cpus && 1157#else 1158 kseq->ksq_load > 1 && 1159#endif 1160 (ke = kseq_choose(kseq)) != NULL && 1161 ke->ke_thread->td_priority < td->td_priority) 1162 curthread->td_flags |= TDF_NEEDRESCHED; 1163 mtx_unlock_spin(&sched_lock); 1164 } 1165} 1166 1167struct kse * 1168sched_choose(void) 1169{ 1170 struct kseq *kseq; 1171 struct kse *ke; 1172 1173 mtx_assert(&sched_lock, MA_OWNED); 1174#ifdef SMP 1175retry: 1176#endif 1177 kseq = KSEQ_SELF(); 1178 ke = kseq_choose(kseq); 1179 if (ke) { 1180 runq_remove(ke->ke_runq, ke); 1181 ke->ke_state = KES_THREAD; 1182 1183 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1184 CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1185 ke, ke->ke_runq, ke->ke_slice, 1186 ke->ke_thread->td_priority); 1187 } 1188 return (ke); 1189 } 1190 1191#ifdef SMP 1192 if (smp_started) { 1193 /* 1194 * Find the cpu with the highest load and steal one proc. 1195 */ 1196 if ((kseq = kseq_load_highest()) == NULL) 1197 return (NULL); 1198 1199 /* 1200 * Remove this kse from this kseq and runq and then requeue 1201 * on the current processor. Then we will dequeue it 1202 * normally above. 1203 */ 1204 kseq_move(kseq, PCPU_GET(cpuid)); 1205 goto retry; 1206 } 1207#endif 1208 1209 return (NULL); 1210} 1211 1212void 1213sched_add(struct kse *ke) 1214{ 1215 struct kseq *kseq; 1216 struct ksegrp *kg; 1217 1218 mtx_assert(&sched_lock, MA_OWNED); 1219 KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); 1220 KASSERT((ke->ke_thread->td_kse != NULL), 1221 ("sched_add: No KSE on thread")); 1222 KASSERT(ke->ke_state != KES_ONRUNQ, 1223 ("sched_add: kse %p (%s) already in run queue", ke, 1224 ke->ke_proc->p_comm)); 1225 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1226 ("sched_add: process swapped out")); 1227 KASSERT(ke->ke_runq == NULL, 1228 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1229 1230 kg = ke->ke_ksegrp; 1231 1232 switch (PRI_BASE(kg->kg_pri_class)) { 1233 case PRI_ITHD: 1234 case PRI_REALTIME: 1235 kseq = KSEQ_SELF(); 1236 ke->ke_runq = kseq->ksq_curr; 1237 ke->ke_slice = SCHED_SLICE_MAX; 1238 ke->ke_cpu = PCPU_GET(cpuid); 1239 break; 1240 case PRI_TIMESHARE: 1241 kseq = KSEQ_CPU(ke->ke_cpu); 1242 if (SCHED_CURR(kg, ke)) 1243 ke->ke_runq = kseq->ksq_curr; 1244 else 1245 ke->ke_runq = kseq->ksq_next; 1246 break; 1247 case PRI_IDLE: 1248 kseq = KSEQ_CPU(ke->ke_cpu); 1249 /* 1250 * This is for priority prop. 1251 */ 1252 if (ke->ke_thread->td_priority > PRI_MIN_IDLE) 1253 ke->ke_runq = kseq->ksq_curr; 1254 else 1255 ke->ke_runq = &kseq->ksq_idle; 1256 ke->ke_slice = SCHED_SLICE_MIN; 1257 break; 1258 default: 1259 panic("Unknown pri class.\n"); 1260 break; 1261 } 1262 1263 ke->ke_ksegrp->kg_runq_kses++; 1264 ke->ke_state = KES_ONRUNQ; 1265 1266 runq_add(ke->ke_runq, ke); 1267 kseq_add(kseq, ke); 1268} 1269 1270void 1271sched_rem(struct kse *ke) 1272{ 1273 struct kseq *kseq; 1274 1275 mtx_assert(&sched_lock, MA_OWNED); 1276 KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); 1277 1278 ke->ke_state = KES_THREAD; 1279 ke->ke_ksegrp->kg_runq_kses--; 1280 kseq = KSEQ_CPU(ke->ke_cpu); 1281 runq_remove(ke->ke_runq, ke); 1282 kseq_rem(kseq, ke); 1283} 1284 1285fixpt_t 1286sched_pctcpu(struct kse *ke) 1287{ 1288 fixpt_t pctcpu; 1289 1290 pctcpu = 0; 1291 1292 mtx_lock_spin(&sched_lock); 1293 if (ke->ke_ticks) { 1294 int rtick; 1295 1296 /* 1297 * Don't update more frequently than twice a second. Allowing 1298 * this causes the cpu usage to decay away too quickly due to 1299 * rounding errors. 1300 */ 1301 if (ke->ke_ltick < (ticks - (hz / 2))) 1302 sched_pctcpu_update(ke); 1303 1304 /* How many rtick per second ? */ 1305 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1306 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1307 } 1308 1309 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1310 mtx_unlock_spin(&sched_lock); 1311 1312 return (pctcpu); 1313} 1314 1315int 1316sched_sizeof_kse(void) 1317{ 1318 return (sizeof(struct kse) + sizeof(struct ke_sched)); 1319} 1320 1321int 1322sched_sizeof_ksegrp(void) 1323{ 1324 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1325} 1326 1327int 1328sched_sizeof_proc(void) 1329{ 1330 return (sizeof(struct proc)); 1331} 1332 1333int 1334sched_sizeof_thread(void) 1335{ 1336 return (sizeof(struct thread) + sizeof(struct td_sched)); 1337} 1338