/* sched_ule.c — FreeBSD revision 109864 (initial ULE scheduler import). */
/*-
 * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/kern/sched_ule.c 109864 2003-01-26 05:23:15Z jeff $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
/* NOTE(review): exported via SYSCTL_INT although fixpt_t is unsigned — confirm. */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* One-time scheduler initialization, run at SI_SUB_RUN_QUEUE. */
static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

/*
 * These datastructures are allocated within their parent datastructure but
 * are scheduler specific.
 */
/* Per-kse scheduler state, reached through ke->ke_sched. */
struct ke_sched {
	int	ske_slice;		/* Ticks left in the current slice. */
	struct	runq *ske_runq;		/* Run queue this kse is assigned to. */
	/* The following variables are only used for pctcpu calculation */
	int	ske_ltick;		/* Last tick that we were running on */
	int	ske_ftick;		/* First tick that we were running on */
	int	ske_ticks;		/* Tick count */
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks

/* Per-ksegrp scheduler state; interactivity is modeled at the kseg level. */
struct kg_sched {
	int	skg_slptime;		/* Accrued sleep time, in ticks. */
};
#define	kg_slptime	kg_sched->skg_slptime

/* Per-thread scheduler state. */
struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

/* Statically allocated scheduler state for proc0's kse/ksegrp/thread. */
struct ke_sched ke_sched;
struct kg_sched kg_sched;
struct td_sched td_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * This priority range has 20 priorities on either end that are reachable
 * only through nice values.
 */
#define	SCHED_PRI_NRESV	40
#define	SCHED_PRI_RANGE	((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) - \
    SCHED_PRI_NRESV)

/*
 * These determine how sleep time affects the priority of a process.
 *
 * SLP_MAX:	Maximum amount of accrued sleep time.
 * SLP_SCALE:	Scale the number of ticks slept across the dynamic priority
 *		range.
 * SLP_TOPRI:	Convert a number of ticks slept into a priority value.
 * SLP_DECAY:	Reduce the sleep time to 50% for every granted slice.
 */
#define	SCHED_SLP_MAX	(hz * 2)
#define	SCHED_SLP_SCALE(slp)	(((slp) * SCHED_PRI_RANGE) / SCHED_SLP_MAX)
#define	SCHED_SLP_TOPRI(slp)	(SCHED_PRI_RANGE - SCHED_SLP_SCALE((slp)) + \
    SCHED_PRI_NRESV / 2)
#define	SCHED_SLP_DECAY(slp)	((slp) / 2)	/* XXX Multiple kses break */

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number slices granted per unit of pri or slp.
 * PRI_TOSLICE:	Compute a slice size that is proportional to the priority.
 * SLP_TOSLICE:	Compute a slice size that is inversely proportional to the
 *		amount of time slept. (smaller slices for interactive ksegs)
 * PRI_COMP:	This determines what fraction of the actual slice comes from
 *		the slice size computed from the priority.
 * SLP_COMP:	This determines what component of the actual slice comes from
 *		the slice size computed from the sleep time.
 */
#define	SCHED_SLICE_MIN		(hz / 100)
#define	SCHED_SLICE_MAX		(hz / 10)
#define	SCHED_SLICE_RANGE	(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_PRI_TOSLICE(pri) \
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((pri), SCHED_PRI_RANGE))
#define	SCHED_SLP_TOSLICE(slp) \
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((slp), SCHED_SLP_MAX))
#define	SCHED_SLP_COMP(slice)	(((slice) / 5) * 3)	/* 60% */
#define	SCHED_PRI_COMP(slice)	(((slice) / 5) * 2)	/* 40% */

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_CURR(kg)	((kg)->kg_slptime > (hz / 4) || \
    (kg)->kg_pri_class != PRI_TIMESHARE)

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */
160 */ 161 162#define SCHED_CPU_TIME 60 163#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 164 165/* 166 * kseq - pair of runqs per processor 167 */ 168 169struct kseq { 170 struct runq ksq_runqs[2]; 171 struct runq *ksq_curr; 172 struct runq *ksq_next; 173 int ksq_load; /* Total runnable */ 174}; 175 176/* 177 * One kse queue per processor. 178 */ 179struct kseq kseq_cpu[MAXCPU]; 180 181static int sched_slice(struct ksegrp *kg); 182static int sched_priority(struct ksegrp *kg); 183void sched_pctcpu_update(struct kse *ke); 184int sched_pickcpu(void); 185 186static void 187sched_setup(void *dummy) 188{ 189 int i; 190 191 mtx_lock_spin(&sched_lock); 192 /* init kseqs */ 193 for (i = 0; i < MAXCPU; i++) { 194 kseq_cpu[i].ksq_load = 0; 195 kseq_cpu[i].ksq_curr = &kseq_cpu[i].ksq_runqs[0]; 196 kseq_cpu[i].ksq_next = &kseq_cpu[i].ksq_runqs[1]; 197 runq_init(kseq_cpu[i].ksq_curr); 198 runq_init(kseq_cpu[i].ksq_next); 199 } 200 /* CPU0 has proc0 */ 201 kseq_cpu[0].ksq_load++; 202 mtx_unlock_spin(&sched_lock); 203} 204 205/* 206 * Scale the scheduling priority according to the "interactivity" of this 207 * process. 208 */ 209static int 210sched_priority(struct ksegrp *kg) 211{ 212 int pri; 213 214 if (kg->kg_pri_class != PRI_TIMESHARE) 215 return (kg->kg_user_pri); 216 217 pri = SCHED_SLP_TOPRI(kg->kg_slptime); 218 CTR2(KTR_RUNQ, "sched_priority: slptime: %d\tpri: %d", 219 kg->kg_slptime, pri); 220 221 pri += PRI_MIN_TIMESHARE; 222 pri += kg->kg_nice; 223 224 if (pri > PRI_MAX_TIMESHARE) 225 pri = PRI_MAX_TIMESHARE; 226 else if (pri < PRI_MIN_TIMESHARE) 227 pri = PRI_MIN_TIMESHARE; 228 229 kg->kg_user_pri = pri; 230 231 return (kg->kg_user_pri); 232} 233 234/* 235 * Calculate a time slice based on the process priority. 
236 */ 237static int 238sched_slice(struct ksegrp *kg) 239{ 240 int pslice; 241 int sslice; 242 int slice; 243 int pri; 244 245 pri = kg->kg_user_pri; 246 pri -= PRI_MIN_TIMESHARE; 247 pslice = SCHED_PRI_TOSLICE(pri); 248 sslice = SCHED_SLP_TOSLICE(kg->kg_slptime); 249 slice = SCHED_SLP_COMP(sslice) + SCHED_PRI_COMP(pslice); 250 kg->kg_slptime = SCHED_SLP_DECAY(kg->kg_slptime); 251 252 CTR4(KTR_RUNQ, 253 "sched_slice: pri: %d\tsslice: %d\tpslice: %d\tslice: %d", 254 pri, sslice, pslice, slice); 255 256 if (slice < SCHED_SLICE_MIN) 257 slice = SCHED_SLICE_MIN; 258 else if (slice > SCHED_SLICE_MAX) 259 slice = SCHED_SLICE_MAX; 260 261 return (slice); 262} 263 264int 265sched_rr_interval(void) 266{ 267 return (SCHED_SLICE_MAX); 268} 269 270void 271sched_pctcpu_update(struct kse *ke) 272{ 273 /* 274 * Adjust counters and watermark for pctcpu calc. 275 */ 276 ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) * 277 SCHED_CPU_TICKS; 278 ke->ke_ltick = ticks; 279 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 280} 281 282#ifdef SMP 283int 284sched_pickcpu(void) 285{ 286 int cpu; 287 int load; 288 int i; 289 290 if (!smp_started) 291 return (0); 292 293 cpu = PCPU_GET(cpuid); 294 load = kseq_cpu[cpu].ksq_load; 295 296 for (i = 0; i < mp_maxid; i++) { 297 if (CPU_ABSENT(i)) 298 continue; 299 if (kseq_cpu[i].ksq_load < load) { 300 cpu = i; 301 load = kseq_cpu[i].ksq_load; 302 } 303 } 304 305 CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); 306 return (cpu); 307} 308#else 309int 310sched_pickcpu(void) 311{ 312 return (0); 313} 314#endif 315 316void 317sched_prio(struct thread *td, u_char prio) 318{ 319 struct kse *ke; 320 struct runq *rq; 321 322 mtx_assert(&sched_lock, MA_OWNED); 323 ke = td->td_kse; 324 td->td_priority = prio; 325 326 if (TD_ON_RUNQ(td)) { 327 rq = ke->ke_runq; 328 329 runq_remove(rq, ke); 330 runq_add(rq, ke); 331 } 332} 333 334void 335sched_switchout(struct thread *td) 336{ 337 struct kse *ke; 338 339 mtx_assert(&sched_lock, MA_OWNED); 340 341 ke = 
td->td_kse; 342 343 td->td_last_kse = ke; 344 td->td_lastcpu = ke->ke_oncpu; 345 ke->ke_flags &= ~KEF_NEEDRESCHED; 346 347 if (TD_IS_RUNNING(td)) { 348 setrunqueue(td); 349 return; 350 } else 351 td->td_kse->ke_runq = NULL; 352 353 /* 354 * We will not be on the run queue. So we must be 355 * sleeping or similar. 356 */ 357 if (td->td_proc->p_flag & P_KSES) 358 kse_reassign(ke); 359} 360 361void 362sched_switchin(struct thread *td) 363{ 364 /* struct kse *ke = td->td_kse; */ 365 mtx_assert(&sched_lock, MA_OWNED); 366 367 td->td_kse->ke_oncpu = PCPU_GET(cpuid); /* XXX */ 368 if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && 369 td->td_priority != td->td_ksegrp->kg_user_pri) 370 curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; 371 372} 373 374void 375sched_nice(struct ksegrp *kg, int nice) 376{ 377 struct thread *td; 378 379 kg->kg_nice = nice; 380 sched_priority(kg); 381 FOREACH_THREAD_IN_GROUP(kg, td) { 382 td->td_kse->ke_flags |= KEF_NEEDRESCHED; 383 } 384} 385 386void 387sched_sleep(struct thread *td, u_char prio) 388{ 389 mtx_assert(&sched_lock, MA_OWNED); 390 391 td->td_slptime = ticks; 392 td->td_priority = prio; 393 394 /* 395 * If this is an interactive task clear its queue so it moves back 396 * on to curr when it wakes up. Otherwise let it stay on the queue 397 * that it was assigned to. 398 */ 399 if (SCHED_CURR(td->td_kse->ke_ksegrp)) 400 td->td_kse->ke_runq = NULL; 401} 402 403void 404sched_wakeup(struct thread *td) 405{ 406 struct ksegrp *kg; 407 408 mtx_assert(&sched_lock, MA_OWNED); 409 410 /* 411 * Let the kseg know how long we slept for. This is because process 412 * interactivity behavior is modeled in the kseg. 
413 */ 414 kg = td->td_ksegrp; 415 416 if (td->td_slptime) { 417 kg->kg_slptime += ticks - td->td_slptime; 418 if (kg->kg_slptime > SCHED_SLP_MAX) 419 kg->kg_slptime = SCHED_SLP_MAX; 420 td->td_priority = sched_priority(kg); 421 } 422 td->td_slptime = 0; 423 setrunqueue(td); 424 if (td->td_priority < curthread->td_priority) 425 curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; 426} 427 428/* 429 * Penalize the parent for creating a new child and initialize the child's 430 * priority. 431 */ 432void 433sched_fork(struct ksegrp *kg, struct ksegrp *child) 434{ 435 struct kse *ckse; 436 struct kse *pkse; 437 438 mtx_assert(&sched_lock, MA_OWNED); 439 ckse = FIRST_KSE_IN_KSEGRP(child); 440 pkse = FIRST_KSE_IN_KSEGRP(kg); 441 442 /* XXX Need something better here */ 443 child->kg_slptime = kg->kg_slptime; 444 child->kg_user_pri = kg->kg_user_pri; 445 446 ckse->ke_slice = pkse->ke_slice; 447 ckse->ke_oncpu = sched_pickcpu(); 448 ckse->ke_runq = NULL; 449 /* 450 * Claim that we've been running for one second for statistical 451 * purposes. 452 */ 453 ckse->ke_ticks = 0; 454 ckse->ke_ltick = ticks; 455 ckse->ke_ftick = ticks - hz; 456} 457 458/* 459 * Return some of the child's priority and interactivity to the parent. 460 */ 461void 462sched_exit(struct ksegrp *kg, struct ksegrp *child) 463{ 464 struct kseq *kseq; 465 struct kse *ke; 466 467 /* XXX Need something better here */ 468 mtx_assert(&sched_lock, MA_OWNED); 469 kg->kg_slptime = child->kg_slptime; 470 sched_priority(kg); 471 472 /* 473 * We drop the load here so that the running process leaves us with a 474 * load of at least one. 
475 */ 476 ke = FIRST_KSE_IN_KSEGRP(kg); 477 kseq = &kseq_cpu[ke->ke_oncpu]; 478 kseq->ksq_load--; 479} 480 481int sched_clock_switches; 482 483void 484sched_clock(struct thread *td) 485{ 486 struct kse *ke; 487 struct kse *nke; 488 struct ksegrp *kg; 489 struct kseq *kseq; 490 int cpu; 491 492 cpu = PCPU_GET(cpuid); 493 kseq = &kseq_cpu[cpu]; 494 495 mtx_assert(&sched_lock, MA_OWNED); 496 KASSERT((td != NULL), ("schedclock: null thread pointer")); 497 ke = td->td_kse; 498 kg = td->td_ksegrp; 499 500 nke = runq_choose(kseq->ksq_curr); 501 502 if (td->td_kse->ke_flags & KEF_IDLEKSE) { 503#if 0 504 if (nke && nke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 505 printf("Idle running with %s on the runq!\n", 506 nke->ke_proc->p_comm); 507 Debugger("stop"); 508 } 509#endif 510 return; 511 } 512 if (nke && nke->ke_thread && 513 nke->ke_thread->td_priority < td->td_priority) { 514 sched_clock_switches++; 515 ke->ke_flags |= KEF_NEEDRESCHED; 516 } 517 518 /* 519 * We used a tick, decrease our total sleep time. This decreases our 520 * "interactivity". 521 */ 522 if (kg->kg_slptime) 523 kg->kg_slptime--; 524 /* 525 * We used up one time slice. 
526 */ 527 ke->ke_slice--; 528 /* 529 * We're out of time, recompute priorities and requeue 530 */ 531 if (ke->ke_slice == 0) { 532 struct kseq *kseq; 533 534 kseq = &kseq_cpu[ke->ke_oncpu]; 535 536 td->td_priority = sched_priority(kg); 537 ke->ke_slice = sched_slice(kg); 538 ke->ke_flags |= KEF_NEEDRESCHED; 539 ke->ke_runq = NULL; 540 } 541 ke->ke_ticks += 10000; 542 ke->ke_ltick = ticks; 543 /* Go up to one second beyond our max and then trim back down */ 544 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 545 sched_pctcpu_update(ke); 546} 547 548int 549sched_runnable(void) 550{ 551 struct kseq *kseq; 552 int cpu; 553 554 cpu = PCPU_GET(cpuid); 555 kseq = &kseq_cpu[cpu]; 556 557 if (runq_check(kseq->ksq_curr) == 0) 558 return (runq_check(kseq->ksq_next)); 559 return (1); 560} 561 562void 563sched_userret(struct thread *td) 564{ 565 struct ksegrp *kg; 566 567 kg = td->td_ksegrp; 568 569 if (td->td_priority != kg->kg_user_pri) { 570 mtx_lock_spin(&sched_lock); 571 td->td_priority = kg->kg_user_pri; 572 mtx_unlock_spin(&sched_lock); 573 } 574} 575 576struct kse * 577sched_choose(void) 578{ 579 struct kseq *kseq; 580 struct kse *ke; 581 struct runq *swap; 582 int cpu; 583 584 cpu = PCPU_GET(cpuid); 585 kseq = &kseq_cpu[cpu]; 586 587 if ((ke = runq_choose(kseq->ksq_curr)) == NULL) { 588 swap = kseq->ksq_curr; 589 kseq->ksq_curr = kseq->ksq_next; 590 kseq->ksq_next = swap; 591 ke = runq_choose(kseq->ksq_curr); 592 } 593 if (ke) { 594 runq_remove(ke->ke_runq, ke); 595 ke->ke_state = KES_THREAD; 596 } 597 598 return (ke); 599} 600 601void 602sched_add(struct kse *ke) 603{ 604 struct kseq *kseq; 605 int cpu; 606 607 mtx_assert(&sched_lock, MA_OWNED); 608 KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); 609 KASSERT((ke->ke_thread->td_kse != NULL), 610 ("runq_add: No KSE on thread")); 611 KASSERT(ke->ke_state != KES_ONRUNQ, 612 ("runq_add: kse %p (%s) already in run queue", ke, 613 ke->ke_proc->p_comm)); 614 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 
615 ("runq_add: process swapped out")); 616 617 /* cpu = PCPU_GET(cpuid); */ 618 cpu = ke->ke_oncpu; 619 kseq = &kseq_cpu[cpu]; 620 kseq->ksq_load++; 621 622 if (ke->ke_runq == NULL) { 623 if (SCHED_CURR(ke->ke_ksegrp)) 624 ke->ke_runq = kseq->ksq_curr; 625 else 626 ke->ke_runq = kseq->ksq_next; 627 } 628 ke->ke_ksegrp->kg_runq_kses++; 629 ke->ke_state = KES_ONRUNQ; 630 631 runq_add(ke->ke_runq, ke); 632} 633 634void 635sched_rem(struct kse *ke) 636{ 637 struct kseq *kseq; 638 639 mtx_assert(&sched_lock, MA_OWNED); 640 /* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */ 641 642 kseq = &kseq_cpu[ke->ke_oncpu]; 643 kseq->ksq_load--; 644 645 runq_remove(ke->ke_runq, ke); 646 ke->ke_runq = NULL; 647 ke->ke_state = KES_THREAD; 648 ke->ke_ksegrp->kg_runq_kses--; 649} 650 651fixpt_t 652sched_pctcpu(struct kse *ke) 653{ 654 fixpt_t pctcpu; 655 656 pctcpu = 0; 657 658 if (ke->ke_ticks) { 659 int rtick; 660 661 /* Update to account for time potentially spent sleeping */ 662 ke->ke_ltick = ticks; 663 sched_pctcpu_update(ke); 664 665 /* How many rtick per second ? */ 666 rtick = ke->ke_ticks / (SCHED_CPU_TIME * 10000); 667 pctcpu = (FSCALE * ((FSCALE * rtick)/stathz)) >> FSHIFT; 668 } 669 670 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 671 672 return (pctcpu); 673} 674 675int 676sched_sizeof_kse(void) 677{ 678 return (sizeof(struct kse) + sizeof(struct ke_sched)); 679} 680 681int 682sched_sizeof_ksegrp(void) 683{ 684 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 685} 686 687int 688sched_sizeof_proc(void) 689{ 690 return (sizeof(struct proc)); 691} 692 693int 694sched_sizeof_thread(void) 695{ 696 return (sizeof(struct thread) + sizeof(struct td_sched)); 697} 698