/* sched_ule.c — FreeBSD ULE scheduler, SVN revision 165830 */
/*-
 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 165830 2007-01-06 12:33:43Z jeff $");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Thread scheduler specific section.
 *
 * Per-thread scheduler state, hung off struct thread via td_sched.
 * The parenthesized letters in the comments describe the lock that
 * protects each field (sched_lock convention from the rest of the tree).
 */
struct td_sched {
	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
	int		ts_flags;	/* (j) TSF_* flags. */
	struct thread	*ts_thread;	/* (*) Active associated thread. */
	u_char		ts_rqindex;	/* (j) Run queue index. */
	enum {
		TSS_THREAD,		/* Not on a run queue. */
		TSS_ONRUNQ		/* Queued on some cpu's runq. */
	} ts_state;			/* (j) thread sched specific status. */
	int		ts_slptime;
	int		ts_slice;
	struct runq	*ts_runq;
	u_char		ts_cpu;		/* CPU that we have affinity for. */
	/* The following variables are only used for pctcpu calculation */
	int		ts_ltick;	/* Last tick that we were running on */
	int		ts_ftick;	/* First tick that we were running on */
	int		ts_ticks;	/* Tick count */

	/* originally from kg_sched */
	int		skg_slptime;	/* Number of ticks we vol. slept */
	int		skg_runtime;	/* Number of ticks we were running */
};
/*
 * ts_assign overlays the runq linkage: a td_sched is never on a runq and
 * on the per-cpu "assigned" singly-linked list at the same time.
 */
#define	ts_assign		ts_procq.tqe_next
/* flags kept in ts_flags */
#define	TSF_ASSIGNED	0x0001		/* Thread is being migrated. */
#define	TSF_BOUND	0x0002		/* Thread can not migrate. */
#define	TSF_XFERABLE	0x0004		/* Thread was added as transferable. */
#define	TSF_REMOVED	0x0008		/* Thread was removed while ASSIGNED */
#define	TSF_DIDRUN	0x2000		/* Thread actually ran. */

/* Scheduler state for thread0, wired up in schedinit(). */
static struct td_sched td_sched0;

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
 */
#define	SCHED_TICK_SECS		10
#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
#define	SCHED_TICK_SHIFT	10
#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))

/*
 * These macros determine priorities for non-interactive threads.  They are
 * assigned a priority based on their recent cpu utilization as expressed
 * by the ratio of ticks to the tick total.  NHALF priorities at the start
 * and end of the MIN to MAX timeshare range are only reachable with negative
 * or positive nice respectively.
 *
 * PRI_RANGE:	Priority range for utilization dependent priorities.
 * PRI_NRESV:	Number of nice values.
 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
 * PRI_NICE:	Determines the part of the priority inherited from nice.
 */
#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
#define	SCHED_PRI_TICKS(ts)						\
    (SCHED_TICK_HZ((ts)) /						\
    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define	SCHED_PRI_NICE(nice)	(nice)

/*
 * These determine the interactivity of a process.  Interactivity differs from
 * cpu utilization in that it expresses the voluntary time slept vs time ran
 * while cpu utilization includes all time not running.  This more accurately
 * models the intent of the thread.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * tickincr:		Converts a stathz tick into a hz domain scaled by
 *			the shift factor.  Without the shift the error rate
 *			due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0 and run off of hz.
 * sched_slice:		Runtime of each thread before rescheduling.
 */
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;
static int sched_rebalance = 1;

/*
 * tdq - per processor runqs and statistics.
 */
struct tdq {
	struct runq	tdq_idle;	/* Queue of IDLE threads. */
	struct runq	tdq_timeshare;	/* timeshare run queue. */
	struct runq	tdq_realtime;	/* real-time run queue. */
	int		tdq_idx;	/* Current insert index. */
	int		tdq_ridx;	/* Current removal index. */
	int		tdq_load;	/* Aggregate load. */
#ifdef SMP
	int		tdq_transferable;
	LIST_ENTRY(tdq)	tdq_siblings;	/* Next in tdq group. */
	struct tdq_group *tdq_group;	/* Our processor group. */
	volatile struct td_sched *tdq_assigned;	/* assigned by another CPU. */
#else
	int		tdq_sysload;	/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * tdq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a numa environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct tdq_group {
	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
	int	tdg_load;		/* Total load of this group. */
	int	tdg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
};
#endif

/*
 * One thread queue per processor.
 */
#ifdef SMP
static cpumask_t tdq_idle;		/* Mask of cpus whose whole group idled. */
static int tdg_maxid;			/* Highest valid tdq_group index. */
static struct tdq	tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
static int bal_tick;			/* Next tick to run sched_balance(). */
static int gbal_tick;			/* Next tick to run sched_balance_groups(). */
static int balance_groups;		/* Nonzero if any group has > 1 cpu. */

#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
#define	TDQ_ID(x)	((x) - tdq_cpu)
#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
#else	/* !SMP */
static struct tdq	tdq_cpu;

#define	TDQ_SELF()	(&tdq_cpu)
#define	TDQ_CPU(x)	(&tdq_cpu)
#endif

static struct td_sched *sched_choose(void);	/* XXX Should be thread * */
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);
static inline void sched_pin_td(struct thread *td);
static inline void sched_unpin_td(struct thread *td);

/* Operations on per processor queues */
static struct td_sched * tdq_choose(struct tdq *);
static void tdq_setup(struct tdq *);
static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
#ifdef SMP
static int tdq_transfer(struct tdq *, struct td_sched *, int);
static struct td_sched *runq_steal(struct runq *);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
static void sched_smp_tick(void);
static void tdq_move(struct tdq *, int);
static int tdq_idled(struct tdq *);
260static void tdq_notify(struct td_sched *, int); 261static void tdq_assign(struct tdq *); 262static struct td_sched *tdq_steal(struct tdq *, int); 263 264#define THREAD_CAN_MIGRATE(td) \ 265 ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD) 266#endif 267 268static void sched_setup(void *dummy); 269SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 270 271static void sched_initticks(void *dummy); 272SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) 273 274static inline void 275sched_pin_td(struct thread *td) 276{ 277 td->td_pinned++; 278} 279 280static inline void 281sched_unpin_td(struct thread *td) 282{ 283 td->td_pinned--; 284} 285 286static void 287runq_print(struct runq *rq) 288{ 289 struct rqhead *rqh; 290 struct td_sched *ts; 291 int pri; 292 int j; 293 int i; 294 295 for (i = 0; i < RQB_LEN; i++) { 296 printf("\t\trunq bits %d 0x%zx\n", 297 i, rq->rq_status.rqb_bits[i]); 298 for (j = 0; j < RQB_BPW; j++) 299 if (rq->rq_status.rqb_bits[i] & (1ul << j)) { 300 pri = j + (i << RQB_L2BPW); 301 rqh = &rq->rq_queues[pri]; 302 TAILQ_FOREACH(ts, rqh, ts_procq) { 303 printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", 304 ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri); 305 } 306 } 307 } 308} 309 310void 311tdq_print(int cpu) 312{ 313 struct tdq *tdq; 314 315 tdq = TDQ_CPU(cpu); 316 317 printf("tdq:\n"); 318 printf("\tload: %d\n", tdq->tdq_load); 319 printf("\ttimeshare idx: %d\n", tdq->tdq_idx); 320 printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); 321 printf("\trealtime runq:\n"); 322 runq_print(&tdq->tdq_realtime); 323 printf("\ttimeshare runq:\n"); 324 runq_print(&tdq->tdq_timeshare); 325 printf("\tidle runq:\n"); 326 runq_print(&tdq->tdq_idle); 327#ifdef SMP 328 printf("\tload transferable: %d\n", tdq->tdq_transferable); 329#endif 330} 331 332static __inline void 333tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) 334{ 335#ifdef SMP 336 
if (THREAD_CAN_MIGRATE(ts->ts_thread)) { 337 tdq->tdq_transferable++; 338 tdq->tdq_group->tdg_transferable++; 339 ts->ts_flags |= TSF_XFERABLE; 340 } 341#endif 342 if (ts->ts_runq == &tdq->tdq_timeshare) { 343 int pri; 344 345 pri = ts->ts_thread->td_priority; 346 KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, 347 ("Invalid priority %d on timeshare runq", pri)); 348 /* 349 * This queue contains only priorities between MIN and MAX 350 * realtime. Use the whole queue to represent these values. 351 */ 352#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) 353 if ((flags & SRQ_BORROWING) == 0) { 354 pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; 355 pri = (pri + tdq->tdq_idx) % RQ_NQS; 356 /* 357 * This effectively shortens the queue by one so we 358 * can have a one slot difference between idx and 359 * ridx while we wait for threads to drain. 360 */ 361 if (tdq->tdq_ridx != tdq->tdq_idx && 362 pri == tdq->tdq_ridx) 363 pri = (pri - 1) % RQ_NQS; 364 } else 365 pri = tdq->tdq_ridx; 366 runq_add_pri(ts->ts_runq, ts, pri, flags); 367 } else 368 runq_add(ts->ts_runq, ts, flags); 369} 370 371static __inline void 372tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) 373{ 374#ifdef SMP 375 if (ts->ts_flags & TSF_XFERABLE) { 376 tdq->tdq_transferable--; 377 tdq->tdq_group->tdg_transferable--; 378 ts->ts_flags &= ~TSF_XFERABLE; 379 } 380#endif 381 if (ts->ts_runq == &tdq->tdq_timeshare) { 382 if (tdq->tdq_idx != tdq->tdq_ridx) 383 runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); 384 else 385 runq_remove_idx(ts->ts_runq, ts, NULL); 386 /* 387 * For timeshare threads we update the priority here so 388 * the priority reflects the time we've been sleeping. 
389 */ 390 ts->ts_ltick = ticks; 391 sched_pctcpu_update(ts); 392 sched_priority(ts->ts_thread); 393 } else 394 runq_remove(ts->ts_runq, ts); 395} 396 397static void 398tdq_load_add(struct tdq *tdq, struct td_sched *ts) 399{ 400 int class; 401 mtx_assert(&sched_lock, MA_OWNED); 402 class = PRI_BASE(ts->ts_thread->td_pri_class); 403 tdq->tdq_load++; 404 CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); 405 if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 406#ifdef SMP 407 tdq->tdq_group->tdg_load++; 408#else 409 tdq->tdq_sysload++; 410#endif 411} 412 413static void 414tdq_load_rem(struct tdq *tdq, struct td_sched *ts) 415{ 416 int class; 417 mtx_assert(&sched_lock, MA_OWNED); 418 class = PRI_BASE(ts->ts_thread->td_pri_class); 419 if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 420#ifdef SMP 421 tdq->tdq_group->tdg_load--; 422#else 423 tdq->tdq_sysload--; 424#endif 425 tdq->tdq_load--; 426 CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); 427 ts->ts_runq = NULL; 428} 429 430#ifdef SMP 431static void 432sched_smp_tick(void) 433{ 434 struct tdq *tdq; 435 436 tdq = TDQ_SELF(); 437 if (sched_rebalance) { 438 if (ticks >= bal_tick) 439 sched_balance(); 440 if (ticks >= gbal_tick && balance_groups) 441 sched_balance_groups(); 442 } 443 /* 444 * We could have been assigned a non real-time thread without an 445 * IPI. 446 */ 447 if (tdq->tdq_assigned) 448 tdq_assign(tdq); /* Potentially sets NEEDRESCHED */ 449} 450 451/* 452 * sched_balance is a simple CPU load balancing algorithm. It operates by 453 * finding the least loaded and most loaded cpu and equalizing their load 454 * by migrating some processes. 455 * 456 * Dealing only with two CPUs at a time has two advantages. Firstly, most 457 * installations will only have 2 cpus. Secondly, load balancing too much at 458 * once can have an unpleasant effect on the system. The scheduler rarely has 459 * enough information to make perfect decisions. 
 * So this algorithm chooses
 * algorithm simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void)
{
	struct tdq_group *high;
	struct tdq_group *low;
	struct tdq_group *tdg;
	int cnt;
	int i;

	/* Rearm with a randomized deadline so balancers don't synchronize. */
	bal_tick = ticks + (random() % (hz * 2));
	if (smp_started == 0)
		return;
	low = high = NULL;
	/* Start the scan at a random group to avoid favoring low ids. */
	i = random() % (tdg_maxid + 1);
	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
		tdg = TDQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || tdg->tdg_load > high->tdg_load)
		    && tdg->tdg_transferable)
			high = tdg;
		if (low == NULL || tdg->tdg_load < low->tdg_load)
			low = tdg;
		if (++i > tdg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->tdg_members),
		    LIST_FIRST(&low->tdg_members));
}

/*
 * Run the intra-group balancer over every group.  Rearms gbal_tick with a
 * randomized deadline, like sched_balance().
 */
static void
sched_balance_groups(void)
{
	int i;

	gbal_tick = ticks + (random() % (hz * 2));
	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= tdg_maxid; i++)
			sched_balance_group(TDQ_GROUP(i));
}

/*
 * Balance load between the members (cpus) of a single group by pairing its
 * most and least loaded tdqs.
 */
static void
sched_balance_group(struct tdq_group *tdg)
{
	struct tdq *tdq;
	struct tdq *high;
	struct tdq *low;
	int load;

	if (tdg->tdg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
		load = tdq->tdq_load;
		if (high == NULL || load > high->tdq_load)
			high = tdq;
		if (low == NULL || load < low->tdq_load)
			low = tdq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

/*
 * Move enough threads from 'high' to 'low' to halve the load difference,
 * bounded by how many threads are actually transferable.
 */
static void
sched_balance_pair(struct tdq *high, struct tdq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transfering within a group we have to use this specific
	 * tdq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->tdq_group == low->tdq_group) {
		transferable = high->tdq_transferable;
		high_load = high->tdq_load;
		low_load = low->tdq_load;
	} else {
		transferable = high->tdq_group->tdg_transferable;
		high_load = high->tdq_group->tdg_load;
		low_load = low->tdq_group->tdg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * threads we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)		/* Round half-thread differences up. */
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		tdq_move(high, TDQ_ID(low));
	return;
}

/*
 * Steal one transferable thread from 'from' (or, failing that, from a
 * sibling in its group) and hand it to cpu 'cpu' via tdq_notify().
 */
static void
tdq_move(struct tdq *from, int cpu)
{
	struct tdq *tdq;
	struct tdq *to;
	struct td_sched *ts;

	tdq = from;
	to = TDQ_CPU(cpu);
	ts = tdq_steal(tdq, 1);
	if (ts == NULL) {
		struct tdq_group *tdg;

		tdg = tdq->tdq_group;
		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
			if (tdq == from || tdq->tdq_transferable == 0)
				continue;
			ts = tdq_steal(tdq, 1);
			break;
		}
		if (ts == NULL)
			panic("tdq_move: No threads available with a "
			    "transferable count of %d\n",
			    tdg->tdg_transferable);
	}
	if (tdq == to)
		return;
	ts->ts_state = TSS_THREAD;
	tdq_runq_rem(tdq, ts);
	tdq_load_rem(tdq, ts);
	tdq_notify(ts, cpu);
}

/*
 * Called when this cpu's tdq has gone idle.  Try to steal work from a
 * sibling in the group; otherwise mark ourselves idle in the group mask
 * (and in the global tdq_idle mask once the whole group is idle).
 * Returns 0 if work was stolen, 1 if the cpu should idle.
 */
static int
tdq_idled(struct tdq *tdq)
{
	struct tdq_group *tdg;
	struct tdq *steal;
	struct td_sched *ts;

	tdg = tdq->tdq_group;
	/*
	 * If we're in a cpu group, try and steal threads from another cpu in
	 * the group before idling.
	 */
	if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
			if (steal == tdq || steal->tdq_transferable == 0)
				continue;
			ts = tdq_steal(steal, 0);
			if (ts == NULL)
				continue;
			ts->ts_state = TSS_THREAD;
			tdq_runq_rem(steal, ts);
			tdq_load_rem(steal, ts);
			ts->ts_cpu = PCPU_GET(cpuid);
			sched_pin_td(ts->ts_thread);
			sched_add(ts->ts_thread, SRQ_YIELDING);
			sched_unpin_td(ts->ts_thread);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a thread bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	tdg->tdg_idlemask |= PCPU_GET(cpumask);
	if (tdg->tdg_idlemask != tdg->tdg_cpumask)
		return (1);
	atomic_set_int(&tdq_idle, tdg->tdg_mask);
	return (1);
}

/*
 * Drain the lock-free tdq_assigned list: atomically detach the whole list,
 * then re-add each thread locally.  Threads flagged TSF_REMOVED were taken
 * off the runq while in transit and are simply dropped.
 */
static void
tdq_assign(struct tdq *tdq)
{
	struct td_sched *nts;
	struct td_sched *ts;

	do {
		*(volatile struct td_sched **)&ts = tdq->tdq_assigned;
	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
	    (uintptr_t)ts, (uintptr_t)NULL));
	for (; ts != NULL; ts = nts) {
		nts = ts->ts_assign;
		tdq->tdq_group->tdg_load--;
		tdq->tdq_load--;
		ts->ts_flags &= ~TSF_ASSIGNED;
		if (ts->ts_flags & TSF_REMOVED) {
			ts->ts_flags &= ~TSF_REMOVED;
			continue;
		}
		sched_pin_td(ts->ts_thread);
		sched_add(ts->ts_thread, SRQ_YIELDING);
		sched_unpin_td(ts->ts_thread);
	}
}

/*
 * Hand a thread to another cpu by pushing it onto that cpu's lock-free
 * tdq_assigned list (CAS loop) and, for high priority threads, sending an
 * IPI so the target reschedules promptly.
 */
static void
tdq_notify(struct td_sched *ts, int cpu)
{
	struct tdq *tdq;
	struct thread *td;
	struct pcpu *pcpu;
	int class;
	int prio;

	tdq = TDQ_CPU(cpu);
	class = PRI_BASE(ts->ts_thread->td_pri_class);
	if ((class != PRI_IDLE && class != PRI_ITHD)
	    && (tdq_idle & tdq->tdq_group->tdg_mask))
		atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
	tdq->tdq_group->tdg_load++;
	tdq->tdq_load++;
	ts->ts_cpu = cpu;
	ts->ts_flags |= TSF_ASSIGNED;
	/* NOTE(review): prio is assigned but never used below. */
	prio = ts->ts_thread->td_priority;

	/*
	 * Place a thread on another cpu's queue and force a resched.
	 */
	do {
		*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
	    (uintptr_t)ts->ts_assign, (uintptr_t)ts));
	/* Only ipi for realtime/ithd priorities */
	if (ts->ts_thread->td_priority > PRI_MIN_KERN)
		return;
	/*
	 * Without sched_lock we could lose a race where we set NEEDRESCHED
	 * on a thread that is switched out before the IPI is delivered.  This
	 * would lead us to miss the resched.  This will be a problem once
	 * sched_lock is pushed down.
	 */
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ts->ts_thread->td_priority < td->td_priority) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

/*
 * Scan a runq for the first migratable thread, lowest priority index first.
 * Returns it without dequeueing, or NULL if none is migratable.
 */
static struct td_sched *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct td_sched *ts;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ts, rqh, ts_procq) {
				if (THREAD_CAN_MIGRATE(ts->ts_thread))
					return (ts);
			}
		}
	}
	return (NULL);
}

/*
 * Pick a migratable thread from a tdq, preferring realtime then timeshare;
 * the idle queue is only consulted when 'stealidle' is set.
 */
static struct td_sched *
tdq_steal(struct tdq *tdq, int stealidle)
{
	struct td_sched *ts;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 * XXX Need to effect steal order for timeshare threads.
754 */ 755 if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) 756 return (ts); 757 if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL) 758 return (ts); 759 if (stealidle) 760 return (runq_steal(&tdq->tdq_idle)); 761 return (NULL); 762} 763 764int 765tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class) 766{ 767 struct tdq_group *ntdg; 768 struct tdq_group *tdg; 769 struct tdq *old; 770 int cpu; 771 int idx; 772 773 if (smp_started == 0) 774 return (0); 775 cpu = 0; 776 /* 777 * If our load exceeds a certain threshold we should attempt to 778 * reassign this thread. The first candidate is the cpu that 779 * originally ran the thread. If it is idle, assign it there, 780 * otherwise, pick an idle cpu. 781 * 782 * The threshold at which we start to reassign has a large impact 783 * on the overall performance of the system. Tuned too high and 784 * some CPUs may idle. Too low and there will be excess migration 785 * and context switches. 786 */ 787 old = TDQ_CPU(ts->ts_cpu); 788 ntdg = old->tdq_group; 789 tdg = tdq->tdq_group; 790 if (tdq_idle) { 791 if (tdq_idle & ntdg->tdg_mask) { 792 cpu = ffs(ntdg->tdg_idlemask); 793 if (cpu) { 794 CTR2(KTR_SCHED, 795 "tdq_transfer: %p found old cpu %X " 796 "in idlemask.", ts, cpu); 797 goto migrate; 798 } 799 } 800 /* 801 * Multiple cpus could find this bit simultaneously 802 * but the race shouldn't be terrible. 803 */ 804 cpu = ffs(tdq_idle); 805 if (cpu) { 806 CTR2(KTR_SCHED, "tdq_transfer: %p found %X " 807 "in idlemask.", ts, cpu); 808 goto migrate; 809 } 810 } 811 idx = 0; 812#if 0 813 if (old->tdq_load < tdq->tdq_load) { 814 cpu = ts->ts_cpu + 1; 815 CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X " 816 "load less than ours.", ts, cpu); 817 goto migrate; 818 } 819 /* 820 * No new CPU was found, look for one with less load. 
821 */ 822 for (idx = 0; idx <= tdg_maxid; idx++) { 823 ntdg = TDQ_GROUP(idx); 824 if (ntdg->tdg_load /*+ (ntdg->tdg_cpus * 2)*/ < tdg->tdg_load) { 825 cpu = ffs(ntdg->tdg_cpumask); 826 CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less " 827 "than ours.", ts, cpu); 828 goto migrate; 829 } 830 } 831#endif 832 /* 833 * If another cpu in this group has idled, assign a thread over 834 * to them after checking to see if there are idled groups. 835 */ 836 if (tdg->tdg_idlemask) { 837 cpu = ffs(tdg->tdg_idlemask); 838 if (cpu) { 839 CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in " 840 "group.", ts, cpu); 841 goto migrate; 842 } 843 } 844 return (0); 845migrate: 846 /* 847 * Now that we've found an idle CPU, migrate the thread. 848 */ 849 cpu--; 850 ts->ts_runq = NULL; 851 tdq_notify(ts, cpu); 852 853 return (1); 854} 855 856#endif /* SMP */ 857 858/* 859 * Pick the highest priority task we have and return it. 860 */ 861 862static struct td_sched * 863tdq_choose(struct tdq *tdq) 864{ 865 struct td_sched *ts; 866 867 mtx_assert(&sched_lock, MA_OWNED); 868 869 ts = runq_choose(&tdq->tdq_realtime); 870 if (ts != NULL) { 871 KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME, 872 ("tdq_choose: Invalid priority on realtime queue %d", 873 ts->ts_thread->td_priority)); 874 return (ts); 875 } 876 ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); 877 if (ts != NULL) { 878 KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE && 879 ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE, 880 ("tdq_choose: Invalid priority on timeshare queue %d", 881 ts->ts_thread->td_priority)); 882 return (ts); 883 } 884 885 ts = runq_choose(&tdq->tdq_idle); 886 if (ts != NULL) { 887 KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE, 888 ("tdq_choose: Invalid priority on idle queue %d", 889 ts->ts_thread->td_priority)); 890 return (ts); 891 } 892 893 return (NULL); 894} 895 896static void 897tdq_setup(struct tdq *tdq) 898{ 899 runq_init(&tdq->tdq_realtime); 900 
runq_init(&tdq->tdq_timeshare); 901 runq_init(&tdq->tdq_idle); 902 tdq->tdq_load = 0; 903} 904 905static void 906sched_setup(void *dummy) 907{ 908#ifdef SMP 909 int i; 910#endif 911 912 /* 913 * To avoid divide-by-zero, we set realstathz a dummy value 914 * in case which sched_clock() called before sched_initticks(). 915 */ 916 realstathz = hz; 917 sched_slice = (realstathz/7); /* 140ms */ 918 tickincr = 1 << SCHED_TICK_SHIFT; 919 920#ifdef SMP 921 balance_groups = 0; 922 /* 923 * Initialize the tdqs. 924 */ 925 for (i = 0; i < MAXCPU; i++) { 926 struct tdq *tdq; 927 928 tdq = &tdq_cpu[i]; 929 tdq->tdq_assigned = NULL; 930 tdq_setup(&tdq_cpu[i]); 931 } 932 if (smp_topology == NULL) { 933 struct tdq_group *tdg; 934 struct tdq *tdq; 935 int cpus; 936 937 for (cpus = 0, i = 0; i < MAXCPU; i++) { 938 if (CPU_ABSENT(i)) 939 continue; 940 tdq = &tdq_cpu[i]; 941 tdg = &tdq_groups[cpus]; 942 /* 943 * Setup a tdq group with one member. 944 */ 945 tdq->tdq_transferable = 0; 946 tdq->tdq_group = tdg; 947 tdg->tdg_cpus = 1; 948 tdg->tdg_idlemask = 0; 949 tdg->tdg_cpumask = tdg->tdg_mask = 1 << i; 950 tdg->tdg_load = 0; 951 tdg->tdg_transferable = 0; 952 LIST_INIT(&tdg->tdg_members); 953 LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings); 954 cpus++; 955 } 956 tdg_maxid = cpus - 1; 957 } else { 958 struct tdq_group *tdg; 959 struct cpu_group *cg; 960 int j; 961 962 for (i = 0; i < smp_topology->ct_count; i++) { 963 cg = &smp_topology->ct_group[i]; 964 tdg = &tdq_groups[i]; 965 /* 966 * Initialize the group. 967 */ 968 tdg->tdg_idlemask = 0; 969 tdg->tdg_load = 0; 970 tdg->tdg_transferable = 0; 971 tdg->tdg_cpus = cg->cg_count; 972 tdg->tdg_cpumask = cg->cg_mask; 973 LIST_INIT(&tdg->tdg_members); 974 /* 975 * Find all of the group members and add them. 
976 */ 977 for (j = 0; j < MAXCPU; j++) { 978 if ((cg->cg_mask & (1 << j)) != 0) { 979 if (tdg->tdg_mask == 0) 980 tdg->tdg_mask = 1 << j; 981 tdq_cpu[j].tdq_transferable = 0; 982 tdq_cpu[j].tdq_group = tdg; 983 LIST_INSERT_HEAD(&tdg->tdg_members, 984 &tdq_cpu[j], tdq_siblings); 985 } 986 } 987 if (tdg->tdg_cpus > 1) 988 balance_groups = 1; 989 } 990 tdg_maxid = smp_topology->ct_count - 1; 991 } 992 /* 993 * Stagger the group and global load balancer so they do not 994 * interfere with each other. 995 */ 996 bal_tick = ticks + hz; 997 if (balance_groups) 998 gbal_tick = ticks + (hz / 2); 999#else 1000 tdq_setup(TDQ_SELF()); 1001#endif 1002 mtx_lock_spin(&sched_lock); 1003 tdq_load_add(TDQ_SELF(), &td_sched0); 1004 mtx_unlock_spin(&sched_lock); 1005} 1006 1007/* ARGSUSED */ 1008static void 1009sched_initticks(void *dummy) 1010{ 1011 mtx_lock_spin(&sched_lock); 1012 realstathz = stathz ? stathz : hz; 1013 sched_slice = (realstathz/7); /* ~140ms */ 1014 1015 /* 1016 * tickincr is shifted out by 10 to avoid rounding errors due to 1017 * hz not being evenly divisible by stathz on all platforms. 1018 */ 1019 tickincr = (hz << SCHED_TICK_SHIFT) / realstathz; 1020 /* 1021 * This does not work for values of stathz that are more than 1022 * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. 1023 */ 1024 if (tickincr == 0) 1025 tickincr = 1; 1026 mtx_unlock_spin(&sched_lock); 1027} 1028 1029 1030/* 1031 * Scale the scheduling priority according to the "interactivity" of this 1032 * process. 1033 */ 1034static void 1035sched_priority(struct thread *td) 1036{ 1037 int score; 1038 int pri; 1039 1040 if (td->td_pri_class != PRI_TIMESHARE) 1041 return; 1042 /* 1043 * If the score is interactive we place the thread in the realtime 1044 * queue with a priority that is less than kernel and interrupt 1045 * priorities. These threads are not subject to nice restrictions. 
	 *
	 * Scores greater than this are placed on the normal realtime queue
	 * where the priority is partially decided by the most recent cpu
	 * utilization and the rest is decided by nice value.
	 */
	score = sched_interact_score(td);
	if (score < sched_interact) {
		pri = PRI_MIN_REALTIME;
		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
		    * score;
		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
		    ("sched_priority: invalid interactive priority %d", pri));
	} else {
		pri = SCHED_PRI_MIN;
		if (td->td_sched->ts_ticks)
			pri += SCHED_PRI_TICKS(td->td_sched);
		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
		if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) {
			/* Report an out-of-range priority once, then clamp. */
			static int once = 1;
			if (once) {
				printf("sched_priority: invalid priority %d",
				    pri);
				printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n",
				    td->td_proc->p_nice,
				    td->td_sched->ts_ticks,
				    td->td_sched->ts_ftick,
				    td->td_sched->ts_ltick,
				    SCHED_PRI_TICKS(td->td_sched));
				once = 0;
			}
			pri = min(max(pri, PRI_MIN_TIMESHARE),
			    PRI_MAX_TIMESHARE);
		}
	}
	sched_user_prio(td, pri);

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 */
static void
sched_interact_update(struct thread *td)
{
	struct td_sched *ts;
	int sum;

	ts = td->td_sched;
	sum = ts->skg_runtime + ts->skg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * This only happens from two places:
	 * 1) We have added an unusual amount of run time from fork_exit.
	 * 2) We have added an unusual amount of sleep time from sched_sleep().
	 */
	if (sum > SCHED_SLP_RUN_MAX * 2) {
		if (ts->skg_runtime > ts->skg_slptime) {
			ts->skg_runtime = SCHED_SLP_RUN_MAX;
			ts->skg_slptime = 1;
		} else {
			ts->skg_slptime = SCHED_SLP_RUN_MAX;
			ts->skg_runtime = 1;
		}
		return;
	}
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		ts->skg_runtime /= 2;
		ts->skg_slptime /= 2;
		return;
	}
	ts->skg_runtime = (ts->skg_runtime / 5) * 4;
	ts->skg_slptime = (ts->skg_slptime / 5) * 4;
}

/*
 * Scale back the accumulated sleep/run history inherited by a child at
 * fork so it cannot start with more than SCHED_SLP_RUN_FORK worth.
 */
static void
sched_interact_fork(struct thread *td)
{
	int ratio;
	int sum;

	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		td->td_sched->skg_runtime /= ratio;
		td->td_sched->skg_slptime /= ratio;
	}
}

/*
 * Compute the interactivity score, 0..SCHED_INTERACT_MAX; lower means more
 * interactive.  Threads that sleep more than they run score below HALF,
 * threads that run more than they sleep score above it.
 */
static int
sched_interact_score(struct thread *td)
{
	int div;

	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
	/* The branch above returns, so this behaves as an 'else if'. */
	} if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
		return (td->td_sched->skg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * Called from proc0_init() to bootstrap the scheduler.
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL;	/* XXX */
	thread0.td_sched = &td_sched0;
	td_sched0.ts_ltick = ticks;
	td_sched0.ts_ftick = ticks;
	td_sched0.ts_thread = &thread0;
	td_sched0.ts_state = TSS_THREAD;
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
 */
int
sched_rr_interval(void)
{

	/* Convert sched_slice to hz */
	return (hz/(realstathz/sched_slice));
}

/*
 * Decay the recorded cpu ticks so ts_ticks only reflects roughly the last
 * SCHED_TICK_TARG worth of history, and advance the ftick/ltick window.
 */
static void
sched_pctcpu_update(struct td_sched *ts)
{

	if (ts->ts_ticks == 0)
		return;
	if (ticks - (hz / 10) < ts->ts_ltick &&
	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
		return;
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
			    SCHED_TICK_TARG;
	else
		ts->ts_ticks = 0;
	ts->ts_ltick = ticks;
	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}

static void
sched_thread_priority(struct thread *td, u_char prio)
{
	struct td_sched *ts;

	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
	    curthread->td_proc->p_comm);
	ts = td->td_sched;
	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority == prio)
		return;

	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  This could be optimized to not re-add in some
		 * cases.
		 *
		 * Hold this td_sched on this cpu so that sched_prio() doesn't
		 * cause excessive migration.  We only want migration to
		 * happen as the result of a wakeup.
1238 */ 1239 sched_pin_td(td); 1240 sched_rem(td); 1241 td->td_priority = prio; 1242 sched_add(td, SRQ_BORROWING); 1243 sched_unpin_td(td); 1244 } else 1245 td->td_priority = prio; 1246} 1247 1248/* 1249 * Update a thread's priority when it is lent another thread's 1250 * priority. 1251 */ 1252void 1253sched_lend_prio(struct thread *td, u_char prio) 1254{ 1255 1256 td->td_flags |= TDF_BORROWING; 1257 sched_thread_priority(td, prio); 1258} 1259 1260/* 1261 * Restore a thread's priority when priority propagation is 1262 * over. The prio argument is the minimum priority the thread 1263 * needs to have to satisfy other possible priority lending 1264 * requests. If the thread's regular priority is less 1265 * important than prio, the thread will keep a priority boost 1266 * of prio. 1267 */ 1268void 1269sched_unlend_prio(struct thread *td, u_char prio) 1270{ 1271 u_char base_pri; 1272 1273 if (td->td_base_pri >= PRI_MIN_TIMESHARE && 1274 td->td_base_pri <= PRI_MAX_TIMESHARE) 1275 base_pri = td->td_user_pri; 1276 else 1277 base_pri = td->td_base_pri; 1278 if (prio >= base_pri) { 1279 td->td_flags &= ~TDF_BORROWING; 1280 sched_thread_priority(td, base_pri); 1281 } else 1282 sched_lend_prio(td, prio); 1283} 1284 1285void 1286sched_prio(struct thread *td, u_char prio) 1287{ 1288 u_char oldprio; 1289 1290 /* First, update the base priority. */ 1291 td->td_base_pri = prio; 1292 1293 /* 1294 * If the thread is borrowing another thread's priority, don't 1295 * ever lower the priority. 1296 */ 1297 if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 1298 return; 1299 1300 /* Change the real priority. */ 1301 oldprio = td->td_priority; 1302 sched_thread_priority(td, prio); 1303 1304 /* 1305 * If the thread is on a turnstile, then let the turnstile update 1306 * its state. 
1307 */ 1308 if (TD_ON_LOCK(td) && oldprio != prio) 1309 turnstile_adjust(td, oldprio); 1310} 1311 1312void 1313sched_user_prio(struct thread *td, u_char prio) 1314{ 1315 u_char oldprio; 1316 1317 td->td_base_user_pri = prio; 1318 if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) 1319 return; 1320 oldprio = td->td_user_pri; 1321 td->td_user_pri = prio; 1322 1323 if (TD_ON_UPILOCK(td) && oldprio != prio) 1324 umtx_pi_adjust(td, oldprio); 1325} 1326 1327void 1328sched_lend_user_prio(struct thread *td, u_char prio) 1329{ 1330 u_char oldprio; 1331 1332 td->td_flags |= TDF_UBORROWING; 1333 1334 oldprio = td->td_user_pri; 1335 td->td_user_pri = prio; 1336 1337 if (TD_ON_UPILOCK(td) && oldprio != prio) 1338 umtx_pi_adjust(td, oldprio); 1339} 1340 1341void 1342sched_unlend_user_prio(struct thread *td, u_char prio) 1343{ 1344 u_char base_pri; 1345 1346 base_pri = td->td_base_user_pri; 1347 if (prio >= base_pri) { 1348 td->td_flags &= ~TDF_UBORROWING; 1349 sched_user_prio(td, base_pri); 1350 } else 1351 sched_lend_user_prio(td, prio); 1352} 1353 1354void 1355sched_switch(struct thread *td, struct thread *newtd, int flags) 1356{ 1357 struct tdq *tdq; 1358 struct td_sched *ts; 1359 1360 mtx_assert(&sched_lock, MA_OWNED); 1361 1362 tdq = TDQ_SELF(); 1363 ts = td->td_sched; 1364 td->td_lastcpu = td->td_oncpu; 1365 td->td_oncpu = NOCPU; 1366 td->td_flags &= ~TDF_NEEDRESCHED; 1367 td->td_owepreempt = 0; 1368 /* 1369 * If the thread has been assigned it may be in the process of switching 1370 * to the new cpu. This is the case in sched_bind(). 1371 */ 1372 if (td == PCPU_GET(idlethread)) { 1373 TD_SET_CAN_RUN(td); 1374 } else if ((ts->ts_flags & TSF_ASSIGNED) == 0) { 1375 /* We are ending our run so make our slot available again */ 1376 tdq_load_rem(tdq, ts); 1377 if (TD_IS_RUNNING(td)) { 1378 /* 1379 * Don't allow the thread to migrate 1380 * from a preemption. 1381 */ 1382 sched_pin_td(td); 1383 setrunqueue(td, (flags & SW_PREEMPT) ? 
1384 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : 1385 SRQ_OURSELF|SRQ_YIELDING); 1386 sched_unpin_td(td); 1387 } 1388 } 1389 if (newtd != NULL) { 1390 /* 1391 * If we bring in a thread account for it as if it had been 1392 * added to the run queue and then chosen. 1393 */ 1394 newtd->td_sched->ts_flags |= TSF_DIDRUN; 1395 TD_SET_RUNNING(newtd); 1396 tdq_load_add(TDQ_SELF(), newtd->td_sched); 1397 } else 1398 newtd = choosethread(); 1399 if (td != newtd) { 1400#ifdef HWPMC_HOOKS 1401 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 1402 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); 1403#endif 1404 1405 cpu_switch(td, newtd); 1406#ifdef HWPMC_HOOKS 1407 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 1408 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); 1409#endif 1410 } 1411 sched_lock.mtx_lock = (uintptr_t)td; 1412 td->td_oncpu = PCPU_GET(cpuid); 1413} 1414 1415void 1416sched_nice(struct proc *p, int nice) 1417{ 1418 struct thread *td; 1419 1420 PROC_LOCK_ASSERT(p, MA_OWNED); 1421 mtx_assert(&sched_lock, MA_OWNED); 1422 1423 p->p_nice = nice; 1424 FOREACH_THREAD_IN_PROC(p, td) { 1425 sched_priority(td); 1426 sched_prio(td, td->td_base_user_pri); 1427 } 1428} 1429 1430void 1431sched_sleep(struct thread *td) 1432{ 1433 1434 mtx_assert(&sched_lock, MA_OWNED); 1435 1436 td->td_sched->ts_slptime = ticks; 1437} 1438 1439void 1440sched_wakeup(struct thread *td) 1441{ 1442 int slptime; 1443 1444 mtx_assert(&sched_lock, MA_OWNED); 1445 1446 /* 1447 * If we slept for more than a tick update our interactivity and 1448 * priority. 
1449 */ 1450 slptime = td->td_sched->ts_slptime; 1451 td->td_sched->ts_slptime = 0; 1452 if (slptime && slptime != ticks) { 1453 int hzticks; 1454 1455 hzticks = (ticks - slptime) << SCHED_TICK_SHIFT; 1456 td->td_sched->skg_slptime += hzticks; 1457 sched_interact_update(td); 1458 sched_pctcpu_update(td->td_sched); 1459 sched_priority(td); 1460 } 1461 setrunqueue(td, SRQ_BORING); 1462} 1463 1464/* 1465 * Penalize the parent for creating a new child and initialize the child's 1466 * priority. 1467 */ 1468void 1469sched_fork(struct thread *td, struct thread *child) 1470{ 1471 mtx_assert(&sched_lock, MA_OWNED); 1472 sched_fork_thread(td, child); 1473 /* 1474 * Penalize the parent and child for forking. 1475 */ 1476 sched_interact_fork(child); 1477 sched_priority(child); 1478 td->td_sched->skg_runtime += tickincr; 1479 sched_interact_update(td); 1480 sched_priority(td); 1481} 1482 1483void 1484sched_fork_thread(struct thread *td, struct thread *child) 1485{ 1486 struct td_sched *ts; 1487 struct td_sched *ts2; 1488 1489 /* 1490 * Initialize child. 1491 */ 1492 sched_newthread(child); 1493 ts = td->td_sched; 1494 ts2 = child->td_sched; 1495 ts2->ts_cpu = ts->ts_cpu; 1496 ts2->ts_runq = NULL; 1497 /* 1498 * Grab our parents cpu estimation information and priority. 1499 */ 1500 ts2->ts_ticks = ts->ts_ticks; 1501 ts2->ts_ltick = ts->ts_ltick; 1502 ts2->ts_ftick = ts->ts_ftick; 1503 child->td_user_pri = td->td_user_pri; 1504 child->td_base_user_pri = td->td_base_user_pri; 1505 /* 1506 * And update interactivity score. 1507 */ 1508 ts2->skg_slptime = ts->skg_slptime; 1509 ts2->skg_runtime = ts->skg_runtime; 1510 ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. 
*/ 1511} 1512 1513void 1514sched_class(struct thread *td, int class) 1515{ 1516 1517 mtx_assert(&sched_lock, MA_OWNED); 1518 if (td->td_pri_class == class) 1519 return; 1520 1521#ifdef SMP 1522 /* 1523 * On SMP if we're on the RUNQ we must adjust the transferable 1524 * count because could be changing to or from an interrupt 1525 * class. 1526 */ 1527 if (td->td_sched->ts_state == TSS_ONRUNQ) { 1528 struct tdq *tdq; 1529 1530 tdq = TDQ_CPU(td->td_sched->ts_cpu); 1531 if (THREAD_CAN_MIGRATE(td)) { 1532 tdq->tdq_transferable--; 1533 tdq->tdq_group->tdg_transferable--; 1534 } 1535 td->td_pri_class = class; 1536 if (THREAD_CAN_MIGRATE(td)) { 1537 tdq->tdq_transferable++; 1538 tdq->tdq_group->tdg_transferable++; 1539 } 1540 } 1541#endif 1542 td->td_pri_class = class; 1543} 1544 1545/* 1546 * Return some of the child's priority and interactivity to the parent. 1547 */ 1548void 1549sched_exit(struct proc *p, struct thread *child) 1550{ 1551 struct thread *td; 1552 1553 CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", 1554 child, child->td_proc->p_comm, child->td_priority); 1555 1556 td = FIRST_THREAD_IN_PROC(p); 1557 sched_exit_thread(td, child); 1558} 1559 1560void 1561sched_exit_thread(struct thread *td, struct thread *child) 1562{ 1563 1564 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", 1565 child, child->td_proc->p_comm, child->td_priority); 1566 1567 tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched); 1568#ifdef KSE 1569 /* 1570 * KSE forks and exits so often that this penalty causes short-lived 1571 * threads to always be non-interactive. This causes mozilla to 1572 * crawl under load. 1573 */ 1574 if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc) 1575 return; 1576#endif 1577 /* 1578 * Give the child's runtime to the parent without returning the 1579 * sleep time as a penalty to the parent. This causes shells that 1580 * launch expensive things to mark their children as expensive. 
1581 */ 1582 td->td_sched->skg_runtime += child->td_sched->skg_runtime; 1583 sched_interact_update(td); 1584 sched_priority(td); 1585} 1586 1587void 1588sched_userret(struct thread *td) 1589{ 1590 /* 1591 * XXX we cheat slightly on the locking here to avoid locking in 1592 * the usual case. Setting td_priority here is essentially an 1593 * incomplete workaround for not setting it properly elsewhere. 1594 * Now that some interrupt handlers are threads, not setting it 1595 * properly elsewhere can clobber it in the window between setting 1596 * it here and returning to user mode, so don't waste time setting 1597 * it perfectly here. 1598 */ 1599 KASSERT((td->td_flags & TDF_BORROWING) == 0, 1600 ("thread with borrowed priority returning to userland")); 1601 if (td->td_priority != td->td_user_pri) { 1602 mtx_lock_spin(&sched_lock); 1603 td->td_priority = td->td_user_pri; 1604 td->td_base_pri = td->td_user_pri; 1605 mtx_unlock_spin(&sched_lock); 1606 } 1607} 1608 1609void 1610sched_clock(struct thread *td) 1611{ 1612 struct tdq *tdq; 1613 struct td_sched *ts; 1614 1615 mtx_assert(&sched_lock, MA_OWNED); 1616#ifdef SMP 1617 sched_smp_tick(); 1618#endif 1619 tdq = TDQ_SELF(); 1620 /* 1621 * Advance the insert index once for each tick to ensure that all 1622 * threads get a chance to run. 1623 */ 1624 if (tdq->tdq_idx == tdq->tdq_ridx) { 1625 tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; 1626 if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) 1627 tdq->tdq_ridx = tdq->tdq_idx; 1628 } 1629 /* Adjust ticks for pctcpu */ 1630 ts = td->td_sched; 1631 ts->ts_ticks += tickincr; 1632 ts->ts_ltick = ticks; 1633 /* 1634 * Update if we've exceeded our desired tick threshhold by over one 1635 * second. 1636 */ 1637 if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick) 1638 sched_pctcpu_update(ts); 1639 /* 1640 * We only do slicing code for TIMESHARE threads. 
1641 */ 1642 if (td->td_pri_class != PRI_TIMESHARE) 1643 return; 1644 /* 1645 * We used a tick; charge it to the thread so that we can compute our 1646 * interactivity. 1647 */ 1648 td->td_sched->skg_runtime += tickincr; 1649 sched_interact_update(td); 1650 /* 1651 * We used up one time slice. 1652 */ 1653 if (--ts->ts_slice > 0) 1654 return; 1655 /* 1656 * We're out of time, recompute priorities and requeue. 1657 */ 1658 sched_priority(td); 1659 tdq_load_rem(tdq, ts); 1660 ts->ts_slice = sched_slice; 1661 tdq_load_add(tdq, ts); 1662 td->td_flags |= TDF_NEEDRESCHED; 1663} 1664 1665int 1666sched_runnable(void) 1667{ 1668 struct tdq *tdq; 1669 int load; 1670 1671 load = 1; 1672 1673 tdq = TDQ_SELF(); 1674#ifdef SMP 1675 if (tdq->tdq_assigned) { 1676 mtx_lock_spin(&sched_lock); 1677 tdq_assign(tdq); 1678 mtx_unlock_spin(&sched_lock); 1679 } 1680#endif 1681 if ((curthread->td_flags & TDF_IDLETD) != 0) { 1682 if (tdq->tdq_load > 0) 1683 goto out; 1684 } else 1685 if (tdq->tdq_load - 1 > 0) 1686 goto out; 1687 load = 0; 1688out: 1689 return (load); 1690} 1691 1692struct td_sched * 1693sched_choose(void) 1694{ 1695 struct tdq *tdq; 1696 struct td_sched *ts; 1697 1698 mtx_assert(&sched_lock, MA_OWNED); 1699 tdq = TDQ_SELF(); 1700#ifdef SMP 1701restart: 1702 if (tdq->tdq_assigned) 1703 tdq_assign(tdq); 1704#endif 1705 ts = tdq_choose(tdq); 1706 if (ts) { 1707#ifdef SMP 1708 if (ts->ts_thread->td_priority > PRI_MIN_IDLE) 1709 if (tdq_idled(tdq) == 0) 1710 goto restart; 1711#endif 1712 tdq_runq_rem(tdq, ts); 1713 ts->ts_state = TSS_THREAD; 1714 return (ts); 1715 } 1716#ifdef SMP 1717 if (tdq_idled(tdq) == 0) 1718 goto restart; 1719#endif 1720 return (NULL); 1721} 1722 1723void 1724sched_add(struct thread *td, int flags) 1725{ 1726 struct tdq *tdq; 1727 struct td_sched *ts; 1728 int preemptive; 1729 int canmigrate; 1730 int class; 1731 1732 CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", 1733 td, td->td_proc->p_comm, td->td_priority, curthread, 1734 
curthread->td_proc->p_comm); 1735 mtx_assert(&sched_lock, MA_OWNED); 1736 tdq = TDQ_SELF(); 1737 ts = td->td_sched; 1738 class = PRI_BASE(td->td_pri_class); 1739 preemptive = !(flags & SRQ_YIELDING); 1740 canmigrate = 1; 1741#ifdef SMP 1742 if (ts->ts_flags & TSF_ASSIGNED) { 1743 if (ts->ts_flags & TSF_REMOVED) 1744 ts->ts_flags &= ~TSF_REMOVED; 1745 return; 1746 } 1747 canmigrate = THREAD_CAN_MIGRATE(td); 1748#endif 1749 KASSERT(ts->ts_state != TSS_ONRUNQ, 1750 ("sched_add: thread %p (%s) already in run queue", td, 1751 td->td_proc->p_comm)); 1752 KASSERT(td->td_proc->p_sflag & PS_INMEM, 1753 ("sched_add: process swapped out")); 1754 KASSERT(ts->ts_runq == NULL, 1755 ("sched_add: thread %p is still assigned to a run queue", td)); 1756 /* 1757 * Set the slice and pick the run queue. 1758 */ 1759 if (ts->ts_slice == 0) 1760 ts->ts_slice = sched_slice; 1761 if (class == PRI_TIMESHARE) 1762 sched_priority(td); 1763 if (td->td_priority <= PRI_MAX_REALTIME) { 1764 ts->ts_runq = &tdq->tdq_realtime; 1765 /* 1766 * If the thread is not artificially pinned and it's in 1767 * the realtime queue we directly dispatch it on this cpu 1768 * for minimum latency. Interrupt handlers may also have 1769 * to complete on the cpu that dispatched them. 1770 */ 1771 if (td->td_pinned == 0 && class == PRI_ITHD) 1772 ts->ts_cpu = PCPU_GET(cpuid); 1773 } else if (td->td_priority <= PRI_MAX_TIMESHARE) 1774 ts->ts_runq = &tdq->tdq_timeshare; 1775 else 1776 ts->ts_runq = &tdq->tdq_idle; 1777 1778#ifdef SMP 1779 /* 1780 * If this thread is pinned or bound, notify the target cpu. 1781 */ 1782 if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid) ) { 1783 ts->ts_runq = NULL; 1784 tdq_notify(ts, ts->ts_cpu); 1785 return; 1786 } 1787 /* 1788 * If we had been idle, clear our bit in the group and potentially 1789 * the global bitmap. If not, see if we should transfer this thread. 
1790 */ 1791 if ((class != PRI_IDLE && class != PRI_ITHD) && 1792 (tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) { 1793 /* 1794 * Check to see if our group is unidling, and if so, remove it 1795 * from the global idle mask. 1796 */ 1797 if (tdq->tdq_group->tdg_idlemask == 1798 tdq->tdq_group->tdg_cpumask) 1799 atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask); 1800 /* 1801 * Now remove ourselves from the group specific idle mask. 1802 */ 1803 tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask); 1804 } else if (canmigrate && tdq->tdq_load > 1) 1805 if (tdq_transfer(tdq, ts, class)) 1806 return; 1807 ts->ts_cpu = PCPU_GET(cpuid); 1808#endif 1809 if (td->td_priority < curthread->td_priority) 1810 curthread->td_flags |= TDF_NEEDRESCHED; 1811 if (preemptive && maybe_preempt(td)) 1812 return; 1813 ts->ts_state = TSS_ONRUNQ; 1814 1815 tdq_runq_add(tdq, ts, flags); 1816 tdq_load_add(tdq, ts); 1817} 1818 1819void 1820sched_rem(struct thread *td) 1821{ 1822 struct tdq *tdq; 1823 struct td_sched *ts; 1824 1825 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", 1826 td, td->td_proc->p_comm, td->td_priority, curthread, 1827 curthread->td_proc->p_comm); 1828 mtx_assert(&sched_lock, MA_OWNED); 1829 ts = td->td_sched; 1830 if (ts->ts_flags & TSF_ASSIGNED) { 1831 ts->ts_flags |= TSF_REMOVED; 1832 return; 1833 } 1834 KASSERT((ts->ts_state == TSS_ONRUNQ), 1835 ("sched_rem: thread not on run queue")); 1836 1837 ts->ts_state = TSS_THREAD; 1838 tdq = TDQ_CPU(ts->ts_cpu); 1839 tdq_runq_rem(tdq, ts); 1840 tdq_load_rem(tdq, ts); 1841} 1842 1843fixpt_t 1844sched_pctcpu(struct thread *td) 1845{ 1846 fixpt_t pctcpu; 1847 struct td_sched *ts; 1848 1849 pctcpu = 0; 1850 ts = td->td_sched; 1851 if (ts == NULL) 1852 return (0); 1853 1854 mtx_lock_spin(&sched_lock); 1855 if (ts->ts_ticks) { 1856 int rtick; 1857 1858 sched_pctcpu_update(ts); 1859 /* How many rtick per second ? 
*/ 1860 rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); 1861 pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; 1862 } 1863 td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; 1864 mtx_unlock_spin(&sched_lock); 1865 1866 return (pctcpu); 1867} 1868 1869void 1870sched_bind(struct thread *td, int cpu) 1871{ 1872 struct td_sched *ts; 1873 1874 mtx_assert(&sched_lock, MA_OWNED); 1875 ts = td->td_sched; 1876 KASSERT((ts->ts_flags & TSF_BOUND) == 0, 1877 ("sched_bind: thread %p already bound.", td)); 1878 ts->ts_flags |= TSF_BOUND; 1879#ifdef SMP 1880 if (PCPU_GET(cpuid) == cpu) 1881 return; 1882 /* sched_rem without the runq_remove */ 1883 ts->ts_state = TSS_THREAD; 1884 tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts); 1885 tdq_notify(ts, cpu); 1886 /* When we return from mi_switch we'll be on the correct cpu. */ 1887 mi_switch(SW_VOL, NULL); 1888 sched_pin(); 1889#endif 1890} 1891 1892void 1893sched_unbind(struct thread *td) 1894{ 1895 struct td_sched *ts; 1896 1897 mtx_assert(&sched_lock, MA_OWNED); 1898 ts = td->td_sched; 1899 KASSERT(ts->ts_flags & TSF_BOUND, 1900 ("sched_unbind: thread %p not bound.", td)); 1901 mtx_assert(&sched_lock, MA_OWNED); 1902 ts->ts_flags &= ~TSF_BOUND; 1903#ifdef SMP 1904 sched_unpin(); 1905#endif 1906} 1907 1908int 1909sched_is_bound(struct thread *td) 1910{ 1911 mtx_assert(&sched_lock, MA_OWNED); 1912 return (td->td_sched->ts_flags & TSF_BOUND); 1913} 1914 1915void 1916sched_relinquish(struct thread *td) 1917{ 1918 mtx_lock_spin(&sched_lock); 1919 if (td->td_pri_class == PRI_TIMESHARE) 1920 sched_prio(td, PRI_MAX_TIMESHARE); 1921 mi_switch(SW_VOL, NULL); 1922 mtx_unlock_spin(&sched_lock); 1923} 1924 1925int 1926sched_load(void) 1927{ 1928#ifdef SMP 1929 int total; 1930 int i; 1931 1932 total = 0; 1933 for (i = 0; i <= tdg_maxid; i++) 1934 total += TDQ_GROUP(i)->tdg_load; 1935 return (total); 1936#else 1937 return (TDQ_SELF()->tdq_sysload); 1938#endif 1939} 1940 1941int 1942sched_sizeof_proc(void) 1943{ 1944 return (sizeof(struct proc)); 
1945} 1946 1947int 1948sched_sizeof_thread(void) 1949{ 1950 return (sizeof(struct thread) + sizeof(struct td_sched)); 1951} 1952 1953void 1954sched_tick(void) 1955{ 1956} 1957 1958static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 1959SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 1960 "Scheduler name"); 1961SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, ""); 1962SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, ""); 1963SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, ""); 1964SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, ""); 1965SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &sched_rebalance, 0, ""); 1966 1967/* ps compat */ 1968static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 1969SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 1970 1971 1972#define KERN_SWITCH_INCLUDE 1 1973#include "kern/kern_switch.c" 1974