sched_ule.c revision 165766
1/*- 2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 165766 2007-01-04 12:16:19Z jeff $"); 29 30#include "opt_hwpmc_hooks.h" 31#include "opt_sched.h" 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/kdb.h> 36#include <sys/kernel.h> 37#include <sys/ktr.h> 38#include <sys/lock.h> 39#include <sys/mutex.h> 40#include <sys/proc.h> 41#include <sys/resource.h> 42#include <sys/resourcevar.h> 43#include <sys/sched.h> 44#include <sys/smp.h> 45#include <sys/sx.h> 46#include <sys/sysctl.h> 47#include <sys/sysproto.h> 48#include <sys/turnstile.h> 49#include <sys/umtx.h> 50#include <sys/vmmeter.h> 51#ifdef KTRACE 52#include <sys/uio.h> 53#include <sys/ktrace.h> 54#endif 55 56#ifdef HWPMC_HOOKS 57#include <sys/pmckern.h> 58#endif 59 60#include <machine/cpu.h> 61#include <machine/smp.h> 62 63/* 64 * Thread scheduler specific section. 65 */ 66struct td_sched { 67 TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */ 68 int ts_flags; /* (j) TSF_* flags. */ 69 struct thread *ts_thread; /* (*) Active associated thread. */ 70 fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */ 71 u_char ts_rqindex; /* (j) Run queue index. */ 72 enum { 73 TSS_THREAD, 74 TSS_ONRUNQ 75 } ts_state; /* (j) thread sched specific status. */ 76 int ts_slptime; 77 int ts_slice; 78 struct runq *ts_runq; 79 u_char ts_cpu; /* CPU that we have affinity for. */ 80 /* The following variables are only used for pctcpu calculation */ 81 int ts_ltick; /* Last tick that we were running on */ 82 int ts_ftick; /* First tick that we were running on */ 83 int ts_ticks; /* Tick count */ 84 85 /* originally from kg_sched */ 86 int skg_slptime; /* Number of ticks we vol. slept */ 87 int skg_runtime; /* Number of ticks we were running */ 88}; 89#define ts_assign ts_procq.tqe_next 90/* flags kept in ts_flags */ 91#define TSF_ASSIGNED 0x0001 /* Thread is being migrated. */ 92#define TSF_BOUND 0x0002 /* Thread can not migrate. */ 93#define TSF_XFERABLE 0x0004 /* Thread was added as transferable. */ 94#define TSF_HOLD 0x0008 /* Thread is temporarily bound. 
 */
#define	TSF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
#define	TSF_INTERNAL	0x0020		/* Thread added due to migration. */
#define	TSF_DIDRUN	0x2000		/* Thread actually ran. */
#define	TSF_EXIT	0x4000		/* Thread is being killed. */

static struct td_sched td_sched0;

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
 */
#define	SCHED_TICK_SECS		10
#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
#define	SCHED_TICK_SHIFT	10
#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
#define	SCHED_TICK_TOTAL(ts)	((ts)->ts_ltick - (ts)->ts_ftick)

/*
 * These macros determine priorities for non-interactive threads.  They are
 * assigned a priority based on their recent cpu utilization as expressed
 * by the ratio of ticks to the tick total.  NHALF priorities at the start
 * and end of the MIN to MAX timeshare range are only reachable with negative
 * or positive nice respectively.
 *
 * PRI_RANGE:	Priority range for utilization dependent priorities.
 * PRI_NRESV:	Number of nice values.
 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
 * PRI_NICE:	Determines the part of the priority inherited from nice.
 */
#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
#define	SCHED_PRI_TICKS(ts)						\
    (SCHED_TICK_HZ((ts)) /						\
    (max(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define	SCHED_PRI_NICE(nice)	(nice)

/*
 * These determine the interactivity of a process.  Interactivity differs from
 * cpu utilization in that it expresses the voluntary time slept vs time ran
 * while cpu utilization includes all time not running.  This more accurately
 * models the intent of the thread.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * tickincr:		Converts a stathz tick into a hz domain scaled by
 *			the shift factor.  Without the shift the error rate
 *			due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0 and run off of hz.
 * sched_slice:		Runtime of each thread before rescheduling.
 */
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;
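/*
 * Worked example of the two macro families above (illustrative only; the
 * concrete numbers assume hz = 1000 and stathz = 128, which nothing in this
 * file requires): a thread that ran continuously for the whole
 * SCHED_TICK_SECS window has SCHED_TICK_HZ roughly equal to
 * SCHED_TICK_TOTAL, so SCHED_PRI_TICKS maps it to nearly the full
 * SCHED_PRI_RANGE penalty; one that ran half the time gets roughly half the
 * penalty.  SCHED_PRI_NICE then shifts the result by the nice value, which
 * is why only nice'd threads can reach the outer NHALF priorities.
 *
 * For interactivity, the score computed later from skg_slptime and
 * skg_runtime is compared against SCHED_INTERACT_THRESH (30): a thread that
 * slept three times as long as it ran scores well under 30 and is treated
 * as interactive, while one that ran three times as long as it slept scores
 * in the 80s and is queued as a batch job.
 */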
/*
 * tdq - per processor runqs and statistics.
 */
struct tdq {
	struct runq	tdq_idle;		/* Queue of IDLE threads. */
	struct runq	tdq_timeshare;		/* timeshare run queue. */
	struct runq	tdq_realtime;		/* real-time run queue. */
	int		tdq_idx;		/* Current insert index. */
	int		tdq_ridx;		/* Current removal index. */
	int		tdq_load_timeshare;	/* Load for timeshare. */
	int		tdq_load;		/* Aggregate load. */
#ifdef SMP
	int		tdq_transferable;
	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
	struct tdq_group *tdq_group;		/* Our processor group. */
	volatile struct td_sched *tdq_assigned;	/* assigned by another CPU. */
#else
	int		tdq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * tdq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a numa environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct tdq_group {
	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
	int	tdg_load;		/* Total load of this group. */
	int	tdg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
};
#endif

/*
 * One thread queue per processor.
 */
#ifdef SMP
static cpumask_t tdq_idle;
static int tdg_maxid;
static struct tdq	tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
static int bal_tick;
static int gbal_tick;
static int balance_groups;

#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
#define	TDQ_ID(x)	((x) - tdq_cpu)
#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
#else	/* !SMP */
static struct tdq	tdq_cpu;

#define	TDQ_SELF()	(&tdq_cpu)
#define	TDQ_CPU(x)	(&tdq_cpu)
#endif

static struct td_sched *sched_choose(void);	/* XXX Should be thread * */
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);

/* Operations on per processor queues */
static struct td_sched * tdq_choose(struct tdq *);
static void tdq_setup(struct tdq *);
static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
#ifdef SMP
static int tdq_transfer(struct tdq *, struct td_sched *, int);
static struct td_sched *runq_steal(struct runq *);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
static void sched_smp_tick(void);
static void tdq_move(struct tdq *, int);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct td_sched *, int);
static void tdq_assign(struct tdq *);
static
struct td_sched *tdq_steal(struct tdq *, int); 263#define THREAD_CAN_MIGRATE(td) \ 264 ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD) 265#endif 266 267static void sched_setup(void *dummy); 268SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 269 270static void sched_initticks(void *dummy); 271SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) 272 273static void 274runq_print(struct runq *rq) 275{ 276 struct rqhead *rqh; 277 struct td_sched *ts; 278 int pri; 279 int j; 280 int i; 281 282 for (i = 0; i < RQB_LEN; i++) { 283 printf("\t\trunq bits %d 0x%zx\n", 284 i, rq->rq_status.rqb_bits[i]); 285 for (j = 0; j < RQB_BPW; j++) 286 if (rq->rq_status.rqb_bits[i] & (1ul << j)) { 287 pri = j + (i << RQB_L2BPW); 288 rqh = &rq->rq_queues[pri]; 289 TAILQ_FOREACH(ts, rqh, ts_procq) { 290 printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", 291 ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri); 292 } 293 } 294 } 295} 296 297void 298tdq_print(int cpu) 299{ 300 struct tdq *tdq; 301 302 tdq = TDQ_CPU(cpu); 303 304 printf("tdq:\n"); 305 printf("\tload: %d\n", tdq->tdq_load); 306 printf("\tload TIMESHARE: %d\n", tdq->tdq_load_timeshare); 307 printf("\ttimeshare idx: %d\n", tdq->tdq_idx); 308 printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); 309 printf("\trealtime runq:\n"); 310 runq_print(&tdq->tdq_realtime); 311 printf("\ttimeshare runq:\n"); 312 runq_print(&tdq->tdq_timeshare); 313 printf("\tidle runq:\n"); 314 runq_print(&tdq->tdq_idle); 315#ifdef SMP 316 printf("\tload transferable: %d\n", tdq->tdq_transferable); 317#endif 318} 319 320static __inline void 321tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) 322{ 323#ifdef SMP 324 if (THREAD_CAN_MIGRATE(ts->ts_thread)) { 325 tdq->tdq_transferable++; 326 tdq->tdq_group->tdg_transferable++; 327 ts->ts_flags |= TSF_XFERABLE; 328 } 329#endif 330 if (ts->ts_runq == &tdq->tdq_timeshare) { 331 int pri; 332 333 pri = ts->ts_thread->td_priority; 334 KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, 335 ("Invalid priority %d on timeshare runq", pri)); 336 /* 337 * This queue contains only priorities between MIN and MAX 338 * realtime. Use the whole queue to represent these values. 339 */ 340#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) 341 if ((flags & SRQ_BORROWING) == 0) { 342 pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; 343 pri = (pri + tdq->tdq_idx) % RQ_NQS; 344 /* 345 * This effectively shortens the queue by one so we 346 * can have a one slot difference between idx and 347 * ridx while we wait for threads to drain. 
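			 *
			 * As an illustration (not tied to any particular
			 * RQ_NQS value): if the insert index tdq_idx is 10
			 * and a thread's scaled timeshare priority is 3, it
			 * lands in queue (3 + 10) % RQ_NQS = 13; a result
			 * that collides with tdq_ridx is pushed back one
			 * slot so the oldest queue can drain first.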
348 */ 349 if (tdq->tdq_ridx != tdq->tdq_idx && 350 pri == tdq->tdq_ridx) 351 pri = (pri - 1) % RQ_NQS; 352 } else 353 pri = tdq->tdq_ridx; 354 runq_add_pri(ts->ts_runq, ts, pri, flags); 355 } else 356 runq_add(ts->ts_runq, ts, flags); 357} 358 359static __inline void 360tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) 361{ 362#ifdef SMP 363 if (ts->ts_flags & TSF_XFERABLE) { 364 tdq->tdq_transferable--; 365 tdq->tdq_group->tdg_transferable--; 366 ts->ts_flags &= ~TSF_XFERABLE; 367 } 368#endif 369 if (ts->ts_runq == &tdq->tdq_timeshare) { 370 if (tdq->tdq_idx != tdq->tdq_ridx) 371 runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); 372 else 373 runq_remove_idx(ts->ts_runq, ts, NULL); 374 } else 375 runq_remove(ts->ts_runq, ts); 376} 377 378static void 379tdq_load_add(struct tdq *tdq, struct td_sched *ts) 380{ 381 int class; 382 mtx_assert(&sched_lock, MA_OWNED); 383 class = PRI_BASE(ts->ts_thread->td_pri_class); 384 if (class == PRI_TIMESHARE) 385 tdq->tdq_load_timeshare++; 386 tdq->tdq_load++; 387 CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); 388 if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 389#ifdef SMP 390 tdq->tdq_group->tdg_load++; 391#else 392 tdq->tdq_sysload++; 393#endif 394} 395 396static void 397tdq_load_rem(struct tdq *tdq, struct td_sched *ts) 398{ 399 int class; 400 mtx_assert(&sched_lock, MA_OWNED); 401 class = PRI_BASE(ts->ts_thread->td_pri_class); 402 if (class == PRI_TIMESHARE) 403 tdq->tdq_load_timeshare--; 404 if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 405#ifdef SMP 406 tdq->tdq_group->tdg_load--; 407#else 408 tdq->tdq_sysload--; 409#endif 410 tdq->tdq_load--; 411 CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); 412 ts->ts_runq = NULL; 413} 414 415#ifdef SMP 416static void 417sched_smp_tick(void) 418{ 419 struct tdq *tdq; 420 421 tdq = TDQ_SELF(); 422 if (ticks >= bal_tick) 423 sched_balance(); 424 if (ticks >= gbal_tick && balance_groups) 425 sched_balance_groups(); 426 /* 427 * We could have been assigned a non real-time thread without an 428 * IPI. 429 */ 430 if (tdq->tdq_assigned) 431 tdq_assign(tdq); /* Potentially sets NEEDRESCHED */ 432} 433 434/* 435 * sched_balance is a simple CPU load balancing algorithm. It operates by 436 * finding the least loaded and most loaded cpu and equalizing their load 437 * by migrating some processes. 438 * 439 * Dealing only with two CPUs at a time has two advantages. Firstly, most 440 * installations will only have 2 cpus. Secondly, load balancing too much at 441 * once can have an unpleasant effect on the system. The scheduler rarely has 442 * enough information to make perfect decisions. So this algorithm chooses 443 * algorithm simplicity and more gradual effects on load in larger systems. 444 * 445 * It could be improved by considering the priorities and slices assigned to 446 * each task prior to balancing them. There are many pathological cases with 447 * any approach and so the semi random algorithm below may work as well as any. 448 * 449 */ 450static void 451sched_balance(void) 452{ 453 struct tdq_group *high; 454 struct tdq_group *low; 455 struct tdq_group *tdg; 456 int cnt; 457 int i; 458 459 bal_tick = ticks + (random() % (hz * 2)); 460 if (smp_started == 0) 461 return; 462 low = high = NULL; 463 i = random() % (tdg_maxid + 1); 464 for (cnt = 0; cnt <= tdg_maxid; cnt++) { 465 tdg = TDQ_GROUP(i); 466 /* 467 * Find the CPU with the highest load that has some 468 * threads to transfer. 
		 */
		if ((high == NULL || tdg->tdg_load > high->tdg_load)
		    && tdg->tdg_transferable)
			high = tdg;
		if (low == NULL || tdg->tdg_load < low->tdg_load)
			low = tdg;
		if (++i > tdg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->tdg_members),
		    LIST_FIRST(&low->tdg_members));
}

static void
sched_balance_groups(void)
{
	int i;

	gbal_tick = ticks + (random() % (hz * 2));
	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= tdg_maxid; i++)
			sched_balance_group(TDQ_GROUP(i));
}

static void
sched_balance_group(struct tdq_group *tdg)
{
	struct tdq *tdq;
	struct tdq *high;
	struct tdq *low;
	int load;

	if (tdg->tdg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
		load = tdq->tdq_load;
		if (high == NULL || load > high->tdq_load)
			high = tdq;
		if (low == NULL || load < low->tdq_load)
			low = tdq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct tdq *high, struct tdq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * tdq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->tdq_group == low->tdq_group) {
		transferable = high->tdq_transferable;
		high_load = high->tdq_load;
		low_load = low->tdq_load;
	} else {
		transferable = high->tdq_group->tdg_transferable;
		high_load = high->tdq_group->tdg_load;
		low_load = low->tdq_group->tdg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * threads we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		tdq_move(high, TDQ_ID(low));
	return;
}

static void
tdq_move(struct tdq *from, int cpu)
{
	struct tdq *tdq;
	struct tdq *to;
	struct td_sched *ts;

	tdq = from;
	to = TDQ_CPU(cpu);
	ts = tdq_steal(tdq, 1);
	if (ts == NULL) {
		struct tdq_group *tdg;

		tdg = tdq->tdq_group;
		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
			if (tdq == from || tdq->tdq_transferable == 0)
				continue;
			ts = tdq_steal(tdq, 1);
			break;
		}
		if (ts == NULL)
			panic("tdq_move: No threads available with a "
			    "transferable count of %d\n",
			    tdg->tdg_transferable);
	}
	if (tdq == to)
		return;
	ts->ts_state = TSS_THREAD;
	tdq_runq_rem(tdq, ts);
	tdq_load_rem(tdq, ts);
	tdq_notify(ts, cpu);
}
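/*
 * Illustrative numbers for the pair balancing above (not taken from any
 * particular workload): with high_load = 7 and low_load = 2 the difference
 * is 5, so move rounds up to 3 and the busier queue hands over three
 * threads; the min() against the transferable count keeps threads that
 * cannot migrate (pinned, bound, or interrupt threads) out of the transfer.
 */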
static int
tdq_idled(struct tdq *tdq)
{
	struct tdq_group *tdg;
	struct tdq *steal;
	struct td_sched *ts;

	tdg = tdq->tdq_group;
	/*
	 * If we're in a cpu group, try and steal threads from another cpu in
	 * the group before idling.
	 */
	if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
			if (steal == tdq || steal->tdq_transferable == 0)
				continue;
			ts = tdq_steal(steal, 0);
			if (ts == NULL)
				continue;
			ts->ts_state = TSS_THREAD;
			tdq_runq_rem(steal, ts);
			tdq_load_rem(steal, ts);
			ts->ts_cpu = PCPU_GET(cpuid);
			ts->ts_flags |= TSF_INTERNAL | TSF_HOLD;
			sched_add(ts->ts_thread, SRQ_YIELDING);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a thread
	 * bounces back and forth between two idle cores on separate physical
	 * CPUs.
	 */
	tdg->tdg_idlemask |= PCPU_GET(cpumask);
	if (tdg->tdg_idlemask != tdg->tdg_cpumask)
		return (1);
	atomic_set_int(&tdq_idle, tdg->tdg_mask);
	return (1);
}

static void
tdq_assign(struct tdq *tdq)
{
	struct td_sched *nts;
	struct td_sched *ts;

	do {
		*(volatile struct td_sched **)&ts = tdq->tdq_assigned;
	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
	    (uintptr_t)ts, (uintptr_t)NULL));
	for (; ts != NULL; ts = nts) {
		nts = ts->ts_assign;
		tdq->tdq_group->tdg_load--;
		tdq->tdq_load--;
		ts->ts_flags &= ~TSF_ASSIGNED;
		if (ts->ts_flags & TSF_REMOVED) {
			ts->ts_flags &= ~TSF_REMOVED;
			continue;
		}
		ts->ts_flags |= TSF_INTERNAL | TSF_HOLD;
		sched_add(ts->ts_thread, SRQ_YIELDING);
	}
}

static void
tdq_notify(struct td_sched *ts, int cpu)
{
	struct tdq *tdq;
	struct thread *td;
	struct pcpu *pcpu;
	int class;
	int prio;

	tdq = TDQ_CPU(cpu);
	class = PRI_BASE(ts->ts_thread->td_pri_class);
	if ((class != PRI_IDLE && class != PRI_ITHD)
	    && (tdq_idle & tdq->tdq_group->tdg_mask))
		atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
	tdq->tdq_group->tdg_load++;
	tdq->tdq_load++;
	ts->ts_cpu = cpu;
	ts->ts_flags |= TSF_ASSIGNED;
	prio = ts->ts_thread->td_priority;

	/*
	 * Place a thread on another cpu's queue and force a resched.
	 */
	do {
		*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
	    (uintptr_t)ts->ts_assign, (uintptr_t)ts));
	/*
	 * Without sched_lock we could lose a race where we set NEEDRESCHED
	 * on a thread that is switched out before the IPI is delivered.  This
	 * would lead us to miss the resched.  This will be a problem once
	 * sched_lock is pushed down.
687 */ 688 pcpu = pcpu_find(cpu); 689 td = pcpu->pc_curthread; 690 if (ts->ts_thread->td_priority < td->td_priority || 691 td == pcpu->pc_idlethread) { 692 td->td_flags |= TDF_NEEDRESCHED; 693 ipi_selected(1 << cpu, IPI_AST); 694 } 695} 696 697static struct td_sched * 698runq_steal(struct runq *rq) 699{ 700 struct rqhead *rqh; 701 struct rqbits *rqb; 702 struct td_sched *ts; 703 int word; 704 int bit; 705 706 mtx_assert(&sched_lock, MA_OWNED); 707 rqb = &rq->rq_status; 708 for (word = 0; word < RQB_LEN; word++) { 709 if (rqb->rqb_bits[word] == 0) 710 continue; 711 for (bit = 0; bit < RQB_BPW; bit++) { 712 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 713 continue; 714 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 715 TAILQ_FOREACH(ts, rqh, ts_procq) { 716 if (THREAD_CAN_MIGRATE(ts->ts_thread)) 717 return (ts); 718 } 719 } 720 } 721 return (NULL); 722} 723 724static struct td_sched * 725tdq_steal(struct tdq *tdq, int stealidle) 726{ 727 struct td_sched *ts; 728 729 /* 730 * Steal from next first to try to get a non-interactive task that 731 * may not have run for a while. 732 * XXX Need to effect steal order for timeshare threads. 733 */ 734 if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) 735 return (ts); 736 if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL) 737 return (ts); 738 if (stealidle) 739 return (runq_steal(&tdq->tdq_idle)); 740 return (NULL); 741} 742 743int 744tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class) 745{ 746 struct tdq_group *ntdg; 747 struct tdq_group *tdg; 748 struct tdq *old; 749 int cpu; 750 int idx; 751 752 if (smp_started == 0) 753 return (0); 754 cpu = 0; 755 /* 756 * If our load exceeds a certain threshold we should attempt to 757 * reassign this thread. The first candidate is the cpu that 758 * originally ran the thread. If it is idle, assign it there, 759 * otherwise, pick an idle cpu. 760 * 761 * The threshold at which we start to reassign has a large impact 762 * on the overall performance of the system. Tuned too high and 763 * some CPUs may idle. Too low and there will be excess migration 764 * and context switches. 765 */ 766 old = TDQ_CPU(ts->ts_cpu); 767 ntdg = old->tdq_group; 768 tdg = tdq->tdq_group; 769 if (tdq_idle) { 770 if (tdq_idle & ntdg->tdg_mask) { 771 cpu = ffs(ntdg->tdg_idlemask); 772 if (cpu) { 773 CTR2(KTR_SCHED, 774 "tdq_transfer: %p found old cpu %X " 775 "in idlemask.", ts, cpu); 776 goto migrate; 777 } 778 } 779 /* 780 * Multiple cpus could find this bit simultaneously 781 * but the race shouldn't be terrible. 782 */ 783 cpu = ffs(tdq_idle); 784 if (cpu) { 785 CTR2(KTR_SCHED, "tdq_transfer: %p found %X " 786 "in idlemask.", ts, cpu); 787 goto migrate; 788 } 789 } 790 idx = 0; 791#if 0 792 if (old->tdq_load < tdq->tdq_load) { 793 cpu = ts->ts_cpu + 1; 794 CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X " 795 "load less than ours.", ts, cpu); 796 goto migrate; 797 } 798 /* 799 * No new CPU was found, look for one with less load. 800 */ 801 for (idx = 0; idx <= tdg_maxid; idx++) { 802 ntdg = TDQ_GROUP(idx); 803 if (ntdg->tdg_load /*+ (ntdg->tdg_cpus * 2)*/ < tdg->tdg_load) { 804 cpu = ffs(ntdg->tdg_cpumask); 805 CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less " 806 "than ours.", ts, cpu); 807 goto migrate; 808 } 809 } 810#endif 811 /* 812 * If another cpu in this group has idled, assign a thread over 813 * to them after checking to see if there are idled groups. 
	 */
	if (tdg->tdg_idlemask) {
		cpu = ffs(tdg->tdg_idlemask);
		if (cpu) {
			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in "
			    "group.", ts, cpu);
			goto migrate;
		}
	}
	return (0);
migrate:
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	cpu--;
	ts->ts_runq = NULL;
	tdq_notify(ts, cpu);

	return (1);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct td_sched *
tdq_choose(struct tdq *tdq)
{
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);

	ts = runq_choose(&tdq->tdq_realtime);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME,
		    ("tdq_choose: Invalid priority on realtime queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}
	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE &&
		    ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
		    ("tdq_choose: Invalid priority on timeshare queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	ts = runq_choose(&tdq->tdq_idle);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
		    ("tdq_choose: Invalid priority on idle queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	return (NULL);
}

static void
tdq_setup(struct tdq *tdq)
{
	runq_init(&tdq->tdq_realtime);
	runq_init(&tdq->tdq_timeshare);
	runq_init(&tdq->tdq_idle);
	tdq->tdq_load = 0;
	tdq->tdq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	/*
	 * To avoid divide-by-zero, set realstathz to a dummy value here in
	 * case sched_clock() is called before sched_initticks().
	 */
	realstathz = hz;
	sched_slice = (realstathz/7);	/* 140ms */
	tickincr = 1 << SCHED_TICK_SHIFT;

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the tdqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct tdq *tdq;

		tdq = &tdq_cpu[i];
		tdq->tdq_assigned = NULL;
		tdq_setup(&tdq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct tdq_group *tdg;
		struct tdq *tdq;
		int cpus;

		for (cpus = 0, i = 0; i < MAXCPU; i++) {
			if (CPU_ABSENT(i))
				continue;
			tdq = &tdq_cpu[i];
			tdg = &tdq_groups[cpus];
			/*
			 * Setup a tdq group with one member.
			 */
			tdq->tdq_transferable = 0;
			tdq->tdq_group = tdg;
			tdg->tdg_cpus = 1;
			tdg->tdg_idlemask = 0;
			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
			tdg->tdg_load = 0;
			tdg->tdg_transferable = 0;
			LIST_INIT(&tdg->tdg_members);
			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
			cpus++;
		}
		tdg_maxid = cpus - 1;
	} else {
		struct tdq_group *tdg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			tdg = &tdq_groups[i];
			/*
			 * Initialize the group.
			 */
			tdg->tdg_idlemask = 0;
			tdg->tdg_load = 0;
			tdg->tdg_transferable = 0;
			tdg->tdg_cpus = cg->cg_count;
			tdg->tdg_cpumask = cg->cg_mask;
			LIST_INIT(&tdg->tdg_members);
			/*
			 * Find all of the group members and add them.
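			 * For illustration only: a cg_mask of 0x3 would add
			 * cpu0 and cpu1 to this group, and the first bit
			 * found below (cpu0) becomes tdg_mask, the bit used
			 * to represent the whole group in the global idle
			 * mask.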
956 */ 957 for (j = 0; j < MAXCPU; j++) { 958 if ((cg->cg_mask & (1 << j)) != 0) { 959 if (tdg->tdg_mask == 0) 960 tdg->tdg_mask = 1 << j; 961 tdq_cpu[j].tdq_transferable = 0; 962 tdq_cpu[j].tdq_group = tdg; 963 LIST_INSERT_HEAD(&tdg->tdg_members, 964 &tdq_cpu[j], tdq_siblings); 965 } 966 } 967 if (tdg->tdg_cpus > 1) 968 balance_groups = 1; 969 } 970 tdg_maxid = smp_topology->ct_count - 1; 971 } 972 /* 973 * Stagger the group and global load balancer so they do not 974 * interfere with each other. 975 */ 976 bal_tick = ticks + hz; 977 if (balance_groups) 978 gbal_tick = ticks + (hz / 2); 979#else 980 tdq_setup(TDQ_SELF()); 981#endif 982 mtx_lock_spin(&sched_lock); 983 tdq_load_add(TDQ_SELF(), &td_sched0); 984 mtx_unlock_spin(&sched_lock); 985} 986 987/* ARGSUSED */ 988static void 989sched_initticks(void *dummy) 990{ 991 mtx_lock_spin(&sched_lock); 992 realstathz = stathz ? stathz : hz; 993 sched_slice = (realstathz/7); /* ~140ms */ 994 995 /* 996 * tickincr is shifted out by 10 to avoid rounding errors due to 997 * hz not being evenly divisible by stathz on all platforms. 998 */ 999 tickincr = (hz << SCHED_TICK_SHIFT) / realstathz; 1000 /* 1001 * This does not work for values of stathz that are more than 1002 * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. 1003 */ 1004 if (tickincr == 0) 1005 tickincr = 1; 1006 mtx_unlock_spin(&sched_lock); 1007} 1008 1009 1010/* 1011 * Scale the scheduling priority according to the "interactivity" of this 1012 * process. 1013 */ 1014static void 1015sched_priority(struct thread *td) 1016{ 1017 int score; 1018 int pri; 1019 1020 if (td->td_pri_class != PRI_TIMESHARE) 1021 return; 1022 /* 1023 * If the score is interactive we place the thread in the realtime 1024 * queue with a priority that is less than kernel and interrupt 1025 * priorities. These threads are not subject to nice restrictions. 1026 * 1027 * Scores greater than this are placed on the normal realtime queue 1028 * where the priority is partially decided by the most recent cpu 1029 * utilization and the rest is decided by nice value. 1030 */ 1031 score = sched_interact_score(td); 1032 if (score < sched_interact) { 1033 pri = PRI_MIN_REALTIME; 1034 pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact) 1035 * score; 1036 KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME, 1037 ("sched_priority: invalid interactive priority %d", pri)); 1038 } else { 1039 pri = SCHED_PRI_MIN; 1040 if (td->td_sched->ts_ticks) 1041 pri += SCHED_PRI_TICKS(td->td_sched); 1042 pri += SCHED_PRI_NICE(td->td_proc->p_nice); 1043 KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE, 1044 ("sched_priority: invalid priority %d", pri)); 1045 } 1046 sched_user_prio(td, pri); 1047 1048 return; 1049} 1050 1051/* 1052 * This routine enforces a maximum limit on the amount of scheduling history 1053 * kept. It is called after either the slptime or runtime is adjusted. 1054 * This routine will not operate correctly when slp or run times have been 1055 * adjusted to more than double their maximum. 1056 */ 1057static void 1058sched_interact_update(struct thread *td) 1059{ 1060 int sum; 1061 1062 sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime; 1063 if (sum < SCHED_SLP_RUN_MAX) 1064 return; 1065 /* 1066 * If we have exceeded by more than 1/5th then the algorithm below 1067 * will not bring us back into range. 
Dividing by two here forces 1068 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1069 */ 1070 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1071 td->td_sched->skg_runtime /= 2; 1072 td->td_sched->skg_slptime /= 2; 1073 return; 1074 } 1075 td->td_sched->skg_runtime = (td->td_sched->skg_runtime / 5) * 4; 1076 td->td_sched->skg_slptime = (td->td_sched->skg_slptime / 5) * 4; 1077} 1078 1079static void 1080sched_interact_fork(struct thread *td) 1081{ 1082 int ratio; 1083 int sum; 1084 1085 sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime; 1086 if (sum > SCHED_SLP_RUN_FORK) { 1087 ratio = sum / SCHED_SLP_RUN_FORK; 1088 td->td_sched->skg_runtime /= ratio; 1089 td->td_sched->skg_slptime /= ratio; 1090 } 1091} 1092 1093static int 1094sched_interact_score(struct thread *td) 1095{ 1096 int div; 1097 1098 if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) { 1099 div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF); 1100 return (SCHED_INTERACT_HALF + 1101 (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div))); 1102 } if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) { 1103 div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF); 1104 return (td->td_sched->skg_runtime / div); 1105 } 1106 1107 /* 1108 * This can happen if slptime and runtime are 0. 1109 */ 1110 return (0); 1111 1112} 1113 1114/* 1115 * Called from proc0_init() to bootstrap the scheduler. 1116 */ 1117void 1118schedinit(void) 1119{ 1120 1121 /* 1122 * Set up the scheduler specific parts of proc0. 1123 */ 1124 proc0.p_sched = NULL; /* XXX */ 1125 thread0.td_sched = &td_sched0; 1126 td_sched0.ts_ltick = ticks; 1127 td_sched0.ts_ftick = ticks - 1; 1128 td_sched0.ts_thread = &thread0; 1129 td_sched0.ts_state = TSS_THREAD; 1130} 1131 1132/* 1133 * This is only somewhat accurate since given many processes of the same 1134 * priority they will switch when their slices run out, which will be 1135 * at most sched_slice stathz ticks. 1136 */ 1137int 1138sched_rr_interval(void) 1139{ 1140 1141 /* Convert sched_slice to hz */ 1142 return (hz/(realstathz/sched_slice)); 1143} 1144 1145static void 1146sched_pctcpu_update(struct td_sched *ts) 1147{ 1148 1149 if (ts->ts_ticks == 0) 1150 return; 1151 /* 1152 * Adjust counters and watermark for pctcpu calc. 1153 */ 1154 if (ts->ts_ltick > ticks - SCHED_TICK_TARG) 1155 ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) * 1156 SCHED_TICK_TARG; 1157 else 1158 ts->ts_ticks = 0; 1159 ts->ts_ltick = ticks; 1160 ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG; 1161} 1162 1163static void 1164sched_thread_priority(struct thread *td, u_char prio) 1165{ 1166 struct td_sched *ts; 1167 1168 CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", 1169 td, td->td_proc->p_comm, td->td_priority, prio, curthread, 1170 curthread->td_proc->p_comm); 1171 ts = td->td_sched; 1172 mtx_assert(&sched_lock, MA_OWNED); 1173 if (td->td_priority == prio) 1174 return; 1175 1176 if (TD_ON_RUNQ(td) && prio < td->td_priority) { 1177 /* 1178 * If the priority has been elevated due to priority 1179 * propagation, we may have to move ourselves to a new 1180 * queue. This could be optimized to not re-add in some 1181 * cases. 1182 * 1183 * Hold this td_sched on this cpu so that sched_prio() doesn't 1184 * cause excessive migration. We only want migration to 1185 * happen as the result of a wakeup. 
1186 */ 1187 ts->ts_flags |= TSF_HOLD; 1188 sched_rem(td); 1189 td->td_priority = prio; 1190 sched_add(td, SRQ_BORROWING); 1191 ts->ts_flags &= ~TSF_HOLD; 1192 } else 1193 td->td_priority = prio; 1194} 1195 1196/* 1197 * Update a thread's priority when it is lent another thread's 1198 * priority. 1199 */ 1200void 1201sched_lend_prio(struct thread *td, u_char prio) 1202{ 1203 1204 td->td_flags |= TDF_BORROWING; 1205 sched_thread_priority(td, prio); 1206} 1207 1208/* 1209 * Restore a thread's priority when priority propagation is 1210 * over. The prio argument is the minimum priority the thread 1211 * needs to have to satisfy other possible priority lending 1212 * requests. If the thread's regular priority is less 1213 * important than prio, the thread will keep a priority boost 1214 * of prio. 1215 */ 1216void 1217sched_unlend_prio(struct thread *td, u_char prio) 1218{ 1219 u_char base_pri; 1220 1221 if (td->td_base_pri >= PRI_MIN_TIMESHARE && 1222 td->td_base_pri <= PRI_MAX_TIMESHARE) 1223 base_pri = td->td_user_pri; 1224 else 1225 base_pri = td->td_base_pri; 1226 if (prio >= base_pri) { 1227 td->td_flags &= ~TDF_BORROWING; 1228 sched_thread_priority(td, base_pri); 1229 } else 1230 sched_lend_prio(td, prio); 1231} 1232 1233void 1234sched_prio(struct thread *td, u_char prio) 1235{ 1236 u_char oldprio; 1237 1238 /* First, update the base priority. */ 1239 td->td_base_pri = prio; 1240 1241 /* 1242 * If the thread is borrowing another thread's priority, don't 1243 * ever lower the priority. 1244 */ 1245 if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 1246 return; 1247 1248 /* Change the real priority. */ 1249 oldprio = td->td_priority; 1250 sched_thread_priority(td, prio); 1251 1252 /* 1253 * If the thread is on a turnstile, then let the turnstile update 1254 * its state. 1255 */ 1256 if (TD_ON_LOCK(td) && oldprio != prio) 1257 turnstile_adjust(td, oldprio); 1258} 1259 1260void 1261sched_user_prio(struct thread *td, u_char prio) 1262{ 1263 u_char oldprio; 1264 1265 td->td_base_user_pri = prio; 1266 if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) 1267 return; 1268 oldprio = td->td_user_pri; 1269 td->td_user_pri = prio; 1270 1271 if (TD_ON_UPILOCK(td) && oldprio != prio) 1272 umtx_pi_adjust(td, oldprio); 1273} 1274 1275void 1276sched_lend_user_prio(struct thread *td, u_char prio) 1277{ 1278 u_char oldprio; 1279 1280 td->td_flags |= TDF_UBORROWING; 1281 1282 oldprio = td->td_user_pri; 1283 td->td_user_pri = prio; 1284 1285 if (TD_ON_UPILOCK(td) && oldprio != prio) 1286 umtx_pi_adjust(td, oldprio); 1287} 1288 1289void 1290sched_unlend_user_prio(struct thread *td, u_char prio) 1291{ 1292 u_char base_pri; 1293 1294 base_pri = td->td_base_user_pri; 1295 if (prio >= base_pri) { 1296 td->td_flags &= ~TDF_UBORROWING; 1297 sched_user_prio(td, base_pri); 1298 } else 1299 sched_lend_user_prio(td, prio); 1300} 1301 1302void 1303sched_switch(struct thread *td, struct thread *newtd, int flags) 1304{ 1305 struct tdq *tdq; 1306 struct td_sched *ts; 1307 1308 mtx_assert(&sched_lock, MA_OWNED); 1309 1310 tdq = TDQ_SELF(); 1311 ts = td->td_sched; 1312 td->td_lastcpu = td->td_oncpu; 1313 td->td_oncpu = NOCPU; 1314 td->td_flags &= ~TDF_NEEDRESCHED; 1315 td->td_owepreempt = 0; 1316 /* 1317 * If the thread has been assigned it may be in the process of switching 1318 * to the new cpu. This is the case in sched_bind(). 
1319 */ 1320 if (td == PCPU_GET(idlethread)) { 1321 TD_SET_CAN_RUN(td); 1322 } else if ((ts->ts_flags & TSF_ASSIGNED) == 0) { 1323 /* We are ending our run so make our slot available again */ 1324 tdq_load_rem(tdq, ts); 1325 if (TD_IS_RUNNING(td)) { 1326 /* 1327 * Don't allow the thread to migrate 1328 * from a preemption. 1329 */ 1330 ts->ts_flags |= TSF_HOLD; 1331 setrunqueue(td, (flags & SW_PREEMPT) ? 1332 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : 1333 SRQ_OURSELF|SRQ_YIELDING); 1334 ts->ts_flags &= ~TSF_HOLD; 1335 } 1336 } 1337 if (newtd != NULL) { 1338 /* 1339 * If we bring in a thread account for it as if it had been 1340 * added to the run queue and then chosen. 1341 */ 1342 newtd->td_sched->ts_flags |= TSF_DIDRUN; 1343 TD_SET_RUNNING(newtd); 1344 tdq_load_add(TDQ_SELF(), newtd->td_sched); 1345 } else 1346 newtd = choosethread(); 1347 if (td != newtd) { 1348#ifdef HWPMC_HOOKS 1349 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 1350 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); 1351#endif 1352 1353 cpu_switch(td, newtd); 1354#ifdef HWPMC_HOOKS 1355 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 1356 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); 1357#endif 1358 } 1359 sched_lock.mtx_lock = (uintptr_t)td; 1360 td->td_oncpu = PCPU_GET(cpuid); 1361} 1362 1363void 1364sched_nice(struct proc *p, int nice) 1365{ 1366 struct thread *td; 1367 1368 PROC_LOCK_ASSERT(p, MA_OWNED); 1369 mtx_assert(&sched_lock, MA_OWNED); 1370 1371 p->p_nice = nice; 1372 FOREACH_THREAD_IN_PROC(p, td) { 1373 sched_priority(td); 1374 sched_prio(td, td->td_base_user_pri); 1375 } 1376} 1377 1378void 1379sched_sleep(struct thread *td) 1380{ 1381 1382 mtx_assert(&sched_lock, MA_OWNED); 1383 1384 td->td_sched->ts_slptime = ticks; 1385} 1386 1387void 1388sched_wakeup(struct thread *td) 1389{ 1390 int slptime; 1391 1392 mtx_assert(&sched_lock, MA_OWNED); 1393 1394 /* 1395 * If we slept for more than a tick update our interactivity and 1396 * priority. 1397 */ 1398 slptime = td->td_sched->ts_slptime; 1399 td->td_sched->ts_slptime = 0; 1400 if (slptime && slptime != ticks) { 1401 int hzticks; 1402 1403 hzticks = (ticks - slptime) << SCHED_TICK_SHIFT; 1404 if (hzticks >= SCHED_SLP_RUN_MAX) { 1405 td->td_sched->skg_slptime = SCHED_SLP_RUN_MAX; 1406 td->td_sched->skg_runtime = 1; 1407 } else { 1408 td->td_sched->skg_slptime += hzticks; 1409 sched_interact_update(td); 1410 } 1411 if (ticks - (hz / 10) > td->td_sched->ts_ltick) 1412 sched_pctcpu_update(td->td_sched); 1413 sched_priority(td); 1414 } 1415 setrunqueue(td, SRQ_BORING); 1416} 1417 1418/* 1419 * Penalize the parent for creating a new child and initialize the child's 1420 * priority. 1421 */ 1422void 1423sched_fork(struct thread *td, struct thread *child) 1424{ 1425 mtx_assert(&sched_lock, MA_OWNED); 1426 sched_fork_thread(td, child); 1427 /* 1428 * Penalize the parent and child for forking. 1429 */ 1430 sched_interact_fork(child); 1431 sched_priority(child); 1432 td->td_sched->skg_runtime += tickincr; 1433 sched_interact_update(td); 1434 sched_priority(td); 1435} 1436 1437void 1438sched_fork_thread(struct thread *td, struct thread *child) 1439{ 1440 struct td_sched *ts; 1441 struct td_sched *ts2; 1442 1443 /* 1444 * Initialize child. 1445 */ 1446 sched_newthread(child); 1447 ts = td->td_sched; 1448 ts2 = child->td_sched; 1449 ts2->ts_cpu = ts->ts_cpu; 1450 ts2->ts_runq = NULL; 1451 /* 1452 * Grab our parents cpu estimation information and priority. 
1453 */ 1454 ts2->ts_ticks = ts->ts_ticks; 1455 ts2->ts_ltick = ts->ts_ltick; 1456 ts2->ts_ftick = ts->ts_ftick; 1457 child->td_user_pri = td->td_user_pri; 1458 child->td_base_user_pri = td->td_base_user_pri; 1459 /* 1460 * And update interactivity score. 1461 */ 1462 ts2->skg_slptime = ts->skg_slptime; 1463 ts2->skg_runtime = ts->skg_runtime; 1464 ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */ 1465} 1466 1467void 1468sched_class(struct thread *td, int class) 1469{ 1470 struct tdq *tdq; 1471 struct td_sched *ts; 1472 int nclass; 1473 int oclass; 1474 1475 mtx_assert(&sched_lock, MA_OWNED); 1476 if (td->td_pri_class == class) 1477 return; 1478 1479 nclass = PRI_BASE(class); 1480 oclass = PRI_BASE(td->td_pri_class); 1481 ts = td->td_sched; 1482 if (ts->ts_state == TSS_ONRUNQ || td->td_state == TDS_RUNNING) { 1483 tdq = TDQ_CPU(ts->ts_cpu); 1484#ifdef SMP 1485 /* 1486 * On SMP if we're on the RUNQ we must adjust the transferable 1487 * count because could be changing to or from an interrupt 1488 * class. 1489 */ 1490 if (ts->ts_state == TSS_ONRUNQ) { 1491 if (THREAD_CAN_MIGRATE(ts->ts_thread)) { 1492 tdq->tdq_transferable--; 1493 tdq->tdq_group->tdg_transferable--; 1494 } 1495 if (THREAD_CAN_MIGRATE(ts->ts_thread)) { 1496 tdq->tdq_transferable++; 1497 tdq->tdq_group->tdg_transferable++; 1498 } 1499 } 1500#endif 1501 if (oclass == PRI_TIMESHARE) 1502 tdq->tdq_load_timeshare--; 1503 if (nclass == PRI_TIMESHARE) 1504 tdq->tdq_load_timeshare++; 1505 } 1506 1507 td->td_pri_class = class; 1508} 1509 1510/* 1511 * Return some of the child's priority and interactivity to the parent. 1512 */ 1513void 1514sched_exit(struct proc *p, struct thread *child) 1515{ 1516 struct thread *td; 1517 1518 CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", 1519 child, child->td_proc->p_comm, child->td_priority); 1520 1521 td = FIRST_THREAD_IN_PROC(p); 1522 sched_exit_thread(td, child); 1523} 1524 1525void 1526sched_exit_thread(struct thread *td, struct thread *child) 1527{ 1528 1529 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", 1530 child, child->td_proc->p_comm, child->td_priority); 1531 1532 tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched); 1533#ifdef KSE 1534 /* 1535 * KSE forks and exits so often that this penalty causes short-lived 1536 * threads to always be non-interactive. This causes mozilla to 1537 * crawl under load. 1538 */ 1539 if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc) 1540 return; 1541#endif 1542 /* 1543 * Give the child's runtime to the parent without returning the 1544 * sleep time as a penalty to the parent. This causes shells that 1545 * launch expensive things to mark their children as expensive. 1546 */ 1547 td->td_sched->skg_runtime += child->td_sched->skg_runtime; 1548 sched_interact_update(td); 1549 sched_priority(td); 1550} 1551 1552void 1553sched_userret(struct thread *td) 1554{ 1555 /* 1556 * XXX we cheat slightly on the locking here to avoid locking in 1557 * the usual case. Setting td_priority here is essentially an 1558 * incomplete workaround for not setting it properly elsewhere. 1559 * Now that some interrupt handlers are threads, not setting it 1560 * properly elsewhere can clobber it in the window between setting 1561 * it here and returning to user mode, so don't waste time setting 1562 * it perfectly here. 
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

void
sched_clock(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
	sched_smp_tick();
#endif
	tdq = TDQ_SELF();
	/*
	 * Advance the insert index once for each tick to ensure that all
	 * threads get a chance to run.
	 */
	if (tdq->tdq_idx == tdq->tdq_ridx) {
		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
			tdq->tdq_ridx = tdq->tdq_idx;
	}
	/* Adjust ticks for pctcpu */
	ts = td->td_sched;
	ts->ts_ticks += tickincr;
	ts->ts_ltick = ticks;
	/*
	 * Update if we've exceeded our desired tick threshold by over one
	 * second.
	 */
	if (ts->ts_ftick + SCHED_TICK_TARG + hz < ts->ts_ltick)
		sched_pctcpu_update(ts);
	/*
	 * We only do slicing code for TIMESHARE threads.
	 */
	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the thread so that we can compute our
	 * interactivity.
	 */
	td->td_sched->skg_runtime += tickincr;
	sched_interact_update(td);
	/*
	 * We used up one time slice.
	 */
	if (--ts->ts_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	tdq_load_rem(tdq, ts);
	sched_priority(td);
	ts->ts_slice = sched_slice;
	tdq_load_add(tdq, ts);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct tdq *tdq;
	int load;

	load = 1;

	tdq = TDQ_SELF();
#ifdef SMP
	if (tdq->tdq_assigned) {
		mtx_lock_spin(&sched_lock);
		tdq_assign(tdq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (tdq->tdq_load > 0)
			goto out;
	} else
		if (tdq->tdq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

struct td_sched *
sched_choose(void)
{
	struct tdq *tdq;
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	tdq = TDQ_SELF();
#ifdef SMP
restart:
	if (tdq->tdq_assigned)
		tdq_assign(tdq);
#endif
	ts = tdq_choose(tdq);
	if (ts) {
#ifdef SMP
		if (ts->ts_thread->td_priority <= PRI_MIN_IDLE)
			if (tdq_idled(tdq) == 0)
				goto restart;
#endif
		tdq_runq_rem(tdq, ts);
		ts->ts_state = TSS_THREAD;
		return (ts);
	}
#ifdef SMP
	if (tdq_idled(tdq) == 0)
		goto restart;
#endif
	return (NULL);
}
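/*
 * A rough sense of scale for the tick accounting in sched_clock() above
 * (illustrative only; hz = 1000 and stathz = 128 are assumed here and are
 * not required by the code): sched_initticks() would set
 * tickincr = (1000 << 10) / 128 = 8000, so a thread that runs through a
 * full second of stathz ticks accumulates about 128 * 8000 = 1024000 in
 * ts_ticks, which SCHED_TICK_HZ() shifts back down to roughly 1000 = hz.
 * The default slice of realstathz / 7 = 18 stathz ticks is what the
 * "140ms" comments elsewhere in this file refer to.
 */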
void
sched_add(struct thread *td, int flags)
{
	struct tdq *tdq;
	struct td_sched *ts;
	int preemptive;
	int canmigrate;
	int class;

	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	tdq = TDQ_SELF();
	ts = td->td_sched;
	ts->ts_flags &= ~TSF_INTERNAL;
	class = PRI_BASE(td->td_pri_class);
	preemptive = !(flags & SRQ_YIELDING);
	canmigrate = 1;
#ifdef SMP
	if (ts->ts_flags & TSF_ASSIGNED) {
		if (ts->ts_flags & TSF_REMOVED)
			ts->ts_flags &= ~TSF_REMOVED;
		return;
	}
	canmigrate = THREAD_CAN_MIGRATE(td);
	/*
	 * Don't migrate running threads here.  Force the long term balancer
	 * to do it.
	 */
	if (ts->ts_flags & TSF_HOLD) {
		ts->ts_flags &= ~TSF_HOLD;
		canmigrate = 0;
	}
#endif
	KASSERT(ts->ts_state != TSS_ONRUNQ,
	    ("sched_add: thread %p (%s) already in run queue", td,
	    td->td_proc->p_comm));
	KASSERT(td->td_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ts->ts_runq == NULL,
	    ("sched_add: thread %p is still assigned to a run queue", td));
	/*
	 * Set the slice and pick the run queue.
	 */
	if (ts->ts_slice == 0)
		ts->ts_slice = sched_slice;
	if (td->td_priority <= PRI_MAX_REALTIME) {
		ts->ts_runq = &tdq->tdq_realtime;
		/*
		 * If the thread is not artificially pinned and it's in
		 * the realtime queue we directly dispatch it on this cpu
		 * for minimum latency.  Interrupt handlers may also have
		 * to complete on the cpu that dispatched them.
		 */
		if (td->td_pinned == 0)
			ts->ts_cpu = PCPU_GET(cpuid);
	} else if (td->td_priority <= PRI_MAX_TIMESHARE)
		ts->ts_runq = &tdq->tdq_timeshare;
	else
		ts->ts_runq = &tdq->tdq_idle;

#ifdef SMP
	/*
	 * If this thread is pinned or bound, notify the target cpu.
	 */
	if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid)) {
		ts->ts_runq = NULL;
		tdq_notify(ts, ts->ts_cpu);
		return;
	}
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.  If not, see if we should transfer this thread.
	 */
	if ((class != PRI_IDLE && class != PRI_ITHD) &&
	    (tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (tdq->tdq_group->tdg_idlemask ==
		    tdq->tdq_group->tdg_cpumask)
			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
1774 */ 1775 tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask); 1776 } else if (canmigrate && tdq->tdq_load > 1) 1777 if (tdq_transfer(tdq, ts, class)) 1778 return; 1779 ts->ts_cpu = PCPU_GET(cpuid); 1780#endif 1781 if (td->td_priority < curthread->td_priority) 1782 curthread->td_flags |= TDF_NEEDRESCHED; 1783 if (preemptive && maybe_preempt(td)) 1784 return; 1785 ts->ts_state = TSS_ONRUNQ; 1786 1787 tdq_runq_add(tdq, ts, flags); 1788 tdq_load_add(tdq, ts); 1789} 1790 1791void 1792sched_rem(struct thread *td) 1793{ 1794 struct tdq *tdq; 1795 struct td_sched *ts; 1796 1797 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", 1798 td, td->td_proc->p_comm, td->td_priority, curthread, 1799 curthread->td_proc->p_comm); 1800 mtx_assert(&sched_lock, MA_OWNED); 1801 ts = td->td_sched; 1802 if (ts->ts_flags & TSF_ASSIGNED) { 1803 ts->ts_flags |= TSF_REMOVED; 1804 return; 1805 } 1806 KASSERT((ts->ts_state == TSS_ONRUNQ), 1807 ("sched_rem: thread not on run queue")); 1808 1809 ts->ts_state = TSS_THREAD; 1810 tdq = TDQ_CPU(ts->ts_cpu); 1811 tdq_runq_rem(tdq, ts); 1812 tdq_load_rem(tdq, ts); 1813} 1814 1815fixpt_t 1816sched_pctcpu(struct thread *td) 1817{ 1818 fixpt_t pctcpu; 1819 struct td_sched *ts; 1820 1821 pctcpu = 0; 1822 ts = td->td_sched; 1823 if (ts == NULL) 1824 return (0); 1825 1826 mtx_lock_spin(&sched_lock); 1827 if (ts->ts_ticks) { 1828 int rtick; 1829 1830 if (ticks - (hz / 10) > td->td_sched->ts_ltick) 1831 sched_pctcpu_update(ts); 1832 /* How many rtick per second ? */ 1833 rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); 1834 pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; 1835 } 1836 td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; 1837 mtx_unlock_spin(&sched_lock); 1838 1839 return (pctcpu); 1840} 1841 1842void 1843sched_bind(struct thread *td, int cpu) 1844{ 1845 struct td_sched *ts; 1846 1847 mtx_assert(&sched_lock, MA_OWNED); 1848 ts = td->td_sched; 1849 KASSERT((ts->ts_flags & TSF_BOUND) == 0, 1850 ("sched_bind: thread %p already bound.", td)); 1851 ts->ts_flags |= TSF_BOUND; 1852#ifdef SMP 1853 if (PCPU_GET(cpuid) == cpu) 1854 return; 1855 /* sched_rem without the runq_remove */ 1856 ts->ts_state = TSS_THREAD; 1857 tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts); 1858 tdq_notify(ts, cpu); 1859 /* When we return from mi_switch we'll be on the correct cpu. 
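	 * The sched_pin() that follows bumps td_pinned, so THREAD_CAN_MIGRATE()
	 * stays false and the balancer will not move the bound thread; the
	 * matching sched_unpin() in sched_unbind() releases it.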
*/ 1860 mi_switch(SW_VOL, NULL); 1861 sched_pin(); 1862#endif 1863} 1864 1865void 1866sched_unbind(struct thread *td) 1867{ 1868 struct td_sched *ts; 1869 1870 mtx_assert(&sched_lock, MA_OWNED); 1871 ts = td->td_sched; 1872 KASSERT(ts->ts_flags & TSF_BOUND, 1873 ("sched_unbind: thread %p not bound.", td)); 1874 mtx_assert(&sched_lock, MA_OWNED); 1875 ts->ts_flags &= ~TSF_BOUND; 1876#ifdef SMP 1877 sched_unpin(); 1878#endif 1879} 1880 1881int 1882sched_is_bound(struct thread *td) 1883{ 1884 mtx_assert(&sched_lock, MA_OWNED); 1885 return (td->td_sched->ts_flags & TSF_BOUND); 1886} 1887 1888void 1889sched_relinquish(struct thread *td) 1890{ 1891 mtx_lock_spin(&sched_lock); 1892 if (td->td_pri_class == PRI_TIMESHARE) 1893 sched_prio(td, PRI_MAX_TIMESHARE); 1894 mi_switch(SW_VOL, NULL); 1895 mtx_unlock_spin(&sched_lock); 1896} 1897 1898int 1899sched_load(void) 1900{ 1901#ifdef SMP 1902 int total; 1903 int i; 1904 1905 total = 0; 1906 for (i = 0; i <= tdg_maxid; i++) 1907 total += TDQ_GROUP(i)->tdg_load; 1908 return (total); 1909#else 1910 return (TDQ_SELF()->tdq_sysload); 1911#endif 1912} 1913 1914int 1915sched_sizeof_proc(void) 1916{ 1917 return (sizeof(struct proc)); 1918} 1919 1920int 1921sched_sizeof_thread(void) 1922{ 1923 return (sizeof(struct thread) + sizeof(struct td_sched)); 1924} 1925 1926void 1927sched_tick(void) 1928{ 1929} 1930 1931static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 1932SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 1933 "Scheduler name"); 1934SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, ""); 1935SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, ""); 1936SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, ""); 1937SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, ""); 1938 1939/* ps compat */ 1940static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 1941SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 1942 1943 1944#define KERN_SWITCH_INCLUDE 1 1945#include "kern/kern_switch.c" 1946