kern_switch.c revision 163709
/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef KSE
/***
Here is the logic..

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg).  If there are X of these KSEs actually running at the
moment in question, then there are at most M (N-X) of these KSEs on
the run queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running.  The KSEs take
their priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned.  If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue to become runnable, we know
it was associated with the highest priority thread in the queue (at the head
of the queue).  If it is also the last assigned we know M was 1 and must
now be 0.  Since the thread is no longer queued, that pointer must be
removed from it.
Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available; we can prove
that the next thread in the ksegrp list will not have a KSE to assign to
it, so we can show that the pointer must be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, it can
have its priority compared with the last assigned thread to see if
it should 'steal' its KSE or not.. i.e. is it 'earlier'
on the list than that thread or later..  If it's earlier, then the KSE is
removed from the last assigned (which is now not assigned a KSE)
and reassigned to the new thread, which is placed earlier in the list.
The pointer is then backed up to the previous thread (which may or may not
be the new thread).

When a thread sleeps or is removed, the KSE becomes available and if there
are queued threads that are not assigned KSEs, the highest priority one of
them is assigned the KSE, which is then placed back on the run queue at
the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP.  If this situation changes, the KSEs are
reassigned to keep this true.
***/
#endif
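/*
 * Illustrative sketch (kept out of the build, hence the #if 0): a
 * stand-alone user-space model of the invariant described above.  With M
 * free KSE slots, the top M runnable threads by priority form the assigned
 * set, and last_assigned points at the lowest-priority member of that set,
 * or is NULL when nothing is assigned.  The model_thread type and
 * assign_top_m() are hypothetical, not kernel structures or APIs; extract
 * and compile this separately to experiment with it.
 */
#if 0
#include <stdio.h>

struct model_thread {
	int pri;			/* lower value = better priority */
	struct model_thread *next;	/* priority-ordered run list */
};

/*
 * Walk a priority-ordered list, handing a slot to each of the first m
 * threads, and return the new last_assigned pointer: the m-th thread on
 * the list (or the tail if the list is shorter), or NULL when m is 0.
 */
static struct model_thread *
assign_top_m(struct model_thread *head, int m)
{
	struct model_thread *last = NULL;

	while (m-- > 0 && head != NULL) {
		last = head;		/* this thread gets a KSE slot */
		head = head->next;
	}
	return (last);
}

int
main(void)
{
	struct model_thread c = { 30, NULL };
	struct model_thread b = { 20, &c };
	struct model_thread a = { 10, &b };	/* head, best priority */

	/* With M = 2 slots, a and b are preassigned; last_assigned is b. */
	printf("last_assigned pri = %d\n", assign_top_m(&a, 2)->pri);
	return (0);
}
#endif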
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 163709 2006-10-26 21:42:22Z jb $");

#include "opt_sched.h"

#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

#define	td_kse td_sched

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
#ifdef KSE
/*
 * Select the KSE that will be run next.  From that find the thread, and
 * remove it from the KSEGRP's run queue.  If there is thread clustering,
 * this will be what does it.
 */
#else
/*
 * Select the thread that will be run next.
 */
#endif
struct thread *
choosethread(void)
{
#ifdef KSE
	struct kse *ke;
#endif
	struct thread *td;
#ifdef KSE
	struct ksegrp *kg;
#endif

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on AP's */
		td = PCPU_GET(idlethread);
#ifdef KSE
		ke = td->td_kse;
#endif
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
#ifdef KSE
		ke->ke_flags |= KEF_DIDRUN;
#else
		td->td_kse->ke_flags |= KEF_DIDRUN;
#endif
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
#ifdef KSE
	ke = sched_choose();
	if (ke) {
		td = ke->ke_thread;
		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
		kg = ke->ke_ksegrp;
		if (td->td_proc->p_flag & P_HADTHREADS) {
			if (kg->kg_last_assigned == td) {
				kg->kg_last_assigned = TAILQ_PREV(td,
				    threadqueue, td_runq);
			}
			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
		}
#else
	td = sched_choose();
	if (td) {
#endif
		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
		    td, td->td_priority);
	} else {
		/* Simulate runq_choose() having returned the idle thread */
		td = PCPU_GET(idlethread);
#ifdef KSE
		ke = td->td_kse;
#endif
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
	}
#ifdef KSE
	ke->ke_flags |= KEF_DIDRUN;
#else
	td->td_kse->ke_flags |= KEF_DIDRUN;
#endif

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

#ifdef KSE
/*
 * Given a surplus system slot, try to assign a new runnable thread to it.
 * Called from:
 *  sched_thread_exit()  (local)
 *  sched_switch()  (local)
 *  remrunqueue()  (local)  (not at the moment)
 */
static void
slot_fill(struct ksegrp *kg)
{
	struct thread *td;

	mtx_assert(&sched_lock, MA_OWNED);
	while (kg->kg_avail_opennings > 0) {
		/*
		 * Find the first unassigned thread.
		 */
		if ((td = kg->kg_last_assigned) != NULL)
			td = TAILQ_NEXT(td, td_runq);
		else
			td = TAILQ_FIRST(&kg->kg_runq);

		/*
		 * If we found one, send it to the system scheduler.
		 */
		if (td) {
			kg->kg_last_assigned = td;
			sched_add(td, SRQ_YIELDING);
			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
		} else {
			/* no threads to use up the slots. quit now */
			break;
		}
	}
}

#ifdef SCHED_4BSD
/*
 * Remove a thread from its KSEGRP's run queue.
 * This in turn may remove it from a KSE if it was already assigned
 * to one, possibly causing a new thread to be assigned to the KSE
 * and the KSE getting a new priority.
 */
static void
remrunqueue(struct thread *td)
{
	struct thread *td2, *td3;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
	kg = td->td_ksegrp;
	ke = td->td_kse;
	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
	TD_SET_CAN_RUN(td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* remove from sys run queue and free up a slot */
		sched_rem(td);
		return;
	}
	td3 = TAILQ_PREV(td, threadqueue, td_runq);
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	if (ke->ke_state == KES_ONRUNQ) {
		/*
		 * This thread has been assigned to the system run queue.
		 * We need to dissociate it and try to assign the
		 * KSE to the next available thread.
		 * Then, we should see if we need to move the KSE in
		 * the run queues.
		 */
		sched_rem(td);
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		/* slot_fill(kg); */ /* will replace it with another */
	}
}
#endif
#endif

/*
 * Change the priority of a thread that is on the run queue.
 */
void
adjustrunqueue(struct thread *td, int newpri)
{
#ifdef KSE
	struct ksegrp *kg;
#endif
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
#ifdef KSE
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
#ifndef SCHED_CORE
		if (ke->ke_rqindex != (newpri / RQ_PPQ))
#else
		if (ke->ke_rqindex != newpri)
#endif
		{
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	if (ke->ke_state == KES_ONRUNQ
#ifdef SCHED_ULE
	    || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
	    (ke->ke_flags & KEF_REMOVED) == 0)
#endif
	   ) {
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	TD_SET_CAN_RUN(td);
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
#else
	/* We only care about the kse in the run queue. */
	td->td_priority = newpri;
#ifndef SCHED_CORE
	if (ke->ke_rqindex != (newpri / RQ_PPQ))
#else
	if (ke->ke_rqindex != newpri)
#endif
	{
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
#endif
}

#ifdef KSE
/*
 * This function is called when a thread is about to be put on a
 * ksegrp run queue because it has been made runnable or its
 * priority has been adjusted and the ksegrp does not have a
 * free kse slot.  It determines if a thread from the same ksegrp
 * should be preempted.  If so, it tries to switch threads
 * if the thread is on the same cpu or notifies another cpu that
 * it should switch threads.
 */
static void
maybe_preempt_in_ksegrp(struct thread *td)
#if !defined(SMP)
{
	struct thread *running_thread;

	mtx_assert(&sched_lock, MA_OWNED);
	running_thread = curthread;

	if (running_thread->td_ksegrp != td->td_ksegrp)
		return;

	if (td->td_priority >= running_thread->td_priority)
		return;
#ifdef PREEMPTION
#ifndef FULL_PREEMPTION
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
	}
#endif /* FULL_PREEMPTION */

	if (running_thread->td_critnest > 1)
		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);

#else /* PREEMPTION */
	running_thread->td_flags |= TDF_NEEDRESCHED;
#endif /* PREEMPTION */
	return;
}

#else /* SMP */
{
	struct thread *running_thread;
	int worst_pri;
	struct ksegrp *kg;
	cpumask_t cpumask, dontuse;
	struct pcpu *pc;
	struct pcpu *best_pcpu;
	struct thread *cputhread;

	mtx_assert(&sched_lock, MA_OWNED);

	running_thread = curthread;

#if !defined(KSEG_PEEMPT_BEST_CPU)
	if (running_thread->td_ksegrp != td->td_ksegrp) {
#endif
		kg = td->td_ksegrp;

		/* if someone is ahead of this thread, wait our turn */
		if (td != TAILQ_FIRST(&kg->kg_runq))
			return;

		worst_pri = td->td_priority;
		best_pcpu = NULL;
		dontuse = stopped_cpus | idle_cpus_mask;

		/*
		 * Find the cpu running the worst priority thread from the
		 * same ksegrp; if several tie, prefer first the cpu this
		 * thread last ran on, and then the current cpu.
		 */
		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			cpumask = pc->pc_cpumask;
			cputhread = pc->pc_curthread;

			if ((cpumask & dontuse) ||
			    cputhread->td_ksegrp != kg)
				continue;

			if (cputhread->td_priority > worst_pri) {
				worst_pri = cputhread->td_priority;
				best_pcpu = pc;
				continue;
			}

			if (cputhread->td_priority == worst_pri &&
			    best_pcpu != NULL &&
			    (td->td_lastcpu == pc->pc_cpuid ||
			    (PCPU_GET(cpumask) == cpumask &&
			    td->td_lastcpu != best_pcpu->pc_cpuid)))
				best_pcpu = pc;
		}

		/* Check if we need to preempt someone */
		if (best_pcpu == NULL)
			return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
		if (td->td_priority <= PRI_MAX_ITHD)
#endif /* !FULL_PREEMPTION */
		{
			ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
			return;
		}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
			return;
		}
#if !defined(KSEG_PEEMPT_BEST_CPU)
	}
#endif

	if (td->td_priority >= running_thread->td_priority)
		return;
#ifdef PREEMPTION

#if !defined(FULL_PREEMPTION)
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
	}
#endif /* !FULL_PREEMPTION */

	if (running_thread->td_critnest > 1)
		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);

#else /* PREEMPTION */
	running_thread->td_flags |= TDF_NEEDRESCHED;
#endif /* PREEMPTION */
	return;
}
#endif /* !SMP */

int limitcount;
#endif
void
setrunqueue(struct thread *td, int flags)
{
#ifdef KSE
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;

	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
	    td, td->td_ksegrp, td->td_proc->p_pid);
#else
	CTR2(KTR_RUNQ, "setrunqueue: td:%p pid:%d",
	    td, td->td_proc->p_pid);
#endif
	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("setrunqueue: trying to run inhibitted thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
#ifdef KSE
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/*
		 * Common path optimisation: only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		if (kg->kg_avail_opennings != 1) {
			if (limitcount < 1) {
				limitcount++;
				printf("pid %d: corrected slot count (%d->1)\n",
				    td->td_proc->p_pid, kg->kg_avail_opennings);
			}
			kg->kg_avail_opennings = 1;
		}
		sched_add(td, flags);
		return;
	}

	/*
	 * If the concurrency has been reduced, and we would go in the
	 * assigned section, then keep removing entries from the
	 * system run queue, until we are not in that section
	 * or there is room for us to be put in that section.
	 * What we MUST avoid is the case where there are threads of less
	 * priority than the new one scheduled, but it cannot
	 * be scheduled itself.
	 * That would lead to a non-contiguous set of scheduled threads,
	 * and everything would break.
	 */
	tda = kg->kg_last_assigned;
	while ((kg->kg_avail_opennings <= 0) &&
	    (tda && (tda->td_priority > td->td_priority))) {
		/*
		 * None free, but there is one we can commandeer.
		 */
		CTR2(KTR_RUNQ,
		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
		sched_rem(tda);
		tda = kg->kg_last_assigned =
		    TAILQ_PREV(tda, threadqueue, td_runq);
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a slot to use, then put the thread on the system
	 * run queue and, if needed, readjust the last_assigned pointer.
	 * It may be that we need to schedule something anyhow
	 * even if the available slots are negative, so that
	 * all the items < last_assigned are scheduled.
	 */
	if (kg->kg_avail_opennings > 0) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned so whoever is first
			 * gets the slot.. (maybe us)
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			td2 = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * give the next slot to whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
		}
		sched_add(td2, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
		    td, td->td_ksegrp, td->td_proc->p_pid);
		if ((flags & SRQ_YIELDING) == 0)
			maybe_preempt_in_ksegrp(td);
	}
#else
	sched_add(td, flags);
#endif
}
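/*
 * Illustrative sketch (kept out of the build, hence the #if 0): a
 * stand-alone user-space version of the priority-ordered insertion that
 * setrunqueue() above performs on the ksegrp run queue, using the same
 * <sys/queue.h> TAILQ macros: walk the queue and insert before the first
 * entry with a numerically larger (worse) priority, falling through to the
 * tail.  struct item and pri_insert() are hypothetical names, not kernel
 * structures or APIs; extract and compile this separately.
 */
#if 0
#include <sys/queue.h>
#include <stdio.h>

struct item {
	int			pri;
	TAILQ_ENTRY(item)	link;
};
TAILQ_HEAD(itemq, item);

static void
pri_insert(struct itemq *q, struct item *it)
{
	struct item *it2;

	TAILQ_FOREACH(it2, q, link) {
		if (it2->pri > it->pri) {
			TAILQ_INSERT_BEFORE(it2, it, link);
			return;
		}
	}
	/* Ran off the end of the TAILQ or it was empty. */
	TAILQ_INSERT_TAIL(q, it, link);
}

int
main(void)
{
	struct itemq q = TAILQ_HEAD_INITIALIZER(q);
	struct item a = { 10 }, b = { 30 }, c = { 20 }, *it;

	pri_insert(&q, &a);
	pri_insert(&q, &b);
	pri_insert(&q, &c);		/* lands between a and b */
	TAILQ_FOREACH(it, &q, link)
		printf("%d ", it->pri);	/* prints: 10 20 30 */
	printf("\n");
	return (0);
}
#endif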
660234353Sdim */ 661234353Sdimvoid 662204642Srdivackycritical_enter(void) 663202375Srdivacky{ 664202375Srdivacky struct thread *td; 665202375Srdivacky 666202375Srdivacky td = curthread; 667193323Sed td->td_critnest++; 668205218Srdivacky CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td, 669205218Srdivacky (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest); 670205218Srdivacky} 671205218Srdivacky 672205218Srdivackyvoid 673218893Sdimcritical_exit(void) 674193323Sed{ 675193323Sed struct thread *td; 676193323Sed 677193323Sed td = curthread; 678193323Sed KASSERT(td->td_critnest != 0, 679193323Sed ("critical_exit: td_critnest == 0")); 680193323Sed#ifdef PREEMPTION 681193323Sed if (td->td_critnest == 1) { 682193323Sed td->td_critnest = 0; 683193323Sed mtx_assert(&sched_lock, MA_NOTOWNED); 684193323Sed if (td->td_owepreempt) { 685193323Sed td->td_critnest = 1; 686193323Sed mtx_lock_spin(&sched_lock); 687193323Sed td->td_critnest--; 688193323Sed mi_switch(SW_INVOL, NULL); 689193323Sed mtx_unlock_spin(&sched_lock); 690193323Sed } 691193323Sed } else 692193323Sed#endif 693193323Sed td->td_critnest--; 694193323Sed 695243830Sdim CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td, 696243830Sdim (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest); 697243830Sdim} 698193323Sed 699193323Sed/* 700193323Sed * This function is called when a thread is about to be put on run queue 701 * because it has been made runnable or its priority has been adjusted. It 702 * determines if the new thread should be immediately preempted to. If so, 703 * it switches to it and eventually returns true. If not, it returns false 704 * so that the caller may place the thread on an appropriate run queue. 705 */ 706int 707maybe_preempt(struct thread *td) 708{ 709#ifdef PREEMPTION 710 struct thread *ctd; 711 int cpri, pri; 712#endif 713 714 mtx_assert(&sched_lock, MA_OWNED); 715#ifdef PREEMPTION 716 /* 717 * The new thread should not preempt the current thread if any of the 718 * following conditions are true: 719 * 720 * - The kernel is in the throes of crashing (panicstr). 721 * - The current thread has a higher (numerically lower) or 722 * equivalent priority. Note that this prevents curthread from 723 * trying to preempt to itself. 724 * - It is too early in the boot for context switches (cold is set). 725 * - The current thread has an inhibitor set or is in the process of 726 * exiting. In this case, the current thread is about to switch 727 * out anyways, so there's no point in preempting. If we did, 728 * the current thread would not be properly resumed as well, so 729 * just avoid that whole landmine. 730 * - If the new thread's priority is not a realtime priority and 731 * the current thread's priority is not an idle priority and 732 * FULL_PREEMPTION is disabled. 733 * 734 * If all of these conditions are false, but the current thread is in 735 * a nested critical section, then we have to defer the preemption 736 * until we exit the critical section. Otherwise, switch immediately 737 * to the new thread. 
738 */ 739 ctd = curthread; 740 KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd), 741 ("thread has no (or wrong) sched-private part.")); 742 KASSERT((td->td_inhibitors == 0), 743 ("maybe_preempt: trying to run inhibitted thread")); 744 pri = td->td_priority; 745 cpri = ctd->td_priority; 746 if (panicstr != NULL || pri >= cpri || cold /* || dumping */ || 747 TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD) 748 return (0); 749#ifndef FULL_PREEMPTION 750 if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) 751 return (0); 752#endif 753 754 if (ctd->td_critnest > 1) { 755 CTR1(KTR_PROC, "maybe_preempt: in critical section %d", 756 ctd->td_critnest); 757 ctd->td_owepreempt = 1; 758 return (0); 759 } 760 761 /* 762 * Thread is runnable but not yet put on system run queue. 763 */ 764 MPASS(TD_ON_RUNQ(td)); 765 MPASS(td->td_sched->ke_state != KES_ONRUNQ); 766#ifdef KSE 767 if (td->td_proc->p_flag & P_HADTHREADS) { 768 /* 769 * If this is a threaded process we actually ARE on the 770 * ksegrp run queue so take it off that first. 771 * Also undo any damage done to the last_assigned pointer. 772 * XXX Fix setrunqueue so this isn't needed 773 */ 774 struct ksegrp *kg; 775 776 kg = td->td_ksegrp; 777 if (kg->kg_last_assigned == td) 778 kg->kg_last_assigned = 779 TAILQ_PREV(td, threadqueue, td_runq); 780 TAILQ_REMOVE(&kg->kg_runq, td, td_runq); 781 } 782 783#endif 784 TD_SET_RUNNING(td); 785 CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, 786 td->td_proc->p_pid, td->td_proc->p_comm); 787 mi_switch(SW_INVOL|SW_PREEMPT, td); 788 return (1); 789#else 790 return (0); 791#endif 792} 793 794#if 0 795#ifndef PREEMPTION 796/* XXX: There should be a non-static version of this. */ 797static void 798printf_caddr_t(void *data) 799{ 800 printf("%s", (char *)data); 801} 802static char preempt_warning[] = 803 "WARNING: Kernel preemption is disabled, expect reduced performance.\n"; 804SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t, 805 preempt_warning) 806#endif 807#endif 808 809/************************************************************************ 810 * SYSTEM RUN QUEUE manipulations and tests * 811 ************************************************************************/ 812/* 813 * Initialize a run structure. 814 */ 815void 816runq_init(struct runq *rq) 817{ 818 int i; 819 820 bzero(rq, sizeof *rq); 821 for (i = 0; i < RQ_NQS; i++) 822 TAILQ_INIT(&rq->rq_queues[i]); 823} 824 825/* 826 * Clear the status bit of the queue corresponding to priority level pri, 827 * indicating that it is empty. 828 */ 829static __inline void 830runq_clrbit(struct runq *rq, int pri) 831{ 832 struct rqbits *rqb; 833 834 rqb = &rq->rq_status; 835 CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", 836 rqb->rqb_bits[RQB_WORD(pri)], 837 rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), 838 RQB_BIT(pri), RQB_WORD(pri)); 839 rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); 840} 841 842/* 843 * Find the index of the first non-empty run queue. This is done by 844 * scanning the status bits, a set bit indicates a non-empty queue. 
845 */ 846static __inline int 847runq_findbit(struct runq *rq) 848{ 849 struct rqbits *rqb; 850 int pri; 851 int i; 852 853 rqb = &rq->rq_status; 854 for (i = 0; i < RQB_LEN; i++) 855 if (rqb->rqb_bits[i]) { 856 pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); 857 CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", 858 rqb->rqb_bits[i], i, pri); 859 return (pri); 860 } 861 862 return (-1); 863} 864 865/* 866 * Set the status bit of the queue corresponding to priority level pri, 867 * indicating that it is non-empty. 868 */ 869static __inline void 870runq_setbit(struct runq *rq, int pri) 871{ 872 struct rqbits *rqb; 873 874 rqb = &rq->rq_status; 875 CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", 876 rqb->rqb_bits[RQB_WORD(pri)], 877 rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), 878 RQB_BIT(pri), RQB_WORD(pri)); 879 rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); 880} 881 882/* 883 * Add the KSE to the queue specified by its priority, and set the 884 * corresponding status bit. 885 */ 886void 887runq_add(struct runq *rq, struct kse *ke, int flags) 888{ 889 struct rqhead *rqh; 890 int pri; 891 892 pri = ke->ke_thread->td_priority / RQ_PPQ; 893 ke->ke_rqindex = pri; 894 runq_setbit(rq, pri); 895 rqh = &rq->rq_queues[pri]; 896 CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p", 897 ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh); 898 if (flags & SRQ_PREEMPTED) { 899 TAILQ_INSERT_HEAD(rqh, ke, ke_procq); 900 } else { 901 TAILQ_INSERT_TAIL(rqh, ke, ke_procq); 902 } 903} 904 905/* 906 * Return true if there are runnable processes of any priority on the run 907 * queue, false otherwise. Has no side effects, does not modify the run 908 * queue structure. 909 */ 910int 911runq_check(struct runq *rq) 912{ 913 struct rqbits *rqb; 914 int i; 915 916 rqb = &rq->rq_status; 917 for (i = 0; i < RQB_LEN; i++) 918 if (rqb->rqb_bits[i]) { 919 CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", 920 rqb->rqb_bits[i], i); 921 return (1); 922 } 923 CTR0(KTR_RUNQ, "runq_check: empty"); 924 925 return (0); 926} 927 928#if defined(SMP) && defined(SCHED_4BSD) 929int runq_fuzz = 1; 930SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); 931#endif 932 933/* 934 * Find the highest priority process on the run queue. 935 */ 936struct kse * 937runq_choose(struct runq *rq) 938{ 939 struct rqhead *rqh; 940 struct kse *ke; 941 int pri; 942 943 mtx_assert(&sched_lock, MA_OWNED); 944 while ((pri = runq_findbit(rq)) != -1) { 945 rqh = &rq->rq_queues[pri]; 946#if defined(SMP) && defined(SCHED_4BSD) 947 /* fuzz == 1 is normal.. 0 or less are ignored */ 948 if (runq_fuzz > 1) { 949 /* 950 * In the first couple of entries, check if 951 * there is one for our CPU as a preference. 952 */ 953 int count = runq_fuzz; 954 int cpu = PCPU_GET(cpuid); 955 struct kse *ke2; 956 ke2 = ke = TAILQ_FIRST(rqh); 957 958 while (count-- && ke2) { 959 if (ke->ke_thread->td_lastcpu == cpu) { 960 ke = ke2; 961 break; 962 } 963 ke2 = TAILQ_NEXT(ke2, ke_procq); 964 } 965 } else 966#endif 967 ke = TAILQ_FIRST(rqh); 968 KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); 969 CTR3(KTR_RUNQ, 970 "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); 971 return (ke); 972 } 973 CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); 974 975 return (NULL); 976} 977 978/* 979 * Remove the KSE from the queue specified by its priority, and clear the 980 * corresponding status bit if the queue becomes empty. 981 * Caller must set ke->ke_state afterwards. 
982 */ 983void 984runq_remove(struct runq *rq, struct kse *ke) 985{ 986 struct rqhead *rqh; 987 int pri; 988 989#ifdef KSE 990 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 991#else 992 KASSERT(ke->ke_thread->td_proc->p_sflag & PS_INMEM, 993#endif 994 ("runq_remove: process swapped out")); 995 pri = ke->ke_rqindex; 996 rqh = &rq->rq_queues[pri]; 997 CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p", 998 ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh); 999 KASSERT(ke != NULL, ("runq_remove: no proc on busy queue")); 1000 TAILQ_REMOVE(rqh, ke, ke_procq); 1001 if (TAILQ_EMPTY(rqh)) { 1002 CTR0(KTR_RUNQ, "runq_remove: empty"); 1003 runq_clrbit(rq, pri); 1004 } 1005} 1006 1007/****** functions that are temporarily here ***********/ 1008#include <vm/uma.h> 1009extern struct mtx kse_zombie_lock; 1010 1011#ifdef KSE 1012/* 1013 * Allocate scheduler specific per-process resources. 1014 * The thread and ksegrp have already been linked in. 1015 * In this case just set the default concurrency value. 1016 * 1017 * Called from: 1018 * proc_init() (UMA init method) 1019 */ 1020void 1021sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td) 1022{ 1023 1024 /* This can go in sched_fork */ 1025 sched_init_concurrency(kg); 1026} 1027#endif 1028 1029/* 1030 * thread is being either created or recycled. 1031 * Fix up the per-scheduler resources associated with it. 1032 * Called from: 1033 * sched_fork_thread() 1034 * thread_dtor() (*may go away) 1035 * thread_init() (*may go away) 1036 */ 1037void 1038sched_newthread(struct thread *td) 1039{ 1040 struct td_sched *ke; 1041 1042 ke = (struct td_sched *) (td + 1); 1043 bzero(ke, sizeof(*ke)); 1044 td->td_sched = ke; 1045 ke->ke_thread = td; 1046 ke->ke_state = KES_THREAD; 1047} 1048 1049#ifdef KSE 1050/* 1051 * Set up an initial concurrency of 1 1052 * and set the given thread (if given) to be using that 1053 * concurrency slot. 1054 * May be used "offline"..before the ksegrp is attached to the world 1055 * and thus wouldn't need schedlock in that case. 1056 * Called from: 1057 * thr_create() 1058 * proc_init() (UMA) via sched_newproc() 1059 */ 1060void 1061sched_init_concurrency(struct ksegrp *kg) 1062{ 1063 1064 CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg); 1065 kg->kg_concurrency = 1; 1066 kg->kg_avail_opennings = 1; 1067} 1068 1069/* 1070 * Change the concurrency of an existing ksegrp to N 1071 * Called from: 1072 * kse_create() 1073 * kse_exit() 1074 * thread_exit() 1075 * thread_single() 1076 */ 1077void 1078sched_set_concurrency(struct ksegrp *kg, int concurrency) 1079{ 1080 1081 CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d", 1082 kg, 1083 concurrency, 1084 kg->kg_avail_opennings, 1085 kg->kg_avail_opennings + (concurrency - kg->kg_concurrency)); 1086 kg->kg_avail_opennings += (concurrency - kg->kg_concurrency); 1087 kg->kg_concurrency = concurrency; 1088} 1089 1090/* 1091 * Called from thread_exit() for all exiting thread 1092 * 1093 * Not to be confused with sched_exit_thread() 1094 * that is only called from thread_exit() for threads exiting 1095 * without the rest of the process exiting because it is also called from 1096 * sched_exit() and we wouldn't want to call it twice. 1097 * XXX This can probably be fixed. 1098 */ 1099void 1100sched_thread_exit(struct thread *td) 1101{ 1102 1103 SLOT_RELEASE(td->td_ksegrp); 1104 slot_fill(td->td_ksegrp); 1105} 1106#endif 1107 1108#endif /* KERN_SWITCH_INCLUDE */ 1109