/*
 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by John Birrell.
 * 4. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/lib/libkse/thread/thr_kern.c 67097 2000-10-13 22:12:32Z deischen $
 *
 */
#include <errno.h>
#include <poll.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <setjmp.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <fcntl.h>
#ifdef _THREAD_SAFE
#include <pthread.h>
#include "pthread_private.h"

/* #define DEBUG_THREAD_KERN */
#ifdef DEBUG_THREAD_KERN
#define DBG_MSG		stdout_debug
#else
#define DBG_MSG(x...)
#endif

/* Static function prototype definitions: */
static void
thread_kern_poll(int wait_reqd);

static void
dequeue_signals(void);

static inline void
thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in);

/* Static variables: */
static int	last_tick = 0;

/*
 * This is called when a signal handler finishes and wants to
 * return to a previous frame.
 */
void
_thread_kern_sched_frame(int frame)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a signal from interrupting this execution and
	 * corrupting the (soon-to-be) current frame.
	 */
	_thread_kern_in_sched = 1;

	/* Return to the specified frame: */
	_thread_run->curframe = _thread_run->sigframes[frame];
	_thread_run->sigframe_count = frame;

	if (_thread_run->sigframe_count == 0)
		/* Restore the thread's priority: */
		_thread_run->active_priority &= ~PTHREAD_SIGNAL_PRIORITY;

	/* Switch to the thread scheduler: */
	___longjmp(_thread_kern_sched_jb, 1);
}


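/*
 * Enter the thread scheduler.  If called from the signal handler
 * (scp != NULL), the handler has already saved the current thread's
 * state, so the scheduler is entered directly; otherwise the current
 * context is saved with _setjmp() before switching to the scheduler.
 */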
void
_thread_kern_sched(ucontext_t *scp)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/* Check if this function was called from the signal handler: */
	if (scp != NULL) {
		/*
		 * The signal handler should have saved the state of
		 * the current thread.  Restore the process signal
		 * mask.
		 */
		if (_thread_sys_sigprocmask(SIG_SETMASK,
		    &_process_sigmask, NULL) != 0)
			PANIC("Unable to restore process mask after signal");
		/*
		 * We're running on the signal stack; just call the
		 * kernel scheduler directly.
		 */
		DBG_MSG("Entering scheduler due to signal\n");
		_thread_kern_scheduler();
	} else {
		/* Save the state of the current thread: */
		if (_setjmp(_thread_run->curframe->ctx.jb) == 0) {
			/* Flag that the jump buffer was the last state saved: */
			_thread_run->curframe->ctxtype = CTX_JB_NOSIG;
			_thread_run->curframe->longjmp_val = 1;
		} else {
			DBG_MSG("Returned from ___longjmp, thread %p\n",
			    _thread_run);
			/*
			 * This point is reached when a longjmp() is called
			 * to restore the state of a thread.
			 *
			 * This is the normal way out of the scheduler.
			 */
			_thread_kern_in_sched = 0;

			if (_thread_run->sig_defer_count == 0) {
				if (((_thread_run->cancelflags &
				    PTHREAD_AT_CANCEL_POINT) == 0) &&
				    ((_thread_run->cancelflags &
				    PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
					/*
					 * Cancellations override signals.
					 *
					 * Stick a cancellation point at the
					 * start of each async-cancellable
					 * thread's resumption.
					 *
					 * We allow threads woken at cancel
					 * points to do their own checks.
					 */
					pthread_testcancel();
			}

			if (_sched_switch_hook != NULL) {
				/* Run the installed switch hook: */
				thread_run_switch_hook(_last_user_thread,
				    _thread_run);
			}
			return;
		}
		/* Switch to the thread scheduler: */
		___longjmp(_thread_kern_sched_jb, 1);
	}
}

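/*
 * Called when a scheduling signal arrives: note that the current
 * thread has pending signals to check and enter the scheduler.
 */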
void
_thread_kern_sched_sig(void)
{
	_thread_run->check_pending = 1;
	_thread_kern_sched(NULL);
}


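/*
 * The scheduler proper: select the next runnable thread and resume it.
 * Control leaves this function by a longjmp or sigreturn into the
 * selected thread, or by exit() once the thread list is empty.
 */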
void
_thread_kern_scheduler(void)
{
	struct pthread_signal_frame *psf;
	struct timespec	ts;
	struct timeval	tv;
	pthread_t	pthread, pthread_h;
	unsigned int	current_tick;
	int		add_to_prioq;

	/* If the currently running thread is a user thread, save it: */
	if ((_thread_run->flags & PTHREAD_FLAGS_PRIVATE) == 0)
		_last_user_thread = _thread_run;

	/* Are there pending signals for this thread? */
	if (_thread_run->check_pending != 0) {
		_thread_run->check_pending = 0;
		_thread_sig_check_pending(_thread_run);
	}

	/*
	 * Enter a scheduling loop that finds the next thread that is
	 * ready to run. This loop completes when there are no more threads
	 * in the global list or when a thread has its state restored by
	 * either a sigreturn (if the state was saved as a sigcontext) or a
	 * longjmp (if the state was saved by a setjmp).
	 */
	while (!(TAILQ_EMPTY(&_thread_list))) {
		/* Get the current time of day: */
		GET_CURRENT_TOD(tv);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		current_tick = _sched_ticks;

		/*
		 * Protect the scheduling queues from access by the signal
		 * handler.
		 */
		_queue_signals = 1;
		add_to_prioq = 0;

		if (_thread_run != &_thread_kern_thread) {
			/*
			 * This thread no longer needs to yield the CPU.
			 */
			_thread_run->yield_on_sig_undefer = 0;

			if (_thread_run->state != PS_RUNNING) {
				/*
				 * Save the current time as the time that the
				 * thread became inactive:
				 */
				_thread_run->last_inactive = (long)current_tick;
				if (_thread_run->last_inactive <
				    _thread_run->last_active) {
					/* Account for a rollover: */
					_thread_run->last_inactive +=
					    UINT_MAX + 1;
				}
			}

			/*
			 * Place the currently running thread into the
			 * appropriate queue(s).
			 */
			switch (_thread_run->state) {
			case PS_DEAD:
			case PS_STATE_MAX: /* to silence -Wall */
			case PS_SUSPENDED:
				/*
				 * Dead and suspended threads are not placed
				 * in any queue:
				 */
				break;

			case PS_RUNNING:
				/*
				 * Runnable threads can't be placed in the
				 * priority queue until after waiting threads
				 * are polled (to preserve round-robin
				 * scheduling).
				 */
				add_to_prioq = 1;
				break;

			/*
			 * States which do not depend on file descriptor I/O
			 * operations or timeouts:
			 */
			case PS_DEADLOCK:
			case PS_FDLR_WAIT:
			case PS_FDLW_WAIT:
			case PS_FILE_WAIT:
			case PS_JOIN:
			case PS_MUTEX_WAIT:
			case PS_SIGSUSPEND:
			case PS_SIGTHREAD:
			case PS_SIGWAIT:
			case PS_WAIT_WAIT:
				/* No timeouts for these states: */
				_thread_run->wakeup_time.tv_sec = -1;
				_thread_run->wakeup_time.tv_nsec = -1;

				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);
				break;

			/* States which can timeout: */
			case PS_COND_WAIT:
			case PS_SLEEP_WAIT:
				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);
				break;

			/* States that require periodic work: */
			case PS_SPINBLOCK:
				/* No timeouts for this state: */
				_thread_run->wakeup_time.tv_sec = -1;
				_thread_run->wakeup_time.tv_nsec = -1;

				/* Increment spinblock count: */
				_spinblock_count++;

				/* FALLTHROUGH */
			case PS_FDR_WAIT:
			case PS_FDW_WAIT:
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);

				/* Insert into the work queue: */
				PTHREAD_WORKQ_INSERT(_thread_run);
				break;
			}
		}

		/*
		 * Poll file descriptors only if a new scheduling signal
		 * has occurred or if we have no more runnable threads.
		 */
		if (((current_tick = _sched_ticks) != last_tick) ||
		    ((_thread_run->state != PS_RUNNING) &&
		    (PTHREAD_PRIOQ_FIRST() == NULL))) {
			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * Poll file descriptors to update the state of threads
			 * waiting on file I/O where data may be available:
			 */
			thread_kern_poll(0);

			/* Protect the scheduling queues: */
			_queue_signals = 1;
		}
		last_tick = current_tick;

		/*
		 * Wake up threads that have timed out.  This has to be
		 * done after polling in case a thread does a poll or
		 * select with zero time.
		 */
		PTHREAD_WAITQ_SETACTIVE();
		while (((pthread = TAILQ_FIRST(&_waitingq)) != NULL) &&
		    (pthread->wakeup_time.tv_sec != -1) &&
		    (((pthread->wakeup_time.tv_sec == 0) &&
		    (pthread->wakeup_time.tv_nsec == 0)) ||
		    (pthread->wakeup_time.tv_sec < ts.tv_sec) ||
		    ((pthread->wakeup_time.tv_sec == ts.tv_sec) &&
		    (pthread->wakeup_time.tv_nsec <= ts.tv_nsec)))) {
			switch (pthread->state) {
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				/* Return zero file descriptors ready: */
				pthread->data.poll_data->nfds = 0;
				/* fall through */
			default:
				/*
				 * Remove this thread from the waiting queue
				 * (and work queue if necessary) and place it
				 * in the ready queue.
				 */
				PTHREAD_WAITQ_CLEARACTIVE();
				if (pthread->flags & PTHREAD_FLAGS_IN_WORKQ)
					PTHREAD_WORKQ_REMOVE(pthread);
				PTHREAD_NEW_STATE(pthread, PS_RUNNING);
				PTHREAD_WAITQ_SETACTIVE();
				break;
			}
			/*
			 * Flag the timeout in the thread structure:
			 */
			pthread->timeout = 1;
		}
		PTHREAD_WAITQ_CLEARACTIVE();

		/*
		 * Check to see if the current thread needs to be added
		 * to the priority queue:
		 */
		if (add_to_prioq != 0) {
			/*
			 * Save the current time as the time that the
			 * thread became inactive:
			 */
			current_tick = _sched_ticks;
			_thread_run->last_inactive = (long)current_tick;
			if (_thread_run->last_inactive <
			    _thread_run->last_active) {
				/* Account for a rollover: */
				_thread_run->last_inactive += UINT_MAX + 1;
			}

			if ((_thread_run->slice_usec != -1) &&
			    (_thread_run->attr.sched_policy != SCHED_FIFO)) {
				/*
				 * Accumulate the number of microseconds for
				 * which the current thread has run:
				 */
				_thread_run->slice_usec +=
				    (_thread_run->last_inactive -
				    _thread_run->last_active) *
				    (long)_clock_res_usec;
				/* Check for time quantum exceeded: */
				if (_thread_run->slice_usec > TIMESLICE_USEC)
					_thread_run->slice_usec = -1;
			}

			if (_thread_run->slice_usec == -1) {
				/*
				 * The thread exceeded its time
				 * quantum or it yielded the CPU;
				 * place it at the tail of the
				 * queue for its priority.
				 */
				PTHREAD_PRIOQ_INSERT_TAIL(_thread_run);
			} else {
				/*
				 * The thread hasn't exceeded its
				 * interval.  Place it at the head
				 * of the queue for its priority.
				 */
				PTHREAD_PRIOQ_INSERT_HEAD(_thread_run);
			}
		}

		/*
		 * Get the highest priority thread in the ready queue.
		 */
		pthread_h = PTHREAD_PRIOQ_FIRST();

		/* Check if there are no threads ready to run: */
		if (pthread_h == NULL) {
			/*
			 * Lock the pthread kernel by changing the pointer to
			 * the running thread to point to the global kernel
			 * thread structure:
			 */
			_thread_run = &_thread_kern_thread;
			DBG_MSG("No runnable threads, using kernel thread %p\n",
			    _thread_run);

			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * There are no threads ready to run, so wait until
			 * something happens that changes this condition:
			 */
			thread_kern_poll(1);

			/*
			 * This process' usage will likely be very small
			 * while waiting in a poll.  Since the scheduling
			 * clock is based on the profiling timer, it is
			 * unlikely that the profiling timer will fire
			 * and update the time of day.  To account for this,
			 * get the time of day after polling with a timeout.
			 */
			gettimeofday((struct timeval *) &_sched_tod, NULL);

			/* Check once more for a runnable thread: */
			_queue_signals = 1;
			pthread_h = PTHREAD_PRIOQ_FIRST();
			_queue_signals = 0;
		}

		if (pthread_h != NULL) {
			/* Remove the thread from the ready queue: */
			PTHREAD_PRIOQ_REMOVE(pthread_h);

			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * Check for signals queued while the scheduling
			 * queues were protected:
			 */
			while (_sigq_check_reqd != 0) {
				/* Clear before handling queued signals: */
				_sigq_check_reqd = 0;

				/* Protect the scheduling queues again: */
				_queue_signals = 1;

				dequeue_signals();

				/*
				 * Check for a higher priority thread that
				 * became runnable due to signal handling.
				 */
				if (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
				    (pthread->active_priority > pthread_h->active_priority)) {
					/* Remove the thread from the ready queue: */
					PTHREAD_PRIOQ_REMOVE(pthread);

					/*
					 * Insert the lower priority thread
					 * at the head of its priority list:
					 */
					PTHREAD_PRIOQ_INSERT_HEAD(pthread_h);

					/* There's a new thread in town: */
					pthread_h = pthread;
				}

				/* Unprotect the scheduling queues: */
				_queue_signals = 0;
			}

			/* Make the selected thread the current thread: */
			_thread_run = pthread_h;

			/*
			 * Save the current time as the time that the thread
			 * became active:
			 */
			current_tick = _sched_ticks;
			_thread_run->last_active = (long) current_tick;

			/*
			 * Check if this thread is running for the first time
			 * or running again after using its full time slice
			 * allocation:
			 */
			if (_thread_run->slice_usec == -1) {
				/* Reset the accumulated time slice period: */
				_thread_run->slice_usec = 0;
			}

			/*
			 * If we had a context switch, run any
			 * installed switch hooks.
			 */
			if ((_sched_switch_hook != NULL) &&
			    (_last_user_thread != _thread_run)) {
				thread_run_switch_hook(_last_user_thread,
				    _thread_run);
			}
			/*
			 * Continue the thread at its current frame:
			 */
			psf = _thread_run->curframe;
			switch(psf->ctxtype) {
			case CTX_JB_NOSIG:
				___longjmp(psf->ctx.jb, psf->longjmp_val);
				break;
			case CTX_JB:
				__longjmp(psf->ctx.jb, psf->longjmp_val);
				break;
			case CTX_SJB:
				__siglongjmp(psf->ctx.sigjb, psf->longjmp_val);
				break;
			case CTX_UC:
				/* XXX - Restore FP registers? */
				FP_RESTORE_UC(&psf->ctx.uc);

				/*
				 * Do a sigreturn to restart the thread that
				 * was interrupted by a signal:
				 */
				_thread_kern_in_sched = 0;

#if NOT_YET
				_setcontext(&psf->ctx.uc);
#else
				/*
				 * Ensure the process signal mask is set
				 * correctly:
				 */
				psf->ctx.uc.uc_sigmask = _process_sigmask;
				_thread_sys_sigreturn(&psf->ctx.uc);
#endif
				break;
			}
			/* This point should not be reached. */
			PANIC("Thread has returned from sigreturn or longjmp");
		}
	}

	/* There are no more threads, so exit this process: */
	exit(0);
}

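/*
 * Place the current thread into the given state (recording the
 * caller's file and line for debugging) and enter the scheduler to
 * select the next runnable thread.
 */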
void
_thread_kern_sched_state(enum pthread_state state, char *fname, int lineno)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/*
	 * Prevent the signal handler from fiddling with this thread
	 * before its state is set and it is placed into the proper queue.
	 */
	_queue_signals = 1;

	/* Change the state of the current thread: */
	_thread_run->state = state;
	_thread_run->fname = fname;
	_thread_run->lineno = lineno;

	/* Schedule the next thread that is ready: */
	_thread_kern_sched(NULL);
}

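/*
 * Same as _thread_kern_sched_state(), except that the given spinlock
 * is released once the scheduling queues are protected, so the lock is
 * not held across the context switch.
 */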
void
_thread_kern_sched_state_unlock(enum pthread_state state,
    spinlock_t *lock, char *fname, int lineno)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/*
	 * Prevent the signal handler from fiddling with this thread
	 * before its state is set and it is placed into the proper
	 * queue(s).
	 */
	_queue_signals = 1;

	/* Change the state of the current thread: */
	_thread_run->state = state;
	_thread_run->fname = fname;
	_thread_run->lineno = lineno;

	_SPINUNLOCK(lock);

	/* Schedule the next thread that is ready: */
	_thread_kern_sched(NULL);
}

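/*
 * Poll the file descriptors that threads in the work queue are waiting
 * on.  If wait_reqd is non-zero and no thread is runnable, block until
 * the next thread timeout (or forever if no thread can time out);
 * otherwise poll without waiting.
 */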
static void
thread_kern_poll(int wait_reqd)
{
	int             count = 0;
	int             i, found;
	int		kern_pipe_added = 0;
	int             nfds = 0;
	int		timeout_ms = 0;
	struct pthread	*pthread;
	struct timespec ts;
	struct timeval  tv;

	/* Check if the caller wants to wait: */
	if (wait_reqd == 0) {
		timeout_ms = 0;
	}
	else {
		/* Get the current time of day: */
		GET_CURRENT_TOD(tv);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);

		_queue_signals = 1;
		pthread = TAILQ_FIRST(&_waitingq);
		_queue_signals = 0;

		if ((pthread == NULL) || (pthread->wakeup_time.tv_sec == -1)) {
			/*
			 * Either there are no threads in the waiting queue,
			 * or there are no threads that can timeout.
			 */
			timeout_ms = INFTIM;
		}
		else {
			/*
			 * Calculate the time left for the next thread to
			 * timeout:
			 */
			timeout_ms = ((pthread->wakeup_time.tv_sec - ts.tv_sec) *
			    1000) + ((pthread->wakeup_time.tv_nsec - ts.tv_nsec) /
			    1000000);
			/*
			 * Don't allow negative timeouts:
			 */
			if (timeout_ms < 0)
				timeout_ms = 0;
		}
	}

	/* Protect the scheduling queues: */
	_queue_signals = 1;

	/*
	 * Check to see if the signal queue needs to be walked to look
	 * for threads awoken by a signal while in the scheduler.
	 */
	if (_sigq_check_reqd != 0) {
		/* Reset flag before handling queued signals: */
		_sigq_check_reqd = 0;

		dequeue_signals();
	}

	/*
	 * Check for a thread that became runnable due to a signal:
	 */
	if (PTHREAD_PRIOQ_FIRST() != NULL) {
		/*
		 * Since there is at least one runnable thread,
		 * disable the wait.
		 */
		timeout_ms = 0;
	}

	/*
	 * Form the poll table:
	 */
	nfds = 0;
	if (timeout_ms != 0) {
		/* Add the kernel pipe to the poll table: */
		_thread_pfd_table[nfds].fd = _thread_kern_pipe[0];
		_thread_pfd_table[nfds].events = POLLRDNORM;
		_thread_pfd_table[nfds].revents = 0;
		nfds++;
		kern_pipe_added = 1;
	}

	PTHREAD_WAITQ_SETACTIVE();
	TAILQ_FOREACH(pthread, &_workq, qe) {
		switch (pthread->state) {
		case PS_SPINBLOCK:
			/*
			 * If the lock is available, let the thread run.
			 */
			if (pthread->data.spinlock->access_lock == 0) {
				PTHREAD_WAITQ_CLEARACTIVE();
				PTHREAD_WORKQ_REMOVE(pthread);
				PTHREAD_NEW_STATE(pthread,PS_RUNNING);
				PTHREAD_WAITQ_SETACTIVE();
				/* One less thread in a spinblock state: */
				_spinblock_count--;
				/*
				 * Since there is at least one runnable
				 * thread, disable the wait.
				 */
				timeout_ms = 0;
			}
			break;

		/* File descriptor read wait: */
		case PS_FDR_WAIT:
			/* Limit number of polled files to table size: */
			if (nfds < _thread_dtablesize) {
				_thread_pfd_table[nfds].events = POLLRDNORM;
				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
				nfds++;
			}
			break;

		/* File descriptor write wait: */
		case PS_FDW_WAIT:
			/* Limit number of polled files to table size: */
			if (nfds < _thread_dtablesize) {
				_thread_pfd_table[nfds].events = POLLWRNORM;
				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
				nfds++;
			}
			break;

		/* File descriptor poll or select wait: */
		case PS_POLL_WAIT:
		case PS_SELECT_WAIT:
			/* Limit number of polled files to table size: */
			if (pthread->data.poll_data->nfds + nfds <
			    _thread_dtablesize) {
				for (i = 0; i < pthread->data.poll_data->nfds; i++) {
					_thread_pfd_table[nfds + i].fd =
					    pthread->data.poll_data->fds[i].fd;
					_thread_pfd_table[nfds + i].events =
					    pthread->data.poll_data->fds[i].events;
				}
				nfds += pthread->data.poll_data->nfds;
			}
			break;

		/* Other states do not depend on file I/O. */
		default:
			break;
		}
	}
	PTHREAD_WAITQ_CLEARACTIVE();

	/*
	 * Wait for a file descriptor to be ready for read, write, or
	 * an exception, or a timeout to occur:
	 */
	count = _thread_sys_poll(_thread_pfd_table, nfds, timeout_ms);

	if (kern_pipe_added != 0)
		/*
		 * Remove the pthread kernel pipe file descriptor
		 * from the pollfd table:
		 */
		nfds = 1;
	else
		nfds = 0;

	/*
	 * Check if it is possible that there are bytes in the kernel
	 * read pipe waiting to be read:
	 */
	if (count < 0 || ((kern_pipe_added != 0) &&
	    (_thread_pfd_table[0].revents & POLLRDNORM))) {
		/*
		 * If the kernel read pipe was included in the
		 * count:
		 */
		if (count > 0) {
			/* Decrement the count of file descriptors: */
			count--;
		}

		if (_sigq_check_reqd != 0) {
			/* Reset flag before handling signals: */
			_sigq_check_reqd = 0;

			dequeue_signals();
		}
	}

	/*
	 * Check if any file descriptors are ready:
	 */
	if (count > 0) {
		/*
		 * Enter a loop to look for threads waiting on file
		 * descriptors that are flagged as available by the
		 * _poll syscall:
		 */
		PTHREAD_WAITQ_SETACTIVE();
		TAILQ_FOREACH(pthread, &_workq, qe) {
			switch (pthread->state) {
			case PS_SPINBLOCK:
				/*
				 * If the lock is available, let the thread run.
				 */
				if (pthread->data.spinlock->access_lock == 0) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();

					/*
					 * One less thread in a spinblock state:
					 */
					_spinblock_count--;
				}
				break;

			/* File descriptor read wait: */
			case PS_FDR_WAIT:
				if ((nfds < _thread_dtablesize) &&
				    (_thread_pfd_table[nfds].revents & POLLRDNORM)) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();
				}
				nfds++;
				break;

			/* File descriptor write wait: */
			case PS_FDW_WAIT:
				if ((nfds < _thread_dtablesize) &&
				    (_thread_pfd_table[nfds].revents & POLLWRNORM)) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();
				}
				nfds++;
				break;

			/* File descriptor poll or select wait: */
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				if (pthread->data.poll_data->nfds + nfds <
				    _thread_dtablesize) {
					/*
					 * Enter a loop looking for I/O
					 * readiness:
					 */
					found = 0;
					for (i = 0; i < pthread->data.poll_data->nfds; i++) {
						if (_thread_pfd_table[nfds + i].revents != 0) {
							pthread->data.poll_data->fds[i].revents =
							    _thread_pfd_table[nfds + i].revents;
							found++;
						}
					}

					/* Increment before destroying: */
					nfds += pthread->data.poll_data->nfds;

					if (found != 0) {
						pthread->data.poll_data->nfds = found;
						PTHREAD_WAITQ_CLEARACTIVE();
						PTHREAD_WORKQ_REMOVE(pthread);
						PTHREAD_NEW_STATE(pthread,PS_RUNNING);
						PTHREAD_WAITQ_SETACTIVE();
					}
				}
				else
					nfds += pthread->data.poll_data->nfds;
				break;

			/* Other states do not depend on file I/O. */
			default:
				break;
			}
		}
		PTHREAD_WAITQ_CLEARACTIVE();
	}
	else if (_spinblock_count != 0) {
		/*
		 * Enter a loop to look for threads waiting on a spinlock
		 * that is now available.
		 */
		PTHREAD_WAITQ_SETACTIVE();
		TAILQ_FOREACH(pthread, &_workq, qe) {
			if (pthread->state == PS_SPINBLOCK) {
				/*
				 * If the lock is available, let the thread run.
				 */
				if (pthread->data.spinlock->access_lock == 0) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();

					/*
					 * One less thread in a spinblock state:
					 */
					_spinblock_count--;
				}
			}
		}
		PTHREAD_WAITQ_CLEARACTIVE();
	}

	/* Unprotect the scheduling queues: */
	_queue_signals = 0;

	while (_sigq_check_reqd != 0) {
		/* Handle queued signals: */
		_sigq_check_reqd = 0;

		/* Protect the scheduling queues: */
		_queue_signals = 1;

		dequeue_signals();

		/* Unprotect the scheduling queues: */
		_queue_signals = 0;
	}
}

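/*
 * Compute the running thread's absolute wakeup time from a relative
 * timeout.  A NULL timeout means wait forever; a zero timeout means
 * wake up immediately.
 */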
void
_thread_kern_set_timeout(const struct timespec * timeout)
{
	struct timespec current_time;
	struct timeval  tv;

	/* Reset the timeout flag for the running thread: */
	_thread_run->timeout = 0;

	/* Check if the thread is to wait forever: */
	if (timeout == NULL) {
		/*
		 * Set the wakeup time to something that can be recognised as
		 * different to an actual time of day:
		 */
		_thread_run->wakeup_time.tv_sec = -1;
		_thread_run->wakeup_time.tv_nsec = -1;
	}
	/* Check if no waiting is required: */
	else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
		/* Set the wake up time to 'immediately': */
		_thread_run->wakeup_time.tv_sec = 0;
		_thread_run->wakeup_time.tv_nsec = 0;
	} else {
		/* Get the current time: */
		GET_CURRENT_TOD(tv);
		TIMEVAL_TO_TIMESPEC(&tv, &current_time);

		/* Calculate the time for the current thread to wake up: */
		_thread_run->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec;
		_thread_run->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec;

		/* Check if the nanosecond field needs to wrap: */
		if (_thread_run->wakeup_time.tv_nsec >= 1000000000) {
			/* Wrap the nanosecond field: */
			_thread_run->wakeup_time.tv_sec += 1;
			_thread_run->wakeup_time.tv_nsec -= 1000000000;
		}
	}
}

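/*
 * Signal deferral: while the deferral count is non-zero, signal
 * handling for the current thread is postponed.  Undeferring drops the
 * count and, when it reaches zero, performs any deferred work (queued
 * signals, asynchronous cancellation, or a requested yield).
 */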
void
_thread_kern_sig_defer(void)
{
	/* Allow signal deferral to be recursive. */
	_thread_run->sig_defer_count++;
}

void
_thread_kern_sig_undefer(void)
{
	/*
	 * Perform checks to yield only if we are about to undefer
	 * signals.
	 */
	if (_thread_run->sig_defer_count > 1) {
		/* Decrement the signal deferral count. */
		_thread_run->sig_defer_count--;
	}
	else if (_thread_run->sig_defer_count == 1) {
		/* Reenable signals: */
		_thread_run->sig_defer_count = 0;

		/*
		 * Check if there are queued signals:
		 */
		if (_sigq_check_reqd != 0)
			_thread_kern_sched(NULL);

		/*
		 * Check for asynchronous cancellation before delivering any
		 * pending signals:
		 */
		if (((_thread_run->cancelflags & PTHREAD_AT_CANCEL_POINT) == 0) &&
		    ((_thread_run->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
			pthread_testcancel();

		/*
		 * If there are pending signals or this thread has
		 * to yield the CPU, call the kernel scheduler:
		 *
		 * XXX - Come back and revisit the pending signal problem
		 */
		if ((_thread_run->yield_on_sig_undefer != 0) ||
		    SIGNOTEMPTY(_thread_run->sigpend)) {
			_thread_run->yield_on_sig_undefer = 0;
			_thread_kern_sched(NULL);
		}
	}
}

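/*
 * Drain any bytes written to the pthread kernel pipe, then handle the
 * signals that were queued while the scheduling queues were protected.
 */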
static void
dequeue_signals(void)
{
	char	bufr[128];
	int	num;

	/*
	 * Enter a loop to clear the pthread kernel pipe:
	 */
	while (((num = _thread_sys_read(_thread_kern_pipe[0], bufr,
	    sizeof(bufr))) > 0) || (num == -1 && errno == EINTR)) {
	}
	if ((num < 0) && (errno != EAGAIN)) {
		/*
		 * The only error we should expect is if there is
		 * no data to read.
		 */
		PANIC("Unable to read from thread kernel pipe");
	}
	/* Handle any pending signals: */
	_thread_sig_handle_pending();
}

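/*
 * Run the user-installed scheduler switch hook, substituting NULL for
 * the library-private kernel thread and only when the outgoing and
 * incoming threads differ.
 */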
static inline void
thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in)
{
	pthread_t tid_out = thread_out;
	pthread_t tid_in = thread_in;

	if ((tid_out != NULL) &&
	    (tid_out->flags & PTHREAD_FLAGS_PRIVATE) != 0)
		tid_out = NULL;
	if ((tid_in != NULL) &&
	    (tid_in->flags & PTHREAD_FLAGS_PRIVATE) != 0)
		tid_in = NULL;

	if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) {
		/* Run the scheduler switch hook: */
		_sched_switch_hook(tid_out, tid_in);
	}
}
#endif