thr_kern.c revision 84610
1/*
2 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by John Birrell.
16 * 4. Neither the name of the author nor the names of any co-contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * $FreeBSD: head/lib/libkse/thread/thr_kern.c 84610 2001-10-07 02:34:43Z deischen $
33 *
34 */
35#include <errno.h>
36#include <poll.h>
37#include <stdlib.h>
38#include <stdarg.h>
39#include <string.h>
40#include <unistd.h>
41#include <setjmp.h>
42#include <sys/param.h>
43#include <sys/types.h>
44#include <sys/signalvar.h>
45#include <sys/stat.h>
46#include <sys/time.h>
47#include <sys/socket.h>
48#include <sys/uio.h>
49#include <sys/syscall.h>
50#include <fcntl.h>
51#include <pthread.h>
52#include "pthread_private.h"
53
54/* #define DEBUG_THREAD_KERN */
55#ifdef DEBUG_THREAD_KERN
56#define DBG_MSG		stdout_debug
57#else
58#define DBG_MSG(x...)
59#endif
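/*
 * Note: DBG_MSG uses the gcc "args..." variadic macro form, so when
 * DEBUG_THREAD_KERN is left undefined the debug calls below expand to
 * nothing and cost nothing at run time.
 */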
60
61/* Static function prototype definitions: */
62static void
63thread_kern_poll(int wait_reqd);
64
65static void
66dequeue_signals(void);
67
68static inline void
69thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in);
70
71/* Static variables: */
72static int	last_tick = 0;
73static int	called_from_handler = 0;
74
75/*
76 * This is called when a signal handler finishes and wants to
77 * return to a previous frame.
78 */
79void
80_thread_kern_sched_frame(struct pthread_signal_frame *psf)
81{
82	struct pthread	*curthread = _get_curthread();
83
84	/*
85	 * Flag the pthread kernel as executing scheduler code
86	 * to prevent a signal from interrupting this execution and
87	 * corrupting the (soon-to-be) current frame.
88	 */
89	_thread_kern_in_sched = 1;
90
91	/* Restore the signal frame: */
92	_thread_sigframe_restore(curthread, psf);
93
94	/* The signal mask was restored; check for any pending signals: */
95	curthread->check_pending = 1;
96
97	/* Switch to the thread scheduler: */
98	___longjmp(_thread_kern_sched_jb, 1);
99}
100
101
102void
103_thread_kern_sched(ucontext_t *scp)
104{
105	struct pthread	*curthread = _get_curthread();
106
107	/*
108	 * Flag the pthread kernel as executing scheduler code
109	 * to prevent a scheduler signal from interrupting this
110	 * execution and calling the scheduler again.
111	 */
112	_thread_kern_in_sched = 1;
113
114	/* Check if this function was called from the signal handler: */
115	if (scp != NULL) {
116		called_from_handler = 1;
117		DBG_MSG("Entering scheduler due to signal\n");
118	} else {
119		/* Save the state of the current thread: */
120		if (_setjmp(curthread->ctx.jb) == 0) {
121			/* Flag the jump buffer was the last state saved: */
122			curthread->ctxtype = CTX_JB_NOSIG;
123			curthread->longjmp_val = 1;
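			/*
			 * _setjmp() returns 0 on this initial save; when the
			 * scheduler later resumes the thread it longjmps back
			 * here with longjmp_val (1), so execution continues
			 * in the else branch below.
			 */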
124		} else {
125			DBG_MSG("Returned from ___longjmp, thread %p\n",
126			    curthread);
127			/*
128			 * This point is reached when a longjmp() is called
129			 * to restore the state of a thread.
130			 *
131			 * This is the normal way out of the scheduler.
132			 */
133			_thread_kern_in_sched = 0;
134
135			if (curthread->sig_defer_count == 0) {
136				if (((curthread->cancelflags &
137				    PTHREAD_AT_CANCEL_POINT) == 0) &&
138				    ((curthread->cancelflags &
139				    PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
140					/*
141					 * Cancellations override signals.
142					 *
143					 * Stick a cancellation point at the
144					 * start of each async-cancellable
145					 * thread's resumption.
146					 *
147					 * We allow threads woken at cancel
148					 * points to do their own checks.
149					 */
150					pthread_testcancel();
151			}
152
153			if (_sched_switch_hook != NULL) {
154				/* Run the installed switch hook: */
155				thread_run_switch_hook(_last_user_thread,
156				    curthread);
157			}
158			return;
159		}
160	}
161	/* Switch to the thread scheduler: */
162	___longjmp(_thread_kern_sched_jb, 1);
163}
164
165void
166_thread_kern_sched_sig(void)
167{
168	struct pthread	*curthread = _get_curthread();
169
170	curthread->check_pending = 1;
171	_thread_kern_sched(NULL);
172}
173
174
175void
176_thread_kern_scheduler(void)
177{
178	struct timespec	ts;
179	struct timeval	tv;
180	struct pthread	*curthread = _get_curthread();
181	pthread_t	pthread, pthread_h;
182	unsigned int	current_tick;
183	int		add_to_prioq;
184
185	/* If the currently running thread is a user thread, save it: */
186	if ((curthread->flags & PTHREAD_FLAGS_PRIVATE) == 0)
187		_last_user_thread = curthread;
188
189	if (called_from_handler != 0) {
190		called_from_handler = 0;
191
192		/*
193		 * The signal handler should have saved the state of
194		 * the current thread.  Restore the process signal
195		 * mask.
196		 */
197		if (__sys_sigprocmask(SIG_SETMASK,
198		    &_process_sigmask, NULL) != 0)
199			PANIC("Unable to restore process mask after signal");
200
201		/*
202		 * Since the signal handler didn't return normally, we
203		 * have to tell the kernel to reuse the signal stack.
204		 */
205		if (__sys_sigaltstack(&_thread_sigstack, NULL) != 0)
206			PANIC("Unable to restore alternate signal stack");
207	}
208
209	/*
210	 * Enter a scheduling loop that finds the next thread that is
211	 * ready to run. This loop completes when there are no more threads
212	 * in the global list or when a thread has its state restored by
213	 * either a sigreturn (if the state was saved as a sigcontext) or a
214	 * longjmp (if the state was saved by a setjmp).
215	 */
216	while (!(TAILQ_EMPTY(&_thread_list))) {
217		/* Get the current time of day: */
218		GET_CURRENT_TOD(tv);
219		TIMEVAL_TO_TIMESPEC(&tv, &ts);
220		current_tick = _sched_ticks;
221
222		/*
223		 * Protect the scheduling queues from access by the signal
224		 * handler.
225		 */
226		_queue_signals = 1;
227		add_to_prioq = 0;
228
229		if (curthread != &_thread_kern_thread) {
230			/*
231			 * This thread no longer needs to yield the CPU.
232			 */
233			curthread->yield_on_sig_undefer = 0;
234
235			if (curthread->state != PS_RUNNING) {
236				/*
237				 * Save the current time as the time that the
238				 * thread became inactive:
239				 */
240				curthread->last_inactive = (long)current_tick;
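				/*
				 * The tick counter is an unsigned int, so a
				 * last_inactive value smaller than last_active
				 * means the counter wrapped past UINT_MAX; the
				 * adjustment below restores the ordering of
				 * the two timestamps.
				 */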
241				if (curthread->last_inactive <
242				    curthread->last_active) {
243					/* Account for a rollover: */
244					curthread->last_inactive +=
245					    UINT_MAX + 1;
246				}
247			}
248
249			/*
250			 * Place the currently running thread into the
251			 * appropriate queue(s).
252			 */
253			switch (curthread->state) {
254			case PS_DEAD:
255			case PS_STATE_MAX: /* to silence -Wall */
256			case PS_SUSPENDED:
257				/*
258				 * Dead and suspended threads are not placed
259				 * in any queue:
260				 */
261				break;
262
263			case PS_RUNNING:
264				/*
265				 * Runnable threads can't be placed in the
266				 * priority queue until after waiting threads
267				 * are polled (to preserve round-robin
268				 * scheduling).
269				 */
270				add_to_prioq = 1;
271				break;
272
273			/*
274			 * States which do not depend on file descriptor I/O
275			 * operations or timeouts:
276			 */
277			case PS_DEADLOCK:
278			case PS_FDLR_WAIT:
279			case PS_FDLW_WAIT:
280			case PS_FILE_WAIT:
281			case PS_JOIN:
282			case PS_MUTEX_WAIT:
283			case PS_SIGSUSPEND:
284			case PS_SIGTHREAD:
285			case PS_SIGWAIT:
286			case PS_WAIT_WAIT:
287				/* No timeouts for these states: */
288				curthread->wakeup_time.tv_sec = -1;
289				curthread->wakeup_time.tv_nsec = -1;
290
291				/* Restart the time slice: */
292				curthread->slice_usec = -1;
293
294				/* Insert into the waiting queue: */
295				PTHREAD_WAITQ_INSERT(curthread);
296				break;
297
298			/* States which can timeout: */
299			case PS_COND_WAIT:
300			case PS_SLEEP_WAIT:
301				/* Restart the time slice: */
302				curthread->slice_usec = -1;
303
304				/* Insert into the waiting queue: */
305				PTHREAD_WAITQ_INSERT(curthread);
306				break;
307
308			/* States that require periodic work: */
309			case PS_SPINBLOCK:
310				/* No timeouts for this state: */
311				curthread->wakeup_time.tv_sec = -1;
312				curthread->wakeup_time.tv_nsec = -1;
313
314				/* Increment spinblock count: */
315				_spinblock_count++;
316
317				/* FALLTHROUGH */
318			case PS_FDR_WAIT:
319			case PS_FDW_WAIT:
320			case PS_POLL_WAIT:
321			case PS_SELECT_WAIT:
322				/* Restart the time slice: */
323				curthread->slice_usec = -1;
324
325				/* Insert into the waiting queue: */
326				PTHREAD_WAITQ_INSERT(curthread);
327
328				/* Insert into the work queue: */
329				PTHREAD_WORKQ_INSERT(curthread);
330				break;
331			}
332
333			/*
334			 * Are there pending signals for this thread?
335			 *
336			 * This check has to be performed after the thread
337			 * has been placed in the queue(s) appropriate for
338			 * its state.  The process of adding pending signals
339		 * can change a thread's state, which in turn will
340			 * attempt to add or remove the thread from any
341			 * scheduling queue to which it belongs.
342			 */
343			if (curthread->check_pending != 0) {
344				curthread->check_pending = 0;
345				_thread_sig_check_pending(curthread);
346			}
347		}
348
349		/*
350		 * Avoid polling file descriptors if there are none
351		 * waiting:
352		 */
353		if (TAILQ_EMPTY(&_workq) != 0) {
354		}
355		/*
356		 * Poll file descriptors only if a new scheduling signal
357		 * has occurred or if we have no more runnable threads.
358		 */
359		else if (((current_tick = _sched_ticks) != last_tick) ||
360		    ((curthread->state != PS_RUNNING) &&
361		    (PTHREAD_PRIOQ_FIRST() == NULL))) {
362			/* Unprotect the scheduling queues: */
363			_queue_signals = 0;
364
365			/*
366			 * Poll file descriptors to update the state of threads
367			 * waiting on file I/O where data may be available:
368			 */
369			thread_kern_poll(0);
370
371			/* Protect the scheduling queues: */
372			_queue_signals = 1;
373		}
374		last_tick = current_tick;
375
376		/*
377		 * Wake up threads that have timed out.  This has to be
378		 * done after polling in case a thread does a poll or
379		 * select with zero time.
380		 */
381		PTHREAD_WAITQ_SETACTIVE();
382		while (((pthread = TAILQ_FIRST(&_waitingq)) != NULL) &&
383		    (pthread->wakeup_time.tv_sec != -1) &&
384		    (((pthread->wakeup_time.tv_sec == 0) &&
385		    (pthread->wakeup_time.tv_nsec == 0)) ||
386		    (pthread->wakeup_time.tv_sec < ts.tv_sec) ||
387		    ((pthread->wakeup_time.tv_sec == ts.tv_sec) &&
388		    (pthread->wakeup_time.tv_nsec <= ts.tv_nsec)))) {
389			switch (pthread->state) {
390			case PS_POLL_WAIT:
391			case PS_SELECT_WAIT:
392				/* Return zero file descriptors ready: */
393				pthread->data.poll_data->nfds = 0;
394				/* fall through */
395			default:
396				/*
397				 * Remove this thread from the waiting queue
398				 * (and work queue if necessary) and place it
399				 * in the ready queue.
400				 */
401				PTHREAD_WAITQ_CLEARACTIVE();
402				if (pthread->flags & PTHREAD_FLAGS_IN_WORKQ)
403					PTHREAD_WORKQ_REMOVE(pthread);
404				PTHREAD_NEW_STATE(pthread, PS_RUNNING);
405				PTHREAD_WAITQ_SETACTIVE();
406				break;
407			}
408			/*
409			 * Flag the timeout in the thread structure:
410			 */
411			pthread->timeout = 1;
412		}
413		PTHREAD_WAITQ_CLEARACTIVE();
414
415		/*
416		 * Check to see if the current thread needs to be added
417		 * to the priority queue:
418		 */
419		if (add_to_prioq != 0) {
420			/*
421			 * Save the current time as the time that the
422			 * thread became inactive:
423			 */
424			current_tick = _sched_ticks;
425			curthread->last_inactive = (long)current_tick;
426			if (curthread->last_inactive <
427			    curthread->last_active) {
428				/* Account for a rollover: */
429				curthread->last_inactive += UINT_MAX + 1;
430			}
431
432			if ((curthread->slice_usec != -1) &&
433		 	   (curthread->attr.sched_policy != SCHED_FIFO)) {
434				/*
435				 * Accumulate the number of microseconds for
436				 * which the current thread has run:
437				 */
438				curthread->slice_usec +=
439				    (curthread->last_inactive -
440				    curthread->last_active) *
441				    (long)_clock_res_usec;
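				/*
				 * For instance, with a clock resolution of
				 * 10000 usec, three ticks of run time would
				 * add 30000 usec toward TIMESLICE_USEC.
				 */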
442				/* Check for time quantum exceeded: */
443				if (curthread->slice_usec > TIMESLICE_USEC)
444					curthread->slice_usec = -1;
445			}
446
447			if (curthread->slice_usec == -1) {
448				/*
449				 * The thread exceeded its time
450				 * quantum or it yielded the CPU;
451				 * place it at the tail of the
452				 * queue for its priority.
453				 */
454				PTHREAD_PRIOQ_INSERT_TAIL(curthread);
455			} else {
456				/*
457				 * The thread hasn't exceeded its
458				 * interval.  Place it at the head
459				 * of the queue for its priority.
460				 */
461				PTHREAD_PRIOQ_INSERT_HEAD(curthread);
462			}
463		}
464
465		/*
466		 * Get the highest priority thread in the ready queue.
467		 */
468		pthread_h = PTHREAD_PRIOQ_FIRST();
469
470		/* Check if there are no threads ready to run: */
471		if (pthread_h == NULL) {
472			/*
473			 * Lock the pthread kernel by changing the pointer to
474			 * the running thread to point to the global kernel
475			 * thread structure:
476			 */
477			_set_curthread(&_thread_kern_thread);
478			curthread = &_thread_kern_thread;
479
480			DBG_MSG("No runnable threads, using kernel thread %p\n",
481			    curthread);
482
483			/* Unprotect the scheduling queues: */
484			_queue_signals = 0;
485
486			/*
487			 * There are no threads ready to run, so wait until
488			 * something happens that changes this condition:
489			 */
490			thread_kern_poll(1);
491
492			/*
493			 * This process' usage will likely be very small
494			 * while waiting in a poll.  Since the scheduling
495			 * clock is based on the profiling timer, it is
496			 * unlikely that the profiling timer will fire
497			 * and update the time of day.  To account for this,
498			 * get the time of day after polling with a timeout.
499			 */
500			gettimeofday((struct timeval *) &_sched_tod, NULL);
501
502			/* Check once more for a runnable thread: */
503			_queue_signals = 1;
504			pthread_h = PTHREAD_PRIOQ_FIRST();
505			_queue_signals = 0;
506		}
507
508		if (pthread_h != NULL) {
509			/* Remove the thread from the ready queue: */
510			PTHREAD_PRIOQ_REMOVE(pthread_h);
511
512			/* Unprotect the scheduling queues: */
513			_queue_signals = 0;
514
515			/*
516			 * Check for signals queued while the scheduling
517			 * queues were protected:
518			 */
519			while (_sigq_check_reqd != 0) {
520				/* Clear before handling queued signals: */
521				_sigq_check_reqd = 0;
522
523				/* Protect the scheduling queues again: */
524				_queue_signals = 1;
525
526				dequeue_signals();
527
528				/*
529				 * Check for a higher priority thread that
530				 * became runnable due to signal handling.
531				 */
532				if (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
533				    (pthread->active_priority > pthread_h->active_priority)) {
534					/* Remove the thread from the ready queue: */
535					PTHREAD_PRIOQ_REMOVE(pthread);
536
537					/*
538					 * Insert the lower priority thread
539					 * at the head of its priority list:
540					 */
541					PTHREAD_PRIOQ_INSERT_HEAD(pthread_h);
542
543					/* There's a new thread in town: */
544					pthread_h = pthread;
545				}
546
547				/* Unprotect the scheduling queues: */
548				_queue_signals = 0;
549			}
550
551			/* Make the selected thread the current thread: */
552			_set_curthread(pthread_h);
553			curthread = pthread_h;
554
555			/*
556			 * Save the current time as the time that the thread
557			 * became active:
558			 */
559			current_tick = _sched_ticks;
560			curthread->last_active = (long) current_tick;
561
562			/*
563			 * Check if this thread is running for the first time
564			 * or running again after using its full time slice
565			 * allocation:
566			 */
567			if (curthread->slice_usec == -1) {
568				/* Reset the accumulated time slice period: */
569				curthread->slice_usec = 0;
570			}
571
572			/*
573			 * If we had a context switch, run any
574			 * installed switch hooks.
575			 */
576			if ((_sched_switch_hook != NULL) &&
577			    (_last_user_thread != curthread)) {
578				thread_run_switch_hook(_last_user_thread,
579				    curthread);
580			}
581			/*
582			 * Continue the thread at its current frame:
583			 */
584			switch(curthread->ctxtype) {
585			case CTX_JB_NOSIG:
586				___longjmp(curthread->ctx.jb,
587				    curthread->longjmp_val);
588				break;
589			case CTX_JB:
590				__longjmp(curthread->ctx.jb,
591				    curthread->longjmp_val);
592				break;
593			case CTX_SJB:
594				__siglongjmp(curthread->ctx.sigjb,
595				    curthread->longjmp_val);
596				break;
597			case CTX_UC:
598				/* XXX - Restore FP registers? */
599				FP_RESTORE_UC(&curthread->ctx.uc);
600
601				/*
602				 * Do a sigreturn to restart the thread that
603				 * was interrupted by a signal:
604				 */
605				_thread_kern_in_sched = 0;
606
607#if NOT_YET
608				_setcontext(&curthread->ctx.uc);
609#else
610				/*
611				 * Ensure the process signal mask is set
612				 * correctly:
613				 */
614				curthread->ctx.uc.uc_sigmask =
615				    _process_sigmask;
616				__sys_sigreturn(&curthread->ctx.uc);
617#endif
618				break;
619			}
620			/* This point should not be reached. */
621			PANIC("Thread has returned from sigreturn or longjmp");
622		}
623	}
624
625	/* There are no more threads, so exit this process: */
626	exit(0);
627}
628
629void
630_thread_kern_sched_state(enum pthread_state state, char *fname, int lineno)
631{
632	struct pthread	*curthread = _get_curthread();
633
634	/*
635	 * Flag the pthread kernel as executing scheduler code
636	 * to prevent a scheduler signal from interrupting this
637	 * execution and calling the scheduler again.
638	 */
639	_thread_kern_in_sched = 1;
640
641	/*
642	 * Prevent the signal handler from fiddling with this thread
643	 * before its state is set and it is placed into the proper queue.
644	 */
645	_queue_signals = 1;
646
647	/* Change the state of the current thread: */
648	curthread->state = state;
649	curthread->fname = fname;
650	curthread->lineno = lineno;
651
652	/* Schedule the next thread that is ready: */
653	_thread_kern_sched(NULL);
654}
655
656void
657_thread_kern_sched_state_unlock(enum pthread_state state,
658    spinlock_t *lock, char *fname, int lineno)
659{
660	struct pthread	*curthread = _get_curthread();
661
662	/*
663	 * Flag the pthread kernel as executing scheduler code
664	 * to prevent a scheduler signal from interrupting this
665	 * execution and calling the scheduler again.
666	 */
667	_thread_kern_in_sched = 1;
668
669	/*
670	 * Prevent the signal handler from fiddling with this thread
671	 * before its state is set and it is placed into the proper
672	 * queue(s).
673	 */
674	_queue_signals = 1;
675
676	/* Change the state of the current thread: */
677	curthread->state = state;
678	curthread->fname = fname;
679	curthread->lineno = lineno;
680
681	_SPINUNLOCK(lock);
682
683	/* Schedule the next thread that is ready: */
684	_thread_kern_sched(NULL);
685}
686
687static void
688thread_kern_poll(int wait_reqd)
689{
690	int             count = 0;
691	int             i, found;
692	int		kern_pipe_added = 0;
693	int             nfds = 0;
694	int		timeout_ms = 0;
695	struct pthread	*pthread;
696	struct timespec ts;
697	struct timeval  tv;
698
699	/* Check if the caller wants to wait: */
700	if (wait_reqd == 0) {
701		timeout_ms = 0;
702	}
703	else {
704		/* Get the current time of day: */
705		GET_CURRENT_TOD(tv);
706		TIMEVAL_TO_TIMESPEC(&tv, &ts);
707
708		_queue_signals = 1;
709		pthread = TAILQ_FIRST(&_waitingq);
710		_queue_signals = 0;
711
712		if ((pthread == NULL) || (pthread->wakeup_time.tv_sec == -1)) {
713			/*
714			 * Either there are no threads in the waiting queue,
715			 * or there are no threads that can timeout.
716			 */
717			timeout_ms = INFTIM;
718		}
719		else if (pthread->wakeup_time.tv_sec - ts.tv_sec > 60000)
720			/* Limit maximum timeout to prevent rollover. */
721			timeout_ms = 60000;
722		else {
723			/*
724			 * Calculate the time left for the next thread to
725			 * timeout:
726			 */
727			timeout_ms = ((pthread->wakeup_time.tv_sec - ts.tv_sec) *
728			    1000) + ((pthread->wakeup_time.tv_nsec - ts.tv_nsec) /
729			    1000000);
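			/*
			 * For example, a wakeup 0 sec and 300000000 nsec in
			 * the future yields (0 * 1000) +
			 * (300000000 / 1000000) = 300 ms.
			 */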
730			/*
731			 * Don't allow negative timeouts:
732			 */
733			if (timeout_ms < 0)
734				timeout_ms = 0;
735		}
736	}
737
738	/* Protect the scheduling queues: */
739	_queue_signals = 1;
740
741	/*
742	 * Check to see if the signal queue needs to be walked to look
743	 * for threads awoken by a signal while in the scheduler.
744	 */
745	if (_sigq_check_reqd != 0) {
746		/* Reset flag before handling queued signals: */
747		_sigq_check_reqd = 0;
748
749		dequeue_signals();
750	}
751
752	/*
753	 * Check for a thread that became runnable due to a signal:
754	 */
755	if (PTHREAD_PRIOQ_FIRST() != NULL) {
756		/*
757		 * Since there is at least one runnable thread,
758		 * disable the wait.
759		 */
760		timeout_ms = 0;
761	}
762
763	/*
764	 * Form the poll table:
765	 */
766	nfds = 0;
767	if (timeout_ms != 0) {
768		/* Add the kernel pipe to the poll table: */
769		_thread_pfd_table[nfds].fd = _thread_kern_pipe[0];
770		_thread_pfd_table[nfds].events = POLLRDNORM;
771		_thread_pfd_table[nfds].revents = 0;
772		nfds++;
773		kern_pipe_added = 1;
774	}
775
776	PTHREAD_WAITQ_SETACTIVE();
777	TAILQ_FOREACH(pthread, &_workq, qe) {
778		switch (pthread->state) {
779		case PS_SPINBLOCK:
780			/*
781			 * If the lock is available, let the thread run.
782			 */
783			if (pthread->data.spinlock->access_lock == 0) {
784				PTHREAD_WAITQ_CLEARACTIVE();
785				PTHREAD_WORKQ_REMOVE(pthread);
786				PTHREAD_NEW_STATE(pthread,PS_RUNNING);
787				PTHREAD_WAITQ_SETACTIVE();
788				/* One less thread in a spinblock state: */
789				_spinblock_count--;
790				/*
791				 * Since there is at least one runnable
792				 * thread, disable the wait.
793				 */
794				timeout_ms = 0;
795			}
796			break;
797
798		/* File descriptor read wait: */
799		case PS_FDR_WAIT:
800			/* Limit number of polled files to table size: */
801			if (nfds < _thread_dtablesize) {
802				_thread_pfd_table[nfds].events = POLLRDNORM;
803				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
804				nfds++;
805			}
806			break;
807
808		/* File descriptor write wait: */
809		case PS_FDW_WAIT:
810			/* Limit number of polled files to table size: */
811			if (nfds < _thread_dtablesize) {
812				_thread_pfd_table[nfds].events = POLLWRNORM;
813				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
814				nfds++;
815			}
816			break;
817
818		/* File descriptor poll or select wait: */
819		case PS_POLL_WAIT:
820		case PS_SELECT_WAIT:
821			/* Limit number of polled files to table size: */
822			if (pthread->data.poll_data->nfds + nfds <
823			    _thread_dtablesize) {
824				for (i = 0; i < pthread->data.poll_data->nfds; i++) {
825					_thread_pfd_table[nfds + i].fd =
826					    pthread->data.poll_data->fds[i].fd;
827					_thread_pfd_table[nfds + i].events =
828					    pthread->data.poll_data->fds[i].events;
829				}
830				nfds += pthread->data.poll_data->nfds;
831			}
832			break;
833
834		/* Other states do not depend on file I/O. */
835		default:
836			break;
837		}
838	}
839	PTHREAD_WAITQ_CLEARACTIVE();
840
841	/*
842	 * Wait for a file descriptor to be ready for read, write, or
843	 * an exception, or a timeout to occur:
844	 */
845	count = __sys_poll(_thread_pfd_table, nfds, timeout_ms);
846
847	if (kern_pipe_added != 0)
848		/*
849		 * Remove the pthread kernel pipe file descriptor
850		 * from the pollfd table:
851		 */
852		nfds = 1;
853	else
854		nfds = 0;
855
856	/*
857	 * Check if it is possible that there are bytes in the kernel
858	 * read pipe waiting to be read:
859	 */
860	if (count < 0 || ((kern_pipe_added != 0) &&
861	    (_thread_pfd_table[0].revents & POLLRDNORM))) {
862		/*
863		 * If the kernel read pipe was included in the
864		 * count:
865		 */
866		if (count > 0) {
867			/* Decrement the count of file descriptors: */
868			count--;
869		}
870
871		if (_sigq_check_reqd != 0) {
872			/* Reset flag before handling signals: */
873			_sigq_check_reqd = 0;
874
875			dequeue_signals();
876		}
877	}
878
879	/*
880	 * Check if any file descriptors are ready:
881	 */
882	if (count > 0) {
883		/*
884		 * Enter a loop to look for threads waiting on file
885		 * descriptors that are flagged as available by the
886		 * _poll syscall:
887		 */
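		/*
		 * This scan relies on walking the work queue in the same
		 * order used to build the poll table above, reusing nfds as
		 * the index of each thread's first pollfd slot (slot 0 was
		 * the kernel pipe entry, so nfds was reset to skip it).
		 */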
888		PTHREAD_WAITQ_SETACTIVE();
889		TAILQ_FOREACH(pthread, &_workq, qe) {
890			switch (pthread->state) {
891			case PS_SPINBLOCK:
892				/*
893				 * If the lock is available, let the thread run.
894				 */
895				if (pthread->data.spinlock->access_lock == 0) {
896					PTHREAD_WAITQ_CLEARACTIVE();
897					PTHREAD_WORKQ_REMOVE(pthread);
898					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
899					PTHREAD_WAITQ_SETACTIVE();
900
901					/*
902					 * One less thread in a spinblock state:
903					 */
904					_spinblock_count--;
905				}
906				break;
907
908			/* File descriptor read wait: */
909			case PS_FDR_WAIT:
910				if ((nfds < _thread_dtablesize) &&
911				    (_thread_pfd_table[nfds].revents & POLLRDNORM)) {
912					PTHREAD_WAITQ_CLEARACTIVE();
913					PTHREAD_WORKQ_REMOVE(pthread);
914					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
915					PTHREAD_WAITQ_SETACTIVE();
916				}
917				nfds++;
918				break;
919
920			/* File descriptor write wait: */
921			case PS_FDW_WAIT:
922				if ((nfds < _thread_dtablesize) &&
923				    (_thread_pfd_table[nfds].revents & POLLWRNORM)) {
924					PTHREAD_WAITQ_CLEARACTIVE();
925					PTHREAD_WORKQ_REMOVE(pthread);
926					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
927					PTHREAD_WAITQ_SETACTIVE();
928				}
929				nfds++;
930				break;
931
932			/* File descriptor poll or select wait: */
933			case PS_POLL_WAIT:
934			case PS_SELECT_WAIT:
935				if (pthread->data.poll_data->nfds + nfds <
936				    _thread_dtablesize) {
937					/*
938					 * Enter a loop looking for I/O
939					 * readiness:
940					 */
941					found = 0;
942					for (i = 0; i < pthread->data.poll_data->nfds; i++) {
943						if (_thread_pfd_table[nfds + i].revents != 0) {
944							pthread->data.poll_data->fds[i].revents =
945							    _thread_pfd_table[nfds + i].revents;
946							found++;
947						}
948					}
949
950					/* Increment nfds before poll_data->nfds is overwritten below: */
951					nfds += pthread->data.poll_data->nfds;
952
953					if (found != 0) {
954						pthread->data.poll_data->nfds = found;
955						PTHREAD_WAITQ_CLEARACTIVE();
956						PTHREAD_WORKQ_REMOVE(pthread);
957						PTHREAD_NEW_STATE(pthread,PS_RUNNING);
958						PTHREAD_WAITQ_SETACTIVE();
959					}
960				}
961				else
962					nfds += pthread->data.poll_data->nfds;
963				break;
964
965			/* Other states do not depend on file I/O. */
966			default:
967				break;
968			}
969		}
970		PTHREAD_WAITQ_CLEARACTIVE();
971	}
972	else if (_spinblock_count != 0) {
973		/*
974		 * Enter a loop to look for threads waiting on a spinlock
975		 * that is now available.
976		 */
977		PTHREAD_WAITQ_SETACTIVE();
978		TAILQ_FOREACH(pthread, &_workq, qe) {
979			if (pthread->state == PS_SPINBLOCK) {
980				/*
981				 * If the lock is available, let the thread run.
982				 */
983				if (pthread->data.spinlock->access_lock == 0) {
984					PTHREAD_WAITQ_CLEARACTIVE();
985					PTHREAD_WORKQ_REMOVE(pthread);
986					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
987					PTHREAD_WAITQ_SETACTIVE();
988
989					/*
990					 * One less thread in a spinblock state:
991					 */
992					_spinblock_count--;
993				}
994			}
995		}
996		PTHREAD_WAITQ_CLEARACTIVE();
997	}
998
999	/* Unprotect the scheduling queues: */
1000	_queue_signals = 0;
1001
1002	while (_sigq_check_reqd != 0) {
1003		/* Handle queued signals: */
1004		_sigq_check_reqd = 0;
1005
1006		/* Protect the scheduling queues: */
1007		_queue_signals = 1;
1008
1009		dequeue_signals();
1010
1011		/* Unprotect the scheduling queues: */
1012		_queue_signals = 0;
1013	}
1014}
1015
1016void
1017_thread_kern_set_timeout(const struct timespec * timeout)
1018{
1019	struct pthread	*curthread = _get_curthread();
1020	struct timespec current_time;
1021	struct timeval  tv;
1022
1023	/* Reset the timeout flag for the running thread: */
1024	curthread->timeout = 0;
1025
1026	/* Check if the thread is to wait forever: */
1027	if (timeout == NULL) {
1028		/*
1029		 * Set the wakeup time to something that can be recognised as
1030		 * different to an actual time of day:
1031		 */
1032		curthread->wakeup_time.tv_sec = -1;
1033		curthread->wakeup_time.tv_nsec = -1;
1034	}
1035	/* Check if no waiting is required: */
1036	else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
1037		/* Set the wake up time to 'immediately': */
1038		curthread->wakeup_time.tv_sec = 0;
1039		curthread->wakeup_time.tv_nsec = 0;
1040	} else {
1041		/* Get the current time: */
1042		GET_CURRENT_TOD(tv);
1043		TIMEVAL_TO_TIMESPEC(&tv, &current_time);
1044
1045		/* Calculate the time for the current thread to wake up: */
1046		curthread->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec;
1047		curthread->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec;
1048
1049		/* Check if the nanosecond field needs to wrap: */
1050		if (curthread->wakeup_time.tv_nsec >= 1000000000) {
1051			/* Wrap the nanosecond field: */
1052			curthread->wakeup_time.tv_sec += 1;
1053			curthread->wakeup_time.tv_nsec -= 1000000000;
1054		}
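		/*
		 * For example, a current time of 5.800000000 sec plus a
		 * 0.400000000 sec timeout gives tv_nsec = 1200000000, which
		 * the check above converts to 6 sec and 200000000 nsec.
		 */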
1055	}
1056}
1057
1058void
1059_thread_kern_sig_defer(void)
1060{
1061	struct pthread	*curthread = _get_curthread();
1062
1063	/* Allow signal deferral to be recursive. */
1064	curthread->sig_defer_count++;
1065}
1066
1067void
1068_thread_kern_sig_undefer(void)
1069{
1070	struct pthread	*curthread = _get_curthread();
1071
1072	/*
1073	 * Perform checks to yield only if we are about to undefer
1074	 * signals.
1075	 */
1076	if (curthread->sig_defer_count > 1) {
1077		/* Decrement the signal deferral count. */
1078		curthread->sig_defer_count--;
1079	}
1080	else if (curthread->sig_defer_count == 1) {
1081		/* Reenable signals: */
1082		curthread->sig_defer_count = 0;
1083
1084		/*
1085		 * Check if there are queued signals:
1086		 */
1087		if (_sigq_check_reqd != 0)
1088			_thread_kern_sched(NULL);
1089
1090		/*
1091		 * Check for asynchronous cancellation before delivering any
1092		 * pending signals:
1093		 */
1094		if (((curthread->cancelflags & PTHREAD_AT_CANCEL_POINT) == 0) &&
1095		    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1096			pthread_testcancel();
1097
1098		/*
1099		 * If there are pending signals or this thread has
1100		 * to yield the CPU, call the kernel scheduler:
1101		 *
1102		 * XXX - Come back and revisit the pending signal problem
1103		 */
1104		if ((curthread->yield_on_sig_undefer != 0) ||
1105		    SIGNOTEMPTY(curthread->sigpend)) {
1106			curthread->yield_on_sig_undefer = 0;
1107			_thread_kern_sched(NULL);
1108		}
1109	}
1110}
1111
1112static void
1113dequeue_signals(void)
1114{
1115	char	bufr[128];
1116	int	num;
1117
1118	/*
1119	 * Enter a loop to clear the pthread kernel pipe:
1120	 */
1121	while (((num = __sys_read(_thread_kern_pipe[0], bufr,
1122	    sizeof(bufr))) > 0) || (num == -1 && errno == EINTR)) {
1123	}
1124	if ((num < 0) && (errno != EAGAIN)) {
1125		/*
1126		 * The only error we should expect is if there is
1127		 * no data to read.
1128		 */
1129		PANIC("Unable to read from thread kernel pipe");
1130	}
1131	/* Handle any pending signals: */
1132	_thread_sig_handle_pending();
1133}
1134
1135static inline void
1136thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in)
1137{
1138	pthread_t tid_out = thread_out;
1139	pthread_t tid_in = thread_in;
1140
1141	if ((tid_out != NULL) &&
1142	    (tid_out->flags & PTHREAD_FLAGS_PRIVATE) != 0)
1143		tid_out = NULL;
1144	if ((tid_in != NULL) &&
1145	    (tid_in->flags & PTHREAD_FLAGS_PRIVATE) != 0)
1146		tid_in = NULL;
1147
1148	if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) {
1149		/* Run the scheduler switch hook: */
1150		_sched_switch_hook(tid_out, tid_in);
1151	}
1152}
1153
1154struct pthread *
1155_get_curthread(void)
1156{
1157	if (_thread_initial == NULL)
1158		_thread_init();
1159
1160	return (_thread_run);
1161}
1162
1163void
1164_set_curthread(struct pthread *newthread)
1165{
1166	_thread_run = newthread;
1167}
1168