thr_kern.c revision 49661
/*
 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by John Birrell.
 * 4. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Id: uthread_kern.c,v 1.19 1999/06/20 08:28:31 jb Exp $
 *
 */
#include <errno.h>
#include <poll.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <setjmp.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <fcntl.h>
#ifdef _THREAD_SAFE
#include <pthread.h>
#include "pthread_private.h"

/* Static function prototype definitions: */
static void
_thread_kern_poll(int wait_reqd);

static void
dequeue_signals(void);

static inline void
thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in);

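/*
 * The scheduler proper.  It is entered either from the scheduling
 * signal handler, with the interrupted thread's signal context in
 * scp, or directly by a thread that is giving up the CPU, with scp
 * set to NULL.  The state of the current thread is saved, the highest
 * priority runnable thread is chosen, and control is handed to it by
 * a sigreturn() or a longjmp(), depending on how its state was last
 * saved.
 */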
void
_thread_kern_sched(struct sigcontext * scp)
{
#ifndef	__alpha__
	char           *fdata;
#endif
	pthread_t       pthread, pthread_h = NULL;
	pthread_t	last_thread = NULL;
	struct itimerval itimer;
	struct timespec ts, ts1;
	struct timeval  tv, tv1;
	int		i, set_timer = 0;

	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/* Check if this function was called from the signal handler: */
	if (scp != NULL) {
		/*
		 * Copy the signal context to the current thread's jump
		 * buffer:
		 */
		memcpy(&_thread_run->saved_sigcontext, scp, sizeof(_thread_run->saved_sigcontext));

#ifndef	__alpha__
		/* Point to the floating point data in the running thread: */
		fdata = _thread_run->saved_fp;

		/* Save the floating point data: */
		__asm__("fnsave %0" : "=m" (*fdata));
#endif

		/* Flag the signal context as the last state saved: */
		_thread_run->sig_saved = 1;
	}
	/* Save the state of the current thread: */
	else if (setjmp(_thread_run->saved_jmp_buf) != 0) {
		/*
		 * This point is reached when a longjmp() is called to
		 * restore the state of a thread.
		 *
		 * This is the normal way out of the scheduler.
		 */
		_thread_kern_in_sched = 0;

		if (_sched_switch_hook != NULL) {
			/* Run the installed switch hook: */
			thread_run_switch_hook(_last_user_thread, _thread_run);
		}

		return;
	} else
		/* Flag the jump buffer was the last state saved: */
		_thread_run->sig_saved = 0;

	/* If the currently running thread is a user thread, save it: */
	if ((_thread_run->flags & PTHREAD_FLAGS_PRIVATE) == 0)
		_last_user_thread = _thread_run;

	/*
	 * Enter a scheduling loop that finds the next thread that is
	 * ready to run. This loop completes when there are no more threads
	 * in the global list or when a thread has its state restored by
	 * either a sigreturn (if the state was saved as a sigcontext) or a
	 * longjmp (if the state was saved by a setjmp).
	 */
	while (!(TAILQ_EMPTY(&_thread_list))) {
		/* Get the current time of day: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);

		/*
		 * Protect the scheduling queues from access by the signal
		 * handler.
		 */
		_queue_signals = 1;

		if (_thread_run != &_thread_kern_thread) {

			/*
			 * This thread no longer needs to yield the CPU.
			 */
			_thread_run->yield_on_sig_undefer = 0;

			/*
			 * Save the current time as the time that the thread
			 * became inactive:
			 */
			_thread_run->last_inactive.tv_sec = tv.tv_sec;
			_thread_run->last_inactive.tv_usec = tv.tv_usec;

			/*
			 * Place the currently running thread into the
			 * appropriate queue(s).
			 */
			switch (_thread_run->state) {
			case PS_DEAD:
				/*
				 * Dead threads are not placed in any queue:
				 */
				break;

			case PS_RUNNING:
				/*
				 * Runnable threads can't be placed in the
				 * priority queue until after waiting threads
				 * are polled (to preserve round-robin
				 * scheduling).
				 */
				if ((_thread_run->slice_usec != -1) &&
				    (_thread_run->attr.sched_policy != SCHED_FIFO)) {
					/*
					 * Accumulate the number of microseconds that
					 * this thread has run for:
					 */
					_thread_run->slice_usec +=
					    (_thread_run->last_inactive.tv_sec -
					    _thread_run->last_active.tv_sec) * 1000000 +
					    _thread_run->last_inactive.tv_usec -
					    _thread_run->last_active.tv_usec;

					/* Check for time quantum exceeded: */
					if (_thread_run->slice_usec > TIMESLICE_USEC)
						_thread_run->slice_usec = -1;
				}
				break;

			/*
			 * States which do not depend on file descriptor I/O
			 * operations or timeouts:
			 */
			case PS_DEADLOCK:
			case PS_FDLR_WAIT:
			case PS_FDLW_WAIT:
			case PS_FILE_WAIT:
			case PS_JOIN:
			case PS_MUTEX_WAIT:
			case PS_SIGSUSPEND:
			case PS_SIGTHREAD:
			case PS_SIGWAIT:
			case PS_SUSPENDED:
			case PS_WAIT_WAIT:
				/* No timeouts for these states: */
				_thread_run->wakeup_time.tv_sec = -1;
				_thread_run->wakeup_time.tv_nsec = -1;

				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);
				break;

			/* States which can timeout: */
			case PS_COND_WAIT:
			case PS_SLEEP_WAIT:
				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);
				break;

			/* States that require periodic work: */
			case PS_SPINBLOCK:
				/* No timeouts for this state: */
				_thread_run->wakeup_time.tv_sec = -1;
				_thread_run->wakeup_time.tv_nsec = -1;

				/* Increment spinblock count: */
				_spinblock_count++;

				/* fall through */
			case PS_FDR_WAIT:
			case PS_FDW_WAIT:
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);

				/* Insert into the work queue: */
				PTHREAD_WORKQ_INSERT(_thread_run);
			}
		}

		/* Unprotect the scheduling queues: */
		_queue_signals = 0;

		/*
		 * Poll file descriptors to update the state of threads
		 * waiting on file I/O where data may be available:
		 */
		_thread_kern_poll(0);

		/* Protect the scheduling queues: */
		_queue_signals = 1;

		/*
		 * Wake up threads that have timed out.  This has to be
		 * done after polling in case a thread does a poll or
		 * select with zero time.
		 */
		PTHREAD_WAITQ_SETACTIVE();
		while (((pthread = TAILQ_FIRST(&_waitingq)) != NULL) &&
		    (pthread->wakeup_time.tv_sec != -1) &&
		    (((pthread->wakeup_time.tv_sec == 0) &&
		    (pthread->wakeup_time.tv_nsec == 0)) ||
		    (pthread->wakeup_time.tv_sec < ts.tv_sec) ||
		    ((pthread->wakeup_time.tv_sec == ts.tv_sec) &&
		    (pthread->wakeup_time.tv_nsec <= ts.tv_nsec)))) {
			switch (pthread->state) {
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				/* Return zero file descriptors ready: */
				pthread->data.poll_data->nfds = 0;
				/* fall through */
			default:
				/*
				 * Remove this thread from the waiting queue
				 * (and work queue if necessary) and place it
				 * in the ready queue.
				 */
				PTHREAD_WAITQ_CLEARACTIVE();
				if (pthread->flags & PTHREAD_FLAGS_IN_WORKQ)
					PTHREAD_WORKQ_REMOVE(pthread);
				PTHREAD_NEW_STATE(pthread, PS_RUNNING);
				PTHREAD_WAITQ_SETACTIVE();
				break;
			}
			/*
			 * Flag the timeout in the thread structure:
			 */
			pthread->timeout = 1;
		}
		PTHREAD_WAITQ_CLEARACTIVE();

		/*
		 * Check if there is a current runnable thread that isn't
		 * already in the ready queue:
		 */
		if ((_thread_run != &_thread_kern_thread) &&
		    (_thread_run->state == PS_RUNNING) &&
		    ((_thread_run->flags & PTHREAD_FLAGS_IN_PRIOQ) == 0)) {
			if (_thread_run->slice_usec == -1) {
				/*
				 * The thread exceeded its time
				 * quantum or it yielded the CPU;
				 * place it at the tail of the
				 * queue for its priority.
				 */
				PTHREAD_PRIOQ_INSERT_TAIL(_thread_run);
			} else {
				/*
				 * The thread hasn't exceeded its
				 * interval.  Place it at the head
				 * of the queue for its priority.
				 */
				PTHREAD_PRIOQ_INSERT_HEAD(_thread_run);
			}
		}

		/*
		 * Get the highest priority thread in the ready queue.
		 */
		pthread_h = PTHREAD_PRIOQ_FIRST();

		/* Check if there are no threads ready to run: */
		if (pthread_h == NULL) {
			/*
			 * Lock the pthread kernel by changing the pointer to
			 * the running thread to point to the global kernel
			 * thread structure:
			 */
			_thread_run = &_thread_kern_thread;

			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * There are no threads ready to run, so wait until
			 * something happens that changes this condition:
			 */
			_thread_kern_poll(1);
		}
		else {
			/* Remove the thread from the ready queue: */
			PTHREAD_PRIOQ_REMOVE(pthread_h);

			/* Get first thread on the waiting list: */
			pthread = TAILQ_FIRST(&_waitingq);

			/* Check to see if there is more than one thread: */
			if (pthread_h != TAILQ_FIRST(&_thread_list) ||
			    TAILQ_NEXT(pthread_h, tle) != NULL)
				set_timer = 1;
			else
				set_timer = 0;

			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * Check for signals queued while the scheduling
			 * queues were protected:
			 */
			while (_sigq_check_reqd != 0) {
				/* Clear before handling queued signals: */
				_sigq_check_reqd = 0;

				/* Protect the scheduling queues again: */
				_queue_signals = 1;

				dequeue_signals();

				/*
				 * Check for a higher priority thread that
				 * became runnable due to signal handling.
				 */
				if (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
				    (pthread->active_priority > pthread_h->active_priority)) {
					/*
					 * Insert the lower priority thread
					 * at the head of its priority list:
					 */
					PTHREAD_PRIOQ_INSERT_HEAD(pthread_h);

					/* Remove the thread from the ready queue: */
					PTHREAD_PRIOQ_REMOVE(pthread);

					/* There's a new thread in town: */
					pthread_h = pthread;
				}

				/* Get first thread on the waiting list: */
				pthread = TAILQ_FIRST(&_waitingq);

				/*
				 * Check to see if there is more than one
				 * thread:
				 */
				if (pthread_h != TAILQ_FIRST(&_thread_list) ||
				    TAILQ_NEXT(pthread_h, tle) != NULL)
					set_timer = 1;
				else
					set_timer = 0;

				/* Unprotect the scheduling queues: */
				_queue_signals = 0;
			}

			/* Make the selected thread the current thread: */
			_thread_run = pthread_h;

			/*
			 * Save the current time as the time that the thread
			 * became active:
			 */
			_thread_run->last_active.tv_sec = tv.tv_sec;
			_thread_run->last_active.tv_usec = tv.tv_usec;

			/*
			 * Define the maximum time before a scheduling signal
			 * is required:
			 */
			itimer.it_value.tv_sec = 0;
			itimer.it_value.tv_usec = TIMESLICE_USEC;

			/*
			 * The interval timer is not reloaded when it
			 * times out. The interval time needs to be
			 * calculated every time.
			 */
			itimer.it_interval.tv_sec = 0;
			itimer.it_interval.tv_usec = 0;

			/* Get first thread on the waiting list: */
			if ((pthread != NULL) &&
			    (pthread->wakeup_time.tv_sec != -1)) {
				/*
				 * Calculate the time until this thread
				 * is ready, allowing for the clock
				 * resolution:
				 */
				ts1.tv_sec = pthread->wakeup_time.tv_sec
				    - ts.tv_sec;
				ts1.tv_nsec = pthread->wakeup_time.tv_nsec
				    - ts.tv_nsec + _clock_res_nsec;

				/*
				 * Check for underflow of the nanosecond field:
				 */
				if (ts1.tv_nsec < 0) {
					/*
					 * Allow for the underflow of the
					 * nanosecond field:
					 */
					ts1.tv_sec--;
					ts1.tv_nsec += 1000000000;
				}
				/*
				 * Check for overflow of the nanosecond field:
				 */
				if (ts1.tv_nsec >= 1000000000) {
					/*
					 * Allow for the overflow of the
					 * nanosecond field:
					 */
					ts1.tv_sec++;
					ts1.tv_nsec -= 1000000000;
				}
				/*
				 * Convert the timespec structure to a
				 * timeval structure:
				 */
				TIMESPEC_TO_TIMEVAL(&tv1, &ts1);

				/*
				 * Check if the thread will be ready
				 * sooner than the earliest ones found
				 * so far:
				 */
				if (timercmp(&tv1, &itimer.it_value, <)) {
					/*
					 * Update the time value:
					 */
					itimer.it_value.tv_sec = tv1.tv_sec;
					itimer.it_value.tv_usec = tv1.tv_usec;
				}
			}

			/*
			 * Check if this thread is running for the first time
			 * or running again after using its full time slice
			 * allocation:
			 */
			if (_thread_run->slice_usec == -1) {
				/* Reset the accumulated time slice period: */
				_thread_run->slice_usec = 0;
			}

			/* Check if there is more than one thread: */
			if (set_timer != 0) {
				/*
				 * Start the interval timer for the
				 * calculated time interval:
				 */
				if (setitimer(_ITIMER_SCHED_TIMER, &itimer, NULL) != 0) {
					/*
					 * Cannot initialise the timer, so
					 * abort this process:
					 */
					PANIC("Cannot set scheduling timer");
				}
			}

			/* Check if a signal context was saved: */
			if (_thread_run->sig_saved == 1) {
#ifndef	__alpha__
				/*
				 * Point to the floating point data in the
				 * running thread:
				 */
				fdata = _thread_run->saved_fp;

				/* Restore the floating point state: */
				__asm__("frstor %0": :"m"(*fdata));
#endif
				/*
				 * Do a sigreturn to restart the thread that
				 * was interrupted by a signal:
				 */
				_thread_kern_in_sched = 0;

				/*
				 * If we had a context switch, run any
				 * installed switch hooks.
				 */
				if ((_sched_switch_hook != NULL) &&
				    (_last_user_thread != _thread_run)) {
					thread_run_switch_hook(_last_user_thread,
					    _thread_run);
				}
				_thread_sys_sigreturn(&_thread_run->saved_sigcontext);
			} else {
				/*
				 * Do a longjmp to restart the thread that
				 * was context switched out (by a longjmp to
				 * a different thread):
				 */
				longjmp(_thread_run->saved_jmp_buf, 1);
			}

			/* This point should not be reached. */
			PANIC("Thread has returned from sigreturn or longjmp");
		}
	}

	/* There are no more threads, so exit this process: */
	exit(0);
}

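/*
 * Record a new state for the current thread, together with the
 * caller's file name and line number for debugging (callers normally
 * pass __FILE__ and __LINE__), and then enter the scheduler.
 */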
void
_thread_kern_sched_state(enum pthread_state state, char *fname, int lineno)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/*
	 * Prevent the signal handler from fiddling with this thread
	 * before its state is set and it is placed into the proper queue.
	 */
	_queue_signals = 1;

	/* Change the state of the current thread: */
	_thread_run->state = state;
	_thread_run->fname = fname;
	_thread_run->lineno = lineno;

	/* Schedule the next thread that is ready: */
	_thread_kern_sched(NULL);
	return;
}

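/*
 * As above, but additionally release the given spinlock.  The lock is
 * only dropped after the scheduling queues have been protected and the
 * new state recorded, so the signal handler cannot fiddle with this
 * thread between the unlock and the switch into the scheduler.
 */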
void
_thread_kern_sched_state_unlock(enum pthread_state state,
    spinlock_t *lock, char *fname, int lineno)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/*
	 * Prevent the signal handler from fiddling with this thread
	 * before its state is set and it is placed into the proper
	 * queue(s).
	 */
	_queue_signals = 1;

	/* Change the state of the current thread: */
	_thread_run->state = state;
	_thread_run->fname = fname;
	_thread_run->lineno = lineno;

	_SPINUNLOCK(lock);

	/* Schedule the next thread that is ready: */
	_thread_kern_sched(NULL);
	return;
}

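/*
 * Poll the file descriptors that waiting threads are blocked on.  A
 * pollfd table is built from the work queue; if the caller allows a
 * blocking wait, the pthread kernel pipe is polled as well and the
 * timeout is derived from the nearest thread wakeup time.  Threads
 * whose descriptors (or spinlocks) have become available are made
 * runnable.
 */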
static void
_thread_kern_poll(int wait_reqd)
{
	char            bufr[128];
	int             count = 0;
	int             i, found;
	int		kern_pipe_added = 0;
	int             nfds = 0;
	int		timeout_ms = 0;
	struct pthread	*pthread, *pthread_next;
	ssize_t         num;
	struct timespec ts;
	struct timeval  tv;

	/* Check if the caller wants to wait: */
	if (wait_reqd == 0) {
		timeout_ms = 0;
	}
	else {
		/* Get the current time of day: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);

		_queue_signals = 1;
		pthread = TAILQ_FIRST(&_waitingq);
		_queue_signals = 0;

		if ((pthread == NULL) || (pthread->wakeup_time.tv_sec == -1)) {
			/*
			 * Either there are no threads in the waiting queue,
			 * or there are no threads that can timeout.
			 */
			timeout_ms = INFTIM;
		}
		else {
			/*
			 * Calculate the time left for the next thread to
			 * timeout allowing for the clock resolution:
			 */
			timeout_ms = ((pthread->wakeup_time.tv_sec - ts.tv_sec) *
			    1000) + ((pthread->wakeup_time.tv_nsec - ts.tv_nsec +
			    _clock_res_nsec) / 1000000);
			/*
			 * Don't allow negative timeouts:
			 */
			if (timeout_ms < 0)
				timeout_ms = 0;
		}
	}

	/* Protect the scheduling queues: */
	_queue_signals = 1;

	/*
	 * Check to see if the signal queue needs to be walked to look
	 * for threads awoken by a signal while in the scheduler.
	 */
	if (_sigq_check_reqd != 0) {
		/* Reset flag before handling queued signals: */
		_sigq_check_reqd = 0;

		dequeue_signals();
	}

	/*
	 * Check for a thread that became runnable due to a signal:
	 */
	if (PTHREAD_PRIOQ_FIRST() != NULL) {
		/*
		 * Since there is at least one runnable thread,
		 * disable the wait.
		 */
		timeout_ms = 0;
	}

	/*
	 * Form the poll table:
	 */
	nfds = 0;
	if (timeout_ms != 0) {
		/* Add the kernel pipe to the poll table: */
		_thread_pfd_table[nfds].fd = _thread_kern_pipe[0];
		_thread_pfd_table[nfds].events = POLLRDNORM;
		_thread_pfd_table[nfds].revents = 0;
		nfds++;
		kern_pipe_added = 1;
	}
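	/*
	 * With the read end of the pthread kernel pipe in the poll
	 * table, a signal that arrives while the scheduler would
	 * otherwise sleep is written to the pipe by the signal handler
	 * and wakes the poll below immediately, instead of being
	 * delayed for the full timeout.
	 */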

	PTHREAD_WAITQ_SETACTIVE();
	TAILQ_FOREACH(pthread, &_workq, qe) {
		switch (pthread->state) {
		case PS_SPINBLOCK:
			/*
			 * If the lock is available, let the thread run.
			 */
			if (pthread->data.spinlock->access_lock == 0) {
				PTHREAD_WAITQ_CLEARACTIVE();
				PTHREAD_WORKQ_REMOVE(pthread);
				PTHREAD_NEW_STATE(pthread,PS_RUNNING);
				PTHREAD_WAITQ_SETACTIVE();
				/* One less thread in a spinblock state: */
				_spinblock_count--;
				/*
				 * Since there is at least one runnable
				 * thread, disable the wait.
				 */
				timeout_ms = 0;
			}
			break;

		/* File descriptor read wait: */
		case PS_FDR_WAIT:
			/* Limit number of polled files to table size: */
			if (nfds < _thread_dtablesize) {
				_thread_pfd_table[nfds].events = POLLRDNORM;
				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
				nfds++;
			}
			break;

		/* File descriptor write wait: */
		case PS_FDW_WAIT:
			/* Limit number of polled files to table size: */
			if (nfds < _thread_dtablesize) {
				_thread_pfd_table[nfds].events = POLLWRNORM;
				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
				nfds++;
			}
			break;

		/* File descriptor poll or select wait: */
		case PS_POLL_WAIT:
		case PS_SELECT_WAIT:
			/* Limit number of polled files to table size: */
			if (pthread->data.poll_data->nfds + nfds <
			    _thread_dtablesize) {
				for (i = 0; i < pthread->data.poll_data->nfds; i++) {
					_thread_pfd_table[nfds + i].fd =
					    pthread->data.poll_data->fds[i].fd;
					_thread_pfd_table[nfds + i].events =
					    pthread->data.poll_data->fds[i].events;
				}
				nfds += pthread->data.poll_data->nfds;
			}
			break;

		/* Other states do not depend on file I/O. */
		default:
			break;
		}
	}
	PTHREAD_WAITQ_CLEARACTIVE();

	/*
	 * Wait for a file descriptor to be ready for read, write, or
	 * an exception, or a timeout to occur:
	 */
	count = _thread_sys_poll(_thread_pfd_table, nfds, timeout_ms);

	if (kern_pipe_added != 0)
		/*
		 * Remove the pthread kernel pipe file descriptor
		 * from the pollfd table:
		 */
		nfds = 1;
	else
		nfds = 0;

	/*
	 * Check if it is possible that there are bytes in the kernel
	 * read pipe waiting to be read:
	 */
	if (count < 0 || ((kern_pipe_added != 0) &&
	    (_thread_pfd_table[0].revents & POLLRDNORM))) {
		/*
		 * If the kernel read pipe was included in the
		 * count:
		 */
		if (count > 0) {
			/* Decrement the count of file descriptors: */
			count--;
		}

		if (_sigq_check_reqd != 0) {
			/* Reset flag before handling signals: */
			_sigq_check_reqd = 0;

			dequeue_signals();
		}
	}

	/*
	 * Check if any file descriptors are ready:
	 */
	if (count > 0) {
		/*
		 * Enter a loop to look for threads waiting on file
		 * descriptors that are flagged as available by the
		 * _poll syscall:
		 */
		PTHREAD_WAITQ_SETACTIVE();
		TAILQ_FOREACH(pthread, &_workq, qe) {
			switch (pthread->state) {
			case PS_SPINBLOCK:
				/*
				 * If the lock is available, let the thread run.
				 */
				if (pthread->data.spinlock->access_lock == 0) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();

					/*
					 * One less thread in a spinblock state:
					 */
					_spinblock_count--;
				}
				break;

			/* File descriptor read wait: */
			case PS_FDR_WAIT:
				if ((nfds < _thread_dtablesize) &&
				    (_thread_pfd_table[nfds].revents & POLLRDNORM)) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();
				}
				nfds++;
				break;

			/* File descriptor write wait: */
			case PS_FDW_WAIT:
				if ((nfds < _thread_dtablesize) &&
				    (_thread_pfd_table[nfds].revents & POLLWRNORM)) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();
				}
				nfds++;
				break;

			/* File descriptor poll or select wait: */
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				if (pthread->data.poll_data->nfds + nfds <
				    _thread_dtablesize) {
					/*
					 * Enter a loop looking for I/O
					 * readiness:
					 */
					found = 0;
					for (i = 0; i < pthread->data.poll_data->nfds; i++) {
						if (_thread_pfd_table[nfds + i].revents != 0) {
							pthread->data.poll_data->fds[i].revents =
							    _thread_pfd_table[nfds + i].revents;
							found++;
						}
					}

					/* Increment before destroying: */
					nfds += pthread->data.poll_data->nfds;

					if (found != 0) {
						pthread->data.poll_data->nfds = found;
						PTHREAD_WAITQ_CLEARACTIVE();
						PTHREAD_WORKQ_REMOVE(pthread);
						PTHREAD_NEW_STATE(pthread,PS_RUNNING);
						PTHREAD_WAITQ_SETACTIVE();
					}
				}
				else
					nfds += pthread->data.poll_data->nfds;
				break;

			/* Other states do not depend on file I/O. */
			default:
				break;
			}
		}
		PTHREAD_WAITQ_CLEARACTIVE();
	}
	else if (_spinblock_count != 0) {
		/*
		 * Enter a loop to look for threads waiting on a spinlock
		 * that is now available.
		 */
		PTHREAD_WAITQ_SETACTIVE();
		TAILQ_FOREACH(pthread, &_workq, qe) {
			if (pthread->state == PS_SPINBLOCK) {
				/*
				 * If the lock is available, let the thread run.
				 */
				if (pthread->data.spinlock->access_lock == 0) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();

					/*
					 * One less thread in a spinblock state:
					 */
					_spinblock_count--;
				}
			}
		}
		PTHREAD_WAITQ_CLEARACTIVE();
	}

	/* Unprotect the scheduling queues: */
	_queue_signals = 0;

	while (_sigq_check_reqd != 0) {
		/* Handle queued signals: */
		_sigq_check_reqd = 0;

		/* Protect the scheduling queues: */
		_queue_signals = 1;

		dequeue_signals();

		/* Unprotect the scheduling queues: */
		_queue_signals = 0;
	}

	/* Nothing to return. */
	return;
}

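/*
 * Convert a relative timeout into an absolute wakeup time for the
 * current thread.  A NULL timeout means wait forever (flagged by a
 * wakeup time of -1); a zero timeout means wake up immediately.
 */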
void
_thread_kern_set_timeout(struct timespec * timeout)
{
	struct timespec current_time;
	struct timeval  tv;

	/* Reset the timeout flag for the running thread: */
	_thread_run->timeout = 0;

	/* Check if the thread is to wait forever: */
	if (timeout == NULL) {
		/*
		 * Set the wakeup time to something that can be recognised as
		 * different to an actual time of day:
		 */
		_thread_run->wakeup_time.tv_sec = -1;
		_thread_run->wakeup_time.tv_nsec = -1;
	}
	/* Check if no waiting is required: */
	else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
		/* Set the wake up time to 'immediately': */
		_thread_run->wakeup_time.tv_sec = 0;
		_thread_run->wakeup_time.tv_nsec = 0;
	} else {
		/* Get the current time: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &current_time);

		/* Calculate the time for the current thread to wake up: */
		_thread_run->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec;
		_thread_run->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec;

		/* Check if the nanosecond field needs to wrap: */
		if (_thread_run->wakeup_time.tv_nsec >= 1000000000) {
			/* Wrap the nanosecond field: */
			_thread_run->wakeup_time.tv_sec += 1;
			_thread_run->wakeup_time.tv_nsec -= 1000000000;
		}
	}
	return;
}

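/*
 * _thread_kern_sig_defer() and _thread_kern_sig_undefer() bracket
 * critical sections during which signal handling must be held off.
 * Deferral nests via sig_defer_count; the final undefer processes any
 * signals queued in the meantime and yields the CPU if that made a
 * higher priority thread runnable.  Typical usage elsewhere in the
 * library:
 *
 *	_thread_kern_sig_defer();
 *	... manipulate scheduling queues or other shared state ...
 *	_thread_kern_sig_undefer();
 */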
void
_thread_kern_sig_defer(void)
{
	/* Allow signal deferral to be recursive. */
	_thread_run->sig_defer_count++;
}

void
_thread_kern_sig_undefer(void)
{
	pthread_t pthread;
	int need_resched = 0;

	/*
	 * Perform checks to yield only if we are about to undefer
	 * signals.
	 */
	if (_thread_run->sig_defer_count > 1) {
		/* Decrement the signal deferral count. */
		_thread_run->sig_defer_count--;
	}
	else if (_thread_run->sig_defer_count == 1) {
		/* Reenable signals: */
		_thread_run->sig_defer_count = 0;

		/*
		 * Check if there are queued signals:
		 */
		while (_sigq_check_reqd != 0) {
			/* Defer scheduling while we process queued signals: */
			_thread_run->sig_defer_count = 1;

			/* Clear the flag before checking the signal queue: */
			_sigq_check_reqd = 0;

			/* Dequeue and handle signals: */
			dequeue_signals();

			/*
			 * Unless a reschedule is already pending, check
			 * whether signal handling made a higher priority
			 * thread ready to run.
			 */
			if ((need_resched == 0) &&
			    (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
			    (pthread->active_priority > _thread_run->active_priority))) {
				need_resched = 1;
			}

			/* Reenable signals: */
			_thread_run->sig_defer_count = 0;
		}

		/* Yield the CPU if necessary: */
		if (need_resched || _thread_run->yield_on_sig_undefer != 0) {
			_thread_run->yield_on_sig_undefer = 0;
			_thread_kern_sched(NULL);
		}
	}
}

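/*
 * Drain the pthread kernel pipe.  The signal handler writes one byte,
 * holding the signal number, for each signal that arrives while the
 * scheduling queues are protected; each queued signal is re-dispatched
 * here to _thread_sig_handle().
 */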
static void
dequeue_signals(void)
{
	char	bufr[128];
	int	i, num;

	/*
	 * Enter a loop to read and handle queued signals from the
	 * pthread kernel pipe:
	 */
	while (((num = _thread_sys_read(_thread_kern_pipe[0], bufr,
	    sizeof(bufr))) > 0) || (num == -1 && errno == EINTR)) {
		/*
		 * The buffer read contains one byte per signal and
		 * each byte is the signal number.
		 */
		for (i = 0; i < num; i++) {
			if ((int) bufr[i] == _SCHED_SIGNAL) {
				/*
				 * Scheduling signals shouldn't ever be
				 * queued; just ignore it for now.
				 */
			}
			else {
				/* Handle this signal: */
				_thread_sig_handle((int) bufr[i], NULL);
			}
		}
	}
	if ((num < 0) && (errno != EAGAIN)) {
		/*
		 * The only error we should expect is if there is
		 * no data to read.
		 */
		PANIC("Unable to read from thread kernel pipe");
	}
}

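/*
 * Run the user-installed scheduler switch hook, substituting NULL for
 * the private (pthread kernel) thread so that the hook only ever sees
 * user threads.
 */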
static inline void
thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in)
{
	pthread_t tid_out = thread_out;
	pthread_t tid_in = thread_in;

	if ((tid_out != NULL) &&
	    ((tid_out->flags & PTHREAD_FLAGS_PRIVATE) != 0))
		tid_out = NULL;
	if ((tid_in != NULL) &&
	    ((tid_in->flags & PTHREAD_FLAGS_PRIVATE) != 0))
		tid_in = NULL;

	if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) {
		/* Run the scheduler switch hook: */
		_sched_switch_hook(tid_out, tid_in);
	}
}
#endif
