thr_kern.c revision 46680
1/*
2 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by John Birrell.
16 * 4. Neither the name of the author nor the names of any co-contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * $Id: uthread_kern.c,v 1.17 1999/05/07 07:59:44 jasone Exp $
33 *
34 */
35#include <errno.h>
36#include <stdlib.h>
37#include <stdarg.h>
38#include <string.h>
39#include <unistd.h>
40#include <setjmp.h>
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/time.h>
44#include <sys/socket.h>
45#include <sys/uio.h>
46#include <sys/syscall.h>
47#include <fcntl.h>
48#ifdef _THREAD_SAFE
49#include <pthread.h>
50#include "pthread_private.h"
51
52/* Static function prototype definitions: */
53static void
54_thread_kern_select(int wait_reqd);
55
56static inline void
57thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in);
58
59void
60_thread_kern_sched(struct sigcontext * scp)
61{
62#ifndef	__alpha__
63	char           *fdata;
64#endif
65	pthread_t       pthread;
66	pthread_t       pthread_h = NULL;
67	pthread_t	last_thread = NULL;
68	struct itimerval itimer;
69	struct timespec ts;
70	struct timespec ts1;
71	struct timeval  tv;
72	struct timeval  tv1;
73
74	/*
75	 * Flag the pthread kernel as executing scheduler code
76	 * to avoid a scheduler signal from interrupting this
77	 * execution and calling the scheduler again.
78	 */
79	_thread_kern_in_sched = 1;
80
81	/* Check if this function was called from the signal handler: */
82	if (scp != NULL) {
83		/*
84		 * Copy the signal context to the current thread's jump
85		 * buffer:
86		 */
87		memcpy(&_thread_run->saved_sigcontext, scp, sizeof(_thread_run->saved_sigcontext));
88
89#ifndef	__alpha__
90		/* Point to the floating point data in the running thread: */
91		fdata = _thread_run->saved_fp;
92
93		/* Save the floating point data: */
94__asm__("fnsave %0": :"m"(*fdata));
95#endif
96
97		/* Flag the signal context as the last state saved: */
98		_thread_run->sig_saved = 1;
99	}
100	/* Save the state of the current thread: */
101	else if (setjmp(_thread_run->saved_jmp_buf) != 0) {
102		/*
103		 * This point is reached when a longjmp() is called to
104		 * restore the state of a thread.
105		 *
106		 * This is the normal way out of the scheduler.
107		 */
108		_thread_kern_in_sched = 0;
109
110		if (_sched_switch_hook != NULL) {
111			/* Run the installed switch hook: */
112			thread_run_switch_hook(_last_user_thread, _thread_run);
113		}
114		return;
115	} else
116		/* Flag the jump buffer was the last state saved: */
117		_thread_run->sig_saved = 0;
118
119	/* If the currently running thread is a user thread, save it: */
120	if ((_thread_run->flags & PTHREAD_FLAGS_PRIVATE) == 0)
121		_last_user_thread = _thread_run;
122
123	/*
124	 * Enter a scheduling loop that finds the next thread that is
125	 * ready to run. This loop completes when there are no more threads
126	 * in the global list or when a thread has its state restored by
127	 * either a sigreturn (if the state was saved as a sigcontext) or a
128	 * longjmp (if the state was saved by a setjmp).
129	 */
130	while (_thread_link_list != NULL) {
131		/* Get the current time of day: */
132		gettimeofday(&tv, NULL);
133		TIMEVAL_TO_TIMESPEC(&tv, &ts);
134
135		/*
136		 * Poll file descriptors to update the state of threads
137		 * waiting on file I/O where data may be available:
138		 */
139		_thread_kern_select(0);
140
141		/*
142		 * Define the maximum time before a scheduling signal
143		 * is required:
144		 */
145		itimer.it_value.tv_sec = 0;
146		itimer.it_value.tv_usec = TIMESLICE_USEC;
147
148		/*
149		 * The interval timer is not reloaded when it
150		 * times out. The interval time needs to be
151		 * calculated every time.
152		 */
153		itimer.it_interval.tv_sec = 0;
154		itimer.it_interval.tv_usec = 0;
155
156		/*
157		 * Enter a loop to look for sleeping threads that are ready
158		 * or timedout.  While we're at it, also find the smallest
159		 * timeout value for threads waiting for a time.
160		 */
161		_waitingq_check_reqd = 0;	/* reset flag before loop */
162		TAILQ_FOREACH(pthread, &_waitingq, pqe) {
163			/* Check if this thread is ready: */
164			if (pthread->state == PS_RUNNING) {
165				PTHREAD_WAITQ_REMOVE(pthread);
166				PTHREAD_PRIOQ_INSERT_TAIL(pthread);
167			}
168
169			/*
170			 * Check if this thread is blocked by an
171			 * atomic lock:
172			 */
173			else if (pthread->state == PS_SPINBLOCK) {
174				/*
175				 * If the lock is available, let
176				 * the thread run.
177				 */
178				if (pthread->data.spinlock->access_lock == 0) {
179					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
180				}
181
182			/* Check if this thread is to timeout: */
183			} else if (pthread->state == PS_COND_WAIT ||
184			    pthread->state == PS_SLEEP_WAIT ||
185			    pthread->state == PS_FDR_WAIT ||
186			    pthread->state == PS_FDW_WAIT ||
187			    pthread->state == PS_SELECT_WAIT) {
188				/* Check if this thread is to wait forever: */
189				if (pthread->wakeup_time.tv_sec == -1) {
190				}
191				/*
192				 * Check if this thread is to wakeup
193				 * immediately or if it is past its wakeup
194				 * time:
195				 */
196				else if ((pthread->wakeup_time.tv_sec == 0 &&
197					pthread->wakeup_time.tv_nsec == 0) ||
198					 (ts.tv_sec > pthread->wakeup_time.tv_sec) ||
199					 ((ts.tv_sec == pthread->wakeup_time.tv_sec) &&
200					  (ts.tv_nsec >= pthread->wakeup_time.tv_nsec))) {
201					/*
202					 * Check if this thread is waiting on
203					 * select:
204					 */
205					if (pthread->state == PS_SELECT_WAIT) {
206						/*
207						 * The select has timed out, so
208						 * zero the file descriptor
209						 * sets:
210						 */
211						FD_ZERO(&pthread->data.select_data->readfds);
212						FD_ZERO(&pthread->data.select_data->writefds);
213						FD_ZERO(&pthread->data.select_data->exceptfds);
214						pthread->data.select_data->nfds = 0;
215                                        }
216					/*
217					 * Return an error as an interrupted
218					 * wait:
219					 */
220					_thread_seterrno(pthread, EINTR);
221
222					/*
223					 * Flag the timeout in the thread
224					 * structure:
225					 */
226					pthread->timeout = 1;
227
228					/*
229					 * Change the threads state to allow
230					 * it to be restarted:
231					 */
232					PTHREAD_NEW_STATE(pthread,PS_RUNNING);
233				} else {
234					/*
235					 * Calculate the time until this thread
236					 * is ready, allowing for the clock
237					 * resolution:
238					 */
239					ts1.tv_sec = pthread->wakeup_time.tv_sec
240					    - ts.tv_sec;
241					ts1.tv_nsec = pthread->wakeup_time.tv_nsec
242					    - ts.tv_nsec + CLOCK_RES_NSEC;
243
244					/*
245					 * Check for underflow of the
246					 * nanosecond field:
247					 */
248					if (ts1.tv_nsec < 0) {
249						/*
250						 * Allow for the underflow
251						 * of the nanosecond field:
252						 */
253						ts1.tv_sec--;
254						ts1.tv_nsec += 1000000000;
255					}
256					/*
257					 * Check for overflow of the nanosecond
258					 * field:
259					 */
260					if (ts1.tv_nsec >= 1000000000) {
261						/*
262						 * Allow for the overflow of
263						 * the nanosecond field:
264						 */
265						ts1.tv_sec++;
266						ts1.tv_nsec -= 1000000000;
267					}
268					/*
269					 * Convert the timespec structure
270					 * to a timeval structure:
271					 */
272					TIMESPEC_TO_TIMEVAL(&tv1, &ts1);
273
274					/*
275					 * Check if the thread will be ready
276					 * sooner than the earliest ones found
277					 * so far:
278					 */
279					if (timercmp(&tv1, &itimer.it_value, <)) {
280						/*
281						 * Update the time value:
282						 */
283						itimer.it_value.tv_sec = tv1.tv_sec;
284						itimer.it_value.tv_usec = tv1.tv_usec;
285					}
286				}
287
288			}
289		}
290
291		/* Check if there is a current thread: */
292		if (_thread_run != &_thread_kern_thread) {
293			/*
294			 * This thread no longer needs to yield the CPU.
295			 */
296			_thread_run->yield_on_sched_undefer = 0;
297
298			/*
299			 * Save the current time as the time that the thread
300			 * became inactive:
301			 */
302			_thread_run->last_inactive.tv_sec = tv.tv_sec;
303			_thread_run->last_inactive.tv_usec = tv.tv_usec;
304
305			/*
306			 * Accumulate the number of microseconds that this
307			 * thread has run for:
308			 */
309			if ((_thread_run->slice_usec != -1) &&
310			    (_thread_run->attr.sched_policy != SCHED_FIFO)) {
311				_thread_run->slice_usec +=
312				    (_thread_run->last_inactive.tv_sec -
313				    _thread_run->last_active.tv_sec) * 1000000 +
314				    _thread_run->last_inactive.tv_usec -
315				    _thread_run->last_active.tv_usec;
316
317				/* Check for time quantum exceeded: */
318				if (_thread_run->slice_usec > TIMESLICE_USEC)
319					_thread_run->slice_usec = -1;
320			}
321			if (_thread_run->state == PS_RUNNING) {
322				if (_thread_run->slice_usec == -1) {
323					/*
324					 * The thread exceeded its time
325					 * quantum or it yielded the CPU;
326					 * place it at the tail of the
327					 * queue for its priority.
328					 */
329					PTHREAD_PRIOQ_INSERT_TAIL(_thread_run);
330				} else {
331					/*
332					 * The thread hasn't exceeded its
333					 * interval.  Place it at the head
334					 * of the queue for its priority.
335					 */
336					PTHREAD_PRIOQ_INSERT_HEAD(_thread_run);
337				}
338			}
339			else if (_thread_run->state == PS_DEAD) {
340				/*
341				 * Don't add dead threads to the waiting
342				 * queue, because when they're reaped, it
343				 * will corrupt the queue.
344				 */
345			}
346			else {
347				/*
348				 * This thread has changed state and needs
349				 * to be placed in the waiting queue.
350				 */
351				PTHREAD_WAITQ_INSERT(_thread_run);
352
353				/* Restart the time slice: */
354				_thread_run->slice_usec = -1;
355			}
356		}
357
358		/*
359		 * Get the highest priority thread in the ready queue.
360		 */
361		pthread_h = PTHREAD_PRIOQ_FIRST;
362
363		/* Check if there are no threads ready to run: */
364		if (pthread_h == NULL) {
365			/*
366			 * Lock the pthread kernel by changing the pointer to
367			 * the running thread to point to the global kernel
368			 * thread structure:
369			 */
370			_thread_run = &_thread_kern_thread;
371
372			/*
373			 * There are no threads ready to run, so wait until
374			 * something happens that changes this condition:
375			 */
376			_thread_kern_select(1);
377		} else {
378			/* Make the selected thread the current thread: */
379			_thread_run = pthread_h;
380
381			/* Remove the thread from the ready queue. */
382			PTHREAD_PRIOQ_REMOVE(_thread_run);
383
384			/*
385			 * Save the current time as the time that the thread
386			 * became active:
387			 */
388			_thread_run->last_active.tv_sec = tv.tv_sec;
389			_thread_run->last_active.tv_usec = tv.tv_usec;
390
391			/*
392			 * Check if this thread is running for the first time
393			 * or running again after using its full time slice
394			 * allocation:
395			 */
396			if (_thread_run->slice_usec == -1) {
397				/* Reset the accumulated time slice period: */
398				_thread_run->slice_usec = 0;
399			}
400
401			/* Check if there is more than one thread: */
402			if (_thread_run != _thread_link_list || _thread_run->nxt != NULL) {
403				/*
404				 * Start the interval timer for the
405				 * calculated time interval:
406				 */
407				if (setitimer(_ITIMER_SCHED_TIMER, &itimer, NULL) != 0) {
408					/*
409					 * Cannot initialise the timer, so
410					 * abort this process:
411					 */
412					PANIC("Cannot set scheduling timer");
413				}
414			}
415
416			/* Check if a signal context was saved: */
417			if (_thread_run->sig_saved == 1) {
418#ifndef	__alpha__
419				/*
420				 * Point to the floating point data in the
421				 * running thread:
422				 */
423				fdata = _thread_run->saved_fp;
424
425				/* Restore the floating point state: */
426		__asm__("frstor %0": :"m"(*fdata));
427#endif
428				/*
429				 * Do a sigreturn to restart the thread that
430				 * was interrupted by a signal:
431				 */
432				_thread_kern_in_sched = 0;
433
434				/*
435				 * If we had a context switch, run any
436				 * installed switch hooks.
437				 */
438				if ((_sched_switch_hook != NULL) &&
439				    (_last_user_thread != _thread_run)) {
440					thread_run_switch_hook(_last_user_thread,
441					    _thread_run);
442				}
443				_thread_sys_sigreturn(&_thread_run->saved_sigcontext);
444			} else {
445				/*
446				 * Do a longjmp to restart the thread that
447				 * was context switched out (by a longjmp to
448				 * a different thread):
449				 */
450				longjmp(_thread_run->saved_jmp_buf, 1);
451			}
452
453			/* This point should not be reached. */
454			PANIC("Thread has returned from sigreturn or longjmp");
455		}
456	}
457
458	/* There are no more threads, so exit this process: */
459	exit(0);
460}
461
462void
463_thread_kern_sched_state(enum pthread_state state, char *fname, int lineno)
464{
465	/* Change the state of the current thread: */
466	_thread_run->state = state;
467	_thread_run->fname = fname;
468	_thread_run->lineno = lineno;
469
470	/* Schedule the next thread that is ready: */
471	_thread_kern_sched(NULL);
472	return;
473}
474
475void
476_thread_kern_sched_state_unlock(enum pthread_state state,
477    spinlock_t *lock, char *fname, int lineno)
478{
479	/* Change the state of the current thread: */
480	_thread_run->state = state;
481	_thread_run->fname = fname;
482	_thread_run->lineno = lineno;
483
484	_SPINUNLOCK(lock);
485
486	/* Schedule the next thread that is ready: */
487	_thread_kern_sched(NULL);
488	return;
489}
490
/*
 * _thread_kern_select -- poll or wait for file descriptor and timer events
 * on behalf of all waiting threads.
 *
 * Builds aggregate read/write/except fd_sets from every thread on the
 * waiting queue, issues a single _thread_sys_select() over them, and then
 * moves any thread whose descriptor became ready (or that was made
 * runnable by a signal) onto the priority run queue.
 *
 * wait_reqd == 0: poll only (zero timeout); used from the scheduler loop.
 * wait_reqd != 0: block in select until an event or the earliest thread
 * timeout; the pthread kernel pipe read end is added to the read set so a
 * signal handler can wake the select by writing a byte to the pipe.
 */
static void
_thread_kern_select(int wait_reqd)
{
	char            bufr[128];
	fd_set          fd_set_except;
	fd_set          fd_set_read;
	fd_set          fd_set_write;
	int             count = 0;
	int             count_dec;
	int             found_one;
	int             i;
	int             nfds = -1;
	int             settimeout;
	pthread_t       pthread;
	ssize_t         num;
	struct timespec ts;
	struct timespec ts1;
	struct timeval *p_tv;
	struct timeval  tv;
	struct timeval  tv1;

	/* Zero the file descriptor sets: */
	FD_ZERO(&fd_set_read);
	FD_ZERO(&fd_set_write);
	FD_ZERO(&fd_set_except);

	/* Check if the caller wants to wait: */
	if (wait_reqd) {
		/*
		 * Add the pthread kernel pipe file descriptor to the read
		 * set:
		 */
		FD_SET(_thread_kern_pipe[0], &fd_set_read);
		nfds = _thread_kern_pipe[0];

		/* Get the current time of day: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
	}
	/*
	 * Initialise the time value structure.  Note that 'tv' doubles as
	 * the "earliest timeout found" accumulator below; zero means no
	 * timeout has been found yet:
	 */
	tv.tv_sec = 0;
	tv.tv_usec = 0;

	/*
	 * Enter a loop to process threads waiting on either file descriptors
	 * or times:
	 */
	_waitingq_check_reqd = 0;	/* reset flag before loop */
	TAILQ_FOREACH (pthread, &_waitingq, pqe) {
		/* Assume that this state does not time out: */
		settimeout = 0;

		/* Process according to thread state: */
		switch (pthread->state) {
		/*
		 * States which do not depend on file descriptor I/O
		 * operations or timeouts:
		 */
		case PS_DEAD:
		case PS_DEADLOCK:
		case PS_FDLR_WAIT:
		case PS_FDLW_WAIT:
		case PS_FILE_WAIT:
		case PS_JOIN:
		case PS_MUTEX_WAIT:
		case PS_SIGTHREAD:
		case PS_SIGWAIT:
		case PS_STATE_MAX:
		case PS_WAIT_WAIT:
		case PS_SUSPENDED:
			/* Nothing to do here. */
			break;

		case PS_RUNNING:
			/*
			 * A signal occurred and made this thread ready
			 * while in the scheduler or while the scheduling
			 * queues were protected.
			 */
			PTHREAD_WAITQ_REMOVE(pthread);
			PTHREAD_PRIOQ_INSERT_TAIL(pthread);
			break;

		/* File descriptor read wait: */
		case PS_FDR_WAIT:
			/* Add the file descriptor to the read set: */
			FD_SET(pthread->data.fd.fd, &fd_set_read);

			/*
			 * Check if this file descriptor is greater than any
			 * of those seen so far:
			 */
			if (pthread->data.fd.fd > nfds) {
				/* Remember this file descriptor: */
				nfds = pthread->data.fd.fd;
			}
			/* Increment the file descriptor count: */
			count++;

			/* This state can time out: */
			settimeout = 1;
			break;

		/* File descriptor write wait: */
		case PS_FDW_WAIT:
			/* Add the file descriptor to the write set: */
			FD_SET(pthread->data.fd.fd, &fd_set_write);

			/*
			 * Check if this file descriptor is greater than any
			 * of those seen so far:
			 */
			if (pthread->data.fd.fd > nfds) {
				/* Remember this file descriptor: */
				nfds = pthread->data.fd.fd;
			}
			/* Increment the file descriptor count: */
			count++;

			/* This state can time out: */
			settimeout = 1;
			break;

		/* States that time out: */
		case PS_SLEEP_WAIT:
		case PS_COND_WAIT:
			/* Flag a timeout as required: */
			settimeout = 1;
			break;

		/* Select wait: */
		case PS_SELECT_WAIT:
			/*
			 * Enter a loop to process each file descriptor in
			 * the thread-specific file descriptor sets:
			 */
			for (i = 0; i < pthread->data.select_data->nfds; i++) {
				/*
				 * Check if this file descriptor is set for
				 * exceptions:
				 */
				if (FD_ISSET(i, &pthread->data.select_data->exceptfds)) {
					/*
					 * Add the file descriptor to the
					 * exception set:
					 */
					FD_SET(i, &fd_set_except);

					/*
					 * Increment the file descriptor
					 * count:
					 */
					count++;

					/*
					 * Check if this file descriptor is
					 * greater than any of those seen so
					 * far:
					 */
					if (i > nfds) {
						/*
						 * Remember this file
						 * descriptor:
						 */
						nfds = i;
					}
				}
				/*
				 * Check if this file descriptor is set for
				 * write:
				 */
				if (FD_ISSET(i, &pthread->data.select_data->writefds)) {
					/*
					 * Add the file descriptor to the
					 * write set:
					 */
					FD_SET(i, &fd_set_write);

					/*
					 * Increment the file descriptor
					 * count:
					 */
					count++;

					/*
					 * Check if this file descriptor is
					 * greater than any of those seen so
					 * far:
					 */
					if (i > nfds) {
						/*
						 * Remember this file
						 * descriptor:
						 */
						nfds = i;
					}
				}
				/*
				 * Check if this file descriptor is set for
				 * read:
				 */
				if (FD_ISSET(i, &pthread->data.select_data->readfds)) {
					/*
					 * Add the file descriptor to the
					 * read set:
					 */
					FD_SET(i, &fd_set_read);

					/*
					 * Increment the file descriptor
					 * count:
					 */
					count++;

					/*
					 * Check if this file descriptor is
					 * greater than any of those seen so
					 * far:
					 */
					if (i > nfds) {
						/*
						 * Remember this file
						 * descriptor:
						 */
						nfds = i;
					}
				}
			}

			/* This state can time out: */
			settimeout = 1;
			break;
		}

		/*
		 * Check if the caller wants to wait and if the thread state
		 * is one that times out:
		 */
		if (wait_reqd && settimeout) {
			/*
			 * Check if this thread wants to wait forever
			 * (tv_sec == -1 is the "no timeout" sentinel;
			 * it contributes nothing to the select timeout):
			 */
			if (pthread->wakeup_time.tv_sec == -1) {
			}
			/* Check if this thread doesn't want to wait at all: */
			else if (pthread->wakeup_time.tv_sec == 0 &&
				 pthread->wakeup_time.tv_nsec == 0) {
				/* Override the caller's request to wait: */
				wait_reqd = 0;
			} else {
				/*
				 * Calculate the time until this thread is
				 * ready, allowing for the clock resolution:
				 */
				ts1.tv_sec = pthread->wakeup_time.tv_sec - ts.tv_sec;
				ts1.tv_nsec = pthread->wakeup_time.tv_nsec - ts.tv_nsec +
					CLOCK_RES_NSEC;

				/*
				 * Check for underflow of the nanosecond
				 * field:
				 */
				if (ts1.tv_nsec < 0) {
					/*
					 * Allow for the underflow of the
					 * nanosecond field:
					 */
					ts1.tv_sec--;
					ts1.tv_nsec += 1000000000;
				}
				/*
				 * Check for overflow of the nanosecond
				 * field:
				 */
				if (ts1.tv_nsec >= 1000000000) {
					/*
					 * Allow for the overflow of the
					 * nanosecond field:
					 */
					ts1.tv_sec++;
					ts1.tv_nsec -= 1000000000;
				}
				/*
				 * Convert the timespec structure to a
				 * timeval structure:
				 */
				TIMESPEC_TO_TIMEVAL(&tv1, &ts1);

				/*
				 * Check if no time value has been found yet,
				 * or if the thread will be ready sooner than
				 * the earliest one found so far:
				 */
				if ((tv.tv_sec == 0 && tv.tv_usec == 0) || timercmp(&tv1, &tv, <)) {
					/* Update the time value: */
					tv.tv_sec = tv1.tv_sec;
					tv.tv_usec = tv1.tv_usec;
				}
			}
		}
	}

	/* Check if the caller wants to wait: */
	if (wait_reqd) {
		/* Check if no threads were found with timeouts: */
		if (tv.tv_sec == 0 && tv.tv_usec == 0) {
			/* Wait forever: */
			p_tv = NULL;
		} else {
			/*
			 * Point to the time value structure which contains
			 * the earliest time that a thread will be ready:
			 */
			p_tv = &tv;
		}

		/*
		 * Flag the pthread kernel as in a select. This is to avoid
		 * the window between the next statement that unblocks
		 * signals and the select statement which follows.
		 */
		_thread_kern_in_select = 1;

		/*
		 * Wait for a file descriptor to be ready for read, write, or
		 * an exception, or a timeout to occur:
		 */
		count = _thread_sys_select(nfds + 1, &fd_set_read, &fd_set_write, &fd_set_except, p_tv);

		/* Reset the kernel in select flag: */
		_thread_kern_in_select = 0;

		/*
		 * Check if it is possible that there are bytes in the kernel
		 * read pipe waiting to be read (a negative count is taken as
		 * "may have been interrupted by a signal", so the pipe is
		 * drained in that case too):
		 */
		if (count < 0 || FD_ISSET(_thread_kern_pipe[0], &fd_set_read)) {
			/*
			 * Check if the kernel read pipe was included in the
			 * count:
			 */
			if (count > 0) {
				/*
				 * Remove the kernel read pipe from the
				 * count:
				 */
				FD_CLR(_thread_kern_pipe[0], &fd_set_read);

				/* Decrement the count of file descriptors: */
				count--;
			}
			/*
			 * Enter a loop to read (and trash) bytes from the
			 * pthread kernel pipe:
			 */
			while ((num = _thread_sys_read(_thread_kern_pipe[0], bufr, sizeof(bufr))) > 0) {
				/*
				 * The buffer read contains one byte per
				 * signal and each byte is the signal number.
				 * This data is not used, but the fact that
				 * the signal handler wrote to the pipe *is*
				 * used to cause the _select call
				 * to complete if the signal occurred between
				 * the time when signals were unblocked and
				 * the _select select call being
				 * made.
				 */
			}
		}
	}
	/* Check if there are file descriptors to poll: */
	else if (count > 0) {
		/*
		 * Point to the time value structure which has been zeroed so
		 * that the call to _select will not wait:
		 */
		p_tv = &tv;

		/* Poll file descriptors without wait: */
		count = _thread_sys_select(nfds + 1, &fd_set_read, &fd_set_write, &fd_set_except, p_tv);
	}

	/*
	 * Check if any file descriptors are ready:
	 */
	if (count > 0) {
		/*
		 * Enter a loop to look for threads waiting on file
		 * descriptors that are flagged as available by the
		 * _select syscall:
		 */
		TAILQ_FOREACH (pthread, &_waitingq, pqe) {
			/* Process according to thread state: */
			switch (pthread->state) {
			/*
			 * States which do not depend on file
			 * descriptor I/O operations:
			 */
			case PS_COND_WAIT:
			case PS_DEAD:
			case PS_DEADLOCK:
			case PS_FDLR_WAIT:
			case PS_FDLW_WAIT:
			case PS_FILE_WAIT:
			case PS_JOIN:
			case PS_MUTEX_WAIT:
			case PS_SIGWAIT:
			case PS_SLEEP_WAIT:
			case PS_WAIT_WAIT:
			case PS_SIGTHREAD:
			case PS_STATE_MAX:
			case PS_SUSPENDED:
				/* Nothing to do here. */
				break;

			case PS_RUNNING:
				/*
				 * A signal occurred and made this thread
				 * ready while in the scheduler.
				 */
				PTHREAD_WAITQ_REMOVE(pthread);
				PTHREAD_PRIOQ_INSERT_TAIL(pthread);
				break;

			/* File descriptor read wait: */
			case PS_FDR_WAIT:
				/*
				 * Check if the file descriptor is available
				 * for read:
				 */
				if (FD_ISSET(pthread->data.fd.fd, &fd_set_read)) {
					/*
					 * Change the thread state to allow
					 * it to read from the file when it
					 * is scheduled next:
					 */
					pthread->state = PS_RUNNING;

					/*
					 * Remove it from the waiting queue
					 * and add it to the ready queue:
					 */
					PTHREAD_WAITQ_REMOVE(pthread);
					PTHREAD_PRIOQ_INSERT_TAIL(pthread);
				}
				break;

			/* File descriptor write wait: */
			case PS_FDW_WAIT:
				/*
				 * Check if the file descriptor is available
				 * for write:
				 */
				if (FD_ISSET(pthread->data.fd.fd, &fd_set_write)) {
					/*
					 * Change the thread state to allow
					 * it to write to the file when it is
					 * scheduled next:
					 */
					pthread->state = PS_RUNNING;

					/*
					 * Remove it from the waiting queue
					 * and add it to the ready queue:
					 */
					PTHREAD_WAITQ_REMOVE(pthread);
					PTHREAD_PRIOQ_INSERT_TAIL(pthread);
				}
				break;

			/* Select wait: */
			case PS_SELECT_WAIT:
				/*
				 * Reset the flag that indicates if a file
				 * descriptor is ready for some type of
				 * operation:
				 */
				count_dec = 0;

				/*
				 * Enter a loop to search though the
				 * thread-specific select file descriptors
				 * for the first descriptor that is ready:
				 */
				for (i = 0; i < pthread->data.select_data->nfds && count_dec == 0; i++) {
					/*
					 * Check if this file descriptor has
					 * an exception pending:
					 */
					if (FD_ISSET(i, &pthread->data.select_data->exceptfds) && FD_ISSET(i, &fd_set_except)) {
						/*
						 * Flag this file descriptor
						 * as ready:
						 */
						count_dec = 1;
					}
					/*
					 * Check if this file descriptor is
					 * ready for write:
					 */
					if (FD_ISSET(i, &pthread->data.select_data->writefds) && FD_ISSET(i, &fd_set_write)) {
						/*
						 * Flag this file descriptor
						 * as ready:
						 */
						count_dec = 1;
					}
					/*
					 * Check if this file descriptor is
					 * ready for read:
					 */
					if (FD_ISSET(i, &pthread->data.select_data->readfds) && FD_ISSET(i, &fd_set_read)) {
						/*
						 * Flag this file descriptor
						 * as ready:
						 */
						count_dec = 1;
					}
				}

				/*
				 * Check if any file descriptors are ready
				 * for the current thread:
				 */
				if (count_dec) {
					/*
					 * Reset the count of file
					 * descriptors that are ready for
					 * this thread:
					 */
					found_one = 0;

					/*
					 * Enter a loop to search though the
					 * thread-specific select file
					 * descriptors:
					 */
					for (i = 0; i < pthread->data.select_data->nfds; i++) {
						/*
						 * Reset the count of
						 * operations for which the
						 * current file descriptor is
						 * ready:
						 */
						count_dec = 0;

						/*
						 * Check if this file
						 * descriptor is selected for
						 * exceptions:
						 */
						if (FD_ISSET(i, &pthread->data.select_data->exceptfds)) {
							/*
							 * Check if this file
							 * descriptor has an
							 * exception:
							 */
							if (FD_ISSET(i, &fd_set_except)) {
								/*
								 * Increment
								 * the count
								 * for this
								 * file:
								 */
								count_dec++;
							} else {
								/*
								 * Clear the
								 * file
								 * descriptor
								 * in the
								 * thread's
								 * own file
								 * descriptor
								 * set:
								 */
								FD_CLR(i, &pthread->data.select_data->exceptfds);
							}
						}
						/*
						 * Check if this file
						 * descriptor is selected for
						 * write:
						 */
						if (FD_ISSET(i, &pthread->data.select_data->writefds)) {
							/*
							 * Check if this file
							 * descriptor is
							 * ready for write:
							 */
							if (FD_ISSET(i, &fd_set_write)) {
								/*
								 * Increment
								 * the count
								 * for this
								 * file:
								 */
								count_dec++;
							} else {
								/*
								 * Clear the
								 * file
								 * descriptor
								 * in the
								 * thread's
								 * own file
								 * descriptor
								 * set:
								 */
								FD_CLR(i, &pthread->data.select_data->writefds);
							}
						}
						/*
						 * Check if this file
						 * descriptor is selected for
						 * read:
						 */
						if (FD_ISSET(i, &pthread->data.select_data->readfds)) {
							/*
							 * Check if this file
							 * descriptor is
							 * ready for read:
							 */
							if (FD_ISSET(i, &fd_set_read)) {
								/*
								 * Increment
								 * the count
								 * for this
								 * file:
								 */
								count_dec++;
							} else {
								/*
								 * Clear the
								 * file
								 * descriptor
								 * in the
								 * thread's
								 * own file
								 * descriptor
								 * set:
								 */
								FD_CLR(i, &pthread->data.select_data->readfds);
							}
						}
						/*
						 * Check if the current file
						 * descriptor is ready for
						 * any one of the operations:
						 */
						if (count_dec > 0) {
							/*
							 * Increment the
							 * count of file
							 * descriptors that
							 * are ready for the
							 * current thread:
							 */
							found_one++;
						}
					}

					/*
					 * Return the number of file
					 * descriptors that are ready:
					 */
					pthread->data.select_data->nfds = found_one;

					/*
					 * Change the state of the current
					 * thread to run:
					 */
					pthread->state = PS_RUNNING;

					/*
					 * Remove it from the waiting queue
					 * and add it to the ready queue:
					 */
					PTHREAD_WAITQ_REMOVE(pthread);
					PTHREAD_PRIOQ_INSERT_TAIL(pthread);
				}
				break;
			}
		}
	}

	/* Nothing to return. */
	return;
}
1178
1179void
1180_thread_kern_set_timeout(struct timespec * timeout)
1181{
1182	struct timespec current_time;
1183	struct timeval  tv;
1184
1185	/* Reset the timeout flag for the running thread: */
1186	_thread_run->timeout = 0;
1187
1188	/* Check if the thread is to wait forever: */
1189	if (timeout == NULL) {
1190		/*
1191		 * Set the wakeup time to something that can be recognised as
1192		 * different to an actual time of day:
1193		 */
1194		_thread_run->wakeup_time.tv_sec = -1;
1195		_thread_run->wakeup_time.tv_nsec = -1;
1196	}
1197	/* Check if no waiting is required: */
1198	else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
1199		/* Set the wake up time to 'immediately': */
1200		_thread_run->wakeup_time.tv_sec = 0;
1201		_thread_run->wakeup_time.tv_nsec = 0;
1202	} else {
1203		/* Get the current time: */
1204		gettimeofday(&tv, NULL);
1205		TIMEVAL_TO_TIMESPEC(&tv, &current_time);
1206
1207		/* Calculate the time for the current thread to wake up: */
1208		_thread_run->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec;
1209		_thread_run->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec;
1210
1211		/* Check if the nanosecond field needs to wrap: */
1212		if (_thread_run->wakeup_time.tv_nsec >= 1000000000) {
1213			/* Wrap the nanosecond field: */
1214			_thread_run->wakeup_time.tv_sec += 1;
1215			_thread_run->wakeup_time.tv_nsec -= 1000000000;
1216		}
1217	}
1218	return;
1219}
1220
1221void
1222_thread_kern_sched_defer(void)
1223{
1224	/* Allow scheduling deferral to be recursive. */
1225	_thread_run->sched_defer_count++;
1226}
1227
1228void
1229_thread_kern_sched_undefer(void)
1230{
1231	pthread_t pthread;
1232	int need_resched = 0;
1233
1234	/*
1235	 * Perform checks to yield only if we are about to undefer
1236	 * scheduling.
1237	 */
1238	if (_thread_run->sched_defer_count == 1) {
1239		/*
1240		 * Check if the waiting queue needs to be examined for
1241		 * threads that are now ready:
1242		 */
1243		while (_waitingq_check_reqd != 0) {
1244			/* Clear the flag before checking the waiting queue: */
1245			_waitingq_check_reqd = 0;
1246
1247			TAILQ_FOREACH(pthread, &_waitingq, pqe) {
1248				if (pthread->state == PS_RUNNING) {
1249					PTHREAD_WAITQ_REMOVE(pthread);
1250					PTHREAD_PRIOQ_INSERT_TAIL(pthread);
1251				}
1252			}
1253		}
1254
1255		/*
1256		 * We need to yield if a thread change of state caused a
1257		 * higher priority thread to become ready, or if a
1258		 * scheduling signal occurred while preemption was disabled.
1259		 */
1260		if ((((pthread = PTHREAD_PRIOQ_FIRST) != NULL) &&
1261		   (pthread->active_priority > _thread_run->active_priority)) ||
1262		   (_thread_run->yield_on_sched_undefer != 0)) {
1263			_thread_run->yield_on_sched_undefer = 0;
1264			need_resched = 1;
1265		}
1266	}
1267
1268	if (_thread_run->sched_defer_count > 0) {
1269		/* Decrement the scheduling deferral count. */
1270		_thread_run->sched_defer_count--;
1271
1272		/* Yield the CPU if necessary: */
1273		if (need_resched)
1274			_thread_kern_sched(NULL);
1275	}
1276}
1277
1278static inline void
1279thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in)
1280{
1281	pthread_t tid_out = thread_out;
1282	pthread_t tid_in = thread_in;
1283
1284	if ((tid_out != NULL) &&
1285	    (tid_out->flags & PTHREAD_FLAGS_PRIVATE != 0))
1286		tid_out = NULL;
1287	if ((tid_in != NULL) &&
1288	    (tid_in->flags & PTHREAD_FLAGS_PRIVATE != 0))
1289		tid_in = NULL;
1290
1291	if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) {
1292		/* Run the scheduler switch hook: */
1293		_sched_switch_hook(tid_out, tid_in);
1294	}
1295}
1296#endif
1297