/*	$NetBSD: rumpuser_pth.c,v 1.46 2017/12/27 09:01:53 ozaki-r Exp $	*/

/*
 * Copyright (c) 2007-2010 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "rumpuser_port.h"

#if !defined(lint)
__RCSID("$NetBSD: rumpuser_pth.c,v 1.46 2017/12/27 09:01:53 ozaki-r Exp $");
#endif /* !lint */

#include <sys/queue.h>

#if defined(HAVE_SYS_ATOMIC_H)
#include <sys/atomic.h>
#endif

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>

#include <rump/rumpuser.h>

#include "rumpuser_int.h"
int
rumpuser_thread_create(void *(*f)(void *), void *arg, const char *thrname,
	int joinable, int priority, int cpuidx, void **ptcookie)
{
	pthread_t ptid;
	pthread_t *ptidp;
	pthread_attr_t pattr;
	int rv, i;

	if ((rv = pthread_attr_init(&pattr)) != 0)
		return rv;

	if (joinable) {
		NOFAIL(ptidp = malloc(sizeof(*ptidp)));
		pthread_attr_setdetachstate(&pattr, PTHREAD_CREATE_JOINABLE);
	} else {
		ptidp = &ptid;
		pthread_attr_setdetachstate(&pattr, PTHREAD_CREATE_DETACHED);
	}

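	/*
	 * pthread_create() can fail transiently with EAGAIN when the
	 * host is short on thread resources, so retry a few times
	 * with a 10ms pause between attempts before giving up.
	 */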
	for (i = 0; i < 10; i++) {
		const struct timespec ts = {0, 10*1000*1000};

		rv = pthread_create(ptidp, &pattr, f, arg);
		if (rv != EAGAIN)
			break;
		nanosleep(&ts, NULL);
	}

#if defined(HAVE_PTHREAD_SETNAME3)
	if (rv == 0 && thrname) {
		pthread_setname_np(*ptidp, thrname, NULL);
	}
#elif defined(HAVE_PTHREAD_SETNAME2)
	if (rv == 0 && thrname) {
		pthread_setname_np(*ptidp, thrname);
	}
#endif

	if (joinable) {
		assert(ptcookie);
		*ptcookie = ptidp;
	}

	pthread_attr_destroy(&pattr);

	ET(rv);
}

__dead void
rumpuser_thread_exit(void)
{

	/*
	 * FIXXXME: with glibc on ARM pthread_exit() aborts because
	 * it fails to unwind the stack.  In the typical case, only
	 * the mountroothook thread will exit and even that's
	 * conditional on vfs being present.
	 */
#if (defined(__ARMEL__) || defined(__ARMEB__)) && defined(__GLIBC__)
	for (;;)
		pause();
#endif

	pthread_exit(NULL);
}

int
rumpuser_thread_join(void *ptcookie)
{
	pthread_t *pt = ptcookie;
	int rv;

	KLOCK_WRAP((rv = pthread_join(*pt, NULL)));
	if (rv == 0)
		free(pt);

	ET(rv);
}

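/*
 * mutexes.  pthread mutexes provide no way to query the owner, so
 * for kmutexes we track the owning lwp ourselves and report it
 * from rumpuser_mutex_owner().
 */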
struct rumpuser_mtx {
	pthread_mutex_t pthmtx;
	struct lwp *owner;
	int flags;
};

void
rumpuser_mutex_init(struct rumpuser_mtx **mtxp, int flags)
{
	struct rumpuser_mtx *mtx;
	pthread_mutexattr_t att;
	size_t allocsz;

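	/*
	 * Round the size up and allocate RUMPUSER_LOCKALIGN-aligned
	 * memory, presumably so that locks don't share cache lines.
	 */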
	allocsz = (sizeof(*mtx)+RUMPUSER_LOCKALIGN) & ~(RUMPUSER_LOCKALIGN-1);
	NOFAIL(mtx = aligned_alloc(RUMPUSER_LOCKALIGN, allocsz));

	pthread_mutexattr_init(&att);
	pthread_mutexattr_settype(&att, PTHREAD_MUTEX_ERRORCHECK);
	NOFAIL_ERRNO(pthread_mutex_init(&mtx->pthmtx, &att));
	pthread_mutexattr_destroy(&att);

	mtx->owner = NULL;
	assert(flags != 0);
	mtx->flags = flags;

	*mtxp = mtx;
}

int
rumpuser_mutex_spin_p(struct rumpuser_mtx *mtx)
{

	return (mtx->flags & RUMPUSER_MTX_SPIN) != 0;
}

static void
mtxenter(struct rumpuser_mtx *mtx)
{

	if (!(mtx->flags & RUMPUSER_MTX_KMUTEX))
		return;

	assert(mtx->owner == NULL);
	mtx->owner = rumpuser_curlwp();
}

static void
mtxexit(struct rumpuser_mtx *mtx)
{

	if (!(mtx->flags & RUMPUSER_MTX_KMUTEX))
		return;

	assert(mtx->owner != NULL);
	mtx->owner = NULL;
}

void
rumpuser_mutex_enter(struct rumpuser_mtx *mtx)
{

	if (mtx->flags & RUMPUSER_MTX_SPIN) {
		rumpuser_mutex_enter_nowrap(mtx);
		return;
	}

	assert(mtx->flags & RUMPUSER_MTX_KMUTEX);
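	/*
	 * Fast path: try to take the lock without blocking.  Only if
	 * that fails do we unschedule the rump kernel CPU around the
	 * blocking pthread_mutex_lock().
	 */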
	if (pthread_mutex_trylock(&mtx->pthmtx) != 0)
		KLOCK_WRAP(NOFAIL_ERRNO(pthread_mutex_lock(&mtx->pthmtx)));
	mtxenter(mtx);
}

void
rumpuser_mutex_enter_nowrap(struct rumpuser_mtx *mtx)
{

	assert(mtx->flags & RUMPUSER_MTX_SPIN);
	NOFAIL_ERRNO(pthread_mutex_lock(&mtx->pthmtx));
	mtxenter(mtx);
}

int
rumpuser_mutex_tryenter(struct rumpuser_mtx *mtx)
{
	int rv;

	rv = pthread_mutex_trylock(&mtx->pthmtx);
	if (rv == 0) {
		mtxenter(mtx);
	}

	ET(rv);
}

void
rumpuser_mutex_exit(struct rumpuser_mtx *mtx)
{

	mtxexit(mtx);
	NOFAIL_ERRNO(pthread_mutex_unlock(&mtx->pthmtx));
}

void
rumpuser_mutex_destroy(struct rumpuser_mtx *mtx)
{

	NOFAIL_ERRNO(pthread_mutex_destroy(&mtx->pthmtx));
	free(mtx);
}

void
rumpuser_mutex_owner(struct rumpuser_mtx *mtx, struct lwp **lp)
{

	if (__predict_false(!(mtx->flags & RUMPUSER_MTX_KMUTEX))) {
		printf("panic: rumpuser_mutex_owner unsupported on non-kmtx\n");
		abort();
	}

	*lp = mtx->owner;
}

/*
 * rwlocks.  these are mostly simple, except that NetBSD wants to
 * support downgrading, i.e. swapping an exclusive lock for a shared
 * one.  to accommodate this, we need to check *after* acquiring a
 * lock whether someone was downgrading it.  if so, we didn't really
 * get the lock and may need to retry later.
 */

struct rumpuser_rw {
	pthread_rwlock_t pthrw;
#if !defined(__APPLE__) && !defined(__ANDROID__)
	char pad[64 - sizeof(pthread_rwlock_t)];
	pthread_spinlock_t spin;
#endif
	unsigned int readers;
	struct lwp *writer;
	int downgrade; /* someone is downgrading (hopefully lock holder ;) */
};

static int
rw_amwriter(struct rumpuser_rw *rw)
{

	return rw->writer == rumpuser_curlwp() && rw->readers == (unsigned)-1;
}

static int
rw_nreaders(struct rumpuser_rw *rw)
{
	unsigned nreaders = rw->readers;

	return nreaders != (unsigned)-1 ? nreaders : 0;
}

static int
rw_setwriter(struct rumpuser_rw *rw, int retry)
{

	/*
	 * Don't need the spinlock here, we already have an
	 * exclusive lock and "downgrade" is stable until complete.
	 */
	if (rw->downgrade) {
		pthread_rwlock_unlock(&rw->pthrw);
		if (retry) {
			struct timespec ts;

			/* portable yield, essentially */
			ts.tv_sec = 0;
			ts.tv_nsec = 1;
			KLOCK_WRAP(nanosleep(&ts, NULL));
		}
		return EBUSY;
	}
	assert(rw->readers == 0);
	rw->writer = rumpuser_curlwp();
	rw->readers = (unsigned)-1;
	return 0;
}

static void
rw_clearwriter(struct rumpuser_rw *rw)
{

	assert(rw_amwriter(rw));
	rw->readers = 0;
	rw->writer = NULL;
}

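/*
 * pthread rwlocks don't expose the number of readers, so maintain
 * the count ourselves.  Where atomic_inc_uint()/atomic_dec_uint()
 * are available we use them; elsewhere a spinlock protects the
 * counter.
 */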
static inline void
rw_readup(struct rumpuser_rw *rw)
{

#if defined(__NetBSD__) || defined(__APPLE__) || defined(__ANDROID__)
	atomic_inc_uint(&rw->readers);
#else
	pthread_spin_lock(&rw->spin);
	++rw->readers;
	pthread_spin_unlock(&rw->spin);
#endif
}

static inline void
rw_readdown(struct rumpuser_rw *rw)
{

#if defined(__NetBSD__) || defined(__APPLE__) || defined(__ANDROID__)
	atomic_dec_uint(&rw->readers);
#else
	pthread_spin_lock(&rw->spin);
	assert(rw->readers > 0);
	--rw->readers;
	pthread_spin_unlock(&rw->spin);
#endif
}

void
rumpuser_rw_init(struct rumpuser_rw **rwp)
{
	struct rumpuser_rw *rw;
	size_t allocsz;

	allocsz = (sizeof(*rw)+RUMPUSER_LOCKALIGN) & ~(RUMPUSER_LOCKALIGN-1);

	NOFAIL(rw = aligned_alloc(RUMPUSER_LOCKALIGN, allocsz));
	NOFAIL_ERRNO(pthread_rwlock_init(&rw->pthrw, NULL));
#if !defined(__APPLE__) && !defined(__ANDROID__)
	NOFAIL_ERRNO(pthread_spin_init(&rw->spin, PTHREAD_PROCESS_PRIVATE));
#endif
	rw->readers = 0;
	rw->writer = NULL;
	rw->downgrade = 0;

	*rwp = rw;
}

void
rumpuser_rw_enter(int enum_rumprwlock, struct rumpuser_rw *rw)
{
	enum rumprwlock lk = enum_rumprwlock;

	switch (lk) {
	case RUMPUSER_RW_WRITER:
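		/*
		 * Take the exclusive pthread lock, then claim
		 * kernel-level ownership.  rw_setwriter() fails if a
		 * downgrade raced us; it has then already dropped the
		 * pthread lock, so simply loop and try again.
		 */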
		do {
			if (pthread_rwlock_trywrlock(&rw->pthrw) != 0)
				KLOCK_WRAP(NOFAIL_ERRNO(
				    pthread_rwlock_wrlock(&rw->pthrw)));
		} while (rw_setwriter(rw, 1) != 0);
		break;
	case RUMPUSER_RW_READER:
		if (pthread_rwlock_tryrdlock(&rw->pthrw) != 0)
			KLOCK_WRAP(NOFAIL_ERRNO(
			    pthread_rwlock_rdlock(&rw->pthrw)));
		rw_readup(rw);
		break;
	}
}

int
rumpuser_rw_tryenter(int enum_rumprwlock, struct rumpuser_rw *rw)
{
	enum rumprwlock lk = enum_rumprwlock;
	int rv;

	switch (lk) {
	case RUMPUSER_RW_WRITER:
		rv = pthread_rwlock_trywrlock(&rw->pthrw);
		if (rv == 0)
			rv = rw_setwriter(rw, 0);
		break;
	case RUMPUSER_RW_READER:
		rv = pthread_rwlock_tryrdlock(&rw->pthrw);
		if (rv == 0)
			rw_readup(rw);
		break;
	default:
		rv = EINVAL;
		break;
	}

	ET(rv);
}

int
rumpuser_rw_tryupgrade(struct rumpuser_rw *rw)
{

	/*
	 * Not supported by pthreads.  Since the caller needs to
	 * back off anyway to avoid deadlock, always failing
	 * is correct.
	 */
	ET(EBUSY);
}

/*
 * convert from an exclusive to a shared lock without allowing anyone
 * to obtain an exclusive lock in between.  more precisely, someone
 * might obtain the exclusive lock, but we don't allow that thread to
 * return from the hypercall with it.
 */
void
rumpuser_rw_downgrade(struct rumpuser_rw *rw)
{

	assert(rw->downgrade == 0);
	rw->downgrade = 1;
	rumpuser_rw_exit(rw);
	/*
	 * though the competition can't get out of the hypervisor, it
	 * might have rescheduled itself after we released the lock,
	 * so we need a wrap here.
	 */
	KLOCK_WRAP(NOFAIL_ERRNO(pthread_rwlock_rdlock(&rw->pthrw)));
	rw->downgrade = 0;
	rw_readup(rw);
}

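/*
 * Use our own bookkeeping to decide whether we hold the lock as a
 * reader or as the writer, undo the matching bookkeeping, and only
 * then release the pthread lock.
 */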
void
rumpuser_rw_exit(struct rumpuser_rw *rw)
{

	if (rw_nreaders(rw))
		rw_readdown(rw);
	else
		rw_clearwriter(rw);
	NOFAIL_ERRNO(pthread_rwlock_unlock(&rw->pthrw));
}

void
rumpuser_rw_destroy(struct rumpuser_rw *rw)
{

	NOFAIL_ERRNO(pthread_rwlock_destroy(&rw->pthrw));
#if !defined(__APPLE__) && !defined(__ANDROID__)
	NOFAIL_ERRNO(pthread_spin_destroy(&rw->spin));
#endif
	free(rw);
}

void
rumpuser_rw_held(int enum_rumprwlock, struct rumpuser_rw *rw, int *rv)
{
	enum rumprwlock lk = enum_rumprwlock;

	switch (lk) {
	case RUMPUSER_RW_WRITER:
		*rv = rw_amwriter(rw);
		break;
	case RUMPUSER_RW_READER:
		*rv = rw_nreaders(rw);
		break;
	}
}

/*
 * condvar
 */

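/*
 * pthread condition variables don't expose a waiter count, so keep
 * one by hand for rumpuser_cv_has_waiters().
 */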
struct rumpuser_cv {
	pthread_cond_t pthcv;
	int nwaiters;
};

void
rumpuser_cv_init(struct rumpuser_cv **cv)
{

	NOFAIL(*cv = malloc(sizeof(struct rumpuser_cv)));
	NOFAIL_ERRNO(pthread_cond_init(&((*cv)->pthcv), NULL));
	(*cv)->nwaiters = 0;
}

void
rumpuser_cv_destroy(struct rumpuser_cv *cv)
{

	NOFAIL_ERRNO(pthread_cond_destroy(&cv->pthcv));
	free(cv);
}

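/*
 * Before blocking in pthread_cond_wait(), release the rump kernel
 * CPU and clear the kmutex owner, since the pthread mutex will be
 * dropped while we sleep.
 */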
static void
cv_unschedule(struct rumpuser_mtx *mtx, int *nlocks)
{

	rumpkern_unsched(nlocks, mtx);
	mtxexit(mtx);
}

static void
cv_reschedule(struct rumpuser_mtx *mtx, int nlocks)
{

	/*
	 * If the cv interlock is a spin mutex, we must first release
	 * the mutex that was reacquired by pthread_cond_wait(),
	 * acquire the CPU context and only then relock the mutex.
	 * This preserves the resource allocation order so that
	 * we don't deadlock.  Non-spinning mutexes don't have this
	 * problem since they don't use a hold-and-wait approach
	 * to acquiring the mutex wrt the rump kernel CPU context.
	 *
	 * A better solution would be to rework rumpkern_sched()
	 * so that it's possible to tell the scheduler
	 * "if you need to block, drop this lock first", but I'm not
	 * going to poke around there without some numbers on how
	 * often this path is taken for spin mutexes.
	 */
	if ((mtx->flags & (RUMPUSER_MTX_SPIN | RUMPUSER_MTX_KMUTEX)) ==
	    (RUMPUSER_MTX_SPIN | RUMPUSER_MTX_KMUTEX)) {
		NOFAIL_ERRNO(pthread_mutex_unlock(&mtx->pthmtx));
		rumpkern_sched(nlocks, mtx);
		rumpuser_mutex_enter_nowrap(mtx);
	} else {
		mtxenter(mtx);
		rumpkern_sched(nlocks, mtx);
	}
}

void
rumpuser_cv_wait(struct rumpuser_cv *cv, struct rumpuser_mtx *mtx)
{
	int nlocks;

	cv->nwaiters++;
	cv_unschedule(mtx, &nlocks);
	NOFAIL_ERRNO(pthread_cond_wait(&cv->pthcv, &mtx->pthmtx));
	cv_reschedule(mtx, nlocks);
	cv->nwaiters--;
}

void
rumpuser_cv_wait_nowrap(struct rumpuser_cv *cv, struct rumpuser_mtx *mtx)
{

	cv->nwaiters++;
	mtxexit(mtx);
	NOFAIL_ERRNO(pthread_cond_wait(&cv->pthcv, &mtx->pthmtx));
	mtxenter(mtx);
	cv->nwaiters--;
}

int
rumpuser_cv_timedwait(struct rumpuser_cv *cv, struct rumpuser_mtx *mtx,
	int64_t sec, int64_t nsec)
{
	struct timespec ts;
	int rv, nlocks;

	/*
	 * Read the clock already here, just in case we end up sleeping
	 * after releasing the kernel context.
	 *
	 * The condition variables should use CLOCK_MONOTONIC, but since
	 * that's not available everywhere, leave it for another day.
	 */
	clock_gettime(CLOCK_REALTIME, &ts);

	cv->nwaiters++;
	cv_unschedule(mtx, &nlocks);

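	/*
	 * Convert the relative timeout into an absolute deadline,
	 * normalizing tv_nsec into [0, 10^9).
	 */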
	ts.tv_sec += sec;
	ts.tv_nsec += nsec;
	if (ts.tv_nsec >= 1000*1000*1000) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000*1000*1000;
	}
	rv = pthread_cond_timedwait(&cv->pthcv, &mtx->pthmtx, &ts);

	cv_reschedule(mtx, nlocks);
	cv->nwaiters--;

	ET(rv);
}

void
rumpuser_cv_signal(struct rumpuser_cv *cv)
{

	NOFAIL_ERRNO(pthread_cond_signal(&cv->pthcv));
}

void
rumpuser_cv_broadcast(struct rumpuser_cv *cv)
{

	NOFAIL_ERRNO(pthread_cond_broadcast(&cv->pthcv));
}

void
rumpuser_cv_has_waiters(struct rumpuser_cv *cv, int *nwaiters)
{

	*nwaiters = cv->nwaiters;
}

/*
 * curlwp
 */

static pthread_key_t curlwpkey;

/*
 * the if0'd curlwp implementation is not used by this hypervisor,
 * but serves as test code to check that the intended usage works.
 */
#if 0
struct rumpuser_lwp {
	struct lwp *l;
	LIST_ENTRY(rumpuser_lwp) l_entries;
};
static LIST_HEAD(, rumpuser_lwp) lwps = LIST_HEAD_INITIALIZER(lwps);
static pthread_mutex_t lwplock = PTHREAD_MUTEX_INITIALIZER;

void
rumpuser_curlwpop(enum rumplwpop op, struct lwp *l)
{
	struct rumpuser_lwp *rl, *rliter;

	switch (op) {
	case RUMPUSER_LWP_CREATE:
		rl = malloc(sizeof(*rl));
		rl->l = l;
		pthread_mutex_lock(&lwplock);
		LIST_FOREACH(rliter, &lwps, l_entries) {
			if (rliter->l == l) {
				fprintf(stderr, "LWP_CREATE: %p exists\n", l);
				abort();
			}
		}
		LIST_INSERT_HEAD(&lwps, rl, l_entries);
		pthread_mutex_unlock(&lwplock);
		break;
	case RUMPUSER_LWP_DESTROY:
		pthread_mutex_lock(&lwplock);
		LIST_FOREACH(rl, &lwps, l_entries) {
			if (rl->l == l)
				break;
		}
		if (!rl) {
			fprintf(stderr, "LWP_DESTROY: %p does not exist\n", l);
			abort();
		}
		LIST_REMOVE(rl, l_entries);
		pthread_mutex_unlock(&lwplock);
		free(rl);
		break;
	case RUMPUSER_LWP_SET:
		assert(pthread_getspecific(curlwpkey) == NULL && l != NULL);

		pthread_mutex_lock(&lwplock);
		LIST_FOREACH(rl, &lwps, l_entries) {
			if (rl->l == l)
				break;
		}
		if (!rl) {
			fprintf(stderr,
			    "LWP_SET: %p does not exist\n", l);
			abort();
		}
		pthread_mutex_unlock(&lwplock);

		pthread_setspecific(curlwpkey, rl);
		break;
	case RUMPUSER_LWP_CLEAR:
		assert(((struct rumpuser_lwp *)
		    pthread_getspecific(curlwpkey))->l == l);
		pthread_setspecific(curlwpkey, NULL);
		break;
	}
}

struct lwp *
rumpuser_curlwp(void)
{
	struct rumpuser_lwp *rl;

	rl = pthread_getspecific(curlwpkey);
	return rl ? rl->l : NULL;
}

#else

void
rumpuser_curlwpop(int enum_rumplwpop, struct lwp *l)
{
	enum rumplwpop op = enum_rumplwpop;

	switch (op) {
	case RUMPUSER_LWP_CREATE:
		break;
	case RUMPUSER_LWP_DESTROY:
		break;
	case RUMPUSER_LWP_SET:
		assert(pthread_getspecific(curlwpkey) == NULL);
		pthread_setspecific(curlwpkey, l);
		break;
	case RUMPUSER_LWP_CLEAR:
		assert(pthread_getspecific(curlwpkey) == l);
		pthread_setspecific(curlwpkey, NULL);
		break;
	}
}

struct lwp *
rumpuser_curlwp(void)
{

	return pthread_getspecific(curlwpkey);
}
#endif

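/*
 * Create the thread-specific data key used to store curlwp.
 */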
void
rumpuser__thrinit(void)
{
	pthread_key_create(&curlwpkey, NULL);
}