/*	$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 48 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;

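/*
 * Sentinel values stored in rcpu_prevlwp when it does not point to an
 * actual lwp: BUSY means the rump CPU is currently taken, WANTED means
 * somebody is sleeping on rcpu_cv waiting for it to be released.  Any
 * other value is the lwp which last ran on (and released) the CPU,
 * which is what the fastpath CAS in rump_schedule_cpu_interlock() tests.
 */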
#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless the
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default to the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

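/*
 * Look up a (rump) CPU by index.  Slots which rump_scheduler_init()
 * has not populated hold a NULL ci pointer.
 */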
struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

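/*
 * Pick a migration target round-robin over the configured rump CPUs.
 * Precision is not important here; the counter only needs to spread
 * migrating threads around.
 */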
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	cpu_setmodel("rumpcore (virtual)");

	mi_cpu_init();

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

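/*
 * Initialize the scheduler proper: the boot CPU uses the statically
 * allocated rump_bootcpu, the rest get a freshly allocated cpu_info.
 * Each rump CPU also gets its hypervisor mutex/condvar pair and the
 * spc_mutex used by the rest of the kernel, and the lwp0
 * serialization primitives and unruntime_lock are set up.
 */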
void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

	rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	rumpuser_cv_signal(cv);
	rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and the rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		struct proc *p = l->l_proc;
		rump_schedule_cpu(l);
		if (l->l_cred != p->p_cred) {
			kauth_cred_t oc = l->l_cred;
			mutex_enter(p->p_lock);
			l->l_cred = kauth_cred_hold(p->p_cred);
			mutex_exit(p->p_lock);
			kauth_cred_free(oc);
		}
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ru.ru_nvcsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp == cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_onproc = l;
}

void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}
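
/*
 * Usage sketch: a host thread entering the rump kernel brackets the
 * in-kernel work with the schedule/unschedule pair above, roughly
 *
 *	rump_schedule();
 *	error = kern_op();
 *	rump_unschedule();
 *
 * where kern_op() is a hypothetical placeholder for whatever kernel
 * routine the caller wants to run with a valid curlwp and rump CPU.
 */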

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

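/*
 * Actually release the rump CPU: store the departing lwp back into
 * rcpu_prevlwp so that a subsequent schedule by the same thread can
 * take the fastpath, and wake up anyone who marked the CPU
 * RCPULWP_WANTED while we held it.
 */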
void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller,
	 * we need to obtain it before we release the CPU.  Otherwise,
	 * we risk a race condition where another thread is scheduled
	 * onto the rump kernel CPU before our current thread can grab
	 * the interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_release(); /* XXX what does this pair with? */

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

	sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

	rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

	return false;
}