1/*	$NetBSD: init_main.c,v 1.549 2024/03/05 20:59:41 thorpej Exp $	*/
2
3/*-
4 * Copyright (c) 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 *    notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 *    notice, this list of conditions and the following disclaimer in the
45 *    documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 *    may be used to endorse or promote products derived from this software
48 *    without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 *	@(#)init_main.c	8.16 (Berkeley) 5/14/95
63 */
64
65/*
66 * Copyright (c) 1995 Christopher G. Demetriou.  All rights reserved.
67 *
68 * Redistribution and use in source and binary forms, with or without
69 * modification, are permitted provided that the following conditions
70 * are met:
71 * 1. Redistributions of source code must retain the above copyright
72 *    notice, this list of conditions and the following disclaimer.
73 * 2. Redistributions in binary form must reproduce the above copyright
74 *    notice, this list of conditions and the following disclaimer in the
75 *    documentation and/or other materials provided with the distribution.
76 * 3. All advertising materials mentioning features or use of this software
77 *    must display the following acknowledgement:
78 *	This product includes software developed by the University of
79 *	California, Berkeley and its contributors.
80 * 4. Neither the name of the University nor the names of its contributors
81 *    may be used to endorse or promote products derived from this software
82 *    without specific prior written permission.
83 *
84 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
85 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
86 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
87 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
88 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
89 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
90 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
91 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
92 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
93 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
94 * SUCH DAMAGE.
95 *
96 *	@(#)init_main.c	8.16 (Berkeley) 5/14/95
97 */
98
99#include <sys/cdefs.h>
100__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.549 2024/03/05 20:59:41 thorpej Exp $");
101
102#include "opt_cnmagic.h"
103#include "opt_ddb.h"
104#include "opt_inet.h"
105#include "opt_ipsec.h"
106#include "opt_modular.h"
107#include "opt_ntp.h"
108#include "opt_pipe.h"
109#include "opt_syscall_debug.h"
110#include "opt_sysv.h"
111#include "opt_fileassoc.h"
112#include "opt_ktrace.h"
113#include "opt_pax.h"
114#include "opt_compat_netbsd.h"
115#include "opt_ptrace.h"
116#include "opt_splash.h"
117#include "opt_kernhist.h"
118#include "opt_gprof.h"
119
/*
 * Bounds of the built-in splash image.  main() derives the image size
 * from the distance between these two symbols and passes the start
 * address to splash_setimage().
 * NOTE(review): presumably emitted by the build when the image is
 * linked into the kernel -- confirm against the kernel makefiles.
 */
#if defined(SPLASHSCREEN) && defined(makeoptions_SPLASHSCREEN_IMAGE)
extern void *_binary_splash_image_start;
extern void *_binary_splash_image_end;
#endif
124
125#include "ksyms.h"
126
127#include "veriexec.h"
128
129#include <sys/param.h>
130#include <sys/acct.h>
131#include <sys/filedesc.h>
132#include <sys/file.h>
133#include <sys/errno.h>
134#include <sys/callout.h>
135#include <sys/cpu.h>
136#include <sys/cpufreq.h>
137#include <sys/spldebug.h>
138#include <sys/kernel.h>
139#include <sys/mount.h>
140#include <sys/proc.h>
141#include <sys/lwp.h>
142#include <sys/kthread.h>
143#include <sys/resourcevar.h>
144#include <sys/signalvar.h>
145#include <sys/systm.h>
146#include <sys/vnode.h>
147#include <sys/fstrans.h>
148#include <sys/tty.h>
149#include <sys/conf.h>
150#include <sys/disklabel.h>
151#include <sys/buf.h>
152#include <sys/device.h>
153#include <sys/exec.h>
154#include <sys/socketvar.h>
155#include <sys/protosw.h>
156#include <sys/percpu.h>
157#include <sys/pserialize.h>
158#include <sys/pset.h>
159#include <sys/sysctl.h>
160#include <sys/reboot.h>
161#include <sys/event.h>
162#include <sys/mbuf.h>
163#include <sys/sched.h>
164#include <sys/sleepq.h>
165#include <sys/ipi.h>
166#include <sys/iostat.h>
167#include <sys/vmem.h>
168#include <sys/uuid.h>
169#include <sys/extent.h>
170#include <sys/disk.h>
171#include <sys/msgbuf.h>
172#include <sys/module.h>
173#include <sys/module_hook.h>
174#include <sys/event.h>
175#include <sys/lockf.h>
176#include <sys/once.h>
177#include <sys/kcpuset.h>
178#include <sys/ksyms.h>
179#include <sys/uidinfo.h>
180#include <sys/kprintf.h>
181#include <sys/bufq.h>
182#include <sys/threadpool.h>
183#include <sys/futex.h>
184#ifdef IPSEC
185#include <netipsec/ipsec.h>
186#endif
187#include <sys/domain.h>
188#include <sys/namei.h>
189#include <sys/rnd.h>
190#include <sys/pipe.h>
191#if NVERIEXEC > 0
192#include <sys/verified_exec.h>
193#endif /* NVERIEXEC > 0 */
194#ifdef KTRACE
195#include <sys/ktrace.h>
196#endif
197#include <sys/kauth.h>
198#include <net80211/ieee80211_netbsd.h>
199#include <sys/cprng.h>
200#include <sys/psref.h>
201#include <sys/radixtree.h>
202#include <sys/heartbeat.h>
203
204#include <sys/syscall.h>
205#include <sys/syscallargs.h>
206
207#include <sys/pax.h>
208
209#include <dev/clock_subr.h>
210
211#include <secmodel/secmodel.h>
212
213#include <ufs/ufs/quota.h>
214
215#include <miscfs/genfs/genfs.h>
216#include <miscfs/specfs/specdev.h>
217
218#include <sys/cpu.h>
219
220#include <uvm/uvm.h>	/* extern struct uvm uvm */
221
222#include <dev/cons.h>
223#include <dev/splash/splash.h>
224
225#include <net/bpf.h>
226#include <net/if.h>
227#include <net/pfil.h>
228#include <net/raw_cb.h>
229#include <net/if_llatbl.h>
230
231#include <prop/proplib.h>
232
233#include <sys/userconf.h>
234
235extern time_t rootfstime;
236
237#ifndef curlwp
238struct	lwp *curlwp = &lwp0;
239#endif
240struct	proc *initproc;
241
242struct	vnode *rootvp, *swapdev_vp;
243int	boothowto;
244int	cold __read_mostly = 1;		/* still working on startup */
245int	shutting_down __read_mostly;	/* system is shutting down */
246
247int	start_init_exec;		/* semaphore for start_init() */
248
249static void check_console(struct lwp *l);
250static void start_init(void *);
251static void configure(void);
252static void configure2(void);
253static void configure3(void);
254void main(void);
255
256/*
257 * System startup; initialize the world, create process 0, mount root
258 * filesystem, and fork to create init and pagedaemon.  Most of the
259 * hard work is done in the lower-level initialization routines including
260 * startup(), which does memory initialization and autoconfiguration.
261 */
262void
263main(void)
264{
265	struct timespec time;
266	struct lwp *l;
267	struct proc *p;
268	int s, error;
269#ifdef NVNODE_IMPLICIT
270	int usevnodes;
271#endif
272	CPU_INFO_ITERATOR cii;
273	struct cpu_info *ci;
274
275#ifdef DIAGNOSTIC
276	/*
277	 * Verify that CPU_INFO_FOREACH() knows about the boot CPU
278	 * and only the boot CPU at this point.
279	 */
280	int cpucount = 0;
281	for (CPU_INFO_FOREACH(cii, ci)) {
282		KASSERT(ci == curcpu());
283		cpucount++;
284	}
285	KASSERT(cpucount == 1);
286#endif
287
288	l = &lwp0;
289#ifndef LWP0_CPU_INFO
290	l->l_cpu = curcpu();
291#endif
292	l->l_pflag |= LP_RUNNING;
293
294	/*
295	 * Attempt to find console and initialize
296	 * in case of early panic or other messages.
297	 */
298	consinit();
299#ifdef CNMAGIC
300	cn_set_magic(CNMAGIC);
301#endif
302
303	kernel_lock_init();
304	once_init();
305	todr_init();
306
307	mi_cpu_init();
308	kernconfig_lock_init();
309	kthread_sysinit();
310
311	/* Initialize the device switch tables. */
312	devsw_init();
313
314	/* Initialize event counters. */
315	evcnt_init();
316
317	uvm_init();
318	ubchist_init();
319	kcpuset_sysinit();
320
321	prop_kern_init();
322
323#if ((NKSYMS > 0) || (NDDB > 0) || (NMODULAR > 0))
324	ksyms_init();
325#endif
326	kprintf_init();
327
328	percpu_init();
329
330	/* Initialize radix trees (used by numerous subsystems). */
331	radix_tree_init();
332
333	/* Passive serialization. */
334	pserialize_init();
335
336	/* Initialize the extent manager. */
337	extent_init();
338
339	/* Do machine-dependent initialization. */
340	cpu_startup();
341
342	/* Initialize the sysctl subsystem. */
343	sysctl_init();
344
345	/* Initialize callouts, part 1. */
346	callout_startup();
347
348	/* Initialize the kernel authorization subsystem. */
349	kauth_init();
350
351	secmodel_init();
352
353	spec_init();
354
355	/*
356	 * Set BPF op vector.  Can't do this in bpf attach, since
357	 * network drivers attach before bpf.
358	 */
359	bpf_setops();
360
361	/* Initialize what we can in ipi(9) before CPUs are detected. */
362	ipi_sysinit();
363
364	/* Start module system. */
365	module_init();
366	module_hook_init();
367
368	/*
369	 * Initialize the kernel authorization subsystem and start the
370	 * default security model, if any. We need to do this early
371	 * enough so that subsystems relying on any of the aforementioned
372	 * can work properly. Since the security model may dictate the
373	 * credential inheritance policy, it is needed at least before
374	 * any process is created, specifically proc0.
375	 */
376	module_init_class(MODULE_CLASS_SECMODEL);
377
378	/* Initialize the buffer cache */
379	bufinit();
380	biohist_init();
381
382#ifdef KERNHIST
383	sysctl_kernhist_init();
384#endif
385
386
387#if defined(SPLASHSCREEN) && defined(SPLASHSCREEN_IMAGE)
388	size_t splash_size = (&_binary_splash_image_end -
389	    &_binary_splash_image_start) * sizeof(void *);
390	splash_setimage(&_binary_splash_image_start, splash_size);
391#endif
392
393	/* Initialize sockets. */
394	soinit();
395
396	/*
397	 * The following things must be done before autoconfiguration.
398	 */
399	rnd_init();		/* initialize entropy pool */
400
401	cprng_init();		/* initialize cryptographic PRNG */
402
403	/* Initialize process and pgrp structures. */
404	procinit();
405	lwpinit();
406
407	/* Must be called after lwpinit (lwpinit_specificdata) */
408	psref_init();
409
410	/* Initialize exec structures */
411	exec_init(1);		/* signal_init calls exechook_establish() */
412
413	/* Initialize signal-related data structures. */
414	signal_init();
415
416	/* Initialize resource management. */
417	resource_init();
418
419	/* Create process 0. */
420	proc0_init();
421	lwp0_init();
422
423	/* Disable preemption during boot. */
424	kpreempt_disable();
425
426	/* Initialize the threadpool system. */
427	threadpools_init();
428
429	/* Initialize the UID hash table. */
430	uid_init();
431
432	/* Charge root for one process. */
433	(void)chgproccnt(0, 1);
434
435	/* Initialize the run queues, turnstiles and sleep queues. */
436	sched_rqinit();
437	turnstile_init();
438	sleeptab_init(&sleeptab);
439
440	sched_init();
441
442	/* Initialize processor-sets */
443	psets_init();
444
445	/* Initialize cpufreq(9) */
446	cpufreq_init();
447
448	/* MI initialization of the boot cpu */
449	error = mi_cpu_attach(curcpu());
450	KASSERT(error == 0);
451
452	/* Initialize timekeeping. */
453	time_init();
454
455	/*
456	 * Initialize mbuf's.  Do this now because we might attempt to
457	 * allocate mbufs or mbuf clusters during autoconfiguration.
458	 */
459	mbinit();
460
461	/* Initialize I/O statistics. */
462	iostat_init();
463
464	/* Initialize the log device. */
465	loginit();
466
467	/* Second part of module system initialization. */
468	module_start_unload_thread();
469
470	/* Initialize autoconf data structures before any modules are loaded */
471	config_init_mi();
472
473	/* Initialize the file systems. */
474#ifdef NVNODE_IMPLICIT
475	/*
476	 * If maximum number of vnodes in namei vnode cache is not explicitly
477	 * defined in kernel config, adjust the number such as we use roughly
478	 * 10% of memory for vnodes and associated data structures in the
479	 * assumed worst case.  Do not provide fewer than NVNODE vnodes.
480	 */
481	usevnodes = calc_cache_size(vmem_size(kmem_arena, VMEM_FREE|VMEM_ALLOC),
482	    10, VNODE_KMEM_MAXPCT) / VNODE_COST;
483	if (usevnodes > desiredvnodes)
484		desiredvnodes = usevnodes;
485#endif /* NVNODE_IMPLICIT */
486#ifdef MAXFILES_IMPLICIT
487	/*
488	 * If maximum number of files is not explicitly defined in
489	 * kernel config, adjust the number so that it is somewhat
490	 * more reasonable on machines with larger memory sizes.
491	 * Arbitrary numbers are 20,000 files for 16GB RAM or more
492	 * and 10,000 files for 1GB RAM or more.
493	 *
494	 * XXXtodo: adjust this and other values totally dynamically
495	 */
496	if (ctob((uint64_t)physmem) >= 16ULL * 1024 * 1024 * 1024)
497		maxfiles = MAX(maxfiles, 20000);
498	if (ctob((uint64_t)physmem) >= 1024 * 1024 * 1024)
499		maxfiles = MAX(maxfiles, 10000);
500#endif /* MAXFILES_IMPLICIT */
501
502	/* Initialize fstrans. */
503	fstrans_init();
504
505	vfsinit();
506	lf_init();
507
508	/* Initialize the file descriptor system. */
509	fd_sys_init();
510
511	/* Initialize kqueue. */
512	kqueue_init();
513
514	inittimecounter();
515	ntp_init();
516
517	/* Initialize tty subsystem. */
518	tty_init();
519	ttyldisc_init();
520
521	/* Initialize the buffer cache, part 2. */
522	bufinit2();
523
524	/* Initialize the disk wedge subsystem. */
525	dkwedge_init();
526
527	/* Initialize pfil */
528	pfil_init();
529
530	/* Initialize interfaces. */
531	ifinit1();
532
533	spldebug_start();
534
535	/* Initialize sockets thread(s) */
536	soinit1();
537
538	/*
539	 * Initialize the bufq strategy sub-system and any built-in
540	 * strategy modules - they may be needed by some devices during
541	 * auto-configuration
542	 */
543	bufq_init();
544	module_init_class(MODULE_CLASS_BUFQ);
545
546	/* Configure the system hardware.  This will enable interrupts. */
547	configure();
548#ifdef __HAVE_LEGACY_INTRCNT
549	evcnt_attach_legacy_intrcnt();
550#endif
551
552	/* Enable deferred processing of RNG samples */
553	rnd_init_softint();
554
555	/* Once all CPUs are detected, initialize the per-CPU cprng_fast.  */
556	cprng_fast_init();
557
558	/*
559	 * Now that softints can be established, start monitoring
560	 * system heartbeat on all CPUs.
561	 */
562	heartbeat_start();
563
564	ssp_init();
565
566	ubc_init();		/* must be after autoconfig */
567
568	mm_init();
569
570	configure2();
571
572	/* Initialize the rest of ipi(9) after CPUs have been detected. */
573	ipi_percpu_init();
574
575	futex_sys_init();
576
577	/* Now timer is working.  Enable preemption. */
578	kpreempt_enable();
579
580	/* Get the threads going and into any sleeps before continuing. */
581	yield();
582
583	vmem_rehash_start();	/* must be before exec_init */
584
585#if NVERIEXEC > 0
586	/*
587	 * Initialise the Veriexec subsystem.
588	 */
589	veriexec_init();
590#endif /* NVERIEXEC > 0 */
591
592	pax_init();
593
594#ifdef	IPSEC
595	/* Attach network crypto subsystem */
596	ipsec_attach();
597#endif
598
599	/*
600	 * Initialize protocols.  Block reception of incoming packets
601	 * until everything is ready.
602	 */
603	s = splnet();
604	ifinit();
605#if defined(INET) || defined(INET6)
606	lltableinit();
607#endif
608	domaininit(true);
609	ifinit_post();
610	if_attachdomain();
611	splx(s);
612
613#ifdef GPROF
614	/* Initialize kernel profiling. */
615	kmstartup();
616#endif
617
618	/* Initialize system accounting. */
619	acct_init();
620
621#ifndef PIPE_SOCKETPAIR
622	/* Initialize pipes. */
623	pipe_init();
624#endif
625
626#ifdef KTRACE
627	/* Initialize ktrace. */
628	ktrinit();
629#endif
630
631	machdep_init();
632
633	procinit_sysctl();
634
635	scdebug_init();
636
637	/*
638	 * Create process 1 (init(8)).  We do this now, as Unix has
639	 * historically had init be process 1, and changing this would
640	 * probably upset a lot of people.
641	 *
642	 * Note that process 1 won't immediately exec init(8), but will
643	 * wait for us to inform it that the root file system has been
644	 * mounted.
645	 */
646	if (fork1(l, 0, SIGCHLD, NULL, 0, start_init, NULL, NULL))
647		panic("fork init");
648
649	/*
650	 * The initproc variable cannot be initialized in start_init as there
651	 * is a race between vfs_mountroot and start_init.
652	 */
653	mutex_enter(&proc_lock);
654	initproc = proc_find_raw(1);
655	mutex_exit(&proc_lock);
656
657	/*
658	 * Load any remaining builtin modules, and hand back temporary
659	 * storage to the VM system.  Then require force when loading any
660	 * remaining un-init'ed built-in modules to avoid later surprises.
661	 */
662	module_init_class(MODULE_CLASS_ANY);
663	module_builtin_require_force();
664
665	/*
666	 * Finalize configuration now that all real devices have been
667	 * found.  This needs to be done before the root device is
668	 * selected, since finalization may create the root device.
669	 */
670	config_finalize();
671
672	sysctl_finalize();
673
674	/*
675	 * Now that autoconfiguration has completed, we can determine
676	 * the root and dump devices.
677	 */
678	cpu_rootconf();
679	cpu_dumpconf();
680
681	/* Mount the root file system. */
682	do {
683		domountroothook(root_device);
684		if ((error = vfs_mountroot())) {
685			printf("cannot mount root, error = %d\n", error);
686			boothowto |= RB_ASKNAME;
687			setroot(root_device,
688			    (rootdev != NODEV) ? DISKPART(rootdev) : 0);
689		}
690	} while (error != 0);
691	mountroothook_destroy();
692
693	configure3();
694
695	/*
696	 * Initialise the time-of-day clock, passing the time recorded
697	 * in the root filesystem (if any) for use by systems that
698	 * don't have a non-volatile time-of-day device.
699	 */
700	inittodr(rootfstime);
701
702	/*
703	 * Now can look at time, having had a chance to verify the time
704	 * from the file system.  Reset l->l_rtime as it may have been
705	 * munched in mi_switch() after the time got set.
706	 */
707	getnanotime(&time);
708
709	mutex_enter(&proc_lock);
710	LIST_FOREACH(p, &allproc, p_list) {
711		KASSERT((p->p_flag & PK_MARKER) == 0);
712		mutex_enter(p->p_lock);
713		TIMESPEC_TO_TIMEVAL(&p->p_stats->p_start, &time);
714		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
715			lwp_lock(l);
716			memset(&l->l_rtime, 0, sizeof(l->l_rtime));
717			lwp_unlock(l);
718		}
719		mutex_exit(p->p_lock);
720	}
721	mutex_exit(&proc_lock);
722	binuptime(&curlwp->l_stime);
723
724	for (CPU_INFO_FOREACH(cii, ci)) {
725		ci->ci_schedstate.spc_lastmod = time_second;
726	}
727
728	/* Create the pageout daemon kernel thread. */
729	uvm_swap_init();
730	if (kthread_create(PRI_PGDAEMON, KTHREAD_MPSAFE, NULL, uvm_pageout,
731	    NULL, NULL, "pgdaemon"))
732		panic("fork pagedaemon");
733
734	/* Create the filesystem syncer kernel thread. */
735	if (kthread_create(PRI_IOFLUSH, KTHREAD_MPSAFE, NULL, sched_sync,
736	    NULL, NULL, "ioflush"))
737		panic("fork syncer");
738
739	/* Wait for final configure threads to complete. */
740	config_finalize_mountroot();
741
742	/*
743	 * Okay, now we can let init(8) exec!  It's off to userland!
744	 */
745	mutex_enter(&proc_lock);
746	start_init_exec = 1;
747	cv_broadcast(&lbolt);
748	mutex_exit(&proc_lock);
749
750	/* The scheduler is an infinite loop. */
751	uvm_scheduler();
752	/* NOTREACHED */
753}
754
755/*
756 * Configure the system's hardware.
757 */
758static void
759configure(void)
760{
761
762	/*
763	 * XXX
764	 * callout_setfunc() requires mutex(9) so it can't be in config_init()
765	 * on amiga and atari which use config_init() and autoconf(9) functions
766	 * to initialize console devices.
767	 */
768	config_twiddle_init();
769
770	pmf_init();
771
772	/* Initialize driver modules */
773	module_init_class(MODULE_CLASS_DRIVER);
774
775	userconf_init();
776	if (boothowto & RB_USERCONF)
777		userconf_prompt();
778
779	if ((boothowto & (AB_SILENT|AB_VERBOSE)) == AB_SILENT) {
780		printf_nolog("Detecting hardware...");
781	}
782
783	/*
784	 * Do the machine-dependent portion of autoconfiguration.  This
785	 * sets the configuration machinery here in motion by "finding"
786	 * the root bus.  When this function returns, we expect interrupts
787	 * to be enabled.
788	 */
789	cpu_configure();
790}
791
792static void
793configure2(void)
794{
795	CPU_INFO_ITERATOR cii;
796	struct cpu_info *ci;
797	int s;
798
799	/* Fix up CPU topology info, which has all been collected by now. */
800	cpu_topology_init();
801
802	/*
803	 * Now that we've found all the hardware, start the real time
804	 * and statistics clocks.
805	 */
806	initclocks();
807
808	cold = 0;	/* clocks are running, we're warm now! */
809	s = splsched();
810	curcpu()->ci_schedstate.spc_flags |= SPCF_RUNNING;
811	splx(s);
812
813	/* Setup the runqueues and scheduler. */
814	runq_init();
815	synch_init();
816
817	/* Boot the secondary processors. */
818	for (CPU_INFO_FOREACH(cii, ci)) {
819		uvm_cpu_attach(ci);
820	}
821
822	/* Decide how to partition free memory. */
823	uvm_page_rebucket();
824
825	mp_online = true;
826#if defined(MULTIPROCESSOR)
827	cpu_boot_secondary_processors();
828#endif
829
830	/*
831	 * Bus scans can make it appear as if the system has paused, so
832	 * twiddle constantly while config_interrupts() jobs are running.
833	 */
834	config_twiddle_fn(NULL);
835
836	/*
837	 * Create threads to call back and finish configuration for
838	 * devices that want interrupts enabled.
839	 */
840	config_create_interruptthreads();
841}
842
/*
 * Third stage of configuration; main() calls this after the root
 * file system has been mounted.
 */
static void
configure3(void)
{

	/*
	 * Create threads to call back and finish configuration for
	 * devices that want the mounted root file system.
	 */
	config_create_mountrootthreads();

	/* Get the threads going and into any sleeps before continuing. */
	yield();
}
856
857static void
858rootconf_handle_wedges(void)
859{
860	struct disklabel label;
861	struct partition *p;
862	struct vnode *vp;
863	daddr_t startblk;
864	uint64_t nblks;
865	device_t dev;
866	int error;
867
868	if (booted_nblks) {
869		/*
870		 * bootloader passed geometry
871		 */
872		dev      = booted_device;
873		startblk = booted_startblk;
874		nblks    = booted_nblks;
875
876		/*
877		 * keep booted_device and booted_partition
878		 * in case the kernel doesn't identify a wedge
879		 */
880	} else {
881		/*
882		 * bootloader passed partition number
883		 *
884		 * We cannot ask the partition device directly when it is
885		 * covered by a wedge. Instead we look up the geometry in
886		 * the disklabel.
887		 */
888		vp = opendisk(booted_device);
889
890		if (vp == NULL)
891			return;
892
893		VOP_UNLOCK(vp);
894		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
895		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
896		VOP_CLOSE(vp, FREAD, NOCRED);
897		vput(vp);
898		if (error)
899			return;
900
901		KASSERT(booted_partition >= 0
902			&& booted_partition < MAXPARTITIONS);
903
904		p = &label.d_partitions[booted_partition];
905
906		dev      = booted_device;
907		startblk = p->p_offset;
908		nblks    = p->p_size;
909	}
910
911	dev = dkwedge_find_partition(dev, startblk, nblks);
912	if (dev != NULL) {
913		booted_device = dev;
914		booted_partition = 0;
915	}
916}
917
918void
919rootconf(void)
920{
921	if (booted_device != NULL)
922		rootconf_handle_wedges();
923
924	setroot(booted_device, booted_partition);
925}
926
927static void
928check_console(struct lwp *l)
929{
930	struct vnode *vp;
931	int error;
932
933	error = namei_simple_kernel("/dev/console",
934				NSM_FOLLOW_NOEMULROOT, &vp);
935	if (error == 0) {
936		vrele(vp);
937	} else if (error == ENOENT) {
938		if (boothowto & (AB_VERBOSE|AB_DEBUG))
939			printf("warning: no /dev/console\n");
940	} else {
941		printf("warning: lookup /dev/console: error %d\n", error);
942	}
943}
944
/*
 * List of paths to try when searching for "init".
 *
 * start_init() walks this table in order and also offers the current
 * entry as the default when it prompts for a path.  The terminating
 * NULL marks the end of the table.
 */
static const char * const initpaths[] = {
	"/sbin/init",
	"/sbin/oinit",
	"/sbin/init.bak",
	"/rescue/init",
	NULL,
};
955
/*
 * Start the initial user process; try exec'ing each pathname in "initpaths".
 * The program is invoked with one argument containing the boot flags.
 *
 * This is the entry point main() passes to fork1() for process 1.
 * It sleeps until main() sets start_init_exec, maps one page of user
 * stack for the exec arguments and then loops: pick a path (from
 * initpaths[], or from the console when RB_ASKNAME is set), copy the
 * arguments out to the user stack and call sys_execve().  It returns
 * only when the exec succeeded; a copyout() failure panics.
 *
 * arg is used as the struct lwp pointer of this LWP.
 * NOTE(review): main() passes NULL as the fork1() argument, so this
 * presumably relies on the fork path substituting the new LWP --
 * confirm against fork1()/lwp_create().
 */
static void
start_init(void *arg)
{
	struct lwp *l = arg;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	struct sys_execve_args /* {
		syscallarg(const char *) path;
		syscallarg(char * const *) argp;
		syscallarg(char * const *) envp;
	} */ args;
	int options, i, error;
	register_t retval[2];
	char flags[4], *flagsp;
	const char *path, *slash;
	char *ucp, **uap, *arg0, *arg1, *argv[3];
	char ipath[129];
	int ipx, len;

	/*
	 * Now in process 1.
	 */
	strncpy(p->p_comm, "init", MAXCOMLEN);

	/*
	 * Wait for main() to tell us that it's safe to exec.
	 */
	mutex_enter(&proc_lock);
	while (start_init_exec == 0)
		cv_wait(&lbolt, &proc_lock);
	mutex_exit(&proc_lock);

	/*
	 * This is not the right way to do this.  We really should
	 * hand-craft a descriptor onto /dev/console to hand to init,
	 * but that's a _lot_ more work, and the benefit from this easy
	 * hack makes up for the "good is the enemy of the best" effect.
	 */
	check_console(l);

	/*
	 * Need just enough stack to hold the faked-up "execve()" arguments.
	 */
	addr = (vaddr_t)STACK_ALLOC(USRSTACK, PAGE_SIZE);
	if (uvm_map(&p->p_vmspace->vm_map, &addr, PAGE_SIZE,
	    NULL, UVM_UNKNOWN_OFFSET, 0,
	    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_COPY,
	    UVM_ADV_NORMAL,
	    UVM_FLAG_FIXED|UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW)) != 0)
		panic("init: couldn't allocate argument space");
	p->p_vmspace->vm_maxsaddr = (void *)STACK_MAX(addr, PAGE_SIZE);

	/*
	 * ipx indexes the next initpaths[] candidate.  The loop below has
	 * no break: it is left only by returning after a successful exec,
	 * by the goto to copyerr, or by kern_reboot() not coming back.
	 */
	ipx = 0;
	while (1) {
		if (boothowto & RB_ASKNAME) {
			/*
			 * Interactive mode: read a line from the console.
			 * "halt", "reboot" and (with DDB) "ddb" are
			 * commands, an absolute path is used as typed, and
			 * an empty line takes the next default, if any.
			 */
			printf("init path");
			if (initpaths[ipx])
				printf(" (default %s)", initpaths[ipx]);
			printf(": ");
			len = cngetsn(ipath, sizeof(ipath)-1);
			if (len == 4 && strcmp(ipath, "halt") == 0) {
				kern_reboot(RB_HALT, NULL);
			} else if (len == 6 && strcmp(ipath, "reboot") == 0) {
				kern_reboot(0, NULL);
#if defined(DDB)
			} else if (len == 3 && strcmp(ipath, "ddb") == 0) {
				console_debugger();
				continue;
#endif
			} else if (len > 0 && ipath[0] == '/') {
				ipath[len] = '\0';
				path = ipath;
			} else if (len == 0 && initpaths[ipx] != NULL) {
				path = initpaths[ipx++];
			} else {
				printf("use absolute path, ");
#if defined(DDB)
				printf("\"ddb\", ");
#endif
				printf("\"halt\", or \"reboot\"\n");
				continue;
			}
		} else {
			/*
			 * Non-interactive mode: take the next table entry.
			 * When the table is exhausted, rewind it and switch
			 * to asking on the console.
			 */
			if ((path = initpaths[ipx++]) == NULL) {
				ipx = 0;
				boothowto |= RB_ASKNAME;
				continue;
			}
		}

		/* Start laying out the arguments from the stack base again. */
		ucp = (char *)USRSTACK;

		/*
		 * Construct the boot flag argument.
		 */
		flagsp = flags;
		*flagsp++ = '-';
		options = 0;

		if (boothowto & RB_SINGLE) {
			*flagsp++ = 's';
			options = 1;
		}
#ifdef notyet
		if (boothowto & RB_FASTBOOT) {
			*flagsp++ = 'f';
			options = 1;
		}
#endif

		/*
		 * Move out the flags (arg 1), if necessary.
		 */
		if (options != 0) {
			*flagsp++ = '\0';
			/* i is the flag string length including the NUL. */
			i = flagsp - flags;
#ifdef DEBUG
			aprint_normal("init: copying out flags `%s' %d\n", flags, i);
#endif
			arg1 = STACK_ALLOC(ucp, i);
			ucp = STACK_MAX(arg1, i);
			if ((error = copyout((void *)flags, arg1, i)) != 0)
				goto copyerr;
		} else
			arg1 = NULL;

		/*
		 * Move out the file name (also arg 0).
		 */
		i = strlen(path) + 1;
#ifdef DEBUG
		aprint_normal("init: copying out path `%s' %d\n", path, i);
#else
		/* Stay quiet only for the first default on a normal boot. */
		if (boothowto & RB_ASKNAME || path != initpaths[0])
			printf("init: trying %s\n", path);
#endif
		arg0 = STACK_ALLOC(ucp, i);
		ucp = STACK_MAX(arg0, i);
		if ((error = copyout(path, arg0, i)) != 0)
			goto copyerr;

		/*
		 * Move out the arg pointers.
		 */
		ucp = (void *)STACK_ALIGN(ucp, STACK_ALIGNBYTES);
		uap = (char **)STACK_ALLOC(ucp, sizeof(argv));
		SCARG(&args, path) = arg0;
		SCARG(&args, argp) = uap;
		SCARG(&args, envp) = NULL;
		slash = strrchr(path, '/');

		/*
		 * argv[0] points at the last path component inside the
		 * copied-out path (the whole string if there is no '/').
		 */
		argv[0] = slash ? arg0 + (slash + 1 - path) : arg0;
		argv[1] = arg1;
		argv[2] = NULL;
		if ((error = copyout(argv, uap, sizeof(argv))) != 0)
			goto copyerr;

		/*
		 * Now try to exec the program.  Every failure, including
		 * a missing file, is reported before the next candidate
		 * is tried.
		 */
		error = sys_execve(l, &args, retval);
		if (error == 0 || error == EJUSTRETURN) {
			KERNEL_UNLOCK_LAST(l);
			return;
		}
		printf("exec %s: error %d\n", path, error);
	}
	/*
	 * NOTE(review): the two statements below cannot be reached, since
	 * the loop above never breaks; kept as a backstop.
	 */
	printf("init: not found\n");
	panic("no init");
copyerr:
	panic("copyout %d", error);
}
1133
1134/*
1135 * calculate cache size (in bytes) from physmem and vsize.
1136 */
1137vaddr_t
1138calc_cache_size(vsize_t vsize, int pct, int va_pct)
1139{
1140	paddr_t t;
1141
1142	/* XXX should consider competing cache if any */
1143	/* XXX should consider submaps */
1144	t = (uintmax_t)physmem * pct / 100 * PAGE_SIZE;
1145	if (vsize != 0) {
1146		vsize = (uintmax_t)vsize * va_pct / 100;
1147		if (t > vsize) {
1148			t = vsize;
1149		}
1150	}
1151	return t;
1152}
1153
1154/*
1155 * Print the system start up banner.
1156 *
1157 * - Print a limited banner if AB_SILENT.
1158 * - Always send normal banner to the log.
1159 */
1160#define MEM_PBUFSIZE	sizeof("99999 MB")
1161
1162void
1163banner(void)
1164{
1165	static char notice[] = " Notice: this software is "
1166	    "protected by copyright";
1167	char pbuf[81];
1168	void (*pr)(const char *, ...) __printflike(1, 2);
1169	int i;
1170
1171	if ((boothowto & AB_SILENT) != 0) {
1172		snprintf(pbuf, sizeof(pbuf), "%s %s (%s)",
1173		    ostype, osrelease, kernel_ident);
1174		printf_nolog("%s", pbuf);
1175		for (i = 80 - strlen(pbuf) - sizeof(notice); i > 0; i--)
1176			printf(" ");
1177		printf_nolog("%s\n", notice);
1178		pr = aprint_normal;
1179	} else {
1180		pr = printf;
1181	}
1182
1183	memset(pbuf, 0, sizeof(pbuf));
1184	(*pr)("%s%s", copyright, version);
1185	format_bytes(pbuf, MEM_PBUFSIZE, ctob((uint64_t)physmem));
1186	(*pr)("total memory = %s\n", pbuf);
1187	format_bytes(pbuf, MEM_PBUFSIZE, ctob((uint64_t)uvm_availmem(false)));
1188	(*pr)("avail memory = %s\n", pbuf);
1189}
1190