1/*-
2 * Copyright (c) 2010 Marcel Moolenaar
3 * Copyright (c) 1999-2004 Poul-Henning Kamp
4 * Copyright (c) 1999 Michael Smith
5 * Copyright (c) 1989, 1993
6 *      The Regents of the University of California.  All rights reserved.
7 * (c) UNIX System Laboratories, Inc.
8 * All or some portions of this file are derived from material licensed
9 * to the University of California by American Telephone and Telegraph
10 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11 * the permission of UNIX System Laboratories, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#include "opt_rootdevname.h"
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: stable/11/sys/kern/vfs_mountroot.c 353717 2019-10-18 03:38:02Z kp $");
42
43#include <sys/param.h>
44#include <sys/conf.h>
45#include <sys/cons.h>
46#include <sys/fcntl.h>
47#include <sys/jail.h>
48#include <sys/kernel.h>
49#include <sys/malloc.h>
50#include <sys/mdioctl.h>
51#include <sys/mount.h>
52#include <sys/mutex.h>
53#include <sys/namei.h>
54#include <sys/priv.h>
55#include <sys/proc.h>
56#include <sys/filedesc.h>
57#include <sys/reboot.h>
58#include <sys/sbuf.h>
59#include <sys/stat.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysproto.h>
62#include <sys/sx.h>
63#include <sys/sysctl.h>
64#include <sys/sysent.h>
65#include <sys/systm.h>
66#include <sys/vnode.h>
67
68#include <geom/geom.h>
69
70/*
71 * The root filesystem is detailed in the kernel environment variable
72 * vfs.root.mountfrom, which is expected to be in the general format
73 *
74 * <vfsname>:[<path>][	<vfsname>:[<path>] ...]
75 * vfsname   := the name of a VFS known to the kernel and capable
76 *              of being mounted as root
77 * path      := disk device name or other data used by the filesystem
78 *              to locate its physical store
79 *
80 * If the environment variable vfs.root.mountfrom is a space separated list,
81 * each list element is tried in turn and the root filesystem will be mounted
82 * from the first one that succeeds.
83 *
84 * The environment variable vfs.root.mountfrom.options is a comma delimited
85 * set of string mount options.  These mount options must be parseable
86 * by nmount() in the kernel.
87 */
88
/*
 * Forward declarations of file-local functions that are referenced before
 * their definitions appear further down in this file.
 */
static int parse_mount(char **);
static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
static void vfs_mountroot_wait(void);
static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
94
/*
 * The vnode of the system's root (/ in the filesystem, without chroot
 * active.)
 */
struct vnode *rootvnode;

/*
 * Mount of the system's /dev.
 */
struct mount *rootdevmp;

/*
 * Up to two extra root specifications.  Each non-NULL entry is appended
 * as a line to the default configuration script built by
 * vfs_mountroot_conf0().
 */
char *rootdevnames[2] = {NULL, NULL};

/*
 * Mutex taken around every access to the root_holds list in this file;
 * it is also held while root_mount_complete is set at the end of
 * vfs_mountroot().
 */
struct mtx root_holds_mtx;
MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);
110
/*
 * One outstanding root mount hold.  Tokens are created by
 * root_mount_hold() and destroyed by root_mount_rel().
 */
struct root_hold_token {
	const char			*who;	/* caller-supplied identifier (not copied) */
	LIST_ENTRY(root_hold_token)	list;	/* linkage on root_holds */
};

/* All outstanding holds; vfs_mountroot_wait() sleeps until this is empty. */
static LIST_HEAD(, root_hold_token)	root_holds =
    LIST_HEAD_INITIALIZER(root_holds);
118
/*
 * What vfs_mountroot_parse() does when a configuration script ends
 * without a root file system having been mounted.  Selected by the
 * ".onfail" directive.
 */
enum action {
	A_CONTINUE,	/* return the parser status to the caller */
	A_PANIC,	/* panic() */
	A_REBOOT,	/* kern_reboot(RB_NOSYNC) */
	A_RETRY		/* run the same script again from the start */
};

/* Reset to A_CONTINUE at the start of every pass over a script. */
static enum action root_mount_onfail = A_CONTINUE;
127
/*
 * Unit number of the md device attached by the last ".md" directive, or
 * -1 when there is none (set to -1 on entry to vfs_mountroot_parse()).
 */
static int root_mount_mddev;
/* Set to 1 at the end of vfs_mountroot(); reported by root_mounted(). */
static int root_mount_complete;

/* By default wait up to 3 seconds for devices to appear. */
static int root_mount_timeout = 3;
TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);

/*
 * When non-zero, vfs_mountroot_wait_if_neccessary() always waits for all
 * root mount holds instead of first checking whether the device exists.
 */
static int root_mount_always_wait = 0;
SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN,
    &root_mount_always_wait, 0,
    "Wait for root mount holds even if the root device already exists");

/* Read-only string sysctl served by sysctl_vfs_root_mount_hold(). */
SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vfs_root_mount_hold, "A",
    "List of root mount hold tokens");
144
145static int
146sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS)
147{
148	struct sbuf sb;
149	struct root_hold_token *h;
150	int error;
151
152	sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL);
153
154	mtx_lock(&root_holds_mtx);
155	LIST_FOREACH(h, &root_holds, list) {
156		if (h != LIST_FIRST(&root_holds))
157			sbuf_putc(&sb, ' ');
158		sbuf_printf(&sb, "%s", h->who);
159	}
160	mtx_unlock(&root_holds_mtx);
161
162	error = sbuf_finish(&sb);
163	if (error == 0)
164		error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
165	sbuf_delete(&sb);
166	return (error);
167}
168
169struct root_hold_token *
170root_mount_hold(const char *identifier)
171{
172	struct root_hold_token *h;
173
174	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
175	h->who = identifier;
176	mtx_lock(&root_holds_mtx);
177	LIST_INSERT_HEAD(&root_holds, h, list);
178	mtx_unlock(&root_holds_mtx);
179	return (h);
180}
181
182void
183root_mount_rel(struct root_hold_token *h)
184{
185
186	if (h == NULL)
187		return;
188
189	mtx_lock(&root_holds_mtx);
190	LIST_REMOVE(h, list);
191	wakeup(&root_holds);
192	mtx_unlock(&root_holds_mtx);
193	free(h, M_DEVBUF);
194}
195
196int
197root_mounted(void)
198{
199
200	/* No mutex is acquired here because int stores are atomic. */
201	return (root_mount_complete);
202}
203
204static void
205set_rootvnode(void)
206{
207	struct proc *p;
208
209	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
210		panic("Cannot find root vnode");
211
212	VOP_UNLOCK(rootvnode, 0);
213
214	p = curthread->td_proc;
215	FILEDESC_XLOCK(p->p_fd);
216
217	if (p->p_fd->fd_cdir != NULL)
218		vrele(p->p_fd->fd_cdir);
219	p->p_fd->fd_cdir = rootvnode;
220	VREF(rootvnode);
221
222	if (p->p_fd->fd_rdir != NULL)
223		vrele(p->p_fd->fd_rdir);
224	p->p_fd->fd_rdir = rootvnode;
225	VREF(rootvnode);
226
227	FILEDESC_XUNLOCK(p->p_fd);
228}
229
/*
 * Install devfs as the initial root file system.
 *
 * If rootdevmp is already set, that mount is busied and reused.
 * Otherwise a devfs mount is allocated for "/dev", mounted, given an
 * empty option list, inserted at the head of the mountlist and recorded
 * in rootdevmp.  In both cases set_rootvnode() is then called and a
 * symlink named "dev" pointing at "/" is created.
 *
 * *mpp is set to the devfs mount once one has been selected and stays
 * NULL on earlier failures.  Returns 0 on success or the error of the
 * failing step (including a failure to create the symlink).
 */
static int
vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
{
	struct vfsoptlist *opts;
	struct vfsconf *vfsp;
	struct mount *mp;
	int error;

	*mpp = NULL;

	if (rootdevmp != NULL) {
		/*
		 * Already have /dev; this happens during rerooting.
		 */
		error = vfs_busy(rootdevmp, 0);
		if (error != 0)
			return (error);
		*mpp = rootdevmp;
	} else {
		vfsp = vfs_byname("devfs");
		KASSERT(vfsp != NULL, ("Could not find devfs by name"));
		/*
		 * NOTE(review): the explicit checks after each KASSERT
		 * presumably cover kernels built without assertions.
		 */
		if (vfsp == NULL)
			return (ENOENT);

		mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);

		error = VFS_MOUNT(mp);
		KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
		if (error)
			return (error);

		/* Fill in mp->mnt_stat for the new mount. */
		error = VFS_STATFS(mp, &mp->mnt_stat);
		KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error));
		if (error)
			return (error);

		/* Give the mount an empty option list. */
		opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
		TAILQ_INIT(opts);
		mp->mnt_opt = opts;

		mtx_lock(&mountlist_mtx);
		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
		mtx_unlock(&mountlist_mtx);

		*mpp = mp;
		rootdevmp = mp;
	}

	set_rootvnode();

	/* Create the symlink "dev" -> "/" relative to the current directory. */
	error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE);
	if (error)
		printf("kern_symlink /dev -> / returns %d\n", error);

	return (error);
}
286
/*
 * Rearrange the mountlist and the covered-vnode links after a new root
 * file system has been mounted.
 *
 * mpdevfs is the devfs mount; the mount following it on the mountlist is
 * treated as the new root.  The new root ends up at the head of the list
 * and devfs at the tail, rootvnode is re-established with
 * set_rootvnode(), the previous root (when it was not devfs itself) is
 * re-attached under /.mount or /mnt, and devfs is re-attached under /dev.
 * Failures to re-attach are reported with printf() only; nothing is
 * returned to the caller.
 */
static void
vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
{
	struct nameidata nd;
	struct mount *mporoot, *mpnroot;
	struct vnode *vp, *vporoot, *vpdevfs;
	char *fspath;
	int error;

	/* The new root is whatever was mounted right after devfs. */
	mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);

	/* Shuffle the mountlist. */
	mtx_lock(&mountlist_mtx);
	mporoot = TAILQ_FIRST(&mountlist);
	TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
	if (mporoot != mpdevfs) {
		/* The old root was not devfs: move the new root to the head. */
		TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
		TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
	}
	TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
	mtx_unlock(&mountlist_mtx);

	cache_purgevfs(mporoot, true);
	if (mporoot != mpdevfs)
		cache_purgevfs(mpdevfs, true);

	VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot);

	/* Detach the old root: it no longer covers anything and is not MNT_ROOTFS. */
	VI_LOCK(vporoot);
	vporoot->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vporoot);
	vporoot->v_mountedhere = NULL;
	mporoot->mnt_flag &= ~MNT_ROOTFS;
	mporoot->mnt_vnodecovered = NULL;
	vput(vporoot);

	/* Set up the new rootvnode, and purge the cache */
	mpnroot->mnt_vnodecovered = NULL;
	set_rootvnode();
	cache_purgevfs(rootvnode->v_mount, true);

	if (mporoot != mpdevfs) {
		/* Remount old root under /.mount or /mnt */
		fspath = "/.mount";
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
		    fspath, td);
		error = namei(&nd);
		if (error) {
			/* No /.mount in the new root; fall back to /mnt. */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			fspath = "/mnt";
			NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
			    fspath, td);
			error = namei(&nd);
		}
		if (!error) {
			vp = nd.ni_vp;
			/* The mount point must be a directory. */
			error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
			if (!error)
				error = vinvalbuf(vp, V_SAVE, 0, 0);
			if (!error) {
				cache_purge(vp);
				mporoot->mnt_vnodecovered = vp;
				vp->v_mountedhere = mporoot;
				strlcpy(mporoot->mnt_stat.f_mntonname,
				    fspath, MNAMELEN);
				/* Unlock only; the vnode stays referenced as the covered vnode. */
				VOP_UNLOCK(vp, 0);
			} else
				vput(vp);
		}
		NDFREE(&nd, NDF_ONLY_PNBUF);

		if (error)
			printf("mountroot: unable to remount previous root "
			    "under /.mount or /mnt (error %d)\n", error);
	}

	/* Remount devfs under /dev */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
	error = namei(&nd);
	if (!error) {
		vp = nd.ni_vp;
		error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
		if (!error)
			error = vinvalbuf(vp, V_SAVE, 0, 0);
		if (!error) {
			/* Release the vnode devfs covered before, if any. */
			vpdevfs = mpdevfs->mnt_vnodecovered;
			if (vpdevfs != NULL) {
				cache_purge(vpdevfs);
				vpdevfs->v_mountedhere = NULL;
				vrele(vpdevfs);
			}
			mpdevfs->mnt_vnodecovered = vp;
			vp->v_mountedhere = mpdevfs;
			VOP_UNLOCK(vp, 0);
		} else
			vput(vp);
	}
	if (error)
		printf("mountroot: unable to remount devfs under /dev "
		    "(error %d)\n", error);
	NDFREE(&nd, NDF_ONLY_PNBUF);

	if (mporoot == mpdevfs) {
		/* devfs itself was the old root: drop the busy reference on it. */
		vfs_unbusy(mpdevfs);
		/* Unlink the no longer needed /dev/dev -> / symlink */
		error = kern_unlinkat(td, AT_FDCWD, "/dev/dev",
		    UIO_SYSSPACE, 0);
		if (error)
			printf("mountroot: unable to unlink /dev/dev "
			    "(error %d)\n", error);
	}
}
399
400/*
401 * Configuration parser.
402 */
403
/*
 * Character classes accepted by parse_skipto() in place of a literal
 * character to search for.
 */
#define	CC_WHITESPACE		(-1)
#define	CC_NONWHITESPACE	(-2)

/* Negative status codes returned by the parser primitives. */
#define	PE_EOF			(-1)
#define	PE_EOL			(-2)
411
/* Return the character at the current parse position without consuming it. */
static __inline int
parse_peek(char **conf)
{
	const char *cur;

	cur = *conf;
	return (*cur);
}
418
/* Overwrite the character at the current parse position with c. */
static __inline void
parse_poke(char **conf, int c)
{
	char *cur;

	cur = *conf;
	*cur = c;
}
425
/* Move the parse position forward by one character. */
static __inline void
parse_advance(char **conf)
{

	*conf = *conf + 1;
}
432
433static int
434parse_skipto(char **conf, int mc)
435{
436	int c, match;
437
438	while (1) {
439		c = parse_peek(conf);
440		if (c == 0)
441			return (PE_EOF);
442		switch (mc) {
443		case CC_WHITESPACE:
444			match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
445			break;
446		case CC_NONWHITESPACE:
447			if (c == '\n')
448				return (PE_EOL);
449			match = (c != ' ' && c != '\t') ? 1 : 0;
450			break;
451		default:
452			match = (c == mc) ? 1 : 0;
453			break;
454		}
455		if (match)
456			break;
457		parse_advance(conf);
458	}
459	return (0);
460}
461
462static int
463parse_token(char **conf, char **tok)
464{
465	char *p;
466	size_t len;
467	int error;
468
469	*tok = NULL;
470	error = parse_skipto(conf, CC_NONWHITESPACE);
471	if (error)
472		return (error);
473	p = *conf;
474	error = parse_skipto(conf, CC_WHITESPACE);
475	len = *conf - p;
476	*tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
477	bcopy(p, *tok, len);
478	return (0);
479}
480
481static void
482parse_dir_ask_printenv(const char *var)
483{
484	char *val;
485
486	val = kern_getenv(var);
487	if (val != NULL) {
488		printf("  %s=%s\n", var, val);
489		freeenv(val);
490	}
491}
492
493static int
494parse_dir_ask(char **conf)
495{
496	char name[80];
497	char *mnt;
498	int error;
499
500	vfs_mountroot_wait();
501
502	printf("\nLoader variables:\n");
503	parse_dir_ask_printenv("vfs.root.mountfrom");
504	parse_dir_ask_printenv("vfs.root.mountfrom.options");
505
506	printf("\nManual root filesystem specification:\n");
507	printf("  <fstype>:<device> [options]\n");
508	printf("      Mount <device> using filesystem <fstype>\n");
509	printf("      and with the specified (optional) option list.\n");
510	printf("\n");
511	printf("    eg. ufs:/dev/da0s1a\n");
512	printf("        zfs:tank\n");
513	printf("        cd9660:/dev/cd0 ro\n");
514	printf("          (which is equivalent to: ");
515	printf("mount -t cd9660 -o ro /dev/cd0 /)\n");
516	printf("\n");
517	printf("  ?               List valid disk boot devices\n");
518	printf("  .               Yield 1 second (for background tasks)\n");
519	printf("  <empty line>    Abort manual input\n");
520
521	do {
522		error = EINVAL;
523		printf("\nmountroot> ");
524		cngets(name, sizeof(name), GETS_ECHO);
525		if (name[0] == '\0')
526			break;
527		if (name[0] == '?' && name[1] == '\0') {
528			printf("\nList of GEOM managed disk devices:\n  ");
529			g_dev_print();
530			continue;
531		}
532		if (name[0] == '.' && name[1] == '\0') {
533			pause("rmask", hz);
534			continue;
535		}
536		mnt = name;
537		error = parse_mount(&mnt);
538		if (error == -1)
539			printf("Invalid file system specification.\n");
540	} while (error != 0);
541
542	return (error);
543}
544
545static int
546parse_dir_md(char **conf)
547{
548	struct stat sb;
549	struct thread *td;
550	struct md_ioctl *mdio;
551	char *path, *tok;
552	int error, fd, len;
553
554	td = curthread;
555
556	error = parse_token(conf, &tok);
557	if (error)
558		return (error);
559
560	len = strlen(tok);
561	mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
562	path = (void *)(mdio + 1);
563	bcopy(tok, path, len);
564	free(tok, M_TEMP);
565
566	/* Get file status. */
567	error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL);
568	if (error)
569		goto out;
570
571	/* Open /dev/mdctl so that we can attach/detach. */
572	error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE,
573	    O_RDWR, 0);
574	if (error)
575		goto out;
576
577	fd = td->td_retval[0];
578	mdio->md_version = MDIOVERSION;
579	mdio->md_type = MD_VNODE;
580
581	if (root_mount_mddev != -1) {
582		mdio->md_unit = root_mount_mddev;
583		DROP_GIANT();
584		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
585		PICKUP_GIANT();
586		/* Ignore errors. We don't care. */
587		root_mount_mddev = -1;
588	}
589
590	mdio->md_file = (void *)(mdio + 1);
591	mdio->md_options = MD_AUTOUNIT | MD_READONLY;
592	mdio->md_mediasize = sb.st_size;
593	mdio->md_unit = 0;
594	DROP_GIANT();
595	error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
596	PICKUP_GIANT();
597	if (error)
598		goto out;
599
600	if (mdio->md_unit > 9) {
601		printf("rootmount: too many md units\n");
602		mdio->md_file = NULL;
603		mdio->md_options = 0;
604		mdio->md_mediasize = 0;
605		DROP_GIANT();
606		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
607		PICKUP_GIANT();
608		/* Ignore errors. We don't care. */
609		error = ERANGE;
610		goto out;
611	}
612
613	root_mount_mddev = mdio->md_unit;
614	printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
615
616	error = kern_close(td, fd);
617
618 out:
619	free(mdio, M_TEMP);
620	return (error);
621}
622
623static int
624parse_dir_onfail(char **conf)
625{
626	char *action;
627	int error;
628
629	error = parse_token(conf, &action);
630	if (error)
631		return (error);
632
633	if (!strcmp(action, "continue"))
634		root_mount_onfail = A_CONTINUE;
635	else if (!strcmp(action, "panic"))
636		root_mount_onfail = A_PANIC;
637	else if (!strcmp(action, "reboot"))
638		root_mount_onfail = A_REBOOT;
639	else if (!strcmp(action, "retry"))
640		root_mount_onfail = A_RETRY;
641	else {
642		printf("rootmount: %s: unknown action\n", action);
643		error = EINVAL;
644	}
645
646	free(action, M_TEMP);
647	return (0);
648}
649
650static int
651parse_dir_timeout(char **conf)
652{
653	char *tok, *endtok;
654	long secs;
655	int error;
656
657	error = parse_token(conf, &tok);
658	if (error)
659		return (error);
660
661	secs = strtol(tok, &endtok, 0);
662	error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
663	if (!error)
664		root_mount_timeout = secs;
665	free(tok, M_TEMP);
666	return (error);
667}
668
669static int
670parse_directive(char **conf)
671{
672	char *dir;
673	int error;
674
675	error = parse_token(conf, &dir);
676	if (error)
677		return (error);
678
679	if (strcmp(dir, ".ask") == 0)
680		error = parse_dir_ask(conf);
681	else if (strcmp(dir, ".md") == 0)
682		error = parse_dir_md(conf);
683	else if (strcmp(dir, ".onfail") == 0)
684		error = parse_dir_onfail(conf);
685	else if (strcmp(dir, ".timeout") == 0)
686		error = parse_dir_timeout(conf);
687	else {
688		printf("mountroot: invalid directive `%s'\n", dir);
689		/* Ignore the rest of the line. */
690		(void)parse_skipto(conf, '\n');
691		error = EINVAL;
692	}
693	free(dir, M_TEMP);
694	return (error);
695}
696
697static int
698parse_mount_dev_present(const char *dev)
699{
700	struct nameidata nd;
701	int error;
702
703	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
704	error = namei(&nd);
705	if (!error)
706		vput(nd.ni_vp);
707	NDFREE(&nd, NDF_ONLY_PNBUF);
708	return (error != 0) ? 0 : 1;
709}
710
/* Size of the buffer passed to the mount as the "errmsg" argument. */
#define	ERRMSGL	255
/*
 * Parse one "<fstype>:<device> [options]" specification at *conf and try
 * to mount it as the root file system.
 *
 * If an md device was attached by a ".md" directive, the '#' of a "md#"
 * substring in the device name is replaced by its unit digit.  The mount
 * is always requested with "ro"; further options are added through
 * parse_mountroot_options().  A failing mount is retried every hz / 10
 * ticks until root_mount_timeout seconds have been used up.
 *
 * Returns 0 on success, a negative parse error when no token or no ':'
 * is found, or an errno value when the mount fails (a negative result
 * from kernel_mount() is mapped to EDOOFUS).
 */
static int
parse_mount(char **conf)
{
	char *errmsg;
	struct mntarg *ma;
	char *dev, *fs, *opts, *tok;
	int delay, error, timeout;

	error = parse_token(conf, &tok);
	if (error)
		return (error);
	fs = tok;
	error = parse_skipto(&tok, ':');
	if (error) {
		free(fs, M_TEMP);
		return (error);
	}
	/* Split the token in place: "fs\0dev". */
	parse_poke(&tok, '\0');
	parse_advance(&tok);
	dev = tok;

	if (root_mount_mddev != -1) {
		/* Handle substitution for the md unit number. */
		tok = strstr(dev, "md#");
		if (tok != NULL)
			tok[2] = '0' + root_mount_mddev;
	}

	/* Parse options. */
	error = parse_token(conf, &tok);
	opts = (error == 0) ? tok : NULL;

	printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
	    (opts != NULL) ? opts : "");

	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);

	if (vfs_byname(fs) == NULL) {
		strlcpy(errmsg, "unknown file system", ERRMSGL);
		error = ENOENT;
		goto out;
	}

	error = vfs_mountroot_wait_if_neccessary(fs, dev);
	if (error != 0)
		goto out;

	delay = hz / 10;
	timeout = root_mount_timeout * hz;

	for (;;) {
		/* The argument list is rebuilt for every attempt. */
		ma = NULL;
		ma = mount_arg(ma, "fstype", fs, -1);
		ma = mount_arg(ma, "fspath", "/", -1);
		ma = mount_arg(ma, "from", dev, -1);
		ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
		ma = mount_arg(ma, "ro", NULL, 0);
		ma = parse_mountroot_options(ma, opts);

		error = kernel_mount(ma, MNT_ROOTFS);
		if (error == 0 || timeout <= 0)
			break;

		/*
		 * Report the first failure always, later ones only once per
		 * second and only when booting verbosely.
		 */
		if (root_mount_timeout * hz == timeout ||
		    (bootverbose && timeout % hz == 0)) {
			printf("Mounting from %s:%s failed with error %d; "
			    "retrying for %d more second%s\n", fs, dev, error,
			    timeout / hz, (timeout / hz > 1) ? "s" : "");
		}
		pause("rmretry", delay);
		timeout -= delay;
	}
 out:
	if (error) {
		printf("Mounting from %s:%s failed with error %d",
		    fs, dev, error);
		if (errmsg[0] != '\0')
			printf(": %s", errmsg);
		printf(".\n");
	}
	/* dev points into the fs allocation, so this frees both. */
	free(fs, M_TEMP);
	free(errmsg, M_TEMP);
	if (opts != NULL)
		free(opts, M_TEMP);
	/* kernel_mount can return -1 on error. */
	return ((error < 0) ? EDOOFUS : error);
}
#undef ERRMSGL
800
/*
 * Run the configuration script held in sb until a mount shows up on the
 * mountlist after mpdevfs or the script is exhausted.
 *
 * Lines starting with '#' are skipped, lines starting with '.' are
 * handed to parse_directive() and everything else to parse_mount().
 * When the script ends without a new mount, the action in
 * root_mount_onfail is carried out.
 *
 * Returns 0 when a mount follows mpdevfs on the mountlist (also when
 * that was already the case on entry); otherwise the last parser status
 * is returned.
 */
static int
vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
{
	struct mount *mp;
	char *conf;
	int error;

	/* No md device has been attached by this script yet. */
	root_mount_mddev = -1;

retry:
	conf = sbuf_data(sb);
	mp = TAILQ_NEXT(mpdevfs, mnt_list);
	error = (mp == NULL) ? 0 : EDOOFUS;
	root_mount_onfail = A_CONTINUE;
	while (mp == NULL) {
		error = parse_skipto(&conf, CC_NONWHITESPACE);
		if (error == PE_EOL) {
			/* Blank line: step over the newline. */
			parse_advance(&conf);
			continue;
		}
		if (error < 0)
			break;
		switch (parse_peek(&conf)) {
		case '#':
			error = parse_skipto(&conf, '\n');
			break;
		case '.':
			error = parse_directive(&conf);
			break;
		default:
			error = parse_mount(&conf);
			if (error == -1) {
				printf("mountroot: invalid file system "
				    "specification.\n");
				error = 0;
			}
			break;
		}
		/* Only negative (parser) errors stop the script. */
		if (error < 0)
			break;
		/* Ignore any trailing garbage on the line. */
		if (parse_peek(&conf) != '\n') {
			printf("mountroot: advancing to next directive...\n");
			(void)parse_skipto(&conf, '\n');
		}
		/* A successful mount appears right after devfs. */
		mp = TAILQ_NEXT(mpdevfs, mnt_list);
	}
	if (mp != NULL)
		return (0);

	/*
	 * We failed to mount (a new) root.
	 */
	switch (root_mount_onfail) {
	case A_CONTINUE:
		break;
	case A_PANIC:
		panic("mountroot: unable to (re-)mount root.");
		/* NOTREACHED */
	case A_RETRY:
		goto retry;
	case A_REBOOT:
		kern_reboot(RB_NOSYNC);
		/* NOTREACHED */
	}

	return (error);
}
869
870static void
871vfs_mountroot_conf0(struct sbuf *sb)
872{
873	char *s, *tok, *mnt, *opt;
874	int error;
875
876	sbuf_printf(sb, ".onfail panic\n");
877	sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
878	if (boothowto & RB_ASKNAME)
879		sbuf_printf(sb, ".ask\n");
880#ifdef ROOTDEVNAME
881	if (boothowto & RB_DFLTROOT)
882		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
883#endif
884	if (boothowto & RB_CDROM) {
885		sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
886		sbuf_printf(sb, ".timeout 0\n");
887		sbuf_printf(sb, "cd9660:/dev/cd1 ro\n");
888		sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
889	}
890	s = kern_getenv("vfs.root.mountfrom");
891	if (s != NULL) {
892		opt = kern_getenv("vfs.root.mountfrom.options");
893		tok = s;
894		error = parse_token(&tok, &mnt);
895		while (!error) {
896			sbuf_printf(sb, "%s %s\n", mnt,
897			    (opt != NULL) ? opt : "");
898			free(mnt, M_TEMP);
899			error = parse_token(&tok, &mnt);
900		}
901		if (opt != NULL)
902			freeenv(opt);
903		freeenv(s);
904	}
905	if (rootdevnames[0] != NULL)
906		sbuf_printf(sb, "%s\n", rootdevnames[0]);
907	if (rootdevnames[1] != NULL)
908		sbuf_printf(sb, "%s\n", rootdevnames[1]);
909#ifdef ROOTDEVNAME
910	if (!(boothowto & RB_DFLTROOT))
911		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
912#endif
913	if (!(boothowto & RB_ASKNAME))
914		sbuf_printf(sb, ".ask\n");
915}
916
917static int
918vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
919{
920	static char buf[128];
921	struct nameidata nd;
922	off_t ofs;
923	ssize_t resid;
924	int error, flags, len;
925
926	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
927	flags = FREAD;
928	error = vn_open(&nd, &flags, 0, NULL);
929	if (error)
930		return (error);
931
932	NDFREE(&nd, NDF_ONLY_PNBUF);
933	ofs = 0;
934	len = sizeof(buf) - 1;
935	while (1) {
936		error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
937		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
938		    NOCRED, &resid, td);
939		if (error)
940			break;
941		if (resid == len)
942			break;
943		buf[len - resid] = 0;
944		sbuf_printf(sb, "%s", buf);
945		ofs += len - resid;
946	}
947
948	VOP_UNLOCK(nd.ni_vp, 0);
949	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
950	return (error);
951}
952
953static void
954vfs_mountroot_wait(void)
955{
956	struct root_hold_token *h;
957	struct timeval lastfail;
958	int curfail;
959
960	curfail = 0;
961	while (1) {
962		DROP_GIANT();
963		g_waitidle();
964		PICKUP_GIANT();
965		mtx_lock(&root_holds_mtx);
966		if (LIST_EMPTY(&root_holds)) {
967			mtx_unlock(&root_holds_mtx);
968			break;
969		}
970		if (ppsratecheck(&lastfail, &curfail, 1)) {
971			printf("Root mount waiting for:");
972			LIST_FOREACH(h, &root_holds, list)
973				printf(" %s", h->who);
974			printf("\n");
975		}
976		msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
977		    hz);
978	}
979}
980
981static int
982vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev)
983{
984	int delay, timeout;
985
986	/*
987	 * In case of ZFS and NFS we don't have a way to wait for
988	 * specific device.  Also do the wait if the user forced that
989	 * behaviour by setting vfs.root_mount_always_wait=1.
990	 */
991	if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL ||
992	    dev[0] == '\0' || root_mount_always_wait != 0) {
993		vfs_mountroot_wait();
994		return (0);
995	}
996
997	/*
998	 * Otherwise, no point in waiting if the device is already there.
999	 * Note that we must wait for GEOM to finish reconfiguring itself,
1000	 * eg for geom_part(4) to finish tasting.
1001	 */
1002	DROP_GIANT();
1003	g_waitidle();
1004	PICKUP_GIANT();
1005	if (parse_mount_dev_present(dev))
1006		return (0);
1007
1008	/*
1009	 * No luck.  Let's wait.  This code looks weird, but it's that way
1010	 * to behave exactly as it used to work before.
1011	 */
1012	vfs_mountroot_wait();
1013	printf("mountroot: waiting for device %s...\n", dev);
1014	delay = hz / 10;
1015	timeout = root_mount_timeout * hz;
1016	do {
1017		pause("rmdev", delay);
1018		timeout -= delay;
1019	} while (timeout > 0 && !parse_mount_dev_present(dev));
1020
1021	if (timeout <= 0)
1022		return (ENODEV);
1023
1024	return (0);
1025}
1026
/*
 * Mount the root file system.
 *
 * A default configuration script is generated by vfs_mountroot_conf0()
 * and run with devfs installed as the initial root.  Every time a script
 * yields a mounted root, the mountlist is shuffled and /.mount.conf of
 * the new root becomes the next script; the loop ends as soon as a step
 * reports an error (for example because /.mount.conf cannot be opened).
 * Afterwards the time of day is initialized from the newest mount
 * timestamp, prison0's root is set to rootvnode, root_mount_complete is
 * set and the mountroot event handlers are invoked.
 */
void
vfs_mountroot(void)
{
	struct mount *mp;
	struct sbuf *sb;
	struct thread *td;
	time_t timebase;
	int error;

	td = curthread;

	sb = sbuf_new_auto();
	vfs_mountroot_conf0(sb);
	sbuf_finish(sb);

	error = vfs_mountroot_devfs(td, &mp);
	while (!error) {
		error = vfs_mountroot_parse(sb, mp);
		if (!error) {
			vfs_mountroot_shuffle(td, mp);
			/* Reuse the sbuf for the new root's /.mount.conf. */
			sbuf_clear(sb);
			error = vfs_mountroot_readconf(td, sb);
			sbuf_finish(sb);
		}
	}

	sbuf_delete(sb);

	/*
	 * Iterate over all currently mounted file systems and use
	 * the time stamp found to check and/or initialize the RTC.
	 * Call inittodr() only once and pass it the largest of the
	 * timestamps we encounter.
	 */
	timebase = 0;
	mtx_lock(&mountlist_mtx);
	mp = TAILQ_FIRST(&mountlist);
	while (mp != NULL) {
		if (mp->mnt_time > timebase)
			timebase = mp->mnt_time;
		mp = TAILQ_NEXT(mp, mnt_list);
	}
	mtx_unlock(&mountlist_mtx);
	inittodr(timebase);

	/* Keep prison0's root in sync with the global rootvnode. */
	mtx_lock(&prison0.pr_mtx);
	prison0.pr_root = rootvnode;
	vref(prison0.pr_root);
	mtx_unlock(&prison0.pr_mtx);

	/* Publish completion and wake up threads sleeping on the flag. */
	mtx_lock(&root_holds_mtx);
	atomic_store_rel_int(&root_mount_complete, 1);
	wakeup(&root_mount_complete);
	mtx_unlock(&root_holds_mtx);

	EVENTHANDLER_INVOKE(mountroot);
}
1085
1086static struct mntarg *
1087parse_mountroot_options(struct mntarg *ma, const char *options)
1088{
1089	char *p;
1090	char *name, *name_arg;
1091	char *val, *val_arg;
1092	char *opts;
1093
1094	if (options == NULL || options[0] == '\0')
1095		return (ma);
1096
1097	p = opts = strdup(options, M_MOUNT);
1098	if (opts == NULL) {
1099		return (ma);
1100	}
1101
1102	while((name = strsep(&p, ",")) != NULL) {
1103		if (name[0] == '\0')
1104			break;
1105
1106		val = strchr(name, '=');
1107		if (val != NULL) {
1108			*val = '\0';
1109			++val;
1110		}
1111		if( strcmp(name, "rw") == 0 ||
1112		    strcmp(name, "noro") == 0) {
1113			/*
1114			 * The first time we mount the root file system,
1115			 * we need to mount 'ro', so We need to ignore
1116			 * 'rw' and 'noro' mount options.
1117			 */
1118			continue;
1119		}
1120		name_arg = strdup(name, M_MOUNT);
1121		val_arg = NULL;
1122		if (val != NULL)
1123			val_arg = strdup(val, M_MOUNT);
1124
1125		ma = mount_arg(ma, name_arg, val_arg,
1126		    (val_arg != NULL ? -1 : 0));
1127	}
1128	free(opts, M_MOUNT);
1129	return (ma);
1130}
1131