1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2010 Marcel Moolenaar
5 * Copyright (c) 1999-2004 Poul-Henning Kamp
6 * Copyright (c) 1999 Michael Smith
7 * Copyright (c) 1989, 1993
8 *      The Regents of the University of California.  All rights reserved.
9 * (c) UNIX System Laboratories, Inc.
10 * All or some portions of this file are derived from material licensed
11 * to the University of California by American Telephone and Telegraph
12 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
13 * the permission of UNIX System Laboratories, Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 */
39
40#include "opt_rootdevname.h"
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD$");
44
45#include <sys/param.h>
46#include <sys/conf.h>
47#include <sys/cons.h>
48#include <sys/eventhandler.h>
49#include <sys/fcntl.h>
50#include <sys/jail.h>
51#include <sys/kernel.h>
52#include <sys/malloc.h>
53#include <sys/mdioctl.h>
54#include <sys/mount.h>
55#include <sys/mutex.h>
56#include <sys/namei.h>
57#include <sys/priv.h>
58#include <sys/proc.h>
59#include <sys/filedesc.h>
60#include <sys/reboot.h>
61#include <sys/sbuf.h>
62#include <sys/stat.h>
63#include <sys/syscallsubr.h>
64#include <sys/sysproto.h>
65#include <sys/sx.h>
66#include <sys/sysctl.h>
67#include <sys/sysent.h>
68#include <sys/systm.h>
69#include <sys/vnode.h>
70
71#include <geom/geom.h>
72
73/*
74 * The root filesystem is detailed in the kernel environment variable
75 * vfs.root.mountfrom, which is expected to be in the general format
76 *
77 * <vfsname>:[<path>][	<vfsname>:[<path>] ...]
78 * vfsname   := the name of a VFS known to the kernel and capable
79 *              of being mounted as root
80 * path      := disk device name or other data used by the filesystem
81 *              to locate its physical store
82 *
83 * If the environment variable vfs.root.mountfrom is a space separated list,
84 * each list element is tried in turn and the root filesystem will be mounted
85 * from the first one that succeeds.
86 *
87 * The environment variable vfs.root.mountfrom.options is a comma delimited
88 * set of string mount options.  These mount options must be parseable
89 * by nmount() in the kernel.
90 */
91
/* Forward declarations for helpers that are defined further down. */
static int parse_mount(char **conf);
static struct mntarg *parse_mountroot_options(struct mntarg *ma,
    const char *options);
static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
static void vfs_mountroot_wait(void);
static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
97
98/*
99 * The vnode of the system's root (/ in the filesystem, without chroot
100 * active.)
101 */
102struct vnode *rootvnode;
103
104/*
105 * Mount of the system's /dev.
106 */
107struct mount *rootdevmp;
108
109char *rootdevnames[2] = {NULL, NULL};
110
111struct mtx root_holds_mtx;
112MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);
113
114static TAILQ_HEAD(, root_hold_token)	root_holds =
115    TAILQ_HEAD_INITIALIZER(root_holds);
116
117enum action {
118	A_CONTINUE,
119	A_PANIC,
120	A_REBOOT,
121	A_RETRY
122};
123
124enum rh_flags {
125	RH_FREE,
126	RH_ALLOC,
127	RH_ARG,
128};
129
130static enum action root_mount_onfail = A_CONTINUE;
131
132static int root_mount_mddev;
133static int root_mount_complete;
134
135/* By default wait up to 3 seconds for devices to appear. */
136static int root_mount_timeout = 3;
137TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
138
139static int root_mount_always_wait = 0;
140SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN,
141    &root_mount_always_wait, 0,
142    "Wait for root mount holds even if the root device already exists");
143
144SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold,
145    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
146    NULL, 0, sysctl_vfs_root_mount_hold, "A",
147    "List of root mount hold tokens");
148
149static int
150sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS)
151{
152	struct sbuf sb;
153	struct root_hold_token *h;
154	int error;
155
156	sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL);
157
158	mtx_lock(&root_holds_mtx);
159	TAILQ_FOREACH(h, &root_holds, list) {
160		if (h != TAILQ_FIRST(&root_holds))
161			sbuf_putc(&sb, ' ');
162		sbuf_printf(&sb, "%s", h->who);
163	}
164	mtx_unlock(&root_holds_mtx);
165
166	error = sbuf_finish(&sb);
167	if (error == 0)
168		error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
169	sbuf_delete(&sb);
170	return (error);
171}
172
173struct root_hold_token *
174root_mount_hold(const char *identifier)
175{
176	struct root_hold_token *h;
177
178	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
179	h->flags = RH_ALLOC;
180	h->who = identifier;
181	mtx_lock(&root_holds_mtx);
182	TSHOLD("root mount");
183	TAILQ_INSERT_TAIL(&root_holds, h, list);
184	mtx_unlock(&root_holds_mtx);
185	return (h);
186}
187
188void
189root_mount_hold_token(const char *identifier, struct root_hold_token *h)
190{
191#ifdef INVARIANTS
192	struct root_hold_token *t;
193#endif
194
195	h->flags = RH_ARG;
196	h->who = identifier;
197	mtx_lock(&root_holds_mtx);
198#ifdef INVARIANTS
199	TAILQ_FOREACH(t, &root_holds, list) {
200		if (t == h) {
201			panic("Duplicate mount hold by '%s' on %p",
202			    identifier, h);
203		}
204	}
205#endif
206	TSHOLD("root mount");
207	TAILQ_INSERT_TAIL(&root_holds, h, list);
208	mtx_unlock(&root_holds_mtx);
209}
210
211void
212root_mount_rel(struct root_hold_token *h)
213{
214
215	if (h == NULL || h->flags == RH_FREE)
216		return;
217
218	mtx_lock(&root_holds_mtx);
219	TAILQ_REMOVE(&root_holds, h, list);
220	TSRELEASE("root mount");
221	wakeup(&root_holds);
222	mtx_unlock(&root_holds_mtx);
223	if (h->flags == RH_ALLOC) {
224		free(h, M_DEVBUF);
225	} else
226		h->flags = RH_FREE;
227}
228
229int
230root_mounted(void)
231{
232
233	/* No mutex is acquired here because int stores are atomic. */
234	return (root_mount_complete);
235}
236
237static void
238set_rootvnode(void)
239{
240
241	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
242		panic("set_rootvnode: Cannot find root vnode");
243
244	VOP_UNLOCK(rootvnode);
245
246	pwd_set_rootvnode();
247}
248
249static int
250vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
251{
252	struct vfsoptlist *opts;
253	struct vfsconf *vfsp;
254	struct mount *mp;
255	int error;
256
257	*mpp = NULL;
258
259	if (rootdevmp != NULL) {
260		/*
261		 * Already have /dev; this happens during rerooting.
262		 */
263		error = vfs_busy(rootdevmp, 0);
264		if (error != 0)
265			return (error);
266		*mpp = rootdevmp;
267	} else {
268		vfsp = vfs_byname("devfs");
269		KASSERT(vfsp != NULL, ("Could not find devfs by name"));
270		if (vfsp == NULL)
271			return (ENOENT);
272
273		mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
274
275		error = VFS_MOUNT(mp);
276		KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
277		if (error)
278			return (error);
279
280		error = VFS_STATFS(mp, &mp->mnt_stat);
281		KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error));
282		if (error)
283			return (error);
284
285		opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
286		TAILQ_INIT(opts);
287		mp->mnt_opt = opts;
288
289		mtx_lock(&mountlist_mtx);
290		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
291		mtx_unlock(&mountlist_mtx);
292
293		*mpp = mp;
294		rootdevmp = mp;
295		vfs_op_exit(mp);
296	}
297
298	set_rootvnode();
299
300	error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE);
301	if (error)
302		printf("kern_symlink /dev -> / returns %d\n", error);
303
304	return (error);
305}
306
307static void
308vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
309{
310	struct nameidata nd;
311	struct mount *mporoot, *mpnroot;
312	struct vnode *vp, *vporoot, *vpdevfs;
313	char *fspath;
314	int error;
315
316	mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);
317
318	/* Shuffle the mountlist. */
319	mtx_lock(&mountlist_mtx);
320	mporoot = TAILQ_FIRST(&mountlist);
321	TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
322	if (mporoot != mpdevfs) {
323		TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
324		TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
325	}
326	TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
327	mtx_unlock(&mountlist_mtx);
328
329	cache_purgevfs(mporoot);
330	if (mporoot != mpdevfs)
331		cache_purgevfs(mpdevfs);
332
333	if (VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot))
334		panic("vfs_mountroot_shuffle: Cannot find root vnode");
335
336	VI_LOCK(vporoot);
337	vporoot->v_iflag &= ~VI_MOUNT;
338	vn_irflag_unset_locked(vporoot, VIRF_MOUNTPOINT);
339	vporoot->v_mountedhere = NULL;
340	VI_UNLOCK(vporoot);
341	mporoot->mnt_flag &= ~MNT_ROOTFS;
342	mporoot->mnt_vnodecovered = NULL;
343	vput(vporoot);
344
345	/* Set up the new rootvnode, and purge the cache */
346	mpnroot->mnt_vnodecovered = NULL;
347	set_rootvnode();
348	cache_purgevfs(rootvnode->v_mount);
349
350	if (mporoot != mpdevfs) {
351		/* Remount old root under /.mount or /mnt */
352		fspath = "/.mount";
353		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
354		    fspath, td);
355		error = namei(&nd);
356		if (error) {
357			NDFREE(&nd, NDF_ONLY_PNBUF);
358			fspath = "/mnt";
359			NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
360			    fspath, td);
361			error = namei(&nd);
362		}
363		if (!error) {
364			vp = nd.ni_vp;
365			error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
366			if (!error)
367				error = vinvalbuf(vp, V_SAVE, 0, 0);
368			if (!error) {
369				cache_purge(vp);
370				mporoot->mnt_vnodecovered = vp;
371				vp->v_mountedhere = mporoot;
372				strlcpy(mporoot->mnt_stat.f_mntonname,
373				    fspath, MNAMELEN);
374				VOP_UNLOCK(vp);
375			} else
376				vput(vp);
377		}
378		NDFREE(&nd, NDF_ONLY_PNBUF);
379
380		if (error)
381			printf("mountroot: unable to remount previous root "
382			    "under /.mount or /mnt (error %d)\n", error);
383	}
384
385	/* Remount devfs under /dev */
386	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
387	error = namei(&nd);
388	if (!error) {
389		vp = nd.ni_vp;
390		error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
391		if (!error)
392			error = vinvalbuf(vp, V_SAVE, 0, 0);
393		if (!error) {
394			vpdevfs = mpdevfs->mnt_vnodecovered;
395			if (vpdevfs != NULL) {
396				cache_purge(vpdevfs);
397				VI_LOCK(vpdevfs);
398				vn_irflag_unset_locked(vpdevfs, VIRF_MOUNTPOINT);
399				vpdevfs->v_mountedhere = NULL;
400				VI_UNLOCK(vpdevfs);
401				vrele(vpdevfs);
402			}
403			VI_LOCK(vp);
404			mpdevfs->mnt_vnodecovered = vp;
405			vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
406			vp->v_mountedhere = mpdevfs;
407			VI_UNLOCK(vp);
408			VOP_UNLOCK(vp);
409		} else
410			vput(vp);
411	}
412	if (error)
413		printf("mountroot: unable to remount devfs under /dev "
414		    "(error %d)\n", error);
415	NDFREE(&nd, NDF_ONLY_PNBUF);
416
417	if (mporoot == mpdevfs) {
418		vfs_unbusy(mpdevfs);
419		/* Unlink the no longer needed /dev/dev -> / symlink */
420		error = kern_funlinkat(td, AT_FDCWD, "/dev/dev", FD_NONE,
421		    UIO_SYSSPACE, 0, 0);
422		if (error)
423			printf("mountroot: unable to unlink /dev/dev "
424			    "(error %d)\n", error);
425	}
426}
427
428/*
429 * Configuration parser.
430 */
431
/* Pseudo characters accepted by parse_skipto() as "match" argument. */
#define	CC_WHITESPACE		(-1)
#define	CC_NONWHITESPACE	(-2)

/* Negative status codes reported by the parser helpers. */
#define	PE_EOF			(-1)
#define	PE_EOL			(-2)
439
/* Return the character under the parse cursor without consuming it. */
static __inline int
parse_peek(char **conf)
{
	const char *cursor;

	cursor = *conf;
	return (*cursor);
}
446
/* Overwrite the character under the parse cursor with `c'. */
static __inline void
parse_poke(char **conf, int c)
{
	char *cursor;

	cursor = *conf;
	*cursor = c;
}
453
/* Move the parse cursor forward by one character. */
static __inline void
parse_advance(char **conf)
{

	*conf = *conf + 1;
}
460
461static int
462parse_skipto(char **conf, int mc)
463{
464	int c, match;
465
466	while (1) {
467		c = parse_peek(conf);
468		if (c == 0)
469			return (PE_EOF);
470		switch (mc) {
471		case CC_WHITESPACE:
472			match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
473			break;
474		case CC_NONWHITESPACE:
475			if (c == '\n')
476				return (PE_EOL);
477			match = (c != ' ' && c != '\t') ? 1 : 0;
478			break;
479		default:
480			match = (c == mc) ? 1 : 0;
481			break;
482		}
483		if (match)
484			break;
485		parse_advance(conf);
486	}
487	return (0);
488}
489
490static int
491parse_token(char **conf, char **tok)
492{
493	char *p;
494	size_t len;
495	int error;
496
497	*tok = NULL;
498	error = parse_skipto(conf, CC_NONWHITESPACE);
499	if (error)
500		return (error);
501	p = *conf;
502	error = parse_skipto(conf, CC_WHITESPACE);
503	len = *conf - p;
504	*tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
505	bcopy(p, *tok, len);
506	return (0);
507}
508
509static void
510parse_dir_ask_printenv(const char *var)
511{
512	char *val;
513
514	val = kern_getenv(var);
515	if (val != NULL) {
516		printf("  %s=%s\n", var, val);
517		freeenv(val);
518	}
519}
520
521static int
522parse_dir_ask(char **conf)
523{
524	char name[80];
525	char *mnt;
526	int error;
527
528	vfs_mountroot_wait();
529
530	printf("\nLoader variables:\n");
531	parse_dir_ask_printenv("vfs.root.mountfrom");
532	parse_dir_ask_printenv("vfs.root.mountfrom.options");
533
534	printf("\nManual root filesystem specification:\n");
535	printf("  <fstype>:<device> [options]\n");
536	printf("      Mount <device> using filesystem <fstype>\n");
537	printf("      and with the specified (optional) option list.\n");
538	printf("\n");
539	printf("    eg. ufs:/dev/da0s1a\n");
540	printf("        zfs:zroot/ROOT/default\n");
541	printf("        cd9660:/dev/cd0 ro\n");
542	printf("          (which is equivalent to: ");
543	printf("mount -t cd9660 -o ro /dev/cd0 /)\n");
544	printf("\n");
545	printf("  ?               List valid disk boot devices\n");
546	printf("  .               Yield 1 second (for background tasks)\n");
547	printf("  <empty line>    Abort manual input\n");
548
549	do {
550		error = EINVAL;
551		printf("\nmountroot> ");
552		cngets(name, sizeof(name), GETS_ECHO);
553		if (name[0] == '\0')
554			break;
555		if (name[0] == '?' && name[1] == '\0') {
556			printf("\nList of GEOM managed disk devices:\n  ");
557			g_dev_print();
558			continue;
559		}
560		if (name[0] == '.' && name[1] == '\0') {
561			pause("rmask", hz);
562			continue;
563		}
564		mnt = name;
565		error = parse_mount(&mnt);
566		if (error == -1)
567			printf("Invalid file system specification.\n");
568	} while (error != 0);
569
570	return (error);
571}
572
573static int
574parse_dir_md(char **conf)
575{
576	struct stat sb;
577	struct thread *td;
578	struct md_ioctl *mdio;
579	char *path, *tok;
580	int error, fd, len;
581
582	td = curthread;
583
584	error = parse_token(conf, &tok);
585	if (error)
586		return (error);
587
588	len = strlen(tok);
589	mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
590	path = (void *)(mdio + 1);
591	bcopy(tok, path, len);
592	free(tok, M_TEMP);
593
594	/* Get file status. */
595	error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL);
596	if (error)
597		goto out;
598
599	/* Open /dev/mdctl so that we can attach/detach. */
600	error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE,
601	    O_RDWR, 0);
602	if (error)
603		goto out;
604
605	fd = td->td_retval[0];
606	mdio->md_version = MDIOVERSION;
607	mdio->md_type = MD_VNODE;
608
609	if (root_mount_mddev != -1) {
610		mdio->md_unit = root_mount_mddev;
611		(void)kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
612		/* Ignore errors. We don't care. */
613		root_mount_mddev = -1;
614	}
615
616	mdio->md_file = (void *)(mdio + 1);
617	mdio->md_options = MD_AUTOUNIT | MD_READONLY;
618	mdio->md_mediasize = sb.st_size;
619	mdio->md_unit = 0;
620	error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
621	if (error)
622		goto out;
623
624	if (mdio->md_unit > 9) {
625		printf("rootmount: too many md units\n");
626		mdio->md_file = NULL;
627		mdio->md_options = 0;
628		mdio->md_mediasize = 0;
629		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
630		/* Ignore errors. We don't care. */
631		error = ERANGE;
632		goto out;
633	}
634
635	root_mount_mddev = mdio->md_unit;
636	printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
637
638	error = kern_close(td, fd);
639
640 out:
641	free(mdio, M_TEMP);
642	return (error);
643}
644
645static int
646parse_dir_onfail(char **conf)
647{
648	char *action;
649	int error;
650
651	error = parse_token(conf, &action);
652	if (error)
653		return (error);
654
655	if (!strcmp(action, "continue"))
656		root_mount_onfail = A_CONTINUE;
657	else if (!strcmp(action, "panic"))
658		root_mount_onfail = A_PANIC;
659	else if (!strcmp(action, "reboot"))
660		root_mount_onfail = A_REBOOT;
661	else if (!strcmp(action, "retry"))
662		root_mount_onfail = A_RETRY;
663	else {
664		printf("rootmount: %s: unknown action\n", action);
665		error = EINVAL;
666	}
667
668	free(action, M_TEMP);
669	return (0);
670}
671
672static int
673parse_dir_timeout(char **conf)
674{
675	char *tok, *endtok;
676	long secs;
677	int error;
678
679	error = parse_token(conf, &tok);
680	if (error)
681		return (error);
682
683	secs = strtol(tok, &endtok, 0);
684	error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
685	if (!error)
686		root_mount_timeout = secs;
687	free(tok, M_TEMP);
688	return (error);
689}
690
691static int
692parse_directive(char **conf)
693{
694	char *dir;
695	int error;
696
697	error = parse_token(conf, &dir);
698	if (error)
699		return (error);
700
701	if (strcmp(dir, ".ask") == 0)
702		error = parse_dir_ask(conf);
703	else if (strcmp(dir, ".md") == 0)
704		error = parse_dir_md(conf);
705	else if (strcmp(dir, ".onfail") == 0)
706		error = parse_dir_onfail(conf);
707	else if (strcmp(dir, ".timeout") == 0)
708		error = parse_dir_timeout(conf);
709	else {
710		printf("mountroot: invalid directive `%s'\n", dir);
711		/* Ignore the rest of the line. */
712		(void)parse_skipto(conf, '\n');
713		error = EINVAL;
714	}
715	free(dir, M_TEMP);
716	return (error);
717}
718
719static int
720parse_mount_dev_present(const char *dev)
721{
722	struct nameidata nd;
723	int error;
724
725	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
726	error = namei(&nd);
727	if (!error)
728		vput(nd.ni_vp);
729	NDFREE(&nd, NDF_ONLY_PNBUF);
730	return (error != 0) ? 0 : 1;
731}
732
733#define	ERRMSGL	255
734static int
735parse_mount(char **conf)
736{
737	char *errmsg;
738	struct mntarg *ma;
739	char *dev, *fs, *opts, *tok;
740	int delay, error, timeout;
741
742	error = parse_token(conf, &tok);
743	if (error)
744		return (error);
745	fs = tok;
746	error = parse_skipto(&tok, ':');
747	if (error) {
748		free(fs, M_TEMP);
749		return (error);
750	}
751	parse_poke(&tok, '\0');
752	parse_advance(&tok);
753	dev = tok;
754
755	if (root_mount_mddev != -1) {
756		/* Handle substitution for the md unit number. */
757		tok = strstr(dev, "md#");
758		if (tok != NULL)
759			tok[2] = '0' + root_mount_mddev;
760	}
761
762	/* Parse options. */
763	error = parse_token(conf, &tok);
764	opts = (error == 0) ? tok : NULL;
765
766	printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
767	    (opts != NULL) ? opts : "");
768
769	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
770
771	if (vfs_byname(fs) == NULL) {
772		strlcpy(errmsg, "unknown file system", ERRMSGL);
773		error = ENOENT;
774		goto out;
775	}
776
777	error = vfs_mountroot_wait_if_neccessary(fs, dev);
778	if (error != 0)
779		goto out;
780
781	delay = hz / 10;
782	timeout = root_mount_timeout * hz;
783
784	for (;;) {
785		ma = NULL;
786		ma = mount_arg(ma, "fstype", fs, -1);
787		ma = mount_arg(ma, "fspath", "/", -1);
788		ma = mount_arg(ma, "from", dev, -1);
789		ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
790		ma = mount_arg(ma, "ro", NULL, 0);
791		ma = parse_mountroot_options(ma, opts);
792
793		error = kernel_mount(ma, MNT_ROOTFS);
794		if (error == 0 || timeout <= 0)
795			break;
796
797		if (root_mount_timeout * hz == timeout ||
798		    (bootverbose && timeout % hz == 0)) {
799			printf("Mounting from %s:%s failed with error %d; "
800			    "retrying for %d more second%s\n", fs, dev, error,
801			    timeout / hz, (timeout / hz > 1) ? "s" : "");
802		}
803		pause("rmretry", delay);
804		timeout -= delay;
805	}
806 out:
807	if (error) {
808		printf("Mounting from %s:%s failed with error %d",
809		    fs, dev, error);
810		if (errmsg[0] != '\0')
811			printf(": %s", errmsg);
812		printf(".\n");
813	}
814	free(fs, M_TEMP);
815	free(errmsg, M_TEMP);
816	if (opts != NULL)
817		free(opts, M_TEMP);
818	/* kernel_mount can return -1 on error. */
819	return ((error < 0) ? EDOOFUS : error);
820}
821#undef ERRMSGL
822
823static int
824vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
825{
826	struct mount *mp;
827	char *conf;
828	int error;
829
830	root_mount_mddev = -1;
831
832retry:
833	conf = sbuf_data(sb);
834	mp = TAILQ_NEXT(mpdevfs, mnt_list);
835	error = (mp == NULL) ? 0 : EDOOFUS;
836	root_mount_onfail = A_CONTINUE;
837	while (mp == NULL) {
838		error = parse_skipto(&conf, CC_NONWHITESPACE);
839		if (error == PE_EOL) {
840			parse_advance(&conf);
841			continue;
842		}
843		if (error < 0)
844			break;
845		switch (parse_peek(&conf)) {
846		case '#':
847			error = parse_skipto(&conf, '\n');
848			break;
849		case '.':
850			error = parse_directive(&conf);
851			break;
852		default:
853			error = parse_mount(&conf);
854			if (error == -1) {
855				printf("mountroot: invalid file system "
856				    "specification.\n");
857				error = 0;
858			}
859			break;
860		}
861		if (error < 0)
862			break;
863		/* Ignore any trailing garbage on the line. */
864		if (parse_peek(&conf) != '\n') {
865			printf("mountroot: advancing to next directive...\n");
866			(void)parse_skipto(&conf, '\n');
867		}
868		mp = TAILQ_NEXT(mpdevfs, mnt_list);
869	}
870	if (mp != NULL)
871		return (0);
872
873	/*
874	 * We failed to mount (a new) root.
875	 */
876	switch (root_mount_onfail) {
877	case A_CONTINUE:
878		break;
879	case A_PANIC:
880		panic("mountroot: unable to (re-)mount root.");
881		/* NOTREACHED */
882	case A_RETRY:
883		goto retry;
884	case A_REBOOT:
885		kern_reboot(RB_NOSYNC);
886		/* NOTREACHED */
887	}
888
889	return (error);
890}
891
892static void
893vfs_mountroot_conf0(struct sbuf *sb)
894{
895	char *s, *tok, *mnt, *opt;
896	int error;
897
898	sbuf_printf(sb, ".onfail panic\n");
899	sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
900	if (boothowto & RB_ASKNAME)
901		sbuf_printf(sb, ".ask\n");
902#ifdef ROOTDEVNAME
903	if (boothowto & RB_DFLTROOT)
904		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
905#endif
906	if (boothowto & RB_CDROM) {
907		sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
908		sbuf_printf(sb, ".timeout 0\n");
909		sbuf_printf(sb, "cd9660:/dev/cd1 ro\n");
910		sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
911	}
912	s = kern_getenv("vfs.root.mountfrom");
913	if (s != NULL) {
914		opt = kern_getenv("vfs.root.mountfrom.options");
915		tok = s;
916		error = parse_token(&tok, &mnt);
917		while (!error) {
918			sbuf_printf(sb, "%s %s\n", mnt,
919			    (opt != NULL) ? opt : "");
920			free(mnt, M_TEMP);
921			error = parse_token(&tok, &mnt);
922		}
923		if (opt != NULL)
924			freeenv(opt);
925		freeenv(s);
926	}
927	if (rootdevnames[0] != NULL)
928		sbuf_printf(sb, "%s\n", rootdevnames[0]);
929	if (rootdevnames[1] != NULL)
930		sbuf_printf(sb, "%s\n", rootdevnames[1]);
931#ifdef ROOTDEVNAME
932	if (!(boothowto & RB_DFLTROOT))
933		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
934#endif
935	if (!(boothowto & RB_ASKNAME))
936		sbuf_printf(sb, ".ask\n");
937}
938
939static int
940vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
941{
942	static char buf[128];
943	struct nameidata nd;
944	off_t ofs;
945	ssize_t resid;
946	int error, flags, len;
947
948	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
949	flags = FREAD;
950	error = vn_open(&nd, &flags, 0, NULL);
951	if (error)
952		return (error);
953
954	NDFREE(&nd, NDF_ONLY_PNBUF);
955	ofs = 0;
956	len = sizeof(buf) - 1;
957	while (1) {
958		error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
959		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
960		    NOCRED, &resid, td);
961		if (error)
962			break;
963		if (resid == len)
964			break;
965		buf[len - resid] = 0;
966		sbuf_printf(sb, "%s", buf);
967		ofs += len - resid;
968	}
969
970	VOP_UNLOCK(nd.ni_vp);
971	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
972	return (error);
973}
974
975static void
976vfs_mountroot_wait(void)
977{
978	struct root_hold_token *h;
979	struct timeval lastfail;
980	int curfail;
981
982	TSENTER();
983
984	curfail = 0;
985	while (1) {
986		g_waitidle();
987		mtx_lock(&root_holds_mtx);
988		if (TAILQ_EMPTY(&root_holds)) {
989			mtx_unlock(&root_holds_mtx);
990			break;
991		}
992		if (ppsratecheck(&lastfail, &curfail, 1)) {
993			printf("Root mount waiting for:");
994			TAILQ_FOREACH(h, &root_holds, list)
995				printf(" %s", h->who);
996			printf("\n");
997		}
998		TSWAIT("root mount");
999		msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
1000		    hz);
1001		TSUNWAIT("root mount");
1002	}
1003
1004	TSEXIT();
1005}
1006
1007static int
1008vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev)
1009{
1010	int delay, timeout;
1011
1012	/*
1013	 * In case of ZFS and NFS we don't have a way to wait for
1014	 * specific device.  Also do the wait if the user forced that
1015	 * behaviour by setting vfs.root_mount_always_wait=1.
1016	 */
1017	if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL ||
1018	    dev[0] == '\0' || root_mount_always_wait != 0) {
1019		vfs_mountroot_wait();
1020		return (0);
1021	}
1022
1023	/*
1024	 * Otherwise, no point in waiting if the device is already there.
1025	 * Note that we must wait for GEOM to finish reconfiguring itself,
1026	 * eg for geom_part(4) to finish tasting.
1027	 */
1028	g_waitidle();
1029	if (parse_mount_dev_present(dev))
1030		return (0);
1031
1032	/*
1033	 * No luck.  Let's wait.  This code looks weird, but it's that way
1034	 * to behave exactly as it used to work before.
1035	 */
1036	vfs_mountroot_wait();
1037	printf("mountroot: waiting for device %s...\n", dev);
1038	delay = hz / 10;
1039	timeout = root_mount_timeout * hz;
1040	do {
1041		pause("rmdev", delay);
1042		timeout -= delay;
1043	} while (timeout > 0 && !parse_mount_dev_present(dev));
1044
1045	if (timeout <= 0)
1046		return (ENODEV);
1047
1048	return (0);
1049}
1050
1051void
1052vfs_mountroot(void)
1053{
1054	struct mount *mp;
1055	struct sbuf *sb;
1056	struct thread *td;
1057	time_t timebase;
1058	int error;
1059
1060	mtx_assert(&Giant, MA_NOTOWNED);
1061
1062	TSENTER();
1063
1064	td = curthread;
1065
1066	sb = sbuf_new_auto();
1067	vfs_mountroot_conf0(sb);
1068	sbuf_finish(sb);
1069
1070	error = vfs_mountroot_devfs(td, &mp);
1071	while (!error) {
1072		error = vfs_mountroot_parse(sb, mp);
1073		if (!error) {
1074			vfs_mountroot_shuffle(td, mp);
1075			sbuf_clear(sb);
1076			error = vfs_mountroot_readconf(td, sb);
1077			sbuf_finish(sb);
1078		}
1079	}
1080
1081	sbuf_delete(sb);
1082
1083	/*
1084	 * Iterate over all currently mounted file systems and use
1085	 * the time stamp found to check and/or initialize the RTC.
1086	 * Call inittodr() only once and pass it the largest of the
1087	 * timestamps we encounter.
1088	 */
1089	timebase = 0;
1090	mtx_lock(&mountlist_mtx);
1091	mp = TAILQ_FIRST(&mountlist);
1092	while (mp != NULL) {
1093		if (mp->mnt_time > timebase)
1094			timebase = mp->mnt_time;
1095		mp = TAILQ_NEXT(mp, mnt_list);
1096	}
1097	mtx_unlock(&mountlist_mtx);
1098	inittodr(timebase);
1099
1100	/* Keep prison0's root in sync with the global rootvnode. */
1101	mtx_lock(&prison0.pr_mtx);
1102	prison0.pr_root = rootvnode;
1103	vref(prison0.pr_root);
1104	mtx_unlock(&prison0.pr_mtx);
1105
1106	mtx_lock(&root_holds_mtx);
1107	atomic_store_rel_int(&root_mount_complete, 1);
1108	wakeup(&root_mount_complete);
1109	mtx_unlock(&root_holds_mtx);
1110
1111	EVENTHANDLER_INVOKE(mountroot);
1112
1113	TSEXIT();
1114}
1115
1116static struct mntarg *
1117parse_mountroot_options(struct mntarg *ma, const char *options)
1118{
1119	char *p;
1120	char *name, *name_arg;
1121	char *val, *val_arg;
1122	char *opts;
1123
1124	if (options == NULL || options[0] == '\0')
1125		return (ma);
1126
1127	p = opts = strdup(options, M_MOUNT);
1128	if (opts == NULL) {
1129		return (ma);
1130	}
1131
1132	while((name = strsep(&p, ",")) != NULL) {
1133		if (name[0] == '\0')
1134			break;
1135
1136		val = strchr(name, '=');
1137		if (val != NULL) {
1138			*val = '\0';
1139			++val;
1140		}
1141		if( strcmp(name, "rw") == 0 ||
1142		    strcmp(name, "noro") == 0) {
1143			/*
1144			 * The first time we mount the root file system,
1145			 * we need to mount 'ro', so We need to ignore
1146			 * 'rw' and 'noro' mount options.
1147			 */
1148			continue;
1149		}
1150		name_arg = strdup(name, M_MOUNT);
1151		val_arg = NULL;
1152		if (val != NULL)
1153			val_arg = strdup(val, M_MOUNT);
1154
1155		ma = mount_arg(ma, name_arg, val_arg,
1156		    (val_arg != NULL ? -1 : 0));
1157	}
1158	free(opts, M_MOUNT);
1159	return (ma);
1160}
1161