1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2010 Marcel Moolenaar
5 * Copyright (c) 1999-2004 Poul-Henning Kamp
6 * Copyright (c) 1999 Michael Smith
7 * Copyright (c) 1989, 1993
8 *      The Regents of the University of California.  All rights reserved.
9 * (c) UNIX System Laboratories, Inc.
10 * All or some portions of this file are derived from material licensed
11 * to the University of California by American Telephone and Telegraph
12 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
13 * the permission of UNIX System Laboratories, Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 */
39
40#include "opt_rootdevname.h"
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD$");
44
45#include <sys/param.h>
46#include <sys/conf.h>
47#include <sys/cons.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/kernel.h>
51#include <sys/malloc.h>
52#include <sys/mdioctl.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/namei.h>
56#include <sys/priv.h>
57#include <sys/proc.h>
58#include <sys/filedesc.h>
59#include <sys/reboot.h>
60#include <sys/sbuf.h>
61#include <sys/stat.h>
62#include <sys/syscallsubr.h>
63#include <sys/sysproto.h>
64#include <sys/sx.h>
65#include <sys/sysctl.h>
66#include <sys/sysent.h>
67#include <sys/systm.h>
68#include <sys/vnode.h>
69
70#include <geom/geom.h>
71
72/*
73 * The root filesystem is detailed in the kernel environment variable
74 * vfs.root.mountfrom, which is expected to be in the general format
75 *
 * <vfsname>:[<path>][ <vfsname>:[<path>] ...]
77 * vfsname   := the name of a VFS known to the kernel and capable
78 *              of being mounted as root
79 * path      := disk device name or other data used by the filesystem
80 *              to locate its physical store
81 *
82 * If the environment variable vfs.root.mountfrom is a space separated list,
83 * each list element is tried in turn and the root filesystem will be mounted
84 * from the first one that succeeds.
85 *
86 * The environment variable vfs.root.mountfrom.options is a comma delimited
87 * set of string mount options.  These mount options must be parseable
88 * by nmount() in the kernel.
89 */
90
91static int parse_mount(char **);
92static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
93static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
94static void vfs_mountroot_wait(void);
95static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
96
97/*
98 * The vnode of the system's root (/ in the filesystem, without chroot
99 * active.)
100 */
101struct vnode *rootvnode;
102
103/*
104 * Mount of the system's /dev.
105 */
106struct mount *rootdevmp;
107
/*
 * Optional root device specifications.  Entries that are not NULL are
 * appended to the built-in configuration by vfs_mountroot_conf0().
 */
char *rootdevnames[2] = {NULL, NULL};

/* Protects the root_holds list below. */
struct mtx root_holds_mtx;
MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);

/*
 * Outstanding root mount holds.  vfs_mountroot_wait() does not return
 * until this list is empty.
 */
static TAILQ_HEAD(, root_hold_token)	root_holds =
    TAILQ_HEAD_INITIALIZER(root_holds);
115
/*
 * What to do when no root file system could be mounted.  Selected with the
 * ".onfail" configuration directive and acted upon at the end of
 * vfs_mountroot_parse().
 */
enum action {
	A_CONTINUE,	/* Hand the failure back to the caller. */
	A_PANIC,	/* Panic. */
	A_REBOOT,	/* Call kern_reboot(RB_NOSYNC). */
	A_RETRY		/* Run through the configuration again. */
};

/*
 * State of a root_hold_token, stored in its flags member.
 */
enum rh_flags {
	RH_FREE,	/* Not held; root_mount_rel() ignores the token. */
	RH_ALLOC,	/* Allocated by root_mount_hold(); freed on release. */
	RH_ARG,		/* Storage passed in to root_mount_hold_token(). */
};
128
/* Failure policy currently in effect; see enum action. */
static enum action root_mount_onfail = A_CONTINUE;

/* Unit number of the md device attached by ".md", or -1 when there is none. */
static int root_mount_mddev;
/* Set to 1 by vfs_mountroot() once it has finished; read by root_mounted(). */
static int root_mount_complete;

/* By default wait up to 3 seconds for devices to appear. */
static int root_mount_timeout = 3;
TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);

/*
 * When non-zero, vfs_mountroot_wait_if_neccessary() always waits for the
 * root mount holds to be released.
 */
static int root_mount_always_wait = 0;
SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN,
    &root_mount_always_wait, 0,
    "Wait for root mount holds even if the root device already exists");

/* Read-only string listing who currently holds the root mount. */
SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vfs_root_mount_hold, "A",
    "List of root mount hold tokens");
147
148static int
149sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS)
150{
151	struct sbuf sb;
152	struct root_hold_token *h;
153	int error;
154
155	sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL);
156
157	mtx_lock(&root_holds_mtx);
158	TAILQ_FOREACH(h, &root_holds, list) {
159		if (h != TAILQ_FIRST(&root_holds))
160			sbuf_putc(&sb, ' ');
161		sbuf_printf(&sb, "%s", h->who);
162	}
163	mtx_unlock(&root_holds_mtx);
164
165	error = sbuf_finish(&sb);
166	if (error == 0)
167		error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
168	sbuf_delete(&sb);
169	return (error);
170}
171
172struct root_hold_token *
173root_mount_hold(const char *identifier)
174{
175	struct root_hold_token *h;
176
177	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
178	h->flags = RH_ALLOC;
179	h->who = identifier;
180	mtx_lock(&root_holds_mtx);
181	TSHOLD("root mount");
182	TAILQ_INSERT_TAIL(&root_holds, h, list);
183	mtx_unlock(&root_holds_mtx);
184	return (h);
185}
186
187void
188root_mount_hold_token(const char *identifier, struct root_hold_token *h)
189{
190#ifdef INVARIANTS
191	struct root_hold_token *t;
192#endif
193
194	h->flags = RH_ARG;
195	h->who = identifier;
196	mtx_lock(&root_holds_mtx);
197#ifdef INVARIANTS
198	TAILQ_FOREACH(t, &root_holds, list) {
199		if (t == h) {
200			panic("Duplicate mount hold by '%s' on %p",
201			    identifier, h);
202		}
203	}
204#endif
205	TSHOLD("root mount");
206	TAILQ_INSERT_TAIL(&root_holds, h, list);
207	mtx_unlock(&root_holds_mtx);
208}
209
210void
211root_mount_rel(struct root_hold_token *h)
212{
213
214	if (h == NULL || h->flags == RH_FREE)
215		return;
216
217	mtx_lock(&root_holds_mtx);
218	TAILQ_REMOVE(&root_holds, h, list);
219	TSRELEASE("root mount");
220	wakeup(&root_holds);
221	mtx_unlock(&root_holds_mtx);
222	if (h->flags == RH_ALLOC) {
223		free(h, M_DEVBUF);
224	} else
225		h->flags = RH_FREE;
226}
227
228int
229root_mounted(void)
230{
231
232	/* No mutex is acquired here because int stores are atomic. */
233	return (root_mount_complete);
234}
235
236static void
237set_rootvnode(void)
238{
239	struct proc *p;
240
241	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
242		panic("Cannot find root vnode");
243
244	VOP_UNLOCK(rootvnode, 0);
245
246	p = curthread->td_proc;
247	FILEDESC_XLOCK(p->p_fd);
248
249	if (p->p_fd->fd_cdir != NULL)
250		vrele(p->p_fd->fd_cdir);
251	p->p_fd->fd_cdir = rootvnode;
252	VREF(rootvnode);
253
254	if (p->p_fd->fd_rdir != NULL)
255		vrele(p->p_fd->fd_rdir);
256	p->p_fd->fd_rdir = rootvnode;
257	VREF(rootvnode);
258
259	FILEDESC_XUNLOCK(p->p_fd);
260}
261
/*
 * Set up devfs as the temporary root so that the real root device can be
 * looked up by path.
 *
 * *mpp is cleared on entry and set to the devfs mount once that mount is
 * available: either the existing rootdevmp (busied here) or a freshly
 * created devfs mount, which is also recorded in rootdevmp.
 *
 * Returns 0 on success or an errno value.  Note that *mpp may already be
 * set when a later step (creating the "dev" symlink) fails.
 */
static int
vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
{
	struct vfsoptlist *opts;
	struct vfsconf *vfsp;
	struct mount *mp;
	int error;

	*mpp = NULL;

	if (rootdevmp != NULL) {
		/*
		 * Already have /dev; this happens during rerooting.
		 */
		error = vfs_busy(rootdevmp, 0);
		if (error != 0)
			return (error);
		*mpp = rootdevmp;
	} else {
		vfsp = vfs_byname("devfs");
		KASSERT(vfsp != NULL, ("Could not find devfs by name"));
		if (vfsp == NULL)
			return (ENOENT);

		mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);

		error = VFS_MOUNT(mp);
		KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
		if (error)
			return (error);

		error = VFS_STATFS(mp, &mp->mnt_stat);
		KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error));
		if (error)
			return (error);

		/* Give the mount an empty option list. */
		opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
		TAILQ_INIT(opts);
		mp->mnt_opt = opts;

		/* devfs goes to the head of mountlist, i.e. it is the root. */
		mtx_lock(&mountlist_mtx);
		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
		mtx_unlock(&mountlist_mtx);

		*mpp = mp;
		rootdevmp = mp;
	}

	set_rootvnode();

	/*
	 * Create a "dev" symlink pointing at "/" in the current directory,
	 * so that "/dev/<name>" paths resolve while devfs is the root.
	 */
	error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE);
	if (error)
		printf("kern_symlink /dev -> / returns %d\n", error);

	return (error);
}
318
/*
 * Make the newly mounted file system the root.
 *
 * "mpdevfs" is the devfs mount set up by vfs_mountroot_devfs(); the mount
 * that follows it on mountlist is taken to be the new root.  The mount at
 * the head of mountlist is the old root, which is devfs itself unless
 * another file system was the root before (the rerooting case).
 *
 * The steps are: reorder mountlist (new root first, devfs last), detach the
 * old root from its root vnode, switch rootvnode over, remount a previous
 * non-devfs root under /.mount or /mnt, and remount devfs under /dev.
 * Failures to remount are reported with printf() but not returned.
 */
static void
vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
{
	struct nameidata nd;
	struct mount *mporoot, *mpnroot;
	struct vnode *vp, *vporoot, *vpdevfs;
	char *fspath;
	int error;

	/* NOTE(review): mountlist is read here before mountlist_mtx is taken. */
	mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);

	/* Shuffle the mountlist. */
	mtx_lock(&mountlist_mtx);
	mporoot = TAILQ_FIRST(&mountlist);
	TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
	if (mporoot != mpdevfs) {
		/* The old root is not devfs: move the new root to the head. */
		TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
		TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
	}
	TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
	mtx_unlock(&mountlist_mtx);

	cache_purgevfs(mporoot, true);
	if (mporoot != mpdevfs)
		cache_purgevfs(mpdevfs, true);

	/* NOTE(review): the return value of VFS_ROOT() is not checked. */
	VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot);

	/* The old root is no longer mounted on anything and not the root. */
	VI_LOCK(vporoot);
	vporoot->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vporoot);
	vporoot->v_mountedhere = NULL;
	mporoot->mnt_flag &= ~MNT_ROOTFS;
	mporoot->mnt_vnodecovered = NULL;
	vput(vporoot);

	/* Set up the new rootvnode, and purge the cache */
	mpnroot->mnt_vnodecovered = NULL;
	set_rootvnode();
	cache_purgevfs(rootvnode->v_mount, true);

	if (mporoot != mpdevfs) {
		/* Remount old root under /.mount or /mnt */
		fspath = "/.mount";
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
		    fspath, td);
		error = namei(&nd);
		if (error) {
			/* No /.mount in the new root; fall back to /mnt. */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			fspath = "/mnt";
			NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
			    fspath, td);
			error = namei(&nd);
		}
		if (!error) {
			vp = nd.ni_vp;
			error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
			if (!error)
				error = vinvalbuf(vp, V_SAVE, 0, 0);
			if (!error) {
				/*
				 * Hook the old root onto the directory; the
				 * vnode reference from namei() is kept and
				 * only the lock is dropped.
				 */
				cache_purge(vp);
				mporoot->mnt_vnodecovered = vp;
				vp->v_mountedhere = mporoot;
				strlcpy(mporoot->mnt_stat.f_mntonname,
				    fspath, MNAMELEN);
				VOP_UNLOCK(vp, 0);
			} else
				vput(vp);
		}
		NDFREE(&nd, NDF_ONLY_PNBUF);

		if (error)
			printf("mountroot: unable to remount previous root "
			    "under /.mount or /mnt (error %d)\n", error);
	}

	/* Remount devfs under /dev */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
	error = namei(&nd);
	if (!error) {
		vp = nd.ni_vp;
		error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
		if (!error)
			error = vinvalbuf(vp, V_SAVE, 0, 0);
		if (!error) {
			/* Let go of the vnode devfs used to cover, if any. */
			vpdevfs = mpdevfs->mnt_vnodecovered;
			if (vpdevfs != NULL) {
				cache_purge(vpdevfs);
				vpdevfs->v_mountedhere = NULL;
				vrele(vpdevfs);
			}
			mpdevfs->mnt_vnodecovered = vp;
			vp->v_mountedhere = mpdevfs;
			VOP_UNLOCK(vp, 0);
		} else
			vput(vp);
	}
	if (error)
		printf("mountroot: unable to remount devfs under /dev "
		    "(error %d)\n", error);
	NDFREE(&nd, NDF_ONLY_PNBUF);

	if (mporoot == mpdevfs) {
		vfs_unbusy(mpdevfs);
		/* Unlink the no longer needed /dev/dev -> / symlink */
		error = kern_unlinkat(td, AT_FDCWD, "/dev/dev",
		    UIO_SYSSPACE, 0, 0);
		if (error)
			printf("mountroot: unable to unlink /dev/dev "
			    "(error %d)\n", error);
	}
}
431
432/*
433 * Configuration parser.
434 */
435
/*
 * Parser character classes.  Passed to parse_skipto() instead of a literal
 * character such as ':' or '\n'; negative to keep them apart from those.
 */
#define	CC_WHITESPACE		-1
#define	CC_NONWHITESPACE	-2

/*
 * Parse errors.  Negative, so that callers can tell them apart from errno
 * values: PE_EOF is returned at the terminating NUL, PE_EOL when a newline
 * is found while looking for the start of a token.
 */
#define	PE_EOF			-1
#define	PE_EOL			-2
443
/* Return the character at the parse position without consuming it. */
static __inline int
parse_peek(char **conf)
{
	const char *cursor;

	cursor = *conf;
	return (cursor[0]);
}
450
/* Overwrite the character at the parse position with "c". */
static __inline void
parse_poke(char **conf, int c)
{
	char *cursor;

	cursor = *conf;
	cursor[0] = c;
}
457
/* Move the parse position forward by one character. */
static __inline void
parse_advance(char **conf)
{

	*conf = *conf + 1;
}
464
465static int
466parse_skipto(char **conf, int mc)
467{
468	int c, match;
469
470	while (1) {
471		c = parse_peek(conf);
472		if (c == 0)
473			return (PE_EOF);
474		switch (mc) {
475		case CC_WHITESPACE:
476			match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
477			break;
478		case CC_NONWHITESPACE:
479			if (c == '\n')
480				return (PE_EOL);
481			match = (c != ' ' && c != '\t') ? 1 : 0;
482			break;
483		default:
484			match = (c == mc) ? 1 : 0;
485			break;
486		}
487		if (match)
488			break;
489		parse_advance(conf);
490	}
491	return (0);
492}
493
494static int
495parse_token(char **conf, char **tok)
496{
497	char *p;
498	size_t len;
499	int error;
500
501	*tok = NULL;
502	error = parse_skipto(conf, CC_NONWHITESPACE);
503	if (error)
504		return (error);
505	p = *conf;
506	error = parse_skipto(conf, CC_WHITESPACE);
507	len = *conf - p;
508	*tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
509	bcopy(p, *tok, len);
510	return (0);
511}
512
513static void
514parse_dir_ask_printenv(const char *var)
515{
516	char *val;
517
518	val = kern_getenv(var);
519	if (val != NULL) {
520		printf("  %s=%s\n", var, val);
521		freeenv(val);
522	}
523}
524
525static int
526parse_dir_ask(char **conf)
527{
528	char name[80];
529	char *mnt;
530	int error;
531
532	vfs_mountroot_wait();
533
534	printf("\nLoader variables:\n");
535	parse_dir_ask_printenv("vfs.root.mountfrom");
536	parse_dir_ask_printenv("vfs.root.mountfrom.options");
537
538	printf("\nManual root filesystem specification:\n");
539	printf("  <fstype>:<device> [options]\n");
540	printf("      Mount <device> using filesystem <fstype>\n");
541	printf("      and with the specified (optional) option list.\n");
542	printf("\n");
543	printf("    eg. ufs:/dev/da0s1a\n");
544	printf("        zfs:zroot/ROOT/default\n");
545	printf("        cd9660:/dev/cd0 ro\n");
546	printf("          (which is equivalent to: ");
547	printf("mount -t cd9660 -o ro /dev/cd0 /)\n");
548	printf("\n");
549	printf("  ?               List valid disk boot devices\n");
550	printf("  .               Yield 1 second (for background tasks)\n");
551	printf("  <empty line>    Abort manual input\n");
552
553	do {
554		error = EINVAL;
555		printf("\nmountroot> ");
556		cngets(name, sizeof(name), GETS_ECHO);
557		if (name[0] == '\0')
558			break;
559		if (name[0] == '?' && name[1] == '\0') {
560			printf("\nList of GEOM managed disk devices:\n  ");
561			g_dev_print();
562			continue;
563		}
564		if (name[0] == '.' && name[1] == '\0') {
565			pause("rmask", hz);
566			continue;
567		}
568		mnt = name;
569		error = parse_mount(&mnt);
570		if (error == -1)
571			printf("Invalid file system specification.\n");
572	} while (error != 0);
573
574	return (error);
575}
576
577static int
578parse_dir_md(char **conf)
579{
580	struct stat sb;
581	struct thread *td;
582	struct md_ioctl *mdio;
583	char *path, *tok;
584	int error, fd, len;
585
586	td = curthread;
587
588	error = parse_token(conf, &tok);
589	if (error)
590		return (error);
591
592	len = strlen(tok);
593	mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
594	path = (void *)(mdio + 1);
595	bcopy(tok, path, len);
596	free(tok, M_TEMP);
597
598	/* Get file status. */
599	error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL);
600	if (error)
601		goto out;
602
603	/* Open /dev/mdctl so that we can attach/detach. */
604	error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE,
605	    O_RDWR, 0);
606	if (error)
607		goto out;
608
609	fd = td->td_retval[0];
610	mdio->md_version = MDIOVERSION;
611	mdio->md_type = MD_VNODE;
612
613	if (root_mount_mddev != -1) {
614		mdio->md_unit = root_mount_mddev;
615		(void)kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
616		/* Ignore errors. We don't care. */
617		root_mount_mddev = -1;
618	}
619
620	mdio->md_file = (void *)(mdio + 1);
621	mdio->md_options = MD_AUTOUNIT | MD_READONLY;
622	mdio->md_mediasize = sb.st_size;
623	mdio->md_unit = 0;
624	error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
625	if (error)
626		goto out;
627
628	if (mdio->md_unit > 9) {
629		printf("rootmount: too many md units\n");
630		mdio->md_file = NULL;
631		mdio->md_options = 0;
632		mdio->md_mediasize = 0;
633		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
634		/* Ignore errors. We don't care. */
635		error = ERANGE;
636		goto out;
637	}
638
639	root_mount_mddev = mdio->md_unit;
640	printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
641
642	error = kern_close(td, fd);
643
644 out:
645	free(mdio, M_TEMP);
646	return (error);
647}
648
649static int
650parse_dir_onfail(char **conf)
651{
652	char *action;
653	int error;
654
655	error = parse_token(conf, &action);
656	if (error)
657		return (error);
658
659	if (!strcmp(action, "continue"))
660		root_mount_onfail = A_CONTINUE;
661	else if (!strcmp(action, "panic"))
662		root_mount_onfail = A_PANIC;
663	else if (!strcmp(action, "reboot"))
664		root_mount_onfail = A_REBOOT;
665	else if (!strcmp(action, "retry"))
666		root_mount_onfail = A_RETRY;
667	else {
668		printf("rootmount: %s: unknown action\n", action);
669		error = EINVAL;
670	}
671
672	free(action, M_TEMP);
673	return (0);
674}
675
676static int
677parse_dir_timeout(char **conf)
678{
679	char *tok, *endtok;
680	long secs;
681	int error;
682
683	error = parse_token(conf, &tok);
684	if (error)
685		return (error);
686
687	secs = strtol(tok, &endtok, 0);
688	error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
689	if (!error)
690		root_mount_timeout = secs;
691	free(tok, M_TEMP);
692	return (error);
693}
694
695static int
696parse_directive(char **conf)
697{
698	char *dir;
699	int error;
700
701	error = parse_token(conf, &dir);
702	if (error)
703		return (error);
704
705	if (strcmp(dir, ".ask") == 0)
706		error = parse_dir_ask(conf);
707	else if (strcmp(dir, ".md") == 0)
708		error = parse_dir_md(conf);
709	else if (strcmp(dir, ".onfail") == 0)
710		error = parse_dir_onfail(conf);
711	else if (strcmp(dir, ".timeout") == 0)
712		error = parse_dir_timeout(conf);
713	else {
714		printf("mountroot: invalid directive `%s'\n", dir);
715		/* Ignore the rest of the line. */
716		(void)parse_skipto(conf, '\n');
717		error = EINVAL;
718	}
719	free(dir, M_TEMP);
720	return (error);
721}
722
723static int
724parse_mount_dev_present(const char *dev)
725{
726	struct nameidata nd;
727	int error;
728
729	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
730	error = namei(&nd);
731	if (!error)
732		vput(nd.ni_vp);
733	NDFREE(&nd, NDF_ONLY_PNBUF);
734	return (error != 0) ? 0 : 1;
735}
736
/* Size of the buffer handed to the mount code for its error message. */
#define	ERRMSGL	255
/*
 * Parse a "<fstype>:<device> [options]" specification at the parse position
 * and try to mount it as the root file system.
 *
 * The mount is always attempted read-only.  A failing mount is retried
 * every hz / 10 ticks until root_mount_timeout seconds have passed.
 *
 * Returns 0 on success.  A specification that cannot be parsed (no token,
 * or no ':' in it) yields the negative PE_* code from the parser.  Any
 * other failure is returned as an errno value; a negative result from
 * kernel_mount() is mapped to EDOOFUS.
 */
static int
parse_mount(char **conf)
{
	char *errmsg;
	struct mntarg *ma;
	char *dev, *fs, *opts, *tok;
	int delay, error, timeout;

	error = parse_token(conf, &tok);
	if (error)
		return (error);
	/*
	 * Split the token in place at the ':'.  "fs" keeps pointing at the
	 * start of the allocation and is what gets freed; "dev" points into
	 * the same buffer.
	 */
	fs = tok;
	error = parse_skipto(&tok, ':');
	if (error) {
		free(fs, M_TEMP);
		return (error);
	}
	parse_poke(&tok, '\0');
	parse_advance(&tok);
	dev = tok;

	if (root_mount_mddev != -1) {
		/* Handle substitution for the md unit number. */
		tok = strstr(dev, "md#");
		if (tok != NULL)
			tok[2] = '0' + root_mount_mddev;
	}

	/* Parse options. */
	error = parse_token(conf, &tok);
	opts = (error == 0) ? tok : NULL;

	printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
	    (opts != NULL) ? opts : "");

	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);

	if (vfs_byname(fs) == NULL) {
		strlcpy(errmsg, "unknown file system", ERRMSGL);
		error = ENOENT;
		goto out;
	}

	error = vfs_mountroot_wait_if_neccessary(fs, dev);
	if (error != 0)
		goto out;

	delay = hz / 10;
	timeout = root_mount_timeout * hz;

	for (;;) {
		/* The argument list is built afresh for every attempt. */
		ma = NULL;
		ma = mount_arg(ma, "fstype", fs, -1);
		ma = mount_arg(ma, "fspath", "/", -1);
		ma = mount_arg(ma, "from", dev, -1);
		ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
		ma = mount_arg(ma, "ro", NULL, 0);
		ma = parse_mountroot_options(ma, opts);

		error = kernel_mount(ma, MNT_ROOTFS);
		if (error == 0 || timeout <= 0)
			break;

		/*
		 * Report the first failure, and with bootverbose one
		 * failure per remaining second.
		 */
		if (root_mount_timeout * hz == timeout ||
		    (bootverbose && timeout % hz == 0)) {
			printf("Mounting from %s:%s failed with error %d; "
			    "retrying for %d more second%s\n", fs, dev, error,
			    timeout / hz, (timeout / hz > 1) ? "s" : "");
		}
		pause("rmretry", delay);
		timeout -= delay;
	}
 out:
	if (error) {
		printf("Mounting from %s:%s failed with error %d",
		    fs, dev, error);
		if (errmsg[0] != '\0')
			printf(": %s", errmsg);
		printf(".\n");
	}
	free(fs, M_TEMP);
	free(errmsg, M_TEMP);
	if (opts != NULL)
		free(opts, M_TEMP);
	/* kernel_mount can return -1 on error. */
	return ((error < 0) ? EDOOFUS : error);
}
#undef ERRMSGL
826
/*
 * Run the configuration script in "sb" until a root file system has been
 * mounted or the script is exhausted.
 *
 * A mount following "mpdevfs" on mountlist is what counts as "a root has
 * been mounted".  Each line is either a comment ('#'), a directive ('.')
 * or a mount specification.  Positive (errno) results from a line do not
 * stop processing; negative parser results (PE_*) do.
 *
 * Returns 0 when a root is mounted.  Otherwise the ".onfail" policy in
 * effect at the end of the script is applied: panic, reboot, start over,
 * or return the last error.
 */
static int
vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
{
	struct mount *mp;
	char *conf;
	int error;

	root_mount_mddev = -1;

retry:
	/* Every pass starts at the top with the default failure policy. */
	conf = sbuf_data(sb);
	mp = TAILQ_NEXT(mpdevfs, mnt_list);
	/*
	 * NOTE(review): the EDOOFUS assigned when a mount is already present
	 * is never returned; the loop is skipped and 0 is returned below.
	 */
	error = (mp == NULL) ? 0 : EDOOFUS;
	root_mount_onfail = A_CONTINUE;
	while (mp == NULL) {
		error = parse_skipto(&conf, CC_NONWHITESPACE);
		if (error == PE_EOL) {
			/* Blank line: step over the newline. */
			parse_advance(&conf);
			continue;
		}
		if (error < 0)
			break;
		switch (parse_peek(&conf)) {
		case '#':
			error = parse_skipto(&conf, '\n');
			break;
		case '.':
			error = parse_directive(&conf);
			break;
		default:
			error = parse_mount(&conf);
			if (error == -1) {
				/* Malformed line: report it and move on. */
				printf("mountroot: invalid file system "
				    "specification.\n");
				error = 0;
			}
			break;
		}
		if (error < 0)
			break;
		/* Ignore any trailing garbage on the line. */
		if (parse_peek(&conf) != '\n') {
			printf("mountroot: advancing to next directive...\n");
			(void)parse_skipto(&conf, '\n');
		}
		mp = TAILQ_NEXT(mpdevfs, mnt_list);
	}
	if (mp != NULL)
		return (0);

	/*
	 * We failed to mount (a new) root.
	 */
	switch (root_mount_onfail) {
	case A_CONTINUE:
		break;
	case A_PANIC:
		panic("mountroot: unable to (re-)mount root.");
		/* NOTREACHED */
	case A_RETRY:
		goto retry;
	case A_REBOOT:
		kern_reboot(RB_NOSYNC);
		/* NOTREACHED */
	}

	return (error);
}
895
896static void
897vfs_mountroot_conf0(struct sbuf *sb)
898{
899	char *s, *tok, *mnt, *opt;
900	int error;
901
902	sbuf_printf(sb, ".onfail panic\n");
903	sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
904	if (boothowto & RB_ASKNAME)
905		sbuf_printf(sb, ".ask\n");
906#ifdef ROOTDEVNAME
907	if (boothowto & RB_DFLTROOT)
908		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
909#endif
910	if (boothowto & RB_CDROM) {
911		sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
912		sbuf_printf(sb, ".timeout 0\n");
913		sbuf_printf(sb, "cd9660:/dev/cd1 ro\n");
914		sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
915	}
916	s = kern_getenv("vfs.root.mountfrom");
917	if (s != NULL) {
918		opt = kern_getenv("vfs.root.mountfrom.options");
919		tok = s;
920		error = parse_token(&tok, &mnt);
921		while (!error) {
922			sbuf_printf(sb, "%s %s\n", mnt,
923			    (opt != NULL) ? opt : "");
924			free(mnt, M_TEMP);
925			error = parse_token(&tok, &mnt);
926		}
927		if (opt != NULL)
928			freeenv(opt);
929		freeenv(s);
930	}
931	if (rootdevnames[0] != NULL)
932		sbuf_printf(sb, "%s\n", rootdevnames[0]);
933	if (rootdevnames[1] != NULL)
934		sbuf_printf(sb, "%s\n", rootdevnames[1]);
935#ifdef ROOTDEVNAME
936	if (!(boothowto & RB_DFLTROOT))
937		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
938#endif
939	if (!(boothowto & RB_ASKNAME))
940		sbuf_printf(sb, ".ask\n");
941}
942
943static int
944vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
945{
946	static char buf[128];
947	struct nameidata nd;
948	off_t ofs;
949	ssize_t resid;
950	int error, flags, len;
951
952	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
953	flags = FREAD;
954	error = vn_open(&nd, &flags, 0, NULL);
955	if (error)
956		return (error);
957
958	NDFREE(&nd, NDF_ONLY_PNBUF);
959	ofs = 0;
960	len = sizeof(buf) - 1;
961	while (1) {
962		error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
963		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
964		    NOCRED, &resid, td);
965		if (error)
966			break;
967		if (resid == len)
968			break;
969		buf[len - resid] = 0;
970		sbuf_printf(sb, "%s", buf);
971		ofs += len - resid;
972	}
973
974	VOP_UNLOCK(nd.ni_vp, 0);
975	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
976	return (error);
977}
978
979static void
980vfs_mountroot_wait(void)
981{
982	struct root_hold_token *h;
983	struct timeval lastfail;
984	int curfail;
985
986	TSENTER();
987
988	curfail = 0;
989	while (1) {
990		g_waitidle();
991		mtx_lock(&root_holds_mtx);
992		if (TAILQ_EMPTY(&root_holds)) {
993			mtx_unlock(&root_holds_mtx);
994			break;
995		}
996		if (ppsratecheck(&lastfail, &curfail, 1)) {
997			printf("Root mount waiting for:");
998			TAILQ_FOREACH(h, &root_holds, list)
999				printf(" %s", h->who);
1000			printf("\n");
1001		}
1002		TSWAIT("root mount");
1003		msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
1004		    hz);
1005		TSUNWAIT("root mount");
1006	}
1007
1008	TSEXIT();
1009}
1010
1011static int
1012vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev)
1013{
1014	int delay, timeout;
1015
1016	/*
1017	 * In case of ZFS and NFS we don't have a way to wait for
1018	 * specific device.  Also do the wait if the user forced that
1019	 * behaviour by setting vfs.root_mount_always_wait=1.
1020	 */
1021	if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL ||
1022	    dev[0] == '\0' || root_mount_always_wait != 0) {
1023		vfs_mountroot_wait();
1024		return (0);
1025	}
1026
1027	/*
1028	 * Otherwise, no point in waiting if the device is already there.
1029	 * Note that we must wait for GEOM to finish reconfiguring itself,
1030	 * eg for geom_part(4) to finish tasting.
1031	 */
1032	g_waitidle();
1033	if (parse_mount_dev_present(dev))
1034		return (0);
1035
1036	/*
1037	 * No luck.  Let's wait.  This code looks weird, but it's that way
1038	 * to behave exactly as it used to work before.
1039	 */
1040	vfs_mountroot_wait();
1041	printf("mountroot: waiting for device %s...\n", dev);
1042	delay = hz / 10;
1043	timeout = root_mount_timeout * hz;
1044	do {
1045		pause("rmdev", delay);
1046		timeout -= delay;
1047	} while (timeout > 0 && !parse_mount_dev_present(dev));
1048
1049	if (timeout <= 0)
1050		return (ENODEV);
1051
1052	return (0);
1053}
1054
1055void
1056vfs_mountroot(void)
1057{
1058	struct mount *mp;
1059	struct sbuf *sb;
1060	struct thread *td;
1061	time_t timebase;
1062	int error;
1063
1064	mtx_assert(&Giant, MA_NOTOWNED);
1065
1066	TSENTER();
1067
1068	td = curthread;
1069
1070	sb = sbuf_new_auto();
1071	vfs_mountroot_conf0(sb);
1072	sbuf_finish(sb);
1073
1074	error = vfs_mountroot_devfs(td, &mp);
1075	while (!error) {
1076		error = vfs_mountroot_parse(sb, mp);
1077		if (!error) {
1078			vfs_mountroot_shuffle(td, mp);
1079			sbuf_clear(sb);
1080			error = vfs_mountroot_readconf(td, sb);
1081			sbuf_finish(sb);
1082		}
1083	}
1084
1085	sbuf_delete(sb);
1086
1087	/*
1088	 * Iterate over all currently mounted file systems and use
1089	 * the time stamp found to check and/or initialize the RTC.
1090	 * Call inittodr() only once and pass it the largest of the
1091	 * timestamps we encounter.
1092	 */
1093	timebase = 0;
1094	mtx_lock(&mountlist_mtx);
1095	mp = TAILQ_FIRST(&mountlist);
1096	while (mp != NULL) {
1097		if (mp->mnt_time > timebase)
1098			timebase = mp->mnt_time;
1099		mp = TAILQ_NEXT(mp, mnt_list);
1100	}
1101	mtx_unlock(&mountlist_mtx);
1102	inittodr(timebase);
1103
1104	/* Keep prison0's root in sync with the global rootvnode. */
1105	mtx_lock(&prison0.pr_mtx);
1106	prison0.pr_root = rootvnode;
1107	vref(prison0.pr_root);
1108	mtx_unlock(&prison0.pr_mtx);
1109
1110	mtx_lock(&root_holds_mtx);
1111	atomic_store_rel_int(&root_mount_complete, 1);
1112	wakeup(&root_mount_complete);
1113	mtx_unlock(&root_holds_mtx);
1114
1115	EVENTHANDLER_INVOKE(mountroot);
1116
1117	TSEXIT();
1118}
1119
1120static struct mntarg *
1121parse_mountroot_options(struct mntarg *ma, const char *options)
1122{
1123	char *p;
1124	char *name, *name_arg;
1125	char *val, *val_arg;
1126	char *opts;
1127
1128	if (options == NULL || options[0] == '\0')
1129		return (ma);
1130
1131	p = opts = strdup(options, M_MOUNT);
1132	if (opts == NULL) {
1133		return (ma);
1134	}
1135
1136	while((name = strsep(&p, ",")) != NULL) {
1137		if (name[0] == '\0')
1138			break;
1139
1140		val = strchr(name, '=');
1141		if (val != NULL) {
1142			*val = '\0';
1143			++val;
1144		}
1145		if( strcmp(name, "rw") == 0 ||
1146		    strcmp(name, "noro") == 0) {
1147			/*
1148			 * The first time we mount the root file system,
1149			 * we need to mount 'ro', so We need to ignore
1150			 * 'rw' and 'noro' mount options.
1151			 */
1152			continue;
1153		}
1154		name_arg = strdup(name, M_MOUNT);
1155		val_arg = NULL;
1156		if (val != NULL)
1157			val_arg = strdup(val, M_MOUNT);
1158
1159		ma = mount_arg(ma, name_arg, val_arg,
1160		    (val_arg != NULL ? -1 : 0));
1161	}
1162	free(opts, M_MOUNT);
1163	return (ma);
1164}
1165