vfs_mountroot.c revision 297190
1/*-
2 * Copyright (c) 2010 Marcel Moolenaar
3 * Copyright (c) 1999-2004 Poul-Henning Kamp
4 * Copyright (c) 1999 Michael Smith
5 * Copyright (c) 1989, 1993
6 *      The Regents of the University of California.  All rights reserved.
7 * (c) UNIX System Laboratories, Inc.
8 * All or some portions of this file are derived from material licensed
9 * to the University of California by American Telephone and Telegraph
10 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11 * the permission of UNIX System Laboratories, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#include "opt_rootdevname.h"
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: head/sys/kern/vfs_mountroot.c 297190 2016-03-22 13:46:01Z trasz $");
42
43#include <sys/param.h>
44#include <sys/conf.h>
45#include <sys/cons.h>
46#include <sys/fcntl.h>
47#include <sys/jail.h>
48#include <sys/kernel.h>
49#include <sys/malloc.h>
50#include <sys/mdioctl.h>
51#include <sys/mount.h>
52#include <sys/mutex.h>
53#include <sys/namei.h>
54#include <sys/priv.h>
55#include <sys/proc.h>
56#include <sys/filedesc.h>
57#include <sys/reboot.h>
58#include <sys/sbuf.h>
59#include <sys/stat.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysproto.h>
62#include <sys/sx.h>
63#include <sys/sysctl.h>
64#include <sys/sysent.h>
65#include <sys/systm.h>
66#include <sys/vnode.h>
67
68#include <geom/geom.h>
69
70/*
71 * The root filesystem is detailed in the kernel environment variable
72 * vfs.root.mountfrom, which is expected to be in the general format
73 *
74 * <vfsname>:[<path>][	<vfsname>:[<path>] ...]
75 * vfsname   := the name of a VFS known to the kernel and capable
76 *              of being mounted as root
77 * path      := disk device name or other data used by the filesystem
78 *              to locate its physical store
79 *
80 * If the environment variable vfs.root.mountfrom is a space separated list,
81 * each list element is tried in turn and the root filesystem will be mounted
82 * from the first one that succeeds.
83 *
84 * The environment variable vfs.root.mountfrom.options is a comma delimited
85 * set of string mount options.  These mount options must be parseable
86 * by nmount() in the kernel.
87 */
88
89static int parse_mount(char **);
90static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
91static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
92static void vfs_mountroot_wait(void);
93static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
94
95/*
96 * The vnode of the system's root (/ in the filesystem, without chroot
97 * active.)
98 */
99struct vnode *rootvnode;
100
101/*
102 * Mount of the system's /dev.
103 */
104struct mount *rootdevmp;
105
106char *rootdevnames[2] = {NULL, NULL};
107
108struct mtx root_holds_mtx;
109MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);
110
/*
 * One outstanding hold on the root mount.  Tokens are created by
 * root_mount_hold(), linked on the root_holds list under root_holds_mtx,
 * and destroyed by root_mount_rel().
 */
111struct root_hold_token {
112	const char			*who;	/* holder's name, as passed to root_mount_hold() */
113	LIST_ENTRY(root_hold_token)	list;	/* linkage on the root_holds list */
114};
115
116static LIST_HEAD(, root_hold_token)	root_holds =
117    LIST_HEAD_INITIALIZER(root_holds);
118
/*
 * What vfs_mountroot_parse() does when no root could be mounted.  The value
 * is selected by the ".onfail" directive (see parse_dir_onfail()).
 */
119enum action {
120	A_CONTINUE,	/* return the error to the caller */
121	A_PANIC,	/* panic() */
122	A_REBOOT,	/* kern_reboot(RB_NOSYNC) */
123	A_RETRY		/* parse the configuration again from the start */
124};
125
126static enum action root_mount_onfail = A_CONTINUE;
127
128static int root_mount_mddev;
129static int root_mount_complete;
130
131/* By default wait up to 3 seconds for devices to appear. */
132static int root_mount_timeout = 3;
133TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
134
135SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold,
136    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
137    NULL, 0, sysctl_vfs_root_mount_hold, "A",
138    "List of root mount hold tokens");
139
140static int
141sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS)
142{
143	struct sbuf sb;
144	struct root_hold_token *h;
145	int error;
146
147	sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL);
148
149	mtx_lock(&root_holds_mtx);
150	LIST_FOREACH(h, &root_holds, list) {
151		if (h != LIST_FIRST(&root_holds))
152			sbuf_putc(&sb, ' ');
153		sbuf_printf(&sb, "%s", h->who);
154	}
155	mtx_unlock(&root_holds_mtx);
156
157	error = sbuf_finish(&sb);
158	if (error == 0)
159		error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
160	sbuf_delete(&sb);
161	return (error);
162}
163
164struct root_hold_token *
165root_mount_hold(const char *identifier)
166{
167	struct root_hold_token *h;
168
169	if (root_mounted())
170		return (NULL);
171
172	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
173	h->who = identifier;
174	mtx_lock(&root_holds_mtx);
175	LIST_INSERT_HEAD(&root_holds, h, list);
176	mtx_unlock(&root_holds_mtx);
177	return (h);
178}
179
180void
181root_mount_rel(struct root_hold_token *h)
182{
183
184	if (h == NULL)
185		return;
186	mtx_lock(&root_holds_mtx);
187	LIST_REMOVE(h, list);
188	wakeup(&root_holds);
189	mtx_unlock(&root_holds_mtx);
190	free(h, M_DEVBUF);
191}
192
193int
194root_mounted(void)
195{
196
197	/* No mutex is acquired here because int stores are atomic. */
198	return (root_mount_complete);
199}
200
201static void
202set_rootvnode(void)
203{
204	struct proc *p;
205
206	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
207		panic("Cannot find root vnode");
208
209	VOP_UNLOCK(rootvnode, 0);
210
211	p = curthread->td_proc;
212	FILEDESC_XLOCK(p->p_fd);
213
214	if (p->p_fd->fd_cdir != NULL)
215		vrele(p->p_fd->fd_cdir);
216	p->p_fd->fd_cdir = rootvnode;
217	VREF(rootvnode);
218
219	if (p->p_fd->fd_rdir != NULL)
220		vrele(p->p_fd->fd_rdir);
221	p->p_fd->fd_rdir = rootvnode;
222	VREF(rootvnode);
223
224	FILEDESC_XUNLOCK(p->p_fd);
225}
226
227static int
228vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
229{
230	struct vfsoptlist *opts;
231	struct vfsconf *vfsp;
232	struct mount *mp;
233	int error;
234
235	*mpp = NULL;
236
237	if (rootdevmp != NULL) {
238		/*
239		 * Already have /dev; this happens during rerooting.
240		 */
241		error = vfs_busy(rootdevmp, 0);
242		if (error != 0)
243			return (error);
244		*mpp = rootdevmp;
245	} else {
246		vfsp = vfs_byname("devfs");
247		KASSERT(vfsp != NULL, ("Could not find devfs by name"));
248		if (vfsp == NULL)
249			return (ENOENT);
250
251		mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
252
253		error = VFS_MOUNT(mp);
254		KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
255		if (error)
256			return (error);
257
258		opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
259		TAILQ_INIT(opts);
260		mp->mnt_opt = opts;
261
262		mtx_lock(&mountlist_mtx);
263		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
264		mtx_unlock(&mountlist_mtx);
265
266		*mpp = mp;
267		rootdevmp = mp;
268	}
269
270	set_rootvnode();
271
272	error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE);
273	if (error)
274		printf("kern_symlink /dev -> / returns %d\n", error);
275
276	return (error);
277}
278
279static void
280vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
281{
282	struct nameidata nd;
283	struct mount *mporoot, *mpnroot;
284	struct vnode *vp, *vporoot, *vpdevfs;
285	char *fspath;
286	int error;
287
288	mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);
289
290	/* Shuffle the mountlist. */
291	mtx_lock(&mountlist_mtx);
292	mporoot = TAILQ_FIRST(&mountlist);
293	TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
294	if (mporoot != mpdevfs) {
295		TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
296		TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
297	}
298	TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
299	mtx_unlock(&mountlist_mtx);
300
301	cache_purgevfs(mporoot);
302	if (mporoot != mpdevfs)
303		cache_purgevfs(mpdevfs);
304
305	VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot);
306
307	VI_LOCK(vporoot);
308	vporoot->v_iflag &= ~VI_MOUNT;
309	VI_UNLOCK(vporoot);
310	vporoot->v_mountedhere = NULL;
311	mporoot->mnt_flag &= ~MNT_ROOTFS;
312	mporoot->mnt_vnodecovered = NULL;
313	vput(vporoot);
314
315	/* Set up the new rootvnode, and purge the cache */
316	mpnroot->mnt_vnodecovered = NULL;
317	set_rootvnode();
318	cache_purgevfs(rootvnode->v_mount);
319
320	if (mporoot != mpdevfs) {
321		/* Remount old root under /.mount or /mnt */
322		fspath = "/.mount";
323		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
324		    fspath, td);
325		error = namei(&nd);
326		if (error) {
327			NDFREE(&nd, NDF_ONLY_PNBUF);
328			fspath = "/mnt";
329			NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
330			    fspath, td);
331			error = namei(&nd);
332		}
333		if (!error) {
334			vp = nd.ni_vp;
335			error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
336			if (!error)
337				error = vinvalbuf(vp, V_SAVE, 0, 0);
338			if (!error) {
339				cache_purge(vp);
340				mporoot->mnt_vnodecovered = vp;
341				vp->v_mountedhere = mporoot;
342				strlcpy(mporoot->mnt_stat.f_mntonname,
343				    fspath, MNAMELEN);
344				VOP_UNLOCK(vp, 0);
345			} else
346				vput(vp);
347		}
348		NDFREE(&nd, NDF_ONLY_PNBUF);
349
350		if (error && bootverbose)
351			printf("mountroot: unable to remount previous root "
352			    "under /.mount or /mnt (error %d).\n", error);
353	}
354
355	/* Remount devfs under /dev */
356	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
357	error = namei(&nd);
358	if (!error) {
359		vp = nd.ni_vp;
360		error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
361		if (!error)
362			error = vinvalbuf(vp, V_SAVE, 0, 0);
363		if (!error) {
364			vpdevfs = mpdevfs->mnt_vnodecovered;
365			if (vpdevfs != NULL) {
366				cache_purge(vpdevfs);
367				vpdevfs->v_mountedhere = NULL;
368				vrele(vpdevfs);
369			}
370			mpdevfs->mnt_vnodecovered = vp;
371			vp->v_mountedhere = mpdevfs;
372			VOP_UNLOCK(vp, 0);
373		} else
374			vput(vp);
375	}
376	if (error && bootverbose)
377		printf("mountroot: unable to remount devfs under /dev "
378		    "(error %d).\n", error);
379	NDFREE(&nd, NDF_ONLY_PNBUF);
380
381	if (mporoot == mpdevfs) {
382		vfs_unbusy(mpdevfs);
383		/* Unlink the no longer needed /dev/dev -> / symlink */
384		error = kern_unlinkat(td, AT_FDCWD, "/dev/dev",
385		    UIO_SYSSPACE, 0);
386		if (error && bootverbose)
387			printf("mountroot: unable to unlink /dev/dev "
388			    "(error %d)\n", error);
389	}
390}
391
392/*
393 * Configuration parser.
394 */
395
396/* Parser character classes. */
397#define	CC_WHITESPACE		-1
398#define	CC_NONWHITESPACE	-2
399
400/* Parse errors. */
401#define	PE_EOF			-1
402#define	PE_EOL			-2
403
/* Return the character at the current parse position without consuming it. */
static __inline int
parse_peek(char **conf)
{
	const char *cursor;

	cursor = *conf;
	return (cursor[0]);
}
410
/* Overwrite the character at the current parse position with `c'. */
static __inline void
parse_poke(char **conf, int c)
{
	char *cursor;

	cursor = *conf;
	cursor[0] = c;
}
417
/* Move the parse position forward by one character. */
static __inline void
parse_advance(char **conf)
{

	*conf = *conf + 1;
}
424
425static int
426parse_skipto(char **conf, int mc)
427{
428	int c, match;
429
430	while (1) {
431		c = parse_peek(conf);
432		if (c == 0)
433			return (PE_EOF);
434		switch (mc) {
435		case CC_WHITESPACE:
436			match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
437			break;
438		case CC_NONWHITESPACE:
439			if (c == '\n')
440				return (PE_EOL);
441			match = (c != ' ' && c != '\t') ? 1 : 0;
442			break;
443		default:
444			match = (c == mc) ? 1 : 0;
445			break;
446		}
447		if (match)
448			break;
449		parse_advance(conf);
450	}
451	return (0);
452}
453
454static int
455parse_token(char **conf, char **tok)
456{
457	char *p;
458	size_t len;
459	int error;
460
461	*tok = NULL;
462	error = parse_skipto(conf, CC_NONWHITESPACE);
463	if (error)
464		return (error);
465	p = *conf;
466	error = parse_skipto(conf, CC_WHITESPACE);
467	len = *conf - p;
468	*tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
469	bcopy(p, *tok, len);
470	return (0);
471}
472
473static void
474parse_dir_ask_printenv(const char *var)
475{
476	char *val;
477
478	val = kern_getenv(var);
479	if (val != NULL) {
480		printf("  %s=%s\n", var, val);
481		freeenv(val);
482	}
483}
484
485static int
486parse_dir_ask(char **conf)
487{
488	char name[80];
489	char *mnt;
490	int error;
491
492	vfs_mountroot_wait();
493
494	printf("\nLoader variables:\n");
495	parse_dir_ask_printenv("vfs.root.mountfrom");
496	parse_dir_ask_printenv("vfs.root.mountfrom.options");
497
498	printf("\nManual root filesystem specification:\n");
499	printf("  <fstype>:<device> [options]\n");
500	printf("      Mount <device> using filesystem <fstype>\n");
501	printf("      and with the specified (optional) option list.\n");
502	printf("\n");
503	printf("    eg. ufs:/dev/da0s1a\n");
504	printf("        zfs:tank\n");
505	printf("        cd9660:/dev/cd0 ro\n");
506	printf("          (which is equivalent to: ");
507	printf("mount -t cd9660 -o ro /dev/cd0 /)\n");
508	printf("\n");
509	printf("  ?               List valid disk boot devices\n");
510	printf("  .               Yield 1 second (for background tasks)\n");
511	printf("  <empty line>    Abort manual input\n");
512
513	do {
514		error = EINVAL;
515		printf("\nmountroot> ");
516		cngets(name, sizeof(name), GETS_ECHO);
517		if (name[0] == '\0')
518			break;
519		if (name[0] == '?' && name[1] == '\0') {
520			printf("\nList of GEOM managed disk devices:\n  ");
521			g_dev_print();
522			continue;
523		}
524		if (name[0] == '.' && name[1] == '\0') {
525			pause("rmask", hz);
526			continue;
527		}
528		mnt = name;
529		error = parse_mount(&mnt);
530		if (error == -1)
531			printf("Invalid file system specification.\n");
532	} while (error != 0);
533
534	return (error);
535}
536
537static int
538parse_dir_md(char **conf)
539{
540	struct stat sb;
541	struct thread *td;
542	struct md_ioctl *mdio;
543	char *path, *tok;
544	int error, fd, len;
545
546	td = curthread;
547
548	error = parse_token(conf, &tok);
549	if (error)
550		return (error);
551
552	len = strlen(tok);
553	mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
554	path = (void *)(mdio + 1);
555	bcopy(tok, path, len);
556	free(tok, M_TEMP);
557
558	/* Get file status. */
559	error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL);
560	if (error)
561		goto out;
562
563	/* Open /dev/mdctl so that we can attach/detach. */
564	error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE,
565	    O_RDWR, 0);
566	if (error)
567		goto out;
568
569	fd = td->td_retval[0];
570	mdio->md_version = MDIOVERSION;
571	mdio->md_type = MD_VNODE;
572
573	if (root_mount_mddev != -1) {
574		mdio->md_unit = root_mount_mddev;
575		DROP_GIANT();
576		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
577		PICKUP_GIANT();
578		/* Ignore errors. We don't care. */
579		root_mount_mddev = -1;
580	}
581
582	mdio->md_file = (void *)(mdio + 1);
583	mdio->md_options = MD_AUTOUNIT | MD_READONLY;
584	mdio->md_mediasize = sb.st_size;
585	mdio->md_unit = 0;
586	DROP_GIANT();
587	error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
588	PICKUP_GIANT();
589	if (error)
590		goto out;
591
592	if (mdio->md_unit > 9) {
593		printf("rootmount: too many md units\n");
594		mdio->md_file = NULL;
595		mdio->md_options = 0;
596		mdio->md_mediasize = 0;
597		DROP_GIANT();
598		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
599		PICKUP_GIANT();
600		/* Ignore errors. We don't care. */
601		error = ERANGE;
602		goto out;
603	}
604
605	root_mount_mddev = mdio->md_unit;
606	printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
607
608	error = kern_close(td, fd);
609
610 out:
611	free(mdio, M_TEMP);
612	return (error);
613}
614
615static int
616parse_dir_onfail(char **conf)
617{
618	char *action;
619	int error;
620
621	error = parse_token(conf, &action);
622	if (error)
623		return (error);
624
625	if (!strcmp(action, "continue"))
626		root_mount_onfail = A_CONTINUE;
627	else if (!strcmp(action, "panic"))
628		root_mount_onfail = A_PANIC;
629	else if (!strcmp(action, "reboot"))
630		root_mount_onfail = A_REBOOT;
631	else if (!strcmp(action, "retry"))
632		root_mount_onfail = A_RETRY;
633	else {
634		printf("rootmount: %s: unknown action\n", action);
635		error = EINVAL;
636	}
637
638	free(action, M_TEMP);
639	return (0);
640}
641
642static int
643parse_dir_timeout(char **conf)
644{
645	char *tok, *endtok;
646	long secs;
647	int error;
648
649	error = parse_token(conf, &tok);
650	if (error)
651		return (error);
652
653	secs = strtol(tok, &endtok, 0);
654	error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
655	if (!error)
656		root_mount_timeout = secs;
657	free(tok, M_TEMP);
658	return (error);
659}
660
661static int
662parse_directive(char **conf)
663{
664	char *dir;
665	int error;
666
667	error = parse_token(conf, &dir);
668	if (error)
669		return (error);
670
671	if (strcmp(dir, ".ask") == 0)
672		error = parse_dir_ask(conf);
673	else if (strcmp(dir, ".md") == 0)
674		error = parse_dir_md(conf);
675	else if (strcmp(dir, ".onfail") == 0)
676		error = parse_dir_onfail(conf);
677	else if (strcmp(dir, ".timeout") == 0)
678		error = parse_dir_timeout(conf);
679	else {
680		printf("mountroot: invalid directive `%s'\n", dir);
681		/* Ignore the rest of the line. */
682		(void)parse_skipto(conf, '\n');
683		error = EINVAL;
684	}
685	free(dir, M_TEMP);
686	return (error);
687}
688
689static int
690parse_mount_dev_present(const char *dev)
691{
692	struct nameidata nd;
693	int error;
694
695	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
696	error = namei(&nd);
697	if (!error)
698		vput(nd.ni_vp);
699	NDFREE(&nd, NDF_ONLY_PNBUF);
700	return (error != 0) ? 0 : 1;
701}
702
703#define	ERRMSGL	255
704static int
705parse_mount(char **conf)
706{
707	char *errmsg;
708	struct mntarg *ma;
709	char *dev, *fs, *opts, *tok;
710	int error;
711
712	error = parse_token(conf, &tok);
713	if (error)
714		return (error);
715	fs = tok;
716	error = parse_skipto(&tok, ':');
717	if (error) {
718		free(fs, M_TEMP);
719		return (error);
720	}
721	parse_poke(&tok, '\0');
722	parse_advance(&tok);
723	dev = tok;
724
725	if (root_mount_mddev != -1) {
726		/* Handle substitution for the md unit number. */
727		tok = strstr(dev, "md#");
728		if (tok != NULL)
729			tok[2] = '0' + root_mount_mddev;
730	}
731
732	/* Parse options. */
733	error = parse_token(conf, &tok);
734	opts = (error == 0) ? tok : NULL;
735
736	printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
737	    (opts != NULL) ? opts : "");
738
739	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
740
741	if (vfs_byname(fs) == NULL) {
742		strlcpy(errmsg, "unknown file system", ERRMSGL);
743		error = ENOENT;
744		goto out;
745	}
746
747	error = vfs_mountroot_wait_if_neccessary(fs, dev);
748	if (error != 0)
749		goto out;
750
751	ma = NULL;
752	ma = mount_arg(ma, "fstype", fs, -1);
753	ma = mount_arg(ma, "fspath", "/", -1);
754	ma = mount_arg(ma, "from", dev, -1);
755	ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
756	ma = mount_arg(ma, "ro", NULL, 0);
757	ma = parse_mountroot_options(ma, opts);
758	error = kernel_mount(ma, MNT_ROOTFS);
759
760 out:
761	if (error) {
762		printf("Mounting from %s:%s failed with error %d",
763		    fs, dev, error);
764		if (errmsg[0] != '\0')
765			printf(": %s", errmsg);
766		printf(".\n");
767	}
768	free(fs, M_TEMP);
769	free(errmsg, M_TEMP);
770	if (opts != NULL)
771		free(opts, M_TEMP);
772	/* kernel_mount can return -1 on error. */
773	return ((error < 0) ? EDOOFUS : error);
774}
775#undef ERRMSGL
776
777static int
778vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
779{
780	struct mount *mp;
781	char *conf;
782	int error;
783
784	root_mount_mddev = -1;
785
786retry:
787	conf = sbuf_data(sb);
788	mp = TAILQ_NEXT(mpdevfs, mnt_list);
789	error = (mp == NULL) ? 0 : EDOOFUS;
790	root_mount_onfail = A_CONTINUE;
791	while (mp == NULL) {
792		error = parse_skipto(&conf, CC_NONWHITESPACE);
793		if (error == PE_EOL) {
794			parse_advance(&conf);
795			continue;
796		}
797		if (error < 0)
798			break;
799		switch (parse_peek(&conf)) {
800		case '#':
801			error = parse_skipto(&conf, '\n');
802			break;
803		case '.':
804			error = parse_directive(&conf);
805			break;
806		default:
807			error = parse_mount(&conf);
808			if (error == -1) {
809				printf("mountroot: invalid file system "
810				    "specification.\n");
811				error = 0;
812			}
813			break;
814		}
815		if (error < 0)
816			break;
817		/* Ignore any trailing garbage on the line. */
818		if (parse_peek(&conf) != '\n') {
819			printf("mountroot: advancing to next directive...\n");
820			(void)parse_skipto(&conf, '\n');
821		}
822		mp = TAILQ_NEXT(mpdevfs, mnt_list);
823	}
824	if (mp != NULL)
825		return (0);
826
827	/*
828	 * We failed to mount (a new) root.
829	 */
830	switch (root_mount_onfail) {
831	case A_CONTINUE:
832		break;
833	case A_PANIC:
834		panic("mountroot: unable to (re-)mount root.");
835		/* NOTREACHED */
836	case A_RETRY:
837		goto retry;
838	case A_REBOOT:
839		kern_reboot(RB_NOSYNC);
840		/* NOTREACHED */
841	}
842
843	return (error);
844}
845
846static void
847vfs_mountroot_conf0(struct sbuf *sb)
848{
849	char *s, *tok, *mnt, *opt;
850	int error;
851
852	sbuf_printf(sb, ".onfail panic\n");
853	sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
854	if (boothowto & RB_ASKNAME)
855		sbuf_printf(sb, ".ask\n");
856#ifdef ROOTDEVNAME
857	if (boothowto & RB_DFLTROOT)
858		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
859#endif
860	if (boothowto & RB_CDROM) {
861		sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
862		sbuf_printf(sb, ".timeout 0\n");
863		sbuf_printf(sb, "cd9660:/dev/cd1 ro\n");
864		sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
865	}
866	s = kern_getenv("vfs.root.mountfrom");
867	if (s != NULL) {
868		opt = kern_getenv("vfs.root.mountfrom.options");
869		tok = s;
870		error = parse_token(&tok, &mnt);
871		while (!error) {
872			sbuf_printf(sb, "%s %s\n", mnt,
873			    (opt != NULL) ? opt : "");
874			free(mnt, M_TEMP);
875			error = parse_token(&tok, &mnt);
876		}
877		if (opt != NULL)
878			freeenv(opt);
879		freeenv(s);
880	}
881	if (rootdevnames[0] != NULL)
882		sbuf_printf(sb, "%s\n", rootdevnames[0]);
883	if (rootdevnames[1] != NULL)
884		sbuf_printf(sb, "%s\n", rootdevnames[1]);
885#ifdef ROOTDEVNAME
886	if (!(boothowto & RB_DFLTROOT))
887		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
888#endif
889	if (!(boothowto & RB_ASKNAME))
890		sbuf_printf(sb, ".ask\n");
891}
892
893static int
894vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
895{
896	static char buf[128];
897	struct nameidata nd;
898	off_t ofs;
899	ssize_t resid;
900	int error, flags, len;
901
902	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
903	flags = FREAD;
904	error = vn_open(&nd, &flags, 0, NULL);
905	if (error)
906		return (error);
907
908	NDFREE(&nd, NDF_ONLY_PNBUF);
909	ofs = 0;
910	len = sizeof(buf) - 1;
911	while (1) {
912		error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
913		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
914		    NOCRED, &resid, td);
915		if (error)
916			break;
917		if (resid == len)
918			break;
919		buf[len - resid] = 0;
920		sbuf_printf(sb, "%s", buf);
921		ofs += len - resid;
922	}
923
924	VOP_UNLOCK(nd.ni_vp, 0);
925	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
926	return (error);
927}
928
929static void
930vfs_mountroot_wait(void)
931{
932	struct root_hold_token *h;
933	struct timeval lastfail;
934	int curfail;
935
936	curfail = 0;
937	while (1) {
938		DROP_GIANT();
939		g_waitidle();
940		PICKUP_GIANT();
941		mtx_lock(&root_holds_mtx);
942		if (LIST_EMPTY(&root_holds)) {
943			mtx_unlock(&root_holds_mtx);
944			break;
945		}
946		if (ppsratecheck(&lastfail, &curfail, 1)) {
947			printf("Root mount waiting for:");
948			LIST_FOREACH(h, &root_holds, list)
949				printf(" %s", h->who);
950			printf("\n");
951		}
952		msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
953		    hz);
954	}
955}
956
957static int
958vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev)
959{
960	int delay, timeout;
961
962	/*
963	 * In case of ZFS and NFS we don't have a way to wait for
964	 * specific device.
965	 */
966	if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL ||
967	    dev[0] == '\0') {
968		vfs_mountroot_wait();
969		return (0);
970	}
971
972	/*
973	 * Otherwise, no point in waiting if the device is already there.
974	 * Note that we must wait for GEOM to finish reconfiguring itself,
975	 * eg for geom_part(4) to finish tasting.
976	 */
977	DROP_GIANT();
978	g_waitidle();
979	PICKUP_GIANT();
980	if (parse_mount_dev_present(dev))
981		return (0);
982
983	/*
984	 * No luck.  Let's wait.  This code looks weird, but it's that way
985	 * to behave exactly as it used to work before.
986	 */
987	vfs_mountroot_wait();
988	printf("mountroot: waiting for device %s...\n", dev);
989	delay = hz / 10;
990	timeout = root_mount_timeout * hz;
991	do {
992		pause("rmdev", delay);
993		timeout -= delay;
994	} while (timeout > 0 && !parse_mount_dev_present(dev));
995
996	if (timeout <= 0)
997		return (ENODEV);
998
999	return (0);
1000}
1001
1002void
1003vfs_mountroot(void)
1004{
1005	struct mount *mp;
1006	struct sbuf *sb;
1007	struct thread *td;
1008	time_t timebase;
1009	int error;
1010
1011	td = curthread;
1012
1013	sb = sbuf_new_auto();
1014	vfs_mountroot_conf0(sb);
1015	sbuf_finish(sb);
1016
1017	error = vfs_mountroot_devfs(td, &mp);
1018	while (!error) {
1019		error = vfs_mountroot_parse(sb, mp);
1020		if (!error) {
1021			vfs_mountroot_shuffle(td, mp);
1022			sbuf_clear(sb);
1023			error = vfs_mountroot_readconf(td, sb);
1024			sbuf_finish(sb);
1025		}
1026	}
1027
1028	sbuf_delete(sb);
1029
1030	/*
1031	 * Iterate over all currently mounted file systems and use
1032	 * the time stamp found to check and/or initialize the RTC.
1033	 * Call inittodr() only once and pass it the largest of the
1034	 * timestamps we encounter.
1035	 */
1036	timebase = 0;
1037	mtx_lock(&mountlist_mtx);
1038	mp = TAILQ_FIRST(&mountlist);
1039	while (mp != NULL) {
1040		if (mp->mnt_time > timebase)
1041			timebase = mp->mnt_time;
1042		mp = TAILQ_NEXT(mp, mnt_list);
1043	}
1044	mtx_unlock(&mountlist_mtx);
1045	inittodr(timebase);
1046
1047	/* Keep prison0's root in sync with the global rootvnode. */
1048	mtx_lock(&prison0.pr_mtx);
1049	prison0.pr_root = rootvnode;
1050	vref(prison0.pr_root);
1051	mtx_unlock(&prison0.pr_mtx);
1052
1053	mtx_lock(&root_holds_mtx);
1054	atomic_store_rel_int(&root_mount_complete, 1);
1055	wakeup(&root_mount_complete);
1056	mtx_unlock(&root_holds_mtx);
1057
1058	EVENTHANDLER_INVOKE(mountroot);
1059}
1060
1061static struct mntarg *
1062parse_mountroot_options(struct mntarg *ma, const char *options)
1063{
1064	char *p;
1065	char *name, *name_arg;
1066	char *val, *val_arg;
1067	char *opts;
1068
1069	if (options == NULL || options[0] == '\0')
1070		return (ma);
1071
1072	p = opts = strdup(options, M_MOUNT);
1073	if (opts == NULL) {
1074		return (ma);
1075	}
1076
1077	while((name = strsep(&p, ",")) != NULL) {
1078		if (name[0] == '\0')
1079			break;
1080
1081		val = strchr(name, '=');
1082		if (val != NULL) {
1083			*val = '\0';
1084			++val;
1085		}
1086		if( strcmp(name, "rw") == 0 ||
1087		    strcmp(name, "noro") == 0) {
1088			/*
1089			 * The first time we mount the root file system,
1090			 * we need to mount 'ro', so We need to ignore
1091			 * 'rw' and 'noro' mount options.
1092			 */
1093			continue;
1094		}
1095		name_arg = strdup(name, M_MOUNT);
1096		val_arg = NULL;
1097		if (val != NULL)
1098			val_arg = strdup(val, M_MOUNT);
1099
1100		ma = mount_arg(ma, name_arg, val_arg,
1101		    (val_arg != NULL ? -1 : 0));
1102	}
1103	free(opts, M_MOUNT);
1104	return (ma);
1105}
1106