vfs_lookup.c revision 193028
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_lookup.c 193028 2009-05-29 10:02:44Z des $");
39
40#include "opt_kdtrace.h"
41#include "opt_ktrace.h"
42#include "opt_mac.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/fcntl.h>
48#include <sys/jail.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/namei.h>
52#include <sys/vnode.h>
53#include <sys/mount.h>
54#include <sys/filedesc.h>
55#include <sys/proc.h>
56#include <sys/sdt.h>
57#include <sys/syscallsubr.h>
58#include <sys/sysctl.h>
59#ifdef KTRACE
60#include <sys/ktrace.h>
61#endif
62
63#include <security/audit/audit.h>
64#include <security/mac/mac_framework.h>
65
66#include <vm/uma.h>
67
68#define	NAMEI_DIAGNOSTIC 1
69#undef NAMEI_DIAGNOSTIC
70
/*
 * Statically defined tracing probes for namei().
 *
 * vfs:namei:lookup:entry fires once per namei() call, after the starting
 * directory has been chosen, with the starting directory vnode, the
 * pathname buffer and the componentname flags as arguments.
 * vfs:namei:lookup:return fires when namei() returns after that point,
 * with the error code and the resulting vnode (NULL on failure).
 */
71SDT_PROVIDER_DECLARE(vfs);
72SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
73    "unsigned long");
74SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
75
76/*
77 * Allocation zone for namei
78 */
79uma_zone_t namei_zone;
80/*
81 * Placeholder vnode for mp traversal
82 */
83static struct vnode *vp_crossmp;
84
85static void
86nameiinit(void *dummy __unused)
87{
88	int error;
89
90	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
91	    UMA_ALIGN_PTR, 0);
92	error = getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
93	if (error != 0)
94		panic("nameiinit: getnewvnode");
95	VN_LOCK_ASHARE(vp_crossmp);
96}
97SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
98
/*
 * Global switch for shared vnode locking during path name translation.
 * When zero, namei() strips LOCKSHARED from the request and lookup()
 * starts out with exclusive instead of shared locks.  Adjustable at run
 * time through the vfs.lookup_shared sysctl and at boot through the
 * tunable of the same name.
 */
99static int lookup_shared = 1;
100SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
101    "Enables/Disables shared locks for path name translation");
102TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
103
104/*
105 * Convert a pathname into a pointer to a locked vnode.
106 *
107 * The FOLLOW flag is set when symbolic links are to be followed
108 * when they occur at the end of the name translation process.
109 * Symbolic links are always followed for all other pathname
110 * components other than the last.
111 *
112 * The segflg defines whether the name is to be copied from user
113 * space or kernel space.
114 *
115 * Overall outline of namei:
116 *
117 *	copy in name
118 *	get starting directory
119 *	while (!done && !error) {
120 *		call lookup to search path.
121 *		if symbolic link, massage name in buffer and continue
122 *	}
123 */
124int
125namei(struct nameidata *ndp)
126{
127	struct filedesc *fdp;	/* pointer to file descriptor state */
128	char *cp;		/* pointer into pathname argument */
129	struct vnode *dp;	/* the directory we are searching */
130	struct iovec aiov;		/* uio for reading symbolic links */
131	struct uio auio;
132	int error, linklen;
133	struct componentname *cnp = &ndp->ni_cnd;
134	struct thread *td = cnp->cn_thread;
135	struct proc *p = td->td_proc;
136	int vfslocked;
137
138	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
139	    ("NOT MPSAFE and Giant not held"));
140	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
141	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
142	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
143	    ("namei: nameiop contaminated with flags"));
144	KASSERT((cnp->cn_flags & OPMASK) == 0,
145	    ("namei: flags contaminated with nameiops"));
146	if (!lookup_shared)
147		cnp->cn_flags &= ~LOCKSHARED;
148	fdp = p->p_fd;
149
150	/* We will set this ourselves if we need it. */
151	cnp->cn_flags &= ~TRAILINGSLASH;
152
153	/*
154	 * Get a buffer for the name to be translated, and copy the
155	 * name into the buffer.
156	 */
157	if ((cnp->cn_flags & HASBUF) == 0)
158		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
159	if (ndp->ni_segflg == UIO_SYSSPACE)
160		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
161			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
162	else
163		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
164			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
165
166	/* If we are auditing the kernel pathname, save the user pathname. */
167	if (cnp->cn_flags & AUDITVNODE1)
168		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
169	if (cnp->cn_flags & AUDITVNODE2)
170		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
171
172	/*
173	 * Don't allow empty pathnames.
174	 */
175	if (!error && *cnp->cn_pnbuf == '\0')
176		error = ENOENT;
177
178	if (error) {
179		uma_zfree(namei_zone, cnp->cn_pnbuf);
180#ifdef DIAGNOSTIC
181		cnp->cn_pnbuf = NULL;
182		cnp->cn_nameptr = NULL;
183#endif
184		ndp->ni_vp = NULL;
185		return (error);
186	}
187	ndp->ni_loopcnt = 0;
188#ifdef KTRACE
189	if (KTRPOINT(td, KTR_NAMEI)) {
190		KASSERT(cnp->cn_thread == curthread,
191		    ("namei not using curthread"));
192		ktrnamei(cnp->cn_pnbuf);
193	}
194#endif
195	/*
196	 * Get starting point for the translation.
197	 */
198	FILEDESC_SLOCK(fdp);
199	ndp->ni_rootdir = fdp->fd_rdir;
200	ndp->ni_topdir = fdp->fd_jdir;
201
202	dp = NULL;
203	if (cnp->cn_pnbuf[0] != '/') {
204		if (ndp->ni_startdir != NULL) {
205			dp = ndp->ni_startdir;
206			error = 0;
207		} else if (ndp->ni_dirfd != AT_FDCWD)
208			error = fgetvp(td, ndp->ni_dirfd, &dp);
209		if (error != 0 || dp != NULL) {
210			FILEDESC_SUNLOCK(fdp);
211			if (error == 0 && dp->v_type != VDIR) {
212				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
213				vrele(dp);
214				VFS_UNLOCK_GIANT(vfslocked);
215				error = ENOTDIR;
216			}
217		}
218		if (error) {
219			uma_zfree(namei_zone, cnp->cn_pnbuf);
220#ifdef DIAGNOSTIC
221			cnp->cn_pnbuf = NULL;
222			cnp->cn_nameptr = NULL;
223#endif
224			return (error);
225		}
226	}
227	if (dp == NULL) {
228		dp = fdp->fd_cdir;
229		VREF(dp);
230		FILEDESC_SUNLOCK(fdp);
231		if (ndp->ni_startdir != NULL) {
232			vfslocked = VFS_LOCK_GIANT(ndp->ni_startdir->v_mount);
233			vrele(ndp->ni_startdir);
234			VFS_UNLOCK_GIANT(vfslocked);
235		}
236	}
237	SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
238	    cnp->cn_flags, 0, 0);
239	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
240	for (;;) {
241		/*
242		 * Check if root directory should replace current directory.
243		 * Done at start of translation and after symbolic link.
244		 */
245		cnp->cn_nameptr = cnp->cn_pnbuf;
246		if (*(cnp->cn_nameptr) == '/') {
247			vrele(dp);
248			VFS_UNLOCK_GIANT(vfslocked);
249			while (*(cnp->cn_nameptr) == '/') {
250				cnp->cn_nameptr++;
251				ndp->ni_pathlen--;
252			}
253			dp = ndp->ni_rootdir;
254			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
255			VREF(dp);
256		}
257		if (vfslocked)
258			ndp->ni_cnd.cn_flags |= GIANTHELD;
259		ndp->ni_startdir = dp;
260		error = lookup(ndp);
261		if (error) {
262			uma_zfree(namei_zone, cnp->cn_pnbuf);
263#ifdef DIAGNOSTIC
264			cnp->cn_pnbuf = NULL;
265			cnp->cn_nameptr = NULL;
266#endif
267			SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0,
268			    0, 0);
269			return (error);
270		}
271		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
272		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
273		/*
274		 * If not a symbolic link, we're done.
275		 */
276		if ((cnp->cn_flags & ISSYMLINK) == 0) {
277			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
278				uma_zfree(namei_zone, cnp->cn_pnbuf);
279#ifdef DIAGNOSTIC
280				cnp->cn_pnbuf = NULL;
281				cnp->cn_nameptr = NULL;
282#endif
283			} else
284				cnp->cn_flags |= HASBUF;
285
286			if ((cnp->cn_flags & MPSAFE) == 0) {
287				VFS_UNLOCK_GIANT(vfslocked);
288			} else if (vfslocked)
289				ndp->ni_cnd.cn_flags |= GIANTHELD;
290			SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp,
291			    0, 0, 0);
292			return (0);
293		}
294		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
295			error = ELOOP;
296			break;
297		}
298#ifdef MAC
299		if ((cnp->cn_flags & NOMACCHECK) == 0) {
300			error = mac_vnode_check_readlink(td->td_ucred,
301			    ndp->ni_vp);
302			if (error)
303				break;
304		}
305#endif
306		if (ndp->ni_pathlen > 1)
307			cp = uma_zalloc(namei_zone, M_WAITOK);
308		else
309			cp = cnp->cn_pnbuf;
310		aiov.iov_base = cp;
311		aiov.iov_len = MAXPATHLEN;
312		auio.uio_iov = &aiov;
313		auio.uio_iovcnt = 1;
314		auio.uio_offset = 0;
315		auio.uio_rw = UIO_READ;
316		auio.uio_segflg = UIO_SYSSPACE;
317		auio.uio_td = (struct thread *)0;
318		auio.uio_resid = MAXPATHLEN;
319		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
320		if (error) {
321			if (ndp->ni_pathlen > 1)
322				uma_zfree(namei_zone, cp);
323			break;
324		}
325		linklen = MAXPATHLEN - auio.uio_resid;
326		if (linklen == 0) {
327			if (ndp->ni_pathlen > 1)
328				uma_zfree(namei_zone, cp);
329			error = ENOENT;
330			break;
331		}
332		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
333			if (ndp->ni_pathlen > 1)
334				uma_zfree(namei_zone, cp);
335			error = ENAMETOOLONG;
336			break;
337		}
338		if (ndp->ni_pathlen > 1) {
339			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
340			uma_zfree(namei_zone, cnp->cn_pnbuf);
341			cnp->cn_pnbuf = cp;
342		} else
343			cnp->cn_pnbuf[linklen] = '\0';
344		ndp->ni_pathlen += linklen;
345		vput(ndp->ni_vp);
346		dp = ndp->ni_dvp;
347	}
348	uma_zfree(namei_zone, cnp->cn_pnbuf);
349#ifdef DIAGNOSTIC
350	cnp->cn_pnbuf = NULL;
351	cnp->cn_nameptr = NULL;
352#endif
353	vput(ndp->ni_vp);
354	ndp->ni_vp = NULL;
355	vrele(ndp->ni_dvp);
356	VFS_UNLOCK_GIANT(vfslocked);
357	SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0);
358	return (error);
359}
360
361static int
362compute_cn_lkflags(struct mount *mp, int lkflags)
363{
364
365	if (mp == NULL ||
366	    ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
367		lkflags &= ~LK_SHARED;
368		lkflags |= LK_EXCLUSIVE;
369	}
370	return (lkflags);
371}
372
373static __inline int
374needs_exclusive_leaf(struct mount *mp, int flags)
375{
376
377	/*
378	 * Intermediate nodes can use shared locks, we only need to
379	 * force an exclusive lock for leaf nodes.
380	 */
381	if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
382		return (0);
383
384	/* Always use exclusive locks if LOCKSHARED isn't set. */
385	if (!(flags & LOCKSHARED))
386		return (1);
387
388	/*
389	 * For lookups during open(), if the mount point supports
390	 * extended shared operations, then use a shared lock for the
391	 * leaf node, otherwise use an exclusive lock.
392	 */
393	if (flags & ISOPEN) {
394		if (mp != NULL &&
395		    (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED))
396			return (0);
397		else
398			return (1);
399	}
400
401	/*
402	 * Lookup requests outside of open() that specify LOCKSHARED
403	 * only need a shared lock on the leaf vnode.
404	 */
405	return (0);
406}
407
408/*
409 * Search a pathname.
410 * This is a very central and rather complicated routine.
411 *
412 * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
413 * The starting directory is taken from ni_startdir. The pathname is
414 * descended until done, or a symbolic link is encountered. The variable
415 * ni_more is clear if the path is completed; it is set to one if a
416 * symbolic link needing interpretation is encountered.
417 *
418 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
419 * whether the name is to be looked up, created, renamed, or deleted.
420 * When CREATE, RENAME, or DELETE is specified, information usable in
421 * creating, renaming, or deleting a directory entry may be calculated.
422 * If flag has LOCKPARENT or'ed into it, the parent directory is returned
423 * locked. If flag has WANTPARENT or'ed into it, the parent directory is
424 * returned unlocked. Otherwise the parent directory is not returned. If
425 * the target of the pathname exists and LOCKLEAF is or'ed into the flag
426 * the target is returned locked, otherwise it is returned unlocked.
427 * When creating or renaming and LOCKPARENT is specified, the target may not
428 * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
429 *
430 * Overall outline of lookup:
431 *
432 * dirloop:
433 *	identify next component of name at ndp->ni_ptr
434 *	handle degenerate case where name is null string
435 *	if .. and crossing mount points and on mounted filesys, find parent
436 *	call VOP_LOOKUP routine for next component name
437 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
438 *	    component vnode returned in ni_vp (if it exists), locked.
439 *	if result vnode is mounted on and crossing mount points,
440 *	    find mounted on vnode
441 *	if more components of name, do next level at dirloop
442 *	return the answer in ni_vp, locked if LOCKLEAF set
443 *	    if LOCKPARENT set, return locked parent in ni_dvp
444 *	    if WANTPARENT set, return unlocked parent in ni_dvp
445 */
446int
447lookup(struct nameidata *ndp)
448{
449	char *cp;		/* pointer into pathname argument */
450	struct vnode *dp = 0;	/* the directory we are searching */
451	struct vnode *tdp;		/* saved dp */
452	struct mount *mp;		/* mount table entry */
453	struct prison *pr;
454	int docache;			/* == 0 do not cache last component */
455	int wantparent;			/* 1 => wantparent or lockparent flag */
456	int rdonly;			/* lookup read-only flag bit */
457	int trailing_slash;
458	int error = 0;
459	int dpunlocked = 0;		/* dp has already been unlocked */
460	struct componentname *cnp = &ndp->ni_cnd;
461	int vfslocked;			/* VFS Giant state for child */
462	int dvfslocked;			/* VFS Giant state for parent */
463	int tvfslocked;
464	int lkflags_save;
465#ifdef AUDIT
466	struct thread *td = curthread;
467#endif
468
469	/*
470	 * Setup: break out flag bits into variables.
471	 */
472	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
473	vfslocked = 0;
474	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
475	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
476	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
477	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
478	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
479	if (cnp->cn_nameiop == DELETE ||
480	    (wantparent && cnp->cn_nameiop != CREATE &&
481	     cnp->cn_nameiop != LOOKUP))
482		docache = 0;
483	rdonly = cnp->cn_flags & RDONLY;
484	cnp->cn_flags &= ~ISSYMLINK;
485	ndp->ni_dvp = NULL;
486	/*
487	 * We use shared locks until we hit the parent of the last cn then
488	 * we adjust based on the requesting flags.
489	 */
490	if (lookup_shared)
491		cnp->cn_lkflags = LK_SHARED;
492	else
493		cnp->cn_lkflags = LK_EXCLUSIVE;
494	dp = ndp->ni_startdir;
495	ndp->ni_startdir = NULLVP;
496	vn_lock(dp,
497	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));
498
499dirloop:
500	/*
501	 * Search a new directory.
502	 *
503	 * The last component of the filename is left accessible via
504	 * cnp->cn_nameptr for callers that need the name. Callers needing
505	 * the name set the SAVENAME flag. When done, they assume
506	 * responsibility for freeing the pathname buffer.
507	 */
508	cnp->cn_consume = 0;
509	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
510		continue;
511	cnp->cn_namelen = cp - cnp->cn_nameptr;
512	if (cnp->cn_namelen > NAME_MAX) {
513		error = ENAMETOOLONG;
514		goto bad;
515	}
516#ifdef NAMEI_DIAGNOSTIC
517	{ char c = *cp;
518	*cp = '\0';
519	printf("{%s}: ", cnp->cn_nameptr);
520	*cp = c; }
521#endif
522	ndp->ni_pathlen -= cnp->cn_namelen;
523	ndp->ni_next = cp;
524
525	/*
526	 * Replace multiple slashes by a single slash and trailing slashes
527	 * by a null.  This must be done before VOP_LOOKUP() because some
528	 * fs's don't know about trailing slashes.  Remember if there were
529	 * trailing slashes to handle symlinks, existing non-directories
530	 * and non-existing files that won't be directories specially later.
531	 */
532	trailing_slash = 0;
533	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
534		cp++;
535		ndp->ni_pathlen--;
536		if (*cp == '\0') {
537			trailing_slash = 1;
538			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
539			cnp->cn_flags |= TRAILINGSLASH;
540		}
541	}
542	ndp->ni_next = cp;
543
544	cnp->cn_flags |= MAKEENTRY;
545	if (*cp == '\0' && docache == 0)
546		cnp->cn_flags &= ~MAKEENTRY;
547	if (cnp->cn_namelen == 2 &&
548	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
549		cnp->cn_flags |= ISDOTDOT;
550	else
551		cnp->cn_flags &= ~ISDOTDOT;
552	if (*ndp->ni_next == 0)
553		cnp->cn_flags |= ISLASTCN;
554	else
555		cnp->cn_flags &= ~ISLASTCN;
556
557
558	/*
559	 * Check for degenerate name (e.g. / or "")
560	 * which is a way of talking about a directory,
561	 * e.g. like "/." or ".".
562	 */
563	if (cnp->cn_nameptr[0] == '\0') {
564		if (dp->v_type != VDIR) {
565			error = ENOTDIR;
566			goto bad;
567		}
568		if (cnp->cn_nameiop != LOOKUP) {
569			error = EISDIR;
570			goto bad;
571		}
572		if (wantparent) {
573			ndp->ni_dvp = dp;
574			VREF(dp);
575		}
576		ndp->ni_vp = dp;
577
578		if (cnp->cn_flags & AUDITVNODE1)
579			AUDIT_ARG(vnode, dp, ARG_VNODE1);
580		else if (cnp->cn_flags & AUDITVNODE2)
581			AUDIT_ARG(vnode, dp, ARG_VNODE2);
582
583		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
584			VOP_UNLOCK(dp, 0);
585		/* XXX This should probably move to the top of function. */
586		if (cnp->cn_flags & SAVESTART)
587			panic("lookup: SAVESTART");
588		goto success;
589	}
590
591	/*
592	 * Handle "..": four special cases.
593	 * 1. Return an error if this is the last component of
594	 *    the name and the operation is DELETE or RENAME.
595	 * 2. If at root directory (e.g. after chroot)
596	 *    or at absolute root directory
597	 *    then ignore it so can't get out.
598	 * 3. If this vnode is the root of a mounted
599	 *    filesystem, then replace it with the
600	 *    vnode which was mounted on so we take the
601	 *    .. in the other filesystem.
602	 * 4. If the vnode is the top directory of
603	 *    the jail or chroot, don't let them out.
604	 */
605	if (cnp->cn_flags & ISDOTDOT) {
606		if ((cnp->cn_flags & ISLASTCN) != 0 &&
607		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
608			error = EINVAL;
609			goto bad;
610		}
611		for (;;) {
612			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
613			     pr = pr->pr_parent)
614				if (dp == pr->pr_root)
615					break;
616			if (dp == ndp->ni_rootdir ||
617			    dp == ndp->ni_topdir ||
618			    dp == rootvnode ||
619			    pr != NULL ||
620			    ((dp->v_vflag & VV_ROOT) != 0 &&
621			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
622				ndp->ni_dvp = dp;
623				ndp->ni_vp = dp;
624				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
625				VREF(dp);
626				goto nextname;
627			}
628			if ((dp->v_vflag & VV_ROOT) == 0)
629				break;
630			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
631				error = ENOENT;
632				goto bad;
633			}
634			tdp = dp;
635			dp = dp->v_mount->mnt_vnodecovered;
636			tvfslocked = dvfslocked;
637			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
638			VREF(dp);
639			vput(tdp);
640			VFS_UNLOCK_GIANT(tvfslocked);
641			vn_lock(dp,
642			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
643			    LK_RETRY));
644		}
645	}
646
647	/*
648	 * We now have a segment name to search for, and a directory to search.
649	 */
650unionlookup:
651#ifdef MAC
652	if ((cnp->cn_flags & NOMACCHECK) == 0) {
653		error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
654		    cnp);
655		if (error)
656			goto bad;
657	}
658#endif
659	ndp->ni_dvp = dp;
660	ndp->ni_vp = NULL;
661	ASSERT_VOP_LOCKED(dp, "lookup");
662	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
663	/*
664	 * If we have a shared lock we may need to upgrade the lock for the
665	 * last operation.
666	 */
667	if (dp != vp_crossmp &&
668	    VOP_ISLOCKED(dp) == LK_SHARED &&
669	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
670		vn_lock(dp, LK_UPGRADE|LK_RETRY);
671	/*
672	 * If we're looking up the last component and we need an exclusive
673	 * lock, adjust our lkflags.
674	 */
675	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
676		cnp->cn_lkflags = LK_EXCLUSIVE;
677#ifdef NAMEI_DIAGNOSTIC
678	vprint("lookup in", dp);
679#endif
680	lkflags_save = cnp->cn_lkflags;
681	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
682	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
683		cnp->cn_lkflags = lkflags_save;
684		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
685#ifdef NAMEI_DIAGNOSTIC
686		printf("not found\n");
687#endif
688		if ((error == ENOENT) &&
689		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
690		    (dp->v_mount->mnt_flag & MNT_UNION)) {
691			tdp = dp;
692			dp = dp->v_mount->mnt_vnodecovered;
693			tvfslocked = dvfslocked;
694			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
695			VREF(dp);
696			vput(tdp);
697			VFS_UNLOCK_GIANT(tvfslocked);
698			vn_lock(dp,
699			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
700			    LK_RETRY));
701			goto unionlookup;
702		}
703
704		if (error != EJUSTRETURN)
705			goto bad;
706		/*
707		 * If creating and at end of pathname, then can consider
708		 * allowing file to be created.
709		 */
710		if (rdonly) {
711			error = EROFS;
712			goto bad;
713		}
714		if (*cp == '\0' && trailing_slash &&
715		     !(cnp->cn_flags & WILLBEDIR)) {
716			error = ENOENT;
717			goto bad;
718		}
719		if ((cnp->cn_flags & LOCKPARENT) == 0)
720			VOP_UNLOCK(dp, 0);
721		/*
722		 * This is a temporary assert to make sure I know what the
723		 * behavior here was.
724		 */
725		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
726		   ("lookup: Unhandled case."));
727		/*
728		 * We return with ni_vp NULL to indicate that the entry
729		 * doesn't currently exist, leaving a pointer to the
730		 * (possibly locked) directory vnode in ndp->ni_dvp.
731		 */
732		if (cnp->cn_flags & SAVESTART) {
733			ndp->ni_startdir = ndp->ni_dvp;
734			VREF(ndp->ni_startdir);
735		}
736		goto success;
737	} else
738		cnp->cn_lkflags = lkflags_save;
739#ifdef NAMEI_DIAGNOSTIC
740	printf("found\n");
741#endif
742	/*
743	 * Take into account any additional components consumed by
744	 * the underlying filesystem.
745	 */
746	if (cnp->cn_consume > 0) {
747		cnp->cn_nameptr += cnp->cn_consume;
748		ndp->ni_next += cnp->cn_consume;
749		ndp->ni_pathlen -= cnp->cn_consume;
750		cnp->cn_consume = 0;
751	}
752
753	dp = ndp->ni_vp;
754	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
755
756	/*
757	 * Check to see if the vnode has been mounted on;
758	 * if so find the root of the mounted filesystem.
759	 */
760	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
761	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
762		if (vfs_busy(mp, 0))
763			continue;
764		vput(dp);
765		VFS_UNLOCK_GIANT(vfslocked);
766		vfslocked = VFS_LOCK_GIANT(mp);
767		if (dp != ndp->ni_dvp)
768			vput(ndp->ni_dvp);
769		else
770			vrele(ndp->ni_dvp);
771		VFS_UNLOCK_GIANT(dvfslocked);
772		dvfslocked = 0;
773		vref(vp_crossmp);
774		ndp->ni_dvp = vp_crossmp;
775		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags),
776		    &tdp);
777		vfs_unbusy(mp);
778		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
779			panic("vp_crossmp exclusively locked or reclaimed");
780		if (error) {
781			dpunlocked = 1;
782			goto bad2;
783		}
784		ndp->ni_vp = dp = tdp;
785	}
786
787	/*
788	 * Check for symbolic link
789	 */
790	if ((dp->v_type == VLNK) &&
791	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
792	     *ndp->ni_next == '/')) {
793		cnp->cn_flags |= ISSYMLINK;
794		if (dp->v_iflag & VI_DOOMED) {
795			/*
796			 * We can't know whether the directory was mounted with
797			 * NOSYMFOLLOW, so we can't follow safely.
798			 */
799			error = ENOENT;
800			goto bad2;
801		}
802		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
803			error = EACCES;
804			goto bad2;
805		}
806		/*
807		 * Symlink code always expects an unlocked dvp.
808		 */
809		if (ndp->ni_dvp != ndp->ni_vp)
810			VOP_UNLOCK(ndp->ni_dvp, 0);
811		goto success;
812	}
813
814nextname:
815	/*
816	 * Not a symbolic link.  If more pathname,
817	 * continue at next component, else return.
818	 */
819	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
820	    ("lookup: invalid path state."));
821	if (*ndp->ni_next == '/') {
822		cnp->cn_nameptr = ndp->ni_next;
823		while (*cnp->cn_nameptr == '/') {
824			cnp->cn_nameptr++;
825			ndp->ni_pathlen--;
826		}
827		if (ndp->ni_dvp != dp)
828			vput(ndp->ni_dvp);
829		else
830			vrele(ndp->ni_dvp);
831		VFS_UNLOCK_GIANT(dvfslocked);
832		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
833		vfslocked = 0;
834		goto dirloop;
835	}
836	/*
837	 * If we're processing a path with a trailing slash,
838	 * check that the end result is a directory.
839	 */
840	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
841		error = ENOTDIR;
842		goto bad2;
843	}
844	/*
845	 * Disallow directory write attempts on read-only filesystems.
846	 */
847	if (rdonly &&
848	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
849		error = EROFS;
850		goto bad2;
851	}
852	if (cnp->cn_flags & SAVESTART) {
853		ndp->ni_startdir = ndp->ni_dvp;
854		VREF(ndp->ni_startdir);
855	}
856	if (!wantparent) {
857		if (ndp->ni_dvp != dp)
858			vput(ndp->ni_dvp);
859		else
860			vrele(ndp->ni_dvp);
861		VFS_UNLOCK_GIANT(dvfslocked);
862		dvfslocked = 0;
863	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
864		VOP_UNLOCK(ndp->ni_dvp, 0);
865
866	if (cnp->cn_flags & AUDITVNODE1)
867		AUDIT_ARG(vnode, dp, ARG_VNODE1);
868	else if (cnp->cn_flags & AUDITVNODE2)
869		AUDIT_ARG(vnode, dp, ARG_VNODE2);
870
871	if ((cnp->cn_flags & LOCKLEAF) == 0)
872		VOP_UNLOCK(dp, 0);
873success:
874	/*
875	 * Because of lookup_shared we may have the vnode shared locked, but
876	 * the caller may want it to be exclusively locked.
877	 */
878	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
879	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
880		vn_lock(dp, LK_UPGRADE | LK_RETRY);
881		if (dp->v_iflag & VI_DOOMED) {
882			error = ENOENT;
883			goto bad2;
884		}
885	}
886	if (vfslocked && dvfslocked)
887		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
888	if (vfslocked || dvfslocked)
889		ndp->ni_cnd.cn_flags |= GIANTHELD;
890	return (0);
891
892bad2:
893	if (dp != ndp->ni_dvp)
894		vput(ndp->ni_dvp);
895	else
896		vrele(ndp->ni_dvp);
897bad:
898	if (!dpunlocked)
899		vput(dp);
900	VFS_UNLOCK_GIANT(vfslocked);
901	VFS_UNLOCK_GIANT(dvfslocked);
902	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
903	ndp->ni_vp = NULL;
904	return (error);
905}
906
907/*
908 * relookup - lookup a path name component
909 *    Used by lookup to re-acquire things.
910 */
911int
912relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
913{
914	struct vnode *dp = 0;		/* the directory we are searching */
915	int wantparent;			/* 1 => wantparent or lockparent flag */
916	int rdonly;			/* lookup read-only flag bit */
917	int error = 0;
918
919	KASSERT(cnp->cn_flags & ISLASTCN,
920	    ("relookup: Not given last component."));
921	/*
922	 * Setup: break out flag bits into variables.
923	 */
924	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
925	KASSERT(wantparent, ("relookup: parent not wanted."));
926	rdonly = cnp->cn_flags & RDONLY;
927	cnp->cn_flags &= ~ISSYMLINK;
928	dp = dvp;
929	cnp->cn_lkflags = LK_EXCLUSIVE;
930	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
931
932	/*
933	 * Search a new directory.
934	 *
935	 * The last component of the filename is left accessible via
936	 * cnp->cn_nameptr for callers that need the name. Callers needing
937	 * the name set the SAVENAME flag. When done, they assume
938	 * responsibility for freeing the pathname buffer.
939	 */
940#ifdef NAMEI_DIAGNOSTIC
941	printf("{%s}: ", cnp->cn_nameptr);
942#endif
943
944	/*
945	 * Check for degenerate name (e.g. / or "")
946	 * which is a way of talking about a directory,
947	 * e.g. like "/." or ".".
948	 */
949	if (cnp->cn_nameptr[0] == '\0') {
950		if (cnp->cn_nameiop != LOOKUP || wantparent) {
951			error = EISDIR;
952			goto bad;
953		}
954		if (dp->v_type != VDIR) {
955			error = ENOTDIR;
956			goto bad;
957		}
958		if (!(cnp->cn_flags & LOCKLEAF))
959			VOP_UNLOCK(dp, 0);
960		*vpp = dp;
961		/* XXX This should probably move to the top of function. */
962		if (cnp->cn_flags & SAVESTART)
963			panic("lookup: SAVESTART");
964		return (0);
965	}
966
967	if (cnp->cn_flags & ISDOTDOT)
968		panic ("relookup: lookup on dot-dot");
969
970	/*
971	 * We now have a segment name to search for, and a directory to search.
972	 */
973#ifdef NAMEI_DIAGNOSTIC
974	vprint("search in:", dp);
975#endif
976	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
977		KASSERT(*vpp == NULL, ("leaf should be empty"));
978		if (error != EJUSTRETURN)
979			goto bad;
980		/*
981		 * If creating and at end of pathname, then can consider
982		 * allowing file to be created.
983		 */
984		if (rdonly) {
985			error = EROFS;
986			goto bad;
987		}
988		/* ASSERT(dvp == ndp->ni_startdir) */
989		if (cnp->cn_flags & SAVESTART)
990			VREF(dvp);
991		if ((cnp->cn_flags & LOCKPARENT) == 0)
992			VOP_UNLOCK(dp, 0);
993		/*
994		 * This is a temporary assert to make sure I know what the
995		 * behavior here was.
996		 */
997		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
998		   ("relookup: Unhandled case."));
999		/*
1000		 * We return with ni_vp NULL to indicate that the entry
1001		 * doesn't currently exist, leaving a pointer to the
1002		 * (possibly locked) directory vnode in ndp->ni_dvp.
1003		 */
1004		return (0);
1005	}
1006
1007	dp = *vpp;
1008
1009	/*
1010	 * Disallow directory write attempts on read-only filesystems.
1011	 */
1012	if (rdonly &&
1013	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1014		if (dvp == dp)
1015			vrele(dvp);
1016		else
1017			vput(dvp);
1018		error = EROFS;
1019		goto bad;
1020	}
1021	/*
1022	 * Set the parent lock/ref state to the requested state.
1023	 */
1024	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
1025		if (wantparent)
1026			VOP_UNLOCK(dvp, 0);
1027		else
1028			vput(dvp);
1029	} else if (!wantparent)
1030		vrele(dvp);
1031	/*
1032	 * Check for symbolic link
1033	 */
1034	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
1035	    ("relookup: symlink found.\n"));
1036
1037	/* ASSERT(dvp == ndp->ni_startdir) */
1038	if (cnp->cn_flags & SAVESTART)
1039		VREF(dvp);
1040
1041	if ((cnp->cn_flags & LOCKLEAF) == 0)
1042		VOP_UNLOCK(dp, 0);
1043	return (0);
1044bad:
1045	vput(dp);
1046	*vpp = NULL;
1047	return (error);
1048}
1049
1050/*
1051 * Free data allocated by namei(); see namei(9) for details.
1052 */
1053void
1054NDFREE(struct nameidata *ndp, const u_int flags)
1055{
1056	int unlock_dvp;
1057	int unlock_vp;
1058
1059	unlock_dvp = 0;
1060	unlock_vp = 0;
1061
1062	if (!(flags & NDF_NO_FREE_PNBUF) &&
1063	    (ndp->ni_cnd.cn_flags & HASBUF)) {
1064		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
1065		ndp->ni_cnd.cn_flags &= ~HASBUF;
1066	}
1067	if (!(flags & NDF_NO_VP_UNLOCK) &&
1068	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
1069		unlock_vp = 1;
1070	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
1071		if (unlock_vp) {
1072			vput(ndp->ni_vp);
1073			unlock_vp = 0;
1074		} else
1075			vrele(ndp->ni_vp);
1076		ndp->ni_vp = NULL;
1077	}
1078	if (unlock_vp)
1079		VOP_UNLOCK(ndp->ni_vp, 0);
1080	if (!(flags & NDF_NO_DVP_UNLOCK) &&
1081	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
1082	    ndp->ni_dvp != ndp->ni_vp)
1083		unlock_dvp = 1;
1084	if (!(flags & NDF_NO_DVP_RELE) &&
1085	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
1086		if (unlock_dvp) {
1087			vput(ndp->ni_dvp);
1088			unlock_dvp = 0;
1089		} else
1090			vrele(ndp->ni_dvp);
1091		ndp->ni_dvp = NULL;
1092	}
1093	if (unlock_dvp)
1094		VOP_UNLOCK(ndp->ni_dvp, 0);
1095	if (!(flags & NDF_NO_STARTDIR_RELE) &&
1096	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
1097		vrele(ndp->ni_startdir);
1098		ndp->ni_startdir = NULL;
1099	}
1100}
1101
1102/*
1103 * Determine if there is a suitable alternate filename under the specified
1104 * prefix for the specified path.  If the create flag is set, then the
1105 * alternate prefix will be used so long as the parent directory exists.
1106 * This is used by the various compatiblity ABIs so that Linux binaries prefer
1107 * files under /compat/linux for example.  The chosen path (whether under
1108 * the prefix or under /) is returned in a kernel malloc'd buffer pointed
1109 * to by pathbuf.  The caller is responsible for free'ing the buffer from
1110 * the M_TEMP bucket if one is returned.
1111 */
1112int
1113kern_alternate_path(struct thread *td, const char *prefix, const char *path,
1114    enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
1115{
1116	struct nameidata nd, ndroot;
1117	char *ptr, *buf, *cp;
1118	size_t len, sz;
1119	int error;
1120
1121	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1122	*pathbuf = buf;
1123
1124	/* Copy the prefix into the new pathname as a starting point. */
1125	len = strlcpy(buf, prefix, MAXPATHLEN);
1126	if (len >= MAXPATHLEN) {
1127		*pathbuf = NULL;
1128		free(buf, M_TEMP);
1129		return (EINVAL);
1130	}
1131	sz = MAXPATHLEN - len;
1132	ptr = buf + len;
1133
1134	/* Append the filename to the prefix. */
1135	if (pathseg == UIO_SYSSPACE)
1136		error = copystr(path, ptr, sz, &len);
1137	else
1138		error = copyinstr(path, ptr, sz, &len);
1139
1140	if (error) {
1141		*pathbuf = NULL;
1142		free(buf, M_TEMP);
1143		return (error);
1144	}
1145
1146	/* Only use a prefix with absolute pathnames. */
1147	if (*ptr != '/') {
1148		error = EINVAL;
1149		goto keeporig;
1150	}
1151
1152	if (dirfd != AT_FDCWD) {
1153		/*
1154		 * We want the original because the "prefix" is
1155		 * included in the already opened dirfd.
1156		 */
1157		bcopy(ptr, buf, len);
1158		return (0);
1159	}
1160
1161	/*
1162	 * We know that there is a / somewhere in this pathname.
1163	 * Search backwards for it, to find the file's parent dir
1164	 * to see if it exists in the alternate tree. If it does,
1165	 * and we want to create a file (cflag is set). We don't
1166	 * need to worry about the root comparison in this case.
1167	 */
1168
1169	if (create) {
1170		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1171		*cp = '\0';
1172
1173		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1174		error = namei(&nd);
1175		*cp = '/';
1176		if (error != 0)
1177			goto keeporig;
1178	} else {
1179		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1180
1181		error = namei(&nd);
1182		if (error != 0)
1183			goto keeporig;
1184
1185		/*
1186		 * We now compare the vnode of the prefix to the one
1187		 * vnode asked. If they resolve to be the same, then we
1188		 * ignore the match so that the real root gets used.
1189		 * This avoids the problem of traversing "../.." to find the
1190		 * root directory and never finding it, because "/" resolves
1191		 * to the emulation root directory. This is expensive :-(
1192		 */
1193		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
1194		    td);
1195
1196		/* We shouldn't ever get an error from this namei(). */
1197		error = namei(&ndroot);
1198		if (error == 0) {
1199			if (nd.ni_vp == ndroot.ni_vp)
1200				error = ENOENT;
1201
1202			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1203			vrele(ndroot.ni_vp);
1204			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
1205		}
1206	}
1207
1208	NDFREE(&nd, NDF_ONLY_PNBUF);
1209	vrele(nd.ni_vp);
1210	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
1211
1212keeporig:
1213	/* If there was an error, use the original path name. */
1214	if (error)
1215		bcopy(ptr, buf, len);
1216	return (error);
1217}
1218