vfs_lookup.c revision 158094
123353Sdfr/*-
223353Sdfr * Copyright (c) 1982, 1986, 1989, 1993
323353Sdfr *	The Regents of the University of California.  All rights reserved.
423353Sdfr * (c) UNIX System Laboratories, Inc.
523353Sdfr * All or some portions of this file are derived from material licensed
623353Sdfr * to the University of California by American Telephone and Telegraph
723353Sdfr * Co. or Unix System Laboratories, Inc. and are reproduced herein with
823353Sdfr * the permission of UNIX System Laboratories, Inc.
923353Sdfr *
1023353Sdfr * Redistribution and use in source and binary forms, with or without
1123353Sdfr * modification, are permitted provided that the following conditions
1223353Sdfr * are met:
1323353Sdfr * 1. Redistributions of source code must retain the above copyright
1423353Sdfr *    notice, this list of conditions and the following disclaimer.
1523353Sdfr * 2. Redistributions in binary form must reproduce the above copyright
1623353Sdfr *    notice, this list of conditions and the following disclaimer in the
1723353Sdfr *    documentation and/or other materials provided with the distribution.
1823353Sdfr * 4. Neither the name of the University nor the names of its contributors
1923353Sdfr *    may be used to endorse or promote products derived from this software
2023353Sdfr *    without specific prior written permission.
2123353Sdfr *
2223353Sdfr * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2323353Sdfr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2423353Sdfr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2523353Sdfr * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2623353Sdfr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2723353Sdfr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2823353Sdfr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2950476Speter * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3023353Sdfr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31203722Strasz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32206622Suqs * SUCH DAMAGE.
3323353Sdfr *
3423353Sdfr *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
3523353Sdfr */
36107788Sru
3723353Sdfr#include <sys/cdefs.h>
38107788Sru__FBSDID("$FreeBSD: head/sys/kern/vfs_lookup.c 158094 2006-04-28 00:59:48Z jeff $");
3951140Salfred
40141846Sru#include "opt_ktrace.h"
4151140Salfred#include "opt_mac.h"
4251140Salfred#include "opt_vfs.h"
4379727Sschweikh
4451140Salfred#include <sys/param.h>
4551140Salfred#include <sys/systm.h>
4623353Sdfr#include <sys/kernel.h>
4751140Salfred#include <sys/lock.h>
4851140Salfred#include <sys/mac.h>
4951140Salfred#include <sys/mutex.h>
5051140Salfred#include <sys/namei.h>
5151140Salfred#include <sys/vnode.h>
52240378Skevlo#include <sys/mount.h>
5351140Salfred#include <sys/filedesc.h>
5451140Salfred#include <sys/proc.h>
5551140Salfred#include <sys/syscallsubr.h>
5651140Salfred#include <sys/sysctl.h>
57275993Sbrueffer#ifdef KTRACE
58275993Sbrueffer#include <sys/ktrace.h>
5923353Sdfr#endif
60121414Shmp
6134504Scharnier#include <security/audit/audit.h>
62
63#include <vm/uma.h>
64
65#define	NAMEI_DIAGNOSTIC 1
66#undef NAMEI_DIAGNOSTIC
67
68/*
69 * Allocation zone for namei
70 */
71uma_zone_t namei_zone;
72
73static void
74nameiinit(void *dummy __unused)
75{
76	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
77	    UMA_ALIGN_PTR, 0);
78
79}
80SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
81
82#ifdef LOOKUP_SHARED
83static int lookup_shared = 1;
84#else
85static int lookup_shared = 0;
86#endif
87SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
88    "Enables/Disables shared locks for path name translation");
89
90/*
91 * Convert a pathname into a pointer to a locked inode.
92 *
93 * The FOLLOW flag is set when symbolic links are to be followed
94 * when they occur at the end of the name translation process.
95 * Symbolic links are always followed for all other pathname
96 * components other than the last.
97 *
98 * The segflg defines whether the name is to be copied from user
99 * space or kernel space.
100 *
101 * Overall outline of namei:
102 *
103 *	copy in name
104 *	get starting directory
105 *	while (!done && !error) {
106 *		call lookup to search path.
107 *		if symbolic link, massage name in buffer and continue
108 *	}
109 */
110int
111namei(ndp)
112	register struct nameidata *ndp;
113{
114	register struct filedesc *fdp;	/* pointer to file descriptor state */
115	register char *cp;		/* pointer into pathname argument */
116	register struct vnode *dp;	/* the directory we are searching */
117	struct iovec aiov;		/* uio for reading symbolic links */
118	struct uio auio;
119	int error, linklen;
120	struct componentname *cnp = &ndp->ni_cnd;
121	struct thread *td = cnp->cn_thread;
122	struct proc *p = td->td_proc;
123	int vfslocked;
124
125	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
126	    ("NOT MPSAFE and Giant not held"));
127	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
128	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
129	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
130	    ("namei: nameiop contaminated with flags"));
131	KASSERT((cnp->cn_flags & OPMASK) == 0,
132	    ("namei: flags contaminated with nameiops"));
133	if (!lookup_shared)
134		cnp->cn_flags &= ~LOCKSHARED;
135	fdp = p->p_fd;
136
137	/*
138	 * Get a buffer for the name to be translated, and copy the
139	 * name into the buffer.
140	 */
141	if ((cnp->cn_flags & HASBUF) == 0)
142		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
143	if (ndp->ni_segflg == UIO_SYSSPACE)
144		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
145			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
146	else
147		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
148			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
149
150	/* If we are auditing the kernel pathname, save the user pathname. */
151	if (cnp->cn_flags & AUDITVNODE1)
152		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
153	if (cnp->cn_flags & AUDITVNODE2)
154		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
155
156	/*
157	 * Don't allow empty pathnames.
158	 */
159	if (!error && *cnp->cn_pnbuf == '\0')
160		error = ENOENT;
161
162	if (error) {
163		uma_zfree(namei_zone, cnp->cn_pnbuf);
164#ifdef DIAGNOSTIC
165		cnp->cn_pnbuf = NULL;
166		cnp->cn_nameptr = NULL;
167#endif
168		ndp->ni_vp = NULL;
169		return (error);
170	}
171	ndp->ni_loopcnt = 0;
172#ifdef KTRACE
173	if (KTRPOINT(td, KTR_NAMEI)) {
174		KASSERT(cnp->cn_thread == curthread,
175		    ("namei not using curthread"));
176		ktrnamei(cnp->cn_pnbuf);
177	}
178#endif
179
180	/*
181	 * Get starting point for the translation.
182	 */
183	FILEDESC_LOCK(fdp);
184	ndp->ni_rootdir = fdp->fd_rdir;
185	ndp->ni_topdir = fdp->fd_jdir;
186
187	dp = fdp->fd_cdir;
188	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
189	VREF(dp);
190	FILEDESC_UNLOCK(fdp);
191	for (;;) {
192		/*
193		 * Check if root directory should replace current directory.
194		 * Done at start of translation and after symbolic link.
195		 */
196		cnp->cn_nameptr = cnp->cn_pnbuf;
197		if (*(cnp->cn_nameptr) == '/') {
198			vrele(dp);
199			VFS_UNLOCK_GIANT(vfslocked);
200			while (*(cnp->cn_nameptr) == '/') {
201				cnp->cn_nameptr++;
202				ndp->ni_pathlen--;
203			}
204			dp = ndp->ni_rootdir;
205			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
206			VREF(dp);
207		}
208		if (vfslocked)
209			ndp->ni_cnd.cn_flags |= GIANTHELD;
210		ndp->ni_startdir = dp;
211		error = lookup(ndp);
212		if (error) {
213			uma_zfree(namei_zone, cnp->cn_pnbuf);
214#ifdef DIAGNOSTIC
215			cnp->cn_pnbuf = NULL;
216			cnp->cn_nameptr = NULL;
217#endif
218			return (error);
219		}
220		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
221		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
222		/*
223		 * Check for symbolic link
224		 */
225		if ((cnp->cn_flags & ISSYMLINK) == 0) {
226			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
227				uma_zfree(namei_zone, cnp->cn_pnbuf);
228#ifdef DIAGNOSTIC
229				cnp->cn_pnbuf = NULL;
230				cnp->cn_nameptr = NULL;
231#endif
232			} else
233				cnp->cn_flags |= HASBUF;
234
235			if ((cnp->cn_flags & MPSAFE) == 0) {
236				VFS_UNLOCK_GIANT(vfslocked);
237			} else if (vfslocked)
238				ndp->ni_cnd.cn_flags |= GIANTHELD;
239			return (0);
240		}
241		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
242			error = ELOOP;
243			break;
244		}
245#ifdef MAC
246		if ((cnp->cn_flags & NOMACCHECK) == 0) {
247			error = mac_check_vnode_readlink(td->td_ucred,
248			    ndp->ni_vp);
249			if (error)
250				break;
251		}
252#endif
253		if (ndp->ni_pathlen > 1)
254			cp = uma_zalloc(namei_zone, M_WAITOK);
255		else
256			cp = cnp->cn_pnbuf;
257		aiov.iov_base = cp;
258		aiov.iov_len = MAXPATHLEN;
259		auio.uio_iov = &aiov;
260		auio.uio_iovcnt = 1;
261		auio.uio_offset = 0;
262		auio.uio_rw = UIO_READ;
263		auio.uio_segflg = UIO_SYSSPACE;
264		auio.uio_td = (struct thread *)0;
265		auio.uio_resid = MAXPATHLEN;
266		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
267		if (error) {
268			if (ndp->ni_pathlen > 1)
269				uma_zfree(namei_zone, cp);
270			break;
271		}
272		linklen = MAXPATHLEN - auio.uio_resid;
273		if (linklen == 0) {
274			if (ndp->ni_pathlen > 1)
275				uma_zfree(namei_zone, cp);
276			error = ENOENT;
277			break;
278		}
279		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
280			if (ndp->ni_pathlen > 1)
281				uma_zfree(namei_zone, cp);
282			error = ENAMETOOLONG;
283			break;
284		}
285		if (ndp->ni_pathlen > 1) {
286			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
287			uma_zfree(namei_zone, cnp->cn_pnbuf);
288			cnp->cn_pnbuf = cp;
289		} else
290			cnp->cn_pnbuf[linklen] = '\0';
291		ndp->ni_pathlen += linklen;
292		vput(ndp->ni_vp);
293		dp = ndp->ni_dvp;
294	}
295	uma_zfree(namei_zone, cnp->cn_pnbuf);
296#ifdef DIAGNOSTIC
297	cnp->cn_pnbuf = NULL;
298	cnp->cn_nameptr = NULL;
299#endif
300	vput(ndp->ni_vp);
301	ndp->ni_vp = NULL;
302	vrele(ndp->ni_dvp);
303	VFS_UNLOCK_GIANT(vfslocked);
304	return (error);
305}
306
307/*
308 * Search a pathname.
309 * This is a very central and rather complicated routine.
310 *
311 * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
312 * The starting directory is taken from ni_startdir. The pathname is
313 * descended until done, or a symbolic link is encountered. The variable
314 * ni_more is clear if the path is completed; it is set to one if a
315 * symbolic link needing interpretation is encountered.
316 *
317 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
318 * whether the name is to be looked up, created, renamed, or deleted.
319 * When CREATE, RENAME, or DELETE is specified, information usable in
320 * creating, renaming, or deleting a directory entry may be calculated.
321 * If flag has LOCKPARENT or'ed into it, the parent directory is returned
322 * locked. If flag has WANTPARENT or'ed into it, the parent directory is
323 * returned unlocked. Otherwise the parent directory is not returned. If
324 * the target of the pathname exists and LOCKLEAF is or'ed into the flag
325 * the target is returned locked, otherwise it is returned unlocked.
326 * When creating or renaming and LOCKPARENT is specified, the target may not
327 * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
328 *
329 * Overall outline of lookup:
330 *
331 * dirloop:
332 *	identify next component of name at ndp->ni_ptr
333 *	handle degenerate case where name is null string
334 *	if .. and crossing mount points and on mounted filesys, find parent
335 *	call VOP_LOOKUP routine for next component name
336 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
337 *	    component vnode returned in ni_vp (if it exists), locked.
338 *	if result vnode is mounted on and crossing mount points,
339 *	    find mounted on vnode
340 *	if more components of name, do next level at dirloop
341 *	return the answer in ni_vp, locked if LOCKLEAF set
342 *	    if LOCKPARENT set, return locked parent in ni_dvp
343 *	    if WANTPARENT set, return unlocked parent in ni_dvp
344 */
345int
346lookup(ndp)
347	register struct nameidata *ndp;
348{
349	register char *cp;		/* pointer into pathname argument */
350	register struct vnode *dp = 0;	/* the directory we are searching */
351	struct vnode *tdp;		/* saved dp */
352	struct mount *mp;		/* mount table entry */
353	int docache;			/* == 0 do not cache last component */
354	int wantparent;			/* 1 => wantparent or lockparent flag */
355	int rdonly;			/* lookup read-only flag bit */
356	int trailing_slash;
357	int error = 0;
358	int dpunlocked = 0;		/* dp has already been unlocked */
359	struct componentname *cnp = &ndp->ni_cnd;
360	struct thread *td = cnp->cn_thread;
361	int vfslocked;			/* VFS Giant state for child */
362	int dvfslocked;			/* VFS Giant state for parent */
363	int tvfslocked;
364
365	/*
366	 * Setup: break out flag bits into variables.
367	 */
368	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
369	vfslocked = 0;
370	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
371	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
372	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
373	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
374	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
375	if (cnp->cn_nameiop == DELETE ||
376	    (wantparent && cnp->cn_nameiop != CREATE &&
377	     cnp->cn_nameiop != LOOKUP))
378		docache = 0;
379	rdonly = cnp->cn_flags & RDONLY;
380	cnp->cn_flags &= ~ISSYMLINK;
381	ndp->ni_dvp = NULL;
382	/*
383	 * We use shared locks until we hit the parent of the last cn then
384	 * we adjust based on the requesting flags.
385	 */
386	if (lookup_shared)
387		cnp->cn_lkflags = LK_SHARED;
388	else
389		cnp->cn_lkflags = LK_EXCLUSIVE;
390	dp = ndp->ni_startdir;
391	ndp->ni_startdir = NULLVP;
392	vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
393
394dirloop:
395	/*
396	 * Search a new directory.
397	 *
398	 * The last component of the filename is left accessible via
399	 * cnp->cn_nameptr for callers that need the name. Callers needing
400	 * the name set the SAVENAME flag. When done, they assume
401	 * responsibility for freeing the pathname buffer.
402	 */
403	cnp->cn_consume = 0;
404	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
405		continue;
406	cnp->cn_namelen = cp - cnp->cn_nameptr;
407	if (cnp->cn_namelen > NAME_MAX) {
408		error = ENAMETOOLONG;
409		goto bad;
410	}
411#ifdef NAMEI_DIAGNOSTIC
412	{ char c = *cp;
413	*cp = '\0';
414	printf("{%s}: ", cnp->cn_nameptr);
415	*cp = c; }
416#endif
417	ndp->ni_pathlen -= cnp->cn_namelen;
418	ndp->ni_next = cp;
419
420	/*
421	 * Replace multiple slashes by a single slash and trailing slashes
422	 * by a null.  This must be done before VOP_LOOKUP() because some
423	 * fs's don't know about trailing slashes.  Remember if there were
424	 * trailing slashes to handle symlinks, existing non-directories
425	 * and non-existing files that won't be directories specially later.
426	 */
427	trailing_slash = 0;
428	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
429		cp++;
430		ndp->ni_pathlen--;
431		if (*cp == '\0') {
432			trailing_slash = 1;
433			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
434		}
435	}
436	ndp->ni_next = cp;
437
438	cnp->cn_flags |= MAKEENTRY;
439	if (*cp == '\0' && docache == 0)
440		cnp->cn_flags &= ~MAKEENTRY;
441	if (cnp->cn_namelen == 2 &&
442	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
443		cnp->cn_flags |= ISDOTDOT;
444	else
445		cnp->cn_flags &= ~ISDOTDOT;
446	if (*ndp->ni_next == 0)
447		cnp->cn_flags |= ISLASTCN;
448	else
449		cnp->cn_flags &= ~ISLASTCN;
450
451
452	/*
453	 * Check for degenerate name (e.g. / or "")
454	 * which is a way of talking about a directory,
455	 * e.g. like "/." or ".".
456	 */
457	if (cnp->cn_nameptr[0] == '\0') {
458		if (dp->v_type != VDIR) {
459			error = ENOTDIR;
460			goto bad;
461		}
462		if (cnp->cn_nameiop != LOOKUP) {
463			error = EISDIR;
464			goto bad;
465		}
466		if (wantparent) {
467			ndp->ni_dvp = dp;
468			VREF(dp);
469		}
470		ndp->ni_vp = dp;
471
472		if (cnp->cn_flags & AUDITVNODE1)
473			AUDIT_ARG(vnode, dp, ARG_VNODE1);
474		else if (cnp->cn_flags & AUDITVNODE2)
475			AUDIT_ARG(vnode, dp, ARG_VNODE2);
476
477		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
478			VOP_UNLOCK(dp, 0, td);
479		/* XXX This should probably move to the top of function. */
480		if (cnp->cn_flags & SAVESTART)
481			panic("lookup: SAVESTART");
482		goto success;
483	}
484
485	/*
486	 * Handle "..": four special cases.
487	 * 1. Return an error if this is the last component of
488	 *    the name and the operation is DELETE or RENAME.
489	 * 2. If at root directory (e.g. after chroot)
490	 *    or at absolute root directory
491	 *    then ignore it so can't get out.
492	 * 3. If this vnode is the root of a mounted
493	 *    filesystem, then replace it with the
494	 *    vnode which was mounted on so we take the
495	 *    .. in the other filesystem.
496	 * 4. If the vnode is the top directory of
497	 *    the jail or chroot, don't let them out.
498	 */
499	if (cnp->cn_flags & ISDOTDOT) {
500		if ((cnp->cn_flags & ISLASTCN) != 0 &&
501		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
502			error = EINVAL;
503			goto bad;
504		}
505		for (;;) {
506			if (dp == ndp->ni_rootdir ||
507			    dp == ndp->ni_topdir ||
508			    dp == rootvnode) {
509				ndp->ni_dvp = dp;
510				ndp->ni_vp = dp;
511				VREF(dp);
512				goto nextname;
513			}
514			if ((dp->v_vflag & VV_ROOT) == 0 ||
515			    (cnp->cn_flags & NOCROSSMOUNT))
516				break;
517			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
518				error = EBADF;
519				goto bad;
520			}
521			tdp = dp;
522			dp = dp->v_mount->mnt_vnodecovered;
523			tvfslocked = dvfslocked;
524			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
525			VREF(dp);
526			vput(tdp);
527			VFS_UNLOCK_GIANT(tvfslocked);
528			vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
529		}
530	}
531
532	/*
533	 * We now have a segment name to search for, and a directory to search.
534	 */
535unionlookup:
536#ifdef MAC
537	if ((cnp->cn_flags & NOMACCHECK) == 0) {
538		error = mac_check_vnode_lookup(td->td_ucred, dp, cnp);
539		if (error)
540			goto bad;
541	}
542#endif
543	ndp->ni_dvp = dp;
544	ndp->ni_vp = NULL;
545	ASSERT_VOP_LOCKED(dp, "lookup");
546	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
547	/*
548	 * If we have a shared lock we may need to upgrade the lock for the
549	 * last operation.
550	 */
551	if (VOP_ISLOCKED(dp, td) == LK_SHARED &&
552	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
553		vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
554	/*
555	 * If we're looking up the last component and we need an exclusive
556	 * lock, adjust our lkflags.
557	 */
558	if ((cnp->cn_flags & (ISLASTCN|LOCKSHARED|LOCKLEAF)) ==
559	    (ISLASTCN|LOCKLEAF))
560		cnp->cn_lkflags = LK_EXCLUSIVE;
561#ifdef NAMEI_DIAGNOSTIC
562	vprint("lookup in", dp);
563#endif
564	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
565		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
566#ifdef NAMEI_DIAGNOSTIC
567		printf("not found\n");
568#endif
569		if ((error == ENOENT) &&
570		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
571		    (dp->v_mount->mnt_flag & MNT_UNION)) {
572			tdp = dp;
573			dp = dp->v_mount->mnt_vnodecovered;
574			tvfslocked = dvfslocked;
575			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
576			VREF(dp);
577			vput(tdp);
578			VFS_UNLOCK_GIANT(tvfslocked);
579			vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
580			goto unionlookup;
581		}
582
583		if (error != EJUSTRETURN)
584			goto bad;
585		/*
586		 * If creating and at end of pathname, then can consider
587		 * allowing file to be created.
588		 */
589		if (rdonly) {
590			error = EROFS;
591			goto bad;
592		}
593		if (*cp == '\0' && trailing_slash &&
594		     !(cnp->cn_flags & WILLBEDIR)) {
595			error = ENOENT;
596			goto bad;
597		}
598		if ((cnp->cn_flags & LOCKPARENT) == 0)
599			VOP_UNLOCK(dp, 0, td);
600		/*
601		 * This is a temporary assert to make sure I know what the
602		 * behavior here was.
603		 */
604		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
605		   ("lookup: Unhandled case."));
606		/*
607		 * We return with ni_vp NULL to indicate that the entry
608		 * doesn't currently exist, leaving a pointer to the
609		 * (possibly locked) directory inode in ndp->ni_dvp.
610		 */
611		if (cnp->cn_flags & SAVESTART) {
612			ndp->ni_startdir = ndp->ni_dvp;
613			VREF(ndp->ni_startdir);
614		}
615		goto success;
616	}
617#ifdef NAMEI_DIAGNOSTIC
618	printf("found\n");
619#endif
620	/*
621	 * Take into account any additional components consumed by
622	 * the underlying filesystem.
623	 */
624	if (cnp->cn_consume > 0) {
625		cnp->cn_nameptr += cnp->cn_consume;
626		ndp->ni_next += cnp->cn_consume;
627		ndp->ni_pathlen -= cnp->cn_consume;
628		cnp->cn_consume = 0;
629	}
630
631	dp = ndp->ni_vp;
632	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
633
634	/*
635	 * Check to see if the vnode has been mounted on;
636	 * if so find the root of the mounted filesystem.
637	 */
638	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
639	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
640		if (vfs_busy(mp, 0, 0, td))
641			continue;
642		vput(dp);
643		VFS_UNLOCK_GIANT(vfslocked);
644		vfslocked = VFS_LOCK_GIANT(mp);
645		if (dp != ndp->ni_dvp)
646			VOP_UNLOCK(ndp->ni_dvp, 0, td);
647		error = VFS_ROOT(mp, cnp->cn_lkflags, &tdp, td);
648		vfs_unbusy(mp, td);
649		vn_lock(ndp->ni_dvp, cnp->cn_lkflags | LK_RETRY, td);
650		if (error) {
651			dpunlocked = 1;
652			goto bad2;
653		}
654		ndp->ni_vp = dp = tdp;
655	}
656
657	/*
658	 * Check for symbolic link
659	 */
660	if ((dp->v_type == VLNK) &&
661	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
662	     *ndp->ni_next == '/')) {
663		cnp->cn_flags |= ISSYMLINK;
664		if (dp->v_iflag & VI_DOOMED) {
665			/* We can't know whether the directory was mounted with
666			 * NOSYMFOLLOW, so we can't follow safely. */
667			error = EBADF;
668			goto bad2;
669		}
670		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
671			error = EACCES;
672			goto bad2;
673		}
674		/*
675		 * Symlink code always expects an unlocked dvp.
676		 */
677		if (ndp->ni_dvp != ndp->ni_vp)
678			VOP_UNLOCK(ndp->ni_dvp, 0, td);
679		goto success;
680	}
681
682	/*
683	 * Check for bogus trailing slashes.
684	 */
685	if (trailing_slash && dp->v_type != VDIR) {
686		error = ENOTDIR;
687		goto bad2;
688	}
689
690nextname:
691	/*
692	 * Not a symbolic link.  If more pathname,
693	 * continue at next component, else return.
694	 */
695	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
696	    ("lookup: invalid path state."));
697	if (*ndp->ni_next == '/') {
698		cnp->cn_nameptr = ndp->ni_next;
699		while (*cnp->cn_nameptr == '/') {
700			cnp->cn_nameptr++;
701			ndp->ni_pathlen--;
702		}
703		if (ndp->ni_dvp != dp)
704			vput(ndp->ni_dvp);
705		else
706			vrele(ndp->ni_dvp);
707		VFS_UNLOCK_GIANT(dvfslocked);
708		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
709		vfslocked = 0;
710		goto dirloop;
711	}
712	/*
713	 * Disallow directory write attempts on read-only filesystems.
714	 */
715	if (rdonly &&
716	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
717		error = EROFS;
718		goto bad2;
719	}
720	if (cnp->cn_flags & SAVESTART) {
721		ndp->ni_startdir = ndp->ni_dvp;
722		VREF(ndp->ni_startdir);
723	}
724	if (!wantparent) {
725		if (ndp->ni_dvp != dp)
726			vput(ndp->ni_dvp);
727		else
728			vrele(ndp->ni_dvp);
729		VFS_UNLOCK_GIANT(dvfslocked);
730		dvfslocked = 0;
731	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
732		VOP_UNLOCK(ndp->ni_dvp, 0, td);
733
734	if (cnp->cn_flags & AUDITVNODE1)
735		AUDIT_ARG(vnode, dp, ARG_VNODE1);
736	else if (cnp->cn_flags & AUDITVNODE2)
737		AUDIT_ARG(vnode, dp, ARG_VNODE2);
738
739	if ((cnp->cn_flags & LOCKLEAF) == 0)
740		VOP_UNLOCK(dp, 0, td);
741success:
742	if (vfslocked && dvfslocked)
743		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
744	if (vfslocked || dvfslocked)
745		ndp->ni_cnd.cn_flags |= GIANTHELD;
746	return (0);
747
748bad2:
749	if (dp != ndp->ni_dvp)
750		vput(ndp->ni_dvp);
751	else
752		vrele(ndp->ni_dvp);
753bad:
754	if (!dpunlocked)
755		vput(dp);
756	VFS_UNLOCK_GIANT(vfslocked);
757	VFS_UNLOCK_GIANT(dvfslocked);
758	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
759	ndp->ni_vp = NULL;
760	return (error);
761}
762
763/*
764 * relookup - lookup a path name component
765 *    Used by lookup to re-aquire things.
766 */
767int
768relookup(dvp, vpp, cnp)
769	struct vnode *dvp, **vpp;
770	struct componentname *cnp;
771{
772	struct thread *td = cnp->cn_thread;
773	struct vnode *dp = 0;		/* the directory we are searching */
774	int wantparent;			/* 1 => wantparent or lockparent flag */
775	int rdonly;			/* lookup read-only flag bit */
776	int error = 0;
777
778	KASSERT(cnp->cn_flags & ISLASTCN,
779	    ("relookup: Not given last component."));
780	/*
781	 * Setup: break out flag bits into variables.
782	 */
783	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
784	KASSERT(wantparent, ("relookup: parent not wanted."));
785	rdonly = cnp->cn_flags & RDONLY;
786	cnp->cn_flags &= ~ISSYMLINK;
787	dp = dvp;
788	cnp->cn_lkflags = LK_EXCLUSIVE;
789	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
790
791	/*
792	 * Search a new directory.
793	 *
794	 * The last component of the filename is left accessible via
795	 * cnp->cn_nameptr for callers that need the name. Callers needing
796	 * the name set the SAVENAME flag. When done, they assume
797	 * responsibility for freeing the pathname buffer.
798	 */
799#ifdef NAMEI_DIAGNOSTIC
800	printf("{%s}: ", cnp->cn_nameptr);
801#endif
802
803	/*
804	 * Check for degenerate name (e.g. / or "")
805	 * which is a way of talking about a directory,
806	 * e.g. like "/." or ".".
807	 */
808	if (cnp->cn_nameptr[0] == '\0') {
809		if (cnp->cn_nameiop != LOOKUP || wantparent) {
810			error = EISDIR;
811			goto bad;
812		}
813		if (dp->v_type != VDIR) {
814			error = ENOTDIR;
815			goto bad;
816		}
817		if (!(cnp->cn_flags & LOCKLEAF))
818			VOP_UNLOCK(dp, 0, td);
819		*vpp = dp;
820		/* XXX This should probably move to the top of function. */
821		if (cnp->cn_flags & SAVESTART)
822			panic("lookup: SAVESTART");
823		return (0);
824	}
825
826	if (cnp->cn_flags & ISDOTDOT)
827		panic ("relookup: lookup on dot-dot");
828
829	/*
830	 * We now have a segment name to search for, and a directory to search.
831	 */
832#ifdef NAMEI_DIAGNOSTIC
833	vprint("search in:", dp);
834#endif
835	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
836		KASSERT(*vpp == NULL, ("leaf should be empty"));
837		if (error != EJUSTRETURN)
838			goto bad;
839		/*
840		 * If creating and at end of pathname, then can consider
841		 * allowing file to be created.
842		 */
843		if (rdonly) {
844			error = EROFS;
845			goto bad;
846		}
847		/* ASSERT(dvp == ndp->ni_startdir) */
848		if (cnp->cn_flags & SAVESTART)
849			VREF(dvp);
850		if ((cnp->cn_flags & LOCKPARENT) == 0)
851			VOP_UNLOCK(dp, 0, td);
852		/*
853		 * This is a temporary assert to make sure I know what the
854		 * behavior here was.
855		 */
856		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
857		   ("relookup: Unhandled case."));
858		/*
859		 * We return with ni_vp NULL to indicate that the entry
860		 * doesn't currently exist, leaving a pointer to the
861		 * (possibly locked) directory inode in ndp->ni_dvp.
862		 */
863		return (0);
864	}
865	dp = *vpp;
866
867	/*
868	 * Disallow directory write attempts on read-only filesystems.
869	 */
870	if (rdonly &&
871	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
872		if (dvp == dp)
873			vrele(dvp);
874		else
875			vput(dvp);
876		error = EROFS;
877		goto bad;
878	}
879	/*
880	 * Set the parent lock/ref state to the requested state.
881	 */
882	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
883		if (wantparent)
884			VOP_UNLOCK(dvp, 0, td);
885		else
886			vput(dvp);
887	} else if (!wantparent)
888		vrele(dvp);
889	/*
890	 * Check for symbolic link
891	 */
892	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
893	    ("relookup: symlink found.\n"));
894
895	/* ASSERT(dvp == ndp->ni_startdir) */
896	if (cnp->cn_flags & SAVESTART)
897		VREF(dvp);
898
899	if ((cnp->cn_flags & LOCKLEAF) == 0)
900		VOP_UNLOCK(dp, 0, td);
901	return (0);
902bad:
903	vput(dp);
904	*vpp = NULL;
905	return (error);
906}
907
908/*
909 * Free data allocated by namei(); see namei(9) for details.
910 */
911void
912NDFREE(ndp, flags)
913     struct nameidata *ndp;
914     const u_int flags;
915{
916	int unlock_dvp;
917	int unlock_vp;
918
919	unlock_dvp = 0;
920	unlock_vp = 0;
921
922	if (!(flags & NDF_NO_FREE_PNBUF) &&
923	    (ndp->ni_cnd.cn_flags & HASBUF)) {
924		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
925		ndp->ni_cnd.cn_flags &= ~HASBUF;
926	}
927	if (!(flags & NDF_NO_VP_UNLOCK) &&
928	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
929		unlock_vp = 1;
930	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
931		if (unlock_vp) {
932			vput(ndp->ni_vp);
933			unlock_vp = 0;
934		} else
935			vrele(ndp->ni_vp);
936		ndp->ni_vp = NULL;
937	}
938	if (unlock_vp)
939		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
940	if (!(flags & NDF_NO_DVP_UNLOCK) &&
941	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
942	    ndp->ni_dvp != ndp->ni_vp)
943		unlock_dvp = 1;
944	if (!(flags & NDF_NO_DVP_RELE) &&
945	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
946		if (unlock_dvp) {
947			vput(ndp->ni_dvp);
948			unlock_dvp = 0;
949		} else
950			vrele(ndp->ni_dvp);
951		ndp->ni_dvp = NULL;
952	}
953	if (unlock_dvp)
954		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
955	if (!(flags & NDF_NO_STARTDIR_RELE) &&
956	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
957		vrele(ndp->ni_startdir);
958		ndp->ni_startdir = NULL;
959	}
960}
961
962/*
963 * Determine if there is a suitable alternate filename under the specified
964 * prefix for the specified path.  If the create flag is set, then the
965 * alternate prefix will be used so long as the parent directory exists.
966 * This is used by the various compatiblity ABIs so that Linux binaries prefer
967 * files under /compat/linux for example.  The chosen path (whether under
968 * the prefix or under /) is returned in a kernel malloc'd buffer pointed
969 * to by pathbuf.  The caller is responsible for free'ing the buffer from
970 * the M_TEMP bucket if one is returned.
971 */
972int
973kern_alternate_path(struct thread *td, const char *prefix, char *path,
974    enum uio_seg pathseg, char **pathbuf, int create)
975{
976	struct nameidata nd, ndroot;
977	char *ptr, *buf, *cp;
978	size_t len, sz;
979	int error;
980
981	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
982	*pathbuf = buf;
983
984	/* Copy the prefix into the new pathname as a starting point. */
985	len = strlcpy(buf, prefix, MAXPATHLEN);
986	if (len >= MAXPATHLEN) {
987		*pathbuf = NULL;
988		free(buf, M_TEMP);
989		return (EINVAL);
990	}
991	sz = MAXPATHLEN - len;
992	ptr = buf + len;
993
994	/* Append the filename to the prefix. */
995	if (pathseg == UIO_SYSSPACE)
996		error = copystr(path, ptr, sz, &len);
997	else
998		error = copyinstr(path, ptr, sz, &len);
999
1000	if (error) {
1001		*pathbuf = NULL;
1002		free(buf, M_TEMP);
1003		return (error);
1004	}
1005
1006	/* Only use a prefix with absolute pathnames. */
1007	if (*ptr != '/') {
1008		error = EINVAL;
1009		goto keeporig;
1010	}
1011
1012	/*
1013	 * We know that there is a / somewhere in this pathname.
1014	 * Search backwards for it, to find the file's parent dir
1015	 * to see if it exists in the alternate tree. If it does,
1016	 * and we want to create a file (cflag is set). We don't
1017	 * need to worry about the root comparison in this case.
1018	 */
1019
1020	if (create) {
1021		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1022		*cp = '\0';
1023
1024		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1025		error = namei(&nd);
1026		*cp = '/';
1027		if (error != 0)
1028			goto keeporig;
1029	} else {
1030		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1031
1032		error = namei(&nd);
1033		if (error != 0)
1034			goto keeporig;
1035
1036		/*
1037		 * We now compare the vnode of the prefix to the one
1038		 * vnode asked. If they resolve to be the same, then we
1039		 * ignore the match so that the real root gets used.
1040		 * This avoids the problem of traversing "../.." to find the
1041		 * root directory and never finding it, because "/" resolves
1042		 * to the emulation root directory. This is expensive :-(
1043		 */
1044		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
1045		    td);
1046
1047		/* We shouldn't ever get an error from this namei(). */
1048		error = namei(&ndroot);
1049		if (error == 0) {
1050			if (nd.ni_vp == ndroot.ni_vp)
1051				error = ENOENT;
1052
1053			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1054			vrele(ndroot.ni_vp);
1055			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
1056		}
1057	}
1058
1059	NDFREE(&nd, NDF_ONLY_PNBUF);
1060	vrele(nd.ni_vp);
1061	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
1062
1063keeporig:
1064	/* If there was an error, use the original path name. */
1065	if (error)
1066		bcopy(ptr, buf, len);
1067	return (error);
1068}
1069