vfs_lookup.c revision 163606
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_lookup.c 163606 2006-10-22 11:52:19Z rwatson $");
39
40#include "opt_ktrace.h"
41#include "opt_mac.h"
42#include "opt_vfs.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/vnode.h>
51#include <sys/mount.h>
52#include <sys/filedesc.h>
53#include <sys/proc.h>
54#include <sys/syscallsubr.h>
55#include <sys/sysctl.h>
56#ifdef KTRACE
57#include <sys/ktrace.h>
58#endif
59
60#include <security/audit/audit.h>
61#include <security/mac/mac_framework.h>
62
63#include <vm/uma.h>
64
65#define	NAMEI_DIAGNOSTIC 1
66#undef NAMEI_DIAGNOSTIC
67
68/*
69 * Allocation zone for namei
70 */
71uma_zone_t namei_zone;
72
73static void
74nameiinit(void *dummy __unused)
75{
76	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
77	    UMA_ALIGN_PTR, 0);
78
79}
80SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
81
82#ifdef LOOKUP_SHARED
83static int lookup_shared = 1;
84#else
85static int lookup_shared = 0;
86#endif
87SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
88    "Enables/Disables shared locks for path name translation");
89
90/*
91 * Convert a pathname into a pointer to a locked vnode.
92 *
93 * The FOLLOW flag is set when symbolic links are to be followed
94 * when they occur at the end of the name translation process.
95 * Symbolic links are always followed for all other pathname
96 * components other than the last.
97 *
98 * The segflg defines whether the name is to be copied from user
99 * space or kernel space.
100 *
101 * Overall outline of namei:
102 *
103 *	copy in name
104 *	get starting directory
105 *	while (!done && !error) {
106 *		call lookup to search path.
107 *		if symbolic link, massage name in buffer and continue
108 *	}
109 */
110int
111namei(struct nameidata *ndp)
112{
113	struct filedesc *fdp;	/* pointer to file descriptor state */
114	char *cp;		/* pointer into pathname argument */
115	struct vnode *dp;	/* the directory we are searching */
116	struct iovec aiov;		/* uio for reading symbolic links */
117	struct uio auio;
118	int error, linklen;
119	struct componentname *cnp = &ndp->ni_cnd;
120	struct thread *td = cnp->cn_thread;
121	struct proc *p = td->td_proc;
122	int vfslocked;
123
124	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
125	    ("NOT MPSAFE and Giant not held"));
126	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
127	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
128	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
129	    ("namei: nameiop contaminated with flags"));
130	KASSERT((cnp->cn_flags & OPMASK) == 0,
131	    ("namei: flags contaminated with nameiops"));
132	if (!lookup_shared)
133		cnp->cn_flags &= ~LOCKSHARED;
134	fdp = p->p_fd;
135
136	/*
137	 * Get a buffer for the name to be translated, and copy the
138	 * name into the buffer.
139	 */
140	if ((cnp->cn_flags & HASBUF) == 0)
141		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
142	if (ndp->ni_segflg == UIO_SYSSPACE)
143		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
144			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
145	else
146		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
147			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
148
149	/* If we are auditing the kernel pathname, save the user pathname. */
150	if (cnp->cn_flags & AUDITVNODE1)
151		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
152	if (cnp->cn_flags & AUDITVNODE2)
153		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
154
155	/*
156	 * Don't allow empty pathnames.
157	 */
158	if (!error && *cnp->cn_pnbuf == '\0')
159		error = ENOENT;
160
161	if (error) {
162		uma_zfree(namei_zone, cnp->cn_pnbuf);
163#ifdef DIAGNOSTIC
164		cnp->cn_pnbuf = NULL;
165		cnp->cn_nameptr = NULL;
166#endif
167		ndp->ni_vp = NULL;
168		return (error);
169	}
170	ndp->ni_loopcnt = 0;
171#ifdef KTRACE
172	if (KTRPOINT(td, KTR_NAMEI)) {
173		KASSERT(cnp->cn_thread == curthread,
174		    ("namei not using curthread"));
175		ktrnamei(cnp->cn_pnbuf);
176	}
177#endif
178
179	/*
180	 * Get starting point for the translation.
181	 */
182	FILEDESC_LOCK(fdp);
183	ndp->ni_rootdir = fdp->fd_rdir;
184	ndp->ni_topdir = fdp->fd_jdir;
185
186	dp = fdp->fd_cdir;
187	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
188	VREF(dp);
189	FILEDESC_UNLOCK(fdp);
190	for (;;) {
191		/*
192		 * Check if root directory should replace current directory.
193		 * Done at start of translation and after symbolic link.
194		 */
195		cnp->cn_nameptr = cnp->cn_pnbuf;
196		if (*(cnp->cn_nameptr) == '/') {
197			vrele(dp);
198			VFS_UNLOCK_GIANT(vfslocked);
199			while (*(cnp->cn_nameptr) == '/') {
200				cnp->cn_nameptr++;
201				ndp->ni_pathlen--;
202			}
203			dp = ndp->ni_rootdir;
204			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
205			VREF(dp);
206		}
207		if (vfslocked)
208			ndp->ni_cnd.cn_flags |= GIANTHELD;
209		ndp->ni_startdir = dp;
210		error = lookup(ndp);
211		if (error) {
212			uma_zfree(namei_zone, cnp->cn_pnbuf);
213#ifdef DIAGNOSTIC
214			cnp->cn_pnbuf = NULL;
215			cnp->cn_nameptr = NULL;
216#endif
217			return (error);
218		}
219		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
220		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
221		/*
222		 * Check for symbolic link
223		 */
224		if ((cnp->cn_flags & ISSYMLINK) == 0) {
225			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
226				uma_zfree(namei_zone, cnp->cn_pnbuf);
227#ifdef DIAGNOSTIC
228				cnp->cn_pnbuf = NULL;
229				cnp->cn_nameptr = NULL;
230#endif
231			} else
232				cnp->cn_flags |= HASBUF;
233
234			if ((cnp->cn_flags & MPSAFE) == 0) {
235				VFS_UNLOCK_GIANT(vfslocked);
236			} else if (vfslocked)
237				ndp->ni_cnd.cn_flags |= GIANTHELD;
238			return (0);
239		}
240		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
241			error = ELOOP;
242			break;
243		}
244#ifdef MAC
245		if ((cnp->cn_flags & NOMACCHECK) == 0) {
246			error = mac_check_vnode_readlink(td->td_ucred,
247			    ndp->ni_vp);
248			if (error)
249				break;
250		}
251#endif
252		if (ndp->ni_pathlen > 1)
253			cp = uma_zalloc(namei_zone, M_WAITOK);
254		else
255			cp = cnp->cn_pnbuf;
256		aiov.iov_base = cp;
257		aiov.iov_len = MAXPATHLEN;
258		auio.uio_iov = &aiov;
259		auio.uio_iovcnt = 1;
260		auio.uio_offset = 0;
261		auio.uio_rw = UIO_READ;
262		auio.uio_segflg = UIO_SYSSPACE;
263		auio.uio_td = (struct thread *)0;
264		auio.uio_resid = MAXPATHLEN;
265		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
266		if (error) {
267			if (ndp->ni_pathlen > 1)
268				uma_zfree(namei_zone, cp);
269			break;
270		}
271		linklen = MAXPATHLEN - auio.uio_resid;
272		if (linklen == 0) {
273			if (ndp->ni_pathlen > 1)
274				uma_zfree(namei_zone, cp);
275			error = ENOENT;
276			break;
277		}
278		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
279			if (ndp->ni_pathlen > 1)
280				uma_zfree(namei_zone, cp);
281			error = ENAMETOOLONG;
282			break;
283		}
284		if (ndp->ni_pathlen > 1) {
285			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
286			uma_zfree(namei_zone, cnp->cn_pnbuf);
287			cnp->cn_pnbuf = cp;
288		} else
289			cnp->cn_pnbuf[linklen] = '\0';
290		ndp->ni_pathlen += linklen;
291		vput(ndp->ni_vp);
292		dp = ndp->ni_dvp;
293	}
294	uma_zfree(namei_zone, cnp->cn_pnbuf);
295#ifdef DIAGNOSTIC
296	cnp->cn_pnbuf = NULL;
297	cnp->cn_nameptr = NULL;
298#endif
299	vput(ndp->ni_vp);
300	ndp->ni_vp = NULL;
301	vrele(ndp->ni_dvp);
302	VFS_UNLOCK_GIANT(vfslocked);
303	return (error);
304}
305
306static int
307compute_cn_lkflags(struct mount *mp, int lkflags)
308{
309	if (mp == NULL ||
310	    ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
311		lkflags &= ~LK_SHARED;
312		lkflags |= LK_EXCLUSIVE;
313	}
314	return lkflags;
315}
316
317/*
318 * Search a pathname.
319 * This is a very central and rather complicated routine.
320 *
321 * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
322 * The starting directory is taken from ni_startdir. The pathname is
323 * descended until done, or a symbolic link is encountered. The variable
324 * ni_more is clear if the path is completed; it is set to one if a
325 * symbolic link needing interpretation is encountered.
326 *
327 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
328 * whether the name is to be looked up, created, renamed, or deleted.
329 * When CREATE, RENAME, or DELETE is specified, information usable in
330 * creating, renaming, or deleting a directory entry may be calculated.
331 * If flag has LOCKPARENT or'ed into it, the parent directory is returned
332 * locked. If flag has WANTPARENT or'ed into it, the parent directory is
333 * returned unlocked. Otherwise the parent directory is not returned. If
334 * the target of the pathname exists and LOCKLEAF is or'ed into the flag
335 * the target is returned locked, otherwise it is returned unlocked.
336 * When creating or renaming and LOCKPARENT is specified, the target may not
337 * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
338 *
339 * Overall outline of lookup:
340 *
341 * dirloop:
342 *	identify next component of name at ndp->ni_ptr
343 *	handle degenerate case where name is null string
344 *	if .. and crossing mount points and on mounted filesys, find parent
345 *	call VOP_LOOKUP routine for next component name
346 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
347 *	    component vnode returned in ni_vp (if it exists), locked.
348 *	if result vnode is mounted on and crossing mount points,
349 *	    find mounted on vnode
350 *	if more components of name, do next level at dirloop
351 *	return the answer in ni_vp, locked if LOCKLEAF set
352 *	    if LOCKPARENT set, return locked parent in ni_dvp
353 *	    if WANTPARENT set, return unlocked parent in ni_dvp
354 */
355int
356lookup(struct nameidata *ndp)
357{
358	char *cp;		/* pointer into pathname argument */
359	struct vnode *dp = 0;	/* the directory we are searching */
360	struct vnode *tdp;		/* saved dp */
361	struct mount *mp;		/* mount table entry */
362	int docache;			/* == 0 do not cache last component */
363	int wantparent;			/* 1 => wantparent or lockparent flag */
364	int rdonly;			/* lookup read-only flag bit */
365	int trailing_slash;
366	int error = 0;
367	int dpunlocked = 0;		/* dp has already been unlocked */
368	struct componentname *cnp = &ndp->ni_cnd;
369	struct thread *td = cnp->cn_thread;
370	int vfslocked;			/* VFS Giant state for child */
371	int dvfslocked;			/* VFS Giant state for parent */
372	int tvfslocked;
373	int lkflags_save;
374
375	/*
376	 * Setup: break out flag bits into variables.
377	 */
378	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
379	vfslocked = 0;
380	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
381	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
382	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
383	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
384	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
385	if (cnp->cn_nameiop == DELETE ||
386	    (wantparent && cnp->cn_nameiop != CREATE &&
387	     cnp->cn_nameiop != LOOKUP))
388		docache = 0;
389	rdonly = cnp->cn_flags & RDONLY;
390	cnp->cn_flags &= ~ISSYMLINK;
391	ndp->ni_dvp = NULL;
392	/*
393	 * We use shared locks until we hit the parent of the last cn then
394	 * we adjust based on the requesting flags.
395	 */
396	if (lookup_shared)
397		cnp->cn_lkflags = LK_SHARED;
398	else
399		cnp->cn_lkflags = LK_EXCLUSIVE;
400	dp = ndp->ni_startdir;
401	ndp->ni_startdir = NULLVP;
402	vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
403
404dirloop:
405	/*
406	 * Search a new directory.
407	 *
408	 * The last component of the filename is left accessible via
409	 * cnp->cn_nameptr for callers that need the name. Callers needing
410	 * the name set the SAVENAME flag. When done, they assume
411	 * responsibility for freeing the pathname buffer.
412	 */
413	cnp->cn_consume = 0;
414	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
415		continue;
416	cnp->cn_namelen = cp - cnp->cn_nameptr;
417	if (cnp->cn_namelen > NAME_MAX) {
418		error = ENAMETOOLONG;
419		goto bad;
420	}
421#ifdef NAMEI_DIAGNOSTIC
422	{ char c = *cp;
423	*cp = '\0';
424	printf("{%s}: ", cnp->cn_nameptr);
425	*cp = c; }
426#endif
427	ndp->ni_pathlen -= cnp->cn_namelen;
428	ndp->ni_next = cp;
429
430	/*
431	 * Replace multiple slashes by a single slash and trailing slashes
432	 * by a null.  This must be done before VOP_LOOKUP() because some
433	 * fs's don't know about trailing slashes.  Remember if there were
434	 * trailing slashes to handle symlinks, existing non-directories
435	 * and non-existing files that won't be directories specially later.
436	 */
437	trailing_slash = 0;
438	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
439		cp++;
440		ndp->ni_pathlen--;
441		if (*cp == '\0') {
442			trailing_slash = 1;
443			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
444		}
445	}
446	ndp->ni_next = cp;
447
448	cnp->cn_flags |= MAKEENTRY;
449	if (*cp == '\0' && docache == 0)
450		cnp->cn_flags &= ~MAKEENTRY;
451	if (cnp->cn_namelen == 2 &&
452	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
453		cnp->cn_flags |= ISDOTDOT;
454	else
455		cnp->cn_flags &= ~ISDOTDOT;
456	if (*ndp->ni_next == 0)
457		cnp->cn_flags |= ISLASTCN;
458	else
459		cnp->cn_flags &= ~ISLASTCN;
460
461
462	/*
463	 * Check for degenerate name (e.g. / or "")
464	 * which is a way of talking about a directory,
465	 * e.g. like "/." or ".".
466	 */
467	if (cnp->cn_nameptr[0] == '\0') {
468		if (dp->v_type != VDIR) {
469			error = ENOTDIR;
470			goto bad;
471		}
472		if (cnp->cn_nameiop != LOOKUP) {
473			error = EISDIR;
474			goto bad;
475		}
476		if (wantparent) {
477			ndp->ni_dvp = dp;
478			VREF(dp);
479		}
480		ndp->ni_vp = dp;
481
482		if (cnp->cn_flags & AUDITVNODE1)
483			AUDIT_ARG(vnode, dp, ARG_VNODE1);
484		else if (cnp->cn_flags & AUDITVNODE2)
485			AUDIT_ARG(vnode, dp, ARG_VNODE2);
486
487		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
488			VOP_UNLOCK(dp, 0, td);
489		/* XXX This should probably move to the top of function. */
490		if (cnp->cn_flags & SAVESTART)
491			panic("lookup: SAVESTART");
492		goto success;
493	}
494
495	/*
496	 * Handle "..": four special cases.
497	 * 1. Return an error if this is the last component of
498	 *    the name and the operation is DELETE or RENAME.
499	 * 2. If at root directory (e.g. after chroot)
500	 *    or at absolute root directory
501	 *    then ignore it so can't get out.
502	 * 3. If this vnode is the root of a mounted
503	 *    filesystem, then replace it with the
504	 *    vnode which was mounted on so we take the
505	 *    .. in the other filesystem.
506	 * 4. If the vnode is the top directory of
507	 *    the jail or chroot, don't let them out.
508	 */
509	if (cnp->cn_flags & ISDOTDOT) {
510		if ((cnp->cn_flags & ISLASTCN) != 0 &&
511		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
512			error = EINVAL;
513			goto bad;
514		}
515		for (;;) {
516			if (dp == ndp->ni_rootdir ||
517			    dp == ndp->ni_topdir ||
518			    dp == rootvnode) {
519				ndp->ni_dvp = dp;
520				ndp->ni_vp = dp;
521				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
522				VREF(dp);
523				goto nextname;
524			}
525			if ((dp->v_vflag & VV_ROOT) == 0 ||
526			    (cnp->cn_flags & NOCROSSMOUNT))
527				break;
528			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
529				error = EBADF;
530				goto bad;
531			}
532			tdp = dp;
533			dp = dp->v_mount->mnt_vnodecovered;
534			tvfslocked = dvfslocked;
535			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
536			VREF(dp);
537			vput(tdp);
538			VFS_UNLOCK_GIANT(tvfslocked);
539			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
540		}
541	}
542
543	/*
544	 * We now have a segment name to search for, and a directory to search.
545	 */
546unionlookup:
547#ifdef MAC
548	if ((cnp->cn_flags & NOMACCHECK) == 0) {
549		error = mac_check_vnode_lookup(td->td_ucred, dp, cnp);
550		if (error)
551			goto bad;
552	}
553#endif
554	ndp->ni_dvp = dp;
555	ndp->ni_vp = NULL;
556	ASSERT_VOP_LOCKED(dp, "lookup");
557	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
558	/*
559	 * If we have a shared lock we may need to upgrade the lock for the
560	 * last operation.
561	 */
562	if (VOP_ISLOCKED(dp, td) == LK_SHARED &&
563	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
564		vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
565	/*
566	 * If we're looking up the last component and we need an exclusive
567	 * lock, adjust our lkflags.
568	 */
569	if ((cnp->cn_flags & (ISLASTCN|LOCKSHARED|LOCKLEAF)) ==
570	    (ISLASTCN|LOCKLEAF))
571		cnp->cn_lkflags = LK_EXCLUSIVE;
572#ifdef NAMEI_DIAGNOSTIC
573	vprint("lookup in", dp);
574#endif
575	lkflags_save = cnp->cn_lkflags;
576	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
577	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
578		cnp->cn_lkflags = lkflags_save;
579		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
580#ifdef NAMEI_DIAGNOSTIC
581		printf("not found\n");
582#endif
583		if ((error == ENOENT) &&
584		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
585		    (dp->v_mount->mnt_flag & MNT_UNION)) {
586			tdp = dp;
587			dp = dp->v_mount->mnt_vnodecovered;
588			tvfslocked = dvfslocked;
589			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
590			VREF(dp);
591			vput(tdp);
592			VFS_UNLOCK_GIANT(tvfslocked);
593			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
594			goto unionlookup;
595		}
596
597		if (error != EJUSTRETURN)
598			goto bad;
599		/*
600		 * If creating and at end of pathname, then can consider
601		 * allowing file to be created.
602		 */
603		if (rdonly) {
604			error = EROFS;
605			goto bad;
606		}
607		if (*cp == '\0' && trailing_slash &&
608		     !(cnp->cn_flags & WILLBEDIR)) {
609			error = ENOENT;
610			goto bad;
611		}
612		if ((cnp->cn_flags & LOCKPARENT) == 0)
613			VOP_UNLOCK(dp, 0, td);
614		/*
615		 * This is a temporary assert to make sure I know what the
616		 * behavior here was.
617		 */
618		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
619		   ("lookup: Unhandled case."));
620		/*
621		 * We return with ni_vp NULL to indicate that the entry
622		 * doesn't currently exist, leaving a pointer to the
623		 * (possibly locked) directory vnode in ndp->ni_dvp.
624		 */
625		if (cnp->cn_flags & SAVESTART) {
626			ndp->ni_startdir = ndp->ni_dvp;
627			VREF(ndp->ni_startdir);
628		}
629		goto success;
630	} else
631		cnp->cn_lkflags = lkflags_save;
632#ifdef NAMEI_DIAGNOSTIC
633	printf("found\n");
634#endif
635	/*
636	 * Take into account any additional components consumed by
637	 * the underlying filesystem.
638	 */
639	if (cnp->cn_consume > 0) {
640		cnp->cn_nameptr += cnp->cn_consume;
641		ndp->ni_next += cnp->cn_consume;
642		ndp->ni_pathlen -= cnp->cn_consume;
643		cnp->cn_consume = 0;
644	}
645
646	dp = ndp->ni_vp;
647	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
648
649	/*
650	 * Check to see if the vnode has been mounted on;
651	 * if so find the root of the mounted filesystem.
652	 */
653	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
654	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
655		if (vfs_busy(mp, 0, 0, td))
656			continue;
657		vput(dp);
658		VFS_UNLOCK_GIANT(vfslocked);
659		vfslocked = VFS_LOCK_GIANT(mp);
660		if (dp != ndp->ni_dvp)
661			VOP_UNLOCK(ndp->ni_dvp, 0, td);
662		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
663		vfs_unbusy(mp, td);
664		vn_lock(ndp->ni_dvp, compute_cn_lkflags(mp, cnp->cn_lkflags | LK_RETRY), td);
665		if (error) {
666			dpunlocked = 1;
667			goto bad2;
668		}
669		ndp->ni_vp = dp = tdp;
670	}
671
672	/*
673	 * Check for symbolic link
674	 */
675	if ((dp->v_type == VLNK) &&
676	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
677	     *ndp->ni_next == '/')) {
678		cnp->cn_flags |= ISSYMLINK;
679		if (dp->v_iflag & VI_DOOMED) {
680			/* We can't know whether the directory was mounted with
681			 * NOSYMFOLLOW, so we can't follow safely. */
682			error = EBADF;
683			goto bad2;
684		}
685		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
686			error = EACCES;
687			goto bad2;
688		}
689		/*
690		 * Symlink code always expects an unlocked dvp.
691		 */
692		if (ndp->ni_dvp != ndp->ni_vp)
693			VOP_UNLOCK(ndp->ni_dvp, 0, td);
694		goto success;
695	}
696
697	/*
698	 * Check for bogus trailing slashes.
699	 */
700	if (trailing_slash && dp->v_type != VDIR) {
701		error = ENOTDIR;
702		goto bad2;
703	}
704
705nextname:
706	/*
707	 * Not a symbolic link.  If more pathname,
708	 * continue at next component, else return.
709	 */
710	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
711	    ("lookup: invalid path state."));
712	if (*ndp->ni_next == '/') {
713		cnp->cn_nameptr = ndp->ni_next;
714		while (*cnp->cn_nameptr == '/') {
715			cnp->cn_nameptr++;
716			ndp->ni_pathlen--;
717		}
718		if (ndp->ni_dvp != dp)
719			vput(ndp->ni_dvp);
720		else
721			vrele(ndp->ni_dvp);
722		VFS_UNLOCK_GIANT(dvfslocked);
723		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
724		vfslocked = 0;
725		goto dirloop;
726	}
727	/*
728	 * Disallow directory write attempts on read-only filesystems.
729	 */
730	if (rdonly &&
731	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
732		error = EROFS;
733		goto bad2;
734	}
735	if (cnp->cn_flags & SAVESTART) {
736		ndp->ni_startdir = ndp->ni_dvp;
737		VREF(ndp->ni_startdir);
738	}
739	if (!wantparent) {
740		if (ndp->ni_dvp != dp)
741			vput(ndp->ni_dvp);
742		else
743			vrele(ndp->ni_dvp);
744		VFS_UNLOCK_GIANT(dvfslocked);
745		dvfslocked = 0;
746	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
747		VOP_UNLOCK(ndp->ni_dvp, 0, td);
748
749	if (cnp->cn_flags & AUDITVNODE1)
750		AUDIT_ARG(vnode, dp, ARG_VNODE1);
751	else if (cnp->cn_flags & AUDITVNODE2)
752		AUDIT_ARG(vnode, dp, ARG_VNODE2);
753
754	if ((cnp->cn_flags & LOCKLEAF) == 0)
755		VOP_UNLOCK(dp, 0, td);
756success:
757	if (vfslocked && dvfslocked)
758		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
759	if (vfslocked || dvfslocked)
760		ndp->ni_cnd.cn_flags |= GIANTHELD;
761	return (0);
762
763bad2:
764	if (dp != ndp->ni_dvp)
765		vput(ndp->ni_dvp);
766	else
767		vrele(ndp->ni_dvp);
768bad:
769	if (!dpunlocked)
770		vput(dp);
771	VFS_UNLOCK_GIANT(vfslocked);
772	VFS_UNLOCK_GIANT(dvfslocked);
773	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
774	ndp->ni_vp = NULL;
775	return (error);
776}
777
778/*
779 * relookup - lookup a path name component
780 *    Used by lookup to re-aquire things.
781 */
782int
783relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
784{
785	struct thread *td = cnp->cn_thread;
786	struct vnode *dp = 0;		/* the directory we are searching */
787	int wantparent;			/* 1 => wantparent or lockparent flag */
788	int rdonly;			/* lookup read-only flag bit */
789	int error = 0;
790
791	KASSERT(cnp->cn_flags & ISLASTCN,
792	    ("relookup: Not given last component."));
793	/*
794	 * Setup: break out flag bits into variables.
795	 */
796	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
797	KASSERT(wantparent, ("relookup: parent not wanted."));
798	rdonly = cnp->cn_flags & RDONLY;
799	cnp->cn_flags &= ~ISSYMLINK;
800	dp = dvp;
801	cnp->cn_lkflags = LK_EXCLUSIVE;
802	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
803
804	/*
805	 * Search a new directory.
806	 *
807	 * The last component of the filename is left accessible via
808	 * cnp->cn_nameptr for callers that need the name. Callers needing
809	 * the name set the SAVENAME flag. When done, they assume
810	 * responsibility for freeing the pathname buffer.
811	 */
812#ifdef NAMEI_DIAGNOSTIC
813	printf("{%s}: ", cnp->cn_nameptr);
814#endif
815
816	/*
817	 * Check for degenerate name (e.g. / or "")
818	 * which is a way of talking about a directory,
819	 * e.g. like "/." or ".".
820	 */
821	if (cnp->cn_nameptr[0] == '\0') {
822		if (cnp->cn_nameiop != LOOKUP || wantparent) {
823			error = EISDIR;
824			goto bad;
825		}
826		if (dp->v_type != VDIR) {
827			error = ENOTDIR;
828			goto bad;
829		}
830		if (!(cnp->cn_flags & LOCKLEAF))
831			VOP_UNLOCK(dp, 0, td);
832		*vpp = dp;
833		/* XXX This should probably move to the top of function. */
834		if (cnp->cn_flags & SAVESTART)
835			panic("lookup: SAVESTART");
836		return (0);
837	}
838
839	if (cnp->cn_flags & ISDOTDOT)
840		panic ("relookup: lookup on dot-dot");
841
842	/*
843	 * We now have a segment name to search for, and a directory to search.
844	 */
845#ifdef NAMEI_DIAGNOSTIC
846	vprint("search in:", dp);
847#endif
848	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
849		KASSERT(*vpp == NULL, ("leaf should be empty"));
850		if (error != EJUSTRETURN)
851			goto bad;
852		/*
853		 * If creating and at end of pathname, then can consider
854		 * allowing file to be created.
855		 */
856		if (rdonly) {
857			error = EROFS;
858			goto bad;
859		}
860		/* ASSERT(dvp == ndp->ni_startdir) */
861		if (cnp->cn_flags & SAVESTART)
862			VREF(dvp);
863		if ((cnp->cn_flags & LOCKPARENT) == 0)
864			VOP_UNLOCK(dp, 0, td);
865		/*
866		 * This is a temporary assert to make sure I know what the
867		 * behavior here was.
868		 */
869		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
870		   ("relookup: Unhandled case."));
871		/*
872		 * We return with ni_vp NULL to indicate that the entry
873		 * doesn't currently exist, leaving a pointer to the
874		 * (possibly locked) directory vnode in ndp->ni_dvp.
875		 */
876		return (0);
877	}
878
879	dp = *vpp;
880
881	/*
882	 * Disallow directory write attempts on read-only filesystems.
883	 */
884	if (rdonly &&
885	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
886		if (dvp == dp)
887			vrele(dvp);
888		else
889			vput(dvp);
890		error = EROFS;
891		goto bad;
892	}
893	/*
894	 * Set the parent lock/ref state to the requested state.
895	 */
896	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
897		if (wantparent)
898			VOP_UNLOCK(dvp, 0, td);
899		else
900			vput(dvp);
901	} else if (!wantparent)
902		vrele(dvp);
903	/*
904	 * Check for symbolic link
905	 */
906	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
907	    ("relookup: symlink found.\n"));
908
909	/* ASSERT(dvp == ndp->ni_startdir) */
910	if (cnp->cn_flags & SAVESTART)
911		VREF(dvp);
912
913	if ((cnp->cn_flags & LOCKLEAF) == 0)
914		VOP_UNLOCK(dp, 0, td);
915	return (0);
916bad:
917	vput(dp);
918	*vpp = NULL;
919	return (error);
920}
921
922/*
923 * Free data allocated by namei(); see namei(9) for details.
924 */
925void
926NDFREE(struct nameidata *ndp, const u_int flags)
927{
928	int unlock_dvp;
929	int unlock_vp;
930
931	unlock_dvp = 0;
932	unlock_vp = 0;
933
934	if (!(flags & NDF_NO_FREE_PNBUF) &&
935	    (ndp->ni_cnd.cn_flags & HASBUF)) {
936		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
937		ndp->ni_cnd.cn_flags &= ~HASBUF;
938	}
939	if (!(flags & NDF_NO_VP_UNLOCK) &&
940	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
941		unlock_vp = 1;
942	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
943		if (unlock_vp) {
944			vput(ndp->ni_vp);
945			unlock_vp = 0;
946		} else
947			vrele(ndp->ni_vp);
948		ndp->ni_vp = NULL;
949	}
950	if (unlock_vp)
951		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
952	if (!(flags & NDF_NO_DVP_UNLOCK) &&
953	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
954	    ndp->ni_dvp != ndp->ni_vp)
955		unlock_dvp = 1;
956	if (!(flags & NDF_NO_DVP_RELE) &&
957	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
958		if (unlock_dvp) {
959			vput(ndp->ni_dvp);
960			unlock_dvp = 0;
961		} else
962			vrele(ndp->ni_dvp);
963		ndp->ni_dvp = NULL;
964	}
965	if (unlock_dvp)
966		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
967	if (!(flags & NDF_NO_STARTDIR_RELE) &&
968	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
969		vrele(ndp->ni_startdir);
970		ndp->ni_startdir = NULL;
971	}
972}
973
974/*
975 * Determine if there is a suitable alternate filename under the specified
976 * prefix for the specified path.  If the create flag is set, then the
977 * alternate prefix will be used so long as the parent directory exists.
978 * This is used by the various compatiblity ABIs so that Linux binaries prefer
979 * files under /compat/linux for example.  The chosen path (whether under
980 * the prefix or under /) is returned in a kernel malloc'd buffer pointed
981 * to by pathbuf.  The caller is responsible for free'ing the buffer from
982 * the M_TEMP bucket if one is returned.
983 */
984int
985kern_alternate_path(struct thread *td, const char *prefix, char *path,
986    enum uio_seg pathseg, char **pathbuf, int create)
987{
988	struct nameidata nd, ndroot;
989	char *ptr, *buf, *cp;
990	size_t len, sz;
991	int error;
992
993	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
994	*pathbuf = buf;
995
996	/* Copy the prefix into the new pathname as a starting point. */
997	len = strlcpy(buf, prefix, MAXPATHLEN);
998	if (len >= MAXPATHLEN) {
999		*pathbuf = NULL;
1000		free(buf, M_TEMP);
1001		return (EINVAL);
1002	}
1003	sz = MAXPATHLEN - len;
1004	ptr = buf + len;
1005
1006	/* Append the filename to the prefix. */
1007	if (pathseg == UIO_SYSSPACE)
1008		error = copystr(path, ptr, sz, &len);
1009	else
1010		error = copyinstr(path, ptr, sz, &len);
1011
1012	if (error) {
1013		*pathbuf = NULL;
1014		free(buf, M_TEMP);
1015		return (error);
1016	}
1017
1018	/* Only use a prefix with absolute pathnames. */
1019	if (*ptr != '/') {
1020		error = EINVAL;
1021		goto keeporig;
1022	}
1023
1024	/*
1025	 * We know that there is a / somewhere in this pathname.
1026	 * Search backwards for it, to find the file's parent dir
1027	 * to see if it exists in the alternate tree. If it does,
1028	 * and we want to create a file (cflag is set). We don't
1029	 * need to worry about the root comparison in this case.
1030	 */
1031
1032	if (create) {
1033		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1034		*cp = '\0';
1035
1036		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1037		error = namei(&nd);
1038		*cp = '/';
1039		if (error != 0)
1040			goto keeporig;
1041	} else {
1042		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1043
1044		error = namei(&nd);
1045		if (error != 0)
1046			goto keeporig;
1047
1048		/*
1049		 * We now compare the vnode of the prefix to the one
1050		 * vnode asked. If they resolve to be the same, then we
1051		 * ignore the match so that the real root gets used.
1052		 * This avoids the problem of traversing "../.." to find the
1053		 * root directory and never finding it, because "/" resolves
1054		 * to the emulation root directory. This is expensive :-(
1055		 */
1056		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
1057		    td);
1058
1059		/* We shouldn't ever get an error from this namei(). */
1060		error = namei(&ndroot);
1061		if (error == 0) {
1062			if (nd.ni_vp == ndroot.ni_vp)
1063				error = ENOENT;
1064
1065			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1066			vrele(ndroot.ni_vp);
1067			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
1068		}
1069	}
1070
1071	NDFREE(&nd, NDF_ONLY_PNBUF);
1072	vrele(nd.ni_vp);
1073	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
1074
1075keeporig:
1076	/* If there was an error, use the original path name. */
1077	if (error)
1078		bcopy(ptr, buf, len);
1079	return (error);
1080}
1081