vfs_lookup.c revision 162288
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_lookup.c 162288 2006-09-13 18:39:09Z mohans $");
39
40#include "opt_ktrace.h"
41#include "opt_mac.h"
42#include "opt_vfs.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/mac.h>
49#include <sys/mutex.h>
50#include <sys/namei.h>
51#include <sys/vnode.h>
52#include <sys/mount.h>
53#include <sys/filedesc.h>
54#include <sys/proc.h>
55#include <sys/syscallsubr.h>
56#include <sys/sysctl.h>
57#ifdef KTRACE
58#include <sys/ktrace.h>
59#endif
60
61#include <security/audit/audit.h>
62
63#include <vm/uma.h>
64
/*
 * Debug tracing switch for the lookup code.  NAMEI_DIAGNOSTIC is defined
 * and then immediately undefined, so the printf()/vprint() tracing guarded
 * by "#ifdef NAMEI_DIAGNOSTIC" in lookup() and relookup() is compiled out.
 * Remove the #undef to enable that tracing.
 */
#define	NAMEI_DIAGNOSTIC 1
#undef NAMEI_DIAGNOSTIC

/*
 * Allocation zone for namei.  It is created by nameiinit() with
 * MAXPATHLEN-sized items and supplies the pathname buffers (cn_pnbuf)
 * used during name translation.
 */
uma_zone_t namei_zone;
72
73static void
74nameiinit(void *dummy __unused)
75{
76	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
77	    UMA_ALIGN_PTR, 0);
78
79}
80SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
81
/*
 * Run-time switch for shared vnode locks during path name translation.
 * The default is 1 when the kernel is built with LOOKUP_SHARED and 0
 * otherwise; it is exported read/write as the "lookup_shared" integer
 * under the vfs sysctl tree.  When it is zero, namei() strips LOCKSHARED
 * from the request and lookup() starts with LK_EXCLUSIVE instead of
 * LK_SHARED.
 */
#ifdef LOOKUP_SHARED
static int lookup_shared = 1;
#else
static int lookup_shared = 0;
#endif
SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
    "Enables/Disables shared locks for path name translation");
89
90/*
91 * Convert a pathname into a pointer to a locked vnode.
92 *
93 * The FOLLOW flag is set when symbolic links are to be followed
94 * when they occur at the end of the name translation process.
95 * Symbolic links are always followed for all other pathname
96 * components other than the last.
97 *
98 * The segflg defines whether the name is to be copied from user
99 * space or kernel space.
100 *
101 * Overall outline of namei:
102 *
103 *	copy in name
104 *	get starting directory
105 *	while (!done && !error) {
106 *		call lookup to search path.
107 *		if symbolic link, massage name in buffer and continue
108 *	}
109 */
110int
111namei(struct nameidata *ndp)
112{
113	struct filedesc *fdp;	/* pointer to file descriptor state */
114	char *cp;		/* pointer into pathname argument */
115	struct vnode *dp;	/* the directory we are searching */
116	struct iovec aiov;		/* uio for reading symbolic links */
117	struct uio auio;
118	int error, linklen;
119	struct componentname *cnp = &ndp->ni_cnd;
120	struct thread *td = cnp->cn_thread;
121	struct proc *p = td->td_proc;
122	int vfslocked;
123
124	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
125	    ("NOT MPSAFE and Giant not held"));
126	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
127	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
128	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
129	    ("namei: nameiop contaminated with flags"));
130	KASSERT((cnp->cn_flags & OPMASK) == 0,
131	    ("namei: flags contaminated with nameiops"));
132	if (!lookup_shared)
133		cnp->cn_flags &= ~LOCKSHARED;
134	fdp = p->p_fd;
135
136	/*
137	 * Get a buffer for the name to be translated, and copy the
138	 * name into the buffer.
139	 */
140	if ((cnp->cn_flags & HASBUF) == 0)
141		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
142	if (ndp->ni_segflg == UIO_SYSSPACE)
143		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
144			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
145	else
146		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
147			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
148
149	/* If we are auditing the kernel pathname, save the user pathname. */
150	if (cnp->cn_flags & AUDITVNODE1)
151		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
152	if (cnp->cn_flags & AUDITVNODE2)
153		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
154
155	/*
156	 * Don't allow empty pathnames.
157	 */
158	if (!error && *cnp->cn_pnbuf == '\0')
159		error = ENOENT;
160
161	if (error) {
162		uma_zfree(namei_zone, cnp->cn_pnbuf);
163#ifdef DIAGNOSTIC
164		cnp->cn_pnbuf = NULL;
165		cnp->cn_nameptr = NULL;
166#endif
167		ndp->ni_vp = NULL;
168		return (error);
169	}
170	ndp->ni_loopcnt = 0;
171#ifdef KTRACE
172	if (KTRPOINT(td, KTR_NAMEI)) {
173		KASSERT(cnp->cn_thread == curthread,
174		    ("namei not using curthread"));
175		ktrnamei(cnp->cn_pnbuf);
176	}
177#endif
178
179	/*
180	 * Get starting point for the translation.
181	 */
182	FILEDESC_LOCK(fdp);
183	ndp->ni_rootdir = fdp->fd_rdir;
184	ndp->ni_topdir = fdp->fd_jdir;
185
186	dp = fdp->fd_cdir;
187	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
188	VREF(dp);
189	FILEDESC_UNLOCK(fdp);
190	for (;;) {
191		/*
192		 * Check if root directory should replace current directory.
193		 * Done at start of translation and after symbolic link.
194		 */
195		cnp->cn_nameptr = cnp->cn_pnbuf;
196		if (*(cnp->cn_nameptr) == '/') {
197			vrele(dp);
198			VFS_UNLOCK_GIANT(vfslocked);
199			while (*(cnp->cn_nameptr) == '/') {
200				cnp->cn_nameptr++;
201				ndp->ni_pathlen--;
202			}
203			dp = ndp->ni_rootdir;
204			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
205			VREF(dp);
206		}
207		if (vfslocked)
208			ndp->ni_cnd.cn_flags |= GIANTHELD;
209		ndp->ni_startdir = dp;
210		error = lookup(ndp);
211		if (error) {
212			uma_zfree(namei_zone, cnp->cn_pnbuf);
213#ifdef DIAGNOSTIC
214			cnp->cn_pnbuf = NULL;
215			cnp->cn_nameptr = NULL;
216#endif
217			return (error);
218		}
219		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
220		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
221		/*
222		 * Check for symbolic link
223		 */
224		if ((cnp->cn_flags & ISSYMLINK) == 0) {
225			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
226				uma_zfree(namei_zone, cnp->cn_pnbuf);
227#ifdef DIAGNOSTIC
228				cnp->cn_pnbuf = NULL;
229				cnp->cn_nameptr = NULL;
230#endif
231			} else
232				cnp->cn_flags |= HASBUF;
233
234			if ((cnp->cn_flags & MPSAFE) == 0) {
235				VFS_UNLOCK_GIANT(vfslocked);
236			} else if (vfslocked)
237				ndp->ni_cnd.cn_flags |= GIANTHELD;
238			return (0);
239		}
240		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
241			error = ELOOP;
242			break;
243		}
244#ifdef MAC
245		if ((cnp->cn_flags & NOMACCHECK) == 0) {
246			error = mac_check_vnode_readlink(td->td_ucred,
247			    ndp->ni_vp);
248			if (error)
249				break;
250		}
251#endif
252		if (ndp->ni_pathlen > 1)
253			cp = uma_zalloc(namei_zone, M_WAITOK);
254		else
255			cp = cnp->cn_pnbuf;
256		aiov.iov_base = cp;
257		aiov.iov_len = MAXPATHLEN;
258		auio.uio_iov = &aiov;
259		auio.uio_iovcnt = 1;
260		auio.uio_offset = 0;
261		auio.uio_rw = UIO_READ;
262		auio.uio_segflg = UIO_SYSSPACE;
263		auio.uio_td = (struct thread *)0;
264		auio.uio_resid = MAXPATHLEN;
265		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
266		if (error) {
267			if (ndp->ni_pathlen > 1)
268				uma_zfree(namei_zone, cp);
269			break;
270		}
271		linklen = MAXPATHLEN - auio.uio_resid;
272		if (linklen == 0) {
273			if (ndp->ni_pathlen > 1)
274				uma_zfree(namei_zone, cp);
275			error = ENOENT;
276			break;
277		}
278		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
279			if (ndp->ni_pathlen > 1)
280				uma_zfree(namei_zone, cp);
281			error = ENAMETOOLONG;
282			break;
283		}
284		if (ndp->ni_pathlen > 1) {
285			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
286			uma_zfree(namei_zone, cnp->cn_pnbuf);
287			cnp->cn_pnbuf = cp;
288		} else
289			cnp->cn_pnbuf[linklen] = '\0';
290		ndp->ni_pathlen += linklen;
291		vput(ndp->ni_vp);
292		dp = ndp->ni_dvp;
293	}
294	uma_zfree(namei_zone, cnp->cn_pnbuf);
295#ifdef DIAGNOSTIC
296	cnp->cn_pnbuf = NULL;
297	cnp->cn_nameptr = NULL;
298#endif
299	vput(ndp->ni_vp);
300	ndp->ni_vp = NULL;
301	vrele(ndp->ni_dvp);
302	VFS_UNLOCK_GIANT(vfslocked);
303	return (error);
304}
305
306static int
307compute_cn_lkflags(struct mount *mp, int lkflags)
308{
309	if ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED)) {
310		lkflags &= ~LK_SHARED;
311		lkflags |= LK_EXCLUSIVE;
312	}
313	return lkflags;
314}
315
/*
 * Search a pathname.
 * This is a very central and rather complicated routine.
 *
 * The pathname is pointed to by cnp->cn_nameptr and is of length
 * ni_pathlen.  The starting directory is taken from ni_startdir.  The
 * pathname is descended until done, or a symbolic link is encountered.
 * The ISSYMLINK flag in cn_flags is clear if the path is completed; it is
 * set if a symbolic link needing interpretation is encountered.
 *
 * The operation (cn_nameiop) is LOOKUP, CREATE, RENAME, or DELETE depending
 * on whether the name is to be looked up, created, renamed, or deleted.
 * When CREATE, RENAME, or DELETE is specified, information usable in
 * creating, renaming, or deleting a directory entry may be calculated.
 * If cn_flags has LOCKPARENT or'ed into it, the parent directory is returned
 * locked. If it has WANTPARENT or'ed into it, the parent directory is
 * returned unlocked. Otherwise the parent directory is not returned. If
 * the target of the pathname exists and LOCKLEAF is or'ed into the flags
 * the target is returned locked, otherwise it is returned unlocked.
 * When creating or renaming and LOCKPARENT is specified, the target may not
 * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
 *
 * Giant state is passed in and out through the GIANTHELD bit of cn_flags:
 * on entry it says whether Giant is held for ni_startdir, on successful
 * return it is set if Giant is still held for the result.  On error it is
 * always clear and ni_vp is NULL.
 *
 * Overall outline of lookup:
 *
 * dirloop:
 *	identify next component of name at cnp->cn_nameptr
 *	handle degenerate case where name is null string
 *	if .. and crossing mount points and on mounted filesys, find parent
 *	call VOP_LOOKUP routine for next component name
 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
 *	    component vnode returned in ni_vp (if it exists), locked.
 *	if result vnode is mounted on and crossing mount points,
 *	    find mounted on vnode
 *	if more components of name, do next level at dirloop
 *	return the answer in ni_vp, locked if LOCKLEAF set
 *	    if LOCKPARENT set, return locked parent in ni_dvp
 *	    if WANTPARENT set, return unlocked parent in ni_dvp
 */
int
lookup(struct nameidata *ndp)
{
	char *cp;		/* pointer into pathname argument */
	struct vnode *dp = 0;	/* the directory we are searching */
	struct vnode *tdp;		/* saved dp */
	struct mount *mp;		/* mount table entry */
	int docache;			/* == 0 do not cache last component */
	int wantparent;			/* 1 => wantparent or lockparent flag */
	int rdonly;			/* lookup read-only flag bit */
	int trailing_slash;
	int error = 0;
	int dpunlocked = 0;		/* dp has already been unlocked */
	struct componentname *cnp = &ndp->ni_cnd;
	struct thread *td = cnp->cn_thread;
	int vfslocked;			/* VFS Giant state for child */
	int dvfslocked;			/* VFS Giant state for parent */
	int tvfslocked;
	int lkflags_save;

	/*
	 * Setup: break out flag bits into variables.
	 */
	/* The caller reports Giant ownership for ni_startdir via GIANTHELD. */
	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
	vfslocked = 0;
	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
	/* docache is nonzero exactly when NOCACHE is not set ... */
	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
	/* ... and is forced off for DELETE, and for RENAME with a parent. */
	if (cnp->cn_nameiop == DELETE ||
	    (wantparent && cnp->cn_nameiop != CREATE &&
	     cnp->cn_nameiop != LOOKUP))
		docache = 0;
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	ndp->ni_dvp = NULL;
	/*
	 * We use shared locks until we hit the parent of the last cn then
	 * we adjust based on the requesting flags.
	 */
	if (lookup_shared)
		cnp->cn_lkflags = LK_SHARED;
	else
		cnp->cn_lkflags = LK_EXCLUSIVE;
	dp = ndp->ni_startdir;
	ndp->ni_startdir = NULLVP;
	vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);

dirloop:
	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	cnp->cn_consume = 0;
	/* Find the end of the current component. */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (cnp->cn_namelen > NAME_MAX) {
		error = ENAMETOOLONG;
		goto bad;
	}
#ifdef NAMEI_DIAGNOSTIC
	{ char c = *cp;
	*cp = '\0';
	printf("{%s}: ", cnp->cn_nameptr);
	*cp = c; }
#endif
	ndp->ni_pathlen -= cnp->cn_namelen;
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null.  This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes.  Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	trailing_slash = 0;
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			trailing_slash = 1;
			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
		}
	}
	ndp->ni_next = cp;

	/*
	 * Per-component flags: MAKEENTRY unless this is the last component
	 * and caching was disabled above; ISDOTDOT for ".."; ISLASTCN when
	 * nothing follows this component.
	 */
	cnp->cn_flags |= MAKEENTRY;
	if (*cp == '\0' && docache == 0)
		cnp->cn_flags &= ~MAKEENTRY;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
	else
		cnp->cn_flags &= ~ISDOTDOT;
	if (*ndp->ni_next == 0)
		cnp->cn_flags |= ISLASTCN;
	else
		cnp->cn_flags &= ~ISLASTCN;


	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 */
	if (cnp->cn_nameptr[0] == '\0') {
		if (dp->v_type != VDIR) {
			error = ENOTDIR;
			goto bad;
		}
		if (cnp->cn_nameiop != LOOKUP) {
			error = EISDIR;
			goto bad;
		}
		/* The directory is its own parent here; take a second ref. */
		if (wantparent) {
			ndp->ni_dvp = dp;
			VREF(dp);
		}
		ndp->ni_vp = dp;

		if (cnp->cn_flags & AUDITVNODE1)
			AUDIT_ARG(vnode, dp, ARG_VNODE1);
		else if (cnp->cn_flags & AUDITVNODE2)
			AUDIT_ARG(vnode, dp, ARG_VNODE2);

		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
			VOP_UNLOCK(dp, 0, td);
		/* XXX This should probably move to the top of function. */
		if (cnp->cn_flags & SAVESTART)
			panic("lookup: SAVESTART");
		goto success;
	}

	/*
	 * Handle "..": four special cases.
	 * 1. Return an error if this is the last component of
	 *    the name and the operation is DELETE or RENAME.
	 * 2. If at root directory (e.g. after chroot)
	 *    or at absolute root directory
	 *    then ignore it so can't get out.
	 * 3. If this vnode is the root of a mounted
	 *    filesystem, then replace it with the
	 *    vnode which was mounted on so we take the
	 *    .. in the other filesystem.
	 * 4. If the vnode is the top directory of
	 *    the jail or chroot, don't let them out.
	 */
	if (cnp->cn_flags & ISDOTDOT) {
		if ((cnp->cn_flags & ISLASTCN) != 0 &&
		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
			error = EINVAL;
			goto bad;
		}
		for (;;) {
			/* Cases 2 and 4: ".." resolves to the directory itself. */
			if (dp == ndp->ni_rootdir ||
			    dp == ndp->ni_topdir ||
			    dp == rootvnode) {
				ndp->ni_dvp = dp;
				ndp->ni_vp = dp;
				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
				VREF(dp);
				goto nextname;
			}
			if ((dp->v_vflag & VV_ROOT) == 0 ||
			    (cnp->cn_flags & NOCROSSMOUNT))
				break;
			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
				error = EBADF;
				goto bad;
			}
			/*
			 * Case 3: step down to the covered vnode, moving the
			 * reference, the lock and the Giant state with us.
			 */
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			tvfslocked = dvfslocked;
			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
			VREF(dp);
			vput(tdp);
			VFS_UNLOCK_GIANT(tvfslocked);
			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
		}
	}

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
unionlookup:
#ifdef MAC
	if ((cnp->cn_flags & NOMACCHECK) == 0) {
		error = mac_check_vnode_lookup(td->td_ucred, dp, cnp);
		if (error)
			goto bad;
	}
#endif
	ndp->ni_dvp = dp;
	ndp->ni_vp = NULL;
	ASSERT_VOP_LOCKED(dp, "lookup");
	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
	/*
	 * If we have a shared lock we may need to upgrade the lock for the
	 * last operation.
	 */
	if (VOP_ISLOCKED(dp, td) == LK_SHARED &&
	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
		vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
	/*
	 * If we're looking up the last component and we need an exclusive
	 * lock, adjust our lkflags.
	 */
	if ((cnp->cn_flags & (ISLASTCN|LOCKSHARED|LOCKLEAF)) ==
	    (ISLASTCN|LOCKLEAF))
		cnp->cn_lkflags = LK_EXCLUSIVE;
#ifdef NAMEI_DIAGNOSTIC
	vprint("lookup in", dp);
#endif
	/*
	 * cn_lkflags is adjusted for this mount only for the duration of
	 * the VOP_LOOKUP() call and restored afterwards on both paths.
	 */
	lkflags_save = cnp->cn_lkflags;
	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
		cnp->cn_lkflags = lkflags_save;
		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
#ifdef NAMEI_DIAGNOSTIC
		printf("not found\n");
#endif
		/*
		 * Not found at the root of a union mount: retry the same
		 * component in the covered directory.
		 */
		if ((error == ENOENT) &&
		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
		    (dp->v_mount->mnt_flag & MNT_UNION)) {
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			tvfslocked = dvfslocked;
			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
			VREF(dp);
			vput(tdp);
			VFS_UNLOCK_GIANT(tvfslocked);
			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
			goto unionlookup;
		}

		/*
		 * Only EJUSTRETURN is treated as "entry does not exist but
		 * may be created"; every other error is fatal.
		 */
		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * If creating and at end of pathname, then can consider
		 * allowing file to be created.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		if (*cp == '\0' && trailing_slash &&
		     !(cnp->cn_flags & WILLBEDIR)) {
			error = ENOENT;
			goto bad;
		}
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp, 0, td);
		/*
		 * This is a temporary assert to make sure I know what the
		 * behavior here was.
		 */
		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
		   ("lookup: Unhandled case."));
		/*
		 * We return with ni_vp NULL to indicate that the entry
		 * doesn't currently exist, leaving a pointer to the
		 * (possibly locked) directory vnode in ndp->ni_dvp.
		 */
		if (cnp->cn_flags & SAVESTART) {
			ndp->ni_startdir = ndp->ni_dvp;
			VREF(ndp->ni_startdir);
		}
		goto success;
	} else
		cnp->cn_lkflags = lkflags_save;
#ifdef NAMEI_DIAGNOSTIC
	printf("found\n");
#endif
	/*
	 * Take into account any additional components consumed by
	 * the underlying filesystem.
	 */
	if (cnp->cn_consume > 0) {
		cnp->cn_nameptr += cnp->cn_consume;
		ndp->ni_next += cnp->cn_consume;
		ndp->ni_pathlen -= cnp->cn_consume;
		cnp->cn_consume = 0;
	}

	dp = ndp->ni_vp;
	vfslocked = VFS_LOCK_GIANT(dp->v_mount);

	/*
	 * Check to see if the vnode has been mounted on;
	 * if so find the root of the mounted filesystem.
	 */
	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
		/* Re-evaluate the loop condition if the mount cannot be busied. */
		if (vfs_busy(mp, 0, 0, td))
			continue;
		vput(dp);
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = VFS_LOCK_GIANT(mp);
		/* The parent is unlocked around VFS_ROOT() and relocked after. */
		if (dp != ndp->ni_dvp)
			VOP_UNLOCK(ndp->ni_dvp, 0, td);
		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
		vfs_unbusy(mp, td);
		vn_lock(ndp->ni_dvp, compute_cn_lkflags(mp, cnp->cn_lkflags | LK_RETRY), td);
		if (error) {
			/* dp was already vput() above; don't release it again. */
			dpunlocked = 1;
			goto bad2;
		}
		ndp->ni_vp = dp = tdp;
	}

	/*
	 * Check for symbolic link
	 */
	if ((dp->v_type == VLNK) &&
	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
	     *ndp->ni_next == '/')) {
		cnp->cn_flags |= ISSYMLINK;
		if (dp->v_iflag & VI_DOOMED) {
			/* We can't know whether the directory was mounted with
			 * NOSYMFOLLOW, so we can't follow safely. */
			error = EBADF;
			goto bad2;
		}
		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
			error = EACCES;
			goto bad2;
		}
		/*
		 * Symlink code always expects an unlocked dvp.
		 */
		if (ndp->ni_dvp != ndp->ni_vp)
			VOP_UNLOCK(ndp->ni_dvp, 0, td);
		goto success;
	}

	/*
	 * Check for bogus trailing slashes.
	 */
	if (trailing_slash && dp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad2;
	}

nextname:
	/*
	 * Not a symbolic link.  If more pathname,
	 * continue at next component, else return.
	 */
	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
	    ("lookup: invalid path state."));
	if (*ndp->ni_next == '/') {
		cnp->cn_nameptr = ndp->ni_next;
		while (*cnp->cn_nameptr == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
		/*
		 * Drop the old parent: unlock and release it, or release
		 * only when it is the same vnode as the new directory.
		 */
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		VFS_UNLOCK_GIANT(dvfslocked);
		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
		vfslocked = 0;
		goto dirloop;
	}
	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		error = EROFS;
		goto bad2;
	}
	if (cnp->cn_flags & SAVESTART) {
		ndp->ni_startdir = ndp->ni_dvp;
		VREF(ndp->ni_startdir);
	}
	/* Bring the parent into the lock/reference state the caller asked for. */
	if (!wantparent) {
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		VFS_UNLOCK_GIANT(dvfslocked);
		dvfslocked = 0;
	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
		VOP_UNLOCK(ndp->ni_dvp, 0, td);

	if (cnp->cn_flags & AUDITVNODE1)
		AUDIT_ARG(vnode, dp, ARG_VNODE1);
	else if (cnp->cn_flags & AUDITVNODE2)
		AUDIT_ARG(vnode, dp, ARG_VNODE2);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp, 0, td);
success:
	/*
	 * Collapse the parent/child Giant state into the single GIANTHELD
	 * bit that is reported back to the caller.
	 */
	if (vfslocked && dvfslocked)
		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
	if (vfslocked || dvfslocked)
		ndp->ni_cnd.cn_flags |= GIANTHELD;
	return (0);

bad2:
	/* Error with both parent and child in hand: release the parent ... */
	if (dp != ndp->ni_dvp)
		vput(ndp->ni_dvp);
	else
		vrele(ndp->ni_dvp);
bad:
	/* ... then the current vnode (unless already released) and Giant. */
	if (!dpunlocked)
		vput(dp);
	VFS_UNLOCK_GIANT(vfslocked);
	VFS_UNLOCK_GIANT(dvfslocked);
	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
	ndp->ni_vp = NULL;
	return (error);
}
776
/*
 * relookup - look up a single path name component again.
 *    Used to re-acquire things: the component described by cnp, which
 *    must be the last component (ISLASTCN) and must not be "..", is
 *    searched for in the directory dvp.
 *
 * dvp is locked exclusively here.  The caller must have requested the
 * parent (LOCKPARENT or WANTPARENT).  On success 0 is returned and *vpp
 * holds the vnode found, locked only if LOCKLEAF is set; if the entry does
 * not exist but may be created, 0 is returned with *vpp left NULL.  dvp is
 * left locked only if LOCKPARENT is set.  On failure an errno value is
 * returned and *vpp is set to NULL.
 */
int
relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct thread *td = cnp->cn_thread;
	struct vnode *dp = 0;		/* the directory we are searching */
	int wantparent;			/* 1 => wantparent or lockparent flag */
	int rdonly;			/* lookup read-only flag bit */
	int error = 0;

	KASSERT(cnp->cn_flags & ISLASTCN,
	    ("relookup: Not given last component."));
	/*
	 * Setup: break out flag bits into variables.
	 */
	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
	KASSERT(wantparent, ("relookup: parent not wanted."));
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	dp = dvp;
	/* Unlike lookup(), relookup always uses exclusive locks. */
	cnp->cn_lkflags = LK_EXCLUSIVE;
	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
#ifdef NAMEI_DIAGNOSTIC
	printf("{%s}: ", cnp->cn_nameptr);
#endif

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 */
	if (cnp->cn_nameptr[0] == '\0') {
		if (cnp->cn_nameiop != LOOKUP || wantparent) {
			error = EISDIR;
			goto bad;
		}
		if (dp->v_type != VDIR) {
			error = ENOTDIR;
			goto bad;
		}
		if (!(cnp->cn_flags & LOCKLEAF))
			VOP_UNLOCK(dp, 0, td);
		*vpp = dp;
		/* XXX This should probably move to the top of function. */
		if (cnp->cn_flags & SAVESTART)
			panic("lookup: SAVESTART");
		return (0);
	}

	if (cnp->cn_flags & ISDOTDOT)
		panic ("relookup: lookup on dot-dot");

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
#ifdef NAMEI_DIAGNOSTIC
	vprint("search in:", dp);
#endif
	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
		KASSERT(*vpp == NULL, ("leaf should be empty"));
		/*
		 * Only EJUSTRETURN is treated as "entry does not exist but
		 * may be created"; every other error is fatal.
		 */
		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * If creating and at end of pathname, then can consider
		 * allowing file to be created.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* ASSERT(dvp == ndp->ni_startdir) */
		if (cnp->cn_flags & SAVESTART)
			VREF(dvp);
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp, 0, td);
		/*
		 * This is a temporary assert to make sure I know what the
		 * behavior here was.
		 */
		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
		   ("relookup: Unhandled case."));
		/*
		 * We return with *vpp NULL to indicate that the entry
		 * doesn't currently exist, leaving the (possibly locked)
		 * directory vnode dvp with the caller.
		 */
		return (0);
	}

	dp = *vpp;

	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		/*
		 * Release the parent here; the "bad" label below releases
		 * the child.  When both are the same vnode only the
		 * reference is dropped here and the lock goes at "bad".
		 */
		if (dvp == dp)
			vrele(dvp);
		else
			vput(dvp);
		error = EROFS;
		goto bad;
	}
	/*
	 * Set the parent lock/ref state to the requested state.
	 */
	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
		if (wantparent)
			VOP_UNLOCK(dvp, 0, td);
		else
			vput(dvp);
	} else if (!wantparent)
		vrele(dvp);
	/*
	 * Check for symbolic link
	 */
	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
	    ("relookup: symlink found.\n"));

	/* ASSERT(dvp == ndp->ni_startdir) */
	if (cnp->cn_flags & SAVESTART)
		VREF(dvp);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp, 0, td);
	return (0);
bad:
	/* Unlock and release the vnode currently held in dp. */
	vput(dp);
	*vpp = NULL;
	return (error);
}
920
921/*
922 * Free data allocated by namei(); see namei(9) for details.
923 */
924void
925NDFREE(struct nameidata *ndp, const u_int flags)
926{
927	int unlock_dvp;
928	int unlock_vp;
929
930	unlock_dvp = 0;
931	unlock_vp = 0;
932
933	if (!(flags & NDF_NO_FREE_PNBUF) &&
934	    (ndp->ni_cnd.cn_flags & HASBUF)) {
935		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
936		ndp->ni_cnd.cn_flags &= ~HASBUF;
937	}
938	if (!(flags & NDF_NO_VP_UNLOCK) &&
939	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
940		unlock_vp = 1;
941	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
942		if (unlock_vp) {
943			vput(ndp->ni_vp);
944			unlock_vp = 0;
945		} else
946			vrele(ndp->ni_vp);
947		ndp->ni_vp = NULL;
948	}
949	if (unlock_vp)
950		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
951	if (!(flags & NDF_NO_DVP_UNLOCK) &&
952	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
953	    ndp->ni_dvp != ndp->ni_vp)
954		unlock_dvp = 1;
955	if (!(flags & NDF_NO_DVP_RELE) &&
956	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
957		if (unlock_dvp) {
958			vput(ndp->ni_dvp);
959			unlock_dvp = 0;
960		} else
961			vrele(ndp->ni_dvp);
962		ndp->ni_dvp = NULL;
963	}
964	if (unlock_dvp)
965		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
966	if (!(flags & NDF_NO_STARTDIR_RELE) &&
967	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
968		vrele(ndp->ni_startdir);
969		ndp->ni_startdir = NULL;
970	}
971}
972
973/*
974 * Determine if there is a suitable alternate filename under the specified
975 * prefix for the specified path.  If the create flag is set, then the
976 * alternate prefix will be used so long as the parent directory exists.
977 * This is used by the various compatiblity ABIs so that Linux binaries prefer
978 * files under /compat/linux for example.  The chosen path (whether under
979 * the prefix or under /) is returned in a kernel malloc'd buffer pointed
980 * to by pathbuf.  The caller is responsible for free'ing the buffer from
981 * the M_TEMP bucket if one is returned.
982 */
983int
984kern_alternate_path(struct thread *td, const char *prefix, char *path,
985    enum uio_seg pathseg, char **pathbuf, int create)
986{
987	struct nameidata nd, ndroot;
988	char *ptr, *buf, *cp;
989	size_t len, sz;
990	int error;
991
992	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
993	*pathbuf = buf;
994
995	/* Copy the prefix into the new pathname as a starting point. */
996	len = strlcpy(buf, prefix, MAXPATHLEN);
997	if (len >= MAXPATHLEN) {
998		*pathbuf = NULL;
999		free(buf, M_TEMP);
1000		return (EINVAL);
1001	}
1002	sz = MAXPATHLEN - len;
1003	ptr = buf + len;
1004
1005	/* Append the filename to the prefix. */
1006	if (pathseg == UIO_SYSSPACE)
1007		error = copystr(path, ptr, sz, &len);
1008	else
1009		error = copyinstr(path, ptr, sz, &len);
1010
1011	if (error) {
1012		*pathbuf = NULL;
1013		free(buf, M_TEMP);
1014		return (error);
1015	}
1016
1017	/* Only use a prefix with absolute pathnames. */
1018	if (*ptr != '/') {
1019		error = EINVAL;
1020		goto keeporig;
1021	}
1022
1023	/*
1024	 * We know that there is a / somewhere in this pathname.
1025	 * Search backwards for it, to find the file's parent dir
1026	 * to see if it exists in the alternate tree. If it does,
1027	 * and we want to create a file (cflag is set). We don't
1028	 * need to worry about the root comparison in this case.
1029	 */
1030
1031	if (create) {
1032		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1033		*cp = '\0';
1034
1035		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1036		error = namei(&nd);
1037		*cp = '/';
1038		if (error != 0)
1039			goto keeporig;
1040	} else {
1041		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1042
1043		error = namei(&nd);
1044		if (error != 0)
1045			goto keeporig;
1046
1047		/*
1048		 * We now compare the vnode of the prefix to the one
1049		 * vnode asked. If they resolve to be the same, then we
1050		 * ignore the match so that the real root gets used.
1051		 * This avoids the problem of traversing "../.." to find the
1052		 * root directory and never finding it, because "/" resolves
1053		 * to the emulation root directory. This is expensive :-(
1054		 */
1055		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
1056		    td);
1057
1058		/* We shouldn't ever get an error from this namei(). */
1059		error = namei(&ndroot);
1060		if (error == 0) {
1061			if (nd.ni_vp == ndroot.ni_vp)
1062				error = ENOENT;
1063
1064			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1065			vrele(ndroot.ni_vp);
1066			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
1067		}
1068	}
1069
1070	NDFREE(&nd, NDF_ONLY_PNBUF);
1071	vrele(nd.ni_vp);
1072	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
1073
1074keeporig:
1075	/* If there was an error, use the original path name. */
1076	if (error)
1077		bcopy(ptr, buf, len);
1078	return (error);
1079}
1080