vfs_lookup.c revision 185029
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_lookup.c 185029 2008-11-17 20:49:29Z pjd $");
39
40#include "opt_ktrace.h"
41#include "opt_mac.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/fcntl.h>
47#include <sys/lock.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/vnode.h>
51#include <sys/mount.h>
52#include <sys/filedesc.h>
53#include <sys/proc.h>
54#include <sys/syscallsubr.h>
55#include <sys/sysctl.h>
56#ifdef KTRACE
57#include <sys/ktrace.h>
58#endif
59
60#include <security/audit/audit.h>
61#include <security/mac/mac_framework.h>
62
63#include <vm/uma.h>
64
65#define	NAMEI_DIAGNOSTIC 1
66#undef NAMEI_DIAGNOSTIC
67
/*
 * Allocation zone for namei pathname buffers.  It is created by
 * nameiinit() with MAXPATHLEN-byte items; namei() draws cn_pnbuf (and the
 * scratch buffer used while expanding a symbolic link) from it.
 */
uma_zone_t namei_zone;
/*
 * Placeholder vnode for mp traversal.  It is allocated once by nameiinit()
 * and installed by lookup() as ni_dvp while stepping from a mounted-on
 * directory to the root vnode of the filesystem mounted there.
 */
static struct vnode *vp_crossmp;
76
77static void
78nameiinit(void *dummy __unused)
79{
80	int error;
81
82	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
83	    UMA_ALIGN_PTR, 0);
84	error = getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
85	if (error != 0)
86		panic("nameiinit: getnewvnode");
87	VN_LOCK_ASHARE(vp_crossmp);
88}
89SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
90
/*
 * Knob exported as the vfs.lookup_shared sysctl and loader tunable.  When
 * it is zero, namei() strips LOCKSHARED from the caller's flags and
 * lookup() starts its walk with LK_EXCLUSIVE instead of LK_SHARED.
 */
static int lookup_shared = 1;
SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
    "Enables/Disables shared locks for path name translation");
TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
95
/*
 * Convert a pathname into a pointer to a locked vnode.
 *
 * The FOLLOW flag is set when symbolic links are to be followed
 * when they occur at the end of the name translation process.
 * Symbolic links are always followed for all other pathname
 * components other than the last.
 *
 * The segflg defines whether the name is to be copied from user
 * space or kernel space.
 *
 * Overall outline of namei:
 *
 *	copy in name
 *	get starting directory
 *	while (!done && !error) {
 *		call lookup to search path.
 *		if symbolic link, massage name in buffer and continue
 *	}
 *
 * Starting directory:
 *	An absolute path starts at the process root (fd_rdir).  A relative
 *	path starts at ndp->ni_startdir when the caller supplied one, else
 *	at the vnode behind ndp->ni_dirfd when that is not AT_FDCWD, else at
 *	the process current directory (fd_cdir).
 *
 * Pathname buffer:
 *	Unless HASBUF is already set, a buffer is taken from namei_zone.  It
 *	is returned to the zone on every error path.  On success it is freed
 *	unless SAVENAME or SAVESTART was requested, in which case HASBUF is
 *	set and the caller becomes responsible for it.
 *
 * Giant:
 *	On success without MPSAFE any Giant acquisition is dropped before
 *	returning; with MPSAFE the GIANTHELD flag is set in cn_flags when it
 *	is still held.
 *
 * Returns 0 on success or an errno value.  Errors generated here are
 * ENOENT (empty pathname or empty symbolic link), ENOTDIR (starting vnode
 * is not a directory), ELOOP (too many symbolic links) and ENAMETOOLONG
 * (expanded link plus remaining path does not fit); errors from the copy
 * routines, fgetvp(), lookup(), the MAC readlink check and VOP_READLINK()
 * are passed through.
 */
int
namei(struct nameidata *ndp)
{
	struct filedesc *fdp;	/* pointer to file descriptor state */
	char *cp;		/* pointer into pathname argument */
	struct vnode *dp;	/* the directory we are searching */
	struct iovec aiov;		/* uio for reading symbolic links */
	struct uio auio;
	int error, linklen;
	struct componentname *cnp = &ndp->ni_cnd;
	struct thread *td = cnp->cn_thread;
	struct proc *p = td->td_proc;
	int vfslocked;		/* result of the last VFS_LOCK_GIANT() */

	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
	    ("NOT MPSAFE and Giant not held"));
	/* The translation always runs with the calling thread's credentials. */
	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
	    ("namei: nameiop contaminated with flags"));
	KASSERT((cnp->cn_flags & OPMASK) == 0,
	    ("namei: flags contaminated with nameiops"));
	/* Honor the vfs.lookup_shared knob. */
	if (!lookup_shared)
		cnp->cn_flags &= ~LOCKSHARED;
	fdp = p->p_fd;

	/*
	 * Get a buffer for the name to be translated, and copy the
	 * name into the buffer.
	 */
	if ((cnp->cn_flags & HASBUF) == 0)
		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	if (ndp->ni_segflg == UIO_SYSSPACE)
		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
	else
		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);

	/* If we are auditing the kernel pathname, save the user pathname. */
	if (cnp->cn_flags & AUDITVNODE1)
		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
	if (cnp->cn_flags & AUDITVNODE2)
		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);

	/*
	 * Don't allow empty pathnames.
	 */
	if (!error && *cnp->cn_pnbuf == '\0')
		error = ENOENT;

	if (error) {
		uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
		cnp->cn_pnbuf = NULL;
		cnp->cn_nameptr = NULL;
#endif
		ndp->ni_vp = NULL;
		return (error);
	}
	/* No symbolic links have been expanded yet. */
	ndp->ni_loopcnt = 0;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_NAMEI)) {
		KASSERT(cnp->cn_thread == curthread,
		    ("namei not using curthread"));
		ktrnamei(cnp->cn_pnbuf);
	}
#endif

	/*
	 * Get starting point for the translation.
	 */
	FILEDESC_SLOCK(fdp);
	ndp->ni_rootdir = fdp->fd_rdir;
	ndp->ni_topdir = fdp->fd_jdir;

	dp = NULL;
	if (cnp->cn_pnbuf[0] != '/') {
		/*
		 * Relative path: prefer a caller-supplied start directory,
		 * then the directory descriptor, and fall back to the
		 * current directory below when neither yields a vnode.
		 */
		if (ndp->ni_startdir != NULL) {
			dp = ndp->ni_startdir;
			error = 0;
		} else if (ndp->ni_dirfd != AT_FDCWD)
			error = fgetvp(td, ndp->ni_dirfd, &dp);
		if (error != 0 || dp != NULL) {
			/* fdp is not consulted again on this path. */
			FILEDESC_SUNLOCK(fdp);
			if (error == 0 && dp->v_type != VDIR) {
				/* The start vnode must be a directory. */
				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
				vrele(dp);
				VFS_UNLOCK_GIANT(vfslocked);
				error = ENOTDIR;
			}
		}
		if (error) {
			uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
			cnp->cn_pnbuf = NULL;
			cnp->cn_nameptr = NULL;
#endif
			return (error);
		}
	}
	if (dp == NULL) {
		/*
		 * Either the path is absolute or no explicit start directory
		 * was obtained: take a reference on the current directory
		 * (an absolute path swaps it for the root in the loop below).
		 */
		dp = fdp->fd_cdir;
		VREF(dp);
		FILEDESC_SUNLOCK(fdp);
		/*
		 * dp was NULL while ni_startdir is set only for an absolute
		 * path, so the supplied start directory goes unused: drop
		 * its reference.
		 */
		if (ndp->ni_startdir != NULL) {
			vfslocked = VFS_LOCK_GIANT(ndp->ni_startdir->v_mount);
			vrele(ndp->ni_startdir);
			VFS_UNLOCK_GIANT(vfslocked);
		}
	}
	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
	for (;;) {
		/*
		 * Check if root directory should replace current directory.
		 * Done at start of translation and after symbolic link.
		 */
		cnp->cn_nameptr = cnp->cn_pnbuf;
		if (*(cnp->cn_nameptr) == '/') {
			vrele(dp);
			VFS_UNLOCK_GIANT(vfslocked);
			/* Skip every leading slash, keeping ni_pathlen in step. */
			while (*(cnp->cn_nameptr) == '/') {
				cnp->cn_nameptr++;
				ndp->ni_pathlen--;
			}
			dp = ndp->ni_rootdir;
			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
			VREF(dp);
		}
		/* Hand the Giant state for dp to lookup() through cn_flags. */
		if (vfslocked)
			ndp->ni_cnd.cn_flags |= GIANTHELD;
		ndp->ni_startdir = dp;
		error = lookup(ndp);
		if (error) {
			uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
			cnp->cn_pnbuf = NULL;
			cnp->cn_nameptr = NULL;
#endif
			return (error);
		}
		/* Take the Giant state back from lookup(). */
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
		/*
		 * Check for symbolic link
		 */
		if ((cnp->cn_flags & ISSYMLINK) == 0) {
			/* Translation is complete. */
			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
				uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
				cnp->cn_pnbuf = NULL;
				cnp->cn_nameptr = NULL;
#endif
			} else
				cnp->cn_flags |= HASBUF;

			/*
			 * Callers that are not MPSAFE get Giant released
			 * here; MPSAFE callers are told through GIANTHELD
			 * whether it is still held.
			 */
			if ((cnp->cn_flags & MPSAFE) == 0) {
				VFS_UNLOCK_GIANT(vfslocked);
			} else if (vfslocked)
				ndp->ni_cnd.cn_flags |= GIANTHELD;
			return (0);
		}
		/* Refuse to expand further once MAXSYMLINKS links were seen. */
		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
			error = ELOOP;
			break;
		}
#ifdef MAC
		if ((cnp->cn_flags & NOMACCHECK) == 0) {
			error = mac_vnode_check_readlink(td->td_ucred,
			    ndp->ni_vp);
			if (error)
				break;
		}
#endif
		/*
		 * When path components remain after the link (ni_pathlen > 1)
		 * the link text is read into a fresh buffer so the remainder
		 * can be appended to it; otherwise the current buffer is
		 * simply overwritten.
		 */
		if (ndp->ni_pathlen > 1)
			cp = uma_zalloc(namei_zone, M_WAITOK);
		else
			cp = cnp->cn_pnbuf;
		aiov.iov_base = cp;
		aiov.iov_len = MAXPATHLEN;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_td = (struct thread *)0;
		auio.uio_resid = MAXPATHLEN;
		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
		if (error) {
			if (ndp->ni_pathlen > 1)
				uma_zfree(namei_zone, cp);
			break;
		}
		/* Number of bytes the filesystem stored for the link. */
		linklen = MAXPATHLEN - auio.uio_resid;
		if (linklen == 0) {
			if (ndp->ni_pathlen > 1)
				uma_zfree(namei_zone, cp);
			error = ENOENT;
			break;
		}
		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
			if (ndp->ni_pathlen > 1)
				uma_zfree(namei_zone, cp);
			error = ENAMETOOLONG;
			break;
		}
		if (ndp->ni_pathlen > 1) {
			/* Append the unparsed tail and switch buffers. */
			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
			uma_zfree(namei_zone, cnp->cn_pnbuf);
			cnp->cn_pnbuf = cp;
		} else
			cnp->cn_pnbuf[linklen] = '\0';
		ndp->ni_pathlen += linklen;
		/* Done with the link; resume from the directory holding it. */
		vput(ndp->ni_vp);
		dp = ndp->ni_dvp;
	}
	/*
	 * Reached only by a break out of the loop: a symbolic link could
	 * not be expanded.  Release the buffer, the link vnode and its
	 * parent before reporting the error.
	 */
	uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
	cnp->cn_pnbuf = NULL;
	cnp->cn_nameptr = NULL;
#endif
	vput(ndp->ni_vp);
	ndp->ni_vp = NULL;
	vrele(ndp->ni_dvp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
343
344static int
345compute_cn_lkflags(struct mount *mp, int lkflags)
346{
347
348	if (mp == NULL ||
349	    ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
350		lkflags &= ~LK_SHARED;
351		lkflags |= LK_EXCLUSIVE;
352	}
353	return (lkflags);
354}
355
/*
 * Search a pathname.
 * This is a very central and rather complicated routine.
 *
 * The pathname is pointed to by cnp->cn_nameptr and is of length
 * ni_pathlen.  The starting directory is taken from ni_startdir. The
 * pathname is descended until done, or a symbolic link is encountered.
 * The ISSYMLINK flag in cn_flags is clear if the path is completed; it is
 * set if a symbolic link needing interpretation is encountered.
 *
 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
 * whether the name is to be looked up, created, renamed, or deleted.
 * When CREATE, RENAME, or DELETE is specified, information usable in
 * creating, renaming, or deleting a directory entry may be calculated.
 * If flag has LOCKPARENT or'ed into it, the parent directory is returned
 * locked. If flag has WANTPARENT or'ed into it, the parent directory is
 * returned unlocked. Otherwise the parent directory is not returned. If
 * the target of the pathname exists and LOCKLEAF is or'ed into the flag
 * the target is returned locked, otherwise it is returned unlocked.
 * When creating or renaming and LOCKPARENT is specified, the target may not
 * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
 *
 * Overall outline of lookup:
 *
 * dirloop:
 *	identify next component of name at cnp->cn_nameptr
 *	handle degenerate case where name is null string
 *	if .. and crossing mount points and on mounted filesys, find parent
 *	call VOP_LOOKUP routine for next component name
 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
 *	    component vnode returned in ni_vp (if it exists), locked.
 *	if result vnode is mounted on and crossing mount points,
 *	    find mounted on vnode
 *	if more components of name, do next level at dirloop
 *	return the answer in ni_vp, locked if LOCKLEAF set
 *	    if LOCKPARENT set, return locked parent in ni_dvp
 *	    if WANTPARENT set, return unlocked parent in ni_dvp
 *
 * Giant state is exchanged with the caller through the GIANTHELD bit of
 * cn_flags: it is read (and cleared) on entry to learn whether Giant is
 * held for the starting directory, and set again on a successful return
 * when Giant is still held for either the parent or the result.
 *
 * Returns 0 on success.  On failure an errno value is returned, ni_vp is
 * set to NULL, GIANTHELD is cleared and the vnodes this routine held are
 * released.
 */
int
lookup(struct nameidata *ndp)
{
	char *cp;		/* pointer into pathname argument */
	struct vnode *dp = 0;	/* the directory we are searching */
	struct vnode *tdp;		/* saved dp */
	struct mount *mp;		/* mount table entry */
	int docache;			/* == 0 do not cache last component */
	int wantparent;			/* 1 => wantparent or lockparent flag */
	int rdonly;			/* lookup read-only flag bit */
	int trailing_slash;
	int error = 0;
	int dpunlocked = 0;		/* dp has already been unlocked */
	struct componentname *cnp = &ndp->ni_cnd;
	struct thread *td = cnp->cn_thread;
	int vfslocked;			/* VFS Giant state for child */
	int dvfslocked;			/* VFS Giant state for parent */
	int tvfslocked;
	int lkflags_save;

	/*
	 * Setup: break out flag bits into variables.
	 */
	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
	vfslocked = 0;
	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
	/* docache is non-zero exactly when NOCACHE is not set ... */
	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
	/* ... and is forced off for DELETE, and for RENAME with a parent. */
	if (cnp->cn_nameiop == DELETE ||
	    (wantparent && cnp->cn_nameiop != CREATE &&
	     cnp->cn_nameiop != LOOKUP))
		docache = 0;
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	ndp->ni_dvp = NULL;
	/*
	 * We use shared locks until we hit the parent of the last cn then
	 * we adjust based on the requesting flags.
	 */
	if (lookup_shared)
		cnp->cn_lkflags = LK_SHARED;
	else
		cnp->cn_lkflags = LK_EXCLUSIVE;
	dp = ndp->ni_startdir;
	ndp->ni_startdir = NULLVP;
	vn_lock(dp,
	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));

dirloop:
	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	cnp->cn_consume = 0;
	/* Find the end of the current component. */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (cnp->cn_namelen > NAME_MAX) {
		error = ENAMETOOLONG;
		goto bad;
	}
#ifdef NAMEI_DIAGNOSTIC
	{ char c = *cp;
	*cp = '\0';
	printf("{%s}: ", cnp->cn_nameptr);
	*cp = c; }
#endif
	ndp->ni_pathlen -= cnp->cn_namelen;
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null.  This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes.  Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	trailing_slash = 0;
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			trailing_slash = 1;
			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
		}
	}
	ndp->ni_next = cp;

	/*
	 * Derive the per-component flags: MAKEENTRY (dropped for the final
	 * component when caching is off), ISDOTDOT and ISLASTCN.
	 */
	cnp->cn_flags |= MAKEENTRY;
	if (*cp == '\0' && docache == 0)
		cnp->cn_flags &= ~MAKEENTRY;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
	else
		cnp->cn_flags &= ~ISDOTDOT;
	if (*ndp->ni_next == 0)
		cnp->cn_flags |= ISLASTCN;
	else
		cnp->cn_flags &= ~ISLASTCN;


	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 */
	if (cnp->cn_nameptr[0] == '\0') {
		if (dp->v_type != VDIR) {
			error = ENOTDIR;
			goto bad;
		}
		if (cnp->cn_nameiop != LOOKUP) {
			error = EISDIR;
			goto bad;
		}
		/* The directory is both parent and result; add a reference. */
		if (wantparent) {
			ndp->ni_dvp = dp;
			VREF(dp);
		}
		ndp->ni_vp = dp;

		if (cnp->cn_flags & AUDITVNODE1)
			AUDIT_ARG(vnode, dp, ARG_VNODE1);
		else if (cnp->cn_flags & AUDITVNODE2)
			AUDIT_ARG(vnode, dp, ARG_VNODE2);

		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
			VOP_UNLOCK(dp, 0);
		/* XXX This should probably move to the top of function. */
		if (cnp->cn_flags & SAVESTART)
			panic("lookup: SAVESTART");
		goto success;
	}

	/*
	 * Handle "..": four special cases.
	 * 1. Return an error if this is the last component of
	 *    the name and the operation is DELETE or RENAME.
	 * 2. If at root directory (e.g. after chroot)
	 *    or at absolute root directory
	 *    then ignore it so can't get out.
	 * 3. If this vnode is the root of a mounted
	 *    filesystem, then replace it with the
	 *    vnode which was mounted on so we take the
	 *    .. in the other filesystem.
	 * 4. If the vnode is the top directory of
	 *    the jail or chroot, don't let them out.
	 */
	if (cnp->cn_flags & ISDOTDOT) {
		if ((cnp->cn_flags & ISLASTCN) != 0 &&
		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
			error = EINVAL;
			goto bad;
		}
		for (;;) {
			/* Cases 2 and 4: ".." resolves to dp itself. */
			if (dp == ndp->ni_rootdir ||
			    dp == ndp->ni_topdir ||
			    dp == rootvnode ||
			    ((dp->v_vflag & VV_ROOT) != 0 &&
			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
				ndp->ni_dvp = dp;
				ndp->ni_vp = dp;
				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
				VREF(dp);
				goto nextname;
			}
			/* Not a filesystem root: let VOP_LOOKUP handle "..". */
			if ((dp->v_vflag & VV_ROOT) == 0)
				break;
			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
				error = EBADF;
				goto bad;
			}
			/*
			 * Case 3: step down to the covered vnode.  The new
			 * vnode is referenced before the old one is released
			 * and is locked only afterwards.
			 */
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			tvfslocked = dvfslocked;
			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
			VREF(dp);
			vput(tdp);
			VFS_UNLOCK_GIANT(tvfslocked);
			vn_lock(dp,
			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
			    LK_RETRY));
		}
	}

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
unionlookup:
#ifdef MAC
	if ((cnp->cn_flags & NOMACCHECK) == 0) {
		error = mac_vnode_check_lookup(td->td_ucred, dp, cnp);
		if (error)
			goto bad;
	}
#endif
	ndp->ni_dvp = dp;
	ndp->ni_vp = NULL;
	ASSERT_VOP_LOCKED(dp, "lookup");
	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
	/*
	 * If we have a shared lock we may need to upgrade the lock for the
	 * last operation.
	 */
	if (dp != vp_crossmp &&
	    VOP_ISLOCKED(dp) == LK_SHARED &&
	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
		vn_lock(dp, LK_UPGRADE|LK_RETRY);
	/*
	 * If we're looking up the last component and we need an exclusive
	 * lock, adjust our lkflags.
	 */
	if ((cnp->cn_flags & (ISLASTCN|LOCKSHARED|LOCKLEAF)) ==
	    (ISLASTCN|LOCKLEAF))
		cnp->cn_lkflags = LK_EXCLUSIVE;
#ifdef NAMEI_DIAGNOSTIC
	vprint("lookup in", dp);
#endif
	/*
	 * The mount-adjusted lock flags are in effect only for the duration
	 * of VOP_LOOKUP(); the requested value is restored on both outcomes.
	 */
	lkflags_save = cnp->cn_lkflags;
	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
		cnp->cn_lkflags = lkflags_save;
		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
#ifdef NAMEI_DIAGNOSTIC
		printf("not found\n");
#endif
		/*
		 * Name missing in the root of a union mount: retry the same
		 * component in the directory the mount covers.
		 */
		if ((error == ENOENT) &&
		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
		    (dp->v_mount->mnt_flag & MNT_UNION)) {
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			tvfslocked = dvfslocked;
			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
			VREF(dp);
			vput(tdp);
			VFS_UNLOCK_GIANT(tvfslocked);
			vn_lock(dp,
			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
			    LK_RETRY));
			goto unionlookup;
		}

		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * If creating and at end of pathname, then can consider
		 * allowing file to be created.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* "name/" may only be created if it will be a directory. */
		if (*cp == '\0' && trailing_slash &&
		     !(cnp->cn_flags & WILLBEDIR)) {
			error = ENOENT;
			goto bad;
		}
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp, 0);
		/*
		 * This is a temporary assert to make sure I know what the
		 * behavior here was.
		 */
		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
		   ("lookup: Unhandled case."));
		/*
		 * We return with ni_vp NULL to indicate that the entry
		 * doesn't currently exist, leaving a pointer to the
		 * (possibly locked) directory vnode in ndp->ni_dvp.
		 */
		if (cnp->cn_flags & SAVESTART) {
			ndp->ni_startdir = ndp->ni_dvp;
			VREF(ndp->ni_startdir);
		}
		goto success;
	} else
		cnp->cn_lkflags = lkflags_save;
#ifdef NAMEI_DIAGNOSTIC
	printf("found\n");
#endif
	/*
	 * Take into account any additional components consumed by
	 * the underlying filesystem.
	 */
	if (cnp->cn_consume > 0) {
		cnp->cn_nameptr += cnp->cn_consume;
		ndp->ni_next += cnp->cn_consume;
		ndp->ni_pathlen -= cnp->cn_consume;
		cnp->cn_consume = 0;
	}

	dp = ndp->ni_vp;
	vfslocked = VFS_LOCK_GIANT(dp->v_mount);

	/*
	 * Check to see if the vnode has been mounted on;
	 * if so find the root of the mounted filesystem.
	 */
	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
		/* A failed vfs_busy() re-evaluates the loop condition. */
		if (vfs_busy(mp, 0))
			continue;
		vput(dp);
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = VFS_LOCK_GIANT(mp);
		/* dp was already unlocked above if it doubles as ni_dvp. */
		if (dp != ndp->ni_dvp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		VFS_UNLOCK_GIANT(dvfslocked);
		dvfslocked = 0;
		/* The placeholder stands in as parent across the mount. */
		vref(vp_crossmp);
		ndp->ni_dvp = vp_crossmp;
		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
		vfs_unbusy(mp);
		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
			panic("vp_crossmp exclusively locked or reclaimed");
		if (error) {
			/* dp was released above; bad must not vput() it. */
			dpunlocked = 1;
			goto bad2;
		}
		ndp->ni_vp = dp = tdp;
	}

	/*
	 * Check for symbolic link
	 */
	if ((dp->v_type == VLNK) &&
	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
	     *ndp->ni_next == '/')) {
		cnp->cn_flags |= ISSYMLINK;
		if (dp->v_iflag & VI_DOOMED) {
			/* We can't know whether the directory was mounted with
			 * NOSYMFOLLOW, so we can't follow safely. */
			error = EBADF;
			goto bad2;
		}
		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
			error = EACCES;
			goto bad2;
		}
		/*
		 * Symlink code always expects an unlocked dvp.
		 */
		if (ndp->ni_dvp != ndp->ni_vp)
			VOP_UNLOCK(ndp->ni_dvp, 0);
		goto success;
	}

	/*
	 * Check for bogus trailing slashes.
	 */
	if (trailing_slash && dp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad2;
	}

nextname:
	/*
	 * Not a symbolic link.  If more pathname,
	 * continue at next component, else return.
	 */
	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
	    ("lookup: invalid path state."));
	if (*ndp->ni_next == '/') {
		cnp->cn_nameptr = ndp->ni_next;
		while (*cnp->cn_nameptr == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
		/* The old parent is no longer needed; dp takes its place. */
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		VFS_UNLOCK_GIANT(dvfslocked);
		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
		vfslocked = 0;
		goto dirloop;
	}
	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		error = EROFS;
		goto bad2;
	}
	if (cnp->cn_flags & SAVESTART) {
		ndp->ni_startdir = ndp->ni_dvp;
		VREF(ndp->ni_startdir);
	}
	/*
	 * Bring the parent to the state the caller asked for: released
	 * entirely, kept but unlocked (WANTPARENT), or kept locked
	 * (LOCKPARENT).
	 */
	if (!wantparent) {
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		VFS_UNLOCK_GIANT(dvfslocked);
		dvfslocked = 0;
	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
		VOP_UNLOCK(ndp->ni_dvp, 0);

	if (cnp->cn_flags & AUDITVNODE1)
		AUDIT_ARG(vnode, dp, ARG_VNODE1);
	else if (cnp->cn_flags & AUDITVNODE2)
		AUDIT_ARG(vnode, dp, ARG_VNODE2);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp, 0);
success:
	/*
	 * Because of lookup_shared we may have the vnode shared locked, but
	 * the caller may want it to be exclusively locked.
	 */
	if ((cnp->cn_flags & (ISLASTCN | LOCKSHARED | LOCKLEAF)) ==
	    (ISLASTCN | LOCKLEAF) && VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
		vn_lock(dp, LK_UPGRADE | LK_RETRY);
	}
	/* Collapse the two Giant states into the single GIANTHELD bit. */
	if (vfslocked && dvfslocked)
		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
	if (vfslocked || dvfslocked)
		ndp->ni_cnd.cn_flags |= GIANTHELD;
	return (0);

bad2:
	/* Error after a parent was recorded: release the parent first. */
	if (dp != ndp->ni_dvp)
		vput(ndp->ni_dvp);
	else
		vrele(ndp->ni_dvp);
bad:
	if (!dpunlocked)
		vput(dp);
	VFS_UNLOCK_GIANT(vfslocked);
	VFS_UNLOCK_GIANT(dvfslocked);
	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
	ndp->ni_vp = NULL;
	return (error);
}
838
/*
 * relookup - lookup a path name component
 *    Used by lookup to re-acquire things.
 *
 * Performs a single VOP_LOOKUP() of the final component described by cnp
 * in directory dvp.  dvp is locked exclusively here.  The caller must pass
 * a componentname with ISLASTCN set and with LOCKPARENT or WANTPARENT set
 * (both are asserted); ".." is not accepted (panics) and a symbolic link
 * result with FOLLOW set trips an assertion.
 *
 * On success 0 is returned.  *vpp is the vnode found, left locked only if
 * LOCKLEAF is set; when VOP_LOOKUP() returns EJUSTRETURN the function also
 * returns 0 and *vpp is expected to be NULL (asserted).  dvp is left
 * locked only if LOCKPARENT is set, and gains an extra reference when
 * SAVESTART is set.
 *
 * On failure an errno value is returned and *vpp is set to NULL.
 */
int
relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct vnode *dp = 0;		/* the directory we are searching */
	int wantparent;			/* 1 => wantparent or lockparent flag */
	int rdonly;			/* lookup read-only flag bit */
	int error = 0;

	KASSERT(cnp->cn_flags & ISLASTCN,
	    ("relookup: Not given last component."));
	/*
	 * Setup: break out flag bits into variables.
	 */
	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
	KASSERT(wantparent, ("relookup: parent not wanted."));
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	dp = dvp;
	/* Unlike lookup(), always work with exclusive locks. */
	cnp->cn_lkflags = LK_EXCLUSIVE;
	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
#ifdef NAMEI_DIAGNOSTIC
	printf("{%s}: ", cnp->cn_nameptr);
#endif

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 */
	if (cnp->cn_nameptr[0] == '\0') {
		/* wantparent was asserted above, so this always fails. */
		if (cnp->cn_nameiop != LOOKUP || wantparent) {
			error = EISDIR;
			goto bad;
		}
		if (dp->v_type != VDIR) {
			error = ENOTDIR;
			goto bad;
		}
		if (!(cnp->cn_flags & LOCKLEAF))
			VOP_UNLOCK(dp, 0);
		*vpp = dp;
		/* XXX This should probably move to the top of function. */
		if (cnp->cn_flags & SAVESTART)
			panic("lookup: SAVESTART");
		return (0);
	}

	if (cnp->cn_flags & ISDOTDOT)
		panic ("relookup: lookup on dot-dot");

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
#ifdef NAMEI_DIAGNOSTIC
	vprint("search in:", dp);
#endif
	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
		KASSERT(*vpp == NULL, ("leaf should be empty"));
		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * If creating and at end of pathname, then can consider
		 * allowing file to be created.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* ASSERT(dvp == ndp->ni_startdir) */
		if (cnp->cn_flags & SAVESTART)
			VREF(dvp);
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp, 0);
		/*
		 * This is a temporary assert to make sure I know what the
		 * behavior here was.
		 */
		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
		   ("relookup: Unhandled case."));
		/*
		 * We return with ni_vp NULL to indicate that the entry
		 * doesn't currently exist, leaving a pointer to the
		 * (possibly locked) directory vnode in ndp->ni_dvp.
		 */
		return (0);
	}

	/* From here on dp is the vnode that was found, not the directory. */
	dp = *vpp;

	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		/* Drop the parent here; the bad label releases dp. */
		if (dvp == dp)
			vrele(dvp);
		else
			vput(dvp);
		error = EROFS;
		goto bad;
	}
	/*
	 * Set the parent lock/ref state to the requested state.
	 */
	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
		if (wantparent)
			VOP_UNLOCK(dvp, 0);
		else
			vput(dvp);
	} else if (!wantparent)
		vrele(dvp);
	/*
	 * Check for symbolic link
	 */
	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
	    ("relookup: symlink found.\n"));

	/* ASSERT(dvp == ndp->ni_startdir) */
	if (cnp->cn_flags & SAVESTART)
		VREF(dvp);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp, 0);
	return (0);
bad:
	vput(dp);
	*vpp = NULL;
	return (error);
}
981
982/*
983 * Free data allocated by namei(); see namei(9) for details.
984 */
985void
986NDFREE(struct nameidata *ndp, const u_int flags)
987{
988	int unlock_dvp;
989	int unlock_vp;
990
991	unlock_dvp = 0;
992	unlock_vp = 0;
993
994	if (!(flags & NDF_NO_FREE_PNBUF) &&
995	    (ndp->ni_cnd.cn_flags & HASBUF)) {
996		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
997		ndp->ni_cnd.cn_flags &= ~HASBUF;
998	}
999	if (!(flags & NDF_NO_VP_UNLOCK) &&
1000	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
1001		unlock_vp = 1;
1002	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
1003		if (unlock_vp) {
1004			vput(ndp->ni_vp);
1005			unlock_vp = 0;
1006		} else
1007			vrele(ndp->ni_vp);
1008		ndp->ni_vp = NULL;
1009	}
1010	if (unlock_vp)
1011		VOP_UNLOCK(ndp->ni_vp, 0);
1012	if (!(flags & NDF_NO_DVP_UNLOCK) &&
1013	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
1014	    ndp->ni_dvp != ndp->ni_vp)
1015		unlock_dvp = 1;
1016	if (!(flags & NDF_NO_DVP_RELE) &&
1017	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
1018		if (unlock_dvp) {
1019			vput(ndp->ni_dvp);
1020			unlock_dvp = 0;
1021		} else
1022			vrele(ndp->ni_dvp);
1023		ndp->ni_dvp = NULL;
1024	}
1025	if (unlock_dvp)
1026		VOP_UNLOCK(ndp->ni_dvp, 0);
1027	if (!(flags & NDF_NO_STARTDIR_RELE) &&
1028	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
1029		vrele(ndp->ni_startdir);
1030		ndp->ni_startdir = NULL;
1031	}
1032}
1033
1034/*
1035 * Determine if there is a suitable alternate filename under the specified
1036 * prefix for the specified path.  If the create flag is set, then the
1037 * alternate prefix will be used so long as the parent directory exists.
1038 * This is used by the various compatiblity ABIs so that Linux binaries prefer
1039 * files under /compat/linux for example.  The chosen path (whether under
1040 * the prefix or under /) is returned in a kernel malloc'd buffer pointed
1041 * to by pathbuf.  The caller is responsible for free'ing the buffer from
1042 * the M_TEMP bucket if one is returned.
1043 */
1044int
1045kern_alternate_path(struct thread *td, const char *prefix, const char *path,
1046    enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
1047{
1048	struct nameidata nd, ndroot;
1049	char *ptr, *buf, *cp;
1050	size_t len, sz;
1051	int error;
1052
1053	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1054	*pathbuf = buf;
1055
1056	/* Copy the prefix into the new pathname as a starting point. */
1057	len = strlcpy(buf, prefix, MAXPATHLEN);
1058	if (len >= MAXPATHLEN) {
1059		*pathbuf = NULL;
1060		free(buf, M_TEMP);
1061		return (EINVAL);
1062	}
1063	sz = MAXPATHLEN - len;
1064	ptr = buf + len;
1065
1066	/* Append the filename to the prefix. */
1067	if (pathseg == UIO_SYSSPACE)
1068		error = copystr(path, ptr, sz, &len);
1069	else
1070		error = copyinstr(path, ptr, sz, &len);
1071
1072	if (error) {
1073		*pathbuf = NULL;
1074		free(buf, M_TEMP);
1075		return (error);
1076	}
1077
1078	/* Only use a prefix with absolute pathnames. */
1079	if (*ptr != '/') {
1080		error = EINVAL;
1081		goto keeporig;
1082	}
1083
1084	if (dirfd != AT_FDCWD) {
1085		/*
1086		 * We want the original because the "prefix" is
1087		 * included in the already opened dirfd.
1088		 */
1089		bcopy(ptr, buf, len);
1090		return (0);
1091	}
1092
1093	/*
1094	 * We know that there is a / somewhere in this pathname.
1095	 * Search backwards for it, to find the file's parent dir
1096	 * to see if it exists in the alternate tree. If it does,
1097	 * and we want to create a file (cflag is set). We don't
1098	 * need to worry about the root comparison in this case.
1099	 */
1100
1101	if (create) {
1102		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1103		*cp = '\0';
1104
1105		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1106		error = namei(&nd);
1107		*cp = '/';
1108		if (error != 0)
1109			goto keeporig;
1110	} else {
1111		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
1112
1113		error = namei(&nd);
1114		if (error != 0)
1115			goto keeporig;
1116
1117		/*
1118		 * We now compare the vnode of the prefix to the one
1119		 * vnode asked. If they resolve to be the same, then we
1120		 * ignore the match so that the real root gets used.
1121		 * This avoids the problem of traversing "../.." to find the
1122		 * root directory and never finding it, because "/" resolves
1123		 * to the emulation root directory. This is expensive :-(
1124		 */
1125		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
1126		    td);
1127
1128		/* We shouldn't ever get an error from this namei(). */
1129		error = namei(&ndroot);
1130		if (error == 0) {
1131			if (nd.ni_vp == ndroot.ni_vp)
1132				error = ENOENT;
1133
1134			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1135			vrele(ndroot.ni_vp);
1136			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
1137		}
1138	}
1139
1140	NDFREE(&nd, NDF_ONLY_PNBUF);
1141	vrele(nd.ni_vp);
1142	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
1143
1144keeporig:
1145	/* If there was an error, use the original path name. */
1146	if (error)
1147		bcopy(ptr, buf, len);
1148	return (error);
1149}
1150