/* vfs_cache.c, revision 112430 */
1/*
2 * Copyright (c) 1989, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Poul-Henning Kamp of the FreeBSD Project.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
37 * $FreeBSD: head/sys/kern/vfs_cache.c 112430 2003-03-20 10:40:45Z phk $
38 */
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/mutex.h>
45#include <sys/sysctl.h>
46#include <sys/mount.h>
47#include <sys/vnode.h>
48#include <sys/namei.h>
49#include <sys/malloc.h>
50#include <sys/syscallsubr.h>
51#include <sys/sysproto.h>
52#include <sys/proc.h>
53#include <sys/filedesc.h>
54#include <sys/fnv_hash.h>
55
56/*
57 * This structure describes the elements in the cache of recent
58 * names looked up by namei.
59 */
60
61struct	namecache {
62	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
63	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
64	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
65	struct	vnode *nc_dvp;		/* vnode of parent of name */
66	struct	vnode *nc_vp;		/* vnode the name refers to */
67	u_char	nc_flag;		/* flag bits */
68	u_char	nc_nlen;		/* length of name */
69	char	nc_name[0];		/* segment name */
70};
71
72/*
73 * Name caching works as follows:
74 *
75 * Names found by directory scans are retained in a cache
76 * for future reference.  It is managed LRU, so frequently
77 * used names will hang around.  Cache is indexed by hash value
78 * obtained from (vp, name) where vp refers to the directory
79 * containing name.
80 *
81 * If it is a "negative" entry, (i.e. for a name that is known NOT to
82 * exist) the vnode pointer will be NULL.
83 *
84 * Upon reaching the last segment of a path, if the reference
85 * is for DELETE, or NOCACHE is set (rewrite), and the
86 * name is located in the cache, it will be dropped.
87 */
88
89/*
90 * Structures associated with name cacheing.
91 */
/* Map a name hash to its chain; nchash is a mask (size - 1), not a count. */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU queue of negative entries */
static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative cache entries */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purge for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
/* Declare a vfs.cache.<name> counter sysctl for an existing u_long. */
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
        sizeof(nchstats), "LU", "VFS cache effectiveness statistics");



static void cache_zap(struct namecache *ncp);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1	/* entry represents a whiteout */

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
156
157static int
158sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
159{
160	int error;
161	struct nchashhead *ncpp;
162	struct namecache *ncp;
163	int n_nchash;
164	int count;
165
166	n_nchash = nchash + 1;	/* nchash is max index, not count */
167	if (!req->oldptr)
168		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
169
170	/* Scan hash tables for applicable entries */
171	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
172		count = 0;
173		LIST_FOREACH(ncp, ncpp, nc_hash) {
174			count++;
175		}
176		error = SYSCTL_OUT(req, &count, sizeof(count));
177		if (error)
178			return (error);
179	}
180	return (0);
181}
182SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
183	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
184
/*
 * Summarize hash table health: bucket count, buckets in use, longest
 * chain, and utilization percentage (scaled by 100, see below).
 */
static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	/* Size-only probe: the answer is exactly four ints. */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	/*
	 * NB: pct is percentage scaled by 100 (10000 == all buckets used)
	 * to get two decimal places out of integer arithmetic.
	 */
	pct = (used * 100 * 100) / n_nchash;
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
230
231/*
232 * cache_zap():
233 *
234 *   Removes a namecache entry from cache, whether it contains an actual
235 *   pointer to a vnode or if it is just a negative cache entry.
236 */
237static void
238cache_zap(ncp)
239	struct namecache *ncp;
240{
241	LIST_REMOVE(ncp, nc_hash);
242	LIST_REMOVE(ncp, nc_src);
243	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
244		vdrop(ncp->nc_dvp);
245		numcachehv--;
246	}
247	if (ncp->nc_vp) {
248		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
249	} else {
250		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
251		numneg--;
252	}
253	numcache--;
254	free(ncp, M_VFSCACHE);
255}
256
257/*
258 * cache_leaf_test()
259 *
260 *      Test whether this (directory) vnode's namei cache entry contains
261 *      subdirectories or not.  Used to determine whether the directory is
262 *      a leaf in the namei cache or not.  Note: the directory may still
263 *      contain files in the namei cache.
264 *
265 *      Returns 0 if the directory is a leaf, -1 if it isn't.
266 */
267int
268cache_leaf_test(struct vnode *vp)
269{
270	struct namecache *ncpc;
271
272	for (ncpc = LIST_FIRST(&vp->v_cache_src);
273	     ncpc != NULL;
274	     ncpc = LIST_NEXT(ncpc, nc_src)
275	) {
276		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
277			return(-1);
278	}
279	return(0);
280}
281
282/*
283 * Lookup an entry in the cache
284 *
285 * Lookup is called with dvp pointing to the directory to search,
286 * cnp pointing to the name of the entry being sought. If the lookup
287 * succeeds, the vnode is returned in *vpp, and a status of -1 is
288 * returned. If the lookup determines that the name does not exist
289 * (negative cacheing), a status of ENOENT is returned. If the lookup
290 * fails, a status of zero is returned.
291 */
292
int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	/* Cache globally disabled: force the filesystem to do the work. */
	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	/* "." and ".." are handled specially and are never hashed. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			/* "." resolves to the directory itself. */
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			/*
			 * ".." is cached in dvp->v_dd; the v_ddid capability
			 * check detects a stale parent pointer.
			 */
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	/* Hash covers the name and the parent directory's v_id. */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
        if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, so we shift it to the end of
	 * the "negative" cache entries queue to satisfy LRU.  Also,
	 * check to see if the entry is a whiteout; indicate this to
	 * the componentname, if so.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
385
386/*
387 * Add an entry to the cache.
388 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	/*
	 * "." is never entered; ".." is recorded in dvp's v_dd/v_ddid
	 * fields rather than as a namecache entry.
	 */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				/* Negative "..": point at self, id 0 = invalid. */
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	/* Entry plus the name stored inline after the struct. */
	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		/* Negative entry; remember whiteout status for lookups. */
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		/* Child directory: seed its ".." shortcut. */
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Set the rest of the namecache entry elements, calculate its
	 * hash key and insert it into the appropriate chain within
	 * the cache entries table.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		/* First entry under this directory: hold it. */
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/*
	 * Bound the negative entries to numcache / ncnegfactor by
	 * evicting the oldest one (head of the LRU queue).
	 */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}
464
465/*
466 * Name cache initialization, from vfs_init() when we are booting
467 */
static void
nchinit(void *dummy __unused)
{

	/* Empty LRU queue for negative entries. */
	TAILQ_INIT(&ncneg);
	/*
	 * Size the table for twice the expected vnode count; nchash
	 * receives the max-index mask that NCHHASH expects.
	 */
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
476
477
478/*
479 * Invalidate all entries to a particular vnode.
480 *
481 * Remove all entries in the namecache relating to this vnode and
482 * change the v_id.  We take the v_id from a global counter, since
483 * it becomes a handy sequence number in crash-dumps that way.
484 * No valid vnode will ever have (v_id == 0).
485 *
486 * XXX: Only time and the size of v_id prevents this from failing:
487 * XXX: In theory we should hunt down all (struct vnode*, v_id)
488 * XXX: soft references and nuke them, at least on the global
489 * XXX: v_id wraparound.  The period of resistance can be extended
490 * XXX: by incrementing each vnodes v_id individually instead of
491 * XXX: using the global v_id.
492 */
493
494void
495cache_purge(vp)
496	struct vnode *vp;
497{
498	static u_long nextid;
499
500	while (!LIST_EMPTY(&vp->v_cache_src))
501		cache_zap(LIST_FIRST(&vp->v_cache_src));
502	while (!TAILQ_EMPTY(&vp->v_cache_dst))
503		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
504
505	do
506		nextid++;
507	while (nextid == vp->v_id || !nextid);
508	vp->v_id = nextid;
509	vp->v_dd = vp;
510	vp->v_ddid = 0;
511}
512
513/*
514 * Flush all entries referencing a particular filesystem.
515 *
516 * Since we need to check it anyway, we will flush all the invalid
517 * entries at the same time.
518 */
519void
520cache_purgevfs(mp)
521	struct mount *mp;
522{
523	struct nchashhead *ncpp;
524	struct namecache *ncp, *nnp;
525
526	/* Scan hash tables for applicable entries */
527	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
528		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
529			nnp = LIST_NEXT(ncp, nc_hash);
530			if (ncp->nc_dvp->v_mount == mp) {
531				cache_zap(ncp);
532			}
533		}
534	}
535}
536
537/*
538 * Perform canonical checks and cache lookup and pass on to filesystem
539 * through the vop_cachedlookup only if needed.
540 */
541
int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
                return (ENOTDIR);

	/* Refuse modifying lookups on read-only filesystems. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	/* cache_lookup: -1 = positive hit, ENOENT = negative hit, 0 = miss. */
	error = cache_lookup(dvp, vpp, cnp);

#ifdef LOOKUP_SHARED
	if (!error) {
		/* We do this because the rest of the system now expects to get
		 * a shared lock, which is later upgraded if LOCKSHARED is not
		 * set.  We have so many cases here because of bugs that yield
		 * inconsistent lock states.  This all badly needs to be fixed
		 */
		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
		if (!error) {
			int flock;

			flock = VOP_ISLOCKED(*vpp, td);
			if (flock != LK_EXCLUSIVE) {
				if (flock == 0) {
					if ((flags & ISLASTCN) &&
					    (flags & LOCKSHARED))
						VOP_LOCK(*vpp, LK_SHARED, td);
					else
						VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
				}
			} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
				VOP_LOCK(*vpp, LK_DOWNGRADE, td);
		}
		return (error);
	}
#else
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif

	/* Negative cache hit: the name is known not to exist. */
	if (error == ENOENT)
		return (error);

	/* Positive hit: lock the result, handling ".", "..", and the rest. */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* Unlock parent first to respect child-after-parent order. */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif

		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale entry: drop it and fall through to a real lookup. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/* Re-lock the parent before retrying through the filesystem. */
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
#ifdef LOOKUP_SHARED
	error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);

	if (!error) {
		int flock = 0;

		flock = VOP_ISLOCKED(*vpp, td);
		if (flock != LK_EXCLUSIVE) {
			if (flock == 0) {
				if ((flags & ISLASTCN) && (flags & LOCKSHARED))
					VOP_LOCK(*vpp, LK_SHARED, td);
				else
					VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
			}
		} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			VOP_LOCK(*vpp, LK_DOWNGRADE, td);
	}

	return (error);
#else
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif
}
689
690
#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;		/* user buffer for the path */
	u_int	buflen;		/* size of that buffer */
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/* Various statistics for the getcwd syscall */
static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
712
713/* Implementation of the getcwd syscall */
714int
715__getcwd(td, uap)
716	struct thread *td;
717	struct __getcwd_args *uap;
718{
719
720	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
721}
722
723int
724kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
725{
726	char *bp, *tmpbuf;
727	int error, i, slash_prefixed;
728	struct filedesc *fdp;
729	struct namecache *ncp;
730	struct vnode *vp;
731
732	numcwdcalls++;
733	if (disablecwd)
734		return (ENODEV);
735	if (buflen < 2)
736		return (EINVAL);
737	if (buflen > MAXPATHLEN)
738		buflen = MAXPATHLEN;
739	error = 0;
740	tmpbuf = bp = malloc(buflen, M_TEMP, M_WAITOK);
741	bp += buflen - 1;
742	*bp = '\0';
743	fdp = td->td_proc->p_fd;
744	slash_prefixed = 0;
745	FILEDESC_LOCK(fdp);
746	mp_fixme("No vnode locking done!");
747	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
748		if (vp->v_vflag & VV_ROOT) {
749			if (vp->v_mount == NULL) {	/* forced unmount */
750				FILEDESC_UNLOCK(fdp);
751				free(tmpbuf, M_TEMP);
752				return (EBADF);
753			}
754			vp = vp->v_mount->mnt_vnodecovered;
755			continue;
756		}
757		if (vp->v_dd->v_id != vp->v_ddid) {
758			FILEDESC_UNLOCK(fdp);
759			numcwdfail1++;
760			free(tmpbuf, M_TEMP);
761			return (ENOTDIR);
762		}
763		ncp = TAILQ_FIRST(&vp->v_cache_dst);
764		if (!ncp) {
765			FILEDESC_UNLOCK(fdp);
766			numcwdfail2++;
767			free(tmpbuf, M_TEMP);
768			return (ENOENT);
769		}
770		if (ncp->nc_dvp != vp->v_dd) {
771			FILEDESC_UNLOCK(fdp);
772			numcwdfail3++;
773			free(tmpbuf, M_TEMP);
774			return (EBADF);
775		}
776		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
777			if (bp == tmpbuf) {
778				FILEDESC_UNLOCK(fdp);
779				numcwdfail4++;
780				free(tmpbuf, M_TEMP);
781				return (ENOMEM);
782			}
783			*--bp = ncp->nc_name[i];
784		}
785		if (bp == tmpbuf) {
786			FILEDESC_UNLOCK(fdp);
787			numcwdfail4++;
788			free(tmpbuf, M_TEMP);
789			return (ENOMEM);
790		}
791		*--bp = '/';
792		slash_prefixed = 1;
793		vp = vp->v_dd;
794	}
795	FILEDESC_UNLOCK(fdp);
796	if (!slash_prefixed) {
797		if (bp == tmpbuf) {
798			numcwdfail4++;
799			free(tmpbuf, M_TEMP);
800			return (ENOMEM);
801		}
802		*--bp = '/';
803	}
804	numcwdfound++;
805	if (bufseg == UIO_SYSSPACE)
806		bcopy(bp, buf, strlen(bp) + 1);
807	else
808		error = copyout(bp, buf, strlen(bp) + 1);
809	free(tmpbuf, M_TEMP);
810	return (error);
811}
812
813/*
814 * Thus begins the fullpath magic.
815 */
816
/* STATNODE is redefined here: this variant declares the counter itself. */
#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

/* Counters for vn_fullpath() calls and its distinct failure modes. */
STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);
832
833/*
834 * Retrieve the full filesystem path that correspond to a vnode from the name
835 * cache (if available)
836 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	/* Assemble the path backwards from the end of the buffer. */
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	FILEDESC_LOCK(fdp);
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		ASSERT_VOP_LOCKED(vp, "vn_fullpath");
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			/* Hop over the mount point to the covered vnode. */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/*
		 * The v_dd/v_ddid check only applies past the first vnode:
		 * vn itself may be a leaf whose parent link is unused here.
		 */
		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			/* No cached name for this vnode: cannot continue. */
			FILEDESC_UNLOCK(fdp);
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
			/* Cached name disagrees with the parent link. */
			FILEDESC_UNLOCK(fdp);
			numfullpathfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend the component, watching for buffer exhaustion. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_dvp;
	}
	/* At the root: the path is just "/" if nothing was prepended. */
	if (!slash_prefixed) {
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	numfullpathfound++;
	/* *retbuf points inside *freebuf; caller frees *freebuf (M_TEMP). */
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
921