/* vfs_cache.c revision 116182 */
1/*
2 * Copyright (c) 1989, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Poul-Henning Kamp of the FreeBSD Project.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: head/sys/kern/vfs_cache.c 116182 2003-06-11 00:56:59Z obrien $");
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mutex.h>
47#include <sys/sysctl.h>
48#include <sys/mount.h>
49#include <sys/vnode.h>
50#include <sys/namei.h>
51#include <sys/malloc.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysproto.h>
54#include <sys/proc.h>
55#include <sys/filedesc.h>
56#include <sys/fnv_hash.h>
57
/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.  Each entry links a (directory vnode,
 * name) pair to the vnode the name resolves to; a NULL nc_vp marks
 * a "negative" entry, i.e. a name known not to exist.
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to; NULL => negative entry */
	u_char	nc_flag;		/* flag bits (NCF_*) */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name, allocated inline past the header */
};
73
74/*
75 * Name caching works as follows:
76 *
77 * Names found by directory scans are retained in a cache
78 * for future reference.  It is managed LRU, so frequently
79 * used names will hang around.  Cache is indexed by hash value
80 * obtained from (vp, name) where vp refers to the directory
81 * containing name.
82 *
83 * If it is a "negative" entry, (i.e. for a name that is known NOT to
84 * exist) the vnode pointer will be NULL.
85 *
86 * Upon reaching the last segment of a path, if the reference
87 * is for DELETE, or NOCACHE is set (rewrite), and the
88 * name is located in the cache, it will be dropped.
89 */
90
/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU queue of negative entries */
static u_long	nchash;			/* size of hash table - 1 (used as mask) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purge for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
120
/*
 * The new name cache statistics, exported under vfs.cache.*
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
        sizeof(nchstats), "LU", "VFS cache effectiveness statistics");



static void cache_zap(struct namecache *ncp);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1	/* negative entry is a whiteout (sets ISWHITEOUT) */

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
158
159static int
160sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
161{
162	int error;
163	struct nchashhead *ncpp;
164	struct namecache *ncp;
165	int n_nchash;
166	int count;
167
168	n_nchash = nchash + 1;	/* nchash is max index, not count */
169	if (!req->oldptr)
170		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
171
172	/* Scan hash tables for applicable entries */
173	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
174		count = 0;
175		LIST_FOREACH(ncp, ncpp, nc_hash) {
176			count++;
177		}
178		error = SYSCTL_OUT(req, &count, sizeof(count));
179		if (error)
180			return (error);
181	}
182	return (0);
183}
184SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
185	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
186
187static int
188sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
189{
190	int error;
191	struct nchashhead *ncpp;
192	struct namecache *ncp;
193	int n_nchash;
194	int count, maxlength, used, pct;
195
196	if (!req->oldptr)
197		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
198
199	n_nchash = nchash + 1;	/* nchash is max index, not count */
200	used = 0;
201	maxlength = 0;
202
203	/* Scan hash tables for applicable entries */
204	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
205		count = 0;
206		LIST_FOREACH(ncp, ncpp, nc_hash) {
207			count++;
208		}
209		if (count)
210			used++;
211		if (maxlength < count)
212			maxlength = count;
213	}
214	n_nchash = nchash + 1;
215	pct = (used * 100 * 100) / n_nchash;
216	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
217	if (error)
218		return (error);
219	error = SYSCTL_OUT(req, &used, sizeof(used));
220	if (error)
221		return (error);
222	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
223	if (error)
224		return (error);
225	error = SYSCTL_OUT(req, &pct, sizeof(pct));
226	if (error)
227		return (error);
228	return (0);
229}
230SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
231	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
232
233/*
234 * cache_zap():
235 *
236 *   Removes a namecache entry from cache, whether it contains an actual
237 *   pointer to a vnode or if it is just a negative cache entry.
238 */
239static void
240cache_zap(ncp)
241	struct namecache *ncp;
242{
243	LIST_REMOVE(ncp, nc_hash);
244	LIST_REMOVE(ncp, nc_src);
245	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
246		vdrop(ncp->nc_dvp);
247		numcachehv--;
248	}
249	if (ncp->nc_vp) {
250		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
251	} else {
252		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
253		numneg--;
254	}
255	numcache--;
256	free(ncp, M_VFSCACHE);
257}
258
259/*
260 * cache_leaf_test()
261 *
262 *      Test whether this (directory) vnode's namei cache entry contains
263 *      subdirectories or not.  Used to determine whether the directory is
264 *      a leaf in the namei cache or not.  Note: the directory may still
265 *      contain files in the namei cache.
266 *
267 *      Returns 0 if the directory is a leaf, -1 if it isn't.
268 */
269int
270cache_leaf_test(struct vnode *vp)
271{
272	struct namecache *ncpc;
273
274	for (ncpc = LIST_FIRST(&vp->v_cache_src);
275	     ncpc != NULL;
276	     ncpc = LIST_NEXT(ncpc, nc_src)
277	) {
278		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
279			return(-1);
280	}
281	return(0);
282}
283
/*
 * Lookup an entry in the cache
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative cacheing), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.
 */

int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	/* Caching disabled: force the caller to do the real lookup. */
	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	/* "." and ".." are resolved from the vnode itself, not the table. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			/*
			 * The cached parent pointer (v_dd) is only valid
			 * while its capability number still matches v_ddid;
			 * otherwise invalidate it and report a miss.
			 */
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	/* Hash covers the name and the parent's capability number (v_id). */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, so we shift it to the end of
	 * the "negative" cache entries queue to satisfy LRU.  Also,
	 * check to see if the entry is a whiteout; indicate this to
	 * the componentname, if so.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
387
/*
 * Add an entry to the cache, mapping (dvp, cnp's name) -> vp.
 * A NULL vp creates a "negative" entry recording that the name
 * does not exist in dvp.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	/* "." is never entered; ".." is kept in the vnode's v_dd/v_ddid. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				/* No parent known: point at self, id 0 (invalid). */
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	/* The name is stored inline, just past the namecache header. */
	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		numneg++;
		/* Propagate whiteout status into the negative entry. */
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		/* Record the parent so later ".." lookups can be short-cut. */
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Set the rest of the namecache entry elements, calculate it's
	 * hash key and insert it into the appropriate chain within
	 * the cache entries table.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	/* Hold the parent while it has cache entries; released in cache_zap(). */
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/* Bound negative entries: reclaim the LRU one when over quota. */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}
466
467/*
468 * Name cache initialization, from vfs_init() when we are booting
469 */
470static void
471nchinit(void *dummy __unused)
472{
473
474	TAILQ_INIT(&ncneg);
475	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
476}
477SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
478
479
480/*
481 * Invalidate all entries to a particular vnode.
482 *
483 * Remove all entries in the namecache relating to this vnode and
484 * change the v_id.  We take the v_id from a global counter, since
485 * it becomes a handy sequence number in crash-dumps that way.
486 * No valid vnode will ever have (v_id == 0).
487 *
488 * XXX: Only time and the size of v_id prevents this from failing:
489 * XXX: In theory we should hunt down all (struct vnode*, v_id)
490 * XXX: soft references and nuke them, at least on the global
491 * XXX: v_id wraparound.  The period of resistance can be extended
492 * XXX: by incrementing each vnodes v_id individually instead of
493 * XXX: using the global v_id.
494 */
495
496void
497cache_purge(vp)
498	struct vnode *vp;
499{
500	static u_long nextid;
501
502	while (!LIST_EMPTY(&vp->v_cache_src))
503		cache_zap(LIST_FIRST(&vp->v_cache_src));
504	while (!TAILQ_EMPTY(&vp->v_cache_dst))
505		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
506
507	do
508		nextid++;
509	while (nextid == vp->v_id || !nextid);
510	vp->v_id = nextid;
511	vp->v_dd = vp;
512	vp->v_ddid = 0;
513}
514
515/*
516 * Flush all entries referencing a particular filesystem.
517 *
518 * Since we need to check it anyway, we will flush all the invalid
519 * entries at the same time.
520 */
521void
522cache_purgevfs(mp)
523	struct mount *mp;
524{
525	struct nchashhead *ncpp;
526	struct namecache *ncp, *nnp;
527
528	/* Scan hash tables for applicable entries */
529	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
530		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
531			nnp = LIST_NEXT(ncp, nc_hash);
532			if (ncp->nc_dvp->v_mount == mp) {
533				cache_zap(ncp);
534			}
535		}
536	}
537}
538
/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 *
 * On a cache miss or mismatch the vnode is (re)locked per the namei
 * locking protocol and revalidated against its capability number
 * before falling back to VOP_CACHEDLOOKUP().
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Refuse to modify the last component on a read-only filesystem. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	/* 0 = miss, ENOENT = negative hit, -1 = positive hit in *vpp. */
	error = cache_lookup(dvp, vpp, cnp);

#ifdef LOOKUP_SHARED
	if (!error) {
		/* We do this because the rest of the system now expects to get
		 * a shared lock, which is later upgraded if LOCKSHARED is not
		 * set.  We have so many cases here because of bugs that yield
		 * inconsistant lock states.  This all badly needs to be fixed
		 */
		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
		if (!error) {
			int flock;

			flock = VOP_ISLOCKED(*vpp, td);
			if (flock != LK_EXCLUSIVE) {
				if (flock == 0) {
					if ((flags & ISLASTCN) &&
					    (flags & LOCKSHARED))
						VOP_LOCK(*vpp, LK_SHARED, td);
					else
						VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
				}
			} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
				VOP_LOCK(*vpp, LK_DOWNGRADE, td);
		}
		return (error);
	}
#else
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif

	/* Negative cache hit: the name is known not to exist. */
	if (error == ENOENT)
		return (error);

	/*
	 * Positive cache hit (error == -1).  Acquire the vnode lock
	 * according to the LOCKPARENT/ISLASTCN/ISDOTDOT rules.
	 */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* Unlock the child (dvp) before locking its parent (vp). */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif

		/* Re-lock the parent if the caller asked to keep it locked. */
		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale entry: drop it and fall back to the real lookup. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/* The fallback VOP_CACHEDLOOKUP() requires dvp locked. */
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
#ifdef LOOKUP_SHARED
	error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);

	if (!error) {
		int flock = 0;

		flock = VOP_ISLOCKED(*vpp, td);
		if (flock != LK_EXCLUSIVE) {
			if (flock == 0) {
				if ((flags & ISLASTCN) && (flags & LOCKSHARED))
					VOP_LOCK(*vpp, LK_SHARED, td);
				else
					VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
			}
		} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			VOP_LOCK(*vpp, LK_DOWNGRADE, td);
	}

	return (error);
#else
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif
}
691
692
#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;		/* user buffer that receives the path */
	u_int	buflen;		/* size of that buffer in bytes */
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/* Various statistics for the getcwd syscall */
static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
714
715/* Implementation of the getcwd syscall */
716int
717__getcwd(td, uap)
718	struct thread *td;
719	struct __getcwd_args *uap;
720{
721
722	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
723}
724
725int
726kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
727{
728	char *bp, *tmpbuf;
729	int error, i, slash_prefixed;
730	struct filedesc *fdp;
731	struct namecache *ncp;
732	struct vnode *vp;
733
734	numcwdcalls++;
735	if (disablecwd)
736		return (ENODEV);
737	if (buflen < 2)
738		return (EINVAL);
739	if (buflen > MAXPATHLEN)
740		buflen = MAXPATHLEN;
741	error = 0;
742	tmpbuf = bp = malloc(buflen, M_TEMP, M_WAITOK);
743	bp += buflen - 1;
744	*bp = '\0';
745	fdp = td->td_proc->p_fd;
746	slash_prefixed = 0;
747	FILEDESC_LOCK(fdp);
748	mp_fixme("No vnode locking done!");
749	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
750		if (vp->v_vflag & VV_ROOT) {
751			if (vp->v_mount == NULL) {	/* forced unmount */
752				FILEDESC_UNLOCK(fdp);
753				free(tmpbuf, M_TEMP);
754				return (EBADF);
755			}
756			vp = vp->v_mount->mnt_vnodecovered;
757			continue;
758		}
759		if (vp->v_dd->v_id != vp->v_ddid) {
760			FILEDESC_UNLOCK(fdp);
761			numcwdfail1++;
762			free(tmpbuf, M_TEMP);
763			return (ENOTDIR);
764		}
765		ncp = TAILQ_FIRST(&vp->v_cache_dst);
766		if (!ncp) {
767			FILEDESC_UNLOCK(fdp);
768			numcwdfail2++;
769			free(tmpbuf, M_TEMP);
770			return (ENOENT);
771		}
772		if (ncp->nc_dvp != vp->v_dd) {
773			FILEDESC_UNLOCK(fdp);
774			numcwdfail3++;
775			free(tmpbuf, M_TEMP);
776			return (EBADF);
777		}
778		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
779			if (bp == tmpbuf) {
780				FILEDESC_UNLOCK(fdp);
781				numcwdfail4++;
782				free(tmpbuf, M_TEMP);
783				return (ENOMEM);
784			}
785			*--bp = ncp->nc_name[i];
786		}
787		if (bp == tmpbuf) {
788			FILEDESC_UNLOCK(fdp);
789			numcwdfail4++;
790			free(tmpbuf, M_TEMP);
791			return (ENOMEM);
792		}
793		*--bp = '/';
794		slash_prefixed = 1;
795		vp = vp->v_dd;
796	}
797	FILEDESC_UNLOCK(fdp);
798	if (!slash_prefixed) {
799		if (bp == tmpbuf) {
800			numcwdfail4++;
801			free(tmpbuf, M_TEMP);
802			return (ENOMEM);
803		}
804		*--bp = '/';
805	}
806	numcwdfound++;
807	if (bufseg == UIO_SYSSPACE)
808		bcopy(bp, buf, strlen(bp) + 1);
809	else
810		error = copyout(bp, buf, strlen(bp) + 1);
811	free(tmpbuf, M_TEMP);
812	return (error);
813}
814
/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
/* Redefined: the fullpath statistics are u_int, unlike the u_long ones above. */
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);
834
835/*
836 * Retrieve the full filesystem path that correspond to a vnode from the name
837 * cache (if available)
838 */
839int
840vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
841{
842	char *bp, *buf;
843	int i, slash_prefixed;
844	struct filedesc *fdp;
845	struct namecache *ncp;
846	struct vnode *vp;
847
848	numfullpathcalls++;
849	if (disablefullpath)
850		return (ENODEV);
851	if (vn == NULL)
852		return (EINVAL);
853	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
854	bp = buf + MAXPATHLEN - 1;
855	*bp = '\0';
856	fdp = td->td_proc->p_fd;
857	slash_prefixed = 0;
858	FILEDESC_LOCK(fdp);
859	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
860		ASSERT_VOP_LOCKED(vp, "vn_fullpath");
861		if (vp->v_vflag & VV_ROOT) {
862			if (vp->v_mount == NULL) {	/* forced unmount */
863				FILEDESC_UNLOCK(fdp);
864				free(buf, M_TEMP);
865				return (EBADF);
866			}
867			vp = vp->v_mount->mnt_vnodecovered;
868			continue;
869		}
870		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
871			FILEDESC_UNLOCK(fdp);
872			numfullpathfail1++;
873			free(buf, M_TEMP);
874			return (ENOTDIR);
875		}
876		ncp = TAILQ_FIRST(&vp->v_cache_dst);
877		if (!ncp) {
878			FILEDESC_UNLOCK(fdp);
879			numfullpathfail2++;
880			free(buf, M_TEMP);
881			return (ENOENT);
882		}
883		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
884			FILEDESC_UNLOCK(fdp);
885			numfullpathfail3++;
886			free(buf, M_TEMP);
887			return (EBADF);
888		}
889		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
890			if (bp == buf) {
891				FILEDESC_UNLOCK(fdp);
892				numfullpathfail4++;
893				free(buf, M_TEMP);
894				return (ENOMEM);
895			}
896			*--bp = ncp->nc_name[i];
897		}
898		if (bp == buf) {
899			FILEDESC_UNLOCK(fdp);
900			numfullpathfail4++;
901			free(buf, M_TEMP);
902			return (ENOMEM);
903		}
904		*--bp = '/';
905		slash_prefixed = 1;
906		vp = ncp->nc_dvp;
907	}
908	if (!slash_prefixed) {
909		if (bp == buf) {
910			FILEDESC_UNLOCK(fdp);
911			numfullpathfail4++;
912			free(buf, M_TEMP);
913			return (ENOMEM);
914		}
915		*--bp = '/';
916	}
917	FILEDESC_UNLOCK(fdp);
918	numfullpathfound++;
919	*retbuf = bp;
920	*freebuf = buf;
921	return (0);
922}
923