vfs_cache.c revision 102870
1/*
2 * Copyright (c) 1989, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Poul-Henning Kamp of the FreeBSD Project.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
37 * $FreeBSD: head/sys/kern/vfs_cache.c 102870 2002-09-02 22:40:30Z iedowse $
38 */
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/mutex.h>
45#include <sys/sysctl.h>
46#include <sys/mount.h>
47#include <sys/vnode.h>
48#include <sys/namei.h>
49#include <sys/malloc.h>
50#include <sys/syscallsubr.h>
51#include <sys/sysproto.h>
52#include <sys/proc.h>
53#include <sys/filedesc.h>
54#include <sys/fnv_hash.h>
55
/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 *
 * Every entry sits on three lists at once: a hash chain (nc_hash), the
 * list of entries hanging off its parent directory vnode (nc_src), and
 * either the list hanging off the vnode it names or, when nc_vp is NULL,
 * the global list of negative entries (nc_dst).
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to */
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	/*
	 * The name bytes are allocated directly behind the structure (see
	 * the malloc() in cache_enter()); exactly nc_nlen bytes are stored
	 * and no terminating NUL is written.
	 */
	char	nc_name[0];		/* segment name */
};
71
72/*
73 * Name caching works as follows:
74 *
75 * Names found by directory scans are retained in a cache
76 * for future reference.  It is managed LRU, so frequently
77 * used names will hang around.  Cache is indexed by hash value
78 * obtained from (vp, name) where vp refers to the directory
79 * containing name.
80 *
81 * If it is a "negative" entry, (i.e. for a name that is known NOT to
82 * exist) the vnode pointer will be NULL.
83 *
84 * Upon reaching the last segment of a path, if the reference
85 * is for DELETE, or NOCACHE is set (rewrite), and the
86 * name is located in the cache, it will be dropped.
87 */
88
/*
 * Structures associated with name caching.
 */
/* Map a hash value to its bucket; nchash is used as a bit mask here. */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* negative entries, oldest first */
static u_long	nchash;			/* hash mask (highest bucket index) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
/* cache_enter() evicts a negative entry when numneg * ncnegfactor > numcache */
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purge for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

/* Cleared through the debug.vfscache sysctl to bypass the cache entirely. */
static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
118
/*
 * The new name cache statistics
 *
 * STATNODE(mode, name, var) exports the u_long at var under the vfs.cache
 * sysctl node.  The counters below are bumped as follows:
 *
 *   numcalls    cache_lookup() calls made while the cache is enabled
 *   dothits     lookups of "."
 *   dotdothits  lookups of ".." (counted whether or not v_dd was usable)
 *   numchecks   hash chain entries examined by cache_lookup()
 *   nummiss     no entry found, MAKEENTRY set
 *   nummisszap  no entry found, MAKEENTRY clear
 *   numposzaps  entry found but MAKEENTRY clear, so it was zapped
 *   numposhits  positive entry found and returned
 *   numnegzaps  negative entry found during CREATE, so it was zapped
 *   numneghits  negative entry found and reported as ENOENT
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

/* The whole nchstats structure, exported as one opaque blob. */
SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
        sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
140
141
142
/* Defined further down; declared here for the functions that precede it. */
static void cache_zap(struct namecache *ncp);

/* Malloc type used for name cache entries and for the hash table. */
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
/* Set on a negative entry that was entered with ISWHITEOUT in cn_flags. */
#define NCF_WHITE	1
151
152/*
153 * Grab an atomic snapshot of the name cache hash chain lengths
154 */
155SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
156
157static int
158sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
159{
160	int error;
161	struct nchashhead *ncpp;
162	struct namecache *ncp;
163	int n_nchash;
164	int count;
165
166	n_nchash = nchash + 1;	/* nchash is max index, not count */
167	if (!req->oldptr)
168		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
169
170	/* Scan hash tables for applicable entries */
171	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
172		count = 0;
173		LIST_FOREACH(ncp, ncpp, nc_hash) {
174			count++;
175		}
176		error = SYSCTL_OUT(req, &count, sizeof(count));
177		if (error)
178			return (error);
179	}
180	return (0);
181}
182SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
183	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
184
185static int
186sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
187{
188	int error;
189	struct nchashhead *ncpp;
190	struct namecache *ncp;
191	int n_nchash;
192	int count, maxlength, used, pct;
193
194	if (!req->oldptr)
195		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
196
197	n_nchash = nchash + 1;	/* nchash is max index, not count */
198	used = 0;
199	maxlength = 0;
200
201	/* Scan hash tables for applicable entries */
202	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
203		count = 0;
204		LIST_FOREACH(ncp, ncpp, nc_hash) {
205			count++;
206		}
207		if (count)
208			used++;
209		if (maxlength < count)
210			maxlength = count;
211	}
212	n_nchash = nchash + 1;
213	pct = (used * 100 * 100) / n_nchash;
214	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
215	if (error)
216		return (error);
217	error = SYSCTL_OUT(req, &used, sizeof(used));
218	if (error)
219		return (error);
220	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
221	if (error)
222		return (error);
223	error = SYSCTL_OUT(req, &pct, sizeof(pct));
224	if (error)
225		return (error);
226	return (0);
227}
228SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
229	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
230
231/*
232 * Delete an entry from its hash list and move it to the front
233 * of the LRU list for immediate reuse.
234 */
235static void
236cache_zap(ncp)
237	struct namecache *ncp;
238{
239	LIST_REMOVE(ncp, nc_hash);
240	LIST_REMOVE(ncp, nc_src);
241	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
242		vdrop(ncp->nc_dvp);
243		numcachehv--;
244	}
245	if (ncp->nc_vp) {
246		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
247	} else {
248		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
249		numneg--;
250	}
251	numcache--;
252	free(ncp, M_VFSCACHE);
253}
254
255/*
256 * cache_leaf_test()
257 *
258 *      Test whether this (directory) vnode's namei cache entry contains
259 *      subdirectories or not.  Used to determine whether the directory is
260 *      a leaf in the namei cache or not.  Note: the directory may still
261 *      contain files in the namei cache.
262 *
263 *      Returns 0 if the directory is a leaf, -1 if it isn't.
264 */
265int
266cache_leaf_test(struct vnode *vp)
267{
268	struct namecache *ncpc;
269
270	for (ncpc = LIST_FIRST(&vp->v_cache_src);
271	     ncpc != NULL;
272	     ncpc = LIST_NEXT(ncpc, nc_src)
273	) {
274		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
275			return(-1);
276	}
277	return(0);
278}
279
280/*
281 * Lookup an entry in the cache
282 *
283 * Lookup is called with dvp pointing to the directory to search,
284 * cnp pointing to the name of the entry being sought. If the lookup
285 * succeeds, the vnode is returned in *vpp, and a status of -1 is
286 * returned. If the lookup determines that the name does not exist
287 * (negative cacheing), a status of ENOENT is returned. If the lookup
288 * fails, a status of zero is returned.
289 */
290
291int
292cache_lookup(dvp, vpp, cnp)
293	struct vnode *dvp;
294	struct vnode **vpp;
295	struct componentname *cnp;
296{
297	struct namecache *ncp;
298	u_int32_t hash;
299
300	if (!doingcache) {
301		cnp->cn_flags &= ~MAKEENTRY;
302		return (0);
303	}
304
305	numcalls++;
306
307	if (cnp->cn_nameptr[0] == '.') {
308		if (cnp->cn_namelen == 1) {
309			*vpp = dvp;
310			dothits++;
311			return (-1);
312		}
313		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
314			dotdothits++;
315			if (dvp->v_dd->v_id != dvp->v_ddid ||
316			    (cnp->cn_flags & MAKEENTRY) == 0) {
317				dvp->v_ddid = 0;
318				return (0);
319			}
320			*vpp = dvp->v_dd;
321			return (-1);
322		}
323	}
324
325	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
326	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
327	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
328		numchecks++;
329		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
330		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
331			break;
332	}
333
334	/* We failed to find an entry */
335	if (ncp == 0) {
336		if ((cnp->cn_flags & MAKEENTRY) == 0) {
337			nummisszap++;
338		} else {
339			nummiss++;
340		}
341		nchstats.ncs_miss++;
342		return (0);
343	}
344
345	/* We don't want to have an entry, so dump it */
346	if ((cnp->cn_flags & MAKEENTRY) == 0) {
347		numposzaps++;
348		nchstats.ncs_badhits++;
349		cache_zap(ncp);
350		return (0);
351	}
352
353	/* We found a "positive" match, return the vnode */
354        if (ncp->nc_vp) {
355		numposhits++;
356		nchstats.ncs_goodhits++;
357		*vpp = ncp->nc_vp;
358		return (-1);
359	}
360
361	/* We found a negative match, and want to create it, so purge */
362	if (cnp->cn_nameiop == CREATE) {
363		numnegzaps++;
364		nchstats.ncs_badhits++;
365		cache_zap(ncp);
366		return (0);
367	}
368
369	numneghits++;
370	/*
371	 * We found a "negative" match, ENOENT notifies client of this match.
372	 * The nc_vpid field records whether this is a whiteout.
373	 */
374	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
375	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
376	nchstats.ncs_neghits++;
377	if (ncp->nc_flag & NCF_WHITE)
378		cnp->cn_flags |= ISWHITEOUT;
379	return (ENOENT);
380}
381
382/*
383 * Add an entry to the cache.
384 */
385void
386cache_enter(dvp, vp, cnp)
387	struct vnode *dvp;
388	struct vnode *vp;
389	struct componentname *cnp;
390{
391	struct namecache *ncp;
392	struct nchashhead *ncpp;
393	u_int32_t hash;
394	int len;
395
396	if (!doingcache)
397		return;
398
399	if (cnp->cn_nameptr[0] == '.') {
400		if (cnp->cn_namelen == 1) {
401			return;
402		}
403		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
404			if (vp) {
405				dvp->v_dd = vp;
406				dvp->v_ddid = vp->v_id;
407			} else {
408				dvp->v_dd = dvp;
409				dvp->v_ddid = 0;
410			}
411			return;
412		}
413	}
414
415	ncp = (struct namecache *)
416		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
417	bzero((char *)ncp, sizeof *ncp);
418	numcache++;
419	if (!vp) {
420		numneg++;
421		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
422	} else if (vp->v_type == VDIR) {
423		vp->v_dd = dvp;
424		vp->v_ddid = dvp->v_id;
425	}
426
427	/*
428	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
429	 * For negative entries, we have to record whether it is a whiteout.
430	 * the whiteout flag is stored in the nc_vpid field which is
431	 * otherwise unused.
432	 */
433	ncp->nc_vp = vp;
434	ncp->nc_dvp = dvp;
435	len = ncp->nc_nlen = cnp->cn_namelen;
436	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
437	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
438	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
439	ncpp = NCHHASH(hash);
440	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
441	if (LIST_EMPTY(&dvp->v_cache_src)) {
442		vhold(dvp);
443		numcachehv++;
444	}
445	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
446	if (vp) {
447		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
448	} else {
449		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
450	}
451	if (numneg * ncnegfactor > numcache) {
452		ncp = TAILQ_FIRST(&ncneg);
453		cache_zap(ncp);
454	}
455}
456
457/*
458 * Name cache initialization, from vfs_init() when we are booting
459 */
460static void
461nchinit(void *dummy __unused)
462{
463
464	TAILQ_INIT(&ncneg);
465	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
466}
467SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
468
469
470/*
471 * Invalidate all entries to a particular vnode.
472 *
473 * Remove all entries in the namecache relating to this vnode and
474 * change the v_id.  We take the v_id from a global counter, since
475 * it becomes a handy sequence number in crash-dumps that way.
476 * No valid vnode will ever have (v_id == 0).
477 *
478 * XXX: Only time and the size of v_id prevents this from failing:
479 * XXX: In theory we should hunt down all (struct vnode*, v_id)
480 * XXX: soft references and nuke them, at least on the global
481 * XXX: v_id wraparound.  The period of resistance can be extended
482 * XXX: by incrementing each vnodes v_id individually instead of
483 * XXX: using the global v_id.
484 */
485
486void
487cache_purge(vp)
488	struct vnode *vp;
489{
490	static u_long nextid;
491
492	while (!LIST_EMPTY(&vp->v_cache_src))
493		cache_zap(LIST_FIRST(&vp->v_cache_src));
494	while (!TAILQ_EMPTY(&vp->v_cache_dst))
495		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
496
497	do
498		nextid++;
499	while (nextid == vp->v_id || !nextid);
500	vp->v_id = nextid;
501	vp->v_dd = vp;
502	vp->v_ddid = 0;
503}
504
505/*
506 * Flush all entries referencing a particular filesystem.
507 *
508 * Since we need to check it anyway, we will flush all the invalid
509 * entries at the same time.
510 */
511void
512cache_purgevfs(mp)
513	struct mount *mp;
514{
515	struct nchashhead *ncpp;
516	struct namecache *ncp, *nnp;
517
518	/* Scan hash tables for applicable entries */
519	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
520		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
521			nnp = LIST_NEXT(ncp, nc_hash);
522			if (ncp->nc_dvp->v_mount == mp) {
523				cache_zap(ncp);
524			}
525		}
526	}
527}
528
/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 *
 * Checks, in order: dvp must be a directory (ENOTDIR); DELETE or RENAME of
 * the last component on a read-only mount fails with EROFS; VOP_ACCESS()
 * must grant VEXEC on dvp.
 *
 * cache_lookup() then decides what happens next:
 *   0       (miss)          the lookup is handed to VOP_CACHEDLOOKUP().
 *   ENOENT  (negative hit)  returned to the caller as is.
 *   -1      (positive hit)  the vnode left in *vpp is acquired with
 *                           VREF()/vget(); 0 is returned if its v_id is
 *                           still the one sampled beforehand.  Otherwise
 *                           the vnode is released, dvp is locked again if
 *                           necessary, and VOP_CACHEDLOOKUP() is called.
 *
 * Whenever this function unlocks dvp it sets PDIRUNLOCK in cnp->cn_flags,
 * and clears it again once dvp has been relocked.
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	/* Snapshot taken at entry; later changes to cn_flags are not seen. */
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
                return (ENOTDIR);

	/* No removing or renaming the final component on a read-only mount. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);

	/* A result of 0 is a cache miss: let the filesystem do the lookup. */
#ifdef LOOKUP_SHARED
	if (!error) {
		/* We do this because the rest of the system now expects to get
		 * a shared lock, which is later upgraded if LOCKSHARED is not
		 * set.  We have so many cases here because of bugs that yield
		 * inconsistent lock states.  This all badly needs to be fixed
		 */
		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
		if (!error) {
			int flock;

			flock = VOP_ISLOCKED(*vpp, td);
			if (flock != LK_EXCLUSIVE) {
				if (flock == 0) {
					if ((flags & ISLASTCN) &&
					    (flags & LOCKSHARED))
						VOP_LOCK(*vpp, LK_SHARED, td);
					else
						VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
				}
			} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
				VOP_LOCK(*vpp, LK_DOWNGRADE, td);
		}
		return (error);
	}
#else
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif

	/* Negative cache hit. */
	if (error == ENOENT)
		return (error);

	/*
	 * Positive cache hit.  Sample v_id before acquiring the vnode so a
	 * change made in the meantime can be detected below.
	 */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* For "..", dvp is unlocked before the parent is acquired. */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif

		/* Relock dvp only if the caller asked to keep it locked. */
		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif
		/*
		 * dvp stays locked only when vget() succeeded and the caller
		 * set LOCKPARENT on the last component.
		 */
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale hit: release the vnode and undo the parent locking. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/* dvp is locked again before the filesystem lookup is called. */
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
#ifdef LOOKUP_SHARED
	error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);

	/* Same lock-state fixup as on the cache-miss path above. */
	if (!error) {
		int flock = 0;

		flock = VOP_ISLOCKED(*vpp, td);
		if (flock != LK_EXCLUSIVE) {
			if (flock == 0) {
				if ((flags & ISLASTCN) && (flags & LOCKSHARED))
					VOP_LOCK(*vpp, LK_SHARED, td);
				else
					VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
			}
		} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			VOP_LOCK(*vpp, LK_DOWNGRADE, td);
	}

	return (error);
#else
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif
}
681
682
/*
 * Argument structure for __getcwd(); only defined here when
 * _SYS_SYSPROTO_H_ has not already been defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;		/* destination buffer for the path */
	u_int	buflen;		/* size of buf in bytes */
};
#endif
689
/*
 * XXX All of these sysctls would probably be more productive dead.
 */
/* When non-zero, kern___getcwd() fails with ENODEV. */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/*
 * Various statistics for the getcwd syscall, as bumped by kern___getcwd():
 *
 *   numcwdcalls  calls made
 *   numcwdfail1  a vnode's v_dd no longer matches its v_ddid
 *   numcwdfail2  a vnode has no name cache entry pointing at it
 *   numcwdfail3  the entry's parent is not the vnode's v_dd
 *   numcwdfail4  the path did not fit in the buffer
 *   numcwdfound  path assembled successfully
 */
static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
704
705/* Implementation of the getcwd syscall */
706int
707__getcwd(td, uap)
708	struct thread *td;
709	struct __getcwd_args *uap;
710{
711
712	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
713}
714
715int
716kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
717{
718	char *bp, *tmpbuf;
719	int error, i, slash_prefixed;
720	struct filedesc *fdp;
721	struct namecache *ncp;
722	struct vnode *vp;
723
724	numcwdcalls++;
725	if (disablecwd)
726		return (ENODEV);
727	if (buflen < 2)
728		return (EINVAL);
729	if (buflen > MAXPATHLEN)
730		buflen = MAXPATHLEN;
731	error = 0;
732	tmpbuf = bp = malloc(buflen, M_TEMP, M_WAITOK);
733	bp += buflen - 1;
734	*bp = '\0';
735	fdp = td->td_proc->p_fd;
736	slash_prefixed = 0;
737	FILEDESC_LOCK(fdp);
738	mp_fixme("No vnode locking done!");
739	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
740		if (vp->v_vflag & VV_ROOT) {
741			if (vp->v_mount == NULL) {	/* forced unmount */
742				FILEDESC_UNLOCK(fdp);
743				free(tmpbuf, M_TEMP);
744				return (EBADF);
745			}
746			vp = vp->v_mount->mnt_vnodecovered;
747			continue;
748		}
749		if (vp->v_dd->v_id != vp->v_ddid) {
750			FILEDESC_UNLOCK(fdp);
751			numcwdfail1++;
752			free(tmpbuf, M_TEMP);
753			return (ENOTDIR);
754		}
755		ncp = TAILQ_FIRST(&vp->v_cache_dst);
756		if (!ncp) {
757			FILEDESC_UNLOCK(fdp);
758			numcwdfail2++;
759			free(tmpbuf, M_TEMP);
760			return (ENOENT);
761		}
762		if (ncp->nc_dvp != vp->v_dd) {
763			FILEDESC_UNLOCK(fdp);
764			numcwdfail3++;
765			free(tmpbuf, M_TEMP);
766			return (EBADF);
767		}
768		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
769			if (bp == tmpbuf) {
770				FILEDESC_UNLOCK(fdp);
771				numcwdfail4++;
772				free(tmpbuf, M_TEMP);
773				return (ENOMEM);
774			}
775			*--bp = ncp->nc_name[i];
776		}
777		if (bp == tmpbuf) {
778			FILEDESC_UNLOCK(fdp);
779			numcwdfail4++;
780			free(tmpbuf, M_TEMP);
781			return (ENOMEM);
782		}
783		*--bp = '/';
784		slash_prefixed = 1;
785		vp = vp->v_dd;
786	}
787	FILEDESC_UNLOCK(fdp);
788	if (!slash_prefixed) {
789		if (bp == tmpbuf) {
790			numcwdfail4++;
791			free(tmpbuf, M_TEMP);
792			return (ENOMEM);
793		}
794		*--bp = '/';
795	}
796	numcwdfound++;
797	if (bufseg == UIO_SYSSPACE)
798		bcopy(bp, buf, strlen(bp) + 1);
799	else
800		error = copyout(bp, buf, strlen(bp) + 1);
801	free(tmpbuf, M_TEMP);
802	return (error);
803}
804
/*
 * Thus begins the fullpath magic.
 */

/*
 * From here on STATNODE(name) both defines a static u_int counter called
 * name and exports it read-only under vfs.cache.  It replaces the
 * three-argument form used earlier in this file.
 */
#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

/* When non-zero, vn_fullpath() fails with ENODEV. */
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

/*
 * Counters bumped by vn_fullpath():
 *
 *   numfullpathcalls  calls made
 *   numfullpathfail1  a vnode's v_dd no longer matches its v_ddid
 *   numfullpathfail2  a vnode has no name cache entry pointing at it
 *   numfullpathfail3  the entry's parent is not the vnode's v_dd
 *   numfullpathfail4  the path did not fit in MAXPATHLEN bytes
 *   numfullpathfound  path assembled successfully
 */
STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);
824
825/*
826 * Retrieve the full filesystem path that correspond to a vnode from the name
827 * cache (if available)
828 */
829int
830vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
831{
832	char *bp, *buf;
833	int i, slash_prefixed;
834	struct filedesc *fdp;
835	struct namecache *ncp;
836	struct vnode *vp;
837
838	numfullpathcalls++;
839	if (disablefullpath)
840		return (ENODEV);
841	if (vn == NULL)
842		return (EINVAL);
843	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
844	bp = buf + MAXPATHLEN - 1;
845	*bp = '\0';
846	fdp = td->td_proc->p_fd;
847	slash_prefixed = 0;
848	FILEDESC_LOCK(fdp);
849	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
850		ASSERT_VOP_LOCKED(vp, "vn_fullpath");
851		if (vp->v_vflag & VV_ROOT) {
852			if (vp->v_mount == NULL) {	/* forced unmount */
853				FILEDESC_UNLOCK(fdp);
854				free(buf, M_TEMP);
855				return (EBADF);
856			}
857			vp = vp->v_mount->mnt_vnodecovered;
858			continue;
859		}
860		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
861			FILEDESC_UNLOCK(fdp);
862			numfullpathfail1++;
863			free(buf, M_TEMP);
864			return (ENOTDIR);
865		}
866		ncp = TAILQ_FIRST(&vp->v_cache_dst);
867		if (!ncp) {
868			FILEDESC_UNLOCK(fdp);
869			numfullpathfail2++;
870			free(buf, M_TEMP);
871			return (ENOENT);
872		}
873		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
874			FILEDESC_UNLOCK(fdp);
875			numfullpathfail3++;
876			free(buf, M_TEMP);
877			return (EBADF);
878		}
879		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
880			if (bp == buf) {
881				FILEDESC_UNLOCK(fdp);
882				numfullpathfail4++;
883				free(buf, M_TEMP);
884				return (ENOMEM);
885			}
886			*--bp = ncp->nc_name[i];
887		}
888		if (bp == buf) {
889			FILEDESC_UNLOCK(fdp);
890			numfullpathfail4++;
891			free(buf, M_TEMP);
892			return (ENOMEM);
893		}
894		*--bp = '/';
895		slash_prefixed = 1;
896		vp = ncp->nc_dvp;
897	}
898	if (!slash_prefixed) {
899		if (bp == buf) {
900			FILEDESC_UNLOCK(fdp);
901			numfullpathfail4++;
902			free(buf, M_TEMP);
903			return (ENOMEM);
904		}
905		*--bp = '/';
906	}
907	FILEDESC_UNLOCK(fdp);
908	numfullpathfound++;
909	*retbuf = bp;
910	*freebuf = buf;
911	return (0);
912}
913