vfs_cache.c revision 190387
1/*-
2 * Copyright (c) 1989, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Poul-Henning Kamp of the FreeBSD Project.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/vfs_cache.c 190387 2009-03-24 18:16:42Z jhb $");
37
38#include "opt_ktrace.h"
39
40#include <sys/param.h>
41#include <sys/filedesc.h>
42#include <sys/fnv_hash.h>
43#include <sys/kernel.h>
44#include <sys/lock.h>
45#include <sys/malloc.h>
46#include <sys/mount.h>
47#include <sys/namei.h>
48#include <sys/proc.h>
49#include <sys/rwlock.h>
50#include <sys/syscallsubr.h>
51#include <sys/sysctl.h>
52#include <sys/sysproto.h>
53#include <sys/systm.h>
54#include <sys/vnode.h>
55#ifdef KTRACE
56#include <sys/ktrace.h>
57#endif
58
59#include <vm/uma.h>
60
61/*
62 * This structure describes the elements in the cache of recent
63 * names looked up by namei.
64 */
65
66struct	namecache {
67	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
68	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
69	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
70	struct	vnode *nc_dvp;		/* vnode of parent of name */
71	struct	vnode *nc_vp;		/* vnode the name refers to */
72	u_char	nc_flag;		/* flag bits */
73	u_char	nc_nlen;		/* length of name */
74	char	nc_name[0];		/* segment name */
75};
76
77/*
78 * Name caching works as follows:
79 *
80 * Names found by directory scans are retained in a cache
81 * for future reference.  It is managed LRU, so frequently
82 * used names will hang around.  Cache is indexed by hash value
83 * obtained from (vp, name) where vp refers to the directory
84 * containing name.
85 *
86 * If it is a "negative" entry, (i.e. for a name that is known NOT to
87 * exist) the vnode pointer will be NULL.
88 *
89 * Upon reaching the last segment of a path, if the reference
90 * is for DELETE, or NOCACHE is set (rewrite), and the
91 * name is located in the cache, it will be dropped.
92 */
93
94/*
95 * Structures associated with name cacheing.
96 */
97#define NCHHASH(hash) \
98	(&nchashtbl[(hash) & nchash])
99static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
100static TAILQ_HEAD(, namecache) ncneg;	/* Hash Table */
101static u_long	nchash;			/* size of hash table */
102SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
103static u_long	ncnegfactor = 16;	/* ratio of negative entries */
104SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
105static u_long	numneg;			/* number of cache entries allocated */
106SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
107static u_long	numcache;		/* number of cache entries allocated */
108SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
109static u_long	numcachehv;		/* number of cache entries with vnodes held */
110SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
111#if 0
112static u_long	numcachepl;		/* number of cache purge for leaf entries */
113SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
114#endif
115struct	nchstats nchstats;		/* cache effectiveness statistics */
116
117static struct rwlock cache_lock;
118RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
119
120#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
121#define	CACHE_RLOCK()		rw_rlock(&cache_lock)
122#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
123#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
124#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
125
126/*
127 * UMA zones for the VFS cache.
128 *
129 * The small cache is used for entries with short names, which are the
130 * most common.  The large cache is used for entries which are too big to
131 * fit in the small cache.
132 */
133static uma_zone_t cache_zone_small;
134static uma_zone_t cache_zone_large;
135
136#define	CACHE_PATH_CUTOFF	32
137#define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF)
138#define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX)
139
140#define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
141	cache_zone_small : cache_zone_large, M_WAITOK)
142#define cache_free(ncp)		do { \
143	if (ncp != NULL) \
144		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
145		    cache_zone_small : cache_zone_large, (ncp)); \
146} while (0)
147
148static int	doingcache = 1;		/* 1 => enable the cache */
149SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
150
151/* Export size information to userland */
152SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
153	sizeof(struct namecache), "");
154
155/*
156 * The new name cache statistics
157 */
158static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
159#define STATNODE(mode, name, var) \
160	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
161STATNODE(CTLFLAG_RD, numneg, &numneg);
162STATNODE(CTLFLAG_RD, numcache, &numcache);
163static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
164static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
165static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
166static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
167static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
168static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
169static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
170static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
171static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
172static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
173static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades);
174
175SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
176	&nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
177
178
179
180static void cache_zap(struct namecache *ncp);
181static int vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen);
182static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
183    char *buf, char **retbuf, u_int buflen);
184
185static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
186
187/*
188 * Flags in namecache.nc_flag
189 */
190#define NCF_WHITE	1
191
192#ifdef DIAGNOSTIC
193/*
194 * Grab an atomic snapshot of the name cache hash chain lengths
195 */
196SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
197
198static int
199sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
200{
201	int error;
202	struct nchashhead *ncpp;
203	struct namecache *ncp;
204	int n_nchash;
205	int count;
206
207	n_nchash = nchash + 1;	/* nchash is max index, not count */
208	if (!req->oldptr)
209		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
210
211	/* Scan hash tables for applicable entries */
212	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
213		CACHE_RLOCK();
214		count = 0;
215		LIST_FOREACH(ncp, ncpp, nc_hash) {
216			count++;
217		}
218		CACHE_RUNLOCK();
219		error = SYSCTL_OUT(req, &count, sizeof(count));
220		if (error)
221			return (error);
222	}
223	return (0);
224}
225SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
226	CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
227	"nchash chain lengths");
228
229static int
230sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
231{
232	int error;
233	struct nchashhead *ncpp;
234	struct namecache *ncp;
235	int n_nchash;
236	int count, maxlength, used, pct;
237
238	if (!req->oldptr)
239		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
240
241	n_nchash = nchash + 1;	/* nchash is max index, not count */
242	used = 0;
243	maxlength = 0;
244
245	/* Scan hash tables for applicable entries */
246	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
247		count = 0;
248		CACHE_RLOCK();
249		LIST_FOREACH(ncp, ncpp, nc_hash) {
250			count++;
251		}
252		CACHE_RUNLOCK();
253		if (count)
254			used++;
255		if (maxlength < count)
256			maxlength = count;
257	}
258	n_nchash = nchash + 1;
259	pct = (used * 100 * 100) / n_nchash;
260	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
261	if (error)
262		return (error);
263	error = SYSCTL_OUT(req, &used, sizeof(used));
264	if (error)
265		return (error);
266	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
267	if (error)
268		return (error);
269	error = SYSCTL_OUT(req, &pct, sizeof(pct));
270	if (error)
271		return (error);
272	return (0);
273}
274SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
275	CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
276	"nchash chain lengths");
277#endif
278
279/*
280 * cache_zap():
281 *
282 *   Removes a namecache entry from cache, whether it contains an actual
283 *   pointer to a vnode or if it is just a negative cache entry.
284 */
285static void
286cache_zap(ncp)
287	struct namecache *ncp;
288{
289	struct vnode *vp;
290
291	rw_assert(&cache_lock, RA_WLOCKED);
292	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
293	vp = NULL;
294	LIST_REMOVE(ncp, nc_hash);
295	LIST_REMOVE(ncp, nc_src);
296	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
297		vp = ncp->nc_dvp;
298		numcachehv--;
299	}
300	if (ncp->nc_vp) {
301		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
302		ncp->nc_vp->v_dd = NULL;
303	} else {
304		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
305		numneg--;
306	}
307	numcache--;
308	cache_free(ncp);
309	if (vp)
310		vdrop(vp);
311}
312
313/*
314 * Lookup an entry in the cache
315 *
316 * Lookup is called with dvp pointing to the directory to search,
317 * cnp pointing to the name of the entry being sought. If the lookup
318 * succeeds, the vnode is returned in *vpp, and a status of -1 is
319 * returned. If the lookup determines that the name does not exist
320 * (negative cacheing), a status of ENOENT is returned. If the lookup
321 * fails, a status of zero is returned.  If the directory vnode is
322 * recycled out from under us due to a forced unmount, a status of
323 * ENOENT is returned.
324 *
325 * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
326 * unlocked.  If we're looking up . an extra ref is taken, but the lock is
327 * not recursively acquired.
328 */
329
330int
331cache_lookup(dvp, vpp, cnp)
332	struct vnode *dvp;
333	struct vnode **vpp;
334	struct componentname *cnp;
335{
336	struct namecache *ncp;
337	u_int32_t hash;
338	int error, ltype, wlocked;
339
340	if (!doingcache) {
341		cnp->cn_flags &= ~MAKEENTRY;
342		return (0);
343	}
344retry:
345	CACHE_RLOCK();
346	wlocked = 0;
347	numcalls++;
348	error = 0;
349
350retry_wlocked:
351	if (cnp->cn_nameptr[0] == '.') {
352		if (cnp->cn_namelen == 1) {
353			*vpp = dvp;
354			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
355			    dvp, cnp->cn_nameptr);
356			dothits++;
357			goto success;
358		}
359		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
360			dotdothits++;
361			if (dvp->v_dd == NULL ||
362			    (cnp->cn_flags & MAKEENTRY) == 0) {
363				goto unlock;
364			}
365			*vpp = dvp->v_dd;
366			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
367			    dvp, cnp->cn_nameptr, *vpp);
368			goto success;
369		}
370	}
371
372	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
373	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
374	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
375		numchecks++;
376		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
377		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
378			break;
379	}
380
381	/* We failed to find an entry */
382	if (ncp == NULL) {
383		if ((cnp->cn_flags & MAKEENTRY) == 0) {
384			nummisszap++;
385		} else {
386			nummiss++;
387		}
388		nchstats.ncs_miss++;
389		goto unlock;
390	}
391
392	/* We don't want to have an entry, so dump it */
393	if ((cnp->cn_flags & MAKEENTRY) == 0) {
394		numposzaps++;
395		nchstats.ncs_badhits++;
396		if (!wlocked && !CACHE_UPGRADE_LOCK())
397			goto wlock;
398		cache_zap(ncp);
399		CACHE_WUNLOCK();
400		return (0);
401	}
402
403	/* We found a "positive" match, return the vnode */
404	if (ncp->nc_vp) {
405		numposhits++;
406		nchstats.ncs_goodhits++;
407		*vpp = ncp->nc_vp;
408		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
409		    dvp, cnp->cn_nameptr, *vpp, ncp);
410		goto success;
411	}
412
413	/* We found a negative match, and want to create it, so purge */
414	if (cnp->cn_nameiop == CREATE) {
415		numnegzaps++;
416		nchstats.ncs_badhits++;
417		if (!wlocked && !CACHE_UPGRADE_LOCK())
418			goto wlock;
419		cache_zap(ncp);
420		CACHE_WUNLOCK();
421		return (0);
422	}
423
424	if (!wlocked && !CACHE_UPGRADE_LOCK())
425		goto wlock;
426	numneghits++;
427	/*
428	 * We found a "negative" match, so we shift it to the end of
429	 * the "negative" cache entries queue to satisfy LRU.  Also,
430	 * check to see if the entry is a whiteout; indicate this to
431	 * the componentname, if so.
432	 */
433	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
434	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
435	nchstats.ncs_neghits++;
436	if (ncp->nc_flag & NCF_WHITE)
437		cnp->cn_flags |= ISWHITEOUT;
438	CACHE_WUNLOCK();
439	return (ENOENT);
440
441wlock:
442	/*
443	 * We need to update the cache after our lookup, so upgrade to
444	 * a write lock and retry the operation.
445	 */
446	CACHE_RUNLOCK();
447	CACHE_WLOCK();
448	numupgrades++;
449	wlocked = 1;
450	goto retry_wlocked;
451
452success:
453	/*
454	 * On success we return a locked and ref'd vnode as per the lookup
455	 * protocol.
456	 */
457	if (dvp == *vpp) {   /* lookup on "." */
458		VREF(*vpp);
459		if (wlocked)
460			CACHE_WUNLOCK();
461		else
462			CACHE_RUNLOCK();
463		/*
464		 * When we lookup "." we still can be asked to lock it
465		 * differently...
466		 */
467		ltype = cnp->cn_lkflags & LK_TYPE_MASK;
468		if (ltype != VOP_ISLOCKED(*vpp)) {
469			if (ltype == LK_EXCLUSIVE) {
470				vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
471				if ((*vpp)->v_iflag & VI_DOOMED) {
472					/* forced unmount */
473					vrele(*vpp);
474					*vpp = NULL;
475					return (ENOENT);
476				}
477			} else
478				vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
479		}
480		return (-1);
481	}
482	ltype = 0;	/* silence gcc warning */
483	if (cnp->cn_flags & ISDOTDOT) {
484		ltype = VOP_ISLOCKED(dvp);
485		VOP_UNLOCK(dvp, 0);
486	}
487	VI_LOCK(*vpp);
488	if (wlocked)
489		CACHE_WUNLOCK();
490	else
491		CACHE_RUNLOCK();
492	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
493	if (cnp->cn_flags & ISDOTDOT)
494		vn_lock(dvp, ltype | LK_RETRY);
495	if (error) {
496		*vpp = NULL;
497		goto retry;
498	}
499	if ((cnp->cn_flags & ISLASTCN) &&
500	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
501		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
502	}
503	return (-1);
504
505unlock:
506	if (wlocked)
507		CACHE_WUNLOCK();
508	else
509		CACHE_RUNLOCK();
510	return (0);
511}
512
513/*
514 * Add an entry to the cache.
515 */
516void
517cache_enter(dvp, vp, cnp)
518	struct vnode *dvp;
519	struct vnode *vp;
520	struct componentname *cnp;
521{
522	struct namecache *ncp, *n2;
523	struct nchashhead *ncpp;
524	u_int32_t hash;
525	int hold;
526	int zap;
527	int len;
528
529	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
530	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
531	    ("cahe_enter: Adding a doomed vnode"));
532
533	if (!doingcache)
534		return;
535
536	/*
537	 * Avoid blowout in namecache entries.
538	 */
539	if (numcache >= desiredvnodes * 2)
540		return;
541
542	if (cnp->cn_nameptr[0] == '.') {
543		if (cnp->cn_namelen == 1) {
544			return;
545		}
546		/*
547		 * For dotdot lookups only cache the v_dd pointer if the
548		 * directory has a link back to its parent via v_cache_dst.
549		 * Without this an unlinked directory would keep a soft
550		 * reference to its parent which could not be NULLd at
551		 * cache_purge() time.
552		 */
553		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
554			CACHE_WLOCK();
555			if (!TAILQ_EMPTY(&dvp->v_cache_dst))
556				dvp->v_dd = vp;
557			CACHE_WUNLOCK();
558			return;
559		}
560	}
561
562	hold = 0;
563	zap = 0;
564
565	/*
566	 * Calculate the hash key and setup as much of the new
567	 * namecache entry as possible before acquiring the lock.
568	 */
569	ncp = cache_alloc(cnp->cn_namelen);
570	ncp->nc_vp = vp;
571	ncp->nc_dvp = dvp;
572	len = ncp->nc_nlen = cnp->cn_namelen;
573	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
574	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
575	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
576	CACHE_WLOCK();
577
578	/*
579	 * See if this vnode or negative entry is already in the cache
580	 * with this name.  This can happen with concurrent lookups of
581	 * the same path name.
582	 */
583	ncpp = NCHHASH(hash);
584	LIST_FOREACH(n2, ncpp, nc_hash) {
585		if (n2->nc_dvp == dvp &&
586		    n2->nc_nlen == cnp->cn_namelen &&
587		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
588			CACHE_WUNLOCK();
589			cache_free(ncp);
590			return;
591		}
592	}
593
594	numcache++;
595	if (!vp) {
596		numneg++;
597		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
598	} else if (vp->v_type == VDIR) {
599		vp->v_dd = dvp;
600	} else {
601		vp->v_dd = NULL;
602	}
603
604	/*
605	 * Insert the new namecache entry into the appropriate chain
606	 * within the cache entries table.
607	 */
608	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
609	if (LIST_EMPTY(&dvp->v_cache_src)) {
610		hold = 1;
611		numcachehv++;
612	}
613	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
614	/*
615	 * If the entry is "negative", we place it into the
616	 * "negative" cache queue, otherwise, we place it into the
617	 * destination vnode's cache entries queue.
618	 */
619	if (vp) {
620		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
621	} else {
622		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
623	}
624	if (numneg * ncnegfactor > numcache) {
625		ncp = TAILQ_FIRST(&ncneg);
626		zap = 1;
627	}
628	if (hold)
629		vhold(dvp);
630	if (zap)
631		cache_zap(ncp);
632	CACHE_WUNLOCK();
633}
634
635/*
636 * Name cache initialization, from vfs_init() when we are booting
637 */
638static void
639nchinit(void *dummy __unused)
640{
641
642	TAILQ_INIT(&ncneg);
643
644	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
645	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
646	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
647	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
648
649	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
650}
651SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
652
653
654/*
655 * Invalidate all entries to a particular vnode.
656 */
657void
658cache_purge(vp)
659	struct vnode *vp;
660{
661
662	CTR1(KTR_VFS, "cache_purge(%p)", vp);
663	CACHE_WLOCK();
664	while (!LIST_EMPTY(&vp->v_cache_src))
665		cache_zap(LIST_FIRST(&vp->v_cache_src));
666	while (!TAILQ_EMPTY(&vp->v_cache_dst))
667		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
668	vp->v_dd = NULL;
669	CACHE_WUNLOCK();
670}
671
672/*
673 * Invalidate all negative entries for a particular directory vnode.
674 */
675void
676cache_purge_negative(vp)
677	struct vnode *vp;
678{
679	struct namecache *cp, *ncp;
680
681	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
682	CACHE_WLOCK();
683	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
684		if (cp->nc_vp == NULL)
685			cache_zap(cp);
686	}
687	CACHE_WUNLOCK();
688}
689
690/*
691 * Flush all entries referencing a particular filesystem.
692 */
693void
694cache_purgevfs(mp)
695	struct mount *mp;
696{
697	struct nchashhead *ncpp;
698	struct namecache *ncp, *nnp;
699
700	/* Scan hash tables for applicable entries */
701	CACHE_WLOCK();
702	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
703		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
704			if (ncp->nc_dvp->v_mount == mp)
705				cache_zap(ncp);
706		}
707	}
708	CACHE_WUNLOCK();
709}
710
711/*
712 * Perform canonical checks and cache lookup and pass on to filesystem
713 * through the vop_cachedlookup only if needed.
714 */
715
716int
717vfs_cache_lookup(ap)
718	struct vop_lookup_args /* {
719		struct vnode *a_dvp;
720		struct vnode **a_vpp;
721		struct componentname *a_cnp;
722	} */ *ap;
723{
724	struct vnode *dvp;
725	int error;
726	struct vnode **vpp = ap->a_vpp;
727	struct componentname *cnp = ap->a_cnp;
728	struct ucred *cred = cnp->cn_cred;
729	int flags = cnp->cn_flags;
730	struct thread *td = cnp->cn_thread;
731
732	*vpp = NULL;
733	dvp = ap->a_dvp;
734
735	if (dvp->v_type != VDIR)
736		return (ENOTDIR);
737
738	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
739	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
740		return (EROFS);
741
742	error = VOP_ACCESS(dvp, VEXEC, cred, td);
743	if (error)
744		return (error);
745
746	error = cache_lookup(dvp, vpp, cnp);
747	if (error == 0)
748		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
749	if (error == -1)
750		return (0);
751	return (error);
752}
753
754
755#ifndef _SYS_SYSPROTO_H_
756struct  __getcwd_args {
757	u_char	*buf;
758	u_int	buflen;
759};
760#endif
761
762/*
763 * XXX All of these sysctls would probably be more productive dead.
764 */
765static int disablecwd;
766SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
767   "Disable the getcwd syscall");
768
769/* Implementation of the getcwd syscall. */
770int
771__getcwd(td, uap)
772	struct thread *td;
773	struct __getcwd_args *uap;
774{
775
776	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
777}
778
779int
780kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
781{
782	char *bp, *tmpbuf;
783	struct filedesc *fdp;
784	struct vnode *cdir, *rdir;
785	int error, vfslocked;
786
787	if (disablecwd)
788		return (ENODEV);
789	if (buflen < 2)
790		return (EINVAL);
791	if (buflen > MAXPATHLEN)
792		buflen = MAXPATHLEN;
793
794	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
795	fdp = td->td_proc->p_fd;
796	FILEDESC_SLOCK(fdp);
797	cdir = fdp->fd_cdir;
798	VREF(cdir);
799	rdir = fdp->fd_rdir;
800	VREF(rdir);
801	FILEDESC_SUNLOCK(fdp);
802	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
803	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
804	vrele(rdir);
805	VFS_UNLOCK_GIANT(vfslocked);
806	vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
807	vrele(cdir);
808	VFS_UNLOCK_GIANT(vfslocked);
809
810	if (!error) {
811		if (bufseg == UIO_SYSSPACE)
812			bcopy(bp, buf, strlen(bp) + 1);
813		else
814			error = copyout(bp, buf, strlen(bp) + 1);
815#ifdef KTRACE
816	if (KTRPOINT(curthread, KTR_NAMEI))
817		ktrnamei(bp);
818#endif
819	}
820	free(tmpbuf, M_TEMP);
821	return (error);
822}
823
824/*
825 * Thus begins the fullpath magic.
826 */
827
828#undef STATNODE
829#define STATNODE(name)							\
830	static u_int name;						\
831	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
832
833static int disablefullpath;
834SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
835	"Disable the vn_fullpath function");
836
837/* These count for kern___getcwd(), too. */
838STATNODE(numfullpathcalls);
839STATNODE(numfullpathfail1);
840STATNODE(numfullpathfail2);
841STATNODE(numfullpathfail4);
842STATNODE(numfullpathfound);
843
844/*
845 * Retrieve the full filesystem path that correspond to a vnode from the name
846 * cache (if available)
847 */
848int
849vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
850{
851	char *buf;
852	struct filedesc *fdp;
853	struct vnode *rdir;
854	int error, vfslocked;
855
856	if (disablefullpath)
857		return (ENODEV);
858	if (vn == NULL)
859		return (EINVAL);
860
861	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
862	fdp = td->td_proc->p_fd;
863	FILEDESC_SLOCK(fdp);
864	rdir = fdp->fd_rdir;
865	VREF(rdir);
866	FILEDESC_SUNLOCK(fdp);
867	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
868	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
869	vrele(rdir);
870	VFS_UNLOCK_GIANT(vfslocked);
871
872	if (!error)
873		*freebuf = buf;
874	else
875		free(buf, M_TEMP);
876	return (error);
877}
878
879/*
880 * This function is similar to vn_fullpath, but it attempts to lookup the
881 * pathname relative to the global root mount point.  This is required for the
882 * auditing sub-system, as audited pathnames must be absolute, relative to the
883 * global root mount point.
884 */
885int
886vn_fullpath_global(struct thread *td, struct vnode *vn,
887    char **retbuf, char **freebuf)
888{
889	char *buf;
890	int error;
891
892	if (disablefullpath)
893		return (ENODEV);
894	if (vn == NULL)
895		return (EINVAL);
896	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
897	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
898	if (!error)
899		*freebuf = buf;
900	else
901		free(buf, M_TEMP);
902	return (error);
903}
904
905static int
906vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen)
907{
908	struct vnode *dvp;
909	int error, vfslocked;
910
911	vhold(*vp);
912	CACHE_RUNLOCK();
913	vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
914	vn_lock(*vp, LK_SHARED | LK_RETRY);
915	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
916	VOP_UNLOCK(*vp, 0);
917	vdrop(*vp);
918	VFS_UNLOCK_GIANT(vfslocked);
919	if (error) {
920		numfullpathfail2++;
921		return (error);
922	}
923	*bp = buf + *buflen;
924	*vp = dvp;
925	CACHE_RLOCK();
926	if ((*vp)->v_iflag & VI_DOOMED) {
927		/* forced unmount */
928		CACHE_RUNLOCK();
929		vdrop(*vp);
930		return (ENOENT);
931	}
932	vdrop(*vp);
933
934	return (0);
935}
936
937/*
938 * The magic behind kern___getcwd() and vn_fullpath().
939 */
940static int
941vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
942    char *buf, char **retbuf, u_int buflen)
943{
944	char *bp;
945	int error, i, slash_prefixed;
946	struct namecache *ncp;
947
948	buflen--;
949	bp = buf + buflen;
950	*bp = '\0';
951	error = 0;
952	slash_prefixed = 0;
953
954	CACHE_RLOCK();
955	numfullpathcalls++;
956	if (vp->v_type != VDIR) {
957		ncp = TAILQ_FIRST(&vp->v_cache_dst);
958		if (ncp != NULL) {
959			buflen -= ncp->nc_nlen;
960			for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
961				*--bp = ncp->nc_name[i];
962			if (bp == buf) {
963				numfullpathfail4++;
964				CACHE_RUNLOCK();
965				return (ENOMEM);
966			}
967			vp = ncp->nc_dvp;
968		} else {
969			error = vn_vptocnp(&vp, &bp, buf, &buflen);
970			if (error)
971				return (error);
972		}
973		if (buflen <= 0) {
974			numfullpathfail4++;
975			CACHE_RUNLOCK();
976			return (ENOMEM);
977		}
978		*--bp = '/';
979		buflen--;
980		slash_prefixed = 1;
981	}
982	while (vp != rdir && vp != rootvnode) {
983		if (vp->v_vflag & VV_ROOT) {
984			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
985				CACHE_RUNLOCK();
986				error = ENOENT;
987				break;
988			}
989			vp = vp->v_mount->mnt_vnodecovered;
990			continue;
991		}
992		if (vp->v_type != VDIR) {
993			numfullpathfail1++;
994			CACHE_RUNLOCK();
995			error = ENOTDIR;
996			break;
997		}
998		ncp = TAILQ_FIRST(&vp->v_cache_dst);
999		if (ncp != NULL) {
1000			MPASS(vp->v_dd == NULL || ncp->nc_dvp == vp->v_dd);
1001			buflen -= ncp->nc_nlen;
1002			for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
1003				*--bp = ncp->nc_name[i];
1004			if (bp == buf) {
1005				numfullpathfail4++;
1006				CACHE_RUNLOCK();
1007				error = ENOMEM;
1008				break;
1009			}
1010			vp = ncp->nc_dvp;
1011		} else {
1012			error = vn_vptocnp(&vp, &bp, buf, &buflen);
1013			if (error)
1014				break;
1015		}
1016		if (buflen <= 0) {
1017			numfullpathfail4++;
1018			CACHE_RUNLOCK();
1019			error = ENOMEM;
1020			break;
1021		}
1022		*--bp = '/';
1023		buflen--;
1024		slash_prefixed = 1;
1025	}
1026	if (error)
1027		return (error);
1028	if (!slash_prefixed) {
1029		if (bp == buf) {
1030			numfullpathfail4++;
1031			CACHE_RUNLOCK();
1032			return (ENOMEM);
1033		} else
1034			*--bp = '/';
1035	}
1036	numfullpathfound++;
1037	CACHE_RUNLOCK();
1038
1039	*retbuf = bp;
1040	return (0);
1041}
1042
1043int
1044vn_commname(struct vnode *vp, char *buf, u_int buflen)
1045{
1046	struct namecache *ncp;
1047	int l;
1048
1049	CACHE_RLOCK();
1050	ncp = TAILQ_FIRST(&vp->v_cache_dst);
1051	if (!ncp) {
1052		CACHE_RUNLOCK();
1053		return (ENOENT);
1054	}
1055	l = min(ncp->nc_nlen, buflen - 1);
1056	memcpy(buf, ncp->nc_name, l);
1057	CACHE_RUNLOCK();
1058	buf[l] = '\0';
1059	return (0);
1060}
1061