vfs_cache.c revision 193174
/*-
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/vfs_cache.c 193174 2009-05-31 14:57:43Z kib $");

#include "opt_kdtrace.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *",
    "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to */
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (vp, name) where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 */
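
/*
 * Illustrative userland sketch (not part of the original file) of how
 * the cache key is derived in cache_lookup() and cache_enter() below:
 * the component name is hashed first and the address of the directory
 * vnode is then mixed in, so the same name under different directories
 * lands on different hash chains.  fnv32() re-implements the FNV-1
 * loop from <sys/fnv_hash.h> for the example; the offset basis used
 * here is the standard FNV-1 value, while the kernel uses FNV1_32_INIT
 * from that header.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	FNV_32_PRIME	0x01000193UL

static uint32_t
fnv32(const void *buf, size_t len, uint32_t hval)
{
	const unsigned char *s = buf;

	while (len-- != 0) {
		hval *= FNV_32_PRIME;	/* FNV-1: multiply, then xor */
		hval ^= *s++;
	}
	return (hval);
}

int
main(void)
{
	void *dvp = (void *)0xdeadc0de;	/* stand-in for a directory vnode */
	const char *name = "kernel";
	uint32_t hash;

	hash = fnv32(name, strlen(name), 2166136261UL);
	hash = fnv32(&dvp, sizeof(dvp), hash);	/* mix in the directory */
	printf("chain = hash & nchash = %#x & <mask>\n", hash);
	return (0);
}
#endif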

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* Negative entry LRU list */
static u_long	nchash;			/* size of hash table - 1 (mask) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purge for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static struct rwlock cache_lock;
RW_SYSINIT(vfscache, &cache_lock, "Name Cache");

#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
#define	CACHE_RLOCK()		rw_rlock(&cache_lock)
#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_large;

#define	CACHE_PATH_CUTOFF	35
#define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF \
				    + 1)
#define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX + 1)

#define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
	cache_zone_small : cache_zone_large, M_WAITOK)
#define cache_free(ncp)		do { \
	if (ncp != NULL) \
		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
		    cache_zone_small : cache_zone_large, (ncp)); \
} while (0)
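
/*
 * Illustrative userland sketch (not part of the original file) of the
 * zone selection above: names of up to CACHE_PATH_CUTOFF (35) bytes
 * come from the small zone, anything longer pays for an item sized for
 * NAME_MAX plus the terminating nul.
 */
#if 0
#include <stdio.h>
#include <string.h>

#define	CACHE_PATH_CUTOFF	35

static const char *
zone_for(size_t namelen)
{
	/* Mirrors the cache_alloc()/cache_free() selection above. */
	return (namelen <= CACHE_PATH_CUTOFF ? "S VFS Cache" : "L VFS Cache");
}

int
main(void)
{
	const char *names[] = { "lib", "libexec",
	    "a-component-name-well-over-the-35-byte-cutoff" };
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-46s -> %s\n", names[i], zone_for(strlen(names[i])));
	return (0);
}
#endif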

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
	sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
	&nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics");

static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, char *buf, u_int *buflen);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count;

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		CACHE_RLOCK();
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		CACHE_RUNLOCK();
		error = SYSCTL_OUT(req, &count, sizeof(count));
		if (error)
			return (error);
	}
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
	CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
	"nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CACHE_RLOCK();
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		CACHE_RUNLOCK();
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	pct = (used * 100 * 100) / n_nchash;	/* used chains, in 0.01% units */
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
	CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
	"nchash chain lengths");
#endif

/*
 * cache_zap():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or is just a negative cache entry.
 */
static void
cache_zap(ncp)
	struct namecache *ncp;
{
	struct vnode *vp;

	rw_assert(&cache_lock, RA_WLOCKED);
	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
#ifdef KDTRACE_HOOKS
	if (ncp->nc_vp != NULL) {
		SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp, 0, 0);
	} else {
		SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name, 0, 0, 0);
	}
#endif
	vp = NULL;
	LIST_REMOVE(ncp, nc_hash);
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd)
			ncp->nc_dvp->v_cache_dd = NULL;
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			vp = ncp->nc_dvp;
			numcachehv--;
		}
	}
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd)
			ncp->nc_vp->v_cache_dd = NULL;
	} else {
		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
		numneg--;
	}
	numcache--;
	cache_free(ncp);
	if (vp)
		vdrop(vp);
}

/*
 * Lookup an entry in the cache
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.  If the directory vnode is
 * recycled out from under us due to a forced unmount, a status of
 * ENOENT is returned.
 *
 * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
 * unlocked.  If we're looking up . an extra ref is taken, but the lock is
 * not recursively acquired.
 */

int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;
	int error, ltype, wlocked;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
retry:
	CACHE_RLOCK();
	wlocked = 0;
	numcalls++;
	error = 0;

retry_wlocked:
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
			    dvp, cnp->cn_nameptr);
			dothits++;
			SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
			    *vpp, 0, 0);
			goto success;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			if (dvp->v_cache_dd == NULL) {
				SDT_PROBE(vfs, namecache, lookup, miss, dvp,
				    "..", NULL, 0, 0);
				goto unlock;
			}
			if ((cnp->cn_flags & MAKEENTRY) == 0) {
				if (!wlocked && !CACHE_UPGRADE_LOCK())
					goto wlock;
				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
					cache_zap(dvp->v_cache_dd);
				dvp->v_cache_dd = NULL;
				CACHE_WUNLOCK();
				return (0);
			}
			if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
				*vpp = dvp->v_cache_dd->nc_vp;
			else
				*vpp = dvp->v_cache_dd->nc_dvp;
			/* Return failure if negative entry was found. */
			if (*vpp == NULL) {
				ncp = dvp->v_cache_dd;
				goto negative_success;
			}
			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
			    dvp, cnp->cn_nameptr, *vpp);
			SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
			    *vpp, 0, 0);
			goto success;
		}
	}

	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL, 0, 0);
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		goto unlock;
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		if (!wlocked && !CACHE_UPGRADE_LOCK())
			goto wlock;
		cache_zap(ncp);
		CACHE_WUNLOCK();
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
		    dvp, cnp->cn_nameptr, *vpp, ncp);
		SDT_PROBE(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
		    *vpp, 0, 0);
		goto success;
	}

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		if (!wlocked && !CACHE_UPGRADE_LOCK())
			goto wlock;
		cache_zap(ncp);
		CACHE_WUNLOCK();
		return (0);
	}

	if (!wlocked && !CACHE_UPGRADE_LOCK())
		goto wlock;
	numneghits++;
	/*
	 * We found a "negative" match, so we shift it to the end of
	 * the "negative" cache entries queue to satisfy LRU.  Also,
	 * check to see if the entry is a whiteout; indicate this to
	 * the componentname, if so.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, ncp->nc_name,
	    0, 0, 0);
	CACHE_WUNLOCK();
	return (ENOENT);

wlock:
	/*
	 * We need to update the cache after our lookup, so upgrade to
	 * a write lock and retry the operation.
	 */
	CACHE_RUNLOCK();
	CACHE_WLOCK();
	numupgrades++;
	wlocked = 1;
	goto retry_wlocked;

success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	if (dvp == *vpp) {   /* lookup on "." */
		VREF(*vpp);
		if (wlocked)
			CACHE_WUNLOCK();
		else
			CACHE_RUNLOCK();
		/*
		 * When we lookup "." we still can be asked to lock it
		 * differently...
		 */
		ltype = cnp->cn_lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(*vpp)) {
			if (ltype == LK_EXCLUSIVE) {
				vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
				if ((*vpp)->v_iflag & VI_DOOMED) {
					/* forced unmount */
					vrele(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
			} else
				vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
		}
		return (-1);
	}
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp, 0);
	}
	VI_LOCK(*vpp);
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

unlock:
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	return (0);
}
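
/*
 * Standalone sketch (not part of the original file) of the caller-side
 * protocol documented above cache_lookup(): -1 is a positive hit with
 * *vpp locked and referenced, ENOENT a cached negative entry, and 0 a
 * miss that sends the caller to the filesystem's own lookup, exactly
 * as vfs_cache_lookup() does further down.  cache_lookup() itself is
 * replaced by a stub here.
 */
#if 0
#include <errno.h>
#include <stdio.h>

struct vnode;				/* opaque, pointers only */

static int
stub_cache_lookup(struct vnode *dvp, struct vnode **vpp)
{
	(void)dvp;
	*vpp = NULL;
	return (ENOENT);		/* pretend we hit a negative entry */
}

int
main(void)
{
	struct vnode *vp;
	int status = stub_cache_lookup(NULL, &vp);

	if (status == 0)
		printf("miss: call the filesystem's real lookup\n");
	else if (status == -1)
		printf("hit: *vpp returned locked and ref'd\n");
	else
		printf("negative hit: fail with %d (ENOENT)\n", status);
	return (0);
}
#endif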

/*
 * Add an entry to the cache.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp, *n2;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int flag;
	int hold;
	int zap;
	int len;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
	    ("cache_enter: Adding a doomed vnode"));

	if (!doingcache)
		return;

	/*
	 * Avoid blowout in namecache entries.
	 */
	if (numcache >= desiredvnodes * 2)
		return;

	flag = 0;
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			CACHE_WLOCK();
			/*
			 * If dotdot entry already exists, just retarget it
			 * to new parent vnode, otherwise continue with new
			 * namecache entry allocation.
			 */
			if ((ncp = dvp->v_cache_dd) != NULL &&
			    ncp->nc_flag & NCF_ISDOTDOT) {
				KASSERT(ncp->nc_dvp == dvp,
				    ("wrong isdotdot parent"));
				if (ncp->nc_vp != NULL)
					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
					    ncp, nc_dst);
				else
					TAILQ_REMOVE(&ncneg, ncp, nc_dst);
				if (vp != NULL)
					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
					    ncp, nc_dst);
				else
					TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
				ncp->nc_vp = vp;
				CACHE_WUNLOCK();
				return;
			}
			dvp->v_cache_dd = NULL;
			SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
			    0, 0);
			CACHE_WUNLOCK();
			flag = NCF_ISDOTDOT;
		}
	}

	hold = 0;
	zap = 0;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen);
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_flag = flag;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	CACHE_WLOCK();

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL) {
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		dvp->v_cache_dd = ncp;
	}

	numcache++;
	if (!vp) {
		numneg++;
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
	} else if (vp->v_type == VDIR) {
		if (flag != NCF_ISDOTDOT) {
			if ((n2 = vp->v_cache_dd) != NULL &&
			    (n2->nc_flag & NCF_ISDOTDOT) != 0)
				cache_zap(n2);
			vp->v_cache_dd = ncp;
		}
	} else {
		vp->v_cache_dd = NULL;
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			hold = 1;
			numcachehv++;
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp,
		    0, 0);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
		SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name, 0, 0, 0);
	}
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		zap = 1;
	}
	if (hold)
		vhold(dvp);
	if (zap)
		cache_zap(ncp);
	CACHE_WUNLOCK();
}
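
/*
 * Sketch (not part of the original file) of the trim decision at the
 * end of cache_enter() above: with the default ncnegfactor of 16,
 * negative entries are held to about 1/16th of the whole cache, so
 * with numcache at 32000 the 2001st negative entry evicts the oldest
 * one from the head of the LRU-ordered ncneg queue.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned long ncnegfactor = 16, numcache = 32000, numneg;

	for (numneg = 1999; numneg <= 2001; numneg++)
		printf("numneg=%lu -> %s\n", numneg,
		    numneg * ncnegfactor > numcache ?
		    "zap oldest negative entry" : "keep");
	return (0);
}
#endif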

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);

	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
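
/*
 * Sketch (not part of the original file) of what hashinit() hands
 * back, assuming its historical round-down-to-a-power-of-two sizing:
 * nchash receives table size minus one, which is why NCHHASH() can
 * mask the hash instead of taking a modulus.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned long elements = 2 * 70000;	/* desiredvnodes * 2, say */
	unsigned long hashsize, nchash;

	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;			/* round down to a power of two */
	nchash = hashsize - 1;		/* the mask used by NCHHASH() */
	printf("table size %lu, mask %#lx\n", hashsize, nchash);
	return (0);
}
#endif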

/*
 * Invalidate all entries to a particular vnode.
 */
void
cache_purge(vp)
	struct vnode *vp;
{

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
	CACHE_WLOCK();
	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
	if (vp->v_cache_dd != NULL) {
		KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		cache_zap(vp->v_cache_dd);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	CACHE_WUNLOCK();
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(vp)
	struct vnode *vp;
{
	struct namecache *cp, *ncp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
	CACHE_WLOCK();
	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
		if (cp->nc_vp == NULL)
			cache_zap(cp);
	}
	CACHE_WUNLOCK();
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(mp)
	struct mount *mp;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
	CACHE_WLOCK();
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
			if (ncp->nc_dvp->v_mount == mp)
				cache_zap(ncp);
		}
	}
	CACHE_WUNLOCK();
}

/*
 * Perform canonical checks and cache lookup and pass on to the
 * filesystem through VOP_CACHEDLOOKUP only if needed.
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);
	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
__getcwd(td, uap)
	struct thread *td;
	struct __getcwd_args *uap;
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
}

int
kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error, vfslocked;

	if (disablecwd)
		return (ENODEV);
	if (buflen < 2)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	VREF(cdir);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
	vrele(rdir);
	VFS_UNLOCK_GIANT(vfslocked);
	vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
	vrele(cdir);
	VFS_UNLOCK_GIANT(vfslocked);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}
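
/*
 * Userland sketch (not part of the original file): roughly what
 * libc's getcwd(3) does on its fast path, calling the syscall
 * implemented above and falling back to a directory walk of its own
 * when the name cache cannot produce the path.  The __getcwd
 * prototype is declared by hand here; it is libc-internal, not a
 * public interface.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

extern int __getcwd(char *buf, size_t buflen);	/* libc syscall stub */

int
main(void)
{
	char buf[1024];

	if (__getcwd(buf, sizeof(buf)) == 0)
		printf("cwd via the name cache: %s\n", buf);
	else
		perror("__getcwd");	/* e.g. ENOENT on a cache miss */
	return (0);
}
#endif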

/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

/* These count for kern___getcwd(), too. */
STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available)
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	int error, vfslocked;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
	vrele(rdir);
	VFS_UNLOCK_GIANT(vfslocked);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

/*
 * This function is similar to vn_fullpath, but it attempts to lookup the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	int error;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, char *buf, u_int *buflen)
{
	int error;

	CACHE_RLOCK();
	error = vn_vptocnp_locked(vp, buf, buflen);
	if (error == 0) {
		/*
		 * vn_vptocnp_locked() dropped the hold acquired by
		 * VOP_VPTOCNP immediately after locking the
		 * cache.  Since we are going to drop the cache rlock,
		 * re-hold the result.
		 */
		vhold(*vp);
		CACHE_RUNLOCK();
	}
	return (error);
}

static int
vn_vptocnp_locked(struct vnode **vp, char *buf, u_int *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	int error, vfslocked;

	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			CACHE_RUNLOCK();
			numfullpathfail4++;
			error = ENOMEM;
			SDT_PROBE(vfs, namecache, fullpath, return, error,
			    *vp, NULL, 0, 0);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, *vp, 0, 0);
		*vp = ncp->nc_dvp;
		return (0);
	}
	SDT_PROBE(vfs, namecache, fullpath, miss, *vp, 0, 0, 0, 0);

	vhold(*vp);
	CACHE_RUNLOCK();
	vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
	VOP_UNLOCK(*vp, 0);
	vdrop(*vp);
	VFS_UNLOCK_GIANT(vfslocked);
	if (error) {
		numfullpathfail2++;
		SDT_PROBE(vfs, namecache, fullpath, return, error, *vp,
		    NULL, 0, 0);
		return (error);
	}

	*vp = dvp;
	CACHE_RLOCK();
	if ((*vp)->v_iflag & VI_DOOMED) {
		/* forced unmount */
		CACHE_RUNLOCK();
		vdrop(*vp);
		error = ENOENT;
		SDT_PROBE(vfs, namecache, fullpath, return, error, *vp,
		    NULL, 0, 0);
		return (error);
	}
	vdrop(*vp);

	return (0);
}

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
	int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif

	buflen--;
	buf[buflen] = '\0';
	error = 0;
	slash_prefixed = 0;

	SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
	numfullpathcalls++;
	CACHE_RLOCK();
	if (vp->v_type != VDIR) {
		error = vn_vptocnp_locked(&vp, buf, &buflen);
		if (error)
			return (error);
		if (buflen == 0) {
			CACHE_RUNLOCK();
			return (ENOMEM);
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	while (vp != rdir && vp != rootvnode) {
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
				CACHE_RUNLOCK();
				error = ENOENT;
				break;
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_type != VDIR) {
			CACHE_RUNLOCK();
			numfullpathfail1++;
			error = ENOTDIR;
			break;
		}
		error = vn_vptocnp_locked(&vp, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			CACHE_RUNLOCK();
			error = ENOMEM;
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			CACHE_RUNLOCK();
			numfullpathfail4++;
			SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL, 0, 0);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	numfullpathfound++;
	CACHE_RUNLOCK();

	SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
	    0, 0);
	*retbuf = buf + buflen;
	return (0);
}
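
/*
 * Userland sketch (not part of the original file) of the buffer
 * discipline in vn_fullpath1() above: components are discovered
 * leaf-to-root, so the path is assembled right-to-left from the end
 * of the buffer and the finished string simply begins at buf + buflen.
 */
#if 0
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *components[] = { "vfs_cache.c", "kern", "sys", "usr" };
	char buf[64];
	size_t i, len, buflen = sizeof(buf);

	buf[--buflen] = '\0';
	for (i = 0; i < sizeof(components) / sizeof(components[0]); i++) {
		len = strlen(components[i]);
		buflen -= len;
		memcpy(buf + buflen, components[i], len);
		buf[--buflen] = '/';	/* prefix each component */
	}
	printf("%s\n", buf + buflen);	/* /usr/sys/kern/vfs_cache.c */
	return (0);
}
#endif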

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	int l;

	CACHE_RLOCK();
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		CACHE_RUNLOCK();
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	CACHE_RUNLOCK();
	buf[l] = '\0';
	return (0);
}