1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD$");
39
40#include "opt_ddb.h"
41#include "opt_ktrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/counter.h>
46#include <sys/filedesc.h>
47#include <sys/fnv_hash.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/malloc.h>
51#include <sys/fcntl.h>
52#include <sys/mount.h>
53#include <sys/namei.h>
54#include <sys/proc.h>
55#include <sys/rwlock.h>
56#include <sys/sdt.h>
57#include <sys/smp.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysctl.h>
60#include <sys/sysproto.h>
61#include <sys/vnode.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65
66#ifdef DDB
67#include <ddb/ddb.h>
68#endif
69
70#include <vm/uma.h>
71
72SDT_PROVIDER_DECLARE(vfs);
73SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
74    "struct vnode *");
75SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
76    "char *");
77SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
78SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
79    "char *", "struct vnode *");
80SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
81SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
82    "struct vnode *", "char *");
83SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
84    "struct vnode *");
85SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
86    "struct vnode *", "char *");
87SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
88    "char *");
89SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
90SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
91SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
92SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
93    "struct vnode *");
94SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
95    "char *");
96SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
97    "char *");
98
99/*
100 * This structure describes the elements in the cache of recent
101 * names looked up by namei.
102 */
103
104struct	namecache {
105	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
106	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
107	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
108	struct	vnode *nc_dvp;		/* vnode of parent of name */
109	union {
110		struct	vnode *nu_vp;	/* vnode the name refers to */
111	} n_un;
112	u_char	nc_flag;		/* flag bits */
113	u_char	nc_nlen;		/* length of name */
114	char	nc_name[0];		/* segment name + nul */
115};
116
117/*
118 * struct namecache_ts is used in place of struct namecache when time(s)
119 * need to be stored.  It embeds struct namecache as its last member
120 * (nc_nc) rather than repeating its layout; the containing structure is
121 * recovered with __containerof().  The nc_dotdottime field is used when a
122 * cache entry maps both a non-dotdot directory name and dotdot for the
123 * directory's parent.
124 *
125 * See below for the alignment requirement.
126 */
127struct	namecache_ts {
128	struct	timespec nc_time;	/* timespec provided by fs */
129	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
130	int	nc_ticks;		/* ticks value when entry was added */
131	struct namecache nc_nc;
132};
133
134/*
135 * At least mips n32 performs 64-bit accesses to timespec as found
136 * in namecache_ts and requires them to be aligned.  Since other platforms
137 * may have the same requirement, pay the small cost and enforce the
138 * alignment for everyone.  Note this is a nop on 64-bit platforms.
139 */
140#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
141
142#define	nc_vp		n_un.nu_vp
143
144/*
145 * Flags in namecache.nc_flag
146 */
147#define NCF_WHITE	0x01
148#define NCF_ISDOTDOT	0x02
149#define	NCF_TS		0x04
150#define	NCF_DTS		0x08
151#define	NCF_DVDROP	0x10
152#define	NCF_NEGATIVE	0x20
153#define	NCF_HOTNEGATIVE	0x40
154
155/*
156 * Name caching works as follows:
157 *
158 * Names found by directory scans are retained in a cache
159 * for future reference.  It is managed LRU, so frequently
160 * used names will hang around.  The cache is indexed by a hash value
161 * obtained from (dvp, name), where dvp refers to the directory
162 * containing name.
163 *
164 * If it is a "negative" entry (i.e. for a name that is known NOT to
165 * exist), the vnode pointer will be NULL.
166 *
167 * Upon reaching the last segment of a path, if the reference
168 * is for DELETE, or NOCACHE is set (rewrite), and the
169 * name is located in the cache, it will be dropped.
170 *
171 * These locks are used (in the order in which they can be taken):
172 * NAME		TYPE	ROLE
173 * vnodelock	mtx	vnode lists and v_cache_dd field protection
174 * bucketlock	rwlock	for access to given set of hash buckets
175 * neglist	mtx	negative entry LRU management
176 *
177 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
178 * shrinking the LRU list.
179 *
180 * It is legal to take multiple vnodelock and bucketlock locks. The locking
181 * order is lower address first. Both are recursive.
182 *
183 * "." lookups are lockless.
184 *
185 * ".." and vnode -> name lookups require vnodelock.
186 *
187 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
188 *
189 * Insertions and removals of entries require involved vnodes and bucketlocks
190 * to be write-locked to prevent other threads from seeing the entry.
191 *
192 * Some lookups result in removal of the found entry (e.g. getting rid of a
193 * negative entry with the intent to create a positive one), which poses a
194 * problem when multiple threads attempt the removal at the same time.
195 * Similarly, two different threads can purge two different vnodes and try to
196 * remove the same name.
197 *
198 * If the already held vnode lock is lower than the second required lock, we
199 * can just take the other lock.  Otherwise that could deadlock, so we trylock
200 * it and, on failure, unlock the first lock, relock in order and revalidate.
201 */
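
/*
 * A minimal sketch of the trylock-and-relock dance described above (comment
 * only, not compiled; "held" is the lock already owned and "needed" the one
 * still required; cache_zap_locked_vnode() and cache_zap_unlocked_bucket()
 * implement the real thing):
 *
 *	if (held <= needed) {
 *		mtx_lock(needed);
 *	} else if (!mtx_trylock(needed)) {
 *		mtx_unlock(held);
 *		mtx_lock(needed);
 *		mtx_lock(held);
 *		... revalidate the entry, it may have been zapped meanwhile ...
 *	}
 */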
202
203/*
204 * Structures associated with name caching.
205 */
206#define NCHHASH(hash) \
207	(&nchashtbl[(hash) & nchash])
208static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
209static u_long __read_mostly	nchash;			/* size of hash table */
210SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
211    "Size of namecache hash table");
212static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
213SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
214    "Ratio of negative namecache entries");
215static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
216static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
217u_int ncsizefactor = 2;
218SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
219    "Size factor for namecache");
220static u_int __read_mostly	ncpurgeminvnodes;
221SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
222    "Number of vnodes below which purgevfs ignores the request");
223static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
224
225struct nchstats	nchstats;		/* cache effectiveness statistics */
226
227static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
228static int	shrink_list_turn;
229
230struct neglist {
231	struct mtx		nl_lock;
232	TAILQ_HEAD(, namecache) nl_list;
233} __aligned(CACHE_LINE_SIZE);
234
235static struct neglist __read_mostly	*neglists;
236static struct neglist ncneg_hot;
237static u_long numhotneg;
238
239#define ncneghash	3
240#define	numneglists	(ncneghash + 1)
241static inline struct neglist *
242NCP2NEGLIST(struct namecache *ncp)
243{
244
245	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
246}
247
248#define	numbucketlocks (ncbuckethash + 1)
249static u_int __read_mostly  ncbuckethash;
250static struct rwlock_padalign __read_mostly  *bucketlocks;
251#define	HASH2BUCKETLOCK(hash) \
252	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
253
254#define	numvnodelocks (ncvnodehash + 1)
255static u_int __read_mostly  ncvnodehash;
256static struct mtx __read_mostly *vnodelocks;
257static inline struct mtx *
258VP2VNODELOCK(struct vnode *vp)
259{
260
261	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
262}
263
264/*
265 * UMA zones for the VFS cache.
266 *
267 * The small cache is used for entries with short names, which are the
268 * most common.  The large cache is used for entries which are too big to
269 * fit in the small cache.
270 */
271static uma_zone_t __read_mostly cache_zone_small;
272static uma_zone_t __read_mostly cache_zone_small_ts;
273static uma_zone_t __read_mostly cache_zone_large;
274static uma_zone_t __read_mostly cache_zone_large_ts;
275
276#define	CACHE_PATH_CUTOFF	35
277
278static struct namecache *
279cache_alloc(int len, int ts)
280{
281	struct namecache_ts *ncp_ts;
282	struct namecache *ncp;
283
284	if (__predict_false(ts)) {
285		if (len <= CACHE_PATH_CUTOFF)
286			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
287		else
288			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
289		ncp = &ncp_ts->nc_nc;
290	} else {
291		if (len <= CACHE_PATH_CUTOFF)
292			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
293		else
294			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
295	}
296	return (ncp);
297}
298
299static void
300cache_free(struct namecache *ncp)
301{
302	struct namecache_ts *ncp_ts;
303
304	if (ncp == NULL)
305		return;
306	if ((ncp->nc_flag & NCF_DVDROP) != 0)
307		vdrop(ncp->nc_dvp);
308	if (__predict_false(ncp->nc_flag & NCF_TS)) {
309		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
310		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
311			uma_zfree(cache_zone_small_ts, ncp_ts);
312		else
313			uma_zfree(cache_zone_large_ts, ncp_ts);
314	} else {
315		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
316			uma_zfree(cache_zone_small, ncp);
317		else
318			uma_zfree(cache_zone_large, ncp);
319	}
320}
321
322static void
323cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
324{
325	struct namecache_ts *ncp_ts;
326
327	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
328	    (tsp == NULL && ticksp == NULL),
329	    ("No NCF_TS"));
330
331	if (tsp == NULL && ticksp == NULL)
332		return;
333
334	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
335	if (tsp != NULL)
336		*tsp = ncp_ts->nc_time;
337	if (ticksp != NULL)
338		*ticksp = ncp_ts->nc_ticks;
339}
340
341static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
342SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
343    "VFS namecache enabled");
344
345/* Export size information to userland */
346SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
347    sizeof(struct namecache), "sizeof(struct namecache)");
348
349/*
350 * The new name cache statistics
351 */
352static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
353    "Name cache statistics");
354#define STATNODE_ULONG(name, descr)	\
355	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
356#define STATNODE_COUNTER(name, descr)	\
357	static counter_u64_t __read_mostly name; \
358	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
359STATNODE_ULONG(numneg, "Number of negative cache entries");
360STATNODE_ULONG(numcache, "Number of cache entries");
361STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
362STATNODE_COUNTER(numcalls, "Number of cache lookups");
363STATNODE_COUNTER(dothits, "Number of '.' hits");
364STATNODE_COUNTER(dotdothits, "Number of '..' hits");
365STATNODE_COUNTER(numchecks, "Number of checks in lookup");
366STATNODE_COUNTER(nummiss, "Number of cache misses");
367STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
368STATNODE_COUNTER(numposzaps,
369    "Number of cache hits (positive) we do not want to cache");
370STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
371STATNODE_COUNTER(numnegzaps,
372    "Number of cache hits (negative) we do not want to cache");
373STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
374/* These count for kern___getcwd(), too. */
375STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
376STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
377STATNODE_COUNTER(numfullpathfail2,
378    "Number of fullpath search errors (VOP_VPTOCNP failures)");
379STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
380STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
381STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
382    "Number of successful removals after relocking");
383static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
384    "Number of times zap_and_exit failed to lock");
385static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
386    "Number of times zap_and_exit failed to lock");
387static long cache_lock_vnodes_cel_3_failures;
388STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
389    "Number of times 3-way vnode locking failed");
390STATNODE_ULONG(numhotneg, "Number of hot negative entries");
391STATNODE_COUNTER(numneg_evicted,
392    "Number of negative entries evicted when adding a new entry");
393STATNODE_COUNTER(shrinking_skipped,
394    "Number of times shrinking was already in progress");
395
396static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
397static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
398    char *buf, char **retbuf, u_int buflen);
399
400static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
401
402static int cache_yield;
403SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
404    "Number of times cache called yield");
405
406static void __noinline
407cache_maybe_yield(void)
408{
409
410	if (should_yield()) {
411		cache_yield++;
412		kern_yield(PRI_USER);
413	}
414}
415
416static inline void
417cache_assert_vlp_locked(struct mtx *vlp)
418{
419
420	if (vlp != NULL)
421		mtx_assert(vlp, MA_OWNED);
422}
423
424static inline void
425cache_assert_vnode_locked(struct vnode *vp)
426{
427	struct mtx *vlp;
428
429	vlp = VP2VNODELOCK(vp);
430	cache_assert_vlp_locked(vlp);
431}
432
433static uint32_t
434cache_get_hash(char *name, u_char len, struct vnode *dvp)
435{
436	uint32_t hash;
437
438	hash = fnv_32_buf(name, len, FNV1_32_INIT);
439	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
440	return (hash);
441}
442
443static inline struct rwlock *
444NCP2BUCKETLOCK(struct namecache *ncp)
445{
446	uint32_t hash;
447
448	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
449	return (HASH2BUCKETLOCK(hash));
450}
451
452#ifdef INVARIANTS
453static void
454cache_assert_bucket_locked(struct namecache *ncp, int mode)
455{
456	struct rwlock *blp;
457
458	blp = NCP2BUCKETLOCK(ncp);
459	rw_assert(blp, mode);
460}
461#else
462#define cache_assert_bucket_locked(x, y) do { } while (0)
463#endif
464
465#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
466static void
467_cache_sort_vnodes(void **p1, void **p2)
468{
469	void *tmp;
470
471	MPASS(*p1 != NULL || *p2 != NULL);
472
473	if (*p1 > *p2) {
474		tmp = *p2;
475		*p2 = *p1;
476		*p1 = tmp;
477	}
478}
479
480static void
481cache_lock_all_buckets(void)
482{
483	u_int i;
484
485	for (i = 0; i < numbucketlocks; i++)
486		rw_wlock(&bucketlocks[i]);
487}
488
489static void
490cache_unlock_all_buckets(void)
491{
492	u_int i;
493
494	for (i = 0; i < numbucketlocks; i++)
495		rw_wunlock(&bucketlocks[i]);
496}
497
498static void
499cache_lock_all_vnodes(void)
500{
501	u_int i;
502
503	for (i = 0; i < numvnodelocks; i++)
504		mtx_lock(&vnodelocks[i]);
505}
506
507static void
508cache_unlock_all_vnodes(void)
509{
510	u_int i;
511
512	for (i = 0; i < numvnodelocks; i++)
513		mtx_unlock(&vnodelocks[i]);
514}
515
516static int
517cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
518{
519
520	cache_sort_vnodes(&vlp1, &vlp2);
521
522	if (vlp1 != NULL) {
523		if (!mtx_trylock(vlp1))
524			return (EAGAIN);
525	}
526	if (!mtx_trylock(vlp2)) {
527		if (vlp1 != NULL)
528			mtx_unlock(vlp1);
529		return (EAGAIN);
530	}
531
532	return (0);
533}
534
535static void
536cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
537{
538
539	MPASS(vlp1 != NULL || vlp2 != NULL);
540	MPASS(vlp1 <= vlp2);
541
542	if (vlp1 != NULL)
543		mtx_lock(vlp1);
544	if (vlp2 != NULL)
545		mtx_lock(vlp2);
546}
547
548static void
549cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
550{
551
552	MPASS(vlp1 != NULL || vlp2 != NULL);
553
554	if (vlp1 != NULL)
555		mtx_unlock(vlp1);
556	if (vlp2 != NULL)
557		mtx_unlock(vlp2);
558}
559
560static int
561sysctl_nchstats(SYSCTL_HANDLER_ARGS)
562{
563	struct nchstats snap;
564
565	if (req->oldptr == NULL)
566		return (SYSCTL_OUT(req, 0, sizeof(snap)));
567
568	snap = nchstats;
569	snap.ncs_goodhits = counter_u64_fetch(numposhits);
570	snap.ncs_neghits = counter_u64_fetch(numneghits);
571	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
572	    counter_u64_fetch(numnegzaps);
573	snap.ncs_miss = counter_u64_fetch(nummisszap) +
574	    counter_u64_fetch(nummiss);
575
576	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
577}
578SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
579    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
580    "VFS cache effectiveness statistics");
581
582#ifdef DIAGNOSTIC
583/*
584 * Grab an atomic snapshot of the name cache hash chain lengths
585 */
586static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
587    "hash table stats");
588
589static int
590sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
591{
592	struct nchashhead *ncpp;
593	struct namecache *ncp;
594	int i, error, n_nchash, *cntbuf;
595
596retry:
597	n_nchash = nchash + 1;	/* nchash is max index, not count */
598	if (req->oldptr == NULL)
599		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
600	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
601	cache_lock_all_buckets();
602	if (n_nchash != nchash + 1) {
603		cache_unlock_all_buckets();
604		free(cntbuf, M_TEMP);
605		goto retry;
606	}
607	/* Scan hash tables counting entries */
608	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
609		LIST_FOREACH(ncp, ncpp, nc_hash)
610			cntbuf[i]++;
611	cache_unlock_all_buckets();
612	for (error = 0, i = 0; i < n_nchash; i++)
613		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
614			break;
615	free(cntbuf, M_TEMP);
616	return (error);
617}
618SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
619    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
620    "nchash chain lengths");
621
622static int
623sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
624{
625	int error;
626	struct nchashhead *ncpp;
627	struct namecache *ncp;
628	int n_nchash;
629	int count, maxlength, used, pct;
630
631	if (!req->oldptr)
632		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
633
634	cache_lock_all_buckets();
635	n_nchash = nchash + 1;	/* nchash is max index, not count */
636	used = 0;
637	maxlength = 0;
638
639	/* Scan hash tables for applicable entries */
640	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
641		count = 0;
642		LIST_FOREACH(ncp, ncpp, nc_hash) {
643			count++;
644		}
645		if (count)
646			used++;
647		if (maxlength < count)
648			maxlength = count;
649	}
650	n_nchash = nchash + 1;
651	cache_unlock_all_buckets();
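	/*
	 * Note: pct is scaled by 100, i.e. pct == 10000 when every bucket
	 * is used.
	 */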
652	pct = (used * 100) / (n_nchash / 100);
653	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
654	if (error)
655		return (error);
656	error = SYSCTL_OUT(req, &used, sizeof(used));
657	if (error)
658		return (error);
659	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
660	if (error)
661		return (error);
662	error = SYSCTL_OUT(req, &pct, sizeof(pct));
663	if (error)
664		return (error);
665	return (0);
666}
667SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
668    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
669    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
670#endif
671
672/*
673 * Negative entries management
674 *
675 * A variation of the LRU scheme is used.  New entries are hashed into one of
676 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
677 *
678 * The shrinker demotes the head of the hot list and evicts from the cold lists
679 * in a round-robin manner.
680 */
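
/*
 * An illustrative life cycle of a negative entry under this scheme (comment
 * only; the real code is in cache_negative_insert(), cache_negative_hit()
 * and cache_negative_zap_one() below):
 *
 *	enter:	TAILQ_INSERT_TAIL(&neglists[i].nl_list, ncp, nc_dst)
 *	hit:	move ncp to ncneg_hot.nl_list, set NCF_HOTNEGATIVE
 *	shrink:	demote TAILQ_FIRST(&ncneg_hot.nl_list) back to its cold list,
 *		then evict TAILQ_FIRST() of the cold list selected by
 *		shrink_list_turn (round-robin)
 */
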
681static void
682cache_negative_hit(struct namecache *ncp)
683{
684	struct neglist *neglist;
685
686	MPASS(ncp->nc_flag & NCF_NEGATIVE);
687	if (ncp->nc_flag & NCF_HOTNEGATIVE)
688		return;
689	neglist = NCP2NEGLIST(ncp);
690	mtx_lock(&ncneg_hot.nl_lock);
691	mtx_lock(&neglist->nl_lock);
692	if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
693		numhotneg++;
694		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
695		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
696		ncp->nc_flag |= NCF_HOTNEGATIVE;
697	}
698	mtx_unlock(&neglist->nl_lock);
699	mtx_unlock(&ncneg_hot.nl_lock);
700}
701
702static void
703cache_negative_insert(struct namecache *ncp, bool neg_locked)
704{
705	struct neglist *neglist;
706
707	MPASS(ncp->nc_flag & NCF_NEGATIVE);
708	cache_assert_bucket_locked(ncp, RA_WLOCKED);
709	neglist = NCP2NEGLIST(ncp);
710	if (!neg_locked) {
711		mtx_lock(&neglist->nl_lock);
712	} else {
713		mtx_assert(&neglist->nl_lock, MA_OWNED);
714	}
715	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
716	if (!neg_locked)
717		mtx_unlock(&neglist->nl_lock);
718	atomic_add_rel_long(&numneg, 1);
719}
720
721static void
722cache_negative_remove(struct namecache *ncp, bool neg_locked)
723{
724	struct neglist *neglist;
725	bool hot_locked = false;
726	bool list_locked = false;
727
728	MPASS(ncp->nc_flag & NCF_NEGATIVE);
729	cache_assert_bucket_locked(ncp, RA_WLOCKED);
730	neglist = NCP2NEGLIST(ncp);
731	if (!neg_locked) {
732		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
733			hot_locked = true;
734			mtx_lock(&ncneg_hot.nl_lock);
735			if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
736				list_locked = true;
737				mtx_lock(&neglist->nl_lock);
738			}
739		} else {
740			list_locked = true;
741			mtx_lock(&neglist->nl_lock);
742		}
743	}
744	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
745		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
746		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
747		numhotneg--;
748	} else {
749		mtx_assert(&neglist->nl_lock, MA_OWNED);
750		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
751	}
752	if (list_locked)
753		mtx_unlock(&neglist->nl_lock);
754	if (hot_locked)
755		mtx_unlock(&ncneg_hot.nl_lock);
756	atomic_subtract_rel_long(&numneg, 1);
757}
758
759static void
760cache_negative_shrink_select(int start, struct namecache **ncpp,
761    struct neglist **neglistpp)
762{
763	struct neglist *neglist;
764	struct namecache *ncp;
765	int i;
766
767	*ncpp = ncp = NULL;
768	neglist = NULL;
769
770	for (i = start; i < numneglists; i++) {
771		neglist = &neglists[i];
772		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
773			continue;
774		mtx_lock(&neglist->nl_lock);
775		ncp = TAILQ_FIRST(&neglist->nl_list);
776		if (ncp != NULL)
777			break;
778		mtx_unlock(&neglist->nl_lock);
779	}
780
781	*neglistpp = neglist;
782	*ncpp = ncp;
783}
784
785static void
786cache_negative_zap_one(void)
787{
788	struct namecache *ncp, *ncp2;
789	struct neglist *neglist;
790	struct mtx *dvlp;
791	struct rwlock *blp;
792
793	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
794	    !mtx_trylock(&ncneg_shrink_lock)) {
795		counter_u64_add(shrinking_skipped, 1);
796		return;
797	}
798
799	mtx_lock(&ncneg_hot.nl_lock);
800	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
801	if (ncp != NULL) {
802		neglist = NCP2NEGLIST(ncp);
803		mtx_lock(&neglist->nl_lock);
804		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
805		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
806		ncp->nc_flag &= ~NCF_HOTNEGATIVE;
807		numhotneg--;
808		mtx_unlock(&neglist->nl_lock);
809	}
810	mtx_unlock(&ncneg_hot.nl_lock);
811
812	cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
813	shrink_list_turn++;
814	if (shrink_list_turn == numneglists)
815		shrink_list_turn = 0;
816	if (ncp == NULL && shrink_list_turn == 0)
817		cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
818	mtx_unlock(&ncneg_shrink_lock);
819	if (ncp == NULL)
820		return;
821
822	MPASS(ncp->nc_flag & NCF_NEGATIVE);
823	dvlp = VP2VNODELOCK(ncp->nc_dvp);
824	blp = NCP2BUCKETLOCK(ncp);
825	mtx_unlock(&neglist->nl_lock);
826	mtx_lock(dvlp);
827	rw_wlock(blp);
828	mtx_lock(&neglist->nl_lock);
829	ncp2 = TAILQ_FIRST(&neglist->nl_list);
830	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
831	    blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
832		ncp = NULL;
833	} else {
834		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
835		    ncp->nc_name);
836
837		cache_zap_locked(ncp, true);
838		counter_u64_add(numneg_evicted, 1);
839	}
840	mtx_unlock(&neglist->nl_lock);
841	rw_wunlock(blp);
842	mtx_unlock(dvlp);
843	cache_free(ncp);
844}
845
846/*
847 * cache_zap_locked():
848 *
849 *   Removes a namecache entry from the cache, whether it contains an actual
850 *   pointer to a vnode or is just a negative cache entry.
851 */
852static void
853cache_zap_locked(struct namecache *ncp, bool neg_locked)
854{
855
856	if (!(ncp->nc_flag & NCF_NEGATIVE))
857		cache_assert_vnode_locked(ncp->nc_vp);
858	cache_assert_vnode_locked(ncp->nc_dvp);
859	cache_assert_bucket_locked(ncp, RA_WLOCKED);
860
861	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
862	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
863	LIST_REMOVE(ncp, nc_hash);
864	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
865		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
866		    ncp->nc_name, ncp->nc_vp);
867		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
868		if (ncp == ncp->nc_vp->v_cache_dd)
869			ncp->nc_vp->v_cache_dd = NULL;
870	} else {
871		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
872		    ncp->nc_name);
873		cache_negative_remove(ncp, neg_locked);
874	}
875	if (ncp->nc_flag & NCF_ISDOTDOT) {
876		if (ncp == ncp->nc_dvp->v_cache_dd)
877			ncp->nc_dvp->v_cache_dd = NULL;
878	} else {
879		LIST_REMOVE(ncp, nc_src);
880		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
881			ncp->nc_flag |= NCF_DVDROP;
882			counter_u64_add(numcachehv, -1);
883		}
884	}
885	atomic_subtract_rel_long(&numcache, 1);
886}
887
888static void
889cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
890{
891	struct rwlock *blp;
892
893	MPASS(ncp->nc_dvp == vp);
894	MPASS(ncp->nc_flag & NCF_NEGATIVE);
895	cache_assert_vnode_locked(vp);
896
897	blp = NCP2BUCKETLOCK(ncp);
898	rw_wlock(blp);
899	cache_zap_locked(ncp, false);
900	rw_wunlock(blp);
901}
902
903static bool
904cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
905    struct mtx **vlpp)
906{
907	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
908	struct rwlock *blp;
909
910	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
911	cache_assert_vnode_locked(vp);
912
913	if (ncp->nc_flag & NCF_NEGATIVE) {
914		if (*vlpp != NULL) {
915			mtx_unlock(*vlpp);
916			*vlpp = NULL;
917		}
918		cache_zap_negative_locked_vnode_kl(ncp, vp);
919		return (true);
920	}
921
922	pvlp = VP2VNODELOCK(vp);
923	blp = NCP2BUCKETLOCK(ncp);
924	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
925	vlp2 = VP2VNODELOCK(ncp->nc_vp);
926
927	if (*vlpp == vlp1 || *vlpp == vlp2) {
928		to_unlock = *vlpp;
929		*vlpp = NULL;
930	} else {
931		if (*vlpp != NULL) {
932			mtx_unlock(*vlpp);
933			*vlpp = NULL;
934		}
935		cache_sort_vnodes(&vlp1, &vlp2);
936		if (vlp1 == pvlp) {
937			mtx_lock(vlp2);
938			to_unlock = vlp2;
939		} else {
940			if (!mtx_trylock(vlp1))
941				goto out_relock;
942			to_unlock = vlp1;
943		}
944	}
945	rw_wlock(blp);
946	cache_zap_locked(ncp, false);
947	rw_wunlock(blp);
948	if (to_unlock != NULL)
949		mtx_unlock(to_unlock);
950	return (true);
951
952out_relock:
953	mtx_unlock(vlp2);
954	mtx_lock(vlp1);
955	mtx_lock(vlp2);
956	MPASS(*vlpp == NULL);
957	*vlpp = vlp1;
958	return (false);
959}
960
961static int __noinline
962cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
963{
964	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
965	struct rwlock *blp;
966	int error = 0;
967
968	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
969	cache_assert_vnode_locked(vp);
970
971	pvlp = VP2VNODELOCK(vp);
972	if (ncp->nc_flag & NCF_NEGATIVE) {
973		cache_zap_negative_locked_vnode_kl(ncp, vp);
974		goto out;
975	}
976
977	blp = NCP2BUCKETLOCK(ncp);
978	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
979	vlp2 = VP2VNODELOCK(ncp->nc_vp);
980	cache_sort_vnodes(&vlp1, &vlp2);
981	if (vlp1 == pvlp) {
982		mtx_lock(vlp2);
983		to_unlock = vlp2;
984	} else {
985		if (!mtx_trylock(vlp1)) {
986			error = EAGAIN;
987			goto out;
988		}
989		to_unlock = vlp1;
990	}
991	rw_wlock(blp);
992	cache_zap_locked(ncp, false);
993	rw_wunlock(blp);
994	mtx_unlock(to_unlock);
995out:
996	mtx_unlock(pvlp);
997	return (error);
998}
999
1000/*
1001 * We get here if trylocking failed.  We know enough to take all the needed
1002 * locks in the right order and re-lookup the entry.
1003 */
1004static int
1005cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1006    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1007    struct rwlock *blp)
1008{
1009	struct namecache *rncp;
1010
1011	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1012
1013	cache_sort_vnodes(&dvlp, &vlp);
1014	cache_lock_vnodes(dvlp, vlp);
1015	rw_wlock(blp);
1016	LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1017		if (rncp == ncp && rncp->nc_dvp == dvp &&
1018		    rncp->nc_nlen == cnp->cn_namelen &&
1019		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1020			break;
1021	}
1022	if (rncp != NULL) {
1023		cache_zap_locked(rncp, false);
1024		rw_wunlock(blp);
1025		cache_unlock_vnodes(dvlp, vlp);
1026		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1027		return (0);
1028	}
1029
1030	rw_wunlock(blp);
1031	cache_unlock_vnodes(dvlp, vlp);
1032	return (EAGAIN);
1033}
1034
1035static int __noinline
1036cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1037    uint32_t hash, struct rwlock *blp)
1038{
1039	struct mtx *dvlp, *vlp;
1040	struct vnode *dvp;
1041
1042	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1043
1044	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1045	vlp = NULL;
1046	if (!(ncp->nc_flag & NCF_NEGATIVE))
1047		vlp = VP2VNODELOCK(ncp->nc_vp);
1048	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1049		cache_zap_locked(ncp, false);
1050		rw_wunlock(blp);
1051		cache_unlock_vnodes(dvlp, vlp);
1052		return (0);
1053	}
1054
1055	dvp = ncp->nc_dvp;
1056	rw_wunlock(blp);
1057	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1058}
1059
1060static int __noinline
1061cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1062    uint32_t hash, struct rwlock *blp)
1063{
1064	struct mtx *dvlp, *vlp;
1065	struct vnode *dvp;
1066
1067	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1068
1069	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1070	vlp = NULL;
1071	if (!(ncp->nc_flag & NCF_NEGATIVE))
1072		vlp = VP2VNODELOCK(ncp->nc_vp);
1073	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1074		rw_runlock(blp);
1075		rw_wlock(blp);
1076		cache_zap_locked(ncp, false);
1077		rw_wunlock(blp);
1078		cache_unlock_vnodes(dvlp, vlp);
1079		return (0);
1080	}
1081
1082	dvp = ncp->nc_dvp;
1083	rw_runlock(blp);
1084	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1085}
1086
1087static int
1088cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1089    struct mtx **vlpp1, struct mtx **vlpp2)
1090{
1091	struct mtx *dvlp, *vlp;
1092
1093	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1094
1095	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1096	vlp = NULL;
1097	if (!(ncp->nc_flag & NCF_NEGATIVE))
1098		vlp = VP2VNODELOCK(ncp->nc_vp);
1099	cache_sort_vnodes(&dvlp, &vlp);
1100
1101	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1102		cache_zap_locked(ncp, false);
1103		cache_unlock_vnodes(dvlp, vlp);
1104		*vlpp1 = NULL;
1105		*vlpp2 = NULL;
1106		return (0);
1107	}
1108
1109	if (*vlpp1 != NULL)
1110		mtx_unlock(*vlpp1);
1111	if (*vlpp2 != NULL)
1112		mtx_unlock(*vlpp2);
1113	*vlpp1 = NULL;
1114	*vlpp2 = NULL;
1115
1116	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1117		cache_zap_locked(ncp, false);
1118		cache_unlock_vnodes(dvlp, vlp);
1119		return (0);
1120	}
1121
1122	rw_wunlock(blp);
1123	*vlpp1 = dvlp;
1124	*vlpp2 = vlp;
1125	if (*vlpp1 != NULL)
1126		mtx_lock(*vlpp1);
1127	mtx_lock(*vlpp2);
1128	rw_wlock(blp);
1129	return (EAGAIN);
1130}
1131
1132static void
1133cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1134{
1135
1136	if (blp != NULL) {
1137		rw_runlock(blp);
1138	} else {
1139		mtx_unlock(vlp);
1140	}
1141}
1142
1143static int __noinline
1144cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1145    struct timespec *tsp, int *ticksp)
1146{
1147	int ltype;
1148
1149	*vpp = dvp;
1150	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1151			dvp, cnp->cn_nameptr);
1152	counter_u64_add(dothits, 1);
1153	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1154	if (tsp != NULL)
1155		timespecclear(tsp);
1156	if (ticksp != NULL)
1157		*ticksp = ticks;
1158	vrefact(*vpp);
1159	/*
1160	 * When we look up "." we can still be asked to lock it
1161	 * differently.
1162	 */
1163	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1164	if (ltype != VOP_ISLOCKED(*vpp)) {
1165		if (ltype == LK_EXCLUSIVE) {
1166			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1167			if ((*vpp)->v_iflag & VI_DOOMED) {
1168				/* forced unmount */
1169				vrele(*vpp);
1170				*vpp = NULL;
1171				return (ENOENT);
1172			}
1173		} else
1174			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1175	}
1176	return (-1);
1177}
1178
1179static __noinline int
1180cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1181    struct componentname *cnp, struct timespec *tsp, int *ticksp)
1182{
1183	struct namecache *ncp;
1184	struct rwlock *blp;
1185	struct mtx *dvlp, *dvlp2;
1186	uint32_t hash;
1187	int error;
1188
1189	if (cnp->cn_namelen == 2 &&
1190	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1191		counter_u64_add(dotdothits, 1);
1192		dvlp = VP2VNODELOCK(dvp);
1193		dvlp2 = NULL;
1194		mtx_lock(dvlp);
1195retry_dotdot:
1196		ncp = dvp->v_cache_dd;
1197		if (ncp == NULL) {
1198			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1199			    "..", NULL);
1200			mtx_unlock(dvlp);
1201			if (dvlp2 != NULL)
1202				mtx_unlock(dvlp2);
1203			return (0);
1204		}
1205		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1206			if (ncp->nc_dvp != dvp)
1207				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1208			if (!cache_zap_locked_vnode_kl2(ncp,
1209			    dvp, &dvlp2))
1210				goto retry_dotdot;
1211			MPASS(dvp->v_cache_dd == NULL);
1212			mtx_unlock(dvlp);
1213			if (dvlp2 != NULL)
1214				mtx_unlock(dvlp2);
1215			cache_free(ncp);
1216		} else {
1217			dvp->v_cache_dd = NULL;
1218			mtx_unlock(dvlp);
1219			if (dvlp2 != NULL)
1220				mtx_unlock(dvlp2);
1221		}
1222		return (0);
1223	}
1224
1225	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1226	blp = HASH2BUCKETLOCK(hash);
1227retry:
1228	if (LIST_EMPTY(NCHHASH(hash)))
1229		goto out_no_entry;
1230
1231	rw_wlock(blp);
1232
1233	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1234		counter_u64_add(numchecks, 1);
1235		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1236		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1237			break;
1238	}
1239
1240	/* We failed to find an entry */
1241	if (ncp == NULL) {
1242		rw_wunlock(blp);
1243		goto out_no_entry;
1244	}
1245
1246	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1247	if (__predict_false(error != 0)) {
1248		zap_and_exit_bucket_fail++;
1249		cache_maybe_yield();
1250		goto retry;
1251	}
1252	counter_u64_add(numposzaps, 1);
1253	cache_free(ncp);
1254	return (0);
1255out_no_entry:
1256	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1257	counter_u64_add(nummisszap, 1);
1258	return (0);
1259}
1260
1261/**
1262 * Lookup a name in the name cache
1263 *
1264 * # Arguments
1265 *
1266 * - dvp:	Parent directory in which to search.
1267 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1268 * - cnp:	Parameters of the name search.  The most interesting bits of
1269 *   		the cn_flags field have the following meanings:
1270 *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1271 *   			it up.
1272 *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1273 * - tsp:	Return storage for cache timestamp.  On a successful (positive
1274 *   		or negative) lookup, tsp will be filled with any timespec that
1275 *   		was stored when this cache entry was created.  However, it will
1276 *   		be cleared for "." entries.
1277 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
1278 *   		(positive or negative) lookup, it will contain the ticks value
1279 *   		that was current when the cache entry was created, unless cnp
1280 *   		was ".".
1281 *
1282 * # Returns
1283 *
1284 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1285 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1286 *		to a forced unmount.  vpp will not be modified.  If the entry
1287 *		is a whiteout, then the ISWHITEOUT flag will be set in
1288 *		cnp->cn_flags.
1289 * - 0:		A cache miss.  vpp will not be modified.
1290 *
1291 * # Locking
1292 *
1293 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1294 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1295 * lock is not recursively acquired.
1296 */
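
/*
 * A sketch of typical use (comment only): this is roughly what
 * vfs_cache_lookup() does for filesystems that opt into the name cache,
 * with the access and flag checks omitted:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 *	if (error == -1)
 *		return (0);
 *	return (error);
 */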
1297int
1298cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1299    struct timespec *tsp, int *ticksp)
1300{
1301	struct namecache_ts *ncp_ts;
1302	struct namecache *ncp;
1303	struct rwlock *blp;
1304	struct mtx *dvlp;
1305	uint32_t hash;
1306	int error, ltype;
1307
1308	if (__predict_false(!doingcache)) {
1309		cnp->cn_flags &= ~MAKEENTRY;
1310		return (0);
1311	}
1312
1313	counter_u64_add(numcalls, 1);
1314
1315	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1316		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1317
1318	if ((cnp->cn_flags & MAKEENTRY) == 0)
1319		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1320
1321retry:
1322	blp = NULL;
1323	dvlp = NULL;
1324	error = 0;
1325	if (cnp->cn_namelen == 2 &&
1326	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1327		counter_u64_add(dotdothits, 1);
1328		dvlp = VP2VNODELOCK(dvp);
1329		mtx_lock(dvlp);
1330		ncp = dvp->v_cache_dd;
1331		if (ncp == NULL) {
1332			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1333			    "..", NULL);
1334			mtx_unlock(dvlp);
1335			return (0);
1336		}
1337		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1338			if (ncp->nc_flag & NCF_NEGATIVE)
1339				*vpp = NULL;
1340			else
1341				*vpp = ncp->nc_vp;
1342		} else
1343			*vpp = ncp->nc_dvp;
1344		/* Return failure if a negative entry was found. */
1345		if (*vpp == NULL)
1346			goto negative_success;
1347		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1348		    dvp, cnp->cn_nameptr, *vpp);
1349		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1350		    *vpp);
1351		cache_out_ts(ncp, tsp, ticksp);
1352		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1353		    NCF_DTS && tsp != NULL) {
1354			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1355			*tsp = ncp_ts->nc_dotdottime;
1356		}
1357		goto success;
1358	}
1359
1360	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1361	blp = HASH2BUCKETLOCK(hash);
1362	rw_rlock(blp);
1363
1364	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1365		counter_u64_add(numchecks, 1);
1366		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1367		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1368			break;
1369	}
1370
1371	/* We failed to find an entry */
1372	if (__predict_false(ncp == NULL)) {
1373		rw_runlock(blp);
1374		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1375		    NULL);
1376		counter_u64_add(nummiss, 1);
1377		return (0);
1378	}
1379
1380	if (ncp->nc_flag & NCF_NEGATIVE)
1381		goto negative_success;
1382
1383	/* We found a "positive" match, return the vnode */
1384	counter_u64_add(numposhits, 1);
1385	*vpp = ncp->nc_vp;
1386	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1387	    dvp, cnp->cn_nameptr, *vpp, ncp);
1388	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1389	    *vpp);
1390	cache_out_ts(ncp, tsp, ticksp);
1391success:
1392	/*
1393	 * On success we return a locked and ref'd vnode as per the lookup
1394	 * protocol.
1395	 */
1396	MPASS(dvp != *vpp);
1397	ltype = 0;	/* silence gcc warning */
1398	if (cnp->cn_flags & ISDOTDOT) {
1399		ltype = VOP_ISLOCKED(dvp);
1400		VOP_UNLOCK(dvp, 0);
1401	}
1402	vhold(*vpp);
1403	cache_lookup_unlock(blp, dvlp);
1404	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
1405	if (cnp->cn_flags & ISDOTDOT) {
1406		vn_lock(dvp, ltype | LK_RETRY);
1407		if (dvp->v_iflag & VI_DOOMED) {
1408			if (error == 0)
1409				vput(*vpp);
1410			*vpp = NULL;
1411			return (ENOENT);
1412		}
1413	}
1414	if (error) {
1415		*vpp = NULL;
1416		goto retry;
1417	}
1418	if ((cnp->cn_flags & ISLASTCN) &&
1419	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1420		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1421	}
1422	return (-1);
1423
1424negative_success:
1425	/* We found a negative match, and want to create it, so purge */
1426	if (cnp->cn_nameiop == CREATE) {
1427		counter_u64_add(numnegzaps, 1);
1428		goto zap_and_exit;
1429	}
1430
1431	counter_u64_add(numneghits, 1);
1432	cache_negative_hit(ncp);
1433	if (ncp->nc_flag & NCF_WHITE)
1434		cnp->cn_flags |= ISWHITEOUT;
1435	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
1436	    ncp->nc_name);
1437	cache_out_ts(ncp, tsp, ticksp);
1438	cache_lookup_unlock(blp, dvlp);
1439	return (ENOENT);
1440
1441zap_and_exit:
1442	if (blp != NULL)
1443		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1444	else
1445		error = cache_zap_locked_vnode(ncp, dvp);
1446	if (__predict_false(error != 0)) {
1447		zap_and_exit_bucket_fail2++;
1448		cache_maybe_yield();
1449		goto retry;
1450	}
1451	cache_free(ncp);
1452	return (0);
1453}
1454
1455struct celockstate {
1456	struct mtx *vlp[3];
1457	struct rwlock *blp[2];
1458};
1459CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1460CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1461
1462static inline void
1463cache_celockstate_init(struct celockstate *cel)
1464{
1465
1466	bzero(cel, sizeof(*cel));
1467}
1468
1469static void
1470cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1471    struct vnode *dvp)
1472{
1473	struct mtx *vlp1, *vlp2;
1474
1475	MPASS(cel->vlp[0] == NULL);
1476	MPASS(cel->vlp[1] == NULL);
1477	MPASS(cel->vlp[2] == NULL);
1478
1479	MPASS(vp != NULL || dvp != NULL);
1480
1481	vlp1 = VP2VNODELOCK(vp);
1482	vlp2 = VP2VNODELOCK(dvp);
1483	cache_sort_vnodes(&vlp1, &vlp2);
1484
1485	if (vlp1 != NULL) {
1486		mtx_lock(vlp1);
1487		cel->vlp[0] = vlp1;
1488	}
1489	mtx_lock(vlp2);
1490	cel->vlp[1] = vlp2;
1491}
1492
1493static void
1494cache_unlock_vnodes_cel(struct celockstate *cel)
1495{
1496
1497	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1498
1499	if (cel->vlp[0] != NULL)
1500		mtx_unlock(cel->vlp[0]);
1501	if (cel->vlp[1] != NULL)
1502		mtx_unlock(cel->vlp[1]);
1503	if (cel->vlp[2] != NULL)
1504		mtx_unlock(cel->vlp[2]);
1505}
1506
1507static bool
1508cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1509{
1510	struct mtx *vlp;
1511	bool ret;
1512
1513	cache_assert_vlp_locked(cel->vlp[0]);
1514	cache_assert_vlp_locked(cel->vlp[1]);
1515	MPASS(cel->vlp[2] == NULL);
1516
1517	MPASS(vp != NULL);
1518	vlp = VP2VNODELOCK(vp);
1519
1520	ret = true;
1521	if (vlp >= cel->vlp[1]) {
1522		mtx_lock(vlp);
1523	} else {
1524		if (mtx_trylock(vlp))
1525			goto out;
1526		cache_lock_vnodes_cel_3_failures++;
1527		cache_unlock_vnodes_cel(cel);
1528		if (vlp < cel->vlp[0]) {
1529			mtx_lock(vlp);
1530			mtx_lock(cel->vlp[0]);
1531			mtx_lock(cel->vlp[1]);
1532		} else {
1533			if (cel->vlp[0] != NULL)
1534				mtx_lock(cel->vlp[0]);
1535			mtx_lock(vlp);
1536			mtx_lock(cel->vlp[1]);
1537		}
1538		ret = false;
1539	}
1540out:
1541	cel->vlp[2] = vlp;
1542	return (ret);
1543}
1544
1545static void
1546cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1547    struct rwlock *blp2)
1548{
1549
1550	MPASS(cel->blp[0] == NULL);
1551	MPASS(cel->blp[1] == NULL);
1552
1553	cache_sort_vnodes(&blp1, &blp2);
1554
1555	if (blp1 != NULL) {
1556		rw_wlock(blp1);
1557		cel->blp[0] = blp1;
1558	}
1559	rw_wlock(blp2);
1560	cel->blp[1] = blp2;
1561}
1562
1563static void
1564cache_unlock_buckets_cel(struct celockstate *cel)
1565{
1566
1567	if (cel->blp[0] != NULL)
1568		rw_wunlock(cel->blp[0]);
1569	rw_wunlock(cel->blp[1]);
1570}
1571
1572/*
1573 * Lock part of the cache affected by the insertion.
1574 *
1575 * This means vnodelocks for dvp, vp and the relevant bucketlock.
1576 * However, insertion can result in removal of an old entry.  In this
1577 * case we have an additional vnode and bucketlock pair to lock.  If the
1578 * old entry is negative, there is no third vnode to lock.
1579 *
1580 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1581 * preserving the locking order (smaller address first).
1582 */
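
/*
 * Worst-case example (comment only; "old_dd" stands for vp->v_cache_dd, the
 * entry being evicted): entering a name for a directory vp ends up holding,
 * in address order,
 *
 *	VP2VNODELOCK(dvp), VP2VNODELOCK(vp), VP2VNODELOCK(old_dd->nc_vp)
 *
 * plus the bucket locks for both the new hash chain and old_dd's chain.
 */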
1583static void
1584cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1585    uint32_t hash)
1586{
1587	struct namecache *ncp;
1588	struct rwlock *blps[2];
1589
1590	blps[0] = HASH2BUCKETLOCK(hash);
1591	for (;;) {
1592		blps[1] = NULL;
1593		cache_lock_vnodes_cel(cel, dvp, vp);
1594		if (vp == NULL || vp->v_type != VDIR)
1595			break;
1596		ncp = vp->v_cache_dd;
1597		if (ncp == NULL)
1598			break;
1599		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1600			break;
1601		MPASS(ncp->nc_dvp == vp);
1602		blps[1] = NCP2BUCKETLOCK(ncp);
1603		if (ncp->nc_flag & NCF_NEGATIVE)
1604			break;
1605		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1606			break;
1607		/*
1608		 * All vnodes got re-locked. Re-validate the state and if
1609		 * nothing changed we are done. Otherwise restart.
1610		 */
1611		if (ncp == vp->v_cache_dd &&
1612		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1613		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1614		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1615			break;
1616		cache_unlock_vnodes_cel(cel);
1617		cel->vlp[0] = NULL;
1618		cel->vlp[1] = NULL;
1619		cel->vlp[2] = NULL;
1620	}
1621	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1622}
1623
1624static void
1625cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1626    uint32_t hash)
1627{
1628	struct namecache *ncp;
1629	struct rwlock *blps[2];
1630
1631	blps[0] = HASH2BUCKETLOCK(hash);
1632	for (;;) {
1633		blps[1] = NULL;
1634		cache_lock_vnodes_cel(cel, dvp, vp);
1635		ncp = dvp->v_cache_dd;
1636		if (ncp == NULL)
1637			break;
1638		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1639			break;
1640		MPASS(ncp->nc_dvp == dvp);
1641		blps[1] = NCP2BUCKETLOCK(ncp);
1642		if (ncp->nc_flag & NCF_NEGATIVE)
1643			break;
1644		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1645			break;
1646		if (ncp == dvp->v_cache_dd &&
1647		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1648		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1649		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1650			break;
1651		cache_unlock_vnodes_cel(cel);
1652		cel->vlp[0] = NULL;
1653		cel->vlp[1] = NULL;
1654		cel->vlp[2] = NULL;
1655	}
1656	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1657}
1658
1659static void
1660cache_enter_unlock(struct celockstate *cel)
1661{
1662
1663	cache_unlock_buckets_cel(cel);
1664	cache_unlock_vnodes_cel(cel);
1665}
1666
1667static void __noinline
1668cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1669    struct componentname *cnp)
1670{
1671	struct celockstate cel;
1672	struct namecache *ncp;
1673	uint32_t hash;
1674	int len;
1675
1676	if (dvp->v_cache_dd == NULL)
1677		return;
1678	len = cnp->cn_namelen;
1679	cache_celockstate_init(&cel);
1680	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1681	cache_enter_lock_dd(&cel, dvp, vp, hash);
1682	ncp = dvp->v_cache_dd;
1683	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1684		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1685		cache_zap_locked(ncp, false);
1686	} else {
1687		ncp = NULL;
1688	}
1689	dvp->v_cache_dd = NULL;
1690	cache_enter_unlock(&cel);
1691	cache_free(ncp);
1692}
1693
1694/*
1695 * Add an entry to the cache.
1696 */
1697void
1698cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1699    struct timespec *tsp, struct timespec *dtsp)
1700{
1701	struct celockstate cel;
1702	struct namecache *ncp, *n2, *ndd;
1703	struct namecache_ts *ncp_ts, *n2_ts;
1704	struct nchashhead *ncpp;
1705	uint32_t hash;
1706	int flag;
1707	int len;
1708	u_long lnumcache;
1709
1710	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1711	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
1712	    ("cache_enter: Adding a doomed vnode"));
1713	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
1714	    ("cache_enter: Doomed vnode used as src"));
1715
1716	if (__predict_false(!doingcache))
1717		return;
1718
1719	flag = 0;
1720	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1721		if (cnp->cn_namelen == 1)
1722			return;
1723		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1724			cache_enter_dotdot_prep(dvp, vp, cnp);
1725			flag = NCF_ISDOTDOT;
1726		}
1727	}
1728
1729	/*
1730	 * Avoid blowout in namecache entries.
1731	 */
1732	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1733	if (__predict_false(lnumcache >= ncsize)) {
1734		atomic_add_long(&numcache, -1);
1735		return;
1736	}
1737
1738	cache_celockstate_init(&cel);
1739	ndd = NULL;
1740	ncp_ts = NULL;
1741
1742	/*
1743	 * Calculate the hash key and set up as much of the new
1744	 * namecache entry as possible before acquiring the lock.
1745	 */
1746	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1747	ncp->nc_flag = flag;
1748	ncp->nc_vp = vp;
1749	if (vp == NULL)
1750		ncp->nc_flag |= NCF_NEGATIVE;
1751	ncp->nc_dvp = dvp;
1752	if (tsp != NULL) {
1753		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1754		ncp_ts->nc_time = *tsp;
1755		ncp_ts->nc_ticks = ticks;
1756		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1757		if (dtsp != NULL) {
1758			ncp_ts->nc_dotdottime = *dtsp;
1759			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1760		}
1761	}
1762	len = ncp->nc_nlen = cnp->cn_namelen;
1763	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1764	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1765	cache_enter_lock(&cel, dvp, vp, hash);
1766
1767	/*
1768	 * See if this vnode or negative entry is already in the cache
1769	 * with this name.  This can happen with concurrent lookups of
1770	 * the same path name.
1771	 */
1772	ncpp = NCHHASH(hash);
1773	LIST_FOREACH(n2, ncpp, nc_hash) {
1774		if (n2->nc_dvp == dvp &&
1775		    n2->nc_nlen == cnp->cn_namelen &&
1776		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1777			if (tsp != NULL) {
1778				KASSERT((n2->nc_flag & NCF_TS) != 0,
1779				    ("no NCF_TS"));
1780				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1781				n2_ts->nc_time = ncp_ts->nc_time;
1782				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1783				if (dtsp != NULL) {
1784					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1785					if (ncp->nc_flag & NCF_NEGATIVE)
1786						mtx_lock(&ncneg_hot.nl_lock);
1787					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1788					if (ncp->nc_flag & NCF_NEGATIVE)
1789						mtx_unlock(&ncneg_hot.nl_lock);
1790				}
1791			}
1792			goto out_unlock_free;
1793		}
1794	}
1795
1796	if (flag == NCF_ISDOTDOT) {
1797		/*
1798		 * See if we are trying to add a ".." entry, but some other
1799		 * lookup has already populated the v_cache_dd pointer.
1800		 */
1801		if (dvp->v_cache_dd != NULL)
1802			goto out_unlock_free;
1803		KASSERT(vp == NULL || vp->v_type == VDIR,
1804		    ("wrong vnode type %p", vp));
1805		dvp->v_cache_dd = ncp;
1806	}
1807
1808	if (vp != NULL) {
1809		if (vp->v_type == VDIR) {
1810			if (flag != NCF_ISDOTDOT) {
1811				/*
1812				 * For this case, the cache entry maps both the
1813				 * name of the directory and the name ".." for
1814				 * the directory's parent.
1815				 */
1816				if ((ndd = vp->v_cache_dd) != NULL) {
1817					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1818						cache_zap_locked(ndd, false);
1819					else
1820						ndd = NULL;
1821				}
1822				vp->v_cache_dd = ncp;
1823			}
1824		} else {
1825			vp->v_cache_dd = NULL;
1826		}
1827	}
1828
1829	if (flag != NCF_ISDOTDOT) {
1830		if (LIST_EMPTY(&dvp->v_cache_src)) {
1831			vhold(dvp);
1832			counter_u64_add(numcachehv, 1);
1833		}
1834		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1835	}
1836
1837	/*
1838	 * Insert the new namecache entry into the appropriate chain
1839	 * within the cache entries table.
1840	 */
1841	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1842
1843	/*
1844	 * If the entry is "negative", we place it into the
1845	 * "negative" cache queue; otherwise we place it into the
1846	 * destination vnode's cache entries queue.
1847	 */
1848	if (vp != NULL) {
1849		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1850		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1851		    vp);
1852	} else {
1853		if (cnp->cn_flags & ISWHITEOUT)
1854			ncp->nc_flag |= NCF_WHITE;
1855		cache_negative_insert(ncp, false);
1856		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1857		    ncp->nc_name);
1858	}
1859	cache_enter_unlock(&cel);
1860	if (numneg * ncnegfactor > lnumcache)
1861		cache_negative_zap_one();
1862	cache_free(ndd);
1863	return;
1864out_unlock_free:
1865	cache_enter_unlock(&cel);
1866	atomic_add_long(&numcache, -1);
1867	cache_free(ncp);
1868	return;
1869}
1870
1871static u_int
1872cache_roundup_2(u_int val)
1873{
1874	u_int res;
1875
1876	for (res = 1; res <= val; res <<= 1)
1877		continue;
1878
1879	return (res);
1880}
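
/*
 * Illustrative values (not part of the code): cache_roundup_2(7) == 8 and
 * cache_roundup_2(8) == 16; the result is always strictly greater than val.
 */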
1881
1882/*
1883 * Name cache initialization, from vfs_init() when we are booting
1884 */
1885static void
1886nchinit(void *dummy __unused)
1887{
1888	u_int i;
1889
1890	cache_zone_small = uma_zcreate("S VFS Cache",
1891	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
1892	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
1893	    UMA_ZONE_ZINIT);
1894	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
1895	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
1896	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
1897	    UMA_ZONE_ZINIT);
1898	cache_zone_large = uma_zcreate("L VFS Cache",
1899	    sizeof(struct namecache) + NAME_MAX + 1,
1900	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
1901	    UMA_ZONE_ZINIT);
1902	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
1903	    sizeof(struct namecache_ts) + NAME_MAX + 1,
1904	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
1905	    UMA_ZONE_ZINIT);
1906
1907	ncsize = desiredvnodes * ncsizefactor;
1908	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
1909	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
1910	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
1911		ncbuckethash = 7;
1912	if (ncbuckethash > nchash)
1913		ncbuckethash = nchash;
1914	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
1915	    M_WAITOK | M_ZERO);
1916	for (i = 0; i < numbucketlocks; i++)
1917		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
1918	ncvnodehash = ncbuckethash;
1919	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
1920	    M_WAITOK | M_ZERO);
1921	for (i = 0; i < numvnodelocks; i++)
1922		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
1923	ncpurgeminvnodes = numbucketlocks * 2;
1924
1925	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
1926	    M_WAITOK | M_ZERO);
1927	for (i = 0; i < numneglists; i++) {
1928		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
1929		TAILQ_INIT(&neglists[i].nl_list);
1930	}
1931	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
1932	TAILQ_INIT(&ncneg_hot.nl_list);
1933
1934	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
1935
1936	numcachehv = counter_u64_alloc(M_WAITOK);
1937	numcalls = counter_u64_alloc(M_WAITOK);
1938	dothits = counter_u64_alloc(M_WAITOK);
1939	dotdothits = counter_u64_alloc(M_WAITOK);
1940	numchecks = counter_u64_alloc(M_WAITOK);
1941	nummiss = counter_u64_alloc(M_WAITOK);
1942	nummisszap = counter_u64_alloc(M_WAITOK);
1943	numposzaps = counter_u64_alloc(M_WAITOK);
1944	numposhits = counter_u64_alloc(M_WAITOK);
1945	numnegzaps = counter_u64_alloc(M_WAITOK);
1946	numneghits = counter_u64_alloc(M_WAITOK);
1947	numfullpathcalls = counter_u64_alloc(M_WAITOK);
1948	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
1949	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
1950	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
1951	numfullpathfound = counter_u64_alloc(M_WAITOK);
1952	zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK);
1953	numneg_evicted = counter_u64_alloc(M_WAITOK);
1954	shrinking_skipped = counter_u64_alloc(M_WAITOK);
1955}
1956SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
1957
1958void
1959cache_changesize(int newmaxvnodes)
1960{
1961	struct nchashhead *new_nchashtbl, *old_nchashtbl;
1962	u_long new_nchash, old_nchash;
1963	struct namecache *ncp;
1964	uint32_t hash;
1965	int newncsize;
1966	int i;
1967
1968	newncsize = newmaxvnodes * ncsizefactor;
1969	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
1970	if (newmaxvnodes < numbucketlocks)
1971		newmaxvnodes = numbucketlocks;
1972
1973	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
1974	/* If same hash table size, nothing to do */
1975	if (nchash == new_nchash) {
1976		free(new_nchashtbl, M_VFSCACHE);
1977		return;
1978	}
	/*
	 * Move everything from the old hash table to the new table.
	 * With all vnode and bucket locks held, no namecache entry can
	 * be added or removed concurrently, since doing so would require
	 * modifying the hash chains we are about to rehash.
	 */
1984	cache_lock_all_vnodes();
1985	cache_lock_all_buckets();
1986	old_nchashtbl = nchashtbl;
1987	old_nchash = nchash;
1988	nchashtbl = new_nchashtbl;
1989	nchash = new_nchash;
1990	for (i = 0; i <= old_nchash; i++) {
1991		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
1992			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
1993			    ncp->nc_dvp);
1994			LIST_REMOVE(ncp, nc_hash);
1995			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
1996		}
1997	}
1998	ncsize = newncsize;
1999	cache_unlock_all_buckets();
2000	cache_unlock_all_vnodes();
2001	free(old_nchashtbl, M_VFSCACHE);
2002}
2003
2004/*
2005 * Invalidate all entries from and to a particular vnode.
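 * Entries may hang off the vnode in three places: the v_cache_src list
 * (names of its children), the v_cache_dst list (names resolving to it)
 * and the v_cache_dd pointer (the cached ".." entry); all three are
 * zapped here.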
2006 */
2007void
2008cache_purge(struct vnode *vp)
2009{
2010	TAILQ_HEAD(, namecache) ncps;
2011	struct namecache *ncp, *nnp;
2012	struct mtx *vlp, *vlp2;
2013
2014	CTR1(KTR_VFS, "cache_purge(%p)", vp);
2015	SDT_PROBE1(vfs, namecache, purge, done, vp);
2016	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2017	    vp->v_cache_dd == NULL)
2018		return;
2019	TAILQ_INIT(&ncps);
2020	vlp = VP2VNODELOCK(vp);
2021	vlp2 = NULL;
2022	mtx_lock(vlp);
2023retry:
2024	while (!LIST_EMPTY(&vp->v_cache_src)) {
2025		ncp = LIST_FIRST(&vp->v_cache_src);
2026		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2027			goto retry;
2028		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2029	}
2030	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2031		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2032		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2033			goto retry;
2034		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2035	}
2036	ncp = vp->v_cache_dd;
2037	if (ncp != NULL) {
2038		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2039		   ("lost dotdot link"));
2040		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2041			goto retry;
2042		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2043	}
2044	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2045	mtx_unlock(vlp);
2046	if (vlp2 != NULL)
2047		mtx_unlock(vlp2);
2048	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2049		cache_free(ncp);
2050	}
2051}
2052
2053/*
2054 * Invalidate all negative entries for a particular directory vnode.
2055 */
2056void
2057cache_purge_negative(struct vnode *vp)
2058{
2059	TAILQ_HEAD(, namecache) ncps;
2060	struct namecache *ncp, *nnp;
2061	struct mtx *vlp;
2062
2063	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2064	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2065	if (LIST_EMPTY(&vp->v_cache_src))
2066		return;
2067	TAILQ_INIT(&ncps);
2068	vlp = VP2VNODELOCK(vp);
2069	mtx_lock(vlp);
2070	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2071		if (!(ncp->nc_flag & NCF_NEGATIVE))
2072			continue;
2073		cache_zap_negative_locked_vnode_kl(ncp, vp);
2074		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2075	}
2076	mtx_unlock(vlp);
2077	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2078		cache_free(ncp);
2079	}
2080}
2081
2082/*
2083 * Flush all entries referencing a particular filesystem.
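 * Every hash chain is scanned under its bucket lock; matching entries
 * are zapped and collected on a local list, and freed only after the
 * locks are dropped.  Mounts with few vnodes are skipped unless "force"
 * is set.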
2084 */
2085void
2086cache_purgevfs(struct mount *mp, bool force)
2087{
2088	TAILQ_HEAD(, namecache) ncps;
2089	struct mtx *vlp1, *vlp2;
2090	struct rwlock *blp;
2091	struct nchashhead *bucket;
2092	struct namecache *ncp, *nnp;
2093	u_long i, j, n_nchash;
2094	int error;
2095
2096	/* Scan hash tables for applicable entries */
2097	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2098	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2099		return;
2100	TAILQ_INIT(&ncps);
2101	n_nchash = nchash + 1;
2102	vlp1 = vlp2 = NULL;
2103	for (i = 0; i < numbucketlocks; i++) {
2104		blp = (struct rwlock *)&bucketlocks[i];
2105		rw_wlock(blp);
2106		for (j = i; j < n_nchash; j += numbucketlocks) {
2107retry:
2108			bucket = &nchashtbl[j];
2109			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2110				cache_assert_bucket_locked(ncp, RA_WLOCKED);
2111				if (ncp->nc_dvp->v_mount != mp)
2112					continue;
2113				error = cache_zap_wlocked_bucket_kl(ncp, blp,
2114				    &vlp1, &vlp2);
2115				if (error != 0)
2116					goto retry;
2117				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2118			}
2119		}
2120		rw_wunlock(blp);
2121		if (vlp1 == NULL && vlp2 == NULL)
2122			cache_maybe_yield();
2123	}
2124	if (vlp1 != NULL)
2125		mtx_unlock(vlp1);
2126	if (vlp2 != NULL)
2127		mtx_unlock(vlp2);
2128
2129	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2130		cache_free(ncp);
2131	}
2132}
2133
/*
 * Perform canonical checks and a cache lookup, and pass the request on
 * to the filesystem through VOP_CACHEDLOOKUP() only if needed.
 */
2138
2139int
2140vfs_cache_lookup(struct vop_lookup_args *ap)
2141{
2142	struct vnode *dvp;
2143	int error;
2144	struct vnode **vpp = ap->a_vpp;
2145	struct componentname *cnp = ap->a_cnp;
2146	int flags = cnp->cn_flags;
2147
2148	*vpp = NULL;
2149	dvp = ap->a_dvp;
2150
2151	if (dvp->v_type != VDIR)
2152		return (ENOTDIR);
2153
2154	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2155	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2156		return (EROFS);
2157
2158	error = vn_dir_check_exec(dvp, cnp);
2159	if (error != 0)
2160		return (error);
2161
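	/*
	 * cache_lookup() returns -1 on a positive hit (*vpp is set), 0 on
	 * a miss (consult the filesystem below) and an error, typically
	 * ENOENT, when a negative entry was found.
	 */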
2162	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2163	if (error == 0)
2164		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2165	if (error == -1)
2166		return (0);
2167	return (error);
2168}
2169
2170/*
2171 * XXX All of these sysctls would probably be more productive dead.
2172 */
2173static int __read_mostly disablecwd;
2174SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
2175   "Disable the getcwd syscall");
2176
2177/* Implementation of the getcwd syscall. */
2178int
2179sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2180{
2181
2182	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
2183	    MAXPATHLEN));
2184}
2185
2186int
2187kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
2188    size_t path_max)
2189{
2190	char *bp, *tmpbuf;
2191	struct filedesc *fdp;
2192	struct vnode *cdir, *rdir;
2193	int error;
2194
2195	if (__predict_false(disablecwd))
2196		return (ENODEV);
2197	if (__predict_false(buflen < 2))
2198		return (EINVAL);
2199	if (buflen > path_max)
2200		buflen = path_max;
2201
2202	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
2203	fdp = td->td_proc->p_fd;
2204	FILEDESC_SLOCK(fdp);
2205	cdir = fdp->fd_cdir;
2206	vrefact(cdir);
2207	rdir = fdp->fd_rdir;
2208	vrefact(rdir);
2209	FILEDESC_SUNLOCK(fdp);
2210	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
2211	vrele(rdir);
2212	vrele(cdir);
2213
2214	if (!error) {
2215		if (bufseg == UIO_SYSSPACE)
2216			bcopy(bp, buf, strlen(bp) + 1);
2217		else
2218			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
2223	}
2224	free(tmpbuf, M_TEMP);
2225	return (error);
2226}
2227
2228/*
2229 * Thus begins the fullpath magic.
2230 */
2231
2232static int __read_mostly disablefullpath;
2233SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
2234    "Disable the vn_fullpath function");
2235
/*
 * Retrieve the full filesystem path that corresponds to a vnode from the
 * name cache (if available).
 */
2240int
2241vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2242{
2243	char *buf;
2244	struct filedesc *fdp;
2245	struct vnode *rdir;
2246	int error;
2247
2248	if (__predict_false(disablefullpath))
2249		return (ENODEV);
2250	if (__predict_false(vn == NULL))
2251		return (EINVAL);
2252
2253	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
2254	fdp = td->td_proc->p_fd;
2255	FILEDESC_SLOCK(fdp);
2256	rdir = fdp->fd_rdir;
2257	vrefact(rdir);
2258	FILEDESC_SUNLOCK(fdp);
2259	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
2260	vrele(rdir);
2261
2262	if (!error)
2263		*freebuf = buf;
2264	else
2265		free(buf, M_TEMP);
2266	return (error);
2267}
2268
2269/*
 * This function is similar to vn_fullpath, but it attempts to look up the
2271 * pathname relative to the global root mount point.  This is required for the
2272 * auditing sub-system, as audited pathnames must be absolute, relative to the
2273 * global root mount point.
2274 */
2275int
2276vn_fullpath_global(struct thread *td, struct vnode *vn,
2277    char **retbuf, char **freebuf)
2278{
2279	char *buf;
2280	int error;
2281
2282	if (__predict_false(disablefullpath))
2283		return (ENODEV);
2284	if (__predict_false(vn == NULL))
2285		return (EINVAL);
2286	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
2287	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
2288	if (!error)
2289		*freebuf = buf;
2290	else
2291		free(buf, M_TEMP);
2292	return (error);
2293}
2294
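/*
 * Resolve the name and parent directory of *vp.  A regular (non-"..")
 * namecache entry is preferred; failing that, VOP_VPTOCNP() is invoked
 * with the vnode locked.  On success the name is prepended into buf just
 * before the old *buflen offset, *buflen is adjusted accordingly and *vp
 * is replaced with a referenced parent vnode, while the reference on the
 * original vnode is dropped.
 */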
2295int
2296vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
2297{
2298	struct vnode *dvp;
2299	struct namecache *ncp;
2300	struct mtx *vlp;
2301	int error;
2302
2303	vlp = VP2VNODELOCK(*vp);
2304	mtx_lock(vlp);
2305	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2306		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2307			break;
2308	}
2309	if (ncp != NULL) {
2310		if (*buflen < ncp->nc_nlen) {
2311			mtx_unlock(vlp);
2312			vrele(*vp);
2313			counter_u64_add(numfullpathfail4, 1);
2314			error = ENOMEM;
2315			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2316			    vp, NULL);
2317			return (error);
2318		}
2319		*buflen -= ncp->nc_nlen;
2320		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2321		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2322		    ncp->nc_name, vp);
2323		dvp = *vp;
2324		*vp = ncp->nc_dvp;
2325		vref(*vp);
2326		mtx_unlock(vlp);
2327		vrele(dvp);
2328		return (0);
2329	}
2330	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2331
2332	mtx_unlock(vlp);
2333	vn_lock(*vp, LK_SHARED | LK_RETRY);
2334	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2335	vput(*vp);
2336	if (error) {
2337		counter_u64_add(numfullpathfail2, 1);
2338		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2339		return (error);
2340	}
2341
2342	*vp = dvp;
2343	if (dvp->v_iflag & VI_DOOMED) {
2344		/* forced unmount */
2345		vrele(dvp);
2346		error = ENOENT;
2347		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2348		return (error);
2349	}
	/*
	 * The new *vp is returned to the caller with its use count
	 * still held.
	 */
2353
2354	return (0);
2355}
2356
2357/*
2358 * The magic behind kern___getcwd() and vn_fullpath().
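 *
 * Walk from vp towards rdir (or the global root), prepending one "/name"
 * component per iteration and crossing mount points through
 * mnt_vnodecovered whenever a VV_ROOT vnode is encountered.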
2359 */
2360static int
2361vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
2362    char *buf, char **retbuf, u_int buflen)
2363{
2364	int error, slash_prefixed;
2365#ifdef KDTRACE_HOOKS
2366	struct vnode *startvp = vp;
2367#endif
2368	struct vnode *vp1;
2369
2370	buflen--;
2371	buf[buflen] = '\0';
2372	error = 0;
2373	slash_prefixed = 0;
2374
2375	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2376	counter_u64_add(numfullpathcalls, 1);
2377	vref(vp);
2378	if (vp->v_type != VDIR) {
2379		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2380		if (error)
2381			return (error);
2382		if (buflen == 0) {
2383			vrele(vp);
2384			return (ENOMEM);
2385		}
2386		buf[--buflen] = '/';
2387		slash_prefixed = 1;
2388	}
2389	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must already be fully constructed,
		 * since it is either found in the namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
2396		if ((vp->v_vflag & VV_ROOT) != 0) {
2397			vn_lock(vp, LK_RETRY | LK_SHARED);
2398
2399			/*
2400			 * With the vnode locked, check for races with
2401			 * unmount, forced or not.  Note that we
2402			 * already verified that vp is not equal to
2403			 * the root vnode, which means that
2404			 * mnt_vnodecovered can be NULL only for the
2405			 * case of unmount.
2406			 */
2407			if ((vp->v_iflag & VI_DOOMED) != 0 ||
2408			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2409			    vp1->v_mountedhere != vp->v_mount) {
2410				vput(vp);
2411				error = ENOENT;
2412				SDT_PROBE3(vfs, namecache, fullpath, return,
2413				    error, vp, NULL);
2414				break;
2415			}
2416
2417			vref(vp1);
2418			vput(vp);
2419			vp = vp1;
2420			continue;
2421		}
2422		if (vp->v_type != VDIR) {
2423			vrele(vp);
2424			counter_u64_add(numfullpathfail1, 1);
2425			error = ENOTDIR;
2426			SDT_PROBE3(vfs, namecache, fullpath, return,
2427			    error, vp, NULL);
2428			break;
2429		}
2430		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2431		if (error)
2432			break;
2433		if (buflen == 0) {
2434			vrele(vp);
2435			error = ENOMEM;
2436			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2437			    startvp, NULL);
2438			break;
2439		}
2440		buf[--buflen] = '/';
2441		slash_prefixed = 1;
2442	}
2443	if (error)
2444		return (error);
2445	if (!slash_prefixed) {
2446		if (buflen == 0) {
2447			vrele(vp);
2448			counter_u64_add(numfullpathfail4, 1);
2449			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2450			    startvp, NULL);
2451			return (ENOMEM);
2452		}
2453		buf[--buflen] = '/';
2454	}
2455	counter_u64_add(numfullpathfound, 1);
2456	vrele(vp);
2457
2458	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
2459	*retbuf = buf + buflen;
2460	return (0);
2461}
2462
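/*
 * Obtain the parent directory of vp as recorded by a regular (non-"..")
 * namecache entry pointing at vp.  The parent is returned referenced and
 * shared-locked, or NULL if there is no such entry or locking it would
 * have to sleep.
 */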
2463struct vnode *
2464vn_dir_dd_ino(struct vnode *vp)
2465{
2466	struct namecache *ncp;
2467	struct vnode *ddvp;
2468	struct mtx *vlp;
2469
2470	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2471	vlp = VP2VNODELOCK(vp);
2472	mtx_lock(vlp);
2473	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2474		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2475			continue;
2476		ddvp = ncp->nc_dvp;
2477		vhold(ddvp);
2478		mtx_unlock(vlp);
2479		if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
2480			return (NULL);
2481		return (ddvp);
2482	}
2483	mtx_unlock(vlp);
2484	return (NULL);
2485}
2486
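/*
 * Copy a cached name of vp (ignoring ".." entries) into buf, truncated
 * to buflen - 1 characters and NUL-terminated.  Returns ENOENT if the
 * namecache holds no such entry.
 */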
2487int
2488vn_commname(struct vnode *vp, char *buf, u_int buflen)
2489{
2490	struct namecache *ncp;
2491	struct mtx *vlp;
2492	int l;
2493
2494	vlp = VP2VNODELOCK(vp);
2495	mtx_lock(vlp);
2496	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2497		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2498			break;
2499	if (ncp == NULL) {
2500		mtx_unlock(vlp);
2501		return (ENOENT);
2502	}
2503	l = min(ncp->nc_nlen, buflen - 1);
2504	memcpy(buf, ncp->nc_name, l);
2505	mtx_unlock(vlp);
2506	buf[l] = '\0';
2507	return (0);
2508}
2509
2510/* ABI compat shims for old kernel modules. */
2511#undef cache_enter
2512
2513void	cache_enter(struct vnode *dvp, struct vnode *vp,
2514	    struct componentname *cnp);
2515
2516void
2517cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2518{
2519
2520	cache_enter_time(dvp, vp, cnp, NULL, NULL);
2521}
2522
/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * The vnode is re-locked on success or ENODEV; otherwise it is unlocked.
 *
 * If the sysctl debug.disablefullpath is set, ENODEV is returned, the
 * vnode is left locked and the path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
2536int
2537vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2538    u_int pathlen)
2539{
2540	struct nameidata nd;
2541	struct vnode *vp1;
2542	char *rpath, *fbuf;
2543	int error;
2544
2545	ASSERT_VOP_ELOCKED(vp, __func__);
2546
2547	/* Return ENODEV if sysctl debug.disablefullpath==1 */
2548	if (__predict_false(disablefullpath))
2549		return (ENODEV);
2550
2551	/* Construct global filesystem path from vp. */
2552	VOP_UNLOCK(vp, 0);
2553	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2554
2555	if (error != 0) {
2556		vrele(vp);
2557		return (error);
2558	}
2559
2560	if (strlen(rpath) >= pathlen) {
2561		vrele(vp);
2562		error = ENAMETOOLONG;
2563		goto out;
2564	}
2565
	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If the vnode was renamed, return ENOENT.
	 */
2571	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2572	    UIO_SYSSPACE, path, td);
2573	error = namei(&nd);
2574	if (error != 0) {
2575		vrele(vp);
2576		goto out;
2577	}
2578	NDFREE(&nd, NDF_ONLY_PNBUF);
2579	vp1 = nd.ni_vp;
2580	vrele(vp);
2581	if (vp1 == vp)
2582		strcpy(path, rpath);
2583	else {
2584		vput(vp1);
2585		error = ENOENT;
2586	}
2587
2588out:
2589	free(fbuf, M_TEMP);
2590	return (error);
2591}
2592
2593#ifdef DDB
2594static void
2595db_print_vpath(struct vnode *vp)
2596{
2597
2598	while (vp != NULL) {
2599		db_printf("%p: ", vp);
2600		if (vp == rootvnode) {
2601			db_printf("/");
2602			vp = NULL;
2603		} else {
2604			if (vp->v_vflag & VV_ROOT) {
2605				db_printf("<mount point>");
2606				vp = vp->v_mount->mnt_vnodecovered;
2607			} else {
2608				struct namecache *ncp;
2609				char *ncn;
2610				int i;
2611
2612				ncp = TAILQ_FIRST(&vp->v_cache_dst);
2613				if (ncp != NULL) {
2614					ncn = ncp->nc_name;
2615					for (i = 0; i < ncp->nc_nlen; i++)
2616						db_printf("%c", *ncn++);
2617					vp = ncp->nc_dvp;
2618				} else {
2619					vp = NULL;
2620				}
2621			}
2622		}
2623		db_printf("\n");
2624	}
2625
2626	return;
2627}
2628
2629DB_SHOW_COMMAND(vpath, db_show_vpath)
2630{
2631	struct vnode *vp;
2632
2633	if (!have_addr) {
2634		db_printf("usage: show vpath <struct vnode *>\n");
2635		return;
2636	}
2637
2638	vp = (struct vnode *)addr;
2639	db_print_vpath(vp);
2640}
2641
2642#endif
2643