1/*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37#include <sys/cdefs.h> 38__FBSDID("$FreeBSD$"); 39 40#include "opt_ddb.h" 41#include "opt_ktrace.h" 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/counter.h> 46#include <sys/filedesc.h> 47#include <sys/fnv_hash.h> 48#include <sys/kernel.h> 49#include <sys/lock.h> 50#include <sys/malloc.h> 51#include <sys/fcntl.h> 52#include <sys/mount.h> 53#include <sys/namei.h> 54#include <sys/proc.h> 55#include <sys/rwlock.h> 56#include <sys/sdt.h> 57#include <sys/smp.h> 58#include <sys/syscallsubr.h> 59#include <sys/sysctl.h> 60#include <sys/sysproto.h> 61#include <sys/vnode.h> 62#ifdef KTRACE 63#include <sys/ktrace.h> 64#endif 65 66#ifdef DDB 67#include <ddb/ddb.h> 68#endif 69 70#include <vm/uma.h> 71 72SDT_PROVIDER_DECLARE(vfs); 73SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 74 "struct vnode *"); 75SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 76 "char *"); 77SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 78SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 79 "char *", "struct vnode *"); 80SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 81SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 82 "struct vnode *", "char *"); 83SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 84 "struct vnode *"); 85SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 86 "struct vnode *", "char *"); 87SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 88 "char *"); 89SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 90SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 91SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 92SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 93 "struct vnode *"); 94SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 95 "char *"); 96SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 97 "char *"); 98 99/* 100 * This structure describes the elements in the cache of recent 101 * names looked up by namei. 102 */ 103 104struct namecache { 105 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 106 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 107 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 108 struct vnode *nc_dvp; /* vnode of parent of name */ 109 union { 110 struct vnode *nu_vp; /* vnode the name refers to */ 111 } n_un; 112 u_char nc_flag; /* flag bits */ 113 u_char nc_nlen; /* length of name */ 114 char nc_name[0]; /* segment name + nul */ 115}; 116 117/* 118 * struct namecache_ts repeats struct namecache layout up to the 119 * nc_nlen member. 120 * struct namecache_ts is used in place of struct namecache when time(s) need 121 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 122 * both a non-dotdot directory name plus dotdot for the directory's 123 * parent. 124 * 125 * See below for alignment requirement. 126 */ 127struct namecache_ts { 128 struct timespec nc_time; /* timespec provided by fs */ 129 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 130 int nc_ticks; /* ticks value when entry was added */ 131 struct namecache nc_nc; 132}; 133 134/* 135 * At least mips n32 performs 64-bit accesses to timespec as found 136 * in namecache_ts and requires them to be aligned. Since others 137 * may be in the same spot suffer a little bit and enforce the 138 * alignment for everyone. Note this is a nop for 64-bit platforms. 139 */ 140#define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 141 142#define nc_vp n_un.nu_vp 143 144/* 145 * Flags in namecache.nc_flag 146 */ 147#define NCF_WHITE 0x01 148#define NCF_ISDOTDOT 0x02 149#define NCF_TS 0x04 150#define NCF_DTS 0x08 151#define NCF_DVDROP 0x10 152#define NCF_NEGATIVE 0x20 153#define NCF_HOTNEGATIVE 0x40 154 155/* 156 * Name caching works as follows: 157 * 158 * Names found by directory scans are retained in a cache 159 * for future reference. It is managed LRU, so frequently 160 * used names will hang around. Cache is indexed by hash value 161 * obtained from (dvp, name) where dvp refers to the directory 162 * containing name. 163 * 164 * If it is a "negative" entry, (i.e. for a name that is known NOT to 165 * exist) the vnode pointer will be NULL. 166 * 167 * Upon reaching the last segment of a path, if the reference 168 * is for DELETE, or NOCACHE is set (rewrite), and the 169 * name is located in the cache, it will be dropped. 170 * 171 * These locks are used (in the order in which they can be taken): 172 * NAME TYPE ROLE 173 * vnodelock mtx vnode lists and v_cache_dd field protection 174 * bucketlock rwlock for access to given set of hash buckets 175 * neglist mtx negative entry LRU management 176 * 177 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 178 * shrinking the LRU list. 179 * 180 * It is legal to take multiple vnodelock and bucketlock locks. The locking 181 * order is lower address first. Both are recursive. 182 * 183 * "." lookups are lockless. 184 * 185 * ".." and vnode -> name lookups require vnodelock. 186 * 187 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 188 * 189 * Insertions and removals of entries require involved vnodes and bucketlocks 190 * to be write-locked to prevent other threads from seeing the entry. 191 * 192 * Some lookups result in removal of the found entry (e.g. getting rid of a 193 * negative entry with the intent to create a positive one), which poses a 194 * problem when multiple threads reach the state. Similarly, two different 195 * threads can purge two different vnodes and try to remove the same name. 196 * 197 * If the already held vnode lock is lower than the second required lock, we 198 * can just take the other lock. However, in the opposite case, this could 199 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 200 * the first node, locking everything in order and revalidating the state. 201 */ 202 203/* 204 * Structures associated with name caching. 205 */ 206#define NCHHASH(hash) \ 207 (&nchashtbl[(hash) & nchash]) 208static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 209static u_long __read_mostly nchash; /* size of hash table */ 210SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 211 "Size of namecache hash table"); 212static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 213SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 214 "Ratio of negative namecache entries"); 215static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 216static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 217u_int ncsizefactor = 2; 218SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 219 "Size factor for namecache"); 220static u_int __read_mostly ncpurgeminvnodes; 221SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 222 "Number of vnodes below which purgevfs ignores the request"); 223static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 224 225struct nchstats nchstats; /* cache effectiveness statistics */ 226 227static struct mtx __exclusive_cache_line ncneg_shrink_lock; 228static int shrink_list_turn; 229 230struct neglist { 231 struct mtx nl_lock; 232 TAILQ_HEAD(, namecache) nl_list; 233} __aligned(CACHE_LINE_SIZE); 234 235static struct neglist __read_mostly *neglists; 236static struct neglist ncneg_hot; 237static u_long numhotneg; 238 239#define ncneghash 3 240#define numneglists (ncneghash + 1) 241static inline struct neglist * 242NCP2NEGLIST(struct namecache *ncp) 243{ 244 245 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 246} 247 248#define numbucketlocks (ncbuckethash + 1) 249static u_int __read_mostly ncbuckethash; 250static struct rwlock_padalign __read_mostly *bucketlocks; 251#define HASH2BUCKETLOCK(hash) \ 252 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 253 254#define numvnodelocks (ncvnodehash + 1) 255static u_int __read_mostly ncvnodehash; 256static struct mtx __read_mostly *vnodelocks; 257static inline struct mtx * 258VP2VNODELOCK(struct vnode *vp) 259{ 260 261 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 262} 263 264/* 265 * UMA zones for the VFS cache. 266 * 267 * The small cache is used for entries with short names, which are the 268 * most common. The large cache is used for entries which are too big to 269 * fit in the small cache. 270 */ 271static uma_zone_t __read_mostly cache_zone_small; 272static uma_zone_t __read_mostly cache_zone_small_ts; 273static uma_zone_t __read_mostly cache_zone_large; 274static uma_zone_t __read_mostly cache_zone_large_ts; 275 276#define CACHE_PATH_CUTOFF 35 277 278static struct namecache * 279cache_alloc(int len, int ts) 280{ 281 struct namecache_ts *ncp_ts; 282 struct namecache *ncp; 283 284 if (__predict_false(ts)) { 285 if (len <= CACHE_PATH_CUTOFF) 286 ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); 287 else 288 ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); 289 ncp = &ncp_ts->nc_nc; 290 } else { 291 if (len <= CACHE_PATH_CUTOFF) 292 ncp = uma_zalloc(cache_zone_small, M_WAITOK); 293 else 294 ncp = uma_zalloc(cache_zone_large, M_WAITOK); 295 } 296 return (ncp); 297} 298 299static void 300cache_free(struct namecache *ncp) 301{ 302 struct namecache_ts *ncp_ts; 303 304 if (ncp == NULL) 305 return; 306 if ((ncp->nc_flag & NCF_DVDROP) != 0) 307 vdrop(ncp->nc_dvp); 308 if (__predict_false(ncp->nc_flag & NCF_TS)) { 309 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 310 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 311 uma_zfree(cache_zone_small_ts, ncp_ts); 312 else 313 uma_zfree(cache_zone_large_ts, ncp_ts); 314 } else { 315 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 316 uma_zfree(cache_zone_small, ncp); 317 else 318 uma_zfree(cache_zone_large, ncp); 319 } 320} 321 322static void 323cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 324{ 325 struct namecache_ts *ncp_ts; 326 327 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 328 (tsp == NULL && ticksp == NULL), 329 ("No NCF_TS")); 330 331 if (tsp == NULL && ticksp == NULL) 332 return; 333 334 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 335 if (tsp != NULL) 336 *tsp = ncp_ts->nc_time; 337 if (ticksp != NULL) 338 *ticksp = ncp_ts->nc_ticks; 339} 340 341static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 342SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 343 "VFS namecache enabled"); 344 345/* Export size information to userland */ 346SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 347 sizeof(struct namecache), "sizeof(struct namecache)"); 348 349/* 350 * The new name cache statistics 351 */ 352static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, 353 "Name cache statistics"); 354#define STATNODE_ULONG(name, descr) \ 355 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 356#define STATNODE_COUNTER(name, descr) \ 357 static counter_u64_t __read_mostly name; \ 358 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); 359STATNODE_ULONG(numneg, "Number of negative cache entries"); 360STATNODE_ULONG(numcache, "Number of cache entries"); 361STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 362STATNODE_COUNTER(numcalls, "Number of cache lookups"); 363STATNODE_COUNTER(dothits, "Number of '.' hits"); 364STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 365STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 366STATNODE_COUNTER(nummiss, "Number of cache misses"); 367STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 368STATNODE_COUNTER(numposzaps, 369 "Number of cache hits (positive) we do not want to cache"); 370STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 371STATNODE_COUNTER(numnegzaps, 372 "Number of cache hits (negative) we do not want to cache"); 373STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 374/* These count for kern___getcwd(), too. */ 375STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 376STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 377STATNODE_COUNTER(numfullpathfail2, 378 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 379STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 380STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 381STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 382 "Number of successful removals after relocking"); 383static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 384 "Number of times zap_and_exit failed to lock"); 385static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 386 "Number of times zap_and_exit failed to lock"); 387static long cache_lock_vnodes_cel_3_failures; 388STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 389 "Number of times 3-way vnode locking failed"); 390STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 391STATNODE_COUNTER(numneg_evicted, 392 "Number of negative entries evicted when adding a new entry"); 393STATNODE_COUNTER(shrinking_skipped, 394 "Number of times shrinking was already in progress"); 395 396static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 397static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 398 char *buf, char **retbuf, u_int buflen); 399 400static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 401 402static int cache_yield; 403SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 404 "Number of times cache called yield"); 405 406static void __noinline 407cache_maybe_yield(void) 408{ 409 410 if (should_yield()) { 411 cache_yield++; 412 kern_yield(PRI_USER); 413 } 414} 415 416static inline void 417cache_assert_vlp_locked(struct mtx *vlp) 418{ 419 420 if (vlp != NULL) 421 mtx_assert(vlp, MA_OWNED); 422} 423 424static inline void 425cache_assert_vnode_locked(struct vnode *vp) 426{ 427 struct mtx *vlp; 428 429 vlp = VP2VNODELOCK(vp); 430 cache_assert_vlp_locked(vlp); 431} 432 433static uint32_t 434cache_get_hash(char *name, u_char len, struct vnode *dvp) 435{ 436 uint32_t hash; 437 438 hash = fnv_32_buf(name, len, FNV1_32_INIT); 439 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 440 return (hash); 441} 442 443static inline struct rwlock * 444NCP2BUCKETLOCK(struct namecache *ncp) 445{ 446 uint32_t hash; 447 448 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 449 return (HASH2BUCKETLOCK(hash)); 450} 451 452#ifdef INVARIANTS 453static void 454cache_assert_bucket_locked(struct namecache *ncp, int mode) 455{ 456 struct rwlock *blp; 457 458 blp = NCP2BUCKETLOCK(ncp); 459 rw_assert(blp, mode); 460} 461#else 462#define cache_assert_bucket_locked(x, y) do { } while (0) 463#endif 464 465#define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 466static void 467_cache_sort_vnodes(void **p1, void **p2) 468{ 469 void *tmp; 470 471 MPASS(*p1 != NULL || *p2 != NULL); 472 473 if (*p1 > *p2) { 474 tmp = *p2; 475 *p2 = *p1; 476 *p1 = tmp; 477 } 478} 479 480static void 481cache_lock_all_buckets(void) 482{ 483 u_int i; 484 485 for (i = 0; i < numbucketlocks; i++) 486 rw_wlock(&bucketlocks[i]); 487} 488 489static void 490cache_unlock_all_buckets(void) 491{ 492 u_int i; 493 494 for (i = 0; i < numbucketlocks; i++) 495 rw_wunlock(&bucketlocks[i]); 496} 497 498static void 499cache_lock_all_vnodes(void) 500{ 501 u_int i; 502 503 for (i = 0; i < numvnodelocks; i++) 504 mtx_lock(&vnodelocks[i]); 505} 506 507static void 508cache_unlock_all_vnodes(void) 509{ 510 u_int i; 511 512 for (i = 0; i < numvnodelocks; i++) 513 mtx_unlock(&vnodelocks[i]); 514} 515 516static int 517cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 518{ 519 520 cache_sort_vnodes(&vlp1, &vlp2); 521 522 if (vlp1 != NULL) { 523 if (!mtx_trylock(vlp1)) 524 return (EAGAIN); 525 } 526 if (!mtx_trylock(vlp2)) { 527 if (vlp1 != NULL) 528 mtx_unlock(vlp1); 529 return (EAGAIN); 530 } 531 532 return (0); 533} 534 535static void 536cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 537{ 538 539 MPASS(vlp1 != NULL || vlp2 != NULL); 540 MPASS(vlp1 <= vlp2); 541 542 if (vlp1 != NULL) 543 mtx_lock(vlp1); 544 if (vlp2 != NULL) 545 mtx_lock(vlp2); 546} 547 548static void 549cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 550{ 551 552 MPASS(vlp1 != NULL || vlp2 != NULL); 553 554 if (vlp1 != NULL) 555 mtx_unlock(vlp1); 556 if (vlp2 != NULL) 557 mtx_unlock(vlp2); 558} 559 560static int 561sysctl_nchstats(SYSCTL_HANDLER_ARGS) 562{ 563 struct nchstats snap; 564 565 if (req->oldptr == NULL) 566 return (SYSCTL_OUT(req, 0, sizeof(snap))); 567 568 snap = nchstats; 569 snap.ncs_goodhits = counter_u64_fetch(numposhits); 570 snap.ncs_neghits = counter_u64_fetch(numneghits); 571 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 572 counter_u64_fetch(numnegzaps); 573 snap.ncs_miss = counter_u64_fetch(nummisszap) + 574 counter_u64_fetch(nummiss); 575 576 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 577} 578SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 579 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 580 "VFS cache effectiveness statistics"); 581 582#ifdef DIAGNOSTIC 583/* 584 * Grab an atomic snapshot of the name cache hash chain lengths 585 */ 586static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, 587 "hash table stats"); 588 589static int 590sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 591{ 592 struct nchashhead *ncpp; 593 struct namecache *ncp; 594 int i, error, n_nchash, *cntbuf; 595 596retry: 597 n_nchash = nchash + 1; /* nchash is max index, not count */ 598 if (req->oldptr == NULL) 599 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 600 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 601 cache_lock_all_buckets(); 602 if (n_nchash != nchash + 1) { 603 cache_unlock_all_buckets(); 604 free(cntbuf, M_TEMP); 605 goto retry; 606 } 607 /* Scan hash tables counting entries */ 608 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 609 LIST_FOREACH(ncp, ncpp, nc_hash) 610 cntbuf[i]++; 611 cache_unlock_all_buckets(); 612 for (error = 0, i = 0; i < n_nchash; i++) 613 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 614 break; 615 free(cntbuf, M_TEMP); 616 return (error); 617} 618SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 619 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 620 "nchash chain lengths"); 621 622static int 623sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 624{ 625 int error; 626 struct nchashhead *ncpp; 627 struct namecache *ncp; 628 int n_nchash; 629 int count, maxlength, used, pct; 630 631 if (!req->oldptr) 632 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 633 634 cache_lock_all_buckets(); 635 n_nchash = nchash + 1; /* nchash is max index, not count */ 636 used = 0; 637 maxlength = 0; 638 639 /* Scan hash tables for applicable entries */ 640 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 641 count = 0; 642 LIST_FOREACH(ncp, ncpp, nc_hash) { 643 count++; 644 } 645 if (count) 646 used++; 647 if (maxlength < count) 648 maxlength = count; 649 } 650 n_nchash = nchash + 1; 651 cache_unlock_all_buckets(); 652 pct = (used * 100) / (n_nchash / 100); 653 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 654 if (error) 655 return (error); 656 error = SYSCTL_OUT(req, &used, sizeof(used)); 657 if (error) 658 return (error); 659 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 660 if (error) 661 return (error); 662 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 663 if (error) 664 return (error); 665 return (0); 666} 667SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 668 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 669 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 670#endif 671 672/* 673 * Negative entries management 674 * 675 * A variation of LRU scheme is used. New entries are hashed into one of 676 * numneglists cold lists. Entries get promoted to the hot list on first hit. 677 * 678 * The shrinker will demote hot list head and evict from the cold list in a 679 * round-robin manner. 680 */ 681static void 682cache_negative_hit(struct namecache *ncp) 683{ 684 struct neglist *neglist; 685 686 MPASS(ncp->nc_flag & NCF_NEGATIVE); 687 if (ncp->nc_flag & NCF_HOTNEGATIVE) 688 return; 689 neglist = NCP2NEGLIST(ncp); 690 mtx_lock(&ncneg_hot.nl_lock); 691 mtx_lock(&neglist->nl_lock); 692 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 693 numhotneg++; 694 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 695 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 696 ncp->nc_flag |= NCF_HOTNEGATIVE; 697 } 698 mtx_unlock(&neglist->nl_lock); 699 mtx_unlock(&ncneg_hot.nl_lock); 700} 701 702static void 703cache_negative_insert(struct namecache *ncp, bool neg_locked) 704{ 705 struct neglist *neglist; 706 707 MPASS(ncp->nc_flag & NCF_NEGATIVE); 708 cache_assert_bucket_locked(ncp, RA_WLOCKED); 709 neglist = NCP2NEGLIST(ncp); 710 if (!neg_locked) { 711 mtx_lock(&neglist->nl_lock); 712 } else { 713 mtx_assert(&neglist->nl_lock, MA_OWNED); 714 } 715 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 716 if (!neg_locked) 717 mtx_unlock(&neglist->nl_lock); 718 atomic_add_rel_long(&numneg, 1); 719} 720 721static void 722cache_negative_remove(struct namecache *ncp, bool neg_locked) 723{ 724 struct neglist *neglist; 725 bool hot_locked = false; 726 bool list_locked = false; 727 728 MPASS(ncp->nc_flag & NCF_NEGATIVE); 729 cache_assert_bucket_locked(ncp, RA_WLOCKED); 730 neglist = NCP2NEGLIST(ncp); 731 if (!neg_locked) { 732 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 733 hot_locked = true; 734 mtx_lock(&ncneg_hot.nl_lock); 735 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 736 list_locked = true; 737 mtx_lock(&neglist->nl_lock); 738 } 739 } else { 740 list_locked = true; 741 mtx_lock(&neglist->nl_lock); 742 } 743 } 744 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 745 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 746 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 747 numhotneg--; 748 } else { 749 mtx_assert(&neglist->nl_lock, MA_OWNED); 750 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 751 } 752 if (list_locked) 753 mtx_unlock(&neglist->nl_lock); 754 if (hot_locked) 755 mtx_unlock(&ncneg_hot.nl_lock); 756 atomic_subtract_rel_long(&numneg, 1); 757} 758 759static void 760cache_negative_shrink_select(int start, struct namecache **ncpp, 761 struct neglist **neglistpp) 762{ 763 struct neglist *neglist; 764 struct namecache *ncp; 765 int i; 766 767 *ncpp = ncp = NULL; 768 neglist = NULL; 769 770 for (i = start; i < numneglists; i++) { 771 neglist = &neglists[i]; 772 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 773 continue; 774 mtx_lock(&neglist->nl_lock); 775 ncp = TAILQ_FIRST(&neglist->nl_list); 776 if (ncp != NULL) 777 break; 778 mtx_unlock(&neglist->nl_lock); 779 } 780 781 *neglistpp = neglist; 782 *ncpp = ncp; 783} 784 785static void 786cache_negative_zap_one(void) 787{ 788 struct namecache *ncp, *ncp2; 789 struct neglist *neglist; 790 struct mtx *dvlp; 791 struct rwlock *blp; 792 793 if (mtx_owner(&ncneg_shrink_lock) != NULL || 794 !mtx_trylock(&ncneg_shrink_lock)) { 795 counter_u64_add(shrinking_skipped, 1); 796 return; 797 } 798 799 mtx_lock(&ncneg_hot.nl_lock); 800 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 801 if (ncp != NULL) { 802 neglist = NCP2NEGLIST(ncp); 803 mtx_lock(&neglist->nl_lock); 804 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 805 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 806 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 807 numhotneg--; 808 mtx_unlock(&neglist->nl_lock); 809 } 810 mtx_unlock(&ncneg_hot.nl_lock); 811 812 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 813 shrink_list_turn++; 814 if (shrink_list_turn == numneglists) 815 shrink_list_turn = 0; 816 if (ncp == NULL && shrink_list_turn == 0) 817 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 818 mtx_unlock(&ncneg_shrink_lock); 819 if (ncp == NULL) 820 return; 821 822 MPASS(ncp->nc_flag & NCF_NEGATIVE); 823 dvlp = VP2VNODELOCK(ncp->nc_dvp); 824 blp = NCP2BUCKETLOCK(ncp); 825 mtx_unlock(&neglist->nl_lock); 826 mtx_lock(dvlp); 827 rw_wlock(blp); 828 mtx_lock(&neglist->nl_lock); 829 ncp2 = TAILQ_FIRST(&neglist->nl_list); 830 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 831 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 832 ncp = NULL; 833 } else { 834 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 835 ncp->nc_name); 836 837 cache_zap_locked(ncp, true); 838 counter_u64_add(numneg_evicted, 1); 839 } 840 mtx_unlock(&neglist->nl_lock); 841 rw_wunlock(blp); 842 mtx_unlock(dvlp); 843 cache_free(ncp); 844} 845 846/* 847 * cache_zap_locked(): 848 * 849 * Removes a namecache entry from cache, whether it contains an actual 850 * pointer to a vnode or if it is just a negative cache entry. 851 */ 852static void 853cache_zap_locked(struct namecache *ncp, bool neg_locked) 854{ 855 856 if (!(ncp->nc_flag & NCF_NEGATIVE)) 857 cache_assert_vnode_locked(ncp->nc_vp); 858 cache_assert_vnode_locked(ncp->nc_dvp); 859 cache_assert_bucket_locked(ncp, RA_WLOCKED); 860 861 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 862 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 863 LIST_REMOVE(ncp, nc_hash); 864 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 865 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 866 ncp->nc_name, ncp->nc_vp); 867 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 868 if (ncp == ncp->nc_vp->v_cache_dd) 869 ncp->nc_vp->v_cache_dd = NULL; 870 } else { 871 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 872 ncp->nc_name); 873 cache_negative_remove(ncp, neg_locked); 874 } 875 if (ncp->nc_flag & NCF_ISDOTDOT) { 876 if (ncp == ncp->nc_dvp->v_cache_dd) 877 ncp->nc_dvp->v_cache_dd = NULL; 878 } else { 879 LIST_REMOVE(ncp, nc_src); 880 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 881 ncp->nc_flag |= NCF_DVDROP; 882 counter_u64_add(numcachehv, -1); 883 } 884 } 885 atomic_subtract_rel_long(&numcache, 1); 886} 887 888static void 889cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 890{ 891 struct rwlock *blp; 892 893 MPASS(ncp->nc_dvp == vp); 894 MPASS(ncp->nc_flag & NCF_NEGATIVE); 895 cache_assert_vnode_locked(vp); 896 897 blp = NCP2BUCKETLOCK(ncp); 898 rw_wlock(blp); 899 cache_zap_locked(ncp, false); 900 rw_wunlock(blp); 901} 902 903static bool 904cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 905 struct mtx **vlpp) 906{ 907 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 908 struct rwlock *blp; 909 910 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 911 cache_assert_vnode_locked(vp); 912 913 if (ncp->nc_flag & NCF_NEGATIVE) { 914 if (*vlpp != NULL) { 915 mtx_unlock(*vlpp); 916 *vlpp = NULL; 917 } 918 cache_zap_negative_locked_vnode_kl(ncp, vp); 919 return (true); 920 } 921 922 pvlp = VP2VNODELOCK(vp); 923 blp = NCP2BUCKETLOCK(ncp); 924 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 925 vlp2 = VP2VNODELOCK(ncp->nc_vp); 926 927 if (*vlpp == vlp1 || *vlpp == vlp2) { 928 to_unlock = *vlpp; 929 *vlpp = NULL; 930 } else { 931 if (*vlpp != NULL) { 932 mtx_unlock(*vlpp); 933 *vlpp = NULL; 934 } 935 cache_sort_vnodes(&vlp1, &vlp2); 936 if (vlp1 == pvlp) { 937 mtx_lock(vlp2); 938 to_unlock = vlp2; 939 } else { 940 if (!mtx_trylock(vlp1)) 941 goto out_relock; 942 to_unlock = vlp1; 943 } 944 } 945 rw_wlock(blp); 946 cache_zap_locked(ncp, false); 947 rw_wunlock(blp); 948 if (to_unlock != NULL) 949 mtx_unlock(to_unlock); 950 return (true); 951 952out_relock: 953 mtx_unlock(vlp2); 954 mtx_lock(vlp1); 955 mtx_lock(vlp2); 956 MPASS(*vlpp == NULL); 957 *vlpp = vlp1; 958 return (false); 959} 960 961static int __noinline 962cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 963{ 964 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 965 struct rwlock *blp; 966 int error = 0; 967 968 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 969 cache_assert_vnode_locked(vp); 970 971 pvlp = VP2VNODELOCK(vp); 972 if (ncp->nc_flag & NCF_NEGATIVE) { 973 cache_zap_negative_locked_vnode_kl(ncp, vp); 974 goto out; 975 } 976 977 blp = NCP2BUCKETLOCK(ncp); 978 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 979 vlp2 = VP2VNODELOCK(ncp->nc_vp); 980 cache_sort_vnodes(&vlp1, &vlp2); 981 if (vlp1 == pvlp) { 982 mtx_lock(vlp2); 983 to_unlock = vlp2; 984 } else { 985 if (!mtx_trylock(vlp1)) { 986 error = EAGAIN; 987 goto out; 988 } 989 to_unlock = vlp1; 990 } 991 rw_wlock(blp); 992 cache_zap_locked(ncp, false); 993 rw_wunlock(blp); 994 mtx_unlock(to_unlock); 995out: 996 mtx_unlock(pvlp); 997 return (error); 998} 999 1000/* 1001 * If trylocking failed we can get here. We know enough to take all needed locks 1002 * in the right order and re-lookup the entry. 1003 */ 1004static int 1005cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1006 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1007 struct rwlock *blp) 1008{ 1009 struct namecache *rncp; 1010 1011 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1012 1013 cache_sort_vnodes(&dvlp, &vlp); 1014 cache_lock_vnodes(dvlp, vlp); 1015 rw_wlock(blp); 1016 LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1017 if (rncp == ncp && rncp->nc_dvp == dvp && 1018 rncp->nc_nlen == cnp->cn_namelen && 1019 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1020 break; 1021 } 1022 if (rncp != NULL) { 1023 cache_zap_locked(rncp, false); 1024 rw_wunlock(blp); 1025 cache_unlock_vnodes(dvlp, vlp); 1026 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1027 return (0); 1028 } 1029 1030 rw_wunlock(blp); 1031 cache_unlock_vnodes(dvlp, vlp); 1032 return (EAGAIN); 1033} 1034 1035static int __noinline 1036cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1037 uint32_t hash, struct rwlock *blp) 1038{ 1039 struct mtx *dvlp, *vlp; 1040 struct vnode *dvp; 1041 1042 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1043 1044 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1045 vlp = NULL; 1046 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1047 vlp = VP2VNODELOCK(ncp->nc_vp); 1048 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1049 cache_zap_locked(ncp, false); 1050 rw_wunlock(blp); 1051 cache_unlock_vnodes(dvlp, vlp); 1052 return (0); 1053 } 1054 1055 dvp = ncp->nc_dvp; 1056 rw_wunlock(blp); 1057 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1058} 1059 1060static int __noinline 1061cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1062 uint32_t hash, struct rwlock *blp) 1063{ 1064 struct mtx *dvlp, *vlp; 1065 struct vnode *dvp; 1066 1067 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1068 1069 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1070 vlp = NULL; 1071 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1072 vlp = VP2VNODELOCK(ncp->nc_vp); 1073 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1074 rw_runlock(blp); 1075 rw_wlock(blp); 1076 cache_zap_locked(ncp, false); 1077 rw_wunlock(blp); 1078 cache_unlock_vnodes(dvlp, vlp); 1079 return (0); 1080 } 1081 1082 dvp = ncp->nc_dvp; 1083 rw_runlock(blp); 1084 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1085} 1086 1087static int 1088cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1089 struct mtx **vlpp1, struct mtx **vlpp2) 1090{ 1091 struct mtx *dvlp, *vlp; 1092 1093 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1094 1095 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1096 vlp = NULL; 1097 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1098 vlp = VP2VNODELOCK(ncp->nc_vp); 1099 cache_sort_vnodes(&dvlp, &vlp); 1100 1101 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1102 cache_zap_locked(ncp, false); 1103 cache_unlock_vnodes(dvlp, vlp); 1104 *vlpp1 = NULL; 1105 *vlpp2 = NULL; 1106 return (0); 1107 } 1108 1109 if (*vlpp1 != NULL) 1110 mtx_unlock(*vlpp1); 1111 if (*vlpp2 != NULL) 1112 mtx_unlock(*vlpp2); 1113 *vlpp1 = NULL; 1114 *vlpp2 = NULL; 1115 1116 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1117 cache_zap_locked(ncp, false); 1118 cache_unlock_vnodes(dvlp, vlp); 1119 return (0); 1120 } 1121 1122 rw_wunlock(blp); 1123 *vlpp1 = dvlp; 1124 *vlpp2 = vlp; 1125 if (*vlpp1 != NULL) 1126 mtx_lock(*vlpp1); 1127 mtx_lock(*vlpp2); 1128 rw_wlock(blp); 1129 return (EAGAIN); 1130} 1131 1132static void 1133cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1134{ 1135 1136 if (blp != NULL) { 1137 rw_runlock(blp); 1138 } else { 1139 mtx_unlock(vlp); 1140 } 1141} 1142 1143static int __noinline 1144cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1145 struct timespec *tsp, int *ticksp) 1146{ 1147 int ltype; 1148 1149 *vpp = dvp; 1150 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1151 dvp, cnp->cn_nameptr); 1152 counter_u64_add(dothits, 1); 1153 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1154 if (tsp != NULL) 1155 timespecclear(tsp); 1156 if (ticksp != NULL) 1157 *ticksp = ticks; 1158 vrefact(*vpp); 1159 /* 1160 * When we lookup "." we still can be asked to lock it 1161 * differently... 1162 */ 1163 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1164 if (ltype != VOP_ISLOCKED(*vpp)) { 1165 if (ltype == LK_EXCLUSIVE) { 1166 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1167 if ((*vpp)->v_iflag & VI_DOOMED) { 1168 /* forced unmount */ 1169 vrele(*vpp); 1170 *vpp = NULL; 1171 return (ENOENT); 1172 } 1173 } else 1174 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1175 } 1176 return (-1); 1177} 1178 1179static __noinline int 1180cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1181 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1182{ 1183 struct namecache *ncp; 1184 struct rwlock *blp; 1185 struct mtx *dvlp, *dvlp2; 1186 uint32_t hash; 1187 int error; 1188 1189 if (cnp->cn_namelen == 2 && 1190 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1191 counter_u64_add(dotdothits, 1); 1192 dvlp = VP2VNODELOCK(dvp); 1193 dvlp2 = NULL; 1194 mtx_lock(dvlp); 1195retry_dotdot: 1196 ncp = dvp->v_cache_dd; 1197 if (ncp == NULL) { 1198 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1199 "..", NULL); 1200 mtx_unlock(dvlp); 1201 if (dvlp2 != NULL) 1202 mtx_unlock(dvlp2); 1203 return (0); 1204 } 1205 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1206 if (ncp->nc_dvp != dvp) 1207 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1208 if (!cache_zap_locked_vnode_kl2(ncp, 1209 dvp, &dvlp2)) 1210 goto retry_dotdot; 1211 MPASS(dvp->v_cache_dd == NULL); 1212 mtx_unlock(dvlp); 1213 if (dvlp2 != NULL) 1214 mtx_unlock(dvlp2); 1215 cache_free(ncp); 1216 } else { 1217 dvp->v_cache_dd = NULL; 1218 mtx_unlock(dvlp); 1219 if (dvlp2 != NULL) 1220 mtx_unlock(dvlp2); 1221 } 1222 return (0); 1223 } 1224 1225 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1226 blp = HASH2BUCKETLOCK(hash); 1227retry: 1228 if (LIST_EMPTY(NCHHASH(hash))) 1229 goto out_no_entry; 1230 1231 rw_wlock(blp); 1232 1233 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1234 counter_u64_add(numchecks, 1); 1235 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1236 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1237 break; 1238 } 1239 1240 /* We failed to find an entry */ 1241 if (ncp == NULL) { 1242 rw_wunlock(blp); 1243 goto out_no_entry; 1244 } 1245 1246 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1247 if (__predict_false(error != 0)) { 1248 zap_and_exit_bucket_fail++; 1249 cache_maybe_yield(); 1250 goto retry; 1251 } 1252 counter_u64_add(numposzaps, 1); 1253 cache_free(ncp); 1254 return (0); 1255out_no_entry: 1256 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1257 counter_u64_add(nummisszap, 1); 1258 return (0); 1259} 1260 1261/** 1262 * Lookup a name in the name cache 1263 * 1264 * # Arguments 1265 * 1266 * - dvp: Parent directory in which to search. 1267 * - vpp: Return argument. Will contain desired vnode on cache hit. 1268 * - cnp: Parameters of the name search. The most interesting bits of 1269 * the cn_flags field have the following meanings: 1270 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1271 * it up. 1272 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1273 * - tsp: Return storage for cache timestamp. On a successful (positive 1274 * or negative) lookup, tsp will be filled with any timespec that 1275 * was stored when this cache entry was created. However, it will 1276 * be clear for "." entries. 1277 * - ticks: Return storage for alternate cache timestamp. On a successful 1278 * (positive or negative) lookup, it will contain the ticks value 1279 * that was current when the cache entry was created, unless cnp 1280 * was ".". 1281 * 1282 * # Returns 1283 * 1284 * - -1: A positive cache hit. vpp will contain the desired vnode. 1285 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1286 * to a forced unmount. vpp will not be modified. If the entry 1287 * is a whiteout, then the ISWHITEOUT flag will be set in 1288 * cnp->cn_flags. 1289 * - 0: A cache miss. vpp will not be modified. 1290 * 1291 * # Locking 1292 * 1293 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1294 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1295 * lock is not recursively acquired. 1296 */ 1297int 1298cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1299 struct timespec *tsp, int *ticksp) 1300{ 1301 struct namecache_ts *ncp_ts; 1302 struct namecache *ncp; 1303 struct rwlock *blp; 1304 struct mtx *dvlp; 1305 uint32_t hash; 1306 int error, ltype; 1307 1308 if (__predict_false(!doingcache)) { 1309 cnp->cn_flags &= ~MAKEENTRY; 1310 return (0); 1311 } 1312 1313 counter_u64_add(numcalls, 1); 1314 1315 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1316 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1317 1318 if ((cnp->cn_flags & MAKEENTRY) == 0) 1319 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1320 1321retry: 1322 blp = NULL; 1323 dvlp = NULL; 1324 error = 0; 1325 if (cnp->cn_namelen == 2 && 1326 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1327 counter_u64_add(dotdothits, 1); 1328 dvlp = VP2VNODELOCK(dvp); 1329 mtx_lock(dvlp); 1330 ncp = dvp->v_cache_dd; 1331 if (ncp == NULL) { 1332 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1333 "..", NULL); 1334 mtx_unlock(dvlp); 1335 return (0); 1336 } 1337 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1338 if (ncp->nc_flag & NCF_NEGATIVE) 1339 *vpp = NULL; 1340 else 1341 *vpp = ncp->nc_vp; 1342 } else 1343 *vpp = ncp->nc_dvp; 1344 /* Return failure if negative entry was found. */ 1345 if (*vpp == NULL) 1346 goto negative_success; 1347 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1348 dvp, cnp->cn_nameptr, *vpp); 1349 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1350 *vpp); 1351 cache_out_ts(ncp, tsp, ticksp); 1352 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1353 NCF_DTS && tsp != NULL) { 1354 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1355 *tsp = ncp_ts->nc_dotdottime; 1356 } 1357 goto success; 1358 } 1359 1360 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1361 blp = HASH2BUCKETLOCK(hash); 1362 rw_rlock(blp); 1363 1364 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1365 counter_u64_add(numchecks, 1); 1366 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1367 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1368 break; 1369 } 1370 1371 /* We failed to find an entry */ 1372 if (__predict_false(ncp == NULL)) { 1373 rw_runlock(blp); 1374 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1375 NULL); 1376 counter_u64_add(nummiss, 1); 1377 return (0); 1378 } 1379 1380 if (ncp->nc_flag & NCF_NEGATIVE) 1381 goto negative_success; 1382 1383 /* We found a "positive" match, return the vnode */ 1384 counter_u64_add(numposhits, 1); 1385 *vpp = ncp->nc_vp; 1386 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1387 dvp, cnp->cn_nameptr, *vpp, ncp); 1388 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1389 *vpp); 1390 cache_out_ts(ncp, tsp, ticksp); 1391success: 1392 /* 1393 * On success we return a locked and ref'd vnode as per the lookup 1394 * protocol. 1395 */ 1396 MPASS(dvp != *vpp); 1397 ltype = 0; /* silence gcc warning */ 1398 if (cnp->cn_flags & ISDOTDOT) { 1399 ltype = VOP_ISLOCKED(dvp); 1400 VOP_UNLOCK(dvp, 0); 1401 } 1402 vhold(*vpp); 1403 cache_lookup_unlock(blp, dvlp); 1404 error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); 1405 if (cnp->cn_flags & ISDOTDOT) { 1406 vn_lock(dvp, ltype | LK_RETRY); 1407 if (dvp->v_iflag & VI_DOOMED) { 1408 if (error == 0) 1409 vput(*vpp); 1410 *vpp = NULL; 1411 return (ENOENT); 1412 } 1413 } 1414 if (error) { 1415 *vpp = NULL; 1416 goto retry; 1417 } 1418 if ((cnp->cn_flags & ISLASTCN) && 1419 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1420 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1421 } 1422 return (-1); 1423 1424negative_success: 1425 /* We found a negative match, and want to create it, so purge */ 1426 if (cnp->cn_nameiop == CREATE) { 1427 counter_u64_add(numnegzaps, 1); 1428 goto zap_and_exit; 1429 } 1430 1431 counter_u64_add(numneghits, 1); 1432 cache_negative_hit(ncp); 1433 if (ncp->nc_flag & NCF_WHITE) 1434 cnp->cn_flags |= ISWHITEOUT; 1435 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 1436 ncp->nc_name); 1437 cache_out_ts(ncp, tsp, ticksp); 1438 cache_lookup_unlock(blp, dvlp); 1439 return (ENOENT); 1440 1441zap_and_exit: 1442 if (blp != NULL) 1443 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1444 else 1445 error = cache_zap_locked_vnode(ncp, dvp); 1446 if (__predict_false(error != 0)) { 1447 zap_and_exit_bucket_fail2++; 1448 cache_maybe_yield(); 1449 goto retry; 1450 } 1451 cache_free(ncp); 1452 return (0); 1453} 1454 1455struct celockstate { 1456 struct mtx *vlp[3]; 1457 struct rwlock *blp[2]; 1458}; 1459CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1460CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1461 1462static inline void 1463cache_celockstate_init(struct celockstate *cel) 1464{ 1465 1466 bzero(cel, sizeof(*cel)); 1467} 1468 1469static void 1470cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1471 struct vnode *dvp) 1472{ 1473 struct mtx *vlp1, *vlp2; 1474 1475 MPASS(cel->vlp[0] == NULL); 1476 MPASS(cel->vlp[1] == NULL); 1477 MPASS(cel->vlp[2] == NULL); 1478 1479 MPASS(vp != NULL || dvp != NULL); 1480 1481 vlp1 = VP2VNODELOCK(vp); 1482 vlp2 = VP2VNODELOCK(dvp); 1483 cache_sort_vnodes(&vlp1, &vlp2); 1484 1485 if (vlp1 != NULL) { 1486 mtx_lock(vlp1); 1487 cel->vlp[0] = vlp1; 1488 } 1489 mtx_lock(vlp2); 1490 cel->vlp[1] = vlp2; 1491} 1492 1493static void 1494cache_unlock_vnodes_cel(struct celockstate *cel) 1495{ 1496 1497 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1498 1499 if (cel->vlp[0] != NULL) 1500 mtx_unlock(cel->vlp[0]); 1501 if (cel->vlp[1] != NULL) 1502 mtx_unlock(cel->vlp[1]); 1503 if (cel->vlp[2] != NULL) 1504 mtx_unlock(cel->vlp[2]); 1505} 1506 1507static bool 1508cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1509{ 1510 struct mtx *vlp; 1511 bool ret; 1512 1513 cache_assert_vlp_locked(cel->vlp[0]); 1514 cache_assert_vlp_locked(cel->vlp[1]); 1515 MPASS(cel->vlp[2] == NULL); 1516 1517 MPASS(vp != NULL); 1518 vlp = VP2VNODELOCK(vp); 1519 1520 ret = true; 1521 if (vlp >= cel->vlp[1]) { 1522 mtx_lock(vlp); 1523 } else { 1524 if (mtx_trylock(vlp)) 1525 goto out; 1526 cache_lock_vnodes_cel_3_failures++; 1527 cache_unlock_vnodes_cel(cel); 1528 if (vlp < cel->vlp[0]) { 1529 mtx_lock(vlp); 1530 mtx_lock(cel->vlp[0]); 1531 mtx_lock(cel->vlp[1]); 1532 } else { 1533 if (cel->vlp[0] != NULL) 1534 mtx_lock(cel->vlp[0]); 1535 mtx_lock(vlp); 1536 mtx_lock(cel->vlp[1]); 1537 } 1538 ret = false; 1539 } 1540out: 1541 cel->vlp[2] = vlp; 1542 return (ret); 1543} 1544 1545static void 1546cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1547 struct rwlock *blp2) 1548{ 1549 1550 MPASS(cel->blp[0] == NULL); 1551 MPASS(cel->blp[1] == NULL); 1552 1553 cache_sort_vnodes(&blp1, &blp2); 1554 1555 if (blp1 != NULL) { 1556 rw_wlock(blp1); 1557 cel->blp[0] = blp1; 1558 } 1559 rw_wlock(blp2); 1560 cel->blp[1] = blp2; 1561} 1562 1563static void 1564cache_unlock_buckets_cel(struct celockstate *cel) 1565{ 1566 1567 if (cel->blp[0] != NULL) 1568 rw_wunlock(cel->blp[0]); 1569 rw_wunlock(cel->blp[1]); 1570} 1571 1572/* 1573 * Lock part of the cache affected by the insertion. 1574 * 1575 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1576 * However, insertion can result in removal of an old entry. In this 1577 * case we have an additional vnode and bucketlock pair to lock. If the 1578 * entry is negative, ncelock is locked instead of the vnode. 1579 * 1580 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1581 * preserving the locking order (smaller address first). 1582 */ 1583static void 1584cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1585 uint32_t hash) 1586{ 1587 struct namecache *ncp; 1588 struct rwlock *blps[2]; 1589 1590 blps[0] = HASH2BUCKETLOCK(hash); 1591 for (;;) { 1592 blps[1] = NULL; 1593 cache_lock_vnodes_cel(cel, dvp, vp); 1594 if (vp == NULL || vp->v_type != VDIR) 1595 break; 1596 ncp = vp->v_cache_dd; 1597 if (ncp == NULL) 1598 break; 1599 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1600 break; 1601 MPASS(ncp->nc_dvp == vp); 1602 blps[1] = NCP2BUCKETLOCK(ncp); 1603 if (ncp->nc_flag & NCF_NEGATIVE) 1604 break; 1605 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1606 break; 1607 /* 1608 * All vnodes got re-locked. Re-validate the state and if 1609 * nothing changed we are done. Otherwise restart. 1610 */ 1611 if (ncp == vp->v_cache_dd && 1612 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1613 blps[1] == NCP2BUCKETLOCK(ncp) && 1614 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1615 break; 1616 cache_unlock_vnodes_cel(cel); 1617 cel->vlp[0] = NULL; 1618 cel->vlp[1] = NULL; 1619 cel->vlp[2] = NULL; 1620 } 1621 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1622} 1623 1624static void 1625cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1626 uint32_t hash) 1627{ 1628 struct namecache *ncp; 1629 struct rwlock *blps[2]; 1630 1631 blps[0] = HASH2BUCKETLOCK(hash); 1632 for (;;) { 1633 blps[1] = NULL; 1634 cache_lock_vnodes_cel(cel, dvp, vp); 1635 ncp = dvp->v_cache_dd; 1636 if (ncp == NULL) 1637 break; 1638 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1639 break; 1640 MPASS(ncp->nc_dvp == dvp); 1641 blps[1] = NCP2BUCKETLOCK(ncp); 1642 if (ncp->nc_flag & NCF_NEGATIVE) 1643 break; 1644 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1645 break; 1646 if (ncp == dvp->v_cache_dd && 1647 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1648 blps[1] == NCP2BUCKETLOCK(ncp) && 1649 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1650 break; 1651 cache_unlock_vnodes_cel(cel); 1652 cel->vlp[0] = NULL; 1653 cel->vlp[1] = NULL; 1654 cel->vlp[2] = NULL; 1655 } 1656 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1657} 1658 1659static void 1660cache_enter_unlock(struct celockstate *cel) 1661{ 1662 1663 cache_unlock_buckets_cel(cel); 1664 cache_unlock_vnodes_cel(cel); 1665} 1666 1667static void __noinline 1668cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1669 struct componentname *cnp) 1670{ 1671 struct celockstate cel; 1672 struct namecache *ncp; 1673 uint32_t hash; 1674 int len; 1675 1676 if (dvp->v_cache_dd == NULL) 1677 return; 1678 len = cnp->cn_namelen; 1679 cache_celockstate_init(&cel); 1680 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1681 cache_enter_lock_dd(&cel, dvp, vp, hash); 1682 ncp = dvp->v_cache_dd; 1683 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1684 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1685 cache_zap_locked(ncp, false); 1686 } else { 1687 ncp = NULL; 1688 } 1689 dvp->v_cache_dd = NULL; 1690 cache_enter_unlock(&cel); 1691 cache_free(ncp); 1692} 1693 1694/* 1695 * Add an entry to the cache. 1696 */ 1697void 1698cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1699 struct timespec *tsp, struct timespec *dtsp) 1700{ 1701 struct celockstate cel; 1702 struct namecache *ncp, *n2, *ndd; 1703 struct namecache_ts *ncp_ts, *n2_ts; 1704 struct nchashhead *ncpp; 1705 uint32_t hash; 1706 int flag; 1707 int len; 1708 u_long lnumcache; 1709 1710 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1711 VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, 1712 ("cache_enter: Adding a doomed vnode")); 1713 VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, 1714 ("cache_enter: Doomed vnode used as src")); 1715 1716 if (__predict_false(!doingcache)) 1717 return; 1718 1719 flag = 0; 1720 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1721 if (cnp->cn_namelen == 1) 1722 return; 1723 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1724 cache_enter_dotdot_prep(dvp, vp, cnp); 1725 flag = NCF_ISDOTDOT; 1726 } 1727 } 1728 1729 /* 1730 * Avoid blowout in namecache entries. 1731 */ 1732 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1733 if (__predict_false(lnumcache >= ncsize)) { 1734 atomic_add_long(&numcache, -1); 1735 return; 1736 } 1737 1738 cache_celockstate_init(&cel); 1739 ndd = NULL; 1740 ncp_ts = NULL; 1741 1742 /* 1743 * Calculate the hash key and setup as much of the new 1744 * namecache entry as possible before acquiring the lock. 1745 */ 1746 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1747 ncp->nc_flag = flag; 1748 ncp->nc_vp = vp; 1749 if (vp == NULL) 1750 ncp->nc_flag |= NCF_NEGATIVE; 1751 ncp->nc_dvp = dvp; 1752 if (tsp != NULL) { 1753 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1754 ncp_ts->nc_time = *tsp; 1755 ncp_ts->nc_ticks = ticks; 1756 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1757 if (dtsp != NULL) { 1758 ncp_ts->nc_dotdottime = *dtsp; 1759 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1760 } 1761 } 1762 len = ncp->nc_nlen = cnp->cn_namelen; 1763 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1764 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1765 cache_enter_lock(&cel, dvp, vp, hash); 1766 1767 /* 1768 * See if this vnode or negative entry is already in the cache 1769 * with this name. This can happen with concurrent lookups of 1770 * the same path name. 1771 */ 1772 ncpp = NCHHASH(hash); 1773 LIST_FOREACH(n2, ncpp, nc_hash) { 1774 if (n2->nc_dvp == dvp && 1775 n2->nc_nlen == cnp->cn_namelen && 1776 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1777 if (tsp != NULL) { 1778 KASSERT((n2->nc_flag & NCF_TS) != 0, 1779 ("no NCF_TS")); 1780 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1781 n2_ts->nc_time = ncp_ts->nc_time; 1782 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1783 if (dtsp != NULL) { 1784 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1785 if (ncp->nc_flag & NCF_NEGATIVE) 1786 mtx_lock(&ncneg_hot.nl_lock); 1787 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1788 if (ncp->nc_flag & NCF_NEGATIVE) 1789 mtx_unlock(&ncneg_hot.nl_lock); 1790 } 1791 } 1792 goto out_unlock_free; 1793 } 1794 } 1795 1796 if (flag == NCF_ISDOTDOT) { 1797 /* 1798 * See if we are trying to add .. entry, but some other lookup 1799 * has populated v_cache_dd pointer already. 1800 */ 1801 if (dvp->v_cache_dd != NULL) 1802 goto out_unlock_free; 1803 KASSERT(vp == NULL || vp->v_type == VDIR, 1804 ("wrong vnode type %p", vp)); 1805 dvp->v_cache_dd = ncp; 1806 } 1807 1808 if (vp != NULL) { 1809 if (vp->v_type == VDIR) { 1810 if (flag != NCF_ISDOTDOT) { 1811 /* 1812 * For this case, the cache entry maps both the 1813 * directory name in it and the name ".." for the 1814 * directory's parent. 1815 */ 1816 if ((ndd = vp->v_cache_dd) != NULL) { 1817 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1818 cache_zap_locked(ndd, false); 1819 else 1820 ndd = NULL; 1821 } 1822 vp->v_cache_dd = ncp; 1823 } 1824 } else { 1825 vp->v_cache_dd = NULL; 1826 } 1827 } 1828 1829 if (flag != NCF_ISDOTDOT) { 1830 if (LIST_EMPTY(&dvp->v_cache_src)) { 1831 vhold(dvp); 1832 counter_u64_add(numcachehv, 1); 1833 } 1834 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1835 } 1836 1837 /* 1838 * Insert the new namecache entry into the appropriate chain 1839 * within the cache entries table. 1840 */ 1841 LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1842 1843 /* 1844 * If the entry is "negative", we place it into the 1845 * "negative" cache queue, otherwise, we place it into the 1846 * destination vnode's cache entries queue. 1847 */ 1848 if (vp != NULL) { 1849 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1850 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1851 vp); 1852 } else { 1853 if (cnp->cn_flags & ISWHITEOUT) 1854 ncp->nc_flag |= NCF_WHITE; 1855 cache_negative_insert(ncp, false); 1856 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1857 ncp->nc_name); 1858 } 1859 cache_enter_unlock(&cel); 1860 if (numneg * ncnegfactor > lnumcache) 1861 cache_negative_zap_one(); 1862 cache_free(ndd); 1863 return; 1864out_unlock_free: 1865 cache_enter_unlock(&cel); 1866 atomic_add_long(&numcache, -1); 1867 cache_free(ncp); 1868 return; 1869} 1870 1871static u_int 1872cache_roundup_2(u_int val) 1873{ 1874 u_int res; 1875 1876 for (res = 1; res <= val; res <<= 1) 1877 continue; 1878 1879 return (res); 1880} 1881 1882/* 1883 * Name cache initialization, from vfs_init() when we are booting 1884 */ 1885static void 1886nchinit(void *dummy __unused) 1887{ 1888 u_int i; 1889 1890 cache_zone_small = uma_zcreate("S VFS Cache", 1891 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1892 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 1893 UMA_ZONE_ZINIT); 1894 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1895 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1896 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 1897 UMA_ZONE_ZINIT); 1898 cache_zone_large = uma_zcreate("L VFS Cache", 1899 sizeof(struct namecache) + NAME_MAX + 1, 1900 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 1901 UMA_ZONE_ZINIT); 1902 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1903 sizeof(struct namecache_ts) + NAME_MAX + 1, 1904 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 1905 UMA_ZONE_ZINIT); 1906 1907 ncsize = desiredvnodes * ncsizefactor; 1908 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 1909 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 1910 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 1911 ncbuckethash = 7; 1912 if (ncbuckethash > nchash) 1913 ncbuckethash = nchash; 1914 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 1915 M_WAITOK | M_ZERO); 1916 for (i = 0; i < numbucketlocks; i++) 1917 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 1918 ncvnodehash = ncbuckethash; 1919 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 1920 M_WAITOK | M_ZERO); 1921 for (i = 0; i < numvnodelocks; i++) 1922 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 1923 ncpurgeminvnodes = numbucketlocks * 2; 1924 1925 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 1926 M_WAITOK | M_ZERO); 1927 for (i = 0; i < numneglists; i++) { 1928 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 1929 TAILQ_INIT(&neglists[i].nl_list); 1930 } 1931 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 1932 TAILQ_INIT(&ncneg_hot.nl_list); 1933 1934 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 1935 1936 numcachehv = counter_u64_alloc(M_WAITOK); 1937 numcalls = counter_u64_alloc(M_WAITOK); 1938 dothits = counter_u64_alloc(M_WAITOK); 1939 dotdothits = counter_u64_alloc(M_WAITOK); 1940 numchecks = counter_u64_alloc(M_WAITOK); 1941 nummiss = counter_u64_alloc(M_WAITOK); 1942 nummisszap = counter_u64_alloc(M_WAITOK); 1943 numposzaps = counter_u64_alloc(M_WAITOK); 1944 numposhits = counter_u64_alloc(M_WAITOK); 1945 numnegzaps = counter_u64_alloc(M_WAITOK); 1946 numneghits = counter_u64_alloc(M_WAITOK); 1947 numfullpathcalls = counter_u64_alloc(M_WAITOK); 1948 numfullpathfail1 = counter_u64_alloc(M_WAITOK); 1949 numfullpathfail2 = counter_u64_alloc(M_WAITOK); 1950 numfullpathfail4 = counter_u64_alloc(M_WAITOK); 1951 numfullpathfound = counter_u64_alloc(M_WAITOK); 1952 zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK); 1953 numneg_evicted = counter_u64_alloc(M_WAITOK); 1954 shrinking_skipped = counter_u64_alloc(M_WAITOK); 1955} 1956SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 1957 1958void 1959cache_changesize(int newmaxvnodes) 1960{ 1961 struct nchashhead *new_nchashtbl, *old_nchashtbl; 1962 u_long new_nchash, old_nchash; 1963 struct namecache *ncp; 1964 uint32_t hash; 1965 int newncsize; 1966 int i; 1967 1968 newncsize = newmaxvnodes * ncsizefactor; 1969 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 1970 if (newmaxvnodes < numbucketlocks) 1971 newmaxvnodes = numbucketlocks; 1972 1973 new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); 1974 /* If same hash table size, nothing to do */ 1975 if (nchash == new_nchash) { 1976 free(new_nchashtbl, M_VFSCACHE); 1977 return; 1978 } 1979 /* 1980 * Move everything from the old hash table to the new table. 1981 * None of the namecache entries in the table can be removed 1982 * because to do so, they have to be removed from the hash table. 1983 */ 1984 cache_lock_all_vnodes(); 1985 cache_lock_all_buckets(); 1986 old_nchashtbl = nchashtbl; 1987 old_nchash = nchash; 1988 nchashtbl = new_nchashtbl; 1989 nchash = new_nchash; 1990 for (i = 0; i <= old_nchash; i++) { 1991 while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { 1992 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 1993 ncp->nc_dvp); 1994 LIST_REMOVE(ncp, nc_hash); 1995 LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 1996 } 1997 } 1998 ncsize = newncsize; 1999 cache_unlock_all_buckets(); 2000 cache_unlock_all_vnodes(); 2001 free(old_nchashtbl, M_VFSCACHE); 2002} 2003 2004/* 2005 * Invalidate all entries from and to a particular vnode. 2006 */ 2007void 2008cache_purge(struct vnode *vp) 2009{ 2010 TAILQ_HEAD(, namecache) ncps; 2011 struct namecache *ncp, *nnp; 2012 struct mtx *vlp, *vlp2; 2013 2014 CTR1(KTR_VFS, "cache_purge(%p)", vp); 2015 SDT_PROBE1(vfs, namecache, purge, done, vp); 2016 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2017 vp->v_cache_dd == NULL) 2018 return; 2019 TAILQ_INIT(&ncps); 2020 vlp = VP2VNODELOCK(vp); 2021 vlp2 = NULL; 2022 mtx_lock(vlp); 2023retry: 2024 while (!LIST_EMPTY(&vp->v_cache_src)) { 2025 ncp = LIST_FIRST(&vp->v_cache_src); 2026 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2027 goto retry; 2028 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2029 } 2030 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2031 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2032 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2033 goto retry; 2034 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2035 } 2036 ncp = vp->v_cache_dd; 2037 if (ncp != NULL) { 2038 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2039 ("lost dotdot link")); 2040 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2041 goto retry; 2042 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2043 } 2044 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2045 mtx_unlock(vlp); 2046 if (vlp2 != NULL) 2047 mtx_unlock(vlp2); 2048 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2049 cache_free(ncp); 2050 } 2051} 2052 2053/* 2054 * Invalidate all negative entries for a particular directory vnode. 2055 */ 2056void 2057cache_purge_negative(struct vnode *vp) 2058{ 2059 TAILQ_HEAD(, namecache) ncps; 2060 struct namecache *ncp, *nnp; 2061 struct mtx *vlp; 2062 2063 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2064 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2065 if (LIST_EMPTY(&vp->v_cache_src)) 2066 return; 2067 TAILQ_INIT(&ncps); 2068 vlp = VP2VNODELOCK(vp); 2069 mtx_lock(vlp); 2070 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2071 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2072 continue; 2073 cache_zap_negative_locked_vnode_kl(ncp, vp); 2074 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2075 } 2076 mtx_unlock(vlp); 2077 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2078 cache_free(ncp); 2079 } 2080} 2081 2082/* 2083 * Flush all entries referencing a particular filesystem. 2084 */ 2085void 2086cache_purgevfs(struct mount *mp, bool force) 2087{ 2088 TAILQ_HEAD(, namecache) ncps; 2089 struct mtx *vlp1, *vlp2; 2090 struct rwlock *blp; 2091 struct nchashhead *bucket; 2092 struct namecache *ncp, *nnp; 2093 u_long i, j, n_nchash; 2094 int error; 2095 2096 /* Scan hash tables for applicable entries */ 2097 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2098 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2099 return; 2100 TAILQ_INIT(&ncps); 2101 n_nchash = nchash + 1; 2102 vlp1 = vlp2 = NULL; 2103 for (i = 0; i < numbucketlocks; i++) { 2104 blp = (struct rwlock *)&bucketlocks[i]; 2105 rw_wlock(blp); 2106 for (j = i; j < n_nchash; j += numbucketlocks) { 2107retry: 2108 bucket = &nchashtbl[j]; 2109 LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2110 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2111 if (ncp->nc_dvp->v_mount != mp) 2112 continue; 2113 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2114 &vlp1, &vlp2); 2115 if (error != 0) 2116 goto retry; 2117 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2118 } 2119 } 2120 rw_wunlock(blp); 2121 if (vlp1 == NULL && vlp2 == NULL) 2122 cache_maybe_yield(); 2123 } 2124 if (vlp1 != NULL) 2125 mtx_unlock(vlp1); 2126 if (vlp2 != NULL) 2127 mtx_unlock(vlp2); 2128 2129 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2130 cache_free(ncp); 2131 } 2132} 2133 2134/* 2135 * Perform canonical checks and cache lookup and pass on to filesystem 2136 * through the vop_cachedlookup only if needed. 2137 */ 2138 2139int 2140vfs_cache_lookup(struct vop_lookup_args *ap) 2141{ 2142 struct vnode *dvp; 2143 int error; 2144 struct vnode **vpp = ap->a_vpp; 2145 struct componentname *cnp = ap->a_cnp; 2146 int flags = cnp->cn_flags; 2147 2148 *vpp = NULL; 2149 dvp = ap->a_dvp; 2150 2151 if (dvp->v_type != VDIR) 2152 return (ENOTDIR); 2153 2154 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2155 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2156 return (EROFS); 2157 2158 error = vn_dir_check_exec(dvp, cnp); 2159 if (error != 0) 2160 return (error); 2161 2162 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2163 if (error == 0) 2164 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2165 if (error == -1) 2166 return (0); 2167 return (error); 2168} 2169 2170/* 2171 * XXX All of these sysctls would probably be more productive dead. 2172 */ 2173static int __read_mostly disablecwd; 2174SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 2175 "Disable the getcwd syscall"); 2176 2177/* Implementation of the getcwd syscall. */ 2178int 2179sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2180{ 2181 2182 return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, 2183 MAXPATHLEN)); 2184} 2185 2186int 2187kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen, 2188 size_t path_max) 2189{ 2190 char *bp, *tmpbuf; 2191 struct filedesc *fdp; 2192 struct vnode *cdir, *rdir; 2193 int error; 2194 2195 if (__predict_false(disablecwd)) 2196 return (ENODEV); 2197 if (__predict_false(buflen < 2)) 2198 return (EINVAL); 2199 if (buflen > path_max) 2200 buflen = path_max; 2201 2202 tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); 2203 fdp = td->td_proc->p_fd; 2204 FILEDESC_SLOCK(fdp); 2205 cdir = fdp->fd_cdir; 2206 vrefact(cdir); 2207 rdir = fdp->fd_rdir; 2208 vrefact(rdir); 2209 FILEDESC_SUNLOCK(fdp); 2210 error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); 2211 vrele(rdir); 2212 vrele(cdir); 2213 2214 if (!error) { 2215 if (bufseg == UIO_SYSSPACE) 2216 bcopy(bp, buf, strlen(bp) + 1); 2217 else 2218 error = copyout(bp, buf, strlen(bp) + 1); 2219#ifdef KTRACE 2220 if (KTRPOINT(curthread, KTR_NAMEI)) 2221 ktrnamei(bp); 2222#endif 2223 } 2224 free(tmpbuf, M_TEMP); 2225 return (error); 2226} 2227 2228/* 2229 * Thus begins the fullpath magic. 2230 */ 2231 2232static int __read_mostly disablefullpath; 2233SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, 2234 "Disable the vn_fullpath function"); 2235 2236/* 2237 * Retrieve the full filesystem path that correspond to a vnode from the name 2238 * cache (if available) 2239 */ 2240int 2241vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2242{ 2243 char *buf; 2244 struct filedesc *fdp; 2245 struct vnode *rdir; 2246 int error; 2247 2248 if (__predict_false(disablefullpath)) 2249 return (ENODEV); 2250 if (__predict_false(vn == NULL)) 2251 return (EINVAL); 2252 2253 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 2254 fdp = td->td_proc->p_fd; 2255 FILEDESC_SLOCK(fdp); 2256 rdir = fdp->fd_rdir; 2257 vrefact(rdir); 2258 FILEDESC_SUNLOCK(fdp); 2259 error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); 2260 vrele(rdir); 2261 2262 if (!error) 2263 *freebuf = buf; 2264 else 2265 free(buf, M_TEMP); 2266 return (error); 2267} 2268 2269/* 2270 * This function is similar to vn_fullpath, but it attempts to lookup the 2271 * pathname relative to the global root mount point. This is required for the 2272 * auditing sub-system, as audited pathnames must be absolute, relative to the 2273 * global root mount point. 2274 */ 2275int 2276vn_fullpath_global(struct thread *td, struct vnode *vn, 2277 char **retbuf, char **freebuf) 2278{ 2279 char *buf; 2280 int error; 2281 2282 if (__predict_false(disablefullpath)) 2283 return (ENODEV); 2284 if (__predict_false(vn == NULL)) 2285 return (EINVAL); 2286 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 2287 error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); 2288 if (!error) 2289 *freebuf = buf; 2290 else 2291 free(buf, M_TEMP); 2292 return (error); 2293} 2294 2295int 2296vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) 2297{ 2298 struct vnode *dvp; 2299 struct namecache *ncp; 2300 struct mtx *vlp; 2301 int error; 2302 2303 vlp = VP2VNODELOCK(*vp); 2304 mtx_lock(vlp); 2305 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2306 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2307 break; 2308 } 2309 if (ncp != NULL) { 2310 if (*buflen < ncp->nc_nlen) { 2311 mtx_unlock(vlp); 2312 vrele(*vp); 2313 counter_u64_add(numfullpathfail4, 1); 2314 error = ENOMEM; 2315 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2316 vp, NULL); 2317 return (error); 2318 } 2319 *buflen -= ncp->nc_nlen; 2320 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2321 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2322 ncp->nc_name, vp); 2323 dvp = *vp; 2324 *vp = ncp->nc_dvp; 2325 vref(*vp); 2326 mtx_unlock(vlp); 2327 vrele(dvp); 2328 return (0); 2329 } 2330 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2331 2332 mtx_unlock(vlp); 2333 vn_lock(*vp, LK_SHARED | LK_RETRY); 2334 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2335 vput(*vp); 2336 if (error) { 2337 counter_u64_add(numfullpathfail2, 1); 2338 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2339 return (error); 2340 } 2341 2342 *vp = dvp; 2343 if (dvp->v_iflag & VI_DOOMED) { 2344 /* forced unmount */ 2345 vrele(dvp); 2346 error = ENOENT; 2347 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2348 return (error); 2349 } 2350 /* 2351 * *vp has its use count incremented still. 2352 */ 2353 2354 return (0); 2355} 2356 2357/* 2358 * The magic behind kern___getcwd() and vn_fullpath(). 2359 */ 2360static int 2361vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 2362 char *buf, char **retbuf, u_int buflen) 2363{ 2364 int error, slash_prefixed; 2365#ifdef KDTRACE_HOOKS 2366 struct vnode *startvp = vp; 2367#endif 2368 struct vnode *vp1; 2369 2370 buflen--; 2371 buf[buflen] = '\0'; 2372 error = 0; 2373 slash_prefixed = 0; 2374 2375 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2376 counter_u64_add(numfullpathcalls, 1); 2377 vref(vp); 2378 if (vp->v_type != VDIR) { 2379 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2380 if (error) 2381 return (error); 2382 if (buflen == 0) { 2383 vrele(vp); 2384 return (ENOMEM); 2385 } 2386 buf[--buflen] = '/'; 2387 slash_prefixed = 1; 2388 } 2389 while (vp != rdir && vp != rootvnode) { 2390 /* 2391 * The vp vnode must be already fully constructed, 2392 * since it is either found in namecache or obtained 2393 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2394 * without obtaining the vnode lock. 2395 */ 2396 if ((vp->v_vflag & VV_ROOT) != 0) { 2397 vn_lock(vp, LK_RETRY | LK_SHARED); 2398 2399 /* 2400 * With the vnode locked, check for races with 2401 * unmount, forced or not. Note that we 2402 * already verified that vp is not equal to 2403 * the root vnode, which means that 2404 * mnt_vnodecovered can be NULL only for the 2405 * case of unmount. 2406 */ 2407 if ((vp->v_iflag & VI_DOOMED) != 0 || 2408 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2409 vp1->v_mountedhere != vp->v_mount) { 2410 vput(vp); 2411 error = ENOENT; 2412 SDT_PROBE3(vfs, namecache, fullpath, return, 2413 error, vp, NULL); 2414 break; 2415 } 2416 2417 vref(vp1); 2418 vput(vp); 2419 vp = vp1; 2420 continue; 2421 } 2422 if (vp->v_type != VDIR) { 2423 vrele(vp); 2424 counter_u64_add(numfullpathfail1, 1); 2425 error = ENOTDIR; 2426 SDT_PROBE3(vfs, namecache, fullpath, return, 2427 error, vp, NULL); 2428 break; 2429 } 2430 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2431 if (error) 2432 break; 2433 if (buflen == 0) { 2434 vrele(vp); 2435 error = ENOMEM; 2436 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2437 startvp, NULL); 2438 break; 2439 } 2440 buf[--buflen] = '/'; 2441 slash_prefixed = 1; 2442 } 2443 if (error) 2444 return (error); 2445 if (!slash_prefixed) { 2446 if (buflen == 0) { 2447 vrele(vp); 2448 counter_u64_add(numfullpathfail4, 1); 2449 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2450 startvp, NULL); 2451 return (ENOMEM); 2452 } 2453 buf[--buflen] = '/'; 2454 } 2455 counter_u64_add(numfullpathfound, 1); 2456 vrele(vp); 2457 2458 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); 2459 *retbuf = buf + buflen; 2460 return (0); 2461} 2462 2463struct vnode * 2464vn_dir_dd_ino(struct vnode *vp) 2465{ 2466 struct namecache *ncp; 2467 struct vnode *ddvp; 2468 struct mtx *vlp; 2469 2470 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2471 vlp = VP2VNODELOCK(vp); 2472 mtx_lock(vlp); 2473 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2474 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2475 continue; 2476 ddvp = ncp->nc_dvp; 2477 vhold(ddvp); 2478 mtx_unlock(vlp); 2479 if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) 2480 return (NULL); 2481 return (ddvp); 2482 } 2483 mtx_unlock(vlp); 2484 return (NULL); 2485} 2486 2487int 2488vn_commname(struct vnode *vp, char *buf, u_int buflen) 2489{ 2490 struct namecache *ncp; 2491 struct mtx *vlp; 2492 int l; 2493 2494 vlp = VP2VNODELOCK(vp); 2495 mtx_lock(vlp); 2496 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2497 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2498 break; 2499 if (ncp == NULL) { 2500 mtx_unlock(vlp); 2501 return (ENOENT); 2502 } 2503 l = min(ncp->nc_nlen, buflen - 1); 2504 memcpy(buf, ncp->nc_name, l); 2505 mtx_unlock(vlp); 2506 buf[l] = '\0'; 2507 return (0); 2508} 2509 2510/* ABI compat shims for old kernel modules. */ 2511#undef cache_enter 2512 2513void cache_enter(struct vnode *dvp, struct vnode *vp, 2514 struct componentname *cnp); 2515 2516void 2517cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2518{ 2519 2520 cache_enter_time(dvp, vp, cnp, NULL, NULL); 2521} 2522 2523/* 2524 * This function updates path string to vnode's full global path 2525 * and checks the size of the new path string against the pathlen argument. 2526 * 2527 * Requires a locked, referenced vnode. 2528 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2529 * 2530 * If sysctl debug.disablefullpath is set, ENODEV is returned, 2531 * vnode is left locked and path remain untouched. 2532 * 2533 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2534 * because it falls back to the ".." lookup if the namecache lookup fails. 2535 */ 2536int 2537vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2538 u_int pathlen) 2539{ 2540 struct nameidata nd; 2541 struct vnode *vp1; 2542 char *rpath, *fbuf; 2543 int error; 2544 2545 ASSERT_VOP_ELOCKED(vp, __func__); 2546 2547 /* Return ENODEV if sysctl debug.disablefullpath==1 */ 2548 if (__predict_false(disablefullpath)) 2549 return (ENODEV); 2550 2551 /* Construct global filesystem path from vp. */ 2552 VOP_UNLOCK(vp, 0); 2553 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2554 2555 if (error != 0) { 2556 vrele(vp); 2557 return (error); 2558 } 2559 2560 if (strlen(rpath) >= pathlen) { 2561 vrele(vp); 2562 error = ENAMETOOLONG; 2563 goto out; 2564 } 2565 2566 /* 2567 * Re-lookup the vnode by path to detect a possible rename. 2568 * As a side effect, the vnode is relocked. 2569 * If vnode was renamed, return ENOENT. 2570 */ 2571 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2572 UIO_SYSSPACE, path, td); 2573 error = namei(&nd); 2574 if (error != 0) { 2575 vrele(vp); 2576 goto out; 2577 } 2578 NDFREE(&nd, NDF_ONLY_PNBUF); 2579 vp1 = nd.ni_vp; 2580 vrele(vp); 2581 if (vp1 == vp) 2582 strcpy(path, rpath); 2583 else { 2584 vput(vp1); 2585 error = ENOENT; 2586 } 2587 2588out: 2589 free(fbuf, M_TEMP); 2590 return (error); 2591} 2592 2593#ifdef DDB 2594static void 2595db_print_vpath(struct vnode *vp) 2596{ 2597 2598 while (vp != NULL) { 2599 db_printf("%p: ", vp); 2600 if (vp == rootvnode) { 2601 db_printf("/"); 2602 vp = NULL; 2603 } else { 2604 if (vp->v_vflag & VV_ROOT) { 2605 db_printf("<mount point>"); 2606 vp = vp->v_mount->mnt_vnodecovered; 2607 } else { 2608 struct namecache *ncp; 2609 char *ncn; 2610 int i; 2611 2612 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2613 if (ncp != NULL) { 2614 ncn = ncp->nc_name; 2615 for (i = 0; i < ncp->nc_nlen; i++) 2616 db_printf("%c", *ncn++); 2617 vp = ncp->nc_dvp; 2618 } else { 2619 vp = NULL; 2620 } 2621 } 2622 } 2623 db_printf("\n"); 2624 } 2625 2626 return; 2627} 2628 2629DB_SHOW_COMMAND(vpath, db_show_vpath) 2630{ 2631 struct vnode *vp; 2632 2633 if (!have_addr) { 2634 db_printf("usage: show vpath <struct vnode *>\n"); 2635 return; 2636 } 2637 2638 vp = (struct vnode *)addr; 2639 db_print_vpath(vp); 2640} 2641 2642#endif 2643