Diff of head/sys/kern/vfs_cache.c between r190942 (kib) and r190945 (kan). The changes are the $FreeBSD$ id line and a single hunk in cache_enter(); the rest of the file is shown once as common context.
1/*-
2 * Copyright (c) 1989, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Poul-Henning Kamp of the FreeBSD Project.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/kern/vfs_cache.c 190942 2009-04-11 16:12:20Z kib $");
36__FBSDID("$FreeBSD: head/sys/kern/vfs_cache.c 190945 2009-04-11 20:23:08Z kan $");
37
38#include "opt_kdtrace.h"
39#include "opt_ktrace.h"
40
41#include <sys/param.h>
42#include <sys/filedesc.h>
43#include <sys/fnv_hash.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/mount.h>
48#include <sys/namei.h>
49#include <sys/proc.h>
50#include <sys/rwlock.h>
51#include <sys/sdt.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysctl.h>
54#include <sys/sysproto.h>
55#include <sys/systm.h>
56#include <sys/vnode.h>
57#ifdef KTRACE
58#include <sys/ktrace.h>
59#endif
60
61#include <vm/uma.h>
62
63SDT_PROVIDER_DECLARE(vfs);
64SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
65 "struct vnode *");
66SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
67 "char *");
68SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
69SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
70 "struct char *", "struct vnode *");
71SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
72SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *",
73 "struct char *");
74SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
75 "struct vnode *");
76SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, "struct vnode *",
77 "char *");
78SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
79 "char *");
80SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
81SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
82SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
83SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
84 "struct vnode *");
85SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
86 "char *");
87
88/*
89 * This structure describes the elements in the cache of recent
90 * names looked up by namei.
91 */
92
93struct namecache {
94 LIST_ENTRY(namecache) nc_hash; /* hash chain */
95 LIST_ENTRY(namecache) nc_src; /* source vnode list */
96 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
97 struct vnode *nc_dvp; /* vnode of parent of name */
98 struct vnode *nc_vp; /* vnode the name refers to */
99 u_char nc_flag; /* flag bits */
100 u_char nc_nlen; /* length of name */
101 char nc_name[0]; /* segment name + nul */
102};
103
104/*
105 * Name caching works as follows:
106 *
107 * Names found by directory scans are retained in a cache
108 * for future reference. It is managed LRU, so frequently
109 * used names will hang around. Cache is indexed by hash value
110 * obtained from (vp, name) where vp refers to the directory
111 * containing name.
112 *
113 * If it is a "negative" entry, (i.e. for a name that is known NOT to
114 * exist) the vnode pointer will be NULL.
115 *
116 * Upon reaching the last segment of a path, if the reference
117 * is for DELETE, or NOCACHE is set (rewrite), and the
118 * name is located in the cache, it will be dropped.
119 */
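/*
 * [Editor's sketch, not part of either revision: how the (directory
 * vnode, name) pair is turned into a hash chain.  This mirrors the
 * fnv_32_buf() calls in cache_lookup() and cache_enter() below;
 * NCHHASH() is defined just after this comment.]
 */
#if 0
	u_int32_t hash;

	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);	/* mix in the parent */
	ncpp = NCHHASH(hash);		/* bucket: &nchashtbl[hash & nchash] */
#endif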
120
121/*
 122 * Structures associated with name caching.
123 */
124#define NCHHASH(hash) \
125 (&nchashtbl[(hash) & nchash])
126static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
 127static TAILQ_HEAD(, namecache) ncneg; /* LRU list of negative entries */
128static u_long nchash; /* size of hash table */
129SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
130static u_long ncnegfactor = 16; /* ratio of negative entries */
131SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
 132static u_long numneg; /* number of negative entries allocated */
133SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
134static u_long numcache; /* number of cache entries allocated */
135SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
136static u_long numcachehv; /* number of cache entries with vnodes held */
137SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
138#if 0
139static u_long numcachepl; /* number of cache purge for leaf entries */
140SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
141#endif
142struct nchstats nchstats; /* cache effectiveness statistics */
143
144static struct rwlock cache_lock;
145RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
146
147#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
148#define CACHE_RLOCK() rw_rlock(&cache_lock)
149#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
150#define CACHE_WLOCK() rw_wlock(&cache_lock)
151#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
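/*
 * [Editor's sketch: the read-mostly pattern these macros support, as
 * used by cache_lookup() below.  A reader takes the lock shared and,
 * when it must modify, first tries a cheap in-place upgrade before
 * falling back to relocking exclusive and revalidating.]
 */
#if 0
	CACHE_RLOCK();
	/* ... read-only probing of the hash chains ... */
	if (!CACHE_UPGRADE_LOCK()) {
		CACHE_RUNLOCK();	/* upgrade failed; relock exclusive */
		CACHE_WLOCK();
		/* ... revalidate and retry, as cache_lookup() does ... */
	}
	cache_zap(ncp);			/* modifications need the write lock */
	CACHE_WUNLOCK();
#endif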
152
153/*
154 * UMA zones for the VFS cache.
155 *
156 * The small cache is used for entries with short names, which are the
157 * most common. The large cache is used for entries which are too big to
158 * fit in the small cache.
159 */
160static uma_zone_t cache_zone_small;
161static uma_zone_t cache_zone_large;
162
163#define CACHE_PATH_CUTOFF 35
164#define CACHE_ZONE_SMALL (sizeof(struct namecache) + CACHE_PATH_CUTOFF \
165 + 1)
166#define CACHE_ZONE_LARGE (sizeof(struct namecache) + NAME_MAX + 1)
167
168#define cache_alloc(len) uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
169 cache_zone_small : cache_zone_large, M_WAITOK)
170#define cache_free(ncp) do { \
171 if (ncp != NULL) \
172 uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
173 cache_zone_small : cache_zone_large, (ncp)); \
174} while (0)
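/*
 * [Editor's illustration: nc_name[] is a flexible array member, so an
 * entry is sized as header + name + nul byte.  With CACHE_PATH_CUTOFF
 * at 35, a short name such as "kernel" is carved from cache_zone_small;
 * anything longer comes from cache_zone_large, sized for NAME_MAX.]
 */
#if 0
	ncp = cache_alloc(cnp->cn_namelen);	/* zone picked by length */
	cache_free(ncp);			/* freed into the same zone */
#endif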
175
176static int doingcache = 1; /* 1 => enable the cache */
177SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
178
179/* Export size information to userland */
180SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
181 sizeof(struct namecache), "");
182
183/*
184 * The new name cache statistics
185 */
186static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
187#define STATNODE(mode, name, var) \
188 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
189STATNODE(CTLFLAG_RD, numneg, &numneg);
190STATNODE(CTLFLAG_RD, numcache, &numcache);
191static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
192static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
193static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
194static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
195static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
196static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
197static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
198static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
199static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
200static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
201static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades);
202
203SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
204 &nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
205
206
207
208static void cache_zap(struct namecache *ncp);
209static int vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen);
210static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
211 char *buf, char **retbuf, u_int buflen);
212
213static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
214
215/*
216 * Flags in namecache.nc_flag
217 */
218#define NCF_WHITE 0x01
219#define NCF_ISDOTDOT 0x02
220
221#ifdef DIAGNOSTIC
222/*
223 * Grab an atomic snapshot of the name cache hash chain lengths
224 */
225SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
226
227static int
228sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
229{
230 int error;
231 struct nchashhead *ncpp;
232 struct namecache *ncp;
233 int n_nchash;
234 int count;
235
236 n_nchash = nchash + 1; /* nchash is max index, not count */
237 if (!req->oldptr)
238 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
239
240 /* Scan hash tables for applicable entries */
241 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
242 CACHE_RLOCK();
243 count = 0;
244 LIST_FOREACH(ncp, ncpp, nc_hash) {
245 count++;
246 }
247 CACHE_RUNLOCK();
248 error = SYSCTL_OUT(req, &count, sizeof(count));
249 if (error)
250 return (error);
251 }
252 return (0);
253}
254SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
255 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
256 "nchash chain lengths");
257
258static int
259sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
260{
261 int error;
262 struct nchashhead *ncpp;
263 struct namecache *ncp;
264 int n_nchash;
265 int count, maxlength, used, pct;
266
267 if (!req->oldptr)
268 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
269
270 n_nchash = nchash + 1; /* nchash is max index, not count */
271 used = 0;
272 maxlength = 0;
273
274 /* Scan hash tables for applicable entries */
275 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
276 count = 0;
277 CACHE_RLOCK();
278 LIST_FOREACH(ncp, ncpp, nc_hash) {
279 count++;
280 }
281 CACHE_RUNLOCK();
282 if (count)
283 used++;
284 if (maxlength < count)
285 maxlength = count;
286 }
287 n_nchash = nchash + 1;
288 pct = (used * 100 * 100) / n_nchash;
289 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
290 if (error)
291 return (error);
292 error = SYSCTL_OUT(req, &used, sizeof(used));
293 if (error)
294 return (error);
295 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
296 if (error)
297 return (error);
298 error = SYSCTL_OUT(req, &pct, sizeof(pct));
299 if (error)
300 return (error);
301 return (0);
302}
303SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
304 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
305 "nchash chain lengths");
306#endif
307
308/*
309 * cache_zap():
310 *
311 * Removes a namecache entry from cache, whether it contains an actual
312 * pointer to a vnode or if it is just a negative cache entry.
313 */
314static void
315cache_zap(ncp)
316 struct namecache *ncp;
317{
318 struct vnode *vp;
319
320 rw_assert(&cache_lock, RA_WLOCKED);
321 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
322#ifdef KDTRACE_HOOKS
323 if (ncp->nc_vp != NULL) {
324 SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
325 ncp->nc_name, ncp->nc_vp, 0, 0);
326 } else {
327 SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
328 ncp->nc_name, 0, 0, 0);
329 }
330#endif
331 vp = NULL;
332 LIST_REMOVE(ncp, nc_hash);
333 if (ncp->nc_flag & NCF_ISDOTDOT) {
334 if (ncp == ncp->nc_dvp->v_cache_dd)
335 ncp->nc_dvp->v_cache_dd = NULL;
336 } else {
337 LIST_REMOVE(ncp, nc_src);
338 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
339 vp = ncp->nc_dvp;
340 numcachehv--;
341 }
342 }
343 if (ncp->nc_vp) {
344 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
345 if (ncp == ncp->nc_vp->v_cache_dd)
346 ncp->nc_vp->v_cache_dd = NULL;
347 } else {
348 TAILQ_REMOVE(&ncneg, ncp, nc_dst);
349 numneg--;
350 }
351 numcache--;
352 cache_free(ncp);
353 if (vp)
354 vdrop(vp);
355}
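/*
 * [Editor's sketch: callers must hold the cache write lock, as the
 * rw_assert() above enforces.  cache_purge() later in this file is
 * the canonical pattern:]
 */
#if 0
	CACHE_WLOCK();
	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	CACHE_WUNLOCK();
#endif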
356
357/*
358 * Lookup an entry in the cache
359 *
360 * Lookup is called with dvp pointing to the directory to search,
361 * cnp pointing to the name of the entry being sought. If the lookup
362 * succeeds, the vnode is returned in *vpp, and a status of -1 is
363 * returned. If the lookup determines that the name does not exist
 364 * (negative caching), a status of ENOENT is returned. If the lookup
365 * fails, a status of zero is returned. If the directory vnode is
366 * recycled out from under us due to a forced unmount, a status of
367 * ENOENT is returned.
368 *
369 * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
370 * unlocked. If we're looking up . an extra ref is taken, but the lock is
371 * not recursively acquired.
372 */
373
374int
375cache_lookup(dvp, vpp, cnp)
376 struct vnode *dvp;
377 struct vnode **vpp;
378 struct componentname *cnp;
379{
380 struct namecache *ncp;
381 u_int32_t hash;
382 int error, ltype, wlocked;
383
384 if (!doingcache) {
385 cnp->cn_flags &= ~MAKEENTRY;
386 return (0);
387 }
388retry:
389 CACHE_RLOCK();
390 wlocked = 0;
391 numcalls++;
392 error = 0;
393
394retry_wlocked:
395 if (cnp->cn_nameptr[0] == '.') {
396 if (cnp->cn_namelen == 1) {
397 *vpp = dvp;
398 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
399 dvp, cnp->cn_nameptr);
400 dothits++;
401 SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
402 *vpp, 0, 0);
403 goto success;
404 }
405 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
406 dotdothits++;
407 if (dvp->v_cache_dd == NULL) {
408 SDT_PROBE(vfs, namecache, lookup, miss, dvp,
409 "..", NULL, 0, 0);
410 goto unlock;
411 }
412 if ((cnp->cn_flags & MAKEENTRY) == 0) {
413 if (!wlocked && !CACHE_UPGRADE_LOCK())
414 goto wlock;
415 if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
416 cache_zap(dvp->v_cache_dd);
417 dvp->v_cache_dd = NULL;
418 goto unlock;
419 }
420 if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
421 *vpp = dvp->v_cache_dd->nc_vp;
422 else
423 *vpp = dvp->v_cache_dd->nc_dvp;
424 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
425 dvp, cnp->cn_nameptr, *vpp);
426 SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
427 *vpp, 0, 0);
428 goto success;
429 }
430 }
431
432 hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
433 hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
434 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
435 numchecks++;
436 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
437 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
438 break;
439 }
440
441 /* We failed to find an entry */
442 if (ncp == NULL) {
443 SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
444 NULL, 0, 0);
445 if ((cnp->cn_flags & MAKEENTRY) == 0) {
446 nummisszap++;
447 } else {
448 nummiss++;
449 }
450 nchstats.ncs_miss++;
451 goto unlock;
452 }
453
454 /* We don't want to have an entry, so dump it */
455 if ((cnp->cn_flags & MAKEENTRY) == 0) {
456 numposzaps++;
457 nchstats.ncs_badhits++;
458 if (!wlocked && !CACHE_UPGRADE_LOCK())
459 goto wlock;
460 cache_zap(ncp);
461 CACHE_WUNLOCK();
462 return (0);
463 }
464
465 /* We found a "positive" match, return the vnode */
466 if (ncp->nc_vp) {
467 numposhits++;
468 nchstats.ncs_goodhits++;
469 *vpp = ncp->nc_vp;
470 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
471 dvp, cnp->cn_nameptr, *vpp, ncp);
472 SDT_PROBE(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
473 *vpp, 0, 0);
474 goto success;
475 }
476
477 /* We found a negative match, and want to create it, so purge */
478 if (cnp->cn_nameiop == CREATE) {
479 numnegzaps++;
480 nchstats.ncs_badhits++;
481 if (!wlocked && !CACHE_UPGRADE_LOCK())
482 goto wlock;
483 cache_zap(ncp);
484 CACHE_WUNLOCK();
485 return (0);
486 }
487
488 if (!wlocked && !CACHE_UPGRADE_LOCK())
489 goto wlock;
490 numneghits++;
491 /*
492 * We found a "negative" match, so we shift it to the end of
493 * the "negative" cache entries queue to satisfy LRU. Also,
494 * check to see if the entry is a whiteout; indicate this to
495 * the componentname, if so.
496 */
497 TAILQ_REMOVE(&ncneg, ncp, nc_dst);
498 TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
499 nchstats.ncs_neghits++;
500 if (ncp->nc_flag & NCF_WHITE)
501 cnp->cn_flags |= ISWHITEOUT;
502 SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, ncp->nc_name,
503 0, 0, 0);
504 CACHE_WUNLOCK();
505 return (ENOENT);
506
507wlock:
508 /*
509 * We need to update the cache after our lookup, so upgrade to
510 * a write lock and retry the operation.
511 */
512 CACHE_RUNLOCK();
513 CACHE_WLOCK();
514 numupgrades++;
515 wlocked = 1;
516 goto retry_wlocked;
517
518success:
519 /*
520 * On success we return a locked and ref'd vnode as per the lookup
521 * protocol.
522 */
523 if (dvp == *vpp) { /* lookup on "." */
524 VREF(*vpp);
525 if (wlocked)
526 CACHE_WUNLOCK();
527 else
528 CACHE_RUNLOCK();
529 /*
530 * When we lookup "." we still can be asked to lock it
531 * differently...
532 */
533 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
534 if (ltype != VOP_ISLOCKED(*vpp)) {
535 if (ltype == LK_EXCLUSIVE) {
536 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
537 if ((*vpp)->v_iflag & VI_DOOMED) {
538 /* forced unmount */
539 vrele(*vpp);
540 *vpp = NULL;
541 return (ENOENT);
542 }
543 } else
544 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
545 }
546 return (-1);
547 }
548 ltype = 0; /* silence gcc warning */
549 if (cnp->cn_flags & ISDOTDOT) {
550 ltype = VOP_ISLOCKED(dvp);
551 VOP_UNLOCK(dvp, 0);
552 }
553 VI_LOCK(*vpp);
554 if (wlocked)
555 CACHE_WUNLOCK();
556 else
557 CACHE_RUNLOCK();
558 error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
559 if (cnp->cn_flags & ISDOTDOT) {
560 vn_lock(dvp, ltype | LK_RETRY);
561 if (dvp->v_iflag & VI_DOOMED) {
562 if (error == 0)
563 vput(*vpp);
564 *vpp = NULL;
565 return (ENOENT);
566 }
567 }
568 if (error) {
569 *vpp = NULL;
570 goto retry;
571 }
572 if ((cnp->cn_flags & ISLASTCN) &&
573 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
574 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
575 }
576 return (-1);
577
578unlock:
579 if (wlocked)
580 CACHE_WUNLOCK();
581 else
582 CACHE_RUNLOCK();
583 return (0);
584}
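/*
 * [Editor's sketch of the three-way return protocol documented above,
 * exactly as vfs_cache_lookup() consumes it later in this file: 0 is
 * a miss (fall through to the filesystem), -1 a positive hit with
 * *vpp locked and referenced, ENOENT a cached negative entry.]
 */
#if 0
	error = cache_lookup(dvp, vpp, cnp);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
#endif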
585
586/*
587 * Add an entry to the cache.
588 */
589void
590cache_enter(dvp, vp, cnp)
591 struct vnode *dvp;
592 struct vnode *vp;
593 struct componentname *cnp;
594{
595 struct namecache *ncp, *n2;
596 struct nchashhead *ncpp;
597 u_int32_t hash;
598 int flag;
599 int hold;
600 int zap;
601 int len;
602
603 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
604 VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
605 ("cahe_enter: Adding a doomed vnode"));
606
607 if (!doingcache)
608 return;
609
610 /*
611 * Avoid blowout in namecache entries.
612 */
613 if (numcache >= desiredvnodes * 2)
614 return;
615
616 flag = 0;
617 if (cnp->cn_nameptr[0] == '.') {
618 if (cnp->cn_namelen == 1)
619 return;
620 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
621 CACHE_WLOCK();
622 /*
623 * If dotdot entry already exists, just retarget it
624 * to new parent vnode, otherwise continue with new
625 * namecache entry allocation.
626 */
627 if ((ncp = dvp->v_cache_dd) != NULL) {
628 if (ncp->nc_flag & NCF_ISDOTDOT) {
629 KASSERT(ncp->nc_dvp == dvp,
630 ("wrong isdotdot parent"));
631 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
632 ncp, nc_dst);
633 TAILQ_INSERT_HEAD(&vp->v_cache_dst,
634 ncp, nc_dst);
635 ncp->nc_vp = vp;
636 CACHE_WUNLOCK();
637 return;
638 }
639 }
640 dvp->v_cache_dd = NULL;
641 SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
642 0, 0);
643 CACHE_WUNLOCK();
644 flag = NCF_ISDOTDOT;
645 }
646 }
647
648 hold = 0;
649 zap = 0;
650
651 /*
652 * Calculate the hash key and setup as much of the new
653 * namecache entry as possible before acquiring the lock.
654 */
655 ncp = cache_alloc(cnp->cn_namelen);
656 ncp->nc_vp = vp;
657 ncp->nc_dvp = dvp;
658 ncp->nc_flag = flag;
659 len = ncp->nc_nlen = cnp->cn_namelen;
660 hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
661 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
662 hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
663 CACHE_WLOCK();
664
665 /*
666 * See if this vnode or negative entry is already in the cache
667 * with this name. This can happen with concurrent lookups of
668 * the same path name.
669 */
670 ncpp = NCHHASH(hash);
671 LIST_FOREACH(n2, ncpp, nc_hash) {
672 if (n2->nc_dvp == dvp &&
673 n2->nc_nlen == cnp->cn_namelen &&
674 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
675 CACHE_WUNLOCK();
676 cache_free(ncp);
677 return;
678 }
679 }
680
(r190942, deleted: lines 681-701)

 681 /*
 682 * See if we are trying to add .. entry, but some other lookup
 683 * has populated v_cache_dd pointer already.
 684 */
 685 if (flag == NCF_ISDOTDOT && dvp->v_cache_dd != NULL) {
 686 CACHE_WUNLOCK();
 687 cache_free(ncp);
 688 return;
 689 }
 690
 691 numcache++;
 692 if (!vp) {
 693 numneg++;
 694 if (cnp->cn_flags & ISWHITEOUT)
 695 ncp->nc_flag |= NCF_WHITE;
 696 } else if (vp->v_type == VDIR) {
 697 if (flag == NCF_ISDOTDOT) {
 698 KASSERT(dvp->v_cache_dd == NULL,
 699 ("dangling v_cache_dd"));
 700 dvp->v_cache_dd = ncp;
 701 } else {

(r190945, added: lines 681-702)

 681 if (flag == NCF_ISDOTDOT) {
 682 /*
 683 * See if we are trying to add .. entry, but some other lookup
 684 * has populated v_cache_dd pointer already.
 685 */
 686 if (dvp->v_cache_dd != NULL) {
 687 CACHE_WUNLOCK();
 688 cache_free(ncp);
 689 return;
 690 }
 691 KASSERT(vp == NULL || vp->v_type == VDIR,
 692 ("wrong vnode type %p", vp));
 693 dvp->v_cache_dd = ncp;
 694 }
 695
 696 numcache++;
 697 if (!vp) {
 698 numneg++;
 699 if (cnp->cn_flags & ISWHITEOUT)
 700 ncp->nc_flag |= NCF_WHITE;
 701 } else if (vp->v_type == VDIR) {
 702 if (flag != NCF_ISDOTDOT) {
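/*
 * [Editor's note, not in either revision: from old line 702 (new 703)
 * the two revisions coincide again, so the remainder of the file is
 * shown once with r190942 numbering.  The r190945 change hoists the
 * ".." bookkeeping: dvp->v_cache_dd is now assigned in the same
 * NCF_ISDOTDOT branch that detects a concurrently added entry, which
 * also covers negative (vp == NULL) dotdot entries, guarded by a new
 * KASSERT that the target is NULL or a directory.  The VDIR branch
 * below keeps only the non-dotdot retargeting of vp->v_cache_dd.]
 */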
702 if ((n2 = vp->v_cache_dd) != NULL &&
703 (n2->nc_flag & NCF_ISDOTDOT) != 0)
704 cache_zap(n2);
705 vp->v_cache_dd = ncp;
706 }
707 } else {
708 vp->v_cache_dd = NULL;
709 }
710
711 /*
712 * Insert the new namecache entry into the appropriate chain
713 * within the cache entries table.
714 */
715 LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
716 if (flag != NCF_ISDOTDOT) {
717 if (LIST_EMPTY(&dvp->v_cache_src)) {
718 hold = 1;
719 numcachehv++;
720 }
721 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
722 }
723
724 /*
725 * If the entry is "negative", we place it into the
726 * "negative" cache queue, otherwise, we place it into the
727 * destination vnode's cache entries queue.
728 */
729 if (vp) {
730 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
731 SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp,
732 0, 0);
733 } else {
734 TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
735 SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
736 ncp->nc_name, 0, 0, 0);
737 }
738 if (numneg * ncnegfactor > numcache) {
739 ncp = TAILQ_FIRST(&ncneg);
740 zap = 1;
741 }
742 if (hold)
743 vhold(dvp);
744 if (zap)
745 cache_zap(ncp);
746 CACHE_WUNLOCK();
747}
748
749/*
750 * Name cache initialization, from vfs_init() when we are booting
751 */
752static void
753nchinit(void *dummy __unused)
754{
755
756 TAILQ_INIT(&ncneg);
757
758 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
759 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
760 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
761 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
762
763 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
764}
765SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
766
767
768/*
769 * Invalidate all entries to a particular vnode.
770 */
771void
772cache_purge(vp)
773 struct vnode *vp;
774{
775
776 CTR1(KTR_VFS, "cache_purge(%p)", vp);
777 SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
778 CACHE_WLOCK();
779 while (!LIST_EMPTY(&vp->v_cache_src))
780 cache_zap(LIST_FIRST(&vp->v_cache_src));
781 while (!TAILQ_EMPTY(&vp->v_cache_dst))
782 cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
783 if (vp->v_cache_dd != NULL) {
784 KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
785 ("lost dotdot link"));
786 cache_zap(vp->v_cache_dd);
787 }
788 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
789 CACHE_WUNLOCK();
790}
791
792/*
793 * Invalidate all negative entries for a particular directory vnode.
794 */
795void
796cache_purge_negative(vp)
797 struct vnode *vp;
798{
799 struct namecache *cp, *ncp;
800
801 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
802 SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
803 CACHE_WLOCK();
804 LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
805 if (cp->nc_vp == NULL)
806 cache_zap(cp);
807 }
808 CACHE_WUNLOCK();
809}
810
811/*
812 * Flush all entries referencing a particular filesystem.
813 */
814void
815cache_purgevfs(mp)
816 struct mount *mp;
817{
818 struct nchashhead *ncpp;
819 struct namecache *ncp, *nnp;
820
821 /* Scan hash tables for applicable entries */
822 SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
823 CACHE_WLOCK();
824 for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
825 LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
826 if (ncp->nc_dvp->v_mount == mp)
827 cache_zap(ncp);
828 }
829 }
830 CACHE_WUNLOCK();
831}
832
833/*
834 * Perform canonical checks and cache lookup and pass on to filesystem
835 * through the vop_cachedlookup only if needed.
836 */
837
838int
839vfs_cache_lookup(ap)
840 struct vop_lookup_args /* {
841 struct vnode *a_dvp;
842 struct vnode **a_vpp;
843 struct componentname *a_cnp;
844 } */ *ap;
845{
846 struct vnode *dvp;
847 int error;
848 struct vnode **vpp = ap->a_vpp;
849 struct componentname *cnp = ap->a_cnp;
850 struct ucred *cred = cnp->cn_cred;
851 int flags = cnp->cn_flags;
852 struct thread *td = cnp->cn_thread;
853
854 *vpp = NULL;
855 dvp = ap->a_dvp;
856
857 if (dvp->v_type != VDIR)
858 return (ENOTDIR);
859
860 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
861 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
862 return (EROFS);
863
864 error = VOP_ACCESS(dvp, VEXEC, cred, td);
865 if (error)
866 return (error);
867
868 error = cache_lookup(dvp, vpp, cnp);
869 if (error == 0)
870 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
871 if (error == -1)
872 return (0);
873 return (error);
874}
875
876
877#ifndef _SYS_SYSPROTO_H_
878struct __getcwd_args {
879 u_char *buf;
880 u_int buflen;
881};
882#endif
883
884/*
885 * XXX All of these sysctls would probably be more productive dead.
886 */
887static int disablecwd;
888SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
889 "Disable the getcwd syscall");
890
891/* Implementation of the getcwd syscall. */
892int
893__getcwd(td, uap)
894 struct thread *td;
895 struct __getcwd_args *uap;
896{
897
898 return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
899}
900
901int
902kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
903{
904 char *bp, *tmpbuf;
905 struct filedesc *fdp;
906 struct vnode *cdir, *rdir;
907 int error, vfslocked;
908
909 if (disablecwd)
910 return (ENODEV);
911 if (buflen < 2)
912 return (EINVAL);
913 if (buflen > MAXPATHLEN)
914 buflen = MAXPATHLEN;
915
916 tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
917 fdp = td->td_proc->p_fd;
918 FILEDESC_SLOCK(fdp);
919 cdir = fdp->fd_cdir;
920 VREF(cdir);
921 rdir = fdp->fd_rdir;
922 VREF(rdir);
923 FILEDESC_SUNLOCK(fdp);
924 error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
925 vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
926 vrele(rdir);
927 VFS_UNLOCK_GIANT(vfslocked);
928 vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
929 vrele(cdir);
930 VFS_UNLOCK_GIANT(vfslocked);
931
932 if (!error) {
933 if (bufseg == UIO_SYSSPACE)
934 bcopy(bp, buf, strlen(bp) + 1);
935 else
936 error = copyout(bp, buf, strlen(bp) + 1);
937#ifdef KTRACE
938 if (KTRPOINT(curthread, KTR_NAMEI))
939 ktrnamei(bp);
940#endif
941 }
942 free(tmpbuf, M_TEMP);
943 return (error);
944}
945
946/*
947 * Thus begins the fullpath magic.
948 */
949
950#undef STATNODE
951#define STATNODE(name) \
952 static u_int name; \
953 SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
954
955static int disablefullpath;
956SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
957 "Disable the vn_fullpath function");
958
959/* These count for kern___getcwd(), too. */
960STATNODE(numfullpathcalls);
961STATNODE(numfullpathfail1);
962STATNODE(numfullpathfail2);
963STATNODE(numfullpathfail4);
964STATNODE(numfullpathfound);
965
966/*
 967 * Retrieve the full filesystem path that corresponds to a vnode from the name
 968 * cache (if available).
969 */
970int
971vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
972{
973 char *buf;
974 struct filedesc *fdp;
975 struct vnode *rdir;
976 int error, vfslocked;
977
978 if (disablefullpath)
979 return (ENODEV);
980 if (vn == NULL)
981 return (EINVAL);
982
983 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
984 fdp = td->td_proc->p_fd;
985 FILEDESC_SLOCK(fdp);
986 rdir = fdp->fd_rdir;
987 VREF(rdir);
988 FILEDESC_SUNLOCK(fdp);
989 error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
990 vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
991 vrele(rdir);
992 VFS_UNLOCK_GIANT(vfslocked);
993
994 if (!error)
995 *freebuf = buf;
996 else
997 free(buf, M_TEMP);
998 return (error);
999}
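/*
 * [Editor's sketch: a typical consumer.  The path is assembled from
 * the tail of the malloc'd buffer, so on success *retbuf points into
 * *freebuf; the caller uses *retbuf and frees *freebuf with M_TEMP.]
 */
#if 0
	char *fullpath, *freepath;

	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0) {
		printf("path: %s\n", fullpath);
		free(freepath, M_TEMP);
	}
#endif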
1000
1001/*
1002 * This function is similar to vn_fullpath, but it attempts to lookup the
1003 * pathname relative to the global root mount point. This is required for the
1004 * auditing sub-system, as audited pathnames must be absolute, relative to the
1005 * global root mount point.
1006 */
1007int
1008vn_fullpath_global(struct thread *td, struct vnode *vn,
1009 char **retbuf, char **freebuf)
1010{
1011 char *buf;
1012 int error;
1013
1014 if (disablefullpath)
1015 return (ENODEV);
1016 if (vn == NULL)
1017 return (EINVAL);
1018 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1019 error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
1020 if (!error)
1021 *freebuf = buf;
1022 else
1023 free(buf, M_TEMP);
1024 return (error);
1025}
1026
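/*
 * [Editor's summary of the helper below: resolve one pathname
 * component by asking the filesystem (VOP_VPTOCNP) for the name of
 * *vp in its parent, prepending it at *bp, and step *vp up to that
 * parent.  Entered with the cache read-locked; the lock is dropped
 * around the VOP call and retaken before returning.]
 */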
1027static int
1028vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen)
1029{
1030 struct vnode *dvp;
1031 int error, vfslocked;
1032
1033 vhold(*vp);
1034 CACHE_RUNLOCK();
1035 vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
1036 vn_lock(*vp, LK_SHARED | LK_RETRY);
1037 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
1038 VOP_UNLOCK(*vp, 0);
1039 vdrop(*vp);
1040 VFS_UNLOCK_GIANT(vfslocked);
1041 if (error) {
1042 numfullpathfail2++;
1043 return (error);
1044 }
1045 *bp = buf + *buflen;
1046 *vp = dvp;
1047 CACHE_RLOCK();
1048 if ((*vp)->v_iflag & VI_DOOMED) {
1049 /* forced unmount */
1050 CACHE_RUNLOCK();
1051 vdrop(*vp);
1052 return (ENOENT);
1053 }
1054 vdrop(*vp);
1055
1056 return (0);
1057}
1058
1059/*
1060 * The magic behind kern___getcwd() and vn_fullpath().
1061 */
1062static int
1063vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
1064 char *buf, char **retbuf, u_int buflen)
1065{
1066 char *bp;
1067 int error, i, slash_prefixed;
1068 struct namecache *ncp;
1069#ifdef KDTRACE_HOOKS
1070 struct vnode *startvp = vp;
1071#endif
1072
1073 buflen--;
1074 bp = buf + buflen;
1075 *bp = '\0';
1076 error = 0;
1077 slash_prefixed = 0;
1078
1079 SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
1080 CACHE_RLOCK();
1081 numfullpathcalls++;
1082 if (vp->v_type != VDIR) {
1083 ncp = TAILQ_FIRST(&vp->v_cache_dst);
1084 if (ncp != NULL) {
1085 buflen -= ncp->nc_nlen;
1086 for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
1087 *--bp = ncp->nc_name[i];
1088 if (bp == buf) {
1089 numfullpathfail4++;
1090 CACHE_RUNLOCK();
1091 error = ENOMEM;
1092 SDT_PROBE(vfs, namecache, fullpath, return,
1093 error, startvp, NULL, 0, 0);
1094 return (error);
1095 }
1096 SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
1097 ncp->nc_name, vp, 0, 0);
1098 vp = ncp->nc_dvp;
1099 } else {
1100 SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0,
1101 0, 0);
1102 error = vn_vptocnp(&vp, &bp, buf, &buflen);
1103 if (error) {
1104 SDT_PROBE(vfs, namecache, fullpath, return,
1105 error, startvp, NULL, 0, 0);
1106 return (error);
1107 }
1108 }
1109 if (buflen <= 0) {
1110 numfullpathfail4++;
1111 CACHE_RUNLOCK();
1112 error = ENOMEM;
1113 SDT_PROBE(vfs, namecache, fullpath, return, error,
1114 startvp, NULL, 0, 0);
1115 return (error);
1116 }
1117 *--bp = '/';
1118 buflen--;
1119 slash_prefixed = 1;
1120 }
1121 while (vp != rdir && vp != rootvnode) {
1122 if (vp->v_vflag & VV_ROOT) {
1123 if (vp->v_iflag & VI_DOOMED) { /* forced unmount */
1124 CACHE_RUNLOCK();
1125 error = ENOENT;
1126 break;
1127 }
1128 vp = vp->v_mount->mnt_vnodecovered;
1129 continue;
1130 }
1131 if (vp->v_type != VDIR) {
1132 numfullpathfail1++;
1133 CACHE_RUNLOCK();
1134 error = ENOTDIR;
1135 break;
1136 }
1137 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
1138 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1139 break;
1140 if (ncp != NULL) {
1141 buflen -= ncp->nc_nlen;
1142 for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
1143 *--bp = ncp->nc_name[i];
1144 if (bp == buf) {
1145 numfullpathfail4++;
1146 CACHE_RUNLOCK();
1147 error = ENOMEM;
1148 break;
1149 }
1150 SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
1151 ncp->nc_name, vp, 0, 0);
1152 vp = ncp->nc_dvp;
1153 } else {
1154 SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0,
1155 0, 0);
1156 error = vn_vptocnp(&vp, &bp, buf, &buflen);
1157 if (error)
1158 break;
1159 }
1160 if (buflen <= 0) {
1161 numfullpathfail4++;
1162 CACHE_RUNLOCK();
1163 error = ENOMEM;
1164 break;
1165 }
1166 *--bp = '/';
1167 buflen--;
1168 slash_prefixed = 1;
1169 }
1170 if (error) {
1171 SDT_PROBE(vfs, namecache, fullpath, return, error, startvp,
1172 NULL, 0, 0);
1173 return (error);
1174 }
1175 if (!slash_prefixed) {
1176 if (bp == buf) {
1177 numfullpathfail4++;
1178 CACHE_RUNLOCK();
1179 SDT_PROBE(vfs, namecache, fullpath, return, 0,
1180 startvp, bp, 0, 0);
1181 return (ENOMEM);
1182 } else
1183 *--bp = '/';
1184 }
1185 numfullpathfound++;
1186 CACHE_RUNLOCK();
1187
1188 SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, bp, 0, 0);
1189 *retbuf = bp;
1190 return (0);
1191}
1192
1193int
1194vn_commname(struct vnode *vp, char *buf, u_int buflen)
1195{
1196 struct namecache *ncp;
1197 int l;
1198
1199 CACHE_RLOCK();
1200 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
1201 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1202 break;
1203 if (ncp == NULL) {
1204 CACHE_RUNLOCK();
1205 return (ENOENT);
1206 }
1207 l = min(ncp->nc_nlen, buflen - 1);
1208 memcpy(buf, ncp->nc_name, l);
1209 CACHE_RUNLOCK();
1210 buf[l] = '\0';
1211 return (0);
1212}
			if ((n2 = vp->v_cache_dd) != NULL &&
			    (n2->nc_flag & NCF_ISDOTDOT) != 0)
				cache_zap(n2);
			vp->v_cache_dd = ncp;
		}
	} else {
		vp->v_cache_dd = NULL;
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			hold = 1;
			numcachehv++;
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp,
		    0, 0);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
		SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name, 0, 0, 0);
	}
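	/*
	 * Cap the negative cache: once negative entries outnumber
	 * numcache / ncnegfactor, reclaim the oldest one, which sits at
	 * the head of the ncneg queue.
	 */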
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		zap = 1;
	}
	if (hold)
		vhold(dvp);
	if (zap)
		cache_zap(ncp);
	CACHE_WUNLOCK();
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);

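	/*
	 * Namecache entries come from two fixed-size UMA zones, one for
	 * short names and one for long ones; the size cutoffs are given
	 * by CACHE_ZONE_SMALL and CACHE_ZONE_LARGE, defined earlier.
	 */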
	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

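	/*
	 * Size the hash table relative to the maximum number of vnodes
	 * in the system so chains stay short; hashinit() sizes the table
	 * to a power of two and hands back the chain mask in nchash.
	 */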
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

/*
 * Invalidate all entries to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
	CACHE_WLOCK();
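	/*
	 * The cache can reference a vnode in three ways: as the source
	 * directory of entries (v_cache_src), as the target of entries
	 * (v_cache_dst), and via its cached ".." entry (v_cache_dd).
	 * All three must be emptied.
	 */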
	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
	if (vp->v_cache_dd != NULL) {
		KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		cache_zap(vp->v_cache_dd);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	CACHE_WUNLOCK();
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	struct namecache *cp, *ncp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
	CACHE_WLOCK();
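	/*
	 * Negative entries live on the directory's v_cache_src list and
	 * are recognized by a NULL target vnode (nc_vp).
	 */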
	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
		if (cp->nc_vp == NULL)
			cache_zap(cp);
	}
	CACHE_WUNLOCK();
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
	CACHE_WLOCK();
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
			if (ncp->nc_dvp->v_mount == mp)
				cache_zap(ncp);
		}
	}
	CACHE_WUNLOCK();
}

/*
 * Perform canonical checks and a cache lookup; pass the lookup on to the
 * filesystem through VOP_CACHEDLOOKUP only when the cache cannot answer.
 */
int
vfs_cache_lookup(struct vop_lookup_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
} */ *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);
	if (error)
		return (error);

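	/*
	 * cache_lookup() returns 0 on a miss (fall through to the
	 * filesystem), -1 on a positive hit with *vpp set, and ENOENT
	 * when a cached negative entry says the name does not exist.
	 */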
	error = cache_lookup(dvp, vpp, cnp);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
__getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
}

int
kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error, vfslocked;

	if (disablecwd)
		return (ENODEV);
	if (buflen < 2)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

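	/*
	 * vn_fullpath1() assembles the path from the end of tmpbuf
	 * backwards; on success bp points at the first byte of the
	 * NUL-terminated result, somewhere inside tmpbuf.
	 */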
	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	VREF(cdir);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
	vrele(rdir);
	VFS_UNLOCK_GIANT(vfslocked);
	vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
	vrele(cdir);
	VFS_UNLOCK_GIANT(vfslocked);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

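/*
 * STATNODE(name) declares a u_int counter and exports it read-only as
 * the vfs.cache.<name> sysctl.
 */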
#undef STATNODE
#define STATNODE(name)						\
	static u_int name;					\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
    "Disable the vn_fullpath function");

/* These count for kern___getcwd(), too. */
STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

/*
 * Retrieve the full filesystem path corresponding to a vnode from the name
 * cache (if available).
 */
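/*
 * Calling convention, as an illustrative sketch (not taken from this
 * file): on success *retbuf points at the path inside an allocated
 * buffer that the caller must release through *freebuf:
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0) {
 *		printf("path: %s\n", fullpath);
 *		free(freepath, M_TEMP);
 *	}
 */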
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	int error, vfslocked;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
	vrele(rdir);
	VFS_UNLOCK_GIANT(vfslocked);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for
 * the auditing sub-system, as audited pathnames must be absolute, relative
 * to the global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	int error;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

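/*
 * Resolve one step up the tree: ask the filesystem, via VOP_VPTOCNP(),
 * for the name of *vp in its parent directory.  On success the name has
 * been placed at the tail of buf (with *buflen moved back accordingly),
 * *bp points at it, and *vp is replaced by the parent vnode.  The
 * namecache read lock is dropped around the VOP call and reacquired
 * before a successful return; on error it is left unlocked.
 */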
static int
vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen)
{
	struct vnode *dvp;
	int error, vfslocked;

	vhold(*vp);
	CACHE_RUNLOCK();
	vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
	VOP_UNLOCK(*vp, 0);
	vdrop(*vp);
	VFS_UNLOCK_GIANT(vfslocked);
	if (error) {
		numfullpathfail2++;
		return (error);
	}
	*bp = buf + *buflen;
	*vp = dvp;
	CACHE_RLOCK();
	if ((*vp)->v_iflag & VI_DOOMED) {
		/* forced unmount */
		CACHE_RUNLOCK();
		vdrop(*vp);
		return (ENOENT);
	}
	vdrop(*vp);

	return (0);
}

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
	char *bp;
	int error, i, slash_prefixed;
	struct namecache *ncp;
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif

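	/*
	 * The path is built backwards: bp starts at the last byte of buf
	 * and each component, preceded by a '/', is copied in front of
	 * the previous one.  One byte is reserved up front for the
	 * terminating NUL.
	 */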
	buflen--;
	bp = buf + buflen;
	*bp = '\0';
	error = 0;
	slash_prefixed = 0;

	SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
	CACHE_RLOCK();
	numfullpathcalls++;
	if (vp->v_type != VDIR) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (ncp != NULL) {
			buflen -= ncp->nc_nlen;
			for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
				*--bp = ncp->nc_name[i];
			if (bp == buf) {
				numfullpathfail4++;
				CACHE_RUNLOCK();
				error = ENOMEM;
				SDT_PROBE(vfs, namecache, fullpath, return,
				    error, startvp, NULL, 0, 0);
				return (error);
			}
			SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
			    ncp->nc_name, vp, 0, 0);
			vp = ncp->nc_dvp;
		} else {
			SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0,
			    0, 0);
			error = vn_vptocnp(&vp, &bp, buf, &buflen);
			if (error) {
				SDT_PROBE(vfs, namecache, fullpath, return,
				    error, startvp, NULL, 0, 0);
				return (error);
			}
		}
		if (buflen <= 0) {
			numfullpathfail4++;
			CACHE_RUNLOCK();
			error = ENOMEM;
			SDT_PROBE(vfs, namecache, fullpath, return, error,
			    startvp, NULL, 0, 0);
			return (error);
		}
		*--bp = '/';
		buflen--;
		slash_prefixed = 1;
	}
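	/*
	 * Walk up the tree, one component per iteration, until the root
	 * directory (or the process root) is reached.  Mount points are
	 * crossed by stepping from a filesystem root (VV_ROOT) to the
	 * vnode it covers.  Dotdot entries are skipped; only real names
	 * are usable in the assembled path.
	 */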
	while (vp != rdir && vp != rootvnode) {
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
				CACHE_RUNLOCK();
				error = ENOENT;
				break;
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_type != VDIR) {
			numfullpathfail1++;
			CACHE_RUNLOCK();
			error = ENOTDIR;
			break;
		}
		TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
			if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
				break;
		if (ncp != NULL) {
			buflen -= ncp->nc_nlen;
			for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
				*--bp = ncp->nc_name[i];
			if (bp == buf) {
				numfullpathfail4++;
				CACHE_RUNLOCK();
				error = ENOMEM;
				break;
			}
			SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
			    ncp->nc_name, vp, 0, 0);
			vp = ncp->nc_dvp;
		} else {
			SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0,
			    0, 0);
			error = vn_vptocnp(&vp, &bp, buf, &buflen);
			if (error)
				break;
		}
		if (buflen <= 0) {
			numfullpathfail4++;
			CACHE_RUNLOCK();
			error = ENOMEM;
			break;
		}
		*--bp = '/';
		buflen--;
		slash_prefixed = 1;
	}
	if (error) {
		SDT_PROBE(vfs, namecache, fullpath, return, error, startvp,
		    NULL, 0, 0);
		return (error);
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numfullpathfail4++;
			CACHE_RUNLOCK();
			error = ENOMEM;
			SDT_PROBE(vfs, namecache, fullpath, return, error,
			    startvp, NULL, 0, 0);
			return (error);
		} else
			*--bp = '/';
	}
	numfullpathfound++;
	CACHE_RUNLOCK();

	SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, bp, 0, 0);
	*retbuf = bp;
	return (0);
}

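/*
 * Return some cached name for the vnode: the first entry on its
 * v_cache_dst list that is not a ".." entry.  The result is truncated
 * to buflen - 1 bytes and NUL-terminated.
 */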
int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	int l;

	CACHE_RLOCK();
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		CACHE_RUNLOCK();
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	CACHE_RUNLOCK();
	buf[l] = '\0';
	return (0);
}