1/*	$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $	*/
2
3/*-
4 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Copyright (c) 1989, 1993
35 *	The Regents of the University of California.  All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 *    notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 *    notice, this list of conditions and the following disclaimer in the
49 *    documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 *    may be used to endorse or promote products derived from this software
52 *    without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67 */
68
69/*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 *	Normally, there are two points where new vnodes are created:
75 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
76 *	starts in one of the following ways:
77 *
78 *	- Allocation, via vcache_get(9) or vcache_new(9).
79 *	- Reclamation of inactive vnode, via vcache_vget(9).
80 *
81 *	Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
82 *	was another, traditional way.  Currently, only the draining thread
83 *	recycles the vnodes.  This behaviour might be revisited.
84 *
85 *	The life-cycle ends when the last reference is dropped, usually
86 *	in VOP_REMOVE(9).  In such case, VOP_INACTIVE(9) is called to inform
87 *	the file system that vnode is inactive.  Via this call, file system
88 *	indicates whether vnode can be recycled (usually, it checks its own
89 *	references, e.g. count of links, whether the file was removed).
90 *
91 *	Depending on indication, vnode can be put into a free list (cache),
92 *	or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
93 *	disassociate underlying file system from the vnode, and finally
94 *	destroyed.
95 *
96 * Vnode state
97 *
98 *	Vnode is always in one of six states:
99 *	- MARKER	This is a marker vnode to help list traversal.  It
100 *			will never change its state.
101 *	- LOADING	Vnode is associating underlying file system and not
102 *			yet ready to use.
103 *	- LOADED	Vnode has associated underlying file system and is
104 *			ready to use.
105 *	- BLOCKED	Vnode is active but cannot get new references.
106 *	- RECLAIMING	Vnode is disassociating from the underlying file
107 *			system.
108 *	- RECLAIMED	Vnode has disassociated from underlying file system
109 *			and is dead.
110 *
111 *	Valid state changes are:
112 *	LOADING -> LOADED
113 *			Vnode has been initialised in vcache_get() or
114 *			vcache_new() and is ready to use.
115 *	BLOCKED -> RECLAIMING
116 *			Vnode starts disassociation from underlying file
117 *			system in vcache_reclaim().
118 *	RECLAIMING -> RECLAIMED
119 *			Vnode finished disassociation from underlying file
120 *			system in vcache_reclaim().
121 *	LOADED -> BLOCKED
122 *			Either vcache_rekey*() is changing the vnode key or
123 *			vrelel() is about to call VOP_INACTIVE().
124 *	BLOCKED -> LOADED
125 *			The block condition is over.
126 *	LOADING -> RECLAIMED
127 *			Either vcache_get() or vcache_new() failed to
128 *			associate the underlying file system or vcache_rekey*()
129 *			drops a vnode used as placeholder.
130 *
131 *	Of these states LOADING, BLOCKED and RECLAIMING are intermediate
132 *	and it is possible to wait for state change.
133 *
134 *	State is protected with v_interlock with one exception:
135 *	to change from LOADING both v_interlock and vcache_lock must be held
136 *	so it is possible to check "state == LOADING" without holding
137 *	v_interlock.  See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 *	Vnode is considered active, if reference count (vnode_t::v_usecount)
142 *	is non-zero.  It is maintained using: vref(9) and vrele(9), as well
143 *	as vput(9), routines.  Common points holding references are e.g.
144 *	file openings, current working directory, mount points, etc.
145 *
146 *	v_usecount is adjusted with atomic operations, however to change
147 *	from a non-zero value to zero the interlock must also be held.
148 */
149
150#include <sys/cdefs.h>
151__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $");
152
153#ifdef _KERNEL_OPT
154#include "opt_pax.h"
155#endif
156
157#include <sys/param.h>
158#include <sys/kernel.h>
159
160#include <sys/atomic.h>
161#include <sys/buf.h>
162#include <sys/conf.h>
163#include <sys/device.h>
164#include <sys/hash.h>
165#include <sys/kauth.h>
166#include <sys/kmem.h>
167#include <sys/module.h>
168#include <sys/mount.h>
169#include <sys/namei.h>
170#include <sys/pax.h>
171#include <sys/syscallargs.h>
172#include <sys/sysctl.h>
173#include <sys/systm.h>
174#include <sys/threadpool.h>
175#include <sys/vnode_impl.h>
176#include <sys/wapbl.h>
177#include <sys/fstrans.h>
178
179#include <miscfs/deadfs/deadfs.h>
180#include <miscfs/specfs/specdev.h>
181
182#include <uvm/uvm.h>
183#include <uvm/uvm_readahead.h>
184#include <uvm/uvm_stat.h>
185
186/* Flags to vrelel. */
187#define	VRELEL_ASYNC	0x0001	/* Always defer to vrele thread. */
188
189#define	LRU_VRELE	0
190#define	LRU_FREE	1
191#define	LRU_HOLD	2
192#define	LRU_COUNT	3
193
194/*
195 * There are three lru lists: one holds vnodes waiting for async release,
196 * one is for vnodes which have no buffer/page references and one for those
197 * which do (i.e.  v_holdcnt is non-zero).  We put the lists into a single,
198 * private cache line as vnodes migrate between them while under the same
199 * lock (vdrain_lock).
200 */
201
202typedef struct {
203	vnode_impl_t *li_marker;
204} lru_iter_t;
205
206u_int			numvnodes		__cacheline_aligned;
207static vnodelst_t	lru_list[LRU_COUNT]	__cacheline_aligned;
208static struct threadpool *threadpool;
209static struct threadpool_job vdrain_job;
210static struct threadpool_job vrele_job;
211static kmutex_t		vdrain_lock		__cacheline_aligned;
212SLIST_HEAD(hashhead, vnode_impl);
213static kmutex_t		vcache_lock		__cacheline_aligned;
214static kcondvar_t	vcache_cv;
215static u_int		vcache_hashsize;
216static u_long		vcache_hashmask;
217static struct hashhead	*vcache_hashtab;
218static pool_cache_t	vcache_pool;
219static void		lru_requeue(vnode_t *, vnodelst_t *);
220static vnodelst_t *	lru_which(vnode_t *);
221static vnode_impl_t *	lru_iter_first(int, lru_iter_t *);
222static vnode_impl_t *	lru_iter_next(lru_iter_t *);
223static void		lru_iter_release(lru_iter_t *);
224static vnode_impl_t *	vcache_alloc(void);
225static void		vcache_dealloc(vnode_impl_t *);
226static void		vcache_free(vnode_impl_t *);
227static void		vcache_init(void);
228static void		vcache_reinit(void);
229static void		vcache_reclaim(vnode_t *);
230static void		vrele_deferred(vnode_impl_t *);
231static void		vrelel(vnode_t *, int, int);
232static void		vnpanic(vnode_t *, const char *, ...)
233    __printflike(2, 3);
234static bool		vdrain_one(u_int);
235static void		vdrain_task(struct threadpool_job *);
236static void		vrele_task(struct threadpool_job *);
237
238/* Routines having to do with the management of the vnode table. */
239
240/*
241 * The high bit of v_usecount is a gate for vcache_tryvget().  It's set
242 * only when the vnode state is LOADED.
243 * The next bit of v_usecount is a flag for vrelel().  It's set
244 * from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
245 */
246#define	VUSECOUNT_MASK	0x3fffffff
247#define	VUSECOUNT_GATE	0x80000000
248#define	VUSECOUNT_VGET	0x40000000
249
250/*
251 * Return the current usecount of a vnode.
252 */
253inline int
254vrefcnt(struct vnode *vp)
255{
256
257	return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
258}
259
260/* Vnode state operations and diagnostics. */
261
262#if defined(DIAGNOSTIC)
263
264#define VSTATE_VALID(state) \
265	((state) != VS_ACTIVE && (state) != VS_MARKER)
266#define VSTATE_GET(vp) \
267	vstate_assert_get((vp), __func__, __LINE__)
268#define VSTATE_CHANGE(vp, from, to) \
269	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
270#define VSTATE_WAIT_STABLE(vp) \
271	vstate_assert_wait_stable((vp), __func__, __LINE__)
272
273void
274_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
275    bool has_lock)
276{
277	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
278	int refcnt = vrefcnt(vp);
279
280	if (!has_lock) {
281		enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
282
283		if (state == VS_ACTIVE && refcnt > 0 &&
284		    (vstate == VS_LOADED || vstate == VS_BLOCKED))
285			return;
286		if (vstate == state)
287			return;
288		mutex_enter((vp)->v_interlock);
289	}
290
291	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
292
293	if ((state == VS_ACTIVE && refcnt > 0 &&
294	    (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
295	    vip->vi_state == state) {
296		if (!has_lock)
297			mutex_exit((vp)->v_interlock);
298		return;
299	}
300	vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
301	    vstate_name(vip->vi_state), refcnt,
302	    vstate_name(state), func, line);
303}
304
305static enum vnode_state
306vstate_assert_get(vnode_t *vp, const char *func, int line)
307{
308	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
309
310	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
311	if (! VSTATE_VALID(vip->vi_state))
312		vnpanic(vp, "state is %s at %s:%d",
313		    vstate_name(vip->vi_state), func, line);
314
315	return vip->vi_state;
316}
317
318static void
319vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
320{
321	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
322
323	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
324	if (! VSTATE_VALID(vip->vi_state))
325		vnpanic(vp, "state is %s at %s:%d",
326		    vstate_name(vip->vi_state), func, line);
327
328	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
329		cv_wait(&vp->v_cv, vp->v_interlock);
330
331	if (! VSTATE_VALID(vip->vi_state))
332		vnpanic(vp, "state is %s at %s:%d",
333		    vstate_name(vip->vi_state), func, line);
334}
335
336static void
337vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
338    const char *func, int line)
339{
340	bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
341	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
342
343	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
344	if (from == VS_LOADING)
345		KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
346
347	if (! VSTATE_VALID(from))
348		vnpanic(vp, "from is %s at %s:%d",
349		    vstate_name(from), func, line);
350	if (! VSTATE_VALID(to))
351		vnpanic(vp, "to is %s at %s:%d",
352		    vstate_name(to), func, line);
353	if (vip->vi_state != from)
354		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
355		    vstate_name(vip->vi_state), vstate_name(from), func, line);
356	if ((from == VS_LOADED) != gated)
357		vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
358		    vstate_name(vip->vi_state), gated, func, line);
359
360	/* Open/close the gate for vcache_tryvget(). */
361	if (to == VS_LOADED) {
362		membar_release();
363		atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
364	} else {
365		atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
366	}
367
368	atomic_store_relaxed(&vip->vi_state, to);
369	if (from == VS_LOADING)
370		cv_broadcast(&vcache_cv);
371	if (to == VS_LOADED || to == VS_RECLAIMED)
372		cv_broadcast(&vp->v_cv);
373}
374
375#else /* defined(DIAGNOSTIC) */
376
377#define VSTATE_GET(vp) \
378	(VNODE_TO_VIMPL((vp))->vi_state)
379#define VSTATE_CHANGE(vp, from, to) \
380	vstate_change((vp), (from), (to))
381#define VSTATE_WAIT_STABLE(vp) \
382	vstate_wait_stable((vp))
383void
384_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
385    bool has_lock)
386{
387
388}
389
390static void
391vstate_wait_stable(vnode_t *vp)
392{
393	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
394
395	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
396		cv_wait(&vp->v_cv, vp->v_interlock);
397}
398
399static void
400vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
401{
402	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
403
404	/* Open/close the gate for vcache_tryvget(). */
405	if (to == VS_LOADED) {
406		membar_release();
407		atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
408	} else {
409		atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
410	}
411
412	atomic_store_relaxed(&vip->vi_state, to);
413	if (from == VS_LOADING)
414		cv_broadcast(&vcache_cv);
415	if (to == VS_LOADED || to == VS_RECLAIMED)
416		cv_broadcast(&vp->v_cv);
417}
418
419#endif /* defined(DIAGNOSTIC) */
420
421void
422vfs_vnode_sysinit(void)
423{
424	int error __diagused, i;
425
426	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
427	KASSERT(dead_rootmount != NULL);
428	dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
429
430	mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
431	for (i = 0; i < LRU_COUNT; i++) {
432		TAILQ_INIT(&lru_list[i]);
433	}
434	vcache_init();
435
436	error = threadpool_get(&threadpool, PRI_NONE);
437	KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
438	threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
439	threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
440}
441
442/*
443 * Allocate a new marker vnode.
444 */
445vnode_t *
446vnalloc_marker(struct mount *mp)
447{
448	vnode_impl_t *vip;
449	vnode_t *vp;
450
451	vip = pool_cache_get(vcache_pool, PR_WAITOK);
452	memset(vip, 0, sizeof(*vip));
453	vp = VIMPL_TO_VNODE(vip);
454	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
455	vp->v_mount = mp;
456	vp->v_type = VBAD;
457	vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
458	klist_init(&vip->vi_klist.vk_klist);
459	vp->v_klist = &vip->vi_klist;
460	vip->vi_state = VS_MARKER;
461
462	return vp;
463}
464
465/*
466 * Free a marker vnode.
467 */
468void
469vnfree_marker(vnode_t *vp)
470{
471	vnode_impl_t *vip;
472
473	vip = VNODE_TO_VIMPL(vp);
474	KASSERT(vip->vi_state == VS_MARKER);
475	mutex_obj_free(vp->v_interlock);
476	uvm_obj_destroy(&vp->v_uobj, true);
477	klist_fini(&vip->vi_klist.vk_klist);
478	pool_cache_put(vcache_pool, vip);
479}
480
481/*
482 * Test a vnode for being a marker vnode.
483 */
484bool
485vnis_marker(vnode_t *vp)
486{
487
488	return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
489}
490
491/*
492 * Return the lru list this node should be on.
493 */
494static vnodelst_t *
495lru_which(vnode_t *vp)
496{
497
498	KASSERT(mutex_owned(vp->v_interlock));
499
500	if (vp->v_holdcnt > 0)
501		return &lru_list[LRU_HOLD];
502	else
503		return &lru_list[LRU_FREE];
504}
505
506/*
507 * Put vnode to end of given list.
508 * Both the current and the new list may be NULL, used on vnode alloc/free.
509 * Adjust numvnodes and signal vdrain thread if there is work.
510 */
511static void
512lru_requeue(vnode_t *vp, vnodelst_t *listhd)
513{
514	vnode_impl_t *vip;
515	int d;
516
517	/*
518	 * If the vnode is on the correct list, and was put there recently,
519	 * then leave it be, thus avoiding huge cache and lock contention.
520	 */
521	vip = VNODE_TO_VIMPL(vp);
522	if (listhd == vip->vi_lrulisthd &&
523	    (getticks() - vip->vi_lrulisttm) < hz) {
524	    	return;
525	}
526
527	mutex_enter(&vdrain_lock);
528	d = 0;
529	if (vip->vi_lrulisthd != NULL)
530		TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
531	else
532		d++;
533	vip->vi_lrulisthd = listhd;
534	vip->vi_lrulisttm = getticks();
535	if (vip->vi_lrulisthd != NULL)
536		TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
537	else
538		d--;
539	if (d != 0) {
540		/*
541		 * Looks strange?  This is not a bug.  Don't store
542		 * numvnodes unless there is a change - avoid false
543		 * sharing on MP.
544		 */
545		numvnodes += d;
546	}
547	if (listhd == &lru_list[LRU_VRELE])
548		threadpool_schedule_job(threadpool, &vrele_job);
549	if (d > 0 && numvnodes > desiredvnodes)
550		threadpool_schedule_job(threadpool, &vdrain_job);
551	if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
552		kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
553	mutex_exit(&vdrain_lock);
554}
555
556/*
557 * LRU list iterator.
558 * Caller holds vdrain_lock.
559 */
560static vnode_impl_t *
561lru_iter_first(int idx, lru_iter_t *iterp)
562{
563	vnode_impl_t *marker;
564
565	KASSERT(mutex_owned(&vdrain_lock));
566
567	mutex_exit(&vdrain_lock);
568	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
569	mutex_enter(&vdrain_lock);
570	marker->vi_lrulisthd = &lru_list[idx];
571	iterp->li_marker = marker;
572
573	TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
574
575	return lru_iter_next(iterp);
576}
577
578static vnode_impl_t *
579lru_iter_next(lru_iter_t *iter)
580{
581	vnode_impl_t *vip, *marker;
582	vnodelst_t *listhd;
583
584	KASSERT(mutex_owned(&vdrain_lock));
585
586	marker = iter->li_marker;
587	listhd = marker->vi_lrulisthd;
588
589	while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
590		TAILQ_REMOVE(listhd, marker, vi_lrulist);
591		TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
592		if (!vnis_marker(VIMPL_TO_VNODE(vip)))
593			break;
594	}
595
596	return vip;
597}
598
599static void
600lru_iter_release(lru_iter_t *iter)
601{
602	vnode_impl_t *marker;
603
604	KASSERT(mutex_owned(&vdrain_lock));
605
606	marker = iter->li_marker;
607	TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
608
609	mutex_exit(&vdrain_lock);
610	vnfree_marker(VIMPL_TO_VNODE(marker));
611	mutex_enter(&vdrain_lock);
612}
613
614/*
615 * Release deferred vrele vnodes for this mount.
616 * Called with file system suspended.
617 */
618void
619vrele_flush(struct mount *mp)
620{
621	lru_iter_t iter;
622	vnode_impl_t *vip;
623
624	KASSERT(fstrans_is_owner(mp));
625
626	mutex_enter(&vdrain_lock);
627	for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
628	    vip = lru_iter_next(&iter)) {
629		if (VIMPL_TO_VNODE(vip)->v_mount != mp)
630			continue;
631		vrele_deferred(vip);
632	}
633	lru_iter_release(&iter);
634	mutex_exit(&vdrain_lock);
635}
636
637/*
638 * One pass through the LRU lists to keep the number of allocated
639 * vnodes below target.  Returns true if target met.
640 */
641static bool
642vdrain_one(u_int target)
643{
644	int ix, lists[] = { LRU_FREE, LRU_HOLD };
645	lru_iter_t iter;
646	vnode_impl_t *vip;
647	vnode_t *vp;
648	struct mount *mp;
649
650	KASSERT(mutex_owned(&vdrain_lock));
651
652	for (ix = 0; ix < __arraycount(lists); ix++) {
653		for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
654		    vip = lru_iter_next(&iter)) {
655			if (numvnodes < target) {
656				lru_iter_release(&iter);
657				return true;
658			}
659
660			vp = VIMPL_TO_VNODE(vip);
661
662			/* Probe usecount (unlocked). */
663			if (vrefcnt(vp) > 0)
664				continue;
665			/* Try v_interlock -- we lock the wrong direction! */
666			if (!mutex_tryenter(vp->v_interlock))
667				continue;
668			/* Probe usecount and state. */
669			if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
670				mutex_exit(vp->v_interlock);
671				continue;
672			}
673			mutex_exit(&vdrain_lock);
674
675			mp = vp->v_mount;
676			if (fstrans_start_nowait(mp) != 0) {
677				mutex_exit(vp->v_interlock);
678				mutex_enter(&vdrain_lock);
679				continue;
680			}
681
682			if (vcache_vget(vp) == 0) {
683				if (!vrecycle(vp)) {
684					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
685					mutex_enter(vp->v_interlock);
686					vrelel(vp, 0, LK_EXCLUSIVE);
687				}
688			}
689			fstrans_done(mp);
690
691			mutex_enter(&vdrain_lock);
692		}
693		lru_iter_release(&iter);
694	}
695
696	return false;
697}
698
699/*
700 * threadpool task to keep the number of vnodes below desiredvnodes.
701 */
702static void
703vdrain_task(struct threadpool_job *job)
704{
705	u_int target;
706
707	target = desiredvnodes - desiredvnodes / 16;
708
709	mutex_enter(&vdrain_lock);
710
711	while (!vdrain_one(target))
712		kpause("vdrain", false, 1, &vdrain_lock);
713
714	threadpool_job_done(job);
715	mutex_exit(&vdrain_lock);
716}
717
718/*
719 * threadpool task to process asynchronous vrele.
720 */
721static void
722vrele_task(struct threadpool_job *job)
723{
724	int skipped;
725	lru_iter_t iter;
726	vnode_impl_t *vip;
727	struct mount *mp;
728
729	mutex_enter(&vdrain_lock);
730	while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
731		for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
732			mp = VIMPL_TO_VNODE(vip)->v_mount;
733			if (fstrans_start_nowait(mp) == 0) {
734				vrele_deferred(vip);
735				fstrans_done(mp);
736			} else {
737				skipped++;
738			}
739		}
740
741		lru_iter_release(&iter);
742		if (skipped)
743			kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
744	}
745
746	threadpool_job_done(job);
747	lru_iter_release(&iter);
748	mutex_exit(&vdrain_lock);
749}
750
751/*
752 * Try to drop reference on a vnode.  Abort if we are releasing the
753 * last reference.  Note: this _must_ succeed if not the last reference.
754 */
755static bool
756vtryrele(vnode_t *vp)
757{
758	u_int use, next;
759
760	membar_release();
761	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
762		if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
763			return false;
764		}
765		KASSERT((use & VUSECOUNT_MASK) > 1);
766		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
767		if (__predict_true(next == use)) {
768			return true;
769		}
770	}
771}
772
773/*
774 * vput: unlock and release the reference.
775 */
776void
777vput(vnode_t *vp)
778{
779	int lktype;
780
781	/*
782	 * Do an unlocked check of the usecount.  If it looks like we're not
783	 * about to drop the last reference, then unlock the vnode and try
784	 * to drop the reference.  If it ends up being the last reference
785	 * after all, vrelel() can fix it all up.  Most of the time this
786	 * will all go to plan.
787	 */
788	if (vrefcnt(vp) > 1) {
789		VOP_UNLOCK(vp);
790		if (vtryrele(vp)) {
791			return;
792		}
793		lktype = LK_NONE;
794	} else {
795		lktype = VOP_ISLOCKED(vp);
796		KASSERT(lktype != LK_NONE);
797	}
798	mutex_enter(vp->v_interlock);
799	vrelel(vp, 0, lktype);
800}
801
802/*
803 * Release a vnode from the deferred list.
804 */
805static void
806vrele_deferred(vnode_impl_t *vip)
807{
808	vnode_t *vp;
809
810	KASSERT(mutex_owned(&vdrain_lock));
811	KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
812
813	vp = VIMPL_TO_VNODE(vip);
814
815	/*
816	 * First remove the vnode from the vrele list.
817	 * Put it on the last lru list, the last vrele()
818	 * will put it back onto the right list before
819	 * its usecount reaches zero.
820	 */
821	TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
822	vip->vi_lrulisthd = &lru_list[LRU_HOLD];
823	vip->vi_lrulisttm = getticks();
824	TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
825
826	mutex_exit(&vdrain_lock);
827
828	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
829	mutex_enter(vp->v_interlock);
830	vrelel(vp, 0, LK_EXCLUSIVE);
831
832	mutex_enter(&vdrain_lock);
833}
834
835/*
836 * Vnode release.  If reference count drops to zero, call inactive
837 * routine and either return to freelist or free to the pool.
838 */
839static void
840vrelel(vnode_t *vp, int flags, int lktype)
841{
842	const bool async = ((flags & VRELEL_ASYNC) != 0);
843	bool recycle, defer, objlock_held;
844	u_int use, next;
845	int error;
846
847	objlock_held = false;
848
849retry:
850	KASSERT(mutex_owned(vp->v_interlock));
851
852	if (__predict_false(vp->v_op == dead_vnodeop_p &&
853	    VSTATE_GET(vp) != VS_RECLAIMED)) {
854		vnpanic(vp, "dead but not clean");
855	}
856
857	/*
858	 * If not the last reference, just unlock and drop the reference count.
859	 *
860	 * Otherwise make sure we pass a point in time where we hold the
861	 * last reference with VGET flag unset.
862	 */
863	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
864		if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
865			if (objlock_held) {
866				objlock_held = false;
867				rw_exit(vp->v_uobj.vmobjlock);
868			}
869			if (lktype != LK_NONE) {
870				mutex_exit(vp->v_interlock);
871				lktype = LK_NONE;
872				VOP_UNLOCK(vp);
873				mutex_enter(vp->v_interlock);
874			}
875			if (vtryrele(vp)) {
876				mutex_exit(vp->v_interlock);
877				return;
878			}
879			next = atomic_load_relaxed(&vp->v_usecount);
880			continue;
881		}
882		KASSERT((use & VUSECOUNT_MASK) == 1);
883		next = use & ~VUSECOUNT_VGET;
884		if (next != use) {
885			next = atomic_cas_uint(&vp->v_usecount, use, next);
886		}
887		if (__predict_true(next == use)) {
888			break;
889		}
890	}
891	membar_acquire();
892	if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
893		vnpanic(vp, "%s: bad ref count", __func__);
894	}
895
896#ifdef DIAGNOSTIC
897	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
898	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
899		vprint("vrelel: missing VOP_CLOSE()", vp);
900	}
901#endif
902
903	/*
904	 * If already clean there is no need to lock, defer or
905	 * deactivate this node.
906	 */
907	if (VSTATE_GET(vp) == VS_RECLAIMED) {
908		if (objlock_held) {
909			objlock_held = false;
910			rw_exit(vp->v_uobj.vmobjlock);
911		}
912		if (lktype != LK_NONE) {
913			mutex_exit(vp->v_interlock);
914			lktype = LK_NONE;
915			VOP_UNLOCK(vp);
916			mutex_enter(vp->v_interlock);
917		}
918		goto out;
919	}
920
921	/*
922	 * First try to get the vnode locked for VOP_INACTIVE().
923	 * Defer vnode release to vrele task if caller requests
924	 * it explicitly, is the pagedaemon or the lock failed.
925	 */
926	defer = false;
927	if ((curlwp == uvm.pagedaemon_lwp) || async) {
928		defer = true;
929	} else if (lktype == LK_SHARED) {
930		/* Excellent chance of getting, if the last ref. */
931		error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
932		if (error != 0) {
933			defer = true;
934		} else {
935			lktype = LK_EXCLUSIVE;
936		}
937	} else if (lktype == LK_NONE) {
938		/* Excellent chance of getting, if the last ref. */
939		error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
940		if (error != 0) {
941			defer = true;
942		} else {
943			lktype = LK_EXCLUSIVE;
944		}
945	}
946	KASSERT(mutex_owned(vp->v_interlock));
947	if (defer) {
948		/*
949		 * Defer reclaim to the vrele task; it's not safe to
950		 * clean it here.  We donate it our last reference.
951		 */
952		if (lktype != LK_NONE) {
953			mutex_exit(vp->v_interlock);
954			VOP_UNLOCK(vp);
955			mutex_enter(vp->v_interlock);
956		}
957		lru_requeue(vp, &lru_list[LRU_VRELE]);
958		mutex_exit(vp->v_interlock);
959		return;
960	}
961	KASSERT(lktype == LK_EXCLUSIVE);
962
963	/* If the node gained another reference, retry. */
964	use = atomic_load_relaxed(&vp->v_usecount);
965	if ((use & VUSECOUNT_VGET) != 0) {
966		goto retry;
967	}
968	KASSERT((use & VUSECOUNT_MASK) == 1);
969
970	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
971	    (vp->v_vflag & VV_MAPPED) != 0) {
972		/* Take care of space accounting. */
973		if (!objlock_held) {
974			objlock_held = true;
975			if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
976				mutex_exit(vp->v_interlock);
977				rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
978				mutex_enter(vp->v_interlock);
979				goto retry;
980			}
981		}
982		if ((vp->v_iflag & VI_EXECMAP) != 0) {
983			cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
984		}
985		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
986		vp->v_vflag &= ~VV_MAPPED;
987	}
988	if (objlock_held) {
989		objlock_held = false;
990		rw_exit(vp->v_uobj.vmobjlock);
991	}
992
993	/*
994	 * Deactivate the vnode, but preserve our reference across
995	 * the call to VOP_INACTIVE().
996	 *
997	 * If VOP_INACTIVE() indicates that the file has been
998	 * deleted, then recycle the vnode.
999	 *
1000	 * Note that VOP_INACTIVE() will not drop the vnode lock.
1001	 */
1002	mutex_exit(vp->v_interlock);
1003	recycle = false;
1004	VOP_INACTIVE(vp, &recycle);
1005	if (!recycle) {
1006		lktype = LK_NONE;
1007		VOP_UNLOCK(vp);
1008	}
1009	mutex_enter(vp->v_interlock);
1010
1011	/*
1012	 * Block new references then check again to see if a
1013	 * new reference was acquired in the meantime.  If
1014	 * it was, restore the vnode state and try again.
1015	 */
1016	if (recycle) {
1017		VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1018		use = atomic_load_relaxed(&vp->v_usecount);
1019		if ((use & VUSECOUNT_VGET) != 0) {
1020			VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1021			goto retry;
1022		}
1023		KASSERT((use & VUSECOUNT_MASK) == 1);
1024	}
1025
1026	/*
1027	 * Recycle the vnode if the file is now unused (unlinked).
1028	 */
1029	if (recycle) {
1030		VSTATE_ASSERT(vp, VS_BLOCKED);
1031		KASSERT(lktype == LK_EXCLUSIVE);
1032		/* vcache_reclaim drops the lock. */
1033		lktype = LK_NONE;
1034		vcache_reclaim(vp);
1035	}
1036	KASSERT(vrefcnt(vp) > 0);
1037	KASSERT(lktype == LK_NONE);
1038
1039out:
1040	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1041		if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
1042		    (use & VUSECOUNT_MASK) == 1)) {
1043			/* Gained and released another reference, retry. */
1044			goto retry;
1045		}
1046		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1047		if (__predict_true(next == use)) {
1048			if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
1049				/* Gained another reference. */
1050				mutex_exit(vp->v_interlock);
1051				return;
1052			}
1053			break;
1054		}
1055	}
1056	membar_acquire();
1057
1058	if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
1059		/*
1060		 * It's clean so destroy it.  It isn't referenced
1061		 * anywhere since it has been reclaimed.
1062		 */
1063		vcache_free(VNODE_TO_VIMPL(vp));
1064	} else {
1065		/*
1066		 * Otherwise, put it back onto the freelist.  It
1067		 * can't be destroyed while still associated with
1068		 * a file system.
1069		 */
1070		lru_requeue(vp, lru_which(vp));
1071		mutex_exit(vp->v_interlock);
1072	}
1073}
1074
1075void
1076vrele(vnode_t *vp)
1077{
1078
1079	if (vtryrele(vp)) {
1080		return;
1081	}
1082	mutex_enter(vp->v_interlock);
1083	vrelel(vp, 0, LK_NONE);
1084}
1085
1086/*
1087 * Asynchronous vnode release, vnode is released in different context.
1088 */
1089void
1090vrele_async(vnode_t *vp)
1091{
1092
1093	if (vtryrele(vp)) {
1094		return;
1095	}
1096	mutex_enter(vp->v_interlock);
1097	vrelel(vp, VRELEL_ASYNC, LK_NONE);
1098}
1099
1100/*
1101 * Vnode reference, where a reference is already held by some other
1102 * object (for example, a file structure).
1103 *
1104 * NB: lockless code sequences may rely on this not blocking.
1105 */
1106void
1107vref(vnode_t *vp)
1108{
1109
1110	KASSERT(vrefcnt(vp) > 0);
1111
1112	atomic_inc_uint(&vp->v_usecount);
1113}
1114
1115/*
1116 * Page or buffer structure gets a reference.
1117 * Called with v_interlock held.
1118 */
1119void
1120vholdl(vnode_t *vp)
1121{
1122
1123	KASSERT(mutex_owned(vp->v_interlock));
1124
1125	if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
1126		lru_requeue(vp, lru_which(vp));
1127}
1128
1129/*
1130 * Page or buffer structure gets a reference.
1131 */
1132void
1133vhold(vnode_t *vp)
1134{
1135
1136	mutex_enter(vp->v_interlock);
1137	vholdl(vp);
1138	mutex_exit(vp->v_interlock);
1139}
1140
1141/*
1142 * Page or buffer structure frees a reference.
1143 * Called with v_interlock held.
1144 */
1145void
1146holdrelel(vnode_t *vp)
1147{
1148
1149	KASSERT(mutex_owned(vp->v_interlock));
1150
1151	if (vp->v_holdcnt <= 0) {
1152		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
1153	}
1154
1155	vp->v_holdcnt--;
1156	if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1157		lru_requeue(vp, lru_which(vp));
1158}
1159
1160/*
1161 * Page or buffer structure frees a reference.
1162 */
1163void
1164holdrele(vnode_t *vp)
1165{
1166
1167	mutex_enter(vp->v_interlock);
1168	holdrelel(vp);
1169	mutex_exit(vp->v_interlock);
1170}
1171
1172/*
1173 * Recycle an unused vnode if caller holds the last reference.
1174 */
1175bool
1176vrecycle(vnode_t *vp)
1177{
1178	int error __diagused;
1179
1180	mutex_enter(vp->v_interlock);
1181
1182	/* If the vnode is already clean we're done. */
1183	VSTATE_WAIT_STABLE(vp);
1184	if (VSTATE_GET(vp) != VS_LOADED) {
1185		VSTATE_ASSERT(vp, VS_RECLAIMED);
1186		vrelel(vp, 0, LK_NONE);
1187		return true;
1188	}
1189
1190	/* Prevent further references until the vnode is locked. */
1191	VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1192
1193	/* Make sure we hold the last reference. */
1194	if (vrefcnt(vp) != 1) {
1195		VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1196		mutex_exit(vp->v_interlock);
1197		return false;
1198	}
1199
1200	mutex_exit(vp->v_interlock);
1201
1202	/*
1203	 * On a leaf file system this lock will always succeed as we hold
1204	 * the last reference and prevent further references.
1205	 * On layered file systems waiting for the lock would open a can of
1206	 * deadlocks as the lower vnodes may have other active references.
1207	 */
1208	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
1209
1210	mutex_enter(vp->v_interlock);
1211	if (error) {
1212		VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1213		mutex_exit(vp->v_interlock);
1214		return false;
1215	}
1216
1217	KASSERT(vrefcnt(vp) == 1);
1218	vcache_reclaim(vp);
1219	vrelel(vp, 0, LK_NONE);
1220
1221	return true;
1222}
1223
1224/*
1225 * Helper for vrevoke() to propagate suspension from lastmp
1226 * to thismp.  Both args may be NULL.
1227 * Returns the currently suspended file system or NULL.
1228 */
1229static struct mount *
1230vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
1231{
1232	int error;
1233
1234	if (lastmp == thismp)
1235		return thismp;
1236
1237	if (lastmp != NULL)
1238		vfs_resume(lastmp);
1239
1240	if (thismp == NULL)
1241		return NULL;
1242
1243	do {
1244		error = vfs_suspend(thismp, 0);
1245	} while (error == EINTR || error == ERESTART);
1246
1247	if (error == 0)
1248		return thismp;
1249
1250	KASSERT(error == EOPNOTSUPP || error == ENOENT);
1251	return NULL;
1252}
1253
1254/*
1255 * Eliminate all activity associated with the requested vnode
1256 * and with all vnodes aliased to the requested vnode.
1257 */
1258void
1259vrevoke(vnode_t *vp)
1260{
1261	struct mount *mp;
1262	vnode_t *vq;
1263	enum vtype type;
1264	dev_t dev;
1265
1266	KASSERT(vrefcnt(vp) > 0);
1267
1268	mp = vrevoke_suspend_next(NULL, vp->v_mount);
1269
1270	mutex_enter(vp->v_interlock);
1271	VSTATE_WAIT_STABLE(vp);
1272	if (VSTATE_GET(vp) == VS_RECLAIMED) {
1273		mutex_exit(vp->v_interlock);
1274	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1275		atomic_inc_uint(&vp->v_usecount);
1276		mutex_exit(vp->v_interlock);
1277		vgone(vp);
1278	} else {
1279		dev = vp->v_rdev;
1280		type = vp->v_type;
1281		mutex_exit(vp->v_interlock);
1282
1283		while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
1284		    == 0) {
1285			mp = vrevoke_suspend_next(mp, vq->v_mount);
1286			vgone(vq);
1287		}
1288	}
1289	vrevoke_suspend_next(mp, NULL);
1290}
1291
1292/*
1293 * Eliminate all activity associated with a vnode in preparation for
1294 * reuse.  Drops a reference from the vnode.
1295 */
1296void
1297vgone(vnode_t *vp)
1298{
1299	int lktype;
1300
1301	KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1302
1303	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1304	lktype = LK_EXCLUSIVE;
1305	mutex_enter(vp->v_interlock);
1306	VSTATE_WAIT_STABLE(vp);
1307	if (VSTATE_GET(vp) == VS_LOADED) {
1308		VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1309		vcache_reclaim(vp);
1310		lktype = LK_NONE;
1311	}
1312	VSTATE_ASSERT(vp, VS_RECLAIMED);
1313	vrelel(vp, 0, lktype);
1314}
1315
1316static inline uint32_t
1317vcache_hash(const struct vcache_key *key)
1318{
1319	uint32_t hash = HASH32_BUF_INIT;
1320
1321	KASSERT(key->vk_key_len > 0);
1322
1323	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1324	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1325	return hash;
1326}
1327
1328static int
1329vcache_stats(struct hashstat_sysctl *hs, bool fill)
1330{
1331	vnode_impl_t *vip;
1332	uint64_t chain;
1333
1334	strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
1335	strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
1336	if (!fill)
1337		return 0;
1338
1339	hs->hash_size = vcache_hashmask + 1;
1340
1341	for (size_t i = 0; i < hs->hash_size; i++) {
1342		chain = 0;
1343		mutex_enter(&vcache_lock);
1344		SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
1345			chain++;
1346		}
1347		mutex_exit(&vcache_lock);
1348		if (chain > 0) {
1349			hs->hash_used++;
1350			hs->hash_items += chain;
1351			if (chain > hs->hash_maxchain)
1352				hs->hash_maxchain = chain;
1353		}
1354		preempt_point();
1355	}
1356
1357	return 0;
1358}
1359
1360static void
1361vcache_init(void)
1362{
1363
1364	vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
1365	    0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1366	KASSERT(vcache_pool != NULL);
1367	mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1368	cv_init(&vcache_cv, "vcache");
1369	vcache_hashsize = desiredvnodes;
1370	vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1371	    &vcache_hashmask);
1372	hashstat_register("vcache", vcache_stats);
1373}
1374
1375static void
1376vcache_reinit(void)
1377{
1378	int i;
1379	uint32_t hash;
1380	u_long oldmask, newmask;
1381	struct hashhead *oldtab, *newtab;
1382	vnode_impl_t *vip;
1383
1384	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1385	mutex_enter(&vcache_lock);
1386	oldtab = vcache_hashtab;
1387	oldmask = vcache_hashmask;
1388	vcache_hashsize = desiredvnodes;
1389	vcache_hashtab = newtab;
1390	vcache_hashmask = newmask;
1391	for (i = 0; i <= oldmask; i++) {
1392		while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1393			SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1394			hash = vcache_hash(&vip->vi_key);
1395			SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1396			    vip, vi_hash);
1397		}
1398	}
1399	mutex_exit(&vcache_lock);
1400	hashdone(oldtab, HASH_SLIST, oldmask);
1401}
1402
1403static inline vnode_impl_t *
1404vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1405{
1406	struct hashhead *hashp;
1407	vnode_impl_t *vip;
1408
1409	KASSERT(mutex_owned(&vcache_lock));
1410
1411	hashp = &vcache_hashtab[hash & vcache_hashmask];
1412	SLIST_FOREACH(vip, hashp, vi_hash) {
1413		if (key->vk_mount != vip->vi_key.vk_mount)
1414			continue;
1415		if (key->vk_key_len != vip->vi_key.vk_key_len)
1416			continue;
1417		if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1418			continue;
1419		return vip;
1420	}
1421	return NULL;
1422}
1423
1424/*
1425 * Allocate a new, uninitialized vcache node.
1426 */
1427static vnode_impl_t *
1428vcache_alloc(void)
1429{
1430	vnode_impl_t *vip;
1431	vnode_t *vp;
1432
1433	vip = pool_cache_get(vcache_pool, PR_WAITOK);
1434	vp = VIMPL_TO_VNODE(vip);
1435	memset(vip, 0, sizeof(*vip));
1436
1437	rw_init(&vip->vi_lock);
1438	vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
1439
1440	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
1441	klist_init(&vip->vi_klist.vk_klist);
1442	vp->v_klist = &vip->vi_klist;
1443	cv_init(&vp->v_cv, "vnode");
1444	cache_vnode_init(vp);
1445
1446	vp->v_usecount = 1;
1447	vp->v_type = VNON;
1448	vp->v_size = vp->v_writesize = VSIZENOTSET;
1449
1450	vip->vi_state = VS_LOADING;
1451
1452	lru_requeue(vp, &lru_list[LRU_FREE]);
1453
1454	return vip;
1455}
1456
1457/*
1458 * Deallocate a vcache node in state VS_LOADING.
1459 *
1460 * vcache_lock held on entry and released on return.
1461 */
1462static void
1463vcache_dealloc(vnode_impl_t *vip)
1464{
1465	vnode_t *vp;
1466
1467	KASSERT(mutex_owned(&vcache_lock));
1468
1469	vp = VIMPL_TO_VNODE(vip);
1470	vfs_ref(dead_rootmount);
1471	vfs_insmntque(vp, dead_rootmount);
1472	mutex_enter(vp->v_interlock);
1473	vp->v_op = dead_vnodeop_p;
1474	VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1475	mutex_exit(&vcache_lock);
1476	vrelel(vp, 0, LK_NONE);
1477}
1478
1479/*
1480 * Free an unused, unreferenced vcache node.
1481 * v_interlock locked on entry.
1482 */
1483static void
1484vcache_free(vnode_impl_t *vip)
1485{
1486	vnode_t *vp;
1487
1488	vp = VIMPL_TO_VNODE(vip);
1489	KASSERT(mutex_owned(vp->v_interlock));
1490
1491	KASSERT(vrefcnt(vp) == 0);
1492	KASSERT(vp->v_holdcnt == 0);
1493	KASSERT(vp->v_writecount == 0);
1494	lru_requeue(vp, NULL);
1495	mutex_exit(vp->v_interlock);
1496
1497	vfs_insmntque(vp, NULL);
1498	if (vp->v_type == VBLK || vp->v_type == VCHR)
1499		spec_node_destroy(vp);
1500
1501	mutex_obj_free(vp->v_interlock);
1502	rw_destroy(&vip->vi_lock);
1503	uvm_obj_destroy(&vp->v_uobj, true);
1504	KASSERT(vp->v_klist == &vip->vi_klist);
1505	klist_fini(&vip->vi_klist.vk_klist);
1506	cv_destroy(&vp->v_cv);
1507	cache_vnode_fini(vp);
1508	pool_cache_put(vcache_pool, vip);
1509}
1510
1511/*
1512 * Try to get an initial reference on this cached vnode.
1513 * Returns zero on success or EBUSY if the vnode state is not LOADED.
1514 *
1515 * NB: lockless code sequences may rely on this not blocking.
1516 */
1517int
1518vcache_tryvget(vnode_t *vp)
1519{
1520	u_int use, next;
1521
1522	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1523		if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
1524			return EBUSY;
1525		}
1526		next = atomic_cas_uint(&vp->v_usecount,
1527		    use, (use + 1) | VUSECOUNT_VGET);
1528		if (__predict_true(next == use)) {
1529			membar_acquire();
1530			return 0;
1531		}
1532	}
1533}
1534
1535/*
1536 * Try to get an initial reference on this cached vnode.
1537 * Returns zero on success and  ENOENT if the vnode has been reclaimed.
1538 * Will wait for the vnode state to be stable.
1539 *
1540 * v_interlock locked on entry and unlocked on exit.
1541 */
1542int
1543vcache_vget(vnode_t *vp)
1544{
1545	int error;
1546
1547	KASSERT(mutex_owned(vp->v_interlock));
1548
1549	/* Increment hold count to prevent vnode from disappearing. */
1550	vp->v_holdcnt++;
1551	VSTATE_WAIT_STABLE(vp);
1552	vp->v_holdcnt--;
1553
1554	/* If this was the last reference to a reclaimed vnode free it now. */
1555	if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1556		if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1557			vcache_free(VNODE_TO_VIMPL(vp));
1558		else
1559			mutex_exit(vp->v_interlock);
1560		return ENOENT;
1561	}
1562	VSTATE_ASSERT(vp, VS_LOADED);
1563	error = vcache_tryvget(vp);
1564	KASSERT(error == 0);
1565	mutex_exit(vp->v_interlock);
1566
1567	return 0;
1568}
1569
1570/*
1571 * Get a vnode / fs node pair by key and return it referenced through vpp.
1572 */
1573int
1574vcache_get(struct mount *mp, const void *key, size_t key_len,
1575    struct vnode **vpp)
1576{
1577	int error;
1578	uint32_t hash;
1579	const void *new_key;
1580	struct vnode *vp;
1581	struct vcache_key vcache_key;
1582	vnode_impl_t *vip, *new_vip;
1583
1584	new_key = NULL;
1585	*vpp = NULL;
1586
1587	vcache_key.vk_mount = mp;
1588	vcache_key.vk_key = key;
1589	vcache_key.vk_key_len = key_len;
1590	hash = vcache_hash(&vcache_key);
1591
1592again:
1593	mutex_enter(&vcache_lock);
1594	vip = vcache_hash_lookup(&vcache_key, hash);
1595
1596	/* If found, take a reference or retry. */
1597	if (__predict_true(vip != NULL)) {
1598		/*
1599		 * If the vnode is loading we cannot take the v_interlock
1600		 * here as it might change during load (see uvm_obj_setlock()).
1601		 * As changing state from VS_LOADING requires both vcache_lock
1602		 * and v_interlock it is safe to test with vcache_lock held.
1603		 *
1604		 * Wait for vnodes changing state from VS_LOADING and retry.
1605		 */
1606		if (__predict_false(vip->vi_state == VS_LOADING)) {
1607			cv_wait(&vcache_cv, &vcache_lock);
1608			mutex_exit(&vcache_lock);
1609			goto again;
1610		}
1611		vp = VIMPL_TO_VNODE(vip);
1612		mutex_enter(vp->v_interlock);
1613		mutex_exit(&vcache_lock);
1614		error = vcache_vget(vp);
1615		if (error == ENOENT)
1616			goto again;
1617		if (error == 0)
1618			*vpp = vp;
1619		KASSERT((error != 0) == (*vpp == NULL));
1620		return error;
1621	}
1622	mutex_exit(&vcache_lock);
1623
1624	/* Allocate and initialize a new vcache / vnode pair. */
1625	error = vfs_busy(mp);
1626	if (error)
1627		return error;
1628	new_vip = vcache_alloc();
1629	new_vip->vi_key = vcache_key;
1630	vp = VIMPL_TO_VNODE(new_vip);
1631	mutex_enter(&vcache_lock);
1632	vip = vcache_hash_lookup(&vcache_key, hash);
1633	if (vip == NULL) {
1634		SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1635		    new_vip, vi_hash);
1636		vip = new_vip;
1637	}
1638
1639	/* If another thread beat us inserting this node, retry. */
1640	if (vip != new_vip) {
1641		vcache_dealloc(new_vip);
1642		vfs_unbusy(mp);
1643		goto again;
1644	}
1645	mutex_exit(&vcache_lock);
1646
1647	/* Load the fs node.  Exclusive as new_node is VS_LOADING. */
1648	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1649	if (error) {
1650		mutex_enter(&vcache_lock);
1651		SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1652		    new_vip, vnode_impl, vi_hash);
1653		vcache_dealloc(new_vip);
1654		vfs_unbusy(mp);
1655		KASSERT(*vpp == NULL);
1656		return error;
1657	}
1658	KASSERT(new_key != NULL);
1659	KASSERT(memcmp(key, new_key, key_len) == 0);
1660	KASSERT(vp->v_op != NULL);
1661	vfs_insmntque(vp, mp);
1662	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1663		vp->v_vflag |= VV_MPSAFE;
1664	vfs_ref(mp);
1665	vfs_unbusy(mp);
1666
1667	/* Finished loading, finalize node. */
1668	mutex_enter(&vcache_lock);
1669	new_vip->vi_key.vk_key = new_key;
1670	mutex_enter(vp->v_interlock);
1671	VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1672	mutex_exit(vp->v_interlock);
1673	mutex_exit(&vcache_lock);
1674	*vpp = vp;
1675	return 0;
1676}
1677
1678/*
1679 * Create a new vnode / fs node pair and return it referenced through vpp.
1680 */
1681int
1682vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1683    kauth_cred_t cred, void *extra, struct vnode **vpp)
1684{
1685	int error;
1686	uint32_t hash;
1687	struct vnode *vp, *ovp;
1688	vnode_impl_t *vip, *ovip;
1689
1690	*vpp = NULL;
1691
1692	/* Allocate and initialize a new vcache / vnode pair. */
1693	error = vfs_busy(mp);
1694	if (error)
1695		return error;
1696	vip = vcache_alloc();
1697	vip->vi_key.vk_mount = mp;
1698	vp = VIMPL_TO_VNODE(vip);
1699
1700	/* Create and load the fs node. */
1701	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1702	    &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1703	if (error) {
1704		mutex_enter(&vcache_lock);
1705		vcache_dealloc(vip);
1706		vfs_unbusy(mp);
1707		KASSERT(*vpp == NULL);
1708		return error;
1709	}
1710	KASSERT(vp->v_op != NULL);
1711	KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1712	if (vip->vi_key.vk_key_len > 0) {
1713		KASSERT(vip->vi_key.vk_key != NULL);
1714		hash = vcache_hash(&vip->vi_key);
1715
1716		/*
1717		 * Wait for previous instance to be reclaimed,
1718		 * then insert new node.
1719		 */
1720		mutex_enter(&vcache_lock);
1721		while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1722			ovp = VIMPL_TO_VNODE(ovip);
1723			mutex_enter(ovp->v_interlock);
1724			mutex_exit(&vcache_lock);
1725			error = vcache_vget(ovp);
1726			KASSERT(error == ENOENT);
1727			mutex_enter(&vcache_lock);
1728		}
1729		SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1730		    vip, vi_hash);
1731		mutex_exit(&vcache_lock);
1732	}
1733	vfs_insmntque(vp, mp);
1734	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1735		vp->v_vflag |= VV_MPSAFE;
1736	vfs_ref(mp);
1737	vfs_unbusy(mp);
1738
1739	/* Finished loading, finalize node. */
1740	mutex_enter(&vcache_lock);
1741	mutex_enter(vp->v_interlock);
1742	VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1743	mutex_exit(&vcache_lock);
1744	mutex_exit(vp->v_interlock);
1745	*vpp = vp;
1746	return 0;
1747}
1748
1749/*
1750 * Prepare key change: update old cache nodes key and lock new cache node.
1751 * Return an error if the new node already exists.
1752 */
1753int
1754vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1755    const void *old_key, size_t old_key_len,
1756    const void *new_key, size_t new_key_len)
1757{
1758	uint32_t old_hash, new_hash;
1759	struct vcache_key old_vcache_key, new_vcache_key;
1760	vnode_impl_t *vip, *new_vip;
1761
1762	old_vcache_key.vk_mount = mp;
1763	old_vcache_key.vk_key = old_key;
1764	old_vcache_key.vk_key_len = old_key_len;
1765	old_hash = vcache_hash(&old_vcache_key);
1766
1767	new_vcache_key.vk_mount = mp;
1768	new_vcache_key.vk_key = new_key;
1769	new_vcache_key.vk_key_len = new_key_len;
1770	new_hash = vcache_hash(&new_vcache_key);
1771
1772	new_vip = vcache_alloc();
1773	new_vip->vi_key = new_vcache_key;
1774
1775	/* Insert locked new node used as placeholder. */
1776	mutex_enter(&vcache_lock);
1777	vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1778	if (vip != NULL) {
1779		vcache_dealloc(new_vip);
1780		return EEXIST;
1781	}
1782	SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1783	    new_vip, vi_hash);
1784
1785	/* Replace old nodes key with the temporary copy. */
1786	vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1787	KASSERT(vip != NULL);
1788	KASSERT(VIMPL_TO_VNODE(vip) == vp);
1789	KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1790	vip->vi_key = old_vcache_key;
1791	mutex_exit(&vcache_lock);
1792	return 0;
1793}
1794
1795/*
1796 * Key change complete: update old node and remove placeholder.
1797 */
1798void
1799vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1800    const void *old_key, size_t old_key_len,
1801    const void *new_key, size_t new_key_len)
1802{
1803	uint32_t old_hash, new_hash;
1804	struct vcache_key old_vcache_key, new_vcache_key;
1805	vnode_impl_t *vip, *new_vip;
1806	struct vnode *new_vp;
1807
1808	old_vcache_key.vk_mount = mp;
1809	old_vcache_key.vk_key = old_key;
1810	old_vcache_key.vk_key_len = old_key_len;
1811	old_hash = vcache_hash(&old_vcache_key);
1812
1813	new_vcache_key.vk_mount = mp;
1814	new_vcache_key.vk_key = new_key;
1815	new_vcache_key.vk_key_len = new_key_len;
1816	new_hash = vcache_hash(&new_vcache_key);
1817
1818	mutex_enter(&vcache_lock);
1819
1820	/* Lookup old and new node. */
1821	vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1822	KASSERT(vip != NULL);
1823	KASSERT(VIMPL_TO_VNODE(vip) == vp);
1824
1825	new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1826	KASSERT(new_vip != NULL);
1827	KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1828	new_vp = VIMPL_TO_VNODE(new_vip);
1829	mutex_enter(new_vp->v_interlock);
1830	VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1831	mutex_exit(new_vp->v_interlock);
1832
1833	/* Rekey old node and put it onto its new hashlist. */
1834	vip->vi_key = new_vcache_key;
1835	if (old_hash != new_hash) {
1836		SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1837		    vip, vnode_impl, vi_hash);
1838		SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1839		    vip, vi_hash);
1840	}
1841
1842	/* Remove new node used as placeholder. */
1843	SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1844	    new_vip, vnode_impl, vi_hash);
1845	vcache_dealloc(new_vip);
1846}
1847
1848/*
1849 * Disassociate the underlying file system from a vnode.
1850 *
1851 * Must be called with vnode locked and will return unlocked.
1852 * Must be called with the interlock held, and will return with it held.
1853 */
1854static void
1855vcache_reclaim(vnode_t *vp)
1856{
1857	lwp_t *l = curlwp;
1858	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1859	struct mount *mp = vp->v_mount;
1860	uint32_t hash;
1861	uint8_t temp_buf[64], *temp_key;
1862	size_t temp_key_len;
1863	bool recycle;
1864	int error;
1865
1866	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1867	KASSERT(mutex_owned(vp->v_interlock));
1868	KASSERT(vrefcnt(vp) != 0);
1869
1870	temp_key_len = vip->vi_key.vk_key_len;
1871	/*
1872	 * Prevent the vnode from being recycled or brought into use
1873	 * while we clean it out.
1874	 */
1875	VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
1876
1877	/*
1878	 * Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
1879	 * because VOP_RECLAIM() could cause vp->v_klist to
1880	 * become invalid.  Don't check for interest in NOTE_REVOKE
1881	 * here; it's always posted because it sets EV_EOF.
1882	 *
1883	 * Once it's been posted, reset vp->v_klist to point to
1884	 * our own local storage, in case we were sharing with
1885	 * someone else.
1886	 */
1887	KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
1888	vp->v_klist = &vip->vi_klist;
1889	mutex_exit(vp->v_interlock);
1890
1891	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1892	mutex_enter(vp->v_interlock);
1893	if ((vp->v_iflag & VI_EXECMAP) != 0) {
1894		cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
1895	}
1896	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1897	vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
1898	mutex_exit(vp->v_interlock);
1899	rw_exit(vp->v_uobj.vmobjlock);
1900
1901	/*
1902	 * With vnode state set to reclaiming, purge name cache immediately
1903	 * to prevent new handles on vnode, and wait for existing threads
1904	 * trying to get a handle to notice VS_RECLAIMED status and abort.
1905	 */
1906	cache_purge(vp);
1907
1908	/* Replace the vnode key with a temporary copy. */
1909	if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1910		temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1911	} else {
1912		temp_key = temp_buf;
1913	}
1914	if (vip->vi_key.vk_key_len > 0) {
1915		mutex_enter(&vcache_lock);
1916		memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1917		vip->vi_key.vk_key = temp_key;
1918		mutex_exit(&vcache_lock);
1919	}
1920
1921	fstrans_start(mp);
1922
1923	/*
1924	 * Clean out any cached data associated with the vnode.
1925	 */
1926	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1927	if (error != 0) {
1928		if (wapbl_vphaswapbl(vp))
1929			WAPBL_DISCARD(wapbl_vptomp(vp));
1930		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1931	}
1932	KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1933	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1934	if (vp->v_type == VBLK || vp->v_type == VCHR) {
1935		 spec_node_revoke(vp);
1936	}
1937
1938	/*
1939	 * Disassociate the underlying file system from the vnode.
1940	 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1941	 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1942	 * would no longer function.
1943	 */
1944	VOP_INACTIVE(vp, &recycle);
1945	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1946	if (VOP_RECLAIM(vp)) {
1947		vnpanic(vp, "%s: cannot reclaim", __func__);
1948	}
1949
1950	KASSERT(vp->v_data == NULL);
1951	KASSERT((vp->v_iflag & VI_PAGES) == 0);
1952
1953	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1954		uvm_ra_freectx(vp->v_ractx);
1955		vp->v_ractx = NULL;
1956	}
1957
1958	if (vip->vi_key.vk_key_len > 0) {
1959	/* Remove from vnode cache. */
1960		hash = vcache_hash(&vip->vi_key);
1961		mutex_enter(&vcache_lock);
1962		KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1963		SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1964		    vip, vnode_impl, vi_hash);
1965		mutex_exit(&vcache_lock);
1966	}
1967	if (temp_key != temp_buf)
1968		kmem_free(temp_key, temp_key_len);
1969
1970	/* Done with purge, notify sleepers of the grim news. */
1971	mutex_enter(vp->v_interlock);
1972	vp->v_op = dead_vnodeop_p;
1973	VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1974	vp->v_tag = VT_NON;
1975	mutex_exit(vp->v_interlock);
1976
1977	/*
1978	 * Move to dead mount.  Must be after changing the operations
1979	 * vector as vnode operations enter the mount before using the
1980	 * operations vector.  See sys/kern/vnode_if.c.
1981	 */
1982	vp->v_vflag &= ~VV_ROOT;
1983	vfs_ref(dead_rootmount);
1984	vfs_insmntque(vp, dead_rootmount);
1985
1986#ifdef PAX_SEGVGUARD
1987	pax_segvguard_cleanup(vp);
1988#endif /* PAX_SEGVGUARD */
1989
1990	mutex_enter(vp->v_interlock);
1991	fstrans_done(mp);
1992	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1993}
1994
1995/*
1996 * Disassociate the underlying file system from an open device vnode
1997 * and make it anonymous.
1998 *
1999 * Vnode unlocked on entry, drops a reference to the vnode.
2000 */
2001void
2002vcache_make_anon(vnode_t *vp)
2003{
2004	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
2005	uint32_t hash;
2006	bool recycle;
2007
2008	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
2009	KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
2010	VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
2011
2012	/* Remove from vnode cache. */
2013	hash = vcache_hash(&vip->vi_key);
2014	mutex_enter(&vcache_lock);
2015	KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
2016	SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
2017	    vip, vnode_impl, vi_hash);
2018	vip->vi_key.vk_mount = dead_rootmount;
2019	vip->vi_key.vk_key_len = 0;
2020	vip->vi_key.vk_key = NULL;
2021	mutex_exit(&vcache_lock);
2022
2023	/*
2024	 * Disassociate the underlying file system from the vnode.
2025	 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
2026	 * the vnode, and may destroy the vnode so that VOP_UNLOCK
2027	 * would no longer function.
2028	 */
2029	if (vn_lock(vp, LK_EXCLUSIVE)) {
2030		vnpanic(vp, "%s: cannot lock", __func__);
2031	}
2032	VOP_INACTIVE(vp, &recycle);
2033	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2034	if (VOP_RECLAIM(vp)) {
2035		vnpanic(vp, "%s: cannot reclaim", __func__);
2036	}
2037
2038	/* Purge name cache. */
2039	cache_purge(vp);
2040
2041	/* Done with purge, change operations vector. */
2042	mutex_enter(vp->v_interlock);
2043	vp->v_op = spec_vnodeop_p;
2044	vp->v_vflag |= VV_MPSAFE;
2045	mutex_exit(vp->v_interlock);
2046
2047	/*
2048	 * Move to dead mount.  Must be after changing the operations
2049	 * vector as vnode operations enter the mount before using the
2050	 * operations vector.  See sys/kern/vnode_if.c.
2051	 */
2052	vfs_ref(dead_rootmount);
2053	vfs_insmntque(vp, dead_rootmount);
2054
2055	vrele(vp);
2056}
2057
2058/*
2059 * Update outstanding I/O count and do wakeup if requested.
2060 */
2061void
2062vwakeup(struct buf *bp)
2063{
2064	vnode_t *vp;
2065
2066	if ((vp = bp->b_vp) == NULL)
2067		return;
2068
2069	KASSERT(bp->b_objlock == vp->v_interlock);
2070	KASSERT(mutex_owned(bp->b_objlock));
2071
2072	if (--vp->v_numoutput < 0)
2073		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
2074	if (vp->v_numoutput == 0)
2075		cv_broadcast(&vp->v_cv);
2076}
2077
2078/*
2079 * Test a vnode for being or becoming dead.  Returns one of:
2080 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
2081 * ENOENT: vnode is dead.
2082 * 0:      otherwise.
2083 *
2084 * Whenever this function returns a non-zero value all future
2085 * calls will also return a non-zero value.
2086 */
2087int
2088vdead_check(struct vnode *vp, int flags)
2089{
2090
2091	KASSERT(mutex_owned(vp->v_interlock));
2092
2093	if (! ISSET(flags, VDEAD_NOWAIT))
2094		VSTATE_WAIT_STABLE(vp);
2095
2096	if (VSTATE_GET(vp) == VS_RECLAIMING) {
2097		KASSERT(ISSET(flags, VDEAD_NOWAIT));
2098		return EBUSY;
2099	} else if (VSTATE_GET(vp) == VS_RECLAIMED) {
2100		return ENOENT;
2101	}
2102
2103	return 0;
2104}
2105
2106int
2107vfs_drainvnodes(void)
2108{
2109
2110	mutex_enter(&vdrain_lock);
2111
2112	if (!vdrain_one(desiredvnodes)) {
2113		mutex_exit(&vdrain_lock);
2114		return EBUSY;
2115	}
2116
2117	mutex_exit(&vdrain_lock);
2118
2119	if (vcache_hashsize != desiredvnodes)
2120		vcache_reinit();
2121
2122	return 0;
2123}
2124
2125void
2126vnpanic(vnode_t *vp, const char *fmt, ...)
2127{
2128	va_list ap;
2129
2130#ifdef DIAGNOSTIC
2131	vprint(NULL, vp);
2132#endif
2133	va_start(ap, fmt);
2134	vpanic(fmt, ap);
2135	va_end(ap);
2136}
2137
2138void
2139vshareilock(vnode_t *tvp, vnode_t *fvp)
2140{
2141	kmutex_t *oldlock;
2142
2143	oldlock = tvp->v_interlock;
2144	mutex_obj_hold(fvp->v_interlock);
2145	tvp->v_interlock = fvp->v_interlock;
2146	mutex_obj_free(oldlock);
2147}
2148
2149void
2150vshareklist(vnode_t *tvp, vnode_t *fvp)
2151{
2152	/*
2153	 * If two vnodes share klist state, they must also share
2154	 * an interlock.
2155	 */
2156	KASSERT(tvp->v_interlock == fvp->v_interlock);
2157
2158	/*
2159	 * We make the following assumptions:
2160	 *
2161	 * ==> Some other synchronization is happening outside of
2162	 *     our view to make this safe.
2163	 *
2164	 * ==> That the "to" vnode will have the necessary references
2165	 *     on the "from" vnode so that the storage for the klist
2166	 *     won't be yanked out from beneath us (the vnode_impl).
2167	 *
2168	 * ==> If "from" is also sharing, we then assume that "from"
2169	 *     has the necessary references, and so on.
2170	 */
2171	tvp->v_klist = fvp->v_klist;
2172}
2173