vfs_subr.c (130585, deleted lines) vs. vfs_subr.c (130640, added lines)
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 130585 2004-06-16 09:47:26Z phk $");
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 130640 2004-06-17 17:16:53Z phk $");
43
44#include "opt_ddb.h"
45#include "opt_mac.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
50#include <sys/buf.h>
51#include <sys/conf.h>
52#include <sys/eventhandler.h>
53#include <sys/extattr.h>
54#include <sys/fcntl.h>
55#include <sys/kernel.h>
56#include <sys/kthread.h>
57#include <sys/mac.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/namei.h>
61#include <sys/sleepqueue.h>
62#include <sys/stat.h>
63#include <sys/sysctl.h>
64#include <sys/syslog.h>
65#include <sys/vmmeter.h>
66#include <sys/vnode.h>
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/vm_kern.h>
75#include <vm/uma.h>
76
77static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
78
79static void addalias(struct vnode *vp, struct cdev *nvp_rdev);
80static void insmntque(struct vnode *vp, struct mount *mp);
81static void vclean(struct vnode *vp, int flags, struct thread *td);
82static void vlruvp(struct vnode *vp);
83static int flushbuflist(struct buf *blist, int flags, struct vnode *vp,
84 int slpflag, int slptimeo, int *errorp);
85static int vtryrecycle(struct vnode *vp);
86static void vx_lock(struct vnode *vp);
87static void vx_unlock(struct vnode *vp);
88static void vgonechrl(struct vnode *vp, struct thread *td);
89
90
91/*
92 * Number of vnodes in existence. Increased whenever getnewvnode()
93 * allocates a new vnode, never decreased.
94 */
95static unsigned long numvnodes;
96
97SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
98
99/*
100 * Conversion tables for conversion from vnode types to inode formats
101 * and back.
102 */
103enum vtype iftovt_tab[16] = {
104 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
105 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
106};
107int vttoif_tab[9] = {
108 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
109 S_IFSOCK, S_IFIFO, S_IFMT,
110};
111
112/*
113 * List of vnodes that are ready for recycling.
114 */
115static TAILQ_HEAD(freelst, vnode) vnode_free_list;
116
117/*
 118 * Minimum number of free vnodes. If there are fewer free vnodes than this,
119 * getnewvnode() will return a newly allocated vnode.
120 */
121static u_long wantfreevnodes = 25;
122SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
123/* Number of vnodes in the free list. */
124static u_long freevnodes;
125SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
126
127/*
128 * Various variables used for debugging the new implementation of
129 * reassignbuf().
130 * XXX these are probably of (very) limited utility now.
131 */
132static int reassignbufcalls;
133SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
134static int nameileafonly;
135SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
136
137/*
138 * Cache for the mount type id assigned to NFS. This is used for
139 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
140 */
141int nfs_mount_type = -1;
142
143/* To keep more than one thread at a time from running vfs_getnewfsid */
144static struct mtx mntid_mtx;
145
146/*
147 * Lock for any access to the following:
148 * vnode_free_list
149 * numvnodes
150 * freevnodes
151 */
152static struct mtx vnode_free_list_mtx;
153
154/*
155 * For any iteration/modification of dev->si_hlist (linked through
156 * v_specnext)
157 */
158static struct mtx spechash_mtx;
159
160/* Publicly exported FS */
161struct nfs_public nfs_pub;
162
163/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
164static uma_zone_t vnode_zone;
165static uma_zone_t vnodepoll_zone;
166
167/* Set to 1 to print out reclaim of active vnodes */
168int prtactive;
169
170/*
171 * The workitem queue.
172 *
173 * It is useful to delay writes of file data and filesystem metadata
174 * for tens of seconds so that quickly created and deleted files need
175 * not waste disk bandwidth being created and removed. To realize this,
176 * we append vnodes to a "workitem" queue. When running with a soft
177 * updates implementation, most pending metadata dependencies should
 178 * not wait for more than a few seconds. Thus, metadata on mounted block
 179 * devices is delayed only about half the time that file data is delayed.
 180 * Similarly, directory updates are more critical, so are only delayed
 181 * about a third of the time that file data is delayed. Thus, there are
182 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
183 * one each second (driven off the filesystem syncer process). The
184 * syncer_delayno variable indicates the next queue that is to be processed.
185 * Items that need to be processed soon are placed in this queue:
186 *
187 * syncer_workitem_pending[syncer_delayno]
188 *
189 * A delay of fifteen seconds is done by placing the request fifteen
190 * entries later in the queue:
191 *
192 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
193 *
194 */
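/*
 * A worked example of the bucket arithmetic above (illustrative only,
 * using the default SYNCER_MAXDELAY of 32 defined below, which gives a
 * 32-entry table and hence syncer_mask == 31).  If the syncer is about
 * to service bucket 30 (syncer_delayno == 30) and a vnode requests a
 * fifteen second delay:
 *
 *	slot = (syncer_delayno + delay) & syncer_mask
 *	     = (30 + 15) & 31 = 45 & 31 = 13
 *
 * so the vnode wraps around the ring and lands fifteen one-second
 * buckets ahead of the bucket that will be serviced next.
 */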
195static int syncer_delayno;
196static long syncer_mask;
197LIST_HEAD(synclist, vnode);
198static struct synclist *syncer_workitem_pending;
199/*
200 * The sync_mtx protects:
201 * vp->v_synclist
202 * syncer_delayno
203 * syncer_workitem_pending
204 * rushjob
205 */
206static struct mtx sync_mtx;
207
208#define SYNCER_MAXDELAY 32
209static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
210static int syncdelay = 30; /* max time to delay syncing data */
211static int filedelay = 30; /* time to delay syncing files */
212SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
213static int dirdelay = 29; /* time to delay syncing directories */
214SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
215static int metadelay = 28; /* time to delay syncing metadata */
216SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
217static int rushjob; /* number of slots to run ASAP */
218static int stat_rush_requests; /* number of times I/O speeded up */
219SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
220
221/*
222 * Number of vnodes we want to exist at any one time. This is mostly used
223 * to size hash tables in vnode-related code. It is normally not used in
 224 * getnewvnode(), as wantfreevnodes is normally nonzero.
225 *
226 * XXX desiredvnodes is historical cruft and should not exist.
227 */
228int desiredvnodes;
229SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
230 &desiredvnodes, 0, "Maximum number of vnodes");
231static int minvnodes;
232SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
233 &minvnodes, 0, "Minimum number of vnodes");
234static int vnlru_nowhere;
235SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
236 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
237
238/* Hook for calling soft updates. */
239int (*softdep_process_worklist_hook)(struct mount *);
240
241/*
242 * Initialize the vnode management data structures.
243 */
244static void
245vntblinit(void *dummy __unused)
246{
247
248 /*
249 * Desiredvnodes is a function of the physical memory size and
250 * the kernel's heap size. Specifically, desiredvnodes scales
251 * in proportion to the physical memory size until two fifths
252 * of the kernel's heap size is consumed by vnodes and vm
253 * objects.
254 */
255 desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
256 (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
257 minvnodes = desiredvnodes / 4;
258 mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
259 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
260 mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
261 TAILQ_INIT(&vnode_free_list);
262 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
263 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
264 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
265 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
266 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
267 /*
268 * Initialize the filesystem syncer.
269 */
270 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
271 &syncer_mask);
272 syncer_maxdelay = syncer_mask + 1;
273 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
274}
275SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
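/*
 * A worked reading of the desiredvnodes formula above (illustrative
 * only).  The second operand of min() is just the bound
 *
 *	desiredvnodes * (sizeof(struct vm_object) + sizeof(struct vnode))
 *	    <= 2 * vm_kmem_size / 5
 *
 * i.e. the "two fifths of the kernel's heap" limit the comment refers
 * to.  With purely hypothetical figures, say vm_kmem_size = 200 MB and
 * a combined object+vnode size of 500 bytes, that operand evaluates to
 * 2 * 209715200 / (5 * 500) = 167772 vnodes.
 */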
276
277
278/*
279 * Mark a mount point as busy. Used to synchronize access and to delay
280 * unmounting. Interlock is not released on failure.
281 */
282int
283vfs_busy(mp, flags, interlkp, td)
284 struct mount *mp;
285 int flags;
286 struct mtx *interlkp;
287 struct thread *td;
288{
289 int lkflags;
290
291 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
292 if (flags & LK_NOWAIT)
293 return (ENOENT);
294 mp->mnt_kern_flag |= MNTK_MWAIT;
295 /*
296 * Since all busy locks are shared except the exclusive
297 * lock granted when unmounting, the only place that a
298 * wakeup needs to be done is at the release of the
299 * exclusive lock at the end of dounmount.
300 */
301 msleep(mp, interlkp, PVFS, "vfs_busy", 0);
302 return (ENOENT);
303 }
304 lkflags = LK_SHARED | LK_NOPAUSE;
305 if (interlkp)
306 lkflags |= LK_INTERLOCK;
307 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
308 panic("vfs_busy: unexpected lock failure");
309 return (0);
310}
311
312/*
313 * Free a busy filesystem.
314 */
315void
316vfs_unbusy(mp, td)
317 struct mount *mp;
318 struct thread *td;
319{
320
321 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
322}
323
324/*
325 * Lookup a mount point by filesystem identifier.
326 */
327struct mount *
328vfs_getvfs(fsid)
329 fsid_t *fsid;
330{
331 register struct mount *mp;
332
333 mtx_lock(&mountlist_mtx);
334 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
335 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
336 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
337 mtx_unlock(&mountlist_mtx);
338 return (mp);
339 }
340 }
341 mtx_unlock(&mountlist_mtx);
342 return ((struct mount *) 0);
343}
344
345/*
346 * Get a new unique fsid. Try to make its val[0] unique, since this value
347 * will be used to create fake device numbers for stat(). Also try (but
 348 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
349 * support 16-bit device numbers. We end up with unique val[0]'s for the
350 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
351 *
352 * Keep in mind that several mounts may be running in parallel. Starting
353 * the search one past where the previous search terminated is both a
354 * micro-optimization and a defense against returning the same fsid to
355 * different mounts.
356 */
357void
358vfs_getnewfsid(mp)
359 struct mount *mp;
360{
361 static u_int16_t mntid_base;
362 fsid_t tfsid;
363 int mtype;
364
365 mtx_lock(&mntid_mtx);
366 mtype = mp->mnt_vfc->vfc_typenum;
367 tfsid.val[1] = mtype;
368 mtype = (mtype & 0xFF) << 24;
369 for (;;) {
370 tfsid.val[0] = makeudev(255,
370 tfsid.val[0] = makedev(255,
371 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
372 mntid_base++;
373 if (vfs_getvfs(&tfsid) == NULL)
374 break;
375 }
376 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
377 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
378 mtx_unlock(&mntid_mtx);
379}
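/*
 * A worked example of the val[0] packing above (illustrative only; it
 * assumes makedev() places its first argument in bits 8-15 and leaves
 * bits 0-7 and 16-31 to the minor number, as in the classic FreeBSD
 * dev_t encoding).  For a hypothetical mntid_base of 0x1234 and a
 * vfc_typenum of 5:
 *
 *	mtype  = (5 & 0xFF) << 24                         = 0x05000000
 *	minor  = mtype | ((0x1234 & 0xFF00) << 8) | (0x1234 & 0xFF)
 *	       = 0x05000000 | 0x00120000 | 0x00000034     = 0x05120034
 *	val[0] = makedev(255, minor)                      = 0x0512FF34
 *
 * Both bytes of mntid_base survive in val[0], so val[0] is unique for
 * the first 2^16 calls, but only the low byte lands below bit 16, hence
 * uniqueness mod 2^16 only for the first 2^8 calls.
 */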
380
381/*
382 * Knob to control the precision of file timestamps:
383 *
384 * 0 = seconds only; nanoseconds zeroed.
385 * 1 = seconds and nanoseconds, accurate within 1/HZ.
386 * 2 = seconds and nanoseconds, truncated to microseconds.
387 * >=3 = seconds and nanoseconds, maximum precision.
388 */
389enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
390
391static int timestamp_precision = TSP_SEC;
392SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
393 &timestamp_precision, 0, "");
394
395/*
396 * Get a current timestamp.
397 */
398void
399vfs_timestamp(tsp)
400 struct timespec *tsp;
401{
402 struct timeval tv;
403
404 switch (timestamp_precision) {
405 case TSP_SEC:
406 tsp->tv_sec = time_second;
407 tsp->tv_nsec = 0;
408 break;
409 case TSP_HZ:
410 getnanotime(tsp);
411 break;
412 case TSP_USEC:
413 microtime(&tv);
414 TIMEVAL_TO_TIMESPEC(&tv, tsp);
415 break;
416 case TSP_NSEC:
417 default:
418 nanotime(tsp);
419 break;
420 }
421}
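/*
 * Usage note (illustrative): the knob above is exported as the
 * vfs.timestamp_precision sysctl, so for example
 *
 *	sysctl vfs.timestamp_precision=3
 *
 * selects full nanotime() resolution for newly set file timestamps, at
 * the cost of reading the hardware timecounter on every timestamp
 * instead of using a cached value.
 */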
422
423/*
424 * Set vnode attributes to VNOVAL
425 */
426void
427vattr_null(vap)
428 register struct vattr *vap;
429{
430
431 vap->va_type = VNON;
432 vap->va_size = VNOVAL;
433 vap->va_bytes = VNOVAL;
434 vap->va_mode = VNOVAL;
435 vap->va_nlink = VNOVAL;
436 vap->va_uid = VNOVAL;
437 vap->va_gid = VNOVAL;
438 vap->va_fsid = VNOVAL;
439 vap->va_fileid = VNOVAL;
440 vap->va_blocksize = VNOVAL;
441 vap->va_rdev = VNOVAL;
442 vap->va_atime.tv_sec = VNOVAL;
443 vap->va_atime.tv_nsec = VNOVAL;
444 vap->va_mtime.tv_sec = VNOVAL;
445 vap->va_mtime.tv_nsec = VNOVAL;
446 vap->va_ctime.tv_sec = VNOVAL;
447 vap->va_ctime.tv_nsec = VNOVAL;
448 vap->va_birthtime.tv_sec = VNOVAL;
449 vap->va_birthtime.tv_nsec = VNOVAL;
450 vap->va_flags = VNOVAL;
451 vap->va_gen = VNOVAL;
452 vap->va_vaflags = 0;
453}
454
455/*
456 * This routine is called when we have too many vnodes. It attempts
457 * to free <count> vnodes and will potentially free vnodes that still
458 * have VM backing store (VM backing store is typically the cause
459 * of a vnode blowout so we want to do this). Therefore, this operation
460 * is not considered cheap.
461 *
 462 * A number of conditions may prevent a vnode from being reclaimed:
463 * the buffer cache may have references on the vnode, a directory
464 * vnode may still have references due to the namei cache representing
465 * underlying files, or the vnode may be in active use. It is not
 466 * desirable to reuse such vnodes. These conditions may cause the
467 * number of vnodes to reach some minimum value regardless of what
468 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
469 */
470static int
471vlrureclaim(struct mount *mp)
472{
473 struct vnode *vp;
474 int done;
475 int trigger;
476 int usevnodes;
477 int count;
478
479 /*
480 * Calculate the trigger point, don't allow user
481 * screwups to blow us up. This prevents us from
482 * recycling vnodes with lots of resident pages. We
483 * aren't trying to free memory, we are trying to
484 * free vnodes.
485 */
486 usevnodes = desiredvnodes;
487 if (usevnodes <= 0)
488 usevnodes = 1;
489 trigger = cnt.v_page_count * 2 / usevnodes;
490
491 done = 0;
492 MNT_ILOCK(mp);
493 count = mp->mnt_nvnodelistsize / 10 + 1;
494 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
495 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
496 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
497
498 if (vp->v_type != VNON &&
499 vp->v_type != VBAD &&
500 VI_TRYLOCK(vp)) {
501 if (VMIGHTFREE(vp) && /* critical path opt */
502 (vp->v_object == NULL ||
503 vp->v_object->resident_page_count < trigger)) {
504 MNT_IUNLOCK(mp);
505 vgonel(vp, curthread);
506 done++;
507 MNT_ILOCK(mp);
508 } else
509 VI_UNLOCK(vp);
510 }
511 --count;
512 }
513 MNT_IUNLOCK(mp);
514 return done;
515}
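/*
 * A worked example of the trigger computation above (illustrative only,
 * with hypothetical numbers): with cnt.v_page_count == 262144 (1 GB of
 * 4 KB pages) and desiredvnodes == 100000,
 *
 *	trigger = 262144 * 2 / 100000 = 5
 *
 * so only vnodes caching fewer than five resident pages (or none at
 * all) are candidates for vgonel() here; vnodes backing lots of cached
 * pages are left alone.  Each call also visits at most
 * mnt_nvnodelistsize / 10 + 1 vnodes, i.e. roughly a tenth of the
 * mount's vnode list per invocation.
 */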
516
517/*
518 * Attempt to recycle vnodes in a context that is always safe to block.
 519 * Calling vlrureclaim() from the bowels of filesystem code has some
520 * interesting deadlock problems.
521 */
522static struct proc *vnlruproc;
523static int vnlruproc_sig;
524
525static void
526vnlru_proc(void)
527{
528 struct mount *mp, *nmp;
529 int done;
530 struct proc *p = vnlruproc;
531 struct thread *td = FIRST_THREAD_IN_PROC(p);
532
533 mtx_lock(&Giant);
534
535 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
536 SHUTDOWN_PRI_FIRST);
537
538 for (;;) {
539 kthread_suspend_check(p);
540 mtx_lock(&vnode_free_list_mtx);
541 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
542 mtx_unlock(&vnode_free_list_mtx);
543 vnlruproc_sig = 0;
544 wakeup(&vnlruproc_sig);
545 tsleep(vnlruproc, PVFS, "vlruwt", hz);
546 continue;
547 }
548 mtx_unlock(&vnode_free_list_mtx);
549 done = 0;
550 mtx_lock(&mountlist_mtx);
551 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
552 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
553 nmp = TAILQ_NEXT(mp, mnt_list);
554 continue;
555 }
556 done += vlrureclaim(mp);
557 mtx_lock(&mountlist_mtx);
558 nmp = TAILQ_NEXT(mp, mnt_list);
559 vfs_unbusy(mp, td);
560 }
561 mtx_unlock(&mountlist_mtx);
562 if (done == 0) {
563#if 0
564 /* These messages are temporary debugging aids */
565 if (vnlru_nowhere < 5)
566 printf("vnlru process getting nowhere..\n");
567 else if (vnlru_nowhere == 5)
568 printf("vnlru process messages stopped.\n");
569#endif
570 vnlru_nowhere++;
571 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
572 }
573 }
574}
575
576static struct kproc_desc vnlru_kp = {
577 "vnlru",
578 vnlru_proc,
579 &vnlruproc
580};
581SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
582
583
584/*
585 * Routines having to do with the management of the vnode table.
586 */
587
588/*
589 * Check to see if a free vnode can be recycled. If it can,
590 * recycle it and return it with the vnode interlock held.
591 */
592static int
593vtryrecycle(struct vnode *vp)
594{
595 struct thread *td = curthread;
596 vm_object_t object;
597 struct mount *vnmp;
598 int error;
599
600 /* Don't recycle if we can't get the interlock */
601 if (!VI_TRYLOCK(vp))
602 return (EWOULDBLOCK);
603 /*
 604 * This vnode may be found and locked via some other list; if so, we
605 * can't recycle it yet.
606 */
607 if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
608 return (EWOULDBLOCK);
609 /*
610 * Don't recycle if its filesystem is being suspended.
611 */
612 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
613 VOP_UNLOCK(vp, 0, td);
614 return (EBUSY);
615 }
616
617 /*
618 * Don't recycle if we still have cached pages.
619 */
620 if (VOP_GETVOBJECT(vp, &object) == 0) {
621 VM_OBJECT_LOCK(object);
622 if (object->resident_page_count ||
623 object->ref_count) {
624 VM_OBJECT_UNLOCK(object);
625 error = EBUSY;
626 goto done;
627 }
628 VM_OBJECT_UNLOCK(object);
629 }
630 if (LIST_FIRST(&vp->v_cache_src)) {
631 /*
632 * note: nameileafonly sysctl is temporary,
633 * for debugging only, and will eventually be
634 * removed.
635 */
636 if (nameileafonly > 0) {
637 /*
638 * Do not reuse namei-cached directory
639 * vnodes that have cached
640 * subdirectories.
641 */
642 if (cache_leaf_test(vp) < 0) {
643 error = EISDIR;
644 goto done;
645 }
646 } else if (nameileafonly < 0 ||
647 vmiodirenable == 0) {
648 /*
649 * Do not reuse namei-cached directory
650 * vnodes if nameileafonly is -1 or
651 * if VMIO backing for directories is
652 * turned off (otherwise we reuse them
653 * too quickly).
654 */
655 error = EBUSY;
656 goto done;
657 }
658 }
659 /*
660 * If we got this far, we need to acquire the interlock and see if
661 * anyone picked up this vnode from another list. If not, we will
662 * mark it with XLOCK via vgonel() so that anyone who does find it
663 * will skip over it.
664 */
665 VI_LOCK(vp);
666 if (VSHOULDBUSY(vp) && (vp->v_iflag & VI_XLOCK) == 0) {
667 VI_UNLOCK(vp);
668 error = EBUSY;
669 goto done;
670 }
671 mtx_lock(&vnode_free_list_mtx);
672 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
673 vp->v_iflag &= ~VI_FREE;
674 mtx_unlock(&vnode_free_list_mtx);
675 vp->v_iflag |= VI_DOOMED;
676 if (vp->v_type != VBAD) {
677 VOP_UNLOCK(vp, 0, td);
678 vgonel(vp, td);
679 VI_LOCK(vp);
680 } else
681 VOP_UNLOCK(vp, 0, td);
682 vn_finished_write(vnmp);
683 return (0);
684done:
685 VOP_UNLOCK(vp, 0, td);
686 vn_finished_write(vnmp);
687 return (error);
688}
689
690/*
691 * Return the next vnode from the free list.
692 */
693int
694getnewvnode(tag, mp, vops, vpp)
695 const char *tag;
696 struct mount *mp;
697 vop_t **vops;
698 struct vnode **vpp;
699{
700 struct vnode *vp = NULL;
701 struct vpollinfo *pollinfo = NULL;
702
703 mtx_lock(&vnode_free_list_mtx);
704
705 /*
706 * Try to reuse vnodes if we hit the max. This situation only
 707 * occurs in certain large-memory (2G+) configurations. We cannot
708 * attempt to directly reclaim vnodes due to nasty recursion
709 * problems.
710 */
711 while (numvnodes - freevnodes > desiredvnodes) {
712 if (vnlruproc_sig == 0) {
713 vnlruproc_sig = 1; /* avoid unnecessary wakeups */
714 wakeup(vnlruproc);
715 }
716 mtx_unlock(&vnode_free_list_mtx);
717 tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
718 mtx_lock(&vnode_free_list_mtx);
719 }
720
721 /*
722 * Attempt to reuse a vnode already on the free list, allocating
723 * a new vnode if we can't find one or if we have not reached a
724 * good minimum for good LRU performance.
725 */
726
727 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
728 int error;
729 int count;
730
731 for (count = 0; count < freevnodes; count++) {
732 vp = TAILQ_FIRST(&vnode_free_list);
733
734 KASSERT(vp->v_usecount == 0 &&
735 (vp->v_iflag & VI_DOINGINACT) == 0,
736 ("getnewvnode: free vnode isn't"));
737
738 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
739 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
740 mtx_unlock(&vnode_free_list_mtx);
741 error = vtryrecycle(vp);
742 mtx_lock(&vnode_free_list_mtx);
743 if (error == 0)
744 break;
745 vp = NULL;
746 }
747 }
748 if (vp) {
749 freevnodes--;
750 mtx_unlock(&vnode_free_list_mtx);
751
752#ifdef INVARIANTS
753 {
754 if (vp->v_data)
755 panic("cleaned vnode isn't");
756 if (vp->v_numoutput)
757 panic("Clean vnode has pending I/O's");
758 if (vp->v_writecount != 0)
759 panic("Non-zero write count");
760 }
761#endif
762 if ((pollinfo = vp->v_pollinfo) != NULL) {
763 /*
764 * To avoid lock order reversals, the call to
765 * uma_zfree() must be delayed until the vnode
766 * interlock is released.
767 */
768 vp->v_pollinfo = NULL;
769 }
770#ifdef MAC
771 mac_destroy_vnode(vp);
772#endif
773 vp->v_iflag = 0;
774 vp->v_vflag = 0;
775 vp->v_lastw = 0;
776 vp->v_lasta = 0;
777 vp->v_cstart = 0;
778 vp->v_clen = 0;
779 vp->v_socket = 0;
780 lockdestroy(vp->v_vnlock);
781 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
782 KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
783 KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
784 KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
785 KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
786 } else {
787 numvnodes++;
788 mtx_unlock(&vnode_free_list_mtx);
789
790 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
791 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
792 VI_LOCK(vp);
793 vp->v_dd = vp;
794 vp->v_vnlock = &vp->v_lock;
795 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
796 cache_purge(vp); /* Sets up v_id. */
797 LIST_INIT(&vp->v_cache_src);
798 TAILQ_INIT(&vp->v_cache_dst);
799 }
800
801 TAILQ_INIT(&vp->v_cleanblkhd);
802 TAILQ_INIT(&vp->v_dirtyblkhd);
803 vp->v_type = VNON;
804 vp->v_tag = tag;
805 vp->v_op = vops;
806 *vpp = vp;
807 vp->v_usecount = 1;
808 vp->v_data = 0;
809 vp->v_cachedid = -1;
810 VI_UNLOCK(vp);
811 if (pollinfo != NULL) {
812 mtx_destroy(&pollinfo->vpi_lock);
813 uma_zfree(vnodepoll_zone, pollinfo);
814 }
815#ifdef MAC
816 mac_init_vnode(vp);
817 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
818 mac_associate_vnode_singlelabel(mp, vp);
819#endif
820 insmntque(vp, mp);
821
822 return (0);
823}
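/*
 * Summary of the allocation policy above (descriptive note): a caller
 * first sleeps and wakes the vnlru kthread while numvnodes - freevnodes
 * exceeds desiredvnodes; it then tries to recycle a vnode off the free
 * list only when at least wantfreevnodes (default 25) vnodes are free
 * and at least minvnodes (desiredvnodes / 4) vnodes exist, and
 * otherwise allocates a fresh vnode from vnode_zone so the pool can
 * grow to a size that gives reasonable LRU behavior.
 */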
824
825/*
826 * Move a vnode from one mount queue to another.
827 */
828static void
829insmntque(vp, mp)
830 register struct vnode *vp;
831 register struct mount *mp;
832{
833
834 /*
835 * Delete from old mount point vnode list, if on one.
836 */
837 if (vp->v_mount != NULL) {
838 MNT_ILOCK(vp->v_mount);
839 KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
840 ("bad mount point vnode list size"));
841 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
842 vp->v_mount->mnt_nvnodelistsize--;
843 MNT_IUNLOCK(vp->v_mount);
844 }
845 /*
846 * Insert into list of vnodes for the new mount point, if available.
847 */
848 if ((vp->v_mount = mp) != NULL) {
849 MNT_ILOCK(vp->v_mount);
850 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
851 mp->mnt_nvnodelistsize++;
852 MNT_IUNLOCK(vp->v_mount);
853 }
854}
855
856/*
857 * Update outstanding I/O count and do wakeup if requested.
858 */
859void
860vwakeup(bp)
861 register struct buf *bp;
862{
863 register struct vnode *vp;
864
865 bp->b_flags &= ~B_WRITEINPROG;
866 if ((vp = bp->b_vp)) {
867 VI_LOCK(vp);
868 vp->v_numoutput--;
869 if (vp->v_numoutput < 0)
870 panic("vwakeup: neg numoutput");
871 if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
872 vp->v_iflag &= ~VI_BWAIT;
873 wakeup(&vp->v_numoutput);
874 }
875 VI_UNLOCK(vp);
876 }
877}
878
879/*
880 * Flush out and invalidate all buffers associated with a vnode.
881 * Called with the underlying object locked.
882 */
883int
884vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
885 struct vnode *vp;
886 int flags;
887 struct ucred *cred;
888 struct thread *td;
889 int slpflag, slptimeo;
890{
891 struct buf *blist;
892 int error;
893 vm_object_t object;
894
895 GIANT_REQUIRED;
896
897 ASSERT_VOP_LOCKED(vp, "vinvalbuf");
898
899 VI_LOCK(vp);
900 if (flags & V_SAVE) {
901 while (vp->v_numoutput) {
902 vp->v_iflag |= VI_BWAIT;
903 error = msleep(&vp->v_numoutput, VI_MTX(vp),
904 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
905 if (error) {
906 VI_UNLOCK(vp);
907 return (error);
908 }
909 }
910 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
911 VI_UNLOCK(vp);
912 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
913 return (error);
914 /*
915 * XXX We could save a lock/unlock if this was only
916 * enabled under INVARIANTS
917 */
918 VI_LOCK(vp);
919 if (vp->v_numoutput > 0 ||
920 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
921 panic("vinvalbuf: dirty bufs");
922 }
923 }
924 /*
925 * If you alter this loop please notice that interlock is dropped and
926 * reacquired in flushbuflist. Special care is needed to ensure that
927 * no race conditions occur from this.
928 */
929 for (error = 0;;) {
930 if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
931 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
932 if (error)
933 break;
934 continue;
935 }
936 if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
937 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
938 if (error)
939 break;
940 continue;
941 }
942 break;
943 }
944 if (error) {
945 VI_UNLOCK(vp);
946 return (error);
947 }
948
949 /*
950 * Wait for I/O to complete. XXX needs cleaning up. The vnode can
951 * have write I/O in-progress but if there is a VM object then the
952 * VM object can also have read-I/O in-progress.
953 */
954 do {
955 while (vp->v_numoutput > 0) {
956 vp->v_iflag |= VI_BWAIT;
957 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
958 }
959 VI_UNLOCK(vp);
960 if (VOP_GETVOBJECT(vp, &object) == 0) {
961 VM_OBJECT_LOCK(object);
962 vm_object_pip_wait(object, "vnvlbx");
963 VM_OBJECT_UNLOCK(object);
964 }
965 VI_LOCK(vp);
966 } while (vp->v_numoutput > 0);
967 VI_UNLOCK(vp);
968
969 /*
970 * Destroy the copy in the VM cache, too.
971 */
972 if (VOP_GETVOBJECT(vp, &object) == 0) {
973 VM_OBJECT_LOCK(object);
974 vm_object_page_remove(object, 0, 0,
975 (flags & V_SAVE) ? TRUE : FALSE);
976 VM_OBJECT_UNLOCK(object);
977 }
978
979#ifdef INVARIANTS
980 VI_LOCK(vp);
981 if ((flags & (V_ALT | V_NORMAL)) == 0 &&
982 (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
983 !TAILQ_EMPTY(&vp->v_cleanblkhd)))
984 panic("vinvalbuf: flush failed");
985 VI_UNLOCK(vp);
986#endif
987 return (0);
988}
989
990/*
991 * Flush out buffers on the specified list.
992 *
993 */
994static int
995flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
996 struct buf *blist;
997 int flags;
998 struct vnode *vp;
999 int slpflag, slptimeo;
1000 int *errorp;
1001{
1002 struct buf *bp, *nbp;
1003 int found, error;
1004
1005 ASSERT_VI_LOCKED(vp, "flushbuflist");
1006
1007 for (found = 0, bp = blist; bp; bp = nbp) {
1008 nbp = TAILQ_NEXT(bp, b_vnbufs);
1009 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1010 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1011 continue;
1012 }
1013 found += 1;
1014 error = BUF_TIMELOCK(bp,
1015 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
1016 "flushbuf", slpflag, slptimeo);
1017 if (error) {
1018 if (error != ENOLCK)
1019 *errorp = error;
1020 goto done;
1021 }
1022 /*
1023 * XXX Since there are no node locks for NFS, I
1024 * believe there is a slight chance that a delayed
1025 * write will occur while sleeping just above, so
1026 * check for it. Note that vfs_bio_awrite expects
1027 * buffers to reside on a queue, while bwrite and
1028 * brelse do not.
1029 */
1030 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1031 (flags & V_SAVE)) {
1032
1033 if (bp->b_vp == vp) {
1034 if (bp->b_flags & B_CLUSTEROK) {
1035 vfs_bio_awrite(bp);
1036 } else {
1037 bremfree(bp);
1038 bp->b_flags |= B_ASYNC;
1039 bwrite(bp);
1040 }
1041 } else {
1042 bremfree(bp);
1043 (void) bwrite(bp);
1044 }
1045 goto done;
1046 }
1047 bremfree(bp);
1048 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1049 bp->b_flags &= ~B_ASYNC;
1050 brelse(bp);
1051 VI_LOCK(vp);
1052 }
1053 return (found);
1054done:
1055 VI_LOCK(vp);
1056 return (found);
1057}
1058
1059/*
1060 * Truncate a file's buffer and pages to a specified length. This
1061 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1062 * sync activity.
1063 */
1064int
1065vtruncbuf(vp, cred, td, length, blksize)
1066 register struct vnode *vp;
1067 struct ucred *cred;
1068 struct thread *td;
1069 off_t length;
1070 int blksize;
1071{
1072 register struct buf *bp;
1073 struct buf *nbp;
1074 int anyfreed;
1075 int trunclbn;
1076
1077 /*
1078 * Round up to the *next* lbn.
1079 */
1080 trunclbn = (length + blksize - 1) / blksize;
1081
1082 ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1083restart:
1084 VI_LOCK(vp);
1085 anyfreed = 1;
1086 for (;anyfreed;) {
1087 anyfreed = 0;
1088 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1089 nbp = TAILQ_NEXT(bp, b_vnbufs);
1090 if (bp->b_lblkno >= trunclbn) {
1091 if (BUF_LOCK(bp,
1092 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1093 VI_MTX(vp)) == ENOLCK)
1094 goto restart;
1095
1096 bremfree(bp);
1097 bp->b_flags |= (B_INVAL | B_RELBUF);
1098 bp->b_flags &= ~B_ASYNC;
1099 brelse(bp);
1100 anyfreed = 1;
1101
1102 if (nbp &&
1103 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1104 (nbp->b_vp != vp) ||
1105 (nbp->b_flags & B_DELWRI))) {
1106 goto restart;
1107 }
1108 VI_LOCK(vp);
1109 }
1110 }
1111
1112 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1113 nbp = TAILQ_NEXT(bp, b_vnbufs);
1114 if (bp->b_lblkno >= trunclbn) {
1115 if (BUF_LOCK(bp,
1116 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1117 VI_MTX(vp)) == ENOLCK)
1118 goto restart;
1119 bremfree(bp);
1120 bp->b_flags |= (B_INVAL | B_RELBUF);
1121 bp->b_flags &= ~B_ASYNC;
1122 brelse(bp);
1123 anyfreed = 1;
1124 if (nbp &&
1125 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1126 (nbp->b_vp != vp) ||
1127 (nbp->b_flags & B_DELWRI) == 0)) {
1128 goto restart;
1129 }
1130 VI_LOCK(vp);
1131 }
1132 }
1133 }
1134
1135 if (length > 0) {
1136restartsync:
1137 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1138 nbp = TAILQ_NEXT(bp, b_vnbufs);
1139 if (bp->b_lblkno > 0)
1140 continue;
1141 /*
1142 * Since we hold the vnode lock this should only
1143 * fail if we're racing with the buf daemon.
1144 */
1145 if (BUF_LOCK(bp,
1146 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1147 VI_MTX(vp)) == ENOLCK) {
1148 goto restart;
1149 }
1150 KASSERT((bp->b_flags & B_DELWRI),
1151 ("buf(%p) on dirty queue without DELWRI", bp));
1152
1153 bremfree(bp);
1154 bawrite(bp);
1155 VI_LOCK(vp);
1156 goto restartsync;
1157 }
1158 }
1159
1160 while (vp->v_numoutput > 0) {
1161 vp->v_iflag |= VI_BWAIT;
1162 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1163 }
1164 VI_UNLOCK(vp);
1165 vnode_pager_setsize(vp, length);
1166
1167 return (0);
1168}
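/*
 * A worked example of the trunclbn round-up above (illustrative only,
 * with hypothetical numbers): truncating to length == 10000 with
 * blksize == 4096 gives
 *
 *	trunclbn = (10000 + 4096 - 1) / 4096 = 14095 / 4096 = 3
 *
 * so buffers with b_lblkno >= 3 are invalidated, while blocks 0-2,
 * which cover bytes 0 through 12287 and include the new partial
 * end-of-file block, are left attached to the vnode.
 */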
1169
1170/*
1171 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1172 * a vnode.
1173 *
1174 * NOTE: We have to deal with the special case of a background bitmap
1175 * buffer, a situation where two buffers will have the same logical
1176 * block offset. We want (1) only the foreground buffer to be accessed
1177 * in a lookup and (2) must differentiate between the foreground and
1178 * background buffer in the splay tree algorithm because the splay
1179 * tree cannot normally handle multiple entities with the same 'index'.
1180 * We accomplish this by adding differentiating flags to the splay tree's
1181 * numerical domain.
1182 */
1183static
1184struct buf *
1185buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1186{
1187 struct buf dummy;
1188 struct buf *lefttreemax, *righttreemin, *y;
1189
1190 if (root == NULL)
1191 return (NULL);
1192 lefttreemax = righttreemin = &dummy;
1193 for (;;) {
1194 if (lblkno < root->b_lblkno ||
1195 (lblkno == root->b_lblkno &&
1196 (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1197 if ((y = root->b_left) == NULL)
1198 break;
1199 if (lblkno < y->b_lblkno) {
1200 /* Rotate right. */
1201 root->b_left = y->b_right;
1202 y->b_right = root;
1203 root = y;
1204 if ((y = root->b_left) == NULL)
1205 break;
1206 }
1207 /* Link into the new root's right tree. */
1208 righttreemin->b_left = root;
1209 righttreemin = root;
1210 } else if (lblkno > root->b_lblkno ||
1211 (lblkno == root->b_lblkno &&
1212 (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1213 if ((y = root->b_right) == NULL)
1214 break;
1215 if (lblkno > y->b_lblkno) {
1216 /* Rotate left. */
1217 root->b_right = y->b_left;
1218 y->b_left = root;
1219 root = y;
1220 if ((y = root->b_right) == NULL)
1221 break;
1222 }
1223 /* Link into the new root's left tree. */
1224 lefttreemax->b_right = root;
1225 lefttreemax = root;
1226 } else {
1227 break;
1228 }
1229 root = y;
1230 }
1231 /* Assemble the new root. */
1232 lefttreemax->b_right = root->b_left;
1233 righttreemin->b_left = root->b_right;
1234 root->b_left = dummy.b_right;
1235 root->b_right = dummy.b_left;
1236 return (root);
1237}
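/*
 * Note on the BX_BKGRDMARKER tie-break above (illustrative): when two
 * buffers share a b_lblkno, the comparison treats the marker bit as a
 * low-order extension of the key, so the foreground buffer (marker
 * clear) sorts strictly before its background shadow (marker set).
 * Lookups such as gbincore() splay with xflags == 0 and therefore bring
 * the foreground buffer to the root when one exists, which is the
 * behavior item (1) of the comment before buf_splay() asks for.
 */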
1238
1239static
1240void
1241buf_vlist_remove(struct buf *bp)
1242{
1243 struct vnode *vp = bp->b_vp;
1244 struct buf *root;
1245
1246 ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1247 if (bp->b_xflags & BX_VNDIRTY) {
1248 if (bp != vp->v_dirtyblkroot) {
1249 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1250 vp->v_dirtyblkroot);
1251 KASSERT(root == bp,
1252 ("splay lookup failed during dirty remove"));
1253 }
1254 if (bp->b_left == NULL) {
1255 root = bp->b_right;
1256 } else {
1257 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1258 bp->b_left);
1259 root->b_right = bp->b_right;
1260 }
1261 vp->v_dirtyblkroot = root;
1262 TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1263 vp->v_dirtybufcnt--;
1264 } else {
1265 /* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1266 if (bp != vp->v_cleanblkroot) {
1267 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1268 vp->v_cleanblkroot);
1269 KASSERT(root == bp,
1270 ("splay lookup failed during clean remove"));
1271 }
1272 if (bp->b_left == NULL) {
1273 root = bp->b_right;
1274 } else {
1275 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1276 bp->b_left);
1277 root->b_right = bp->b_right;
1278 }
1279 vp->v_cleanblkroot = root;
1280 TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1281 vp->v_cleanbufcnt--;
1282 }
1283 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1284}
1285
1286/*
1287 * Add the buffer to the sorted clean or dirty block list using a
1288 * splay tree algorithm.
1289 *
1290 * NOTE: xflags is passed as a constant, optimizing this inline function!
1291 */
1292static
1293void
1294buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1295{
1296 struct buf *root;
1297
1298 ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1299 bp->b_xflags |= xflags;
1300 if (xflags & BX_VNDIRTY) {
1301 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1302 if (root == NULL) {
1303 bp->b_left = NULL;
1304 bp->b_right = NULL;
1305 TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1306 } else if (bp->b_lblkno < root->b_lblkno ||
1307 (bp->b_lblkno == root->b_lblkno &&
1308 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1309 bp->b_left = root->b_left;
1310 bp->b_right = root;
1311 root->b_left = NULL;
1312 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1313 } else {
1314 bp->b_right = root->b_right;
1315 bp->b_left = root;
1316 root->b_right = NULL;
1317 TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1318 root, bp, b_vnbufs);
1319 }
1320 vp->v_dirtybufcnt++;
1321 vp->v_dirtyblkroot = bp;
1322 } else {
1323 /* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1324 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1325 if (root == NULL) {
1326 bp->b_left = NULL;
1327 bp->b_right = NULL;
1328 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1329 } else if (bp->b_lblkno < root->b_lblkno ||
1330 (bp->b_lblkno == root->b_lblkno &&
1331 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1332 bp->b_left = root->b_left;
1333 bp->b_right = root;
1334 root->b_left = NULL;
1335 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1336 } else {
1337 bp->b_right = root->b_right;
1338 bp->b_left = root;
1339 root->b_right = NULL;
1340 TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1341 root, bp, b_vnbufs);
1342 }
1343 vp->v_cleanbufcnt++;
1344 vp->v_cleanblkroot = bp;
1345 }
1346}
1347
1348/*
1349 * Lookup a buffer using the splay tree. Note that we specifically avoid
1350 * shadow buffers used in background bitmap writes.
1351 *
 1352 * This code isn't quite as efficient as it could be because we are maintaining
1353 * two sorted lists and do not know which list the block resides in.
1354 *
1355 * During a "make buildworld" the desired buffer is found at one of
1356 * the roots more than 60% of the time. Thus, checking both roots
1357 * before performing either splay eliminates unnecessary splays on the
1358 * first tree splayed.
1359 */
1360struct buf *
1361gbincore(struct vnode *vp, daddr_t lblkno)
1362{
1363 struct buf *bp;
1364
1365 GIANT_REQUIRED;
1366
1367 ASSERT_VI_LOCKED(vp, "gbincore");
371 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
372 mntid_base++;
373 if (vfs_getvfs(&tfsid) == NULL)
374 break;
375 }
376 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
377 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
378 mtx_unlock(&mntid_mtx);
379}
380
381/*
382 * Knob to control the precision of file timestamps:
383 *
384 * 0 = seconds only; nanoseconds zeroed.
385 * 1 = seconds and nanoseconds, accurate within 1/HZ.
386 * 2 = seconds and nanoseconds, truncated to microseconds.
387 * >=3 = seconds and nanoseconds, maximum precision.
388 */
389enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
390
391static int timestamp_precision = TSP_SEC;
392SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
393 &timestamp_precision, 0, "");
394
395/*
396 * Get a current timestamp.
397 */
398void
399vfs_timestamp(tsp)
400 struct timespec *tsp;
401{
402 struct timeval tv;
403
404 switch (timestamp_precision) {
405 case TSP_SEC:
406 tsp->tv_sec = time_second;
407 tsp->tv_nsec = 0;
408 break;
409 case TSP_HZ:
410 getnanotime(tsp);
411 break;
412 case TSP_USEC:
413 microtime(&tv);
414 TIMEVAL_TO_TIMESPEC(&tv, tsp);
415 break;
416 case TSP_NSEC:
417 default:
418 nanotime(tsp);
419 break;
420 }
421}
422
423/*
424 * Set vnode attributes to VNOVAL
425 */
426void
427vattr_null(vap)
428 register struct vattr *vap;
429{
430
431 vap->va_type = VNON;
432 vap->va_size = VNOVAL;
433 vap->va_bytes = VNOVAL;
434 vap->va_mode = VNOVAL;
435 vap->va_nlink = VNOVAL;
436 vap->va_uid = VNOVAL;
437 vap->va_gid = VNOVAL;
438 vap->va_fsid = VNOVAL;
439 vap->va_fileid = VNOVAL;
440 vap->va_blocksize = VNOVAL;
441 vap->va_rdev = VNOVAL;
442 vap->va_atime.tv_sec = VNOVAL;
443 vap->va_atime.tv_nsec = VNOVAL;
444 vap->va_mtime.tv_sec = VNOVAL;
445 vap->va_mtime.tv_nsec = VNOVAL;
446 vap->va_ctime.tv_sec = VNOVAL;
447 vap->va_ctime.tv_nsec = VNOVAL;
448 vap->va_birthtime.tv_sec = VNOVAL;
449 vap->va_birthtime.tv_nsec = VNOVAL;
450 vap->va_flags = VNOVAL;
451 vap->va_gen = VNOVAL;
452 vap->va_vaflags = 0;
453}
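
/*
 * A minimal sketch of how vattr_null() is meant to be used: a caller
 * marks every attribute VNOVAL and then fills in only the fields it
 * actually wants to change before handing the vattr to VOP_SETATTR().
 * (Illustrative only; example_truncate() does not exist in either
 * revision, and it assumes vp is already locked by the caller.)
 */
static __inline int
example_truncate(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct vattr va;

	vattr_null(&va);
	va.va_size = 0;		/* the only attribute being changed */
	return (VOP_SETATTR(vp, &va, cred, td));
}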
454
455/*
456 * This routine is called when we have too many vnodes. It attempts
457 * to free <count> vnodes and will potentially free vnodes that still
458 * have VM backing store (VM backing store is typically the cause
459 * of a vnode blowout so we want to do this). Therefore, this operation
460 * is not considered cheap.
461 *
462 * A number of conditions may prevent a vnode from being reclaimed.
463 * the buffer cache may have references on the vnode, a directory
464 * vnode may still have references due to the namei cache representing
465 * underlying files, or the vnode may be in active use. It is not
466 * desirable to reuse such vnodes. These conditions may cause the
467 * number of vnodes to reach some minimum value regardless of what
468 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
469 */
470static int
471vlrureclaim(struct mount *mp)
472{
473 struct vnode *vp;
474 int done;
475 int trigger;
476 int usevnodes;
477 int count;
478
479 /*
480 * Calculate the trigger point; don't allow user
481 * screwups to blow us up. This prevents us from
482 * recycling vnodes with lots of resident pages. We
483 * aren't trying to free memory, we are trying to
484 * free vnodes.
485 */
486 usevnodes = desiredvnodes;
487 if (usevnodes <= 0)
488 usevnodes = 1;
489 trigger = cnt.v_page_count * 2 / usevnodes;
490
491 done = 0;
492 MNT_ILOCK(mp);
493 count = mp->mnt_nvnodelistsize / 10 + 1;
494 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
495 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
496 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
497
498 if (vp->v_type != VNON &&
499 vp->v_type != VBAD &&
500 VI_TRYLOCK(vp)) {
501 if (VMIGHTFREE(vp) && /* critical path opt */
502 (vp->v_object == NULL ||
503 vp->v_object->resident_page_count < trigger)) {
504 MNT_IUNLOCK(mp);
505 vgonel(vp, curthread);
506 done++;
507 MNT_ILOCK(mp);
508 } else
509 VI_UNLOCK(vp);
510 }
511 --count;
512 }
513 MNT_IUNLOCK(mp);
514 return done;
515}
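
/*
 * A worked example of the trigger arithmetic above, using hypothetical
 * numbers (not taken from either revision): with cnt.v_page_count ==
 * 262144 (1 GB of 4 KB pages) and desiredvnodes == 65536, trigger ==
 * 262144 * 2 / 65536 == 8, so this pass leaves alone any vnode whose
 * VM object still holds 8 or more resident pages.
 */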
516
517/*
518 * Attempt to recycle vnodes in a context that is always safe to block.
519 * Calling vlrureclaim() from the bowels of filesystem code has some
520 * interesting deadlock problems.
521 */
522static struct proc *vnlruproc;
523static int vnlruproc_sig;
524
525static void
526vnlru_proc(void)
527{
528 struct mount *mp, *nmp;
529 int done;
530 struct proc *p = vnlruproc;
531 struct thread *td = FIRST_THREAD_IN_PROC(p);
532
533 mtx_lock(&Giant);
534
535 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
536 SHUTDOWN_PRI_FIRST);
537
538 for (;;) {
539 kthread_suspend_check(p);
540 mtx_lock(&vnode_free_list_mtx);
541 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
542 mtx_unlock(&vnode_free_list_mtx);
543 vnlruproc_sig = 0;
544 wakeup(&vnlruproc_sig);
545 tsleep(vnlruproc, PVFS, "vlruwt", hz);
546 continue;
547 }
548 mtx_unlock(&vnode_free_list_mtx);
549 done = 0;
550 mtx_lock(&mountlist_mtx);
551 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
552 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
553 nmp = TAILQ_NEXT(mp, mnt_list);
554 continue;
555 }
556 done += vlrureclaim(mp);
557 mtx_lock(&mountlist_mtx);
558 nmp = TAILQ_NEXT(mp, mnt_list);
559 vfs_unbusy(mp, td);
560 }
561 mtx_unlock(&mountlist_mtx);
562 if (done == 0) {
563#if 0
564 /* These messages are temporary debugging aids */
565 if (vnlru_nowhere < 5)
566 printf("vnlru process getting nowhere..\n");
567 else if (vnlru_nowhere == 5)
568 printf("vnlru process messages stopped.\n");
569#endif
570 vnlru_nowhere++;
571 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
572 }
573 }
574}
575
576static struct kproc_desc vnlru_kp = {
577 "vnlru",
578 vnlru_proc,
579 &vnlruproc
580};
581SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
582
583
584/*
585 * Routines having to do with the management of the vnode table.
586 */
587
588/*
589 * Check to see if a free vnode can be recycled. If it can,
590 * recycle it and return it with the vnode interlock held.
591 */
592static int
593vtryrecycle(struct vnode *vp)
594{
595 struct thread *td = curthread;
596 vm_object_t object;
597 struct mount *vnmp;
598 int error;
599
600 /* Don't recycle if we can't get the interlock */
601 if (!VI_TRYLOCK(vp))
602 return (EWOULDBLOCK);
603 /*
604 * This vnode may be found and locked via some other list; if so we
605 * can't recycle it yet.
606 */
607 if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
608 return (EWOULDBLOCK);
609 /*
610 * Don't recycle if its filesystem is being suspended.
611 */
612 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
613 VOP_UNLOCK(vp, 0, td);
614 return (EBUSY);
615 }
616
617 /*
618 * Don't recycle if we still have cached pages.
619 */
620 if (VOP_GETVOBJECT(vp, &object) == 0) {
621 VM_OBJECT_LOCK(object);
622 if (object->resident_page_count ||
623 object->ref_count) {
624 VM_OBJECT_UNLOCK(object);
625 error = EBUSY;
626 goto done;
627 }
628 VM_OBJECT_UNLOCK(object);
629 }
630 if (LIST_FIRST(&vp->v_cache_src)) {
631 /*
632 * note: nameileafonly sysctl is temporary,
633 * for debugging only, and will eventually be
634 * removed.
635 */
636 if (nameileafonly > 0) {
637 /*
638 * Do not reuse namei-cached directory
639 * vnodes that have cached
640 * subdirectories.
641 */
642 if (cache_leaf_test(vp) < 0) {
643 error = EISDIR;
644 goto done;
645 }
646 } else if (nameileafonly < 0 ||
647 vmiodirenable == 0) {
648 /*
649 * Do not reuse namei-cached directory
650 * vnodes if nameileafonly is -1 or
651 * if VMIO backing for directories is
652 * turned off (otherwise we reuse them
653 * too quickly).
654 */
655 error = EBUSY;
656 goto done;
657 }
658 }
659 /*
660 * If we got this far, we need to acquire the interlock and see if
661 * anyone picked up this vnode from another list. If not, we will
662 * mark it with XLOCK via vgonel() so that anyone who does find it
663 * will skip over it.
664 */
665 VI_LOCK(vp);
666 if (VSHOULDBUSY(vp) && (vp->v_iflag & VI_XLOCK) == 0) {
667 VI_UNLOCK(vp);
668 error = EBUSY;
669 goto done;
670 }
671 mtx_lock(&vnode_free_list_mtx);
672 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
673 vp->v_iflag &= ~VI_FREE;
674 mtx_unlock(&vnode_free_list_mtx);
675 vp->v_iflag |= VI_DOOMED;
676 if (vp->v_type != VBAD) {
677 VOP_UNLOCK(vp, 0, td);
678 vgonel(vp, td);
679 VI_LOCK(vp);
680 } else
681 VOP_UNLOCK(vp, 0, td);
682 vn_finished_write(vnmp);
683 return (0);
684done:
685 VOP_UNLOCK(vp, 0, td);
686 vn_finished_write(vnmp);
687 return (error);
688}
689
690/*
691 * Return the next vnode from the free list.
692 */
693int
694getnewvnode(tag, mp, vops, vpp)
695 const char *tag;
696 struct mount *mp;
697 vop_t **vops;
698 struct vnode **vpp;
699{
700 struct vnode *vp = NULL;
701 struct vpollinfo *pollinfo = NULL;
702
703 mtx_lock(&vnode_free_list_mtx);
704
705 /*
706 * Try to reuse vnodes if we hit the max. This situation only
707 * occurs on certain large-memory (2G+) systems. We cannot
708 * attempt to directly reclaim vnodes due to nasty recursion
709 * problems.
710 */
711 while (numvnodes - freevnodes > desiredvnodes) {
712 if (vnlruproc_sig == 0) {
713 vnlruproc_sig = 1; /* avoid unnecessary wakeups */
714 wakeup(vnlruproc);
715 }
716 mtx_unlock(&vnode_free_list_mtx);
717 tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
718 mtx_lock(&vnode_free_list_mtx);
719 }
720
721 /*
722 * Attempt to reuse a vnode already on the free list, allocating
723 * a new vnode if we can't find one or if we have not reached a
724 * reasonable minimum for good LRU performance.
725 */
726
727 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
728 int error;
729 int count;
730
731 for (count = 0; count < freevnodes; count++) {
732 vp = TAILQ_FIRST(&vnode_free_list);
733
734 KASSERT(vp->v_usecount == 0 &&
735 (vp->v_iflag & VI_DOINGINACT) == 0,
736 ("getnewvnode: free vnode isn't"));
737
738 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
739 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
740 mtx_unlock(&vnode_free_list_mtx);
741 error = vtryrecycle(vp);
742 mtx_lock(&vnode_free_list_mtx);
743 if (error == 0)
744 break;
745 vp = NULL;
746 }
747 }
748 if (vp) {
749 freevnodes--;
750 mtx_unlock(&vnode_free_list_mtx);
751
752#ifdef INVARIANTS
753 {
754 if (vp->v_data)
755 panic("cleaned vnode isn't");
756 if (vp->v_numoutput)
757 panic("Clean vnode has pending I/O's");
758 if (vp->v_writecount != 0)
759 panic("Non-zero write count");
760 }
761#endif
762 if ((pollinfo = vp->v_pollinfo) != NULL) {
763 /*
764 * To avoid lock order reversals, the call to
765 * uma_zfree() must be delayed until the vnode
766 * interlock is released.
767 */
768 vp->v_pollinfo = NULL;
769 }
770#ifdef MAC
771 mac_destroy_vnode(vp);
772#endif
773 vp->v_iflag = 0;
774 vp->v_vflag = 0;
775 vp->v_lastw = 0;
776 vp->v_lasta = 0;
777 vp->v_cstart = 0;
778 vp->v_clen = 0;
779 vp->v_socket = 0;
780 lockdestroy(vp->v_vnlock);
781 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
782 KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
783 KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
784 KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
785 KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
786 } else {
787 numvnodes++;
788 mtx_unlock(&vnode_free_list_mtx);
789
790 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
791 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
792 VI_LOCK(vp);
793 vp->v_dd = vp;
794 vp->v_vnlock = &vp->v_lock;
795 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
796 cache_purge(vp); /* Sets up v_id. */
797 LIST_INIT(&vp->v_cache_src);
798 TAILQ_INIT(&vp->v_cache_dst);
799 }
800
801 TAILQ_INIT(&vp->v_cleanblkhd);
802 TAILQ_INIT(&vp->v_dirtyblkhd);
803 vp->v_type = VNON;
804 vp->v_tag = tag;
805 vp->v_op = vops;
806 *vpp = vp;
807 vp->v_usecount = 1;
808 vp->v_data = 0;
809 vp->v_cachedid = -1;
810 VI_UNLOCK(vp);
811 if (pollinfo != NULL) {
812 mtx_destroy(&pollinfo->vpi_lock);
813 uma_zfree(vnodepoll_zone, pollinfo);
814 }
815#ifdef MAC
816 mac_init_vnode(vp);
817 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
818 mac_associate_vnode_singlelabel(mp, vp);
819#endif
820 insmntque(vp, mp);
821
822 return (0);
823}
824
825/*
826 * Move a vnode from one mount queue to another.
827 */
828static void
829insmntque(vp, mp)
830 register struct vnode *vp;
831 register struct mount *mp;
832{
833
834 /*
835 * Delete from old mount point vnode list, if on one.
836 */
837 if (vp->v_mount != NULL) {
838 MNT_ILOCK(vp->v_mount);
839 KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
840 ("bad mount point vnode list size"));
841 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
842 vp->v_mount->mnt_nvnodelistsize--;
843 MNT_IUNLOCK(vp->v_mount);
844 }
845 /*
846 * Insert into list of vnodes for the new mount point, if available.
847 */
848 if ((vp->v_mount = mp) != NULL) {
849 MNT_ILOCK(vp->v_mount);
850 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
851 mp->mnt_nvnodelistsize++;
852 MNT_IUNLOCK(vp->v_mount);
853 }
854}
855
856/*
857 * Update outstanding I/O count and do wakeup if requested.
858 */
859void
860vwakeup(bp)
861 register struct buf *bp;
862{
863 register struct vnode *vp;
864
865 bp->b_flags &= ~B_WRITEINPROG;
866 if ((vp = bp->b_vp)) {
867 VI_LOCK(vp);
868 vp->v_numoutput--;
869 if (vp->v_numoutput < 0)
870 panic("vwakeup: neg numoutput");
871 if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
872 vp->v_iflag &= ~VI_BWAIT;
873 wakeup(&vp->v_numoutput);
874 }
875 VI_UNLOCK(vp);
876 }
877}
878
879/*
880 * Flush out and invalidate all buffers associated with a vnode.
881 * Called with the underlying object locked.
882 */
883int
884vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
885 struct vnode *vp;
886 int flags;
887 struct ucred *cred;
888 struct thread *td;
889 int slpflag, slptimeo;
890{
891 struct buf *blist;
892 int error;
893 vm_object_t object;
894
895 GIANT_REQUIRED;
896
897 ASSERT_VOP_LOCKED(vp, "vinvalbuf");
898
899 VI_LOCK(vp);
900 if (flags & V_SAVE) {
901 while (vp->v_numoutput) {
902 vp->v_iflag |= VI_BWAIT;
903 error = msleep(&vp->v_numoutput, VI_MTX(vp),
904 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
905 if (error) {
906 VI_UNLOCK(vp);
907 return (error);
908 }
909 }
910 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
911 VI_UNLOCK(vp);
912 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
913 return (error);
914 /*
915 * XXX We could save a lock/unlock if this was only
916 * enabled under INVARIANTS
917 */
918 VI_LOCK(vp);
919 if (vp->v_numoutput > 0 ||
920 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
921 panic("vinvalbuf: dirty bufs");
922 }
923 }
924 /*
925 * If you alter this loop please notice that interlock is dropped and
926 * reacquired in flushbuflist. Special care is needed to ensure that
927 * no race conditions occur from this.
928 */
929 for (error = 0;;) {
930 if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
931 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
932 if (error)
933 break;
934 continue;
935 }
936 if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
937 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
938 if (error)
939 break;
940 continue;
941 }
942 break;
943 }
944 if (error) {
945 VI_UNLOCK(vp);
946 return (error);
947 }
948
949 /*
950 * Wait for I/O to complete. XXX needs cleaning up. The vnode can
951 * have write I/O in-progress but if there is a VM object then the
952 * VM object can also have read-I/O in-progress.
953 */
954 do {
955 while (vp->v_numoutput > 0) {
956 vp->v_iflag |= VI_BWAIT;
957 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
958 }
959 VI_UNLOCK(vp);
960 if (VOP_GETVOBJECT(vp, &object) == 0) {
961 VM_OBJECT_LOCK(object);
962 vm_object_pip_wait(object, "vnvlbx");
963 VM_OBJECT_UNLOCK(object);
964 }
965 VI_LOCK(vp);
966 } while (vp->v_numoutput > 0);
967 VI_UNLOCK(vp);
968
969 /*
970 * Destroy the copy in the VM cache, too.
971 */
972 if (VOP_GETVOBJECT(vp, &object) == 0) {
973 VM_OBJECT_LOCK(object);
974 vm_object_page_remove(object, 0, 0,
975 (flags & V_SAVE) ? TRUE : FALSE);
976 VM_OBJECT_UNLOCK(object);
977 }
978
979#ifdef INVARIANTS
980 VI_LOCK(vp);
981 if ((flags & (V_ALT | V_NORMAL)) == 0 &&
982 (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
983 !TAILQ_EMPTY(&vp->v_cleanblkhd)))
984 panic("vinvalbuf: flush failed");
985 VI_UNLOCK(vp);
986#endif
987 return (0);
988}
989
990/*
991 * Flush out buffers on the specified list.
992 *
993 */
994static int
995flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
996 struct buf *blist;
997 int flags;
998 struct vnode *vp;
999 int slpflag, slptimeo;
1000 int *errorp;
1001{
1002 struct buf *bp, *nbp;
1003 int found, error;
1004
1005 ASSERT_VI_LOCKED(vp, "flushbuflist");
1006
1007 for (found = 0, bp = blist; bp; bp = nbp) {
1008 nbp = TAILQ_NEXT(bp, b_vnbufs);
1009 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1010 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1011 continue;
1012 }
1013 found += 1;
1014 error = BUF_TIMELOCK(bp,
1015 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
1016 "flushbuf", slpflag, slptimeo);
1017 if (error) {
1018 if (error != ENOLCK)
1019 *errorp = error;
1020 goto done;
1021 }
1022 /*
1023 * XXX Since there are no node locks for NFS, I
1024 * believe there is a slight chance that a delayed
1025 * write will occur while sleeping just above, so
1026 * check for it. Note that vfs_bio_awrite expects
1027 * buffers to reside on a queue, while bwrite and
1028 * brelse do not.
1029 */
1030 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1031 (flags & V_SAVE)) {
1032
1033 if (bp->b_vp == vp) {
1034 if (bp->b_flags & B_CLUSTEROK) {
1035 vfs_bio_awrite(bp);
1036 } else {
1037 bremfree(bp);
1038 bp->b_flags |= B_ASYNC;
1039 bwrite(bp);
1040 }
1041 } else {
1042 bremfree(bp);
1043 (void) bwrite(bp);
1044 }
1045 goto done;
1046 }
1047 bremfree(bp);
1048 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1049 bp->b_flags &= ~B_ASYNC;
1050 brelse(bp);
1051 VI_LOCK(vp);
1052 }
1053 return (found);
1054done:
1055 VI_LOCK(vp);
1056 return (found);
1057}
1058
1059/*
1060 * Truncate a file's buffer and pages to a specified length. This
1061 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1062 * sync activity.
1063 */
1064int
1065vtruncbuf(vp, cred, td, length, blksize)
1066 register struct vnode *vp;
1067 struct ucred *cred;
1068 struct thread *td;
1069 off_t length;
1070 int blksize;
1071{
1072 register struct buf *bp;
1073 struct buf *nbp;
1074 int anyfreed;
1075 int trunclbn;
1076
1077 /*
1078 * Round up to the *next* lbn.
1079 */
1080 trunclbn = (length + blksize - 1) / blksize;
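	/*
	 * E.g., with length == 10000 and blksize == 4096 (hypothetical
	 * values), trunclbn == (10000 + 4095) / 4096 == 3: block 2,
	 * which still contains the new end of file, is kept, while
	 * blocks 3 and up are invalidated below.
	 */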
1081
1082 ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1083restart:
1084 VI_LOCK(vp);
1085 anyfreed = 1;
1086 for (;anyfreed;) {
1087 anyfreed = 0;
1088 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1089 nbp = TAILQ_NEXT(bp, b_vnbufs);
1090 if (bp->b_lblkno >= trunclbn) {
1091 if (BUF_LOCK(bp,
1092 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1093 VI_MTX(vp)) == ENOLCK)
1094 goto restart;
1095
1096 bremfree(bp);
1097 bp->b_flags |= (B_INVAL | B_RELBUF);
1098 bp->b_flags &= ~B_ASYNC;
1099 brelse(bp);
1100 anyfreed = 1;
1101
1102 if (nbp &&
1103 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1104 (nbp->b_vp != vp) ||
1105 (nbp->b_flags & B_DELWRI))) {
1106 goto restart;
1107 }
1108 VI_LOCK(vp);
1109 }
1110 }
1111
1112 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1113 nbp = TAILQ_NEXT(bp, b_vnbufs);
1114 if (bp->b_lblkno >= trunclbn) {
1115 if (BUF_LOCK(bp,
1116 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1117 VI_MTX(vp)) == ENOLCK)
1118 goto restart;
1119 bremfree(bp);
1120 bp->b_flags |= (B_INVAL | B_RELBUF);
1121 bp->b_flags &= ~B_ASYNC;
1122 brelse(bp);
1123 anyfreed = 1;
1124 if (nbp &&
1125 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1126 (nbp->b_vp != vp) ||
1127 (nbp->b_flags & B_DELWRI) == 0)) {
1128 goto restart;
1129 }
1130 VI_LOCK(vp);
1131 }
1132 }
1133 }
1134
1135 if (length > 0) {
1136restartsync:
1137 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1138 nbp = TAILQ_NEXT(bp, b_vnbufs);
1139 if (bp->b_lblkno > 0)
1140 continue;
1141 /*
1142 * Since we hold the vnode lock this should only
1143 * fail if we're racing with the buf daemon.
1144 */
1145 if (BUF_LOCK(bp,
1146 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1147 VI_MTX(vp)) == ENOLCK) {
1148 goto restart;
1149 }
1150 KASSERT((bp->b_flags & B_DELWRI),
1151 ("buf(%p) on dirty queue without DELWRI", bp));
1152
1153 bremfree(bp);
1154 bawrite(bp);
1155 VI_LOCK(vp);
1156 goto restartsync;
1157 }
1158 }
1159
1160 while (vp->v_numoutput > 0) {
1161 vp->v_iflag |= VI_BWAIT;
1162 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1163 }
1164 VI_UNLOCK(vp);
1165 vnode_pager_setsize(vp, length);
1166
1167 return (0);
1168}
1169
1170/*
1171 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1172 * a vnode.
1173 *
1174 * NOTE: We have to deal with the special case of a background bitmap
1175 * buffer, a situation where two buffers will have the same logical
1176 * block offset. We want (1) only the foreground buffer to be accessed
1177 * in a lookup and (2) to differentiate between the foreground and
1178 * background buffer in the splay tree algorithm because the splay
1179 * tree cannot normally handle multiple entities with the same 'index'.
1180 * We accomplish this by adding differentiating flags to the splay tree's
1181 * numerical domain.
1182 */
1183static
1184struct buf *
1185buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1186{
1187 struct buf dummy;
1188 struct buf *lefttreemax, *righttreemin, *y;
1189
1190 if (root == NULL)
1191 return (NULL);
1192 lefttreemax = righttreemin = &dummy;
1193 for (;;) {
1194 if (lblkno < root->b_lblkno ||
1195 (lblkno == root->b_lblkno &&
1196 (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1197 if ((y = root->b_left) == NULL)
1198 break;
1199 if (lblkno < y->b_lblkno) {
1200 /* Rotate right. */
1201 root->b_left = y->b_right;
1202 y->b_right = root;
1203 root = y;
1204 if ((y = root->b_left) == NULL)
1205 break;
1206 }
1207 /* Link into the new root's right tree. */
1208 righttreemin->b_left = root;
1209 righttreemin = root;
1210 } else if (lblkno > root->b_lblkno ||
1211 (lblkno == root->b_lblkno &&
1212 (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1213 if ((y = root->b_right) == NULL)
1214 break;
1215 if (lblkno > y->b_lblkno) {
1216 /* Rotate left. */
1217 root->b_right = y->b_left;
1218 y->b_left = root;
1219 root = y;
1220 if ((y = root->b_right) == NULL)
1221 break;
1222 }
1223 /* Link into the new root's left tree. */
1224 lefttreemax->b_right = root;
1225 lefttreemax = root;
1226 } else {
1227 break;
1228 }
1229 root = y;
1230 }
1231 /* Assemble the new root. */
1232 lefttreemax->b_right = root->b_left;
1233 righttreemin->b_left = root->b_right;
1234 root->b_left = dummy.b_right;
1235 root->b_right = dummy.b_left;
1236 return (root);
1237}
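
/*
 * A minimal sketch of the composite key that buf_splay() compares above:
 * buffers are ordered by b_lblkno first, with the BX_BKGRDMARKER bit as
 * a tiebreaker so that a background bitmap buffer always sorts after its
 * foreground twin at the same logical block. (Illustrative only;
 * buf_key_cmp() does not exist in either revision.)
 */
static __inline int
buf_key_cmp(daddr_t lblkno, b_xflags_t xflags, struct buf *bp)
{

	if (lblkno < bp->b_lblkno ||
	    (lblkno == bp->b_lblkno &&
	    (xflags & BX_BKGRDMARKER) < (bp->b_xflags & BX_BKGRDMARKER)))
		return (-1);
	if (lblkno > bp->b_lblkno ||
	    (lblkno == bp->b_lblkno &&
	    (xflags & BX_BKGRDMARKER) > (bp->b_xflags & BX_BKGRDMARKER)))
		return (1);
	return (0);
}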
1238
1239static
1240void
1241buf_vlist_remove(struct buf *bp)
1242{
1243 struct vnode *vp = bp->b_vp;
1244 struct buf *root;
1245
1246 ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1247 if (bp->b_xflags & BX_VNDIRTY) {
1248 if (bp != vp->v_dirtyblkroot) {
1249 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1250 vp->v_dirtyblkroot);
1251 KASSERT(root == bp,
1252 ("splay lookup failed during dirty remove"));
1253 }
1254 if (bp->b_left == NULL) {
1255 root = bp->b_right;
1256 } else {
1257 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1258 bp->b_left);
1259 root->b_right = bp->b_right;
1260 }
1261 vp->v_dirtyblkroot = root;
1262 TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1263 vp->v_dirtybufcnt--;
1264 } else {
1265 /* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1266 if (bp != vp->v_cleanblkroot) {
1267 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1268 vp->v_cleanblkroot);
1269 KASSERT(root == bp,
1270 ("splay lookup failed during clean remove"));
1271 }
1272 if (bp->b_left == NULL) {
1273 root = bp->b_right;
1274 } else {
1275 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1276 bp->b_left);
1277 root->b_right = bp->b_right;
1278 }
1279 vp->v_cleanblkroot = root;
1280 TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1281 vp->v_cleanbufcnt--;
1282 }
1283 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1284}
1285
1286/*
1287 * Add the buffer to the sorted clean or dirty block list using a
1288 * splay tree algorithm.
1289 *
1290 * NOTE: xflags is passed as a constant, optimizing this inline function!
1291 */
1292static
1293void
1294buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1295{
1296 struct buf *root;
1297
1298 ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1299 bp->b_xflags |= xflags;
1300 if (xflags & BX_VNDIRTY) {
1301 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1302 if (root == NULL) {
1303 bp->b_left = NULL;
1304 bp->b_right = NULL;
1305 TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1306 } else if (bp->b_lblkno < root->b_lblkno ||
1307 (bp->b_lblkno == root->b_lblkno &&
1308 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1309 bp->b_left = root->b_left;
1310 bp->b_right = root;
1311 root->b_left = NULL;
1312 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1313 } else {
1314 bp->b_right = root->b_right;
1315 bp->b_left = root;
1316 root->b_right = NULL;
1317 TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1318 root, bp, b_vnbufs);
1319 }
1320 vp->v_dirtybufcnt++;
1321 vp->v_dirtyblkroot = bp;
1322 } else {
1323 /* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1324 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1325 if (root == NULL) {
1326 bp->b_left = NULL;
1327 bp->b_right = NULL;
1328 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1329 } else if (bp->b_lblkno < root->b_lblkno ||
1330 (bp->b_lblkno == root->b_lblkno &&
1331 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1332 bp->b_left = root->b_left;
1333 bp->b_right = root;
1334 root->b_left = NULL;
1335 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1336 } else {
1337 bp->b_right = root->b_right;
1338 bp->b_left = root;
1339 root->b_right = NULL;
1340 TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1341 root, bp, b_vnbufs);
1342 }
1343 vp->v_cleanbufcnt++;
1344 vp->v_cleanblkroot = bp;
1345 }
1346}
1347
1348/*
1349 * Lookup a buffer using the splay tree. Note that we specifically avoid
1350 * shadow buffers used in background bitmap writes.
1351 *
1352 * This code isn't quite as efficient as it could be because we are maintaining
1353 * two sorted lists and do not know which list the block resides in.
1354 *
1355 * During a "make buildworld" the desired buffer is found at one of
1356 * the roots more than 60% of the time. Thus, checking both roots
1357 * before performing either splay eliminates unnecessary splays on the
1358 * first tree splayed.
1359 */
1360struct buf *
1361gbincore(struct vnode *vp, daddr_t lblkno)
1362{
1363 struct buf *bp;
1364
1365 GIANT_REQUIRED;
1366
1367 ASSERT_VI_LOCKED(vp, "gbincore");
1368 if ((bp = vp->v_cleanblkroot) != NULL &&
1369 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1370 return (bp);
1371 if ((bp = vp->v_dirtyblkroot) != NULL &&
1372 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1373 return (bp);
1374 if ((bp = vp->v_cleanblkroot) != NULL) {
1375 vp->v_cleanblkroot = bp = buf_splay(lblkno, 0, bp);
1376 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1377 return (bp);
1378 }
1379 if ((bp = vp->v_dirtyblkroot) != NULL) {
1380 vp->v_dirtyblkroot = bp = buf_splay(lblkno, 0, bp);
1381 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1382 return (bp);
1383 }
1384 return (NULL);
1385}
1386
1387/*
1388 * Associate a buffer with a vnode.
1389 */
1390void
1391bgetvp(vp, bp)
1392 register struct vnode *vp;
1393 register struct buf *bp;
1394{
1395
1396 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1397
1398 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1399 ("bgetvp: bp already attached! %p", bp));
1400
1401 ASSERT_VI_LOCKED(vp, "bgetvp");
1402 vholdl(vp);
1403 bp->b_vp = vp;
1404 bp->b_dev = vn_todev(vp);
1405 /*
1406 * Insert onto list for new vnode.
1407 */
1408 buf_vlist_add(bp, vp, BX_VNCLEAN);
1409}
1410
1411/*
1412 * Disassociate a buffer from a vnode.
1413 */
1414void
1415brelvp(bp)
1416 register struct buf *bp;
1417{
1418 struct vnode *vp;
1419
1420 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1421
1422 /*
1423 * Delete from old vnode list, if on one.
1424 */
1425 vp = bp->b_vp;
1426 VI_LOCK(vp);
1427 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1428 buf_vlist_remove(bp);
1429 if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1430 vp->v_iflag &= ~VI_ONWORKLST;
1431 mtx_lock(&sync_mtx);
1432 LIST_REMOVE(vp, v_synclist);
1433 mtx_unlock(&sync_mtx);
1434 }
1435 vdropl(vp);
1436 bp->b_vp = (struct vnode *) 0;
1437 if (bp->b_object)
1438 bp->b_object = NULL;
1439 VI_UNLOCK(vp);
1440}
1441
1442/*
1443 * Add an item to the syncer work queue.
1444 */
1445static void
1446vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1447{
1448 int slot;
1449
1450 ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1451
1452 mtx_lock(&sync_mtx);
1453 if (vp->v_iflag & VI_ONWORKLST)
1454 LIST_REMOVE(vp, v_synclist);
1455 else
1456 vp->v_iflag |= VI_ONWORKLST;
1457
1458 if (delay > syncer_maxdelay - 2)
1459 delay = syncer_maxdelay - 2;
1460 slot = (syncer_delayno + delay) & syncer_mask;
1461
1462 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1463 mtx_unlock(&sync_mtx);
1464}
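
/*
 * The slot computation above treats syncer_workitem_pending[] as a delay
 * wheel; the AND-based wrap assumes syncer_mask is one less than a
 * power-of-two wheel size. A worked example with hypothetical numbers
 * (not from either revision): with a 16-slot wheel (syncer_mask == 15),
 * syncer_delayno == 14 and delay == 5, the vnode lands in slot
 * (14 + 5) & 15 == 3, i.e. it comes due five passes from now, wrapping
 * past the end of the array.
 */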
1465
1466struct proc *updateproc;
1467static void sched_sync(void);
1468static struct kproc_desc up_kp = {
1469 "syncer",
1470 sched_sync,
1471 &updateproc
1472};
1473SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1474
1475/*
1476 * System filesystem synchronizer daemon.
1477 */
1478static void
1479sched_sync(void)
1480{
1481 struct synclist *next;
1482 struct synclist *slp;
1483 struct vnode *vp;
1484 struct mount *mp;
1485 long starttime;
1486 struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
1487
1488 mtx_lock(&Giant);
1489
1490 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1491 SHUTDOWN_PRI_LAST);
1492
1493 for (;;) {
1494 kthread_suspend_check(td->td_proc);
1495
1496 starttime = time_second;
1497
1498 /*
1499 * Push files whose dirty time has expired. Be careful
1500 * of interrupt race on slp queue.
1501 */
1502 mtx_lock(&sync_mtx);
1503 slp = &syncer_workitem_pending[syncer_delayno];
1504 syncer_delayno += 1;
1505 if (syncer_delayno == syncer_maxdelay)
1506 syncer_delayno = 0;
1507 next = &syncer_workitem_pending[syncer_delayno];
1508
1509 while ((vp = LIST_FIRST(slp)) != NULL) {
1510 if (VOP_ISLOCKED(vp, NULL) != 0 ||
1511 vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1512 LIST_REMOVE(vp, v_synclist);
1513 LIST_INSERT_HEAD(next, vp, v_synclist);
1514 continue;
1515 }
1516 if (VI_TRYLOCK(vp) == 0) {
1517 LIST_REMOVE(vp, v_synclist);
1518 LIST_INSERT_HEAD(next, vp, v_synclist);
1519 vn_finished_write(mp);
1520 continue;
1521 }
1522 /*
1523 * We use vhold in case the vnode does not
1524 * successfully sync. vhold prevents the vnode from
1525 * going away when we unlock the sync_mtx so that
1526 * we can acquire the vnode interlock.
1527 */
1528 vholdl(vp);
1529 mtx_unlock(&sync_mtx);
1530 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
1531 (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1532 VOP_UNLOCK(vp, 0, td);
1533 vn_finished_write(mp);
1534 VI_LOCK(vp);
1535 if ((vp->v_iflag & VI_ONWORKLST) != 0) {
1536 /*
1537 * Put us back on the worklist. The worklist
1538 * routine will remove us from our current
1539 * position and then add us back in at a later
1540 * position.
1541 */
1542 vn_syncer_add_to_worklist(vp, syncdelay);
1543 }
1544 vdropl(vp);
1545 VI_UNLOCK(vp);
1546 mtx_lock(&sync_mtx);
1547 }
1548 mtx_unlock(&sync_mtx);
1549
1550 /*
1551 * Do soft update processing.
1552 */
1553 if (softdep_process_worklist_hook != NULL)
1554 (*softdep_process_worklist_hook)(NULL);
1555
1556 /*
1557 * The variable rushjob allows the kernel to speed up the
1558 * processing of the filesystem syncer process. A rushjob
1559 * value of N tells the filesystem syncer to process the next
1560 * N seconds worth of work on its queue ASAP. Currently rushjob
1561 * is used by the soft update code to speed up the filesystem
1562 * syncer process when the incore state is getting so far
1563 * ahead of the disk that the kernel memory pool is being
1564 * threatened with exhaustion.
1565 */
1566 mtx_lock(&sync_mtx);
1567 if (rushjob > 0) {
1568 rushjob -= 1;
1569 mtx_unlock(&sync_mtx);
1570 continue;
1571 }
1572 mtx_unlock(&sync_mtx);
1573 /*
1574 * If it has taken us less than a second to process the
1575 * current work, then wait. Otherwise start right over
1576 * again. We can still lose time if any single round
1577 * takes more than two seconds, but it does not really
1578 * matter as we are just trying to generally pace the
1579 * filesystem activity.
1580 */
1581 if (time_second == starttime)
1582 tsleep(&lbolt, PPAUSE, "syncer", 0);
1583 }
1584}
1585
1586/*
1587 * Request the syncer daemon to speed up its work.
1588 * We never push it to speed up more than half of its
1589 * normal turn time, otherwise it could take over the cpu.
1590 */
1591int
1592speedup_syncer()
1593{
1594 struct thread *td;
1595 int ret = 0;
1596
1597 td = FIRST_THREAD_IN_PROC(updateproc);
1598 sleepq_remove(td, &lbolt);
1599 mtx_lock(&sync_mtx);
1600 if (rushjob < syncdelay / 2) {
1601 rushjob += 1;
1602 stat_rush_requests += 1;
1603 ret = 1;
1604 }
1605 mtx_unlock(&sync_mtx);
1606 return (ret);
1607}
1608
1609/*
1610 * Associate a p-buffer with a vnode.
1611 *
1612 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1613 * with the buffer, i.e., the bp has not been linked into the vnode or
1614 * ref-counted.
1615 */
1616void
1617pbgetvp(vp, bp)
1618 register struct vnode *vp;
1619 register struct buf *bp;
1620{
1621
1622 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1623
1624 bp->b_vp = vp;
1625 bp->b_object = vp->v_object;
1626 bp->b_flags |= B_PAGING;
1627 bp->b_dev = vn_todev(vp);
1628}
1629
1630/*
1631 * Disassociate a p-buffer from a vnode.
1632 */
1633void
1634pbrelvp(bp)
1635 register struct buf *bp;
1636{
1637
1638 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1639
1640 /* XXX REMOVE ME */
1641 VI_LOCK(bp->b_vp);
1642 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1643 panic(
1644 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1645 bp,
1646 (int)bp->b_flags
1647 );
1648 }
1649 VI_UNLOCK(bp->b_vp);
1650 bp->b_vp = (struct vnode *) 0;
1651 bp->b_object = NULL;
1652 bp->b_flags &= ~B_PAGING;
1653}
1654
1655/*
1656 * Reassign a buffer from one vnode to another.
1657 * Used to assign file specific control information
1658 * (indirect blocks) to the vnode to which they belong.
1659 */
1660void
1661reassignbuf(bp, newvp)
1662 register struct buf *bp;
1663 register struct vnode *newvp;
1664{
1665 struct vnode *vp;
1666 int delay;
1667
1668 if (newvp == NULL) {
1669 printf("reassignbuf: NULL");
1670 return;
1671 }
1672 vp = bp->b_vp;
1673 ++reassignbufcalls;
1674
1675 /*
1676 * B_PAGING flagged buffers cannot be reassigned because their vp
1677 * is not fully linked in.
1678 */
1679 if (bp->b_flags & B_PAGING)
1680 panic("cannot reassign paging buffer");
1681
1682 /*
1683 * Delete from old vnode list, if on one.
1684 */
1685 VI_LOCK(vp);
1686 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1687 buf_vlist_remove(bp);
1688 if (vp != newvp) {
1689 vdropl(bp->b_vp);
1690 bp->b_vp = NULL; /* for clarification */
1691 }
1692 }
1693 if (vp != newvp) {
1694 VI_UNLOCK(vp);
1695 VI_LOCK(newvp);
1696 }
1697 /*
1698 * If dirty, put on list of dirty buffers; otherwise insert onto list
1699 * of clean buffers.
1700 */
1701 if (bp->b_flags & B_DELWRI) {
1702 if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1703 switch (newvp->v_type) {
1704 case VDIR:
1705 delay = dirdelay;
1706 break;
1707 case VCHR:
1708 delay = metadelay;
1709 break;
1710 default:
1711 delay = filedelay;
1712 }
1713 vn_syncer_add_to_worklist(newvp, delay);
1714 }
1715 buf_vlist_add(bp, newvp, BX_VNDIRTY);
1716 } else {
1717 buf_vlist_add(bp, newvp, BX_VNCLEAN);
1718
1719 if ((newvp->v_iflag & VI_ONWORKLST) &&
1720 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1721 mtx_lock(&sync_mtx);
1722 LIST_REMOVE(newvp, v_synclist);
1723 mtx_unlock(&sync_mtx);
1724 newvp->v_iflag &= ~VI_ONWORKLST;
1725 }
1726 }
1727 if (bp->b_vp != newvp) {
1728 bp->b_vp = newvp;
1729 vholdl(bp->b_vp);
1730 }
1731 VI_UNLOCK(newvp);
1732}
1733
1734/*
1735 * Create a vnode for a device.
1736 * Used for mounting the root filesystem.
1737 */
1738int
1739bdevvp(dev, vpp)
1740 struct cdev *dev;
1741 struct vnode **vpp;
1742{
1743 register struct vnode *vp;
1744 struct vnode *nvp;
1745 int error;
1746
1747 if (dev == NODEV) {
1747 if (dev == NULL) {
1748 *vpp = NULLVP;
1749 return (ENXIO);
1750 }
1751 if (vfinddev(dev, vpp))
1752 return (0);
1753
1754 error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1755 if (error) {
1756 *vpp = NULLVP;
1757 return (error);
1758 }
1759 vp = nvp;
1760 vp->v_type = VCHR;
1761 addalias(vp, dev);
1762 *vpp = vp;
1763 return (0);
1764}
1765
1766static void
1767v_incr_usecount(struct vnode *vp, int delta)
1768{
1769
1770 vp->v_usecount += delta;
1771 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1772 mtx_lock(&spechash_mtx);
1773 vp->v_rdev->si_usecount += delta;
1774 mtx_unlock(&spechash_mtx);
1775 }
1776}
1777
1778/*
1779 * Add vnode to the alias list hung off the struct cdev *.
1780 *
1781 * The reason for this gunk is that multiple vnodes can reference
1782 * the same physical device, so checking vp->v_usecount to see
1783 * how many users there are is inadequate; the v_usecounts of
1784 * the vnodes need to be accumulated. vcount() does that.
1785 */
1786struct vnode *
1787addaliasu(nvp, nvp_rdev)
1788 struct vnode *nvp;
1789 udev_t nvp_rdev;
1789 dev_t nvp_rdev;
1790{
1791 struct vnode *ovp;
1792 vop_t **ops;
1793 struct cdev *dev;
1794
1795 if (nvp->v_type == VBLK)
1796 return (nvp);
1797 if (nvp->v_type != VCHR)
1798 panic("addaliasu on non-special vnode");
1799 dev = udev2dev(nvp_rdev);
1800 if (dev == NODEV)
1799 dev = findcdev(nvp_rdev);
1800 if (dev == NULL)
1801 return (nvp);
1802 /*
1803 * Check to see if we have a bdevvp vnode with no associated
1804 * filesystem. If so, we want to associate the filesystem of
1805 * the newly instigated vnode with the bdevvp vnode and
1806 * discard the newly created vnode rather than leaving the
1807 * bdevvp vnode lying around with no associated filesystem.
1808 */
1809 if (vfinddev(dev, &ovp) == 0 || ovp->v_data != NULL) {
1810 addalias(nvp, dev);
1811 return (nvp);
1812 }
1813 /*
1814 * Discard unneeded vnode, but save its node specific data.
1815 * Note that if there is a lock, it is carried over in the
1816 * node specific data to the replacement vnode.
1817 */
1818 vref(ovp);
1819 ovp->v_data = nvp->v_data;
1820 ovp->v_tag = nvp->v_tag;
1821 nvp->v_data = NULL;
1822 lockdestroy(ovp->v_vnlock);
1823 lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
1824 nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
1825 ops = ovp->v_op;
1826 ovp->v_op = nvp->v_op;
1827 if (VOP_ISLOCKED(nvp, curthread)) {
1828 VOP_UNLOCK(nvp, 0, curthread);
1829 vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1830 }
1831 nvp->v_op = ops;
1832 insmntque(ovp, nvp->v_mount);
1833 vrele(nvp);
1834 vgone(nvp);
1835 return (ovp);
1836}
1837
1838 /* This is a local helper function that does the same as addaliasu, but for a
1839 * struct cdev * instead of a udev_t. */
1839 * struct cdev * instead of a dev_t. */
1840static void
1841addalias(nvp, dev)
1842 struct vnode *nvp;
1843 struct cdev *dev;
1844{
1845
1846 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1847 dev_ref(dev);
1848 nvp->v_rdev = dev;
1849 VI_LOCK(nvp);
1850 mtx_lock(&spechash_mtx);
1851 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1852 dev->si_usecount += nvp->v_usecount;
1853 mtx_unlock(&spechash_mtx);
1854 VI_UNLOCK(nvp);
1855}
1856
1857/*
1858 * Grab a particular vnode from the free list, increment its
1859 * reference count and lock it. The vnode lock bit is set if the
1860 * vnode is being eliminated in vgone. The process is awakened
1861 * when the transition is completed, and an error returned to
1862 * indicate that the vnode is no longer usable (possibly having
1863 * been changed to a new filesystem type).
1864 */
1865int
1866vget(vp, flags, td)
1867 register struct vnode *vp;
1868 int flags;
1869 struct thread *td;
1870{
1871 int error;
1872
1873 /*
1874 * If the vnode is in the process of being cleaned out for
1875 * another use, we wait for the cleaning to finish and then
1876 * return failure. Cleaning is determined by checking that
1877 * the VI_XLOCK flag is set.
1878 */
1879 if ((flags & LK_INTERLOCK) == 0)
1880 VI_LOCK(vp);
1881 if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
1882 if ((flags & LK_NOWAIT) == 0) {
1883 vp->v_iflag |= VI_XWANT;
1884 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
1885 return (ENOENT);
1886 }
1887 VI_UNLOCK(vp);
1888 return (EBUSY);
1889 }
1890
1891 v_incr_usecount(vp, 1);
1892
1893 if (VSHOULDBUSY(vp))
1894 vbusy(vp);
1895 if (flags & LK_TYPE_MASK) {
1896 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1897 /*
1898 * must expand vrele here because we do not want
1899 * to call VOP_INACTIVE if the reference count
1900 * drops back to zero since it was never really
1901 * active. We must remove it from the free list
1902 * before sleeping so that multiple processes do
1903 * not try to recycle it.
1904 */
1905 VI_LOCK(vp);
1906 v_incr_usecount(vp, -1);
1907 if (VSHOULDFREE(vp))
1908 vfree(vp);
1909 else
1910 vlruvp(vp);
1911 VI_UNLOCK(vp);
1912 }
1913 return (error);
1914 }
1915 VI_UNLOCK(vp);
1916 return (0);
1917}
1918
1919/*
1920 * Increase the reference count of a vnode.
1921 */
1922void
1923vref(struct vnode *vp)
1924{
1925
1926 VI_LOCK(vp);
1927 v_incr_usecount(vp, 1);
1928 VI_UNLOCK(vp);
1929}
1930
1931/*
1932 * Return reference count of a vnode.
1933 *
1934 * The results of this call are only guaranteed when some mechanism other
1935 * than the VI lock is used to stop other processes from gaining references
1936 * to the vnode. This may be the case if the caller holds the only reference.
1937 * This is also useful when stale data is acceptable as race conditions may
1938 * be accounted for by some other means.
1939 */
1940int
1941vrefcnt(struct vnode *vp)
1942{
1943 int usecnt;
1944
1945 VI_LOCK(vp);
1946 usecnt = vp->v_usecount;
1947 VI_UNLOCK(vp);
1948
1949 return (usecnt);
1950}
1951
1952
1953/*
1954 * Vnode put/release.
1955 * If count drops to zero, call inactive routine and return to freelist.
1956 */
1957void
1958vrele(vp)
1959 struct vnode *vp;
1960{
1961 struct thread *td = curthread; /* XXX */
1962
1963 GIANT_REQUIRED;
1964
1965 KASSERT(vp != NULL, ("vrele: null vp"));
1966
1967 VI_LOCK(vp);
1968
1969 /* Skip this v_writecount check if we're going to panic below. */
1970 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1971 ("vrele: missed vn_close"));
1972
1973 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
1974 vp->v_usecount == 1)) {
1975 v_incr_usecount(vp, -1);
1976 VI_UNLOCK(vp);
1977
1978 return;
1979 }
1980
1981 if (vp->v_usecount == 1) {
1982 v_incr_usecount(vp, -1);
1983 /*
1984 * We must call VOP_INACTIVE with the node locked. Mark
1985 * as VI_DOINGINACT to avoid recursion.
1986 */
1987 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
1988 VI_LOCK(vp);
1989 vp->v_iflag |= VI_DOINGINACT;
1990 VI_UNLOCK(vp);
1991 VOP_INACTIVE(vp, td);
1992 VI_LOCK(vp);
1993 KASSERT(vp->v_iflag & VI_DOINGINACT,
1994 ("vrele: lost VI_DOINGINACT"));
1995 vp->v_iflag &= ~VI_DOINGINACT;
1996 } else
1997 VI_LOCK(vp);
1998 if (VSHOULDFREE(vp))
1999 vfree(vp);
2000 else
2001 vlruvp(vp);
2002 VI_UNLOCK(vp);
2003
2004 } else {
2005#ifdef DIAGNOSTIC
2006 vprint("vrele: negative ref count", vp);
2007#endif
2008 VI_UNLOCK(vp);
2009 panic("vrele: negative ref cnt");
2010 }
2011}
2012
2013/*
2014 * Release an already locked vnode. This gives the same effects as
2015 * unlock+vrele(), but takes less time and avoids releasing and
2016 * re-acquiring the lock (as vrele() acquires the lock internally).
2017 */
2018void
2019vput(vp)
2020 struct vnode *vp;
2021{
2022 struct thread *td = curthread; /* XXX */
2023
2024 GIANT_REQUIRED;
2025
2026 KASSERT(vp != NULL, ("vput: null vp"));
2027 VI_LOCK(vp);
2028 /* Skip this v_writecount check if we're going to panic below. */
2029 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2030 ("vput: missed vn_close"));
2031
2032 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2033 vp->v_usecount == 1)) {
2034 v_incr_usecount(vp, -1);
2035 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2036 return;
2037 }
2038
2039 if (vp->v_usecount == 1) {
2040 v_incr_usecount(vp, -1);
2041 /*
2042 * We must call VOP_INACTIVE with the node locked, so
2043 * we just need to release the vnode mutex. Mark as
2044 * VI_DOINGINACT to avoid recursion.
2045 */
2046 vp->v_iflag |= VI_DOINGINACT;
2047 VI_UNLOCK(vp);
2048 VOP_INACTIVE(vp, td);
2049 VI_LOCK(vp);
2050 KASSERT(vp->v_iflag & VI_DOINGINACT,
2051 ("vput: lost VI_DOINGINACT"));
2052 vp->v_iflag &= ~VI_DOINGINACT;
2053 if (VSHOULDFREE(vp))
2054 vfree(vp);
2055 else
2056 vlruvp(vp);
2057 VI_UNLOCK(vp);
2058
2059 } else {
2060#ifdef DIAGNOSTIC
2061 vprint("vput: negative ref count", vp);
2062#endif
2063 panic("vput: negative ref cnt");
2064 }
2065}
2066
2067/*
2068 * Somebody doesn't want the vnode recycled.
2069 */
2070void
2071vhold(struct vnode *vp)
2072{
2073
2074 VI_LOCK(vp);
2075 vholdl(vp);
2076 VI_UNLOCK(vp);
2077}
2078
2079void
2080vholdl(vp)
2081 register struct vnode *vp;
2082{
2083
2084 vp->v_holdcnt++;
2085 if (VSHOULDBUSY(vp))
2086 vbusy(vp);
2087}
2088
2089/*
2090 * Note that there is one less holder who cares about this vnode. vdrop() is the
2091 * opposite of vhold().
2092 */
2093void
2094vdrop(struct vnode *vp)
2095{
2096
2097 VI_LOCK(vp);
2098 vdropl(vp);
2099 VI_UNLOCK(vp);
2100}
2101
2102void
2103vdropl(vp)
2104 register struct vnode *vp;
2105{
2106
2107 if (vp->v_holdcnt <= 0)
2108 panic("vdrop: holdcnt");
2109 vp->v_holdcnt--;
2110 if (VSHOULDFREE(vp))
2111 vfree(vp);
2112 else
2113 vlruvp(vp);
2114}
2115
2116/*
2117 * Remove any vnodes in the vnode table belonging to mount point mp.
2118 *
2119 * If FORCECLOSE is not specified, there should not be any active ones;
2120 * an error is returned if any are found (nb: this is a user error, not a
2121 * system error). If FORCECLOSE is specified, detach any active vnodes
2122 * that are found.
2123 *
2124 * If WRITECLOSE is set, only flush out regular file vnodes open for
2125 * writing.
2126 *
2127 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2128 *
2129 * `rootrefs' specifies the base reference count for the root vnode
2130 * of this filesystem. The root vnode is considered busy if its
2131 * v_usecount exceeds this value. On a successful return, vflush()
2132 * will call vrele() on the root vnode exactly rootrefs times.
2133 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2134 * be zero.
2135 */
2136#ifdef DIAGNOSTIC
2137static int busyprt = 0; /* print out busy vnodes */
2138SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2139#endif
2140
2141int
2142vflush(mp, rootrefs, flags)
2143 struct mount *mp;
2144 int rootrefs;
2145 int flags;
2146{
2147 struct thread *td = curthread; /* XXX */
2148 struct vnode *vp, *nvp, *rootvp = NULL;
2149 struct vattr vattr;
2150 int busy = 0, error;
2151
2152 if (rootrefs > 0) {
2153 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2154 ("vflush: bad args"));
2155 /*
2156 * Get the filesystem root vnode. We can vput() it
2157 * immediately, since with rootrefs > 0, it won't go away.
2158 */
2159 if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2160 return (error);
2161 vput(rootvp);
2162
2163 }
2164 MNT_ILOCK(mp);
2165loop:
2166 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2167 /*
2168 * Make sure this vnode wasn't reclaimed in getnewvnode().
2169 * Start over if it has (it won't be on the list anymore).
2170 */
2171 if (vp->v_mount != mp)
2172 goto loop;
2173 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2174
2175 VI_LOCK(vp);
2176 MNT_IUNLOCK(mp);
2177 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2178 if (error) {
2179 MNT_ILOCK(mp);
2180 goto loop;
2181 }
2182 /*
2183	 * Skip over vnodes marked VV_SYSTEM.
2184 */
2185 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2186 VOP_UNLOCK(vp, 0, td);
2187 MNT_ILOCK(mp);
2188 continue;
2189 }
2190 /*
2191 * If WRITECLOSE is set, flush out unlinked but still open
2192 * files (even if open only for reading) and regular file
2193 * vnodes open for writing.
2194 */
2195 if (flags & WRITECLOSE) {
2196 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2197 VI_LOCK(vp);
2198
2199 if ((vp->v_type == VNON ||
2200 (error == 0 && vattr.va_nlink > 0)) &&
2201 (vp->v_writecount == 0 || vp->v_type != VREG)) {
2202 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2203 MNT_ILOCK(mp);
2204 continue;
2205 }
2206 } else
2207 VI_LOCK(vp);
2208
2209 VOP_UNLOCK(vp, 0, td);
2210
2211 /*
2212 * With v_usecount == 0, all we need to do is clear out the
2213 * vnode data structures and we are done.
2214 */
2215 if (vp->v_usecount == 0) {
2216 vgonel(vp, td);
2217 MNT_ILOCK(mp);
2218 continue;
2219 }
2220
2221 /*
2222 * If FORCECLOSE is set, forcibly close the vnode. For block
2223 * or character devices, revert to an anonymous device. For
2224 * all other files, just kill them.
2225 */
2226 if (flags & FORCECLOSE) {
2227 if (vp->v_type != VCHR)
2228 vgonel(vp, td);
2229 else
2230 vgonechrl(vp, td);
2231 MNT_ILOCK(mp);
2232 continue;
2233 }
2234#ifdef DIAGNOSTIC
2235 if (busyprt)
2236 vprint("vflush: busy vnode", vp);
2237#endif
2238 VI_UNLOCK(vp);
2239 MNT_ILOCK(mp);
2240 busy++;
2241 }
2242 MNT_IUNLOCK(mp);
2243 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2244 /*
2245 * If just the root vnode is busy, and if its refcount
2246 * is equal to `rootrefs', then go ahead and kill it.
2247 */
2248 VI_LOCK(rootvp);
2249 KASSERT(busy > 0, ("vflush: not busy"));
2250 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2251 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2252 vgonel(rootvp, td);
2253 busy = 0;
2254 } else
2255 VI_UNLOCK(rootvp);
2256 }
2257 if (busy)
2258 return (EBUSY);
2259 for (; rootrefs > 0; rootrefs--)
2260 vrele(rootvp);
2261 return (0);
2262}
2263
2264/*
2265 * This moves a now (likely recyclable) vnode to the end of the
2266 * mountlist. XXX However, it is temporarily disabled until we
2267 * can clean up ffs_sync() and friends, which have loop restart
2268 * conditions which this code causes to operate O(N^2).
2269 */
2270static void
2271vlruvp(struct vnode *vp)
2272{
2273#if 0
2274 struct mount *mp;
2275
2276 if ((mp = vp->v_mount) != NULL) {
2277 MNT_ILOCK(mp);
2278 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2279 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2280 MNT_IUNLOCK(mp);
2281 }
2282#endif
2283}
2284
2285static void
2286vx_lock(struct vnode *vp)
2287{
2288
2289 ASSERT_VI_LOCKED(vp, "vx_lock");
2290
2291 /*
2292 * Prevent the vnode from being recycled or brought into use while we
2293 * clean it out.
2294 */
2295 if (vp->v_iflag & VI_XLOCK)
2296 panic("vclean: deadlock");
2297 vp->v_iflag |= VI_XLOCK;
2298 vp->v_vxthread = curthread;
2299}
2300
2301static void
2302vx_unlock(struct vnode *vp)
2303{
2304 ASSERT_VI_LOCKED(vp, "vx_unlock");
2305 vp->v_iflag &= ~VI_XLOCK;
2306 vp->v_vxthread = NULL;
2307 if (vp->v_iflag & VI_XWANT) {
2308 vp->v_iflag &= ~VI_XWANT;
2309 wakeup(vp);
2310 }
2311}
2312
2313/*
2314 * Disassociate the underlying filesystem from a vnode.
2315 */
2316static void
2317vclean(vp, flags, td)
2318 struct vnode *vp;
2319 int flags;
2320 struct thread *td;
2321{
2322 int active;
2323
2324 ASSERT_VI_LOCKED(vp, "vclean");
2325 /*
2326 * Check to see if the vnode is in use. If so we have to reference it
2327 * before we clean it out so that its count cannot fall to zero and
2328 * generate a race against ourselves to recycle it.
2329 */
2330 if ((active = vp->v_usecount))
2331 v_incr_usecount(vp, 1);
2332
2333 /*
2334 * Even if the count is zero, the VOP_INACTIVE routine may still
2335 * have the object locked while it cleans it out. The VOP_LOCK
2336 * ensures that the VOP_INACTIVE routine is done with its work.
2337 * For active vnodes, it ensures that no other activity can
2338 * occur while the underlying object is being cleaned out.
2339 */
2340 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2341
2342 /*
2343 * Clean out any buffers associated with the vnode.
2344 * If the flush fails, just toss the buffers.
2345 */
2346 if (flags & DOCLOSE) {
2347 struct buf *bp;
2348 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2349 if (bp != NULL)
2350 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2351 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2352 vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2353 }
2354
2355 VOP_DESTROYVOBJECT(vp);
2356
2357 /*
2358 * Any other processes trying to obtain this lock must first
2359 * wait for VXLOCK to clear, then call the new lock operation.
2360 */
2361 VOP_UNLOCK(vp, 0, td);
2362
2363 /*
2364 * If purging an active vnode, it must be closed and
2365 * deactivated before being reclaimed. Note that the
2366 * VOP_INACTIVE will unlock the vnode.
2367 */
2368 if (active) {
2369 if (flags & DOCLOSE)
2370 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2371 VI_LOCK(vp);
2372 if ((vp->v_iflag & VI_DOINGINACT) == 0) {
2373 vp->v_iflag |= VI_DOINGINACT;
2374 VI_UNLOCK(vp);
2375 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2376 panic("vclean: cannot relock.");
2377 VOP_INACTIVE(vp, td);
2378 VI_LOCK(vp);
2379 KASSERT(vp->v_iflag & VI_DOINGINACT,
2380 ("vclean: lost VI_DOINGINACT"));
2381 vp->v_iflag &= ~VI_DOINGINACT;
2382 }
2383 VI_UNLOCK(vp);
2384 }
2385 /*
2386 * Reclaim the vnode.
2387 */
2388 if (VOP_RECLAIM(vp, td))
2389 panic("vclean: cannot reclaim");
2390
2391 if (active) {
2392 /*
2393 * Inline copy of vrele() since VOP_INACTIVE
2394 * has already been called.
2395 */
2396 VI_LOCK(vp);
2397 v_incr_usecount(vp, -1);
2398 if (vp->v_usecount <= 0) {
2399#ifdef INVARIANTS
2400 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2401 vprint("vclean: bad ref count", vp);
2402 panic("vclean: ref cnt");
2403 }
2404#endif
2405 if (VSHOULDFREE(vp))
2406 vfree(vp);
2407 }
2408 VI_UNLOCK(vp);
2409 }
2410 /*
2411 * Delete from old mount point vnode list.
2412 */
2413 if (vp->v_mount != NULL)
2414 insmntque(vp, (struct mount *)0);
2415 cache_purge(vp);
2416 VI_LOCK(vp);
2417 if (VSHOULDFREE(vp))
2418 vfree(vp);
2419
2420 /*
2421 * Done with purge, reset to the standard lock and
2422 * notify sleepers of the grim news.
2423 */
2424 vp->v_vnlock = &vp->v_lock;
2425 vp->v_op = dead_vnodeop_p;
2426 if (vp->v_pollinfo != NULL)
2427 vn_pollgone(vp);
2428 vp->v_tag = "none";
2429}
2430
2431/*
2432 * Eliminate all activity associated with the requested vnode
2433 * and with all vnodes aliased to the requested vnode.
2434 */
2435int
2436vop_revoke(ap)
2437 struct vop_revoke_args /* {
2438 struct vnode *a_vp;
2439 int a_flags;
2440 } */ *ap;
2441{
2442 struct vnode *vp, *vq;
2443 struct cdev *dev;
2444
2445 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2446 vp = ap->a_vp;
2447 KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2448
2449 VI_LOCK(vp);
2450 /*
2451 * If a vgone (or vclean) is already in progress,
2452 * wait until it is done and return.
2453 */
2454 if (vp->v_iflag & VI_XLOCK) {
2455 vp->v_iflag |= VI_XWANT;
2456 msleep(vp, VI_MTX(vp), PINOD | PDROP,
2457 "vop_revokeall", 0);
2458 return (0);
2459 }
2460 VI_UNLOCK(vp);
2461 dev = vp->v_rdev;
2462 for (;;) {
2463 mtx_lock(&spechash_mtx);
2464 vq = SLIST_FIRST(&dev->si_hlist);
2465 mtx_unlock(&spechash_mtx);
2466 if (vq == NULL)
2467 break;
2468 vgone(vq);
2469 }
2470 return (0);
2471}
2472
2473/*
2474 * Recycle an unused vnode to the front of the free list.
2475 * Release the passed interlock if the vnode will be recycled.
2476 */
2477int
2478vrecycle(vp, inter_lkp, td)
2479 struct vnode *vp;
2480 struct mtx *inter_lkp;
2481 struct thread *td;
2482{
2483
2484 VI_LOCK(vp);
2485 if (vp->v_usecount == 0) {
2486 if (inter_lkp) {
2487 mtx_unlock(inter_lkp);
2488 }
2489 vgonel(vp, td);
2490 return (1);
2491 }
2492 VI_UNLOCK(vp);
2493 return (0);
2494}
2495
2496/*
2497 * Eliminate all activity associated with a vnode
2498 * in preparation for reuse.
2499 */
2500void
2501vgone(vp)
2502 register struct vnode *vp;
2503{
2504 struct thread *td = curthread; /* XXX */
2505
2506 VI_LOCK(vp);
2507 vgonel(vp, td);
2508}
2509
2510/*
2511 * Disassociate a character device from its underlying filesystem and
2512 * attach it to spec. This is for use when the chr device is still active
2513 * and the filesystem is going away.
2514 */
2515static void
2516vgonechrl(struct vnode *vp, struct thread *td)
2517{
2518 ASSERT_VI_LOCKED(vp, "vgonechrl");
2519 vx_lock(vp);
2520 /*
2521	 * This is a custom version of vclean() which does not tear down
2522 * the bufs or vm objects held by this vnode. This allows filesystems
2523 * to continue using devices which were discovered via another
2524 * filesystem that has been unmounted.
2525 */
2526 if (vp->v_usecount != 0) {
2527 v_incr_usecount(vp, 1);
2528 /*
2529 * Ensure that no other activity can occur while the
2530 * underlying object is being cleaned out.
2531 */
2532 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2533 /*
2534 * Any other processes trying to obtain this lock must first
2535 * wait for VXLOCK to clear, then call the new lock operation.
2536 */
2537 VOP_UNLOCK(vp, 0, td);
2538 vp->v_vnlock = &vp->v_lock;
2539 vp->v_tag = "orphanchr";
2540 vp->v_op = spec_vnodeop_p;
2541 if (vp->v_mount != NULL)
2542 insmntque(vp, (struct mount *)0);
2543 cache_purge(vp);
2544 vrele(vp);
2545 VI_LOCK(vp);
2546 } else
2547 vclean(vp, 0, td);
2548 vp->v_op = spec_vnodeop_p;
2549 vx_unlock(vp);
2550 VI_UNLOCK(vp);
2551}
2552
2553/*
2554 * vgone, with the vp interlock held.
2555 */
2556void
2557vgonel(vp, td)
2558 struct vnode *vp;
2559 struct thread *td;
2560{
2561 /*
2562 * If a vgone (or vclean) is already in progress,
2563 * wait until it is done and return.
2564 */
2565 ASSERT_VI_LOCKED(vp, "vgonel");
2566 if (vp->v_iflag & VI_XLOCK) {
2567 vp->v_iflag |= VI_XWANT;
2568 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2569 return;
2570 }
2571 vx_lock(vp);
2572
2573 /*
2574 * Clean out the filesystem specific data.
2575 */
2576 vclean(vp, DOCLOSE, td);
2577 VI_UNLOCK(vp);
2578
2579 /*
2580 * If special device, remove it from special device alias list
2581 * if it is on one.
2582 */
2583 VI_LOCK(vp);
1840static void
1841addalias(nvp, dev)
1842 struct vnode *nvp;
1843 struct cdev *dev;
1844{
1845
1846 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1847 dev_ref(dev);
1848 nvp->v_rdev = dev;
1849 VI_LOCK(nvp);
1850 mtx_lock(&spechash_mtx);
1851 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1852 dev->si_usecount += nvp->v_usecount;
1853 mtx_unlock(&spechash_mtx);
1854 VI_UNLOCK(nvp);
1855}
1856
1857/*
1858 * Grab a particular vnode from the free list, increment its
1859 * reference count and lock it. The vnode lock bit is set if the
1860 * vnode is being eliminated in vgone. The process is awakened
1861 * when the transition is completed, and an error returned to
1862 * indicate that the vnode is no longer usable (possibly having
1863 * been changed to a new filesystem type).
1864 */
1865int
1866vget(vp, flags, td)
1867 register struct vnode *vp;
1868 int flags;
1869 struct thread *td;
1870{
1871 int error;
1872
1873 /*
1874 * If the vnode is in the process of being cleaned out for
1875 * another use, we wait for the cleaning to finish and then
1876 * return failure. Cleaning is determined by checking that
1877 * the VI_XLOCK flag is set.
1878 */
1879 if ((flags & LK_INTERLOCK) == 0)
1880 VI_LOCK(vp);
1881 if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
1882 if ((flags & LK_NOWAIT) == 0) {
1883 vp->v_iflag |= VI_XWANT;
1884 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
1885 return (ENOENT);
1886 }
1887 VI_UNLOCK(vp);
1888 return (EBUSY);
1889 }
1890
1891 v_incr_usecount(vp, 1);
1892
1893 if (VSHOULDBUSY(vp))
1894 vbusy(vp);
1895 if (flags & LK_TYPE_MASK) {
1896 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1897 /*
1898 * must expand vrele here because we do not want
1899 * to call VOP_INACTIVE if the reference count
1900 * drops back to zero since it was never really
1901 * active. We must remove it from the free list
1902 * before sleeping so that multiple processes do
1903 * not try to recycle it.
1904 */
1905 VI_LOCK(vp);
1906 v_incr_usecount(vp, -1);
1907 if (VSHOULDFREE(vp))
1908 vfree(vp);
1909 else
1910 vlruvp(vp);
1911 VI_UNLOCK(vp);
1912 }
1913 return (error);
1914 }
1915 VI_UNLOCK(vp);
1916 return (0);
1917}
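/*
 * [Editorial sketch, not part of the source file.]  The usual calling
 * pattern pairs vget() with vput(): vget() takes the reference and the
 * vnode lock, vput() releases both.  The flag choice and error handling
 * below are illustrative assumptions, not taken from this revision:
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE, curthread)) != 0)
 *		return (error);
 *	... operate on the exclusively locked, referenced vnode ...
 *	vput(vp);
 */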
1918
1919/*
1920 * Increase the reference count of a vnode.
1921 */
1922void
1923vref(struct vnode *vp)
1924{
1925
1926 VI_LOCK(vp);
1927 v_incr_usecount(vp, 1);
1928 VI_UNLOCK(vp);
1929}
1930
1931/*
1932 * Return reference count of a vnode.
1933 *
1934 * The results of this call are only guaranteed when some mechanism other
1935 * than the VI lock is used to stop other processes from gaining references
1936 * to the vnode. This may be the case if the caller holds the only reference.
1937 * This is also useful when stale data is acceptable as race conditions may
1938 * be accounted for by some other means.
1939 */
1940int
1941vrefcnt(struct vnode *vp)
1942{
1943 int usecnt;
1944
1945 VI_LOCK(vp);
1946 usecnt = vp->v_usecount;
1947 VI_UNLOCK(vp);
1948
1949 return (usecnt);
1950}
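/*
 * [Editorial sketch.]  As the comment above notes, vrefcnt() is only
 * trustworthy when the caller otherwise prevents new references from
 * appearing, for instance when deciding whether it holds the last
 * reference (hypothetical fragment):
 *
 *	if (vrefcnt(vp) == 1)
 *		... no other holder can appear; tear down private state ...
 */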
1951
1952
1953/*
1954 * Vnode put/release.
1955 * If count drops to zero, call inactive routine and return to freelist.
1956 */
1957void
1958vrele(vp)
1959 struct vnode *vp;
1960{
1961 struct thread *td = curthread; /* XXX */
1962
1963 GIANT_REQUIRED;
1964
1965 KASSERT(vp != NULL, ("vrele: null vp"));
1966
1967 VI_LOCK(vp);
1968
1969 /* Skip this v_writecount check if we're going to panic below. */
1970 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1971 ("vrele: missed vn_close"));
1972
1973 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
1974 vp->v_usecount == 1)) {
1975 v_incr_usecount(vp, -1);
1976 VI_UNLOCK(vp);
1977
1978 return;
1979 }
1980
1981 if (vp->v_usecount == 1) {
1982 v_incr_usecount(vp, -1);
1983 /*
1984 * We must call VOP_INACTIVE with the node locked. Mark
1985 * as VI_DOINGINACT to avoid recursion.
1986 */
1987 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
1988 VI_LOCK(vp);
1989 vp->v_iflag |= VI_DOINGINACT;
1990 VI_UNLOCK(vp);
1991 VOP_INACTIVE(vp, td);
1992 VI_LOCK(vp);
1993 KASSERT(vp->v_iflag & VI_DOINGINACT,
1994 ("vrele: lost VI_DOINGINACT"));
1995 vp->v_iflag &= ~VI_DOINGINACT;
1996 } else
1997 VI_LOCK(vp);
1998 if (VSHOULDFREE(vp))
1999 vfree(vp);
2000 else
2001 vlruvp(vp);
2002 VI_UNLOCK(vp);
2003
2004 } else {
2005#ifdef DIAGNOSTIC
2006 vprint("vrele: negative ref count", vp);
2007#endif
2008 VI_UNLOCK(vp);
2009 panic("vrele: negative ref cnt");
2010 }
2011}
2012
2013/*
2014 * Release an already locked vnode.  This gives the same effect as
2015 * unlock+vrele(), but takes less time and avoids releasing and
2016 * re-acquiring the lock (as vrele() acquires the lock internally.)
2017 */
2018void
2019vput(vp)
2020 struct vnode *vp;
2021{
2022 struct thread *td = curthread; /* XXX */
2023
2024 GIANT_REQUIRED;
2025
2026 KASSERT(vp != NULL, ("vput: null vp"));
2027 VI_LOCK(vp);
2028 /* Skip this v_writecount check if we're going to panic below. */
2029 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2030 ("vput: missed vn_close"));
2031
2032 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2033 vp->v_usecount == 1)) {
2034 v_incr_usecount(vp, -1);
2035 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2036 return;
2037 }
2038
2039 if (vp->v_usecount == 1) {
2040 v_incr_usecount(vp, -1);
2041 /*
2042 * We must call VOP_INACTIVE with the node locked, so
2043 * we just need to release the vnode mutex. Mark as
2044	 * VI_DOINGINACT to avoid recursion.
2045 */
2046 vp->v_iflag |= VI_DOINGINACT;
2047 VI_UNLOCK(vp);
2048 VOP_INACTIVE(vp, td);
2049 VI_LOCK(vp);
2050 KASSERT(vp->v_iflag & VI_DOINGINACT,
2051 ("vput: lost VI_DOINGINACT"));
2052 vp->v_iflag &= ~VI_DOINGINACT;
2053 if (VSHOULDFREE(vp))
2054 vfree(vp);
2055 else
2056 vlruvp(vp);
2057 VI_UNLOCK(vp);
2058
2059 } else {
2060#ifdef DIAGNOSTIC
2061 vprint("vput: negative ref count", vp);
2062#endif
2063 panic("vput: negative ref cnt");
2064 }
2065}
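/*
 * [Editorial sketch.]  Per the comment above vput(), the call
 *
 *	vput(vp);
 *
 * is intended to behave like the two-step sequence
 *
 *	VOP_UNLOCK(vp, 0, td);
 *	vrele(vp);
 *
 * but avoids the re-lock that vrele() would otherwise perform internally
 * before calling VOP_INACTIVE().  (Illustrative equivalence only, not a
 * quote from the source.)
 */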
2066
2067/*
2068 * Somebody doesn't want the vnode recycled.
2069 */
2070void
2071vhold(struct vnode *vp)
2072{
2073
2074 VI_LOCK(vp);
2075 vholdl(vp);
2076 VI_UNLOCK(vp);
2077}
2078
2079void
2080vholdl(vp)
2081 register struct vnode *vp;
2082{
2083
2084 vp->v_holdcnt++;
2085 if (VSHOULDBUSY(vp))
2086 vbusy(vp);
2087}
2088
2089/*
2090 * Note that there is one less who cares about this vnode. vdrop() is the
2091 * opposite of vhold().
2092 */
2093void
2094vdrop(struct vnode *vp)
2095{
2096
2097 VI_LOCK(vp);
2098 vdropl(vp);
2099 VI_UNLOCK(vp);
2100}
2101
2102void
2103vdropl(vp)
2104 register struct vnode *vp;
2105{
2106
2107 if (vp->v_holdcnt <= 0)
2108 panic("vdrop: holdcnt");
2109 vp->v_holdcnt--;
2110 if (VSHOULDFREE(vp))
2111 vfree(vp);
2112 else
2113 vlruvp(vp);
2114}
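/*
 * [Editorial sketch.]  vhold()/vdrop() bracket a region where the caller
 * needs the vnode kept off the free list without taking a full use
 * reference (hypothetical fragment):
 *
 *	vhold(vp);
 *	... examine or enqueue the vnode ...
 *	vdrop(vp);
 */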
2115
2116/*
2117 * Remove any vnodes in the vnode table belonging to mount point mp.
2118 *
2119 * If FORCECLOSE is not specified, there should not be any active ones,
2120 * return error if any are found (nb: this is a user error, not a
2121 * system error). If FORCECLOSE is specified, detach any active vnodes
2122 * that are found.
2123 *
2124 * If WRITECLOSE is set, only flush out regular file vnodes open for
2125 * writing.
2126 *
2127 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2128 *
2129 * `rootrefs' specifies the base reference count for the root vnode
2130 * of this filesystem. The root vnode is considered busy if its
2131 * v_usecount exceeds this value. On a successful return, vflush()
2132 * will call vrele() on the root vnode exactly rootrefs times.
2133 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2134 * be zero.
2135 */
2136#ifdef DIAGNOSTIC
2137static int busyprt = 0; /* print out busy vnodes */
2138SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2139#endif
2140
2141int
2142vflush(mp, rootrefs, flags)
2143 struct mount *mp;
2144 int rootrefs;
2145 int flags;
2146{
2147 struct thread *td = curthread; /* XXX */
2148 struct vnode *vp, *nvp, *rootvp = NULL;
2149 struct vattr vattr;
2150 int busy = 0, error;
2151
2152 if (rootrefs > 0) {
2153 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2154 ("vflush: bad args"));
2155 /*
2156 * Get the filesystem root vnode. We can vput() it
2157 * immediately, since with rootrefs > 0, it won't go away.
2158 */
2159 if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2160 return (error);
2161 vput(rootvp);
2162
2163 }
2164 MNT_ILOCK(mp);
2165loop:
2166 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2167 /*
2168 * Make sure this vnode wasn't reclaimed in getnewvnode().
2169 * Start over if it has (it won't be on the list anymore).
2170 */
2171 if (vp->v_mount != mp)
2172 goto loop;
2173 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2174
2175 VI_LOCK(vp);
2176 MNT_IUNLOCK(mp);
2177 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2178 if (error) {
2179 MNT_ILOCK(mp);
2180 goto loop;
2181 }
2182 /*
2183	 * Skip over vnodes marked VV_SYSTEM.
2184 */
2185 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2186 VOP_UNLOCK(vp, 0, td);
2187 MNT_ILOCK(mp);
2188 continue;
2189 }
2190 /*
2191 * If WRITECLOSE is set, flush out unlinked but still open
2192 * files (even if open only for reading) and regular file
2193 * vnodes open for writing.
2194 */
2195 if (flags & WRITECLOSE) {
2196 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2197 VI_LOCK(vp);
2198
2199 if ((vp->v_type == VNON ||
2200 (error == 0 && vattr.va_nlink > 0)) &&
2201 (vp->v_writecount == 0 || vp->v_type != VREG)) {
2202 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2203 MNT_ILOCK(mp);
2204 continue;
2205 }
2206 } else
2207 VI_LOCK(vp);
2208
2209 VOP_UNLOCK(vp, 0, td);
2210
2211 /*
2212 * With v_usecount == 0, all we need to do is clear out the
2213 * vnode data structures and we are done.
2214 */
2215 if (vp->v_usecount == 0) {
2216 vgonel(vp, td);
2217 MNT_ILOCK(mp);
2218 continue;
2219 }
2220
2221 /*
2222 * If FORCECLOSE is set, forcibly close the vnode. For block
2223 * or character devices, revert to an anonymous device. For
2224 * all other files, just kill them.
2225 */
2226 if (flags & FORCECLOSE) {
2227 if (vp->v_type != VCHR)
2228 vgonel(vp, td);
2229 else
2230 vgonechrl(vp, td);
2231 MNT_ILOCK(mp);
2232 continue;
2233 }
2234#ifdef DIAGNOSTIC
2235 if (busyprt)
2236 vprint("vflush: busy vnode", vp);
2237#endif
2238 VI_UNLOCK(vp);
2239 MNT_ILOCK(mp);
2240 busy++;
2241 }
2242 MNT_IUNLOCK(mp);
2243 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2244 /*
2245 * If just the root vnode is busy, and if its refcount
2246 * is equal to `rootrefs', then go ahead and kill it.
2247 */
2248 VI_LOCK(rootvp);
2249 KASSERT(busy > 0, ("vflush: not busy"));
2250 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2251 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2252 vgonel(rootvp, td);
2253 busy = 0;
2254 } else
2255 VI_UNLOCK(rootvp);
2256 }
2257 if (busy)
2258 return (EBUSY);
2259 for (; rootrefs > 0; rootrefs--)
2260 vrele(rootvp);
2261 return (0);
2262}
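/*
 * [Editorial sketch.]  A filesystem unmount path that keeps one
 * long-lived reference on its root vnode might call vflush() roughly as
 * below; the rootrefs value and the flag selection are assumptions for
 * illustration only:
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	if ((error = vflush(mp, 1, flags)) != 0)
 *		return (error);
 */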
2263
2264/*
2265 * This moves a now (likely recyclable) vnode to the end of the
2266 * mountlist. XXX However, it is temporarily disabled until we
2267 * can clean up ffs_sync() and friends, which have loop restart
2268 * conditions which this code causes to operate O(N^2).
2269 */
2270static void
2271vlruvp(struct vnode *vp)
2272{
2273#if 0
2274 struct mount *mp;
2275
2276 if ((mp = vp->v_mount) != NULL) {
2277 MNT_ILOCK(mp);
2278 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2279 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2280 MNT_IUNLOCK(mp);
2281 }
2282#endif
2283}
2284
2285static void
2286vx_lock(struct vnode *vp)
2287{
2288
2289 ASSERT_VI_LOCKED(vp, "vx_lock");
2290
2291 /*
2292 * Prevent the vnode from being recycled or brought into use while we
2293 * clean it out.
2294 */
2295 if (vp->v_iflag & VI_XLOCK)
2296 panic("vclean: deadlock");
2297 vp->v_iflag |= VI_XLOCK;
2298 vp->v_vxthread = curthread;
2299}
2300
2301static void
2302vx_unlock(struct vnode *vp)
2303{
2304 ASSERT_VI_LOCKED(vp, "vx_unlock");
2305 vp->v_iflag &= ~VI_XLOCK;
2306 vp->v_vxthread = NULL;
2307 if (vp->v_iflag & VI_XWANT) {
2308 vp->v_iflag &= ~VI_XWANT;
2309 wakeup(vp);
2310 }
2311}
2312
2313/*
2314 * Disassociate the underlying filesystem from a vnode.
2315 */
2316static void
2317vclean(vp, flags, td)
2318 struct vnode *vp;
2319 int flags;
2320 struct thread *td;
2321{
2322 int active;
2323
2324 ASSERT_VI_LOCKED(vp, "vclean");
2325 /*
2326 * Check to see if the vnode is in use. If so we have to reference it
2327 * before we clean it out so that its count cannot fall to zero and
2328 * generate a race against ourselves to recycle it.
2329 */
2330 if ((active = vp->v_usecount))
2331 v_incr_usecount(vp, 1);
2332
2333 /*
2334 * Even if the count is zero, the VOP_INACTIVE routine may still
2335 * have the object locked while it cleans it out. The VOP_LOCK
2336 * ensures that the VOP_INACTIVE routine is done with its work.
2337 * For active vnodes, it ensures that no other activity can
2338 * occur while the underlying object is being cleaned out.
2339 */
2340 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2341
2342 /*
2343 * Clean out any buffers associated with the vnode.
2344 * If the flush fails, just toss the buffers.
2345 */
2346 if (flags & DOCLOSE) {
2347 struct buf *bp;
2348 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2349 if (bp != NULL)
2350 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2351 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2352 vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2353 }
2354
2355 VOP_DESTROYVOBJECT(vp);
2356
2357 /*
2358 * Any other processes trying to obtain this lock must first
2359 * wait for VXLOCK to clear, then call the new lock operation.
2360 */
2361 VOP_UNLOCK(vp, 0, td);
2362
2363 /*
2364 * If purging an active vnode, it must be closed and
2365 * deactivated before being reclaimed. Note that the
2366 * VOP_INACTIVE will unlock the vnode.
2367 */
2368 if (active) {
2369 if (flags & DOCLOSE)
2370 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2371 VI_LOCK(vp);
2372 if ((vp->v_iflag & VI_DOINGINACT) == 0) {
2373 vp->v_iflag |= VI_DOINGINACT;
2374 VI_UNLOCK(vp);
2375 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2376 panic("vclean: cannot relock.");
2377 VOP_INACTIVE(vp, td);
2378 VI_LOCK(vp);
2379 KASSERT(vp->v_iflag & VI_DOINGINACT,
2380 ("vclean: lost VI_DOINGINACT"));
2381 vp->v_iflag &= ~VI_DOINGINACT;
2382 }
2383 VI_UNLOCK(vp);
2384 }
2385 /*
2386 * Reclaim the vnode.
2387 */
2388 if (VOP_RECLAIM(vp, td))
2389 panic("vclean: cannot reclaim");
2390
2391 if (active) {
2392 /*
2393 * Inline copy of vrele() since VOP_INACTIVE
2394 * has already been called.
2395 */
2396 VI_LOCK(vp);
2397 v_incr_usecount(vp, -1);
2398 if (vp->v_usecount <= 0) {
2399#ifdef INVARIANTS
2400 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2401 vprint("vclean: bad ref count", vp);
2402 panic("vclean: ref cnt");
2403 }
2404#endif
2405 if (VSHOULDFREE(vp))
2406 vfree(vp);
2407 }
2408 VI_UNLOCK(vp);
2409 }
2410 /*
2411 * Delete from old mount point vnode list.
2412 */
2413 if (vp->v_mount != NULL)
2414 insmntque(vp, (struct mount *)0);
2415 cache_purge(vp);
2416 VI_LOCK(vp);
2417 if (VSHOULDFREE(vp))
2418 vfree(vp);
2419
2420 /*
2421 * Done with purge, reset to the standard lock and
2422 * notify sleepers of the grim news.
2423 */
2424 vp->v_vnlock = &vp->v_lock;
2425 vp->v_op = dead_vnodeop_p;
2426 if (vp->v_pollinfo != NULL)
2427 vn_pollgone(vp);
2428 vp->v_tag = "none";
2429}
2430
2431/*
2432 * Eliminate all activity associated with the requested vnode
2433 * and with all vnodes aliased to the requested vnode.
2434 */
2435int
2436vop_revoke(ap)
2437 struct vop_revoke_args /* {
2438 struct vnode *a_vp;
2439 int a_flags;
2440 } */ *ap;
2441{
2442 struct vnode *vp, *vq;
2443 struct cdev *dev;
2444
2445 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2446 vp = ap->a_vp;
2447 KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2448
2449 VI_LOCK(vp);
2450 /*
2451 * If a vgone (or vclean) is already in progress,
2452 * wait until it is done and return.
2453 */
2454 if (vp->v_iflag & VI_XLOCK) {
2455 vp->v_iflag |= VI_XWANT;
2456 msleep(vp, VI_MTX(vp), PINOD | PDROP,
2457 "vop_revokeall", 0);
2458 return (0);
2459 }
2460 VI_UNLOCK(vp);
2461 dev = vp->v_rdev;
2462 for (;;) {
2463 mtx_lock(&spechash_mtx);
2464 vq = SLIST_FIRST(&dev->si_hlist);
2465 mtx_unlock(&spechash_mtx);
2466 if (vq == NULL)
2467 break;
2468 vgone(vq);
2469 }
2470 return (0);
2471}
2472
2473/*
2474 * Recycle an unused vnode to the front of the free list.
2475 * Release the passed interlock if the vnode will be recycled.
2476 */
2477int
2478vrecycle(vp, inter_lkp, td)
2479 struct vnode *vp;
2480 struct mtx *inter_lkp;
2481 struct thread *td;
2482{
2483
2484 VI_LOCK(vp);
2485 if (vp->v_usecount == 0) {
2486 if (inter_lkp) {
2487 mtx_unlock(inter_lkp);
2488 }
2489 vgonel(vp, td);
2490 return (1);
2491 }
2492 VI_UNLOCK(vp);
2493 return (0);
2494}
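/*
 * [Editorial sketch.]  A filesystem's inactive routine may try to
 * recycle a vnode as soon as it knows the underlying file is gone, e.g.
 * when the link count has dropped to zero (the test below is a
 * fs-specific assumption):
 *
 *	if (file_has_been_removed)
 *		vrecycle(vp, NULL, td);
 */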
2495
2496/*
2497 * Eliminate all activity associated with a vnode
2498 * in preparation for reuse.
2499 */
2500void
2501vgone(vp)
2502 register struct vnode *vp;
2503{
2504 struct thread *td = curthread; /* XXX */
2505
2506 VI_LOCK(vp);
2507 vgonel(vp, td);
2508}
2509
2510/*
2511 * Disassociate a character device from its underlying filesystem and
2512 * attach it to spec. This is for use when the chr device is still active
2513 * and the filesystem is going away.
2514 */
2515static void
2516vgonechrl(struct vnode *vp, struct thread *td)
2517{
2518 ASSERT_VI_LOCKED(vp, "vgonechrl");
2519 vx_lock(vp);
2520 /*
2521	 * This is a custom version of vclean() which does not tear down
2522 * the bufs or vm objects held by this vnode. This allows filesystems
2523 * to continue using devices which were discovered via another
2524 * filesystem that has been unmounted.
2525 */
2526 if (vp->v_usecount != 0) {
2527 v_incr_usecount(vp, 1);
2528 /*
2529 * Ensure that no other activity can occur while the
2530 * underlying object is being cleaned out.
2531 */
2532 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2533 /*
2534 * Any other processes trying to obtain this lock must first
2535 * wait for VXLOCK to clear, then call the new lock operation.
2536 */
2537 VOP_UNLOCK(vp, 0, td);
2538 vp->v_vnlock = &vp->v_lock;
2539 vp->v_tag = "orphanchr";
2540 vp->v_op = spec_vnodeop_p;
2541 if (vp->v_mount != NULL)
2542 insmntque(vp, (struct mount *)0);
2543 cache_purge(vp);
2544 vrele(vp);
2545 VI_LOCK(vp);
2546 } else
2547 vclean(vp, 0, td);
2548 vp->v_op = spec_vnodeop_p;
2549 vx_unlock(vp);
2550 VI_UNLOCK(vp);
2551}
2552
2553/*
2554 * vgone, with the vp interlock held.
2555 */
2556void
2557vgonel(vp, td)
2558 struct vnode *vp;
2559 struct thread *td;
2560{
2561 /*
2562 * If a vgone (or vclean) is already in progress,
2563 * wait until it is done and return.
2564 */
2565 ASSERT_VI_LOCKED(vp, "vgonel");
2566 if (vp->v_iflag & VI_XLOCK) {
2567 vp->v_iflag |= VI_XWANT;
2568 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2569 return;
2570 }
2571 vx_lock(vp);
2572
2573 /*
2574 * Clean out the filesystem specific data.
2575 */
2576 vclean(vp, DOCLOSE, td);
2577 VI_UNLOCK(vp);
2578
2579 /*
2580 * If special device, remove it from special device alias list
2581 * if it is on one.
2582 */
2583 VI_LOCK(vp);
2584 if (vp->v_type == VCHR && vp->v_rdev != NODEV) {
2584 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2585 mtx_lock(&spechash_mtx);
2586 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2587 vp->v_rdev->si_usecount -= vp->v_usecount;
2588 mtx_unlock(&spechash_mtx);
2589 dev_rel(vp->v_rdev);
2590 vp->v_rdev = NULL;
2591 }
2592
2593 /*
2594 * If it is on the freelist and not already at the head,
2595 * move it to the head of the list. The test of the
2596 * VDOOMED flag and the reference count of zero is because
2597 * it will be removed from the free list by getnewvnode,
2598 * but will not have its reference count incremented until
2599 * after calling vgone. If the reference count were
2600 * incremented first, vgone would (incorrectly) try to
2601 * close the previous instance of the underlying object.
2602 */
2603 if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2604 mtx_lock(&vnode_free_list_mtx);
2605 if (vp->v_iflag & VI_FREE) {
2606 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2607 } else {
2608 vp->v_iflag |= VI_FREE;
2609 freevnodes++;
2610 }
2611 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2612 mtx_unlock(&vnode_free_list_mtx);
2613 }
2614
2615 vp->v_type = VBAD;
2616 vx_unlock(vp);
2617 VI_UNLOCK(vp);
2618}
2619
2620/*
2621 * Lookup a vnode by device number.
2622 */
2623int
2624vfinddev(dev, vpp)
2625 struct cdev *dev;
2626 struct vnode **vpp;
2627{
2628 struct vnode *vp;
2629
2630 mtx_lock(&spechash_mtx);
2631 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2632 *vpp = vp;
2633 mtx_unlock(&spechash_mtx);
2634 return (1);
2635 }
2636 mtx_unlock(&spechash_mtx);
2637 return (0);
2638}
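/*
 * [Editorial sketch.]  vfinddev() answers "is some vnode currently
 * aliasing this cdev?" and hands back the first alias found
 * (hypothetical fragment):
 *
 *	struct vnode *vp;
 *
 *	if (vfinddev(dev, &vp))
 *		... dev already has a vnode alias in the spec hash ...
 */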
2639
2640/*
2641 * Calculate the total number of references to a special device.
2642 */
2643int
2644vcount(vp)
2645 struct vnode *vp;
2646{
2647 int count;
2648
2649 mtx_lock(&spechash_mtx);
2650 count = vp->v_rdev->si_usecount;
2651 mtx_unlock(&spechash_mtx);
2652 return (count);
2653}
2654
2655/*
2656 * Same as above, but using the struct cdev * as the argument.
2657 */
2658int
2659count_dev(dev)
2660 struct cdev *dev;
2661{
2662 int count;
2663
2664 mtx_lock(&spechash_mtx);
2665 count = dev->si_usecount;
2666 mtx_unlock(&spechash_mtx);
2667 return(count);
2668}
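/*
 * [Editorial sketch.]  A device close routine can use vcount() to tell
 * a last close from an intermediate one across all vnode aliases of the
 * device (illustrative fragment, not from this revision):
 *
 *	if (vcount(vp) > 1)
 *		return (0);
 *	... perform last-close processing ...
 */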
2669
2670/*
2671 * Print out a description of a vnode.
2672 */
2673static char *typename[] =
2674{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2675
2676void
2677vprint(label, vp)
2678 char *label;
2679 struct vnode *vp;
2680{
2681 char buf[96];
2682
2683 if (label != NULL)
2684 printf("%s: %p: ", label, (void *)vp);
2685 else
2686 printf("%p: ", (void *)vp);
2687 printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2688 vp->v_tag, typename[vp->v_type], vp->v_usecount,
2689 vp->v_writecount, vp->v_holdcnt);
2690 buf[0] = '\0';
2691 if (vp->v_vflag & VV_ROOT)
2692 strcat(buf, "|VV_ROOT");
2693 if (vp->v_vflag & VV_TEXT)
2694 strcat(buf, "|VV_TEXT");
2695 if (vp->v_vflag & VV_SYSTEM)
2696 strcat(buf, "|VV_SYSTEM");
2697 if (vp->v_iflag & VI_XLOCK)
2698 strcat(buf, "|VI_XLOCK");
2699 if (vp->v_iflag & VI_XWANT)
2700 strcat(buf, "|VI_XWANT");
2701 if (vp->v_iflag & VI_BWAIT)
2702 strcat(buf, "|VI_BWAIT");
2703 if (vp->v_iflag & VI_DOOMED)
2704 strcat(buf, "|VI_DOOMED");
2705 if (vp->v_iflag & VI_FREE)
2706 strcat(buf, "|VI_FREE");
2707 if (vp->v_vflag & VV_OBJBUF)
2708 strcat(buf, "|VV_OBJBUF");
2709 if (buf[0] != '\0')
2710 printf(" flags (%s),", &buf[1]);
2711 lockmgr_printinfo(vp->v_vnlock);
2712 printf("\n");
2713 if (vp->v_data != NULL)
2714 VOP_PRINT(vp);
2715}
2716
2717#ifdef DDB
2718#include <ddb/ddb.h>
2719/*
2720 * List all of the locked vnodes in the system.
2721 * Called when debugging the kernel.
2722 */
2723DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2724{
2725 struct mount *mp, *nmp;
2726 struct vnode *vp;
2727
2728 /*
2729 * Note: because this is DDB, we can't obey the locking semantics
2730 * for these structures, which means we could catch an inconsistent
2731 * state and dereference a nasty pointer. Not much to be done
2732 * about that.
2733 */
2734 printf("Locked vnodes\n");
2735 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2736 nmp = TAILQ_NEXT(mp, mnt_list);
2737 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2738 if (VOP_ISLOCKED(vp, NULL))
2739 vprint(NULL, vp);
2740 }
2741 nmp = TAILQ_NEXT(mp, mnt_list);
2742 }
2743}
2744#endif
2745
2746/*
2747 * Fill in a struct xvfsconf based on a struct vfsconf.
2748 */
2749static void
2750vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2751{
2752
2753 strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2754 xvfsp->vfc_typenum = vfsp->vfc_typenum;
2755 xvfsp->vfc_refcount = vfsp->vfc_refcount;
2756 xvfsp->vfc_flags = vfsp->vfc_flags;
2757 /*
2758 * These are unused in userland, we keep them
2759 * to not break binary compatibility.
2760 */
2761 xvfsp->vfc_vfsops = NULL;
2762 xvfsp->vfc_next = NULL;
2763}
2764
2765/*
2766 * Top level filesystem related information gathering.
2767 */
2768static int
2769sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2770{
2771 struct vfsconf *vfsp;
2772 struct xvfsconf *xvfsp;
2773 int cnt, error, i;
2774
2775 cnt = 0;
2776 for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2777 cnt++;
2778 xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2779 /*
2780 * Handle the race that we will have here when struct vfsconf
2781 * will be locked down by using both cnt and checking vfc_next
2782 * against NULL to determine the end of the loop. The race will
2783 * happen because we will have to unlock before calling malloc().
2784 * We are protected by Giant for now.
2785 */
2786 i = 0;
2787 for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2788 vfsconf2x(vfsp, xvfsp + i);
2789 i++;
2790 }
2791 error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2792 free(xvfsp, M_TEMP);
2793 return (error);
2794}
2795
2796SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2797 "S,xvfsconf", "List of all configured filesystems");
2798
2799#ifndef BURN_BRIDGES
2800static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2801
2802static int
2803vfs_sysctl(SYSCTL_HANDLER_ARGS)
2804{
2805 int *name = (int *)arg1 - 1; /* XXX */
2806 u_int namelen = arg2 + 1; /* XXX */
2807 struct vfsconf *vfsp;
2808 struct xvfsconf xvfsp;
2809
2810 printf("WARNING: userland calling deprecated sysctl, "
2811 "please rebuild world\n");
2812
2813#if 1 || defined(COMPAT_PRELITE2)
2814 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2815 if (namelen == 1)
2816 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2817#endif
2818
2819 switch (name[1]) {
2820 case VFS_MAXTYPENUM:
2821 if (namelen != 2)
2822 return (ENOTDIR);
2823 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2824 case VFS_CONF:
2825 if (namelen != 3)
2826 return (ENOTDIR); /* overloaded */
2827 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2828 if (vfsp->vfc_typenum == name[2])
2829 break;
2830 if (vfsp == NULL)
2831 return (EOPNOTSUPP);
2832 vfsconf2x(vfsp, &xvfsp);
2833 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2834 }
2835 return (EOPNOTSUPP);
2836}
2837
2838SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2839 "Generic filesystem");
2840
2841#if 1 || defined(COMPAT_PRELITE2)
2842
2843static int
2844sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2845{
2846 int error;
2847 struct vfsconf *vfsp;
2848 struct ovfsconf ovfs;
2849
2850 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2851 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2852 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2853 ovfs.vfc_index = vfsp->vfc_typenum;
2854 ovfs.vfc_refcount = vfsp->vfc_refcount;
2855 ovfs.vfc_flags = vfsp->vfc_flags;
2856 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2857 if (error)
2858 return error;
2859 }
2860 return 0;
2861}
2862
2863#endif /* 1 || COMPAT_PRELITE2 */
2864#endif /* !BURN_BRIDGES */
2865
2866#define KINFO_VNODESLOP 10
2867#ifdef notyet
2868/*
2869 * Dump vnode list (via sysctl).
2870 */
2871/* ARGSUSED */
2872static int
2873sysctl_vnode(SYSCTL_HANDLER_ARGS)
2874{
2875 struct xvnode *xvn;
2876 struct thread *td = req->td;
2877 struct mount *mp;
2878 struct vnode *vp;
2879 int error, len, n;
2880
2881 /*
2882 * Stale numvnodes access is not fatal here.
2883 */
2884 req->lock = 0;
2885 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2886 if (!req->oldptr)
2887 /* Make an estimate */
2888 return (SYSCTL_OUT(req, 0, len));
2889
2890 error = sysctl_wire_old_buffer(req, 0);
2891 if (error != 0)
2892 return (error);
2893 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2894 n = 0;
2895 mtx_lock(&mountlist_mtx);
2896 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2897 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2898 continue;
2899 MNT_ILOCK(mp);
2900 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2901 if (n == len)
2902 break;
2903 vref(vp);
2904 xvn[n].xv_size = sizeof *xvn;
2905 xvn[n].xv_vnode = vp;
2906#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2907 XV_COPY(usecount);
2908 XV_COPY(writecount);
2909 XV_COPY(holdcnt);
2910 XV_COPY(id);
2911 XV_COPY(mount);
2912 XV_COPY(numoutput);
2913 XV_COPY(type);
2914#undef XV_COPY
2915 xvn[n].xv_flag = vp->v_vflag;
2916
2917 switch (vp->v_type) {
2918 case VREG:
2919 case VDIR:
2920 case VLNK:
2921 xvn[n].xv_dev = vp->v_cachedfs;
2922 xvn[n].xv_ino = vp->v_cachedid;
2923 break;
2924 case VBLK:
2925 case VCHR:
2926 if (vp->v_rdev == NULL) {
2927 vrele(vp);
2928 continue;
2929 }
2930 xvn[n].xv_dev = dev2udev(vp->v_rdev);
2931 break;
2932 case VSOCK:
2933 xvn[n].xv_socket = vp->v_socket;
2934 break;
2935 case VFIFO:
2936 xvn[n].xv_fifo = vp->v_fifoinfo;
2937 break;
2938 case VNON:
2939 case VBAD:
2940 default:
2941 /* shouldn't happen? */
2942 vrele(vp);
2943 continue;
2944 }
2945 vrele(vp);
2946 ++n;
2947 }
2948 MNT_IUNLOCK(mp);
2949 mtx_lock(&mountlist_mtx);
2950 vfs_unbusy(mp, td);
2951 if (n == len)
2952 break;
2953 }
2954 mtx_unlock(&mountlist_mtx);
2955
2956 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2957 free(xvn, M_TEMP);
2958 return (error);
2959}
2960
2961SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2962 0, 0, sysctl_vnode, "S,xvnode", "");
2963#endif
2964
2965/*
2966 * Check to see if a filesystem is mounted on a block device.
2967 */
2968int
2969vfs_mountedon(vp)
2970 struct vnode *vp;
2971{
2972
2973 if (vp->v_rdev->si_mountpoint != NULL)
2974 return (EBUSY);
2975 return (0);
2976}
2977
2978/*
2979 * Unmount all filesystems. The list is traversed in reverse order
2980 * of mounting to avoid dependencies.
2981 */
2982void
2983vfs_unmountall()
2984{
2985 struct mount *mp;
2986 struct thread *td;
2987 int error;
2988
2989 if (curthread != NULL)
2990 td = curthread;
2991 else
2992 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
2993 /*
2994 * Since this only runs when rebooting, it is not interlocked.
2995 */
2996 while(!TAILQ_EMPTY(&mountlist)) {
2997 mp = TAILQ_LAST(&mountlist, mntlist);
2998 error = dounmount(mp, MNT_FORCE, td);
2999 if (error) {
3000 TAILQ_REMOVE(&mountlist, mp, mnt_list);
3001 printf("unmount of %s failed (",
3002 mp->mnt_stat.f_mntonname);
3003 if (error == EBUSY)
3004 printf("BUSY)\n");
3005 else
3006 printf("%d)\n", error);
3007 } else {
3008 /* The unmount has removed mp from the mountlist */
3009 }
3010 }
3011}
3012
3013/*
3014 * Perform msync on all vnodes under a mount point.
3015 * The mount point must be locked.
3016 */
3017void
3018vfs_msync(struct mount *mp, int flags)
3019{
3020 struct vnode *vp, *nvp;
3021 struct vm_object *obj;
3022 int tries;
3023
3024 GIANT_REQUIRED;
3025
3026 tries = 5;
3027 MNT_ILOCK(mp);
3028loop:
3029 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
3030 if (vp->v_mount != mp) {
3031 if (--tries > 0)
3032 goto loop;
3033 break;
3034 }
3035 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
3036
3037 VI_LOCK(vp);
3038 if (vp->v_iflag & VI_XLOCK) {
3039 VI_UNLOCK(vp);
3040 continue;
3041 }
3042
3043 if ((vp->v_iflag & VI_OBJDIRTY) &&
3044 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3045 MNT_IUNLOCK(mp);
3046 if (!vget(vp,
3047 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3048 curthread)) {
3049 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
3050 vput(vp);
3051 MNT_ILOCK(mp);
3052 continue;
3053 }
3054
3055 if (VOP_GETVOBJECT(vp, &obj) == 0) {
3056 VM_OBJECT_LOCK(obj);
3057 vm_object_page_clean(obj, 0, 0,
3058 flags == MNT_WAIT ?
3059 OBJPC_SYNC : OBJPC_NOSYNC);
3060 VM_OBJECT_UNLOCK(obj);
3061 }
3062 vput(vp);
3063 }
3064 MNT_ILOCK(mp);
3065 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3066 if (--tries > 0)
3067 goto loop;
3068 break;
3069 }
3070 } else
3071 VI_UNLOCK(vp);
3072 }
3073 MNT_IUNLOCK(mp);
3074}
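/*
 * [Editorial sketch.]  Sync and unmount paths usually run vfs_msync()
 * over the mount before asking the filesystem itself to sync, so that
 * dirty mmap()ed pages reach the buffer cache first (hypothetical
 * fragment):
 *
 *	vfs_msync(mp, MNT_WAIT);
 *	error = VFS_SYNC(mp, MNT_WAIT, cred, td);
 */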
3075
3076/*
3077 * Create the VM object needed for VMIO and mmap support. This
3078 * is done for all VREG files in the system. Some filesystems might
3079 * afford the additional metadata buffering capability of the
3080 * VMIO code by making the device node be VMIO mode also.
3081 *
3082 * vp must be locked when vfs_object_create is called.
3083 */
3084int
3085vfs_object_create(vp, td, cred)
3086 struct vnode *vp;
3087 struct thread *td;
3088 struct ucred *cred;
3089{
3090
3091 GIANT_REQUIRED;
3092 return (VOP_CREATEVOBJECT(vp, cred, td));
3093}
3094
3095/*
3096 * Mark a vnode as free, putting it up for recycling.
3097 */
3098void
3099vfree(vp)
3100 struct vnode *vp;
3101{
3102
3103 ASSERT_VI_LOCKED(vp, "vfree");
3104 mtx_lock(&vnode_free_list_mtx);
3105 KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3106 if (vp->v_iflag & VI_AGE) {
3107 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3108 } else {
3109 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3110 }
3111 freevnodes++;
3112 mtx_unlock(&vnode_free_list_mtx);
3113 vp->v_iflag &= ~VI_AGE;
3114 vp->v_iflag |= VI_FREE;
3115}
3116
3117/*
3118 * Opposite of vfree() - mark a vnode as in use.
3119 */
3120void
3121vbusy(vp)
3122 struct vnode *vp;
3123{
3124
3125 ASSERT_VI_LOCKED(vp, "vbusy");
3126 KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3127
3128 mtx_lock(&vnode_free_list_mtx);
3129 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3130 freevnodes--;
3131 mtx_unlock(&vnode_free_list_mtx);
3132
3133 vp->v_iflag &= ~(VI_FREE|VI_AGE);
3134}
3135
3136/*
3137 * Initalize per-vnode helper structure to hold poll-related state.
3138 */
3139void
3140v_addpollinfo(struct vnode *vp)
3141{
3142
3143 vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
3144 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3145}
3146
3147/*
3148 * Record a process's interest in events which might happen to
3149 * a vnode. Because poll uses the historic select-style interface
3150 * internally, this routine serves as both the ``check for any
3151 * pending events'' and the ``record my interest in future events''
3152 * functions. (These are done together, while the lock is held,
3153 * to avoid race conditions.)
3154 */
3155int
3156vn_pollrecord(vp, td, events)
3157 struct vnode *vp;
3158 struct thread *td;
3159 short events;
3160{
3161
3162 if (vp->v_pollinfo == NULL)
3163 v_addpollinfo(vp);
3164 mtx_lock(&vp->v_pollinfo->vpi_lock);
3165 if (vp->v_pollinfo->vpi_revents & events) {
3166 /*
3167 * This leaves events we are not interested
3168 * in available for the other process which
3169		 * presumably had requested them
3170 * (otherwise they would never have been
3171 * recorded).
3172 */
3173 events &= vp->v_pollinfo->vpi_revents;
3174 vp->v_pollinfo->vpi_revents &= ~events;
3175
3176 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3177 return events;
3178 }
3179 vp->v_pollinfo->vpi_events |= events;
3180 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3181 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3182 return 0;
3183}
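/*
 * [Editorial sketch.]  A VOP_POLL implementation with nothing ready can
 * delegate the bookkeeping to vn_pollrecord() (argument names below are
 * assumed for illustration):
 *
 *	if (revents != 0)
 *		return (revents);
 *	return (vn_pollrecord(vp, td, events));
 */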
3184
3185/*
3186 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
3187 * it is possible for us to miss an event due to race conditions, but
3188 * that condition is expected to be rare, so for the moment it is the
3189 * preferred interface.
3190 */
3191void
3192vn_pollevent(vp, events)
3193 struct vnode *vp;
3194 short events;
3195{
3196
3197 if (vp->v_pollinfo == NULL)
3198 v_addpollinfo(vp);
3199 mtx_lock(&vp->v_pollinfo->vpi_lock);
3200 if (vp->v_pollinfo->vpi_events & events) {
3201 /*
3202 * We clear vpi_events so that we don't
3203 * call selwakeup() twice if two events are
3204 * posted before the polling process(es) is
3205 * awakened. This also ensures that we take at
3206 * most one selwakeup() if the polling process
3207 * is no longer interested. However, it does
3208 * mean that only one event can be noticed at
3209 * a time. (Perhaps we should only clear those
3210 * event bits which we note?) XXX
3211 */
3212 vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */
3213 vp->v_pollinfo->vpi_revents |= events;
3214 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3215 }
3216 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3217}
3218
3219/*
3220 * Wake up anyone polling on vp because it is being revoked.
3221 * This depends on dead_poll() returning POLLHUP for correct
3222 * behavior.
3223 */
3224void
3225vn_pollgone(vp)
3226 struct vnode *vp;
3227{
3228
3229 mtx_lock(&vp->v_pollinfo->vpi_lock);
3230 VN_KNOTE(vp, NOTE_REVOKE);
3231 if (vp->v_pollinfo->vpi_events) {
3232 vp->v_pollinfo->vpi_events = 0;
3233 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3234 }
3235 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3236}
3237
3238
3239
3240/*
3241 * Routine to create and manage a filesystem syncer vnode.
3242 */
3243#define sync_close ((int (*)(struct vop_close_args *))nullop)
3244static int sync_fsync(struct vop_fsync_args *);
3245static int sync_inactive(struct vop_inactive_args *);
3246static int sync_reclaim(struct vop_reclaim_args *);
3247
3248static vop_t **sync_vnodeop_p;
3249static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3250 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
3251 { &vop_close_desc, (vop_t *) sync_close }, /* close */
3252 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
3253 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
3254 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
3255 { &vop_lock_desc, (vop_t *) vop_stdlock }, /* lock */
3256 { &vop_unlock_desc, (vop_t *) vop_stdunlock }, /* unlock */
3257 { &vop_islocked_desc, (vop_t *) vop_stdislocked }, /* islocked */
3258 { NULL, NULL }
3259};
3260static struct vnodeopv_desc sync_vnodeop_opv_desc =
3261 { &sync_vnodeop_p, sync_vnodeop_entries };
3262
3263VNODEOP_SET(sync_vnodeop_opv_desc);
3264
3265/*
3266 * Create a new filesystem syncer vnode for the specified mount point.
3267 */
3268int
3269vfs_allocate_syncvnode(mp)
3270 struct mount *mp;
3271{
3272 struct vnode *vp;
3273 static long start, incr, next;
3274 int error;
3275
3276 /* Allocate a new vnode */
3277 if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3278 mp->mnt_syncer = NULL;
3279 return (error);
3280 }
3281 vp->v_type = VNON;
3282 /*
3283 * Place the vnode onto the syncer worklist. We attempt to
3284 * scatter them about on the list so that they will go off
3285 * at evenly distributed times even if all the filesystems
3286 * are mounted at once.
3287 */
3288 next += incr;
3289 if (next == 0 || next > syncer_maxdelay) {
3290 start /= 2;
3291 incr /= 2;
3292 if (start == 0) {
3293 start = syncer_maxdelay / 2;
3294 incr = syncer_maxdelay;
3295 }
3296 next = start;
3297 }
3298 VI_LOCK(vp);
3299 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3300 VI_UNLOCK(vp);
3301 mp->mnt_syncer = vp;
3302 return (0);
3303}
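/*
 * [Editorial sketch.]  The syncer vnode is normally created when a
 * read-write mount completes, so the periodic syncer covers the new
 * mount; the exact call site lives outside this file and the test below
 * is an assumption:
 *
 *	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
 *		(void) vfs_allocate_syncvnode(mp);
 */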
3304
3305/*
3306 * Do a lazy sync of the filesystem.
3307 */
3308static int
3309sync_fsync(ap)
3310 struct vop_fsync_args /* {
3311 struct vnode *a_vp;
3312 struct ucred *a_cred;
3313 int a_waitfor;
3314 struct thread *a_td;
3315 } */ *ap;
3316{
3317 struct vnode *syncvp = ap->a_vp;
3318 struct mount *mp = syncvp->v_mount;
3319 struct thread *td = ap->a_td;
3320 int error, asyncflag;
3321
3322 /*
3323 * We only need to do something if this is a lazy evaluation.
3324 */
3325 if (ap->a_waitfor != MNT_LAZY)
3326 return (0);
3327
3328 /*
3329 * Move ourselves to the back of the sync list.
3330 */
3331 VI_LOCK(syncvp);
3332 vn_syncer_add_to_worklist(syncvp, syncdelay);
3333 VI_UNLOCK(syncvp);
3334
3335 /*
3336 * Walk the list of vnodes pushing all that are dirty and
3337 * not already on the sync list.
3338 */
3339 mtx_lock(&mountlist_mtx);
3340 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3341 mtx_unlock(&mountlist_mtx);
3342 return (0);
3343 }
3344 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3345 vfs_unbusy(mp, td);
3346 return (0);
3347 }
3348 asyncflag = mp->mnt_flag & MNT_ASYNC;
3349 mp->mnt_flag &= ~MNT_ASYNC;
3350 vfs_msync(mp, MNT_NOWAIT);
3351 error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3352 if (asyncflag)
3353 mp->mnt_flag |= MNT_ASYNC;
3354 vn_finished_write(mp);
3355 vfs_unbusy(mp, td);
3356 return (error);
3357}
3358
3359/*
3360 * The syncer vnode is no longer referenced.
3361 */
3362static int
3363sync_inactive(ap)
3364 struct vop_inactive_args /* {
3365 struct vnode *a_vp;
3366 struct thread *a_td;
3367 } */ *ap;
3368{
3369
3370 VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3371 vgone(ap->a_vp);
3372 return (0);
3373}
3374
3375/*
3376 * The syncer vnode is no longer needed and is being decommissioned.
3377 *
3378 * Modifications to the worklist must be protected by sync_mtx.
3379 */
3380static int
3381sync_reclaim(ap)
3382 struct vop_reclaim_args /* {
3383 struct vnode *a_vp;
3384 } */ *ap;
3385{
3386 struct vnode *vp = ap->a_vp;
3387
3388 VI_LOCK(vp);
3389 vp->v_mount->mnt_syncer = NULL;
3390 if (vp->v_iflag & VI_ONWORKLST) {
3391 mtx_lock(&sync_mtx);
3392 LIST_REMOVE(vp, v_synclist);
3393 mtx_unlock(&sync_mtx);
3394 vp->v_iflag &= ~VI_ONWORKLST;
3395 }
3396 VI_UNLOCK(vp);
3397
3398 return (0);
3399}
3400
3401/*
3402 * Extract the struct cdev * from a VCHR.
3403 */
3404struct cdev *
3405vn_todev(vp)
3406 struct vnode *vp;
3407{
3408
3409 if (vp->v_type != VCHR)
2585 mtx_lock(&spechash_mtx);
2586 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2587 vp->v_rdev->si_usecount -= vp->v_usecount;
2588 mtx_unlock(&spechash_mtx);
2589 dev_rel(vp->v_rdev);
2590 vp->v_rdev = NULL;
2591 }
2592
2593 /*
2594 * If it is on the freelist and not already at the head,
2595 * move it to the head of the list. The test of the
2596 * VDOOMED flag and the reference count of zero is because
2597 * it will be removed from the free list by getnewvnode,
2598 * but will not have its reference count incremented until
2599 * after calling vgone. If the reference count were
2600 * incremented first, vgone would (incorrectly) try to
2601 * close the previous instance of the underlying object.
2602 */
2603 if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2604 mtx_lock(&vnode_free_list_mtx);
2605 if (vp->v_iflag & VI_FREE) {
2606 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2607 } else {
2608 vp->v_iflag |= VI_FREE;
2609 freevnodes++;
2610 }
2611 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2612 mtx_unlock(&vnode_free_list_mtx);
2613 }
2614
2615 vp->v_type = VBAD;
2616 vx_unlock(vp);
2617 VI_UNLOCK(vp);
2618}
2619
2620/*
2621 * Lookup a vnode by device number.
2622 */
2623int
2624vfinddev(dev, vpp)
2625 struct cdev *dev;
2626 struct vnode **vpp;
2627{
2628 struct vnode *vp;
2629
2630 mtx_lock(&spechash_mtx);
2631 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2632 *vpp = vp;
2633 mtx_unlock(&spechash_mtx);
2634 return (1);
2635 }
2636 mtx_unlock(&spechash_mtx);
2637 return (0);
2638}
2639
2640/*
2641 * Calculate the total number of references to a special device.
2642 */
2643int
2644vcount(vp)
2645 struct vnode *vp;
2646{
2647 int count;
2648
2649 mtx_lock(&spechash_mtx);
2650 count = vp->v_rdev->si_usecount;
2651 mtx_unlock(&spechash_mtx);
2652 return (count);
2653}
2654
2655/*
2656 * Same as above, but using the struct cdev * as the argument.
2657 */
2658int
2659count_dev(dev)
2660 struct cdev *dev;
2661{
2662 int count;
2663
2664 mtx_lock(&spechash_mtx);
2665 count = dev->si_usecount;
2666 mtx_unlock(&spechash_mtx);
2667 return(count);
2668}
2669
2670/*
2671 * Print out a description of a vnode.
2672 */
2673static char *typename[] =
2674{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2675
2676void
2677vprint(label, vp)
2678 char *label;
2679 struct vnode *vp;
2680{
2681 char buf[96];
2682
2683 if (label != NULL)
2684 printf("%s: %p: ", label, (void *)vp);
2685 else
2686 printf("%p: ", (void *)vp);
2687 printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2688 vp->v_tag, typename[vp->v_type], vp->v_usecount,
2689 vp->v_writecount, vp->v_holdcnt);
2690 buf[0] = '\0';
2691 if (vp->v_vflag & VV_ROOT)
2692 strcat(buf, "|VV_ROOT");
2693 if (vp->v_vflag & VV_TEXT)
2694 strcat(buf, "|VV_TEXT");
2695 if (vp->v_vflag & VV_SYSTEM)
2696 strcat(buf, "|VV_SYSTEM");
2697 if (vp->v_iflag & VI_XLOCK)
2698 strcat(buf, "|VI_XLOCK");
2699 if (vp->v_iflag & VI_XWANT)
2700 strcat(buf, "|VI_XWANT");
2701 if (vp->v_iflag & VI_BWAIT)
2702 strcat(buf, "|VI_BWAIT");
2703 if (vp->v_iflag & VI_DOOMED)
2704 strcat(buf, "|VI_DOOMED");
2705 if (vp->v_iflag & VI_FREE)
2706 strcat(buf, "|VI_FREE");
2707 if (vp->v_vflag & VV_OBJBUF)
2708 strcat(buf, "|VV_OBJBUF");
2709 if (buf[0] != '\0')
2710 printf(" flags (%s),", &buf[1]);
2711 lockmgr_printinfo(vp->v_vnlock);
2712 printf("\n");
2713 if (vp->v_data != NULL)
2714 VOP_PRINT(vp);
2715}
2716
2717#ifdef DDB
2718#include <ddb/ddb.h>
2719/*
2720 * List all of the locked vnodes in the system.
2721 * Called when debugging the kernel.
2722 */
2723DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2724{
2725 struct mount *mp, *nmp;
2726 struct vnode *vp;
2727
2728 /*
2729 * Note: because this is DDB, we can't obey the locking semantics
2730 * for these structures, which means we could catch an inconsistent
2731 * state and dereference a nasty pointer. Not much to be done
2732 * about that.
2733 */
2734 printf("Locked vnodes\n");
2735 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2736 nmp = TAILQ_NEXT(mp, mnt_list);
2737 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2738 if (VOP_ISLOCKED(vp, NULL))
2739 vprint(NULL, vp);
2740 }
2741 nmp = TAILQ_NEXT(mp, mnt_list);
2742 }
2743}
2744#endif
2745
2746/*
2747 * Fill in a struct xvfsconf based on a struct vfsconf.
2748 */
2749static void
2750vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2751{
2752
2753 strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2754 xvfsp->vfc_typenum = vfsp->vfc_typenum;
2755 xvfsp->vfc_refcount = vfsp->vfc_refcount;
2756 xvfsp->vfc_flags = vfsp->vfc_flags;
2757 /*
2758 * These are unused in userland, we keep them
2759 * to not break binary compatibility.
2760 */
2761 xvfsp->vfc_vfsops = NULL;
2762 xvfsp->vfc_next = NULL;
2763}
2764
2765/*
2766 * Top level filesystem related information gathering.
2767 */
2768static int
2769sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2770{
2771 struct vfsconf *vfsp;
2772 struct xvfsconf *xvfsp;
2773 int cnt, error, i;
2774
2775 cnt = 0;
2776 for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2777 cnt++;
2778 xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2779 /*
2780 * Handle the race we will have here once struct vfsconf is
2781 * locked down: use both cnt and a NULL check on vfc_next to
2782 * determine the end of the loop, because we will have to drop
2783 * the lock before calling malloc(). For now we are protected
2784 * by Giant.
2785 */
2786 i = 0;
2787 for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2788 vfsconf2x(vfsp, xvfsp + i);
2789 i++;
2790 }
2791 error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2792 free(xvfsp, M_TEMP);
2793 return (error);
2794}
2795
2796SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2797 "S,xvfsconf", "List of all configured filesystems");
2798
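/*
 * Illustrative sketch: userland can consume the vfs.conflist data exported
 * above through sysctlbyname(3) (getvfsbyname(3) is built on the same
 * information).  This is an ordinary userland program, shown here disabled;
 * the error handling is an assumption for the example.
 */
#if 0	/* userland example, not kernel code */
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct xvfsconf *xvfsp;
	size_t buflen;
	int cnt, i;

	if (sysctlbyname("vfs.conflist", NULL, &buflen, NULL, 0) < 0)
		exit(1);
	if ((xvfsp = malloc(buflen)) == NULL ||
	    sysctlbyname("vfs.conflist", xvfsp, &buflen, NULL, 0) < 0)
		exit(1);
	cnt = buflen / sizeof(struct xvfsconf);
	for (i = 0; i < cnt; i++)
		printf("%s: type %d, refcount %d\n", xvfsp[i].vfc_name,
		    xvfsp[i].vfc_typenum, xvfsp[i].vfc_refcount);
	free(xvfsp);
	return (0);
}
#endif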
2799#ifndef BURN_BRIDGES
2800static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2801
2802static int
2803vfs_sysctl(SYSCTL_HANDLER_ARGS)
2804{
2805 int *name = (int *)arg1 - 1; /* XXX */
2806 u_int namelen = arg2 + 1; /* XXX */
2807 struct vfsconf *vfsp;
2808 struct xvfsconf xvfsp;
2809
2810 printf("WARNING: userland calling deprecated sysctl, "
2811 "please rebuild world\n");
2812
2813#if 1 || defined(COMPAT_PRELITE2)
2814 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2815 if (namelen == 1)
2816 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2817#endif
2818
2819 switch (name[1]) {
2820 case VFS_MAXTYPENUM:
2821 if (namelen != 2)
2822 return (ENOTDIR);
2823 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2824 case VFS_CONF:
2825 if (namelen != 3)
2826 return (ENOTDIR); /* overloaded */
2827 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2828 if (vfsp->vfc_typenum == name[2])
2829 break;
2830 if (vfsp == NULL)
2831 return (EOPNOTSUPP);
2832 vfsconf2x(vfsp, &xvfsp);
2833 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2834 }
2835 return (EOPNOTSUPP);
2836}
2837
2838SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2839 "Generic filesystem");
2840
2841#if 1 || defined(COMPAT_PRELITE2)
2842
2843static int
2844sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2845{
2846 int error;
2847 struct vfsconf *vfsp;
2848 struct ovfsconf ovfs;
2849
2850 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2851 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2852 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2853 ovfs.vfc_index = vfsp->vfc_typenum;
2854 ovfs.vfc_refcount = vfsp->vfc_refcount;
2855 ovfs.vfc_flags = vfsp->vfc_flags;
2856 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2857 if (error)
2858 return error;
2859 }
2860 return 0;
2861}
2862
2863#endif /* 1 || COMPAT_PRELITE2 */
2864#endif /* !BURN_BRIDGES */
2865
2866#define KINFO_VNODESLOP 10
2867#ifdef notyet
2868/*
2869 * Dump vnode list (via sysctl).
2870 */
2871/* ARGSUSED */
2872static int
2873sysctl_vnode(SYSCTL_HANDLER_ARGS)
2874{
2875 struct xvnode *xvn;
2876 struct thread *td = req->td;
2877 struct mount *mp;
2878 struct vnode *vp;
2879 int error, len, n;
2880
2881 /*
2882 * Stale numvnodes access is not fatal here.
2883 */
2884 req->lock = 0;
2885 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2886 if (!req->oldptr)
2887 /* Make an estimate */
2888 return (SYSCTL_OUT(req, 0, len));
2889
2890 error = sysctl_wire_old_buffer(req, 0);
2891 if (error != 0)
2892 return (error);
2893 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2894 n = 0;
2895 mtx_lock(&mountlist_mtx);
2896 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2897 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2898 continue;
2899 MNT_ILOCK(mp);
2900 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2901 if (n == len / sizeof *xvn)
2902 break;
2903 vref(vp);
2904 xvn[n].xv_size = sizeof *xvn;
2905 xvn[n].xv_vnode = vp;
2906#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2907 XV_COPY(usecount);
2908 XV_COPY(writecount);
2909 XV_COPY(holdcnt);
2910 XV_COPY(id);
2911 XV_COPY(mount);
2912 XV_COPY(numoutput);
2913 XV_COPY(type);
2914#undef XV_COPY
2915 xvn[n].xv_flag = vp->v_vflag;
2916
2917 switch (vp->v_type) {
2918 case VREG:
2919 case VDIR:
2920 case VLNK:
2921 xvn[n].xv_dev = vp->v_cachedfs;
2922 xvn[n].xv_ino = vp->v_cachedid;
2923 break;
2924 case VBLK:
2925 case VCHR:
2926 if (vp->v_rdev == NULL) {
2927 vrele(vp);
2928 continue;
2929 }
2930 xvn[n].xv_dev = dev2udev(vp->v_rdev);
2931 break;
2932 case VSOCK:
2933 xvn[n].xv_socket = vp->v_socket;
2934 break;
2935 case VFIFO:
2936 xvn[n].xv_fifo = vp->v_fifoinfo;
2937 break;
2938 case VNON:
2939 case VBAD:
2940 default:
2941 /* shouldn't happen? */
2942 vrele(vp);
2943 continue;
2944 }
2945 vrele(vp);
2946 ++n;
2947 }
2948 MNT_IUNLOCK(mp);
2949 mtx_lock(&mountlist_mtx);
2950 vfs_unbusy(mp, td);
2951 if (n == len / sizeof *xvn)
2952 break;
2953 }
2954 mtx_unlock(&mountlist_mtx);
2955
2956 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2957 free(xvn, M_TEMP);
2958 return (error);
2959}
2960
2961SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2962 0, 0, sysctl_vnode, "S,xvnode", "");
2963#endif
2964
2965/*
2966 * Check to see if a filesystem is mounted on a block device.
2967 */
2968int
2969vfs_mountedon(vp)
2970 struct vnode *vp;
2971{
2972
2973 if (vp->v_rdev->si_mountpoint != NULL)
2974 return (EBUSY);
2975 return (0);
2976}
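
/*
 * Illustrative sketch: a filesystem's mount path typically calls
 * vfs_mountedon() on the device vnode before claiming it, so the same
 * device cannot carry two mounted filesystems.  The surrounding function
 * is an assumption for the example.
 */
static int
example_claim_device(struct vnode *devvp)
{
	int error;

	if ((error = vfs_mountedon(devvp)) != 0)
		return (error);		/* something is already mounted here */
	/* ... open the device and read the superblock ... */
	return (0);
}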
2977
2978/*
2979 * Unmount all filesystems. The list is traversed in reverse order
2980 * of mounting to avoid dependencies.
2981 */
2982void
2983vfs_unmountall()
2984{
2985 struct mount *mp;
2986 struct thread *td;
2987 int error;
2988
2989 if (curthread != NULL)
2990 td = curthread;
2991 else
2992 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
2993 /*
2994 * Since this only runs when rebooting, it is not interlocked.
2995 */
2996 while(!TAILQ_EMPTY(&mountlist)) {
2997 mp = TAILQ_LAST(&mountlist, mntlist);
2998 error = dounmount(mp, MNT_FORCE, td);
2999 if (error) {
3000 TAILQ_REMOVE(&mountlist, mp, mnt_list);
3001 printf("unmount of %s failed (",
3002 mp->mnt_stat.f_mntonname);
3003 if (error == EBUSY)
3004 printf("BUSY)\n");
3005 else
3006 printf("%d)\n", error);
3007 } else {
3008 /* The unmount has removed mp from the mountlist */
3009 }
3010 }
3011}
3012
3013/*
3014 * Perform msync on all vnodes under a mount point.
3015 * The mount point must be locked.
3016 */
3017void
3018vfs_msync(struct mount *mp, int flags)
3019{
3020 struct vnode *vp, *nvp;
3021 struct vm_object *obj;
3022 int tries;
3023
3024 GIANT_REQUIRED;
3025
3026 tries = 5;
3027 MNT_ILOCK(mp);
3028loop:
3029 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
3030 if (vp->v_mount != mp) {
3031 if (--tries > 0)
3032 goto loop;
3033 break;
3034 }
3035 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
3036
3037 VI_LOCK(vp);
3038 if (vp->v_iflag & VI_XLOCK) {
3039 VI_UNLOCK(vp);
3040 continue;
3041 }
3042
3043 if ((vp->v_iflag & VI_OBJDIRTY) &&
3044 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3045 MNT_IUNLOCK(mp);
3046 if (!vget(vp,
3047 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3048 curthread)) {
3049 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
3050 vput(vp);
3051 MNT_ILOCK(mp);
3052 continue;
3053 }
3054
3055 if (VOP_GETVOBJECT(vp, &obj) == 0) {
3056 VM_OBJECT_LOCK(obj);
3057 vm_object_page_clean(obj, 0, 0,
3058 flags == MNT_WAIT ?
3059 OBJPC_SYNC : OBJPC_NOSYNC);
3060 VM_OBJECT_UNLOCK(obj);
3061 }
3062 vput(vp);
3063 }
3064 MNT_ILOCK(mp);
3065 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3066 if (--tries > 0)
3067 goto loop;
3068 break;
3069 }
3070 } else
3071 VI_UNLOCK(vp);
3072 }
3073 MNT_IUNLOCK(mp);
3074}
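
/*
 * Illustrative sketch: callers choose the flags argument by how hard the
 * pages must be pushed; the syncer below uses MNT_NOWAIT for a best-effort
 * pass, while code that must not leave dirty pages behind (e.g. an unmount)
 * would use MNT_WAIT.  The wrapper function is an assumption for the
 * example.
 */
static void
example_flush_mount(struct mount *mp)
{

	/* Giant must be held, as vfs_msync() asserts above. */
	vfs_msync(mp, MNT_WAIT);
}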
3075
3076/*
3077 * Create the VM object needed for VMIO and mmap support. This
3078 * is done for all VREG files in the system. Some filesystems may
3079 * also take advantage of the additional metadata buffering capability
3080 * of the VMIO code by putting the device node into VMIO mode as well.
3081 *
3082 * vp must be locked when vfs_object_create is called.
3083 */
3084int
3085vfs_object_create(vp, td, cred)
3086 struct vnode *vp;
3087 struct thread *td;
3088 struct ucred *cred;
3089{
3090
3091 GIANT_REQUIRED;
3092 return (VOP_CREATEVOBJECT(vp, cred, td));
3093}
3094
3095/*
3096 * Mark a vnode as free, putting it up for recycling.
3097 */
3098void
3099vfree(vp)
3100 struct vnode *vp;
3101{
3102
3103 ASSERT_VI_LOCKED(vp, "vfree");
3104 mtx_lock(&vnode_free_list_mtx);
3105 KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3106 if (vp->v_iflag & VI_AGE) {
3107 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3108 } else {
3109 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3110 }
3111 freevnodes++;
3112 mtx_unlock(&vnode_free_list_mtx);
3113 vp->v_iflag &= ~VI_AGE;
3114 vp->v_iflag |= VI_FREE;
3115}
3116
3117/*
3118 * Opposite of vfree() - mark a vnode as in use.
3119 */
3120void
3121vbusy(vp)
3122 struct vnode *vp;
3123{
3124
3125 ASSERT_VI_LOCKED(vp, "vbusy");
3126 KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3127
3128 mtx_lock(&vnode_free_list_mtx);
3129 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3130 freevnodes--;
3131 mtx_unlock(&vnode_free_list_mtx);
3132
3133 vp->v_iflag &= ~(VI_FREE|VI_AGE);
3134}
3135
3136/*
3137 * Initialize per-vnode helper structure to hold poll-related state.
3138 */
3139void
3140v_addpollinfo(struct vnode *vp)
3141{
3142
3143 vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
3144 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3145}
3146
3147/*
3148 * Record a process's interest in events which might happen to
3149 * a vnode. Because poll uses the historic select-style interface
3150 * internally, this routine serves as both the ``check for any
3151 * pending events'' and the ``record my interest in future events''
3152 * functions. (These are done together, while the lock is held,
3153 * to avoid race conditions.)
3154 */
3155int
3156vn_pollrecord(vp, td, events)
3157 struct vnode *vp;
3158 struct thread *td;
3159 short events;
3160{
3161
3162 if (vp->v_pollinfo == NULL)
3163 v_addpollinfo(vp);
3164 mtx_lock(&vp->v_pollinfo->vpi_lock);
3165 if (vp->v_pollinfo->vpi_revents & events) {
3166 /*
3167 * This leaves events we are not interested
3168 * in available for the other process which
3169 * presumably had requested them
3170 * (otherwise they would never have been
3171 * recorded).
3172 */
3173 events &= vp->v_pollinfo->vpi_revents;
3174 vp->v_pollinfo->vpi_revents &= ~events;
3175
3176 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3177 return events;
3178 }
3179 vp->v_pollinfo->vpi_events |= events;
3180 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3181 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3182 return 0;
3183}
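
/*
 * Illustrative sketch: a filesystem's VOP_POLL implementation answers
 * immediately when it already knows the state and otherwise falls back on
 * vn_pollrecord() to register the caller's interest.  The predicate
 * example_data_ready() is hypothetical; the argument layout follows
 * struct vop_poll_args.
 */
static int	example_data_ready(struct vnode *);	/* hypothetical */

static int
example_poll(struct vop_poll_args *ap)
{

	if ((ap->a_events & (POLLIN | POLLRDNORM)) &&
	    example_data_ready(ap->a_vp))
		return (ap->a_events & (POLLIN | POLLRDNORM));
	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
}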
3184
3185/*
3186 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
3187 * it is possible for us to miss an event due to race conditions, but
3188 * that condition is expected to be rare, so for the moment it is the
3189 * preferred interface.
3190 */
3191void
3192vn_pollevent(vp, events)
3193 struct vnode *vp;
3194 short events;
3195{
3196
3197 if (vp->v_pollinfo == NULL)
3198 v_addpollinfo(vp);
3199 mtx_lock(&vp->v_pollinfo->vpi_lock);
3200 if (vp->v_pollinfo->vpi_events & events) {
3201 /*
3202 * We clear vpi_events so that we don't
3203 * call selwakeup() twice if two events are
3204 * posted before the polling process(es) is
3205 * awakened. This also ensures that we take at
3206 * most one selwakeup() if the polling process
3207 * is no longer interested. However, it does
3208 * mean that only one event can be noticed at
3209 * a time. (Perhaps we should only clear those
3210 * event bits which we note?) XXX
3211 */
3212 vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */
3213 vp->v_pollinfo->vpi_revents |= events;
3214 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3215 }
3216 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3217}
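
/*
 * Illustrative sketch: producers normally go through the VN_POLLEVENT()
 * wrapper named above, which only calls vn_pollevent() when pollinfo has
 * actually been attached to the vnode.  The surrounding function is an
 * assumption for the example.
 */
static void
example_note_readable(struct vnode *vp)
{

	/* New data arrived; wake up any poll(2)/select(2) waiters. */
	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
}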
3218
3219/*
3220 * Wake up anyone polling on vp because it is being revoked.
3221 * This depends on dead_poll() returning POLLHUP for correct
3222 * behavior.
3223 */
3224void
3225vn_pollgone(vp)
3226 struct vnode *vp;
3227{
3228
3229 mtx_lock(&vp->v_pollinfo->vpi_lock);
3230 VN_KNOTE(vp, NOTE_REVOKE);
3231 if (vp->v_pollinfo->vpi_events) {
3232 vp->v_pollinfo->vpi_events = 0;
3233 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3234 }
3235 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3236}
3237
3238
3239
3240/*
3241 * Routine to create and manage a filesystem syncer vnode.
3242 */
3243#define sync_close ((int (*)(struct vop_close_args *))nullop)
3244static int sync_fsync(struct vop_fsync_args *);
3245static int sync_inactive(struct vop_inactive_args *);
3246static int sync_reclaim(struct vop_reclaim_args *);
3247
3248static vop_t **sync_vnodeop_p;
3249static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3250 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
3251 { &vop_close_desc, (vop_t *) sync_close }, /* close */
3252 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
3253 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
3254 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
3255 { &vop_lock_desc, (vop_t *) vop_stdlock }, /* lock */
3256 { &vop_unlock_desc, (vop_t *) vop_stdunlock }, /* unlock */
3257 { &vop_islocked_desc, (vop_t *) vop_stdislocked }, /* islocked */
3258 { NULL, NULL }
3259};
3260static struct vnodeopv_desc sync_vnodeop_opv_desc =
3261 { &sync_vnodeop_p, sync_vnodeop_entries };
3262
3263VNODEOP_SET(sync_vnodeop_opv_desc);
3264
3265/*
3266 * Create a new filesystem syncer vnode for the specified mount point.
3267 */
3268int
3269vfs_allocate_syncvnode(mp)
3270 struct mount *mp;
3271{
3272 struct vnode *vp;
3273 static long start, incr, next;
3274 int error;
3275
3276 /* Allocate a new vnode */
3277 if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3278 mp->mnt_syncer = NULL;
3279 return (error);
3280 }
3281 vp->v_type = VNON;
3282 /*
3283 * Place the vnode onto the syncer worklist. We attempt to
3284 * scatter them about on the list so that they will go off
3285 * at evenly distributed times even if all the filesystems
3286 * are mounted at once.
3287 */
3288 next += incr;
3289 if (next == 0 || next > syncer_maxdelay) {
3290 start /= 2;
3291 incr /= 2;
3292 if (start == 0) {
3293 start = syncer_maxdelay / 2;
3294 incr = syncer_maxdelay;
3295 }
3296 next = start;
3297 }
3298 VI_LOCK(vp);
3299 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3300 VI_UNLOCK(vp);
3301 mp->mnt_syncer = vp;
3302 return (0);
3303}
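
/*
 * Illustrative sketch: mount-time code typically gives every writable mount
 * its own syncer vnode along these lines.  The wrapper function and its
 * error handling are assumptions for the example.
 */
static int
example_setup_syncer(struct mount *mp)
{
	int error;

	error = 0;
	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
		error = vfs_allocate_syncvnode(mp);
	return (error);
}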
3304
3305/*
3306 * Do a lazy sync of the filesystem.
3307 */
3308static int
3309sync_fsync(ap)
3310 struct vop_fsync_args /* {
3311 struct vnode *a_vp;
3312 struct ucred *a_cred;
3313 int a_waitfor;
3314 struct thread *a_td;
3315 } */ *ap;
3316{
3317 struct vnode *syncvp = ap->a_vp;
3318 struct mount *mp = syncvp->v_mount;
3319 struct thread *td = ap->a_td;
3320 int error, asyncflag;
3321
3322 /*
3323 * We only need to do something if this is a lazy evaluation.
3324 */
3325 if (ap->a_waitfor != MNT_LAZY)
3326 return (0);
3327
3328 /*
3329 * Move ourselves to the back of the sync list.
3330 */
3331 VI_LOCK(syncvp);
3332 vn_syncer_add_to_worklist(syncvp, syncdelay);
3333 VI_UNLOCK(syncvp);
3334
3335 /*
3336 * Walk the list of vnodes pushing all that are dirty and
3337 * not already on the sync list.
3338 */
3339 mtx_lock(&mountlist_mtx);
3340 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3341 mtx_unlock(&mountlist_mtx);
3342 return (0);
3343 }
3344 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3345 vfs_unbusy(mp, td);
3346 return (0);
3347 }
3348 asyncflag = mp->mnt_flag & MNT_ASYNC;
3349 mp->mnt_flag &= ~MNT_ASYNC;
3350 vfs_msync(mp, MNT_NOWAIT);
3351 error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3352 if (asyncflag)
3353 mp->mnt_flag |= MNT_ASYNC;
3354 vn_finished_write(mp);
3355 vfs_unbusy(mp, td);
3356 return (error);
3357}
3358
3359/*
3360 * The syncer vnode is no longer referenced.
3361 */
3362static int
3363sync_inactive(ap)
3364 struct vop_inactive_args /* {
3365 struct vnode *a_vp;
3366 struct thread *a_td;
3367 } */ *ap;
3368{
3369
3370 VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3371 vgone(ap->a_vp);
3372 return (0);
3373}
3374
3375/*
3376 * The syncer vnode is no longer needed and is being decommissioned.
3377 *
3378 * Modifications to the worklist must be protected by sync_mtx.
3379 */
3380static int
3381sync_reclaim(ap)
3382 struct vop_reclaim_args /* {
3383 struct vnode *a_vp;
3384 } */ *ap;
3385{
3386 struct vnode *vp = ap->a_vp;
3387
3388 VI_LOCK(vp);
3389 vp->v_mount->mnt_syncer = NULL;
3390 if (vp->v_iflag & VI_ONWORKLST) {
3391 mtx_lock(&sync_mtx);
3392 LIST_REMOVE(vp, v_synclist);
3393 mtx_unlock(&sync_mtx);
3394 vp->v_iflag &= ~VI_ONWORKLST;
3395 }
3396 VI_UNLOCK(vp);
3397
3398 return (0);
3399}
3400
3401/*
3402 * Extract the struct cdev * from a VCHR vnode.
3403 */
3404struct cdev *
3405vn_todev(vp)
3406 struct vnode *vp;
3407{
3408
3409 if (vp->v_type != VCHR)
3410 return (NODEV);
3410 return (NULL);
3411 return (vp->v_rdev);
3412}
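
/*
 * Illustrative sketch: with this revision vn_todev() returns NULL rather
 * than NODEV for anything that is not a VCHR vnode, so callers can test the
 * pointer directly.  The surrounding function is an assumption for the
 * example.
 */
static int
example_get_device(struct vnode *vp, struct cdev **devp)
{

	if ((*devp = vn_todev(vp)) == NULL)
		return (ENXIO);		/* not a character device */
	return (0);
}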
3413
3414/*
3415 * Check if vnode represents a disk device
3416 */
3417int
3418vn_isdisk(vp, errp)
3419 struct vnode *vp;
3420 int *errp;
3421{
3422 int error;
3423
3424 error = 0;
3425 if (vp->v_type != VCHR)
3426 error = ENOTBLK;
3427 else if (vp->v_rdev == NULL)
3428 error = ENXIO;
3429 else if (!(devsw(vp->v_rdev)->d_flags & D_DISK))
3430 error = ENOTBLK;
3431 if (errp != NULL)
3432 *errp = error;
3433 return (error == 0);
3434}
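
/*
 * Illustrative sketch: vn_isdisk() returns a boolean and optionally reports
 * the reason, so callers that require a disk commonly use it like this.
 * The surrounding function is an assumption for the example.
 */
static int
example_require_disk(struct vnode *devvp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO from above */
	return (0);
}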
3435
3436/*
3437 * Free data allocated by namei(); see namei(9) for details.
3438 */
3439void
3440NDFREE(ndp, flags)
3441 struct nameidata *ndp;
3442 const u_int flags;
3443{
3444
3445 if (!(flags & NDF_NO_FREE_PNBUF) &&
3446 (ndp->ni_cnd.cn_flags & HASBUF)) {
3447 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3448 ndp->ni_cnd.cn_flags &= ~HASBUF;
3449 }
3450 if (!(flags & NDF_NO_DVP_UNLOCK) &&
3451 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3452 ndp->ni_dvp != ndp->ni_vp)
3453 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3454 if (!(flags & NDF_NO_DVP_RELE) &&
3455 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3456 vrele(ndp->ni_dvp);
3457 ndp->ni_dvp = NULL;
3458 }
3459 if (!(flags & NDF_NO_VP_UNLOCK) &&
3460 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3461 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3462 if (!(flags & NDF_NO_VP_RELE) &&
3463 ndp->ni_vp) {
3464 vrele(ndp->ni_vp);
3465 ndp->ni_vp = NULL;
3466 }
3467 if (!(flags & NDF_NO_STARTDIR_RELE) &&
3468 (ndp->ni_cnd.cn_flags & SAVESTART)) {
3469 vrele(ndp->ni_startdir);
3470 ndp->ni_startdir = NULL;
3471 }
3472}
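
/*
 * Illustrative sketch: a typical namei()/NDFREE() pairing.  The path buffer
 * is released as soon as the lookup is done, while the locked, referenced
 * vnode obtained by LOCKLEAF is dropped separately with vput().  The path
 * and thread arguments are assumed to come from the caller.
 */
static int
example_lookup(char *path, struct thread *td)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* free cn_pnbuf, keep ni_vp */
	/* ... use nd.ni_vp here ... */
	vput(nd.ni_vp);			/* unlock and release the vnode */
	return (0);
}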
3473
3474/*
3475 * Common filesystem object access control check routine. Accepts a
3476 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3477 * and optional call-by-reference privused argument allowing vaccess()
3478 * to indicate to the caller whether privilege was used to satisfy the
3479 * request (obsoleted). Returns 0 on success, or an errno on failure.
3480 */
3481int
3482vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3483 enum vtype type;
3484 mode_t file_mode;
3485 uid_t file_uid;
3486 gid_t file_gid;
3487 mode_t acc_mode;
3488 struct ucred *cred;
3489 int *privused;
3490{
3491 mode_t dac_granted;
3492#ifdef CAPABILITIES
3493 mode_t cap_granted;
3494#endif
3495
3496 /*
3497 * Look for a normal, non-privileged way to access the file/directory
3498 * as requested. If it exists, go with that.
3499 */
3500
3501 if (privused != NULL)
3502 *privused = 0;
3503
3504 dac_granted = 0;
3505
3506 /* Check the owner. */
3507 if (cred->cr_uid == file_uid) {
3508 dac_granted |= VADMIN;
3509 if (file_mode & S_IXUSR)
3510 dac_granted |= VEXEC;
3511 if (file_mode & S_IRUSR)
3512 dac_granted |= VREAD;
3513 if (file_mode & S_IWUSR)
3514 dac_granted |= (VWRITE | VAPPEND);
3515
3516 if ((acc_mode & dac_granted) == acc_mode)
3517 return (0);
3518
3519 goto privcheck;
3520 }
3521
3522 /* Otherwise, check the groups (first match) */
3523 if (groupmember(file_gid, cred)) {
3524 if (file_mode & S_IXGRP)
3525 dac_granted |= VEXEC;
3526 if (file_mode & S_IRGRP)
3527 dac_granted |= VREAD;
3528 if (file_mode & S_IWGRP)
3529 dac_granted |= (VWRITE | VAPPEND);
3530
3531 if ((acc_mode & dac_granted) == acc_mode)
3532 return (0);
3533
3534 goto privcheck;
3535 }
3536
3537 /* Otherwise, check everyone else. */
3538 if (file_mode & S_IXOTH)
3539 dac_granted |= VEXEC;
3540 if (file_mode & S_IROTH)
3541 dac_granted |= VREAD;
3542 if (file_mode & S_IWOTH)
3543 dac_granted |= (VWRITE | VAPPEND);
3544 if ((acc_mode & dac_granted) == acc_mode)
3545 return (0);
3546
3547privcheck:
3548 if (!suser_cred(cred, PRISON_ROOT)) {
3549 /* XXX audit: privilege used */
3550 if (privused != NULL)
3551 *privused = 1;
3552 return (0);
3553 }
3554
3555#ifdef CAPABILITIES
3556 /*
3557 * Build a capability mask to determine if the set of capabilities
3558 * satisfies the requirements when combined with the granted mask
3559 * from above.
3560 * For each capability, if the capability is required, bitwise
3561 * or the request type onto the cap_granted mask.
3562 */
3563 cap_granted = 0;
3564
3565 if (type == VDIR) {
3566 /*
3567 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3568 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3569 */
3570 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3571 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3572 cap_granted |= VEXEC;
3573 } else {
3574 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3575 !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3576 cap_granted |= VEXEC;
3577 }
3578
3579 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3580 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3581 cap_granted |= VREAD;
3582
3583 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3584 !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3585 cap_granted |= (VWRITE | VAPPEND);
3586
3587 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3588 !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3589 cap_granted |= VADMIN;
3590
3591 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3592 /* XXX audit: privilege used */
3593 if (privused != NULL)
3594 *privused = 1;
3595 return (0);
3596 }
3597#endif
3598
3599 return ((acc_mode & VADMIN) ? EPERM : EACCES);
3600}
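
/*
 * Illustrative sketch: a filesystem's access VOP usually just feeds its
 * per-node ownership and mode bits into vaccess().  The node structure,
 * the VTOEXAMPLE() accessor and its n_mode/n_uid/n_gid fields are
 * hypothetical stand-ins for a real filesystem's in-core inode.
 */
static int
example_access(struct vop_access_args *ap)
{
	struct example_node *np = VTOEXAMPLE(ap->a_vp);	/* hypothetical */

	return (vaccess(ap->a_vp->v_type, np->n_mode, np->n_uid, np->n_gid,
	    ap->a_mode, ap->a_cred, NULL));
}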
3601
3602/*
3603 * Credential check based on process requesting service, and per-attribute
3604 * permissions.
3605 */
3606int
3607extattr_check_cred(struct vnode *vp, int attrnamespace,
3608 struct ucred *cred, struct thread *td, int access)
3609{
3610
3611 /*
3612 * Kernel-invoked always succeeds.
3613 */
3614 if (cred == NOCRED)
3615 return (0);
3616
3617 /*
3618 * Do not allow privileged processes in jail to directly
3619 * manipulate system attributes.
3620 *
3621 * XXX What capability should apply here?
3622 * Probably CAP_SYS_SETFFLAG.
3623 */
3624 switch (attrnamespace) {
3625 case EXTATTR_NAMESPACE_SYSTEM:
3626 /* Potentially should be: return (EPERM); */
3627 return (suser_cred(cred, 0));
3628 case EXTATTR_NAMESPACE_USER:
3629 return (VOP_ACCESS(vp, access, cred, td));
3630 default:
3631 return (EPERM);
3632 }
3633}
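
/*
 * Illustrative sketch: filesystems that implement extended attributes call
 * extattr_check_cred() before touching the attribute data, passing VREAD or
 * VWRITE depending on the operation.  The surrounding VOP skeleton is an
 * assumption for the example.
 */
static int
example_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... copy the requested attribute into ap->a_uio ... */
	return (0);
}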
3634
3635#ifdef DEBUG_VFS_LOCKS
3636/*
3637 * This only exists to suppress warnings from unlocked specfs accesses. It is
3638 * no longer ok to have an unlocked VFS.
3639 */
3640#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3641
3642int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
3643int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
3644int vfs_badlock_print = 1; /* Print lock violations. */
3645
3646static void
3647vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3648{
3649
3650 if (vfs_badlock_print)
3651 printf("%s: %p %s\n", str, (void *)vp, msg);
3652 if (vfs_badlock_ddb)
3653 Debugger("lock violation");
3654}
3655
3656void
3657assert_vi_locked(struct vnode *vp, const char *str)
3658{
3659
3660 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3661 vfs_badlock("interlock is not locked but should be", str, vp);
3662}
3663
3664void
3665assert_vi_unlocked(struct vnode *vp, const char *str)
3666{
3667
3668 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3669 vfs_badlock("interlock is locked but should not be", str, vp);
3670}
3671
3672void
3673assert_vop_locked(struct vnode *vp, const char *str)
3674{
3675
3676 if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
3677 vfs_badlock("is not locked but should be", str, vp);
3678}
3679
3680void
3681assert_vop_unlocked(struct vnode *vp, const char *str)
3682{
3683
3684 if (vp && !IGNORE_LOCK(vp) &&
3685 VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
3686 vfs_badlock("is locked but should not be", str, vp);
3687}
3688
3689#if 0
3690void
3691assert_vop_elocked(struct vnode *vp, const char *str)
3692{
3693
3694 if (vp && !IGNORE_LOCK(vp) &&
3695 VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
3696 vfs_badlock("is not exclusive locked but should be", str, vp);
3697}
3698
3699void
3700assert_vop_elocked_other(struct vnode *vp, const char *str)
3701{
3702
3703 if (vp && !IGNORE_LOCK(vp) &&
3704 VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
3705 vfs_badlock("is not exclusive locked by another thread",
3706 str, vp);
3707}
3708
3709void
3710assert_vop_slocked(struct vnode *vp, const char *str)
3711{
3712
3713 if (vp && !IGNORE_LOCK(vp) &&
3714 VOP_ISLOCKED(vp, curthread) != LK_SHARED)
3715 vfs_badlock("is not locked shared but should be", str, vp);
3716}
3717#endif /* 0 */
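
/*
 * Illustrative sketch: VOP implementations state their locking contract
 * with these assertions; a method that requires a locked vnode and an
 * unheld interlock on entry might open like this.  The function itself is
 * an assumption for the example.
 */
static int
example_setattr(struct vop_setattr_args *ap)
{

	ASSERT_VOP_LOCKED(ap->a_vp, "example_setattr");
	ASSERT_VI_UNLOCKED(ap->a_vp, "example_setattr");
	/* ... apply the attribute changes ... */
	return (0);
}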
3718
3719void
3720vop_rename_pre(void *ap)
3721{
3722 struct vop_rename_args *a = ap;
3723
3724 if (a->a_tvp)
3725 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3726 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3727 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3728 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3729
3730 /* Check the source (from). */
3731 if (a->a_tdvp != a->a_fdvp)
3732 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3733 if (a->a_tvp != a->a_fvp)
3734 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3735
3736 /* Check the target. */
3737 if (a->a_tvp)
3738 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3739 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3740}
3741
3742void
3743vop_strategy_pre(void *ap)
3744{
3745 struct vop_strategy_args *a;
3746 struct buf *bp;
3747
3748 a = ap;
3749 bp = a->a_bp;
3750
3751 /*
3752 * Cluster ops lock their component buffers but not the IO container.
3753 */
3754 if ((bp->b_flags & B_CLUSTER) != 0)
3755 return;
3756
3757 if (BUF_REFCNT(bp) < 1) {
3758 if (vfs_badlock_print)
3759 printf(
3760 "VOP_STRATEGY: bp is not locked but should be\n");
3761 if (vfs_badlock_ddb)
3762 Debugger("lock violation");
3763 }
3764}
3765
3766void
3767vop_lookup_pre(void *ap)
3768{
3769 struct vop_lookup_args *a;
3770 struct vnode *dvp;
3771
3772 a = ap;
3773 dvp = a->a_dvp;
3774 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3775 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3776}
3777
3778void
3779vop_lookup_post(void *ap, int rc)
3780{
3781 struct vop_lookup_args *a;
3782 struct componentname *cnp;
3783 struct vnode *dvp;
3784 struct vnode *vp;
3785 int flags;
3786
3787 a = ap;
3788 dvp = a->a_dvp;
3789 cnp = a->a_cnp;
3790 vp = *(a->a_vpp);
3791 flags = cnp->cn_flags;
3792
3793 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3794
3795 /*
3796 * If this is the last path component for this lookup and LOCKPARENT
3797 * is set, OR if there is an error, the directory has to be locked.
3798 */
3799 if ((flags & LOCKPARENT) && (flags & ISLASTCN))
3800 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
3801 else if (rc != 0)
3802 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
3803 else if (dvp != vp)
3804 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
3805 if (flags & PDIRUNLOCK)
3806 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
3807}
3808
3809void
3810vop_lock_pre(void *ap)
3811{
3812 struct vop_lock_args *a = ap;
3813
3814 if ((a->a_flags & LK_INTERLOCK) == 0)
3815 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3816 else
3817 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3818}
3819
3820void
3821vop_lock_post(void *ap, int rc)
3822{
3823 struct vop_lock_args *a = ap;
3824
3825 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3826 if (rc == 0)
3827 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3828}
3829
3830void
3831vop_unlock_pre(void *ap)
3832{
3833 struct vop_unlock_args *a = ap;
3834
3835 if (a->a_flags & LK_INTERLOCK)
3836 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3837 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3838}
3839
3840void
3841vop_unlock_post(void *ap, int rc)
3842{
3843 struct vop_unlock_args *a = ap;
3844
3845 if (a->a_flags & LK_INTERLOCK)
3846 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3847}
3848#endif /* DEBUG_VFS_LOCKS */