vfs_subr.c: revision 126253 vs. revision 126326
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
39 */
40
41/*
42 * External virtual filesystem routines
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 126253 2004-02-26 00:27:04Z truckman $");
46__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 126326 2004-02-27 18:52:44Z jhb $");
47
48#include "opt_ddb.h"
49#include "opt_mac.h"
50
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/bio.h>
54#include <sys/buf.h>
55#include <sys/conf.h>
56#include <sys/eventhandler.h>
57#include <sys/extattr.h>
58#include <sys/fcntl.h>
59#include <sys/kernel.h>
60#include <sys/kthread.h>
61#include <sys/mac.h>
62#include <sys/malloc.h>
63#include <sys/mount.h>
64#include <sys/namei.h>
65#include <sys/sleepqueue.h>
65#include <sys/stat.h>
66#include <sys/sysctl.h>
67#include <sys/syslog.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70
71#include <vm/vm.h>
72#include <vm/vm_object.h>
73#include <vm/vm_extern.h>
74#include <vm/pmap.h>
75#include <vm/vm_map.h>
76#include <vm/vm_page.h>
77#include <vm/vm_kern.h>
78#include <vm/uma.h>
79
80static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
81
82static void addalias(struct vnode *vp, dev_t nvp_rdev);
83static void insmntque(struct vnode *vp, struct mount *mp);
84static void vclean(struct vnode *vp, int flags, struct thread *td);
85static void vlruvp(struct vnode *vp);
86static int flushbuflist(struct buf *blist, int flags, struct vnode *vp,
87 int slpflag, int slptimeo, int *errorp);
88static int vtryrecycle(struct vnode *vp);
89static void vx_lock(struct vnode *vp);
90static void vx_unlock(struct vnode *vp);
91static void vgonechrl(struct vnode *vp, struct thread *td);
92
93
94/*
95 * Number of vnodes in existence. Increased whenever getnewvnode()
96 * allocates a new vnode, never decreased.
97 */
98static unsigned long numvnodes;
99
100SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
101
102/*
103 * Conversion tables for conversion from vnode types to inode formats
104 * and back.
105 */
106enum vtype iftovt_tab[16] = {
107 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
108 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
109};
110int vttoif_tab[9] = {
111 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
112 S_IFSOCK, S_IFIFO, S_IFMT,
113};
114
115/*
116 * List of vnodes that are ready for recycling.
117 */
118static TAILQ_HEAD(freelst, vnode) vnode_free_list;
119
120/*
121 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
122 * getnewvnode() will return a newly allocated vnode.
123 */
124static u_long wantfreevnodes = 25;
125SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
126/* Number of vnodes in the free list. */
127static u_long freevnodes;
128SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
129
130/*
131 * Various variables used for debugging the new implementation of
132 * reassignbuf().
133 * XXX these are probably of (very) limited utility now.
134 */
135static int reassignbufcalls;
136SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
137static int nameileafonly;
138SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
139
140/*
141 * Cache for the mount type id assigned to NFS. This is used for
142 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
143 */
144int nfs_mount_type = -1;
145
146/* To keep more than one thread at a time from running vfs_getnewfsid */
147static struct mtx mntid_mtx;
148
149/*
150 * Lock for any access to the following:
151 * vnode_free_list
152 * numvnodes
153 * freevnodes
154 */
155static struct mtx vnode_free_list_mtx;
156
157/*
158 * For any iteration/modification of dev->si_hlist (linked through
159 * v_specnext)
160 */
161static struct mtx spechash_mtx;
162
163/* Publicly exported FS */
164struct nfs_public nfs_pub;
165
166/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
167static uma_zone_t vnode_zone;
168static uma_zone_t vnodepoll_zone;
169
170/* Set to 1 to print out reclaim of active vnodes */
171int prtactive;
172
173/*
174 * The workitem queue.
175 *
176 * It is useful to delay writes of file data and filesystem metadata
177 * for tens of seconds so that quickly created and deleted files need
178 * not waste disk bandwidth being created and removed. To realize this,
179 * we append vnodes to a "workitem" queue. When running with a soft
180 * updates implementation, most pending metadata dependencies should
181 * not wait for more than a few seconds. Thus, mounted block devices
182 * are delayed only about half the time that file data is delayed.
183 * Similarly, directory updates are more critical, so they are delayed
184 * only about a third of the time that file data is delayed. Thus, there are
185 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
186 * one each second (driven off the filesystem syncer process). The
187 * syncer_delayno variable indicates the next queue that is to be processed.
188 * Items that need to be processed soon are placed in this queue:
189 *
190 * syncer_workitem_pending[syncer_delayno]
191 *
192 * A delay of fifteen seconds is done by placing the request fifteen
193 * entries later in the queue:
194 *
195 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
196 *
197 */
198static int syncer_delayno;
199static long syncer_mask;
200LIST_HEAD(synclist, vnode);
201static struct synclist *syncer_workitem_pending;
202/*
203 * The sync_mtx protects:
204 * vp->v_synclist
205 * syncer_delayno
206 * syncer_workitem_pending
207 * rushjob
208 */
209static struct mtx sync_mtx;
210
211#define SYNCER_MAXDELAY 32
212static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
213static int syncdelay = 30; /* max time to delay syncing data */
214static int filedelay = 30; /* time to delay syncing files */
215SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
216static int dirdelay = 29; /* time to delay syncing directories */
217SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
218static int metadelay = 28; /* time to delay syncing metadata */
219SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
220static int rushjob; /* number of slots to run ASAP */
221static int stat_rush_requests; /* number of times I/O speeded up */
222SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
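/*
 * Illustrative arithmetic (assuming the usual power-of-two table returned
 * by hashinit below): SYNCER_MAXDELAY of 32 yields syncer_mask == 31, so
 * a request for a 15-second delay made while syncer_delayno is 20 lands
 * in slot (20 + 15) & 31 == 3, which the syncer reaches 15 seconds later.
 */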
223
224/*
225 * Number of vnodes we want to exist at any one time. This is mostly used
226 * to size hash tables in vnode-related code. It is normally not used in
227 * getnewvnode(), as wantfreevnodes is normally nonzero.
228 *
229 * XXX desiredvnodes is historical cruft and should not exist.
230 */
231int desiredvnodes;
232SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
233 &desiredvnodes, 0, "Maximum number of vnodes");
234static int minvnodes;
235SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
236 &minvnodes, 0, "Minimum number of vnodes");
237static int vnlru_nowhere;
238SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
239 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
240
241/* Hook for calling soft updates. */
242int (*softdep_process_worklist_hook)(struct mount *);
243
244/*
245 * Initialize the vnode management data structures.
246 */
247static void
248vntblinit(void *dummy __unused)
249{
250
251 /*
252 * Desiredvnodes is a function of the physical memory size and
253 * the kernel's heap size. Specifically, desiredvnodes scales
254 * in proportion to the physical memory size until two fifths
255 * of the kernel's heap size is consumed by vnodes and vm
256 * objects.
257 */
258 desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
259 (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
260 minvnodes = desiredvnodes / 4;
261 mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
262 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
263 mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
264 TAILQ_INIT(&vnode_free_list);
265 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
266 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
267 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
268 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
269 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
270 /*
271 * Initialize the filesystem syncer.
272 */
273 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
274 &syncer_mask);
275 syncer_maxdelay = syncer_mask + 1;
276 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
277}
278SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
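/*
 * Rough illustration of the sizing above (example numbers only): with
 * 4 KB pages, a 1 GB machine has about 256k physical pages, so the first
 * term is roughly maxproc + 64k; the second term then caps the total so
 * that vnodes and VM objects together consume at most two fifths of the
 * kernel heap.
 */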
279
280
281/*
282 * Mark a mount point as busy. Used to synchronize access and to delay
283 * unmounting. Interlock is not released on failure.
284 */
285int
286vfs_busy(mp, flags, interlkp, td)
287 struct mount *mp;
288 int flags;
289 struct mtx *interlkp;
290 struct thread *td;
291{
292 int lkflags;
293
294 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
295 if (flags & LK_NOWAIT)
296 return (ENOENT);
297 mp->mnt_kern_flag |= MNTK_MWAIT;
298 /*
299 * Since all busy locks are shared except the exclusive
300 * lock granted when unmounting, the only place that a
301 * wakeup needs to be done is at the release of the
302 * exclusive lock at the end of dounmount.
303 */
304 msleep(mp, interlkp, PVFS, "vfs_busy", 0);
305 return (ENOENT);
306 }
307 lkflags = LK_SHARED | LK_NOPAUSE;
308 if (interlkp)
309 lkflags |= LK_INTERLOCK;
310 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
311 panic("vfs_busy: unexpected lock failure");
312 return (0);
313}
314
315/*
316 * Free a busy filesystem.
317 */
318void
319vfs_unbusy(mp, td)
320 struct mount *mp;
321 struct thread *td;
322{
323
324 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
325}
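/*
 * The usual pattern, as in vnlru_proc() below, is to take mountlist_mtx,
 * call vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td) for each mount, and
 * pair every successful call with vfs_unbusy(mp, td) once the mount has
 * been processed.
 */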
326
327/*
328 * Lookup a mount point by filesystem identifier.
329 */
330struct mount *
331vfs_getvfs(fsid)
332 fsid_t *fsid;
333{
334 register struct mount *mp;
335
336 mtx_lock(&mountlist_mtx);
337 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
338 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
339 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
340 mtx_unlock(&mountlist_mtx);
341 return (mp);
342 }
343 }
344 mtx_unlock(&mountlist_mtx);
345 return ((struct mount *) 0);
346}
347
348/*
349 * Get a new unique fsid. Try to make its val[0] unique, since this value
350 * will be used to create fake device numbers for stat(). Also try (but
351 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
352 * support 16-bit device numbers. We end up with unique val[0]'s for the
353 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
354 *
355 * Keep in mind that several mounts may be running in parallel. Starting
356 * the search one past where the previous search terminated is both a
357 * micro-optimization and a defense against returning the same fsid to
358 * different mounts.
359 */
360void
361vfs_getnewfsid(mp)
362 struct mount *mp;
363{
364 static u_int16_t mntid_base;
365 fsid_t tfsid;
366 int mtype;
367
368 mtx_lock(&mntid_mtx);
369 mtype = mp->mnt_vfc->vfc_typenum;
370 tfsid.val[1] = mtype;
371 mtype = (mtype & 0xFF) << 24;
372 for (;;) {
373 tfsid.val[0] = makeudev(255,
374 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
375 mntid_base++;
376 if (vfs_getvfs(&tfsid) == NULL)
377 break;
378 }
379 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
380 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
381 mtx_unlock(&mntid_mtx);
382}
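/*
 * Example of the minor number handed to makeudev() above (illustrative
 * values): with vfc_typenum 0x05 and mntid_base 0x1234 the loop builds
 * 0x05000000 | 0x00120000 | 0x34 == 0x05120034, i.e. the type sits in
 * the top byte and the 16-bit counter is split across the middle and
 * low bytes.
 */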
383
384/*
385 * Knob to control the precision of file timestamps:
386 *
387 * 0 = seconds only; nanoseconds zeroed.
388 * 1 = seconds and nanoseconds, accurate within 1/HZ.
389 * 2 = seconds and nanoseconds, truncated to microseconds.
390 * >=3 = seconds and nanoseconds, maximum precision.
391 */
392enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
393
394static int timestamp_precision = TSP_SEC;
395SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
396 &timestamp_precision, 0, "");
397
398/*
399 * Get a current timestamp.
400 */
401void
402vfs_timestamp(tsp)
403 struct timespec *tsp;
404{
405 struct timeval tv;
406
407 switch (timestamp_precision) {
408 case TSP_SEC:
409 tsp->tv_sec = time_second;
410 tsp->tv_nsec = 0;
411 break;
412 case TSP_HZ:
413 getnanotime(tsp);
414 break;
415 case TSP_USEC:
416 microtime(&tv);
417 TIMEVAL_TO_TIMESPEC(&tv, tsp);
418 break;
419 case TSP_NSEC:
420 default:
421 nanotime(tsp);
422 break;
423 }
424}
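/*
 * Typical use by a filesystem updating inode times (sketch only; field
 * names vary by filesystem):
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts.tv_sec;
 */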
425
426/*
427 * Set vnode attributes to VNOVAL
428 */
429void
430vattr_null(vap)
431 register struct vattr *vap;
432{
433
434 vap->va_type = VNON;
435 vap->va_size = VNOVAL;
436 vap->va_bytes = VNOVAL;
437 vap->va_mode = VNOVAL;
438 vap->va_nlink = VNOVAL;
439 vap->va_uid = VNOVAL;
440 vap->va_gid = VNOVAL;
441 vap->va_fsid = VNOVAL;
442 vap->va_fileid = VNOVAL;
443 vap->va_blocksize = VNOVAL;
444 vap->va_rdev = VNOVAL;
445 vap->va_atime.tv_sec = VNOVAL;
446 vap->va_atime.tv_nsec = VNOVAL;
447 vap->va_mtime.tv_sec = VNOVAL;
448 vap->va_mtime.tv_nsec = VNOVAL;
449 vap->va_ctime.tv_sec = VNOVAL;
450 vap->va_ctime.tv_nsec = VNOVAL;
451 vap->va_birthtime.tv_sec = VNOVAL;
452 vap->va_birthtime.tv_nsec = VNOVAL;
453 vap->va_flags = VNOVAL;
454 vap->va_gen = VNOVAL;
455 vap->va_vaflags = 0;
456}
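/*
 * Callers normally reach this through the VATTR_NULL() macro and then set
 * only the fields they intend to change before VOP_SETATTR(), e.g.
 * (sketch):
 *
 *	VATTR_NULL(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred, td);
 */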
457
458/*
459 * This routine is called when we have too many vnodes. It attempts
460 * to free <count> vnodes and will potentially free vnodes that still
461 * have VM backing store (VM backing store is typically the cause
462 * of a vnode blowout so we want to do this). Therefore, this operation
463 * is not considered cheap.
464 *
465 * A number of conditions may prevent a vnode from being reclaimed.
466 * The buffer cache may have references on the vnode, a directory
467 * vnode may still have references due to the namei cache representing
468 * underlying files, or the vnode may be in active use. It is not
469 * desirable to reuse such vnodes. These conditions may cause the
470 * number of vnodes to reach some minimum value regardless of what
471 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
472 */
473static int
474vlrureclaim(struct mount *mp)
475{
476 struct vnode *vp;
477 int done;
478 int trigger;
479 int usevnodes;
480 int count;
481
482 /*
483 * Calculate the trigger point, don't allow user
484 * screwups to blow us up. This prevents us from
485 * recycling vnodes with lots of resident pages. We
486 * aren't trying to free memory, we are trying to
487 * free vnodes.
488 */
489 usevnodes = desiredvnodes;
490 if (usevnodes <= 0)
491 usevnodes = 1;
492 trigger = cnt.v_page_count * 2 / usevnodes;
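	/*
	 * For example (illustrative numbers only): with 256k physical
	 * pages and desiredvnodes of 64k, trigger is 8, so vnodes holding
	 * 8 or more resident pages are skipped below.
	 */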
493
494 done = 0;
495 MNT_ILOCK(mp);
496 count = mp->mnt_nvnodelistsize / 10 + 1;
497 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
498 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
499 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
500
501 if (vp->v_type != VNON &&
502 vp->v_type != VBAD &&
503 VI_TRYLOCK(vp)) {
504 if (VMIGHTFREE(vp) && /* critical path opt */
505 (vp->v_object == NULL ||
506 vp->v_object->resident_page_count < trigger)) {
507 MNT_IUNLOCK(mp);
508 vgonel(vp, curthread);
509 done++;
510 MNT_ILOCK(mp);
511 } else
512 VI_UNLOCK(vp);
513 }
514 --count;
515 }
516 MNT_IUNLOCK(mp);
517 return done;
518}
519
520/*
521 * Attempt to recycle vnodes in a context that is always safe to block.
522 * Calling vlrureclaim() from the bowels of filesystem code has some
523 * interesting deadlock problems.
524 */
525static struct proc *vnlruproc;
526static int vnlruproc_sig;
527
528static void
529vnlru_proc(void)
530{
531 struct mount *mp, *nmp;
532 int done;
533 struct proc *p = vnlruproc;
534 struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
535
536 mtx_lock(&Giant);
537
538 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
539 SHUTDOWN_PRI_FIRST);
540
541 for (;;) {
542 kthread_suspend_check(p);
543 mtx_lock(&vnode_free_list_mtx);
544 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
545 mtx_unlock(&vnode_free_list_mtx);
546 vnlruproc_sig = 0;
547 wakeup(&vnlruproc_sig);
548 tsleep(vnlruproc, PVFS, "vlruwt", hz);
549 continue;
550 }
551 mtx_unlock(&vnode_free_list_mtx);
552 done = 0;
553 mtx_lock(&mountlist_mtx);
554 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
555 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
556 nmp = TAILQ_NEXT(mp, mnt_list);
557 continue;
558 }
559 done += vlrureclaim(mp);
560 mtx_lock(&mountlist_mtx);
561 nmp = TAILQ_NEXT(mp, mnt_list);
562 vfs_unbusy(mp, td);
563 }
564 mtx_unlock(&mountlist_mtx);
565 if (done == 0) {
566#if 0
567 /* These messages are temporary debugging aids */
568 if (vnlru_nowhere < 5)
569 printf("vnlru process getting nowhere..\n");
570 else if (vnlru_nowhere == 5)
571 printf("vnlru process messages stopped.\n");
572#endif
573 vnlru_nowhere++;
574 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
575 }
576 }
577}
578
579static struct kproc_desc vnlru_kp = {
580 "vnlru",
581 vnlru_proc,
582 &vnlruproc
583};
584SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
585
586
587/*
588 * Routines having to do with the management of the vnode table.
589 */
590
591/*
592 * Check to see if a free vnode can be recycled. If it can,
593 * recycle it and return it with the vnode interlock held.
594 */
595static int
596vtryrecycle(struct vnode *vp)
597{
598 struct thread *td = curthread;
599 vm_object_t object;
600 struct mount *vnmp;
601 int error;
602
603 /* Don't recycle if we can't get the interlock */
604 if (!VI_TRYLOCK(vp))
605 return (EWOULDBLOCK);
606 /*
607 * This vnode may be found and locked via some other list; if so, we
608 * can't recycle it yet.
609 */
610 if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
611 return (EWOULDBLOCK);
612 /*
613 * Don't recycle if its filesystem is being suspended.
614 */
615 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
616 error = EBUSY;
617 goto done;
618 }
619
620 /*
621 * Don't recycle if we still have cached pages.
622 */
623 if (VOP_GETVOBJECT(vp, &object) == 0) {
624 VM_OBJECT_LOCK(object);
625 if (object->resident_page_count ||
626 object->ref_count) {
627 VM_OBJECT_UNLOCK(object);
628 error = EBUSY;
629 goto done;
630 }
631 VM_OBJECT_UNLOCK(object);
632 }
633 if (LIST_FIRST(&vp->v_cache_src)) {
634 /*
635 * note: nameileafonly sysctl is temporary,
636 * for debugging only, and will eventually be
637 * removed.
638 */
639 if (nameileafonly > 0) {
640 /*
641 * Do not reuse namei-cached directory
642 * vnodes that have cached
643 * subdirectories.
644 */
645 if (cache_leaf_test(vp) < 0) {
646 error = EISDIR;
647 goto done;
648 }
649 } else if (nameileafonly < 0 ||
650 vmiodirenable == 0) {
651 /*
652 * Do not reuse namei-cached directory
653 * vnodes if nameileafonly is -1 or
654 * if VMIO backing for directories is
655 * turned off (otherwise we reuse them
656 * too quickly).
657 */
658 error = EBUSY;
659 goto done;
660 }
661 }
662 /*
663 * If we got this far, we need to acquire the interlock and see if
664 * anyone picked up this vnode from another list. If not, we will
665 * mark it with XLOCK via vgonel() so that anyone who does find it
666 * will skip over it.
667 */
668 VI_LOCK(vp);
669 if (VSHOULDBUSY(vp) && (vp->v_iflag & VI_XLOCK) == 0) {
670 VI_UNLOCK(vp);
671 error = EBUSY;
672 goto done;
673 }
674 mtx_lock(&vnode_free_list_mtx);
675 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
676 vp->v_iflag &= ~VI_FREE;
677 mtx_unlock(&vnode_free_list_mtx);
678 vp->v_iflag |= VI_DOOMED;
679 if (vp->v_type != VBAD) {
680 VOP_UNLOCK(vp, 0, td);
681 vgonel(vp, td);
682 VI_LOCK(vp);
683 } else
684 VOP_UNLOCK(vp, 0, td);
685 vn_finished_write(vnmp);
686 return (0);
687done:
688 VOP_UNLOCK(vp, 0, td);
689 return (error);
690}
691
692/*
693 * Return the next vnode from the free list.
694 */
695int
696getnewvnode(tag, mp, vops, vpp)
697 const char *tag;
698 struct mount *mp;
699 vop_t **vops;
700 struct vnode **vpp;
701{
702 struct vnode *vp = NULL;
703 struct vpollinfo *pollinfo = NULL;
704
705 mtx_lock(&vnode_free_list_mtx);
706
707 /*
708 * Try to reuse vnodes if we hit the max. This case only
709 * occurs in certain large-memory (2G+) configurations. We cannot
710 * attempt to directly reclaim vnodes due to nasty recursion
711 * problems.
712 */
713 while (numvnodes - freevnodes > desiredvnodes) {
714 if (vnlruproc_sig == 0) {
715 vnlruproc_sig = 1; /* avoid unnecessary wakeups */
716 wakeup(vnlruproc);
717 }
718 mtx_unlock(&vnode_free_list_mtx);
719 tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
720 mtx_lock(&vnode_free_list_mtx);
721 }
722
723 /*
724 * Attempt to reuse a vnode already on the free list, allocating
725 * a new vnode if we can't find one or if we have not reached a
726 * good minimum for good LRU performance.
727 */
728
729 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
730 int error;
731 int count;
732
733 for (count = 0; count < freevnodes; count++) {
734 vp = TAILQ_FIRST(&vnode_free_list);
735
736 KASSERT(vp->v_usecount == 0 &&
737 (vp->v_iflag & VI_DOINGINACT) == 0,
738 ("getnewvnode: free vnode isn't"));
739
740 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
741 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
742 mtx_unlock(&vnode_free_list_mtx);
743 error = vtryrecycle(vp);
744 mtx_lock(&vnode_free_list_mtx);
745 if (error == 0)
746 break;
747 vp = NULL;
748 }
749 }
750 if (vp) {
751 freevnodes--;
752 mtx_unlock(&vnode_free_list_mtx);
753
754#ifdef INVARIANTS
755 {
756 if (vp->v_data)
757 panic("cleaned vnode isn't");
758 if (vp->v_numoutput)
759 panic("Clean vnode has pending I/O's");
760 if (vp->v_writecount != 0)
761 panic("Non-zero write count");
762 }
763#endif
764 if ((pollinfo = vp->v_pollinfo) != NULL) {
765 /*
766 * To avoid lock order reversals, the call to
767 * uma_zfree() must be delayed until the vnode
768 * interlock is released.
769 */
770 vp->v_pollinfo = NULL;
771 }
772#ifdef MAC
773 mac_destroy_vnode(vp);
774#endif
775 vp->v_iflag = 0;
776 vp->v_vflag = 0;
777 vp->v_lastw = 0;
778 vp->v_lasta = 0;
779 vp->v_cstart = 0;
780 vp->v_clen = 0;
781 vp->v_socket = 0;
782 lockdestroy(vp->v_vnlock);
783 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
784 KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
785 KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
786 KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
787 KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
788 } else {
789 numvnodes++;
790 mtx_unlock(&vnode_free_list_mtx);
791
792 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
793 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
794 VI_LOCK(vp);
795 vp->v_dd = vp;
796 vp->v_vnlock = &vp->v_lock;
797 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
798 cache_purge(vp); /* Sets up v_id. */
799 LIST_INIT(&vp->v_cache_src);
800 TAILQ_INIT(&vp->v_cache_dst);
801 }
802
803 TAILQ_INIT(&vp->v_cleanblkhd);
804 TAILQ_INIT(&vp->v_dirtyblkhd);
805 vp->v_type = VNON;
806 vp->v_tag = tag;
807 vp->v_op = vops;
808 *vpp = vp;
809 vp->v_usecount = 1;
810 vp->v_data = 0;
811 vp->v_cachedid = -1;
812 VI_UNLOCK(vp);
813 if (pollinfo != NULL) {
814 mtx_destroy(&pollinfo->vpi_lock);
815 uma_zfree(vnodepoll_zone, pollinfo);
816 }
817#ifdef MAC
818 mac_init_vnode(vp);
819 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
820 mac_associate_vnode_singlelabel(mp, vp);
821#endif
822 insmntque(vp, mp);
823
824 return (0);
825}
826
827/*
828 * Move a vnode from one mount queue to another.
829 */
830static void
831insmntque(vp, mp)
832 register struct vnode *vp;
833 register struct mount *mp;
834{
835
836 /*
837 * Delete from old mount point vnode list, if on one.
838 */
839 if (vp->v_mount != NULL) {
840 MNT_ILOCK(vp->v_mount);
841 KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
842 ("bad mount point vnode list size"));
843 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
844 vp->v_mount->mnt_nvnodelistsize--;
845 MNT_IUNLOCK(vp->v_mount);
846 }
847 /*
848 * Insert into list of vnodes for the new mount point, if available.
849 */
850 if ((vp->v_mount = mp) != NULL) {
851 MNT_ILOCK(vp->v_mount);
852 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
853 mp->mnt_nvnodelistsize++;
854 MNT_IUNLOCK(vp->v_mount);
855 }
856}
857
858/*
859 * Update outstanding I/O count and do wakeup if requested.
860 */
861void
862vwakeup(bp)
863 register struct buf *bp;
864{
865 register struct vnode *vp;
866
867 bp->b_flags &= ~B_WRITEINPROG;
868 if ((vp = bp->b_vp)) {
869 VI_LOCK(vp);
870 vp->v_numoutput--;
871 if (vp->v_numoutput < 0)
872 panic("vwakeup: neg numoutput");
873 if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
874 vp->v_iflag &= ~VI_BWAIT;
875 wakeup(&vp->v_numoutput);
876 }
877 VI_UNLOCK(vp);
878 }
879}
880
881/*
882 * Flush out and invalidate all buffers associated with a vnode.
883 * Called with the underlying object locked.
884 */
885int
886vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
887 struct vnode *vp;
888 int flags;
889 struct ucred *cred;
890 struct thread *td;
891 int slpflag, slptimeo;
892{
893 struct buf *blist;
894 int error;
895 vm_object_t object;
896
897 GIANT_REQUIRED;
898
899 ASSERT_VOP_LOCKED(vp, "vinvalbuf");
900
901 VI_LOCK(vp);
902 if (flags & V_SAVE) {
903 while (vp->v_numoutput) {
904 vp->v_iflag |= VI_BWAIT;
905 error = msleep(&vp->v_numoutput, VI_MTX(vp),
906 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
907 if (error) {
908 VI_UNLOCK(vp);
909 return (error);
910 }
911 }
912 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
913 VI_UNLOCK(vp);
914 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
915 return (error);
916 /*
917 * XXX We could save a lock/unlock if this was only
918 * enabled under INVARIANTS
919 */
920 VI_LOCK(vp);
921 if (vp->v_numoutput > 0 ||
922 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
923 panic("vinvalbuf: dirty bufs");
924 }
925 }
926 /*
927 * If you alter this loop please notice that interlock is dropped and
928 * reacquired in flushbuflist. Special care is needed to ensure that
929 * no race conditions occur from this.
930 */
931 for (error = 0;;) {
932 if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
933 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
934 if (error)
935 break;
936 continue;
937 }
938 if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
939 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
940 if (error)
941 break;
942 continue;
943 }
944 break;
945 }
946 if (error) {
947 VI_UNLOCK(vp);
948 return (error);
949 }
950
951 /*
952 * Wait for I/O to complete. XXX needs cleaning up. The vnode can
953 * have write I/O in-progress but if there is a VM object then the
954 * VM object can also have read-I/O in-progress.
955 */
956 do {
957 while (vp->v_numoutput > 0) {
958 vp->v_iflag |= VI_BWAIT;
959 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
960 }
961 VI_UNLOCK(vp);
962 if (VOP_GETVOBJECT(vp, &object) == 0) {
963 VM_OBJECT_LOCK(object);
964 vm_object_pip_wait(object, "vnvlbx");
965 VM_OBJECT_UNLOCK(object);
966 }
967 VI_LOCK(vp);
968 } while (vp->v_numoutput > 0);
969 VI_UNLOCK(vp);
970
971 /*
972 * Destroy the copy in the VM cache, too.
973 */
974 if (VOP_GETVOBJECT(vp, &object) == 0) {
975 VM_OBJECT_LOCK(object);
976 vm_object_page_remove(object, 0, 0,
977 (flags & V_SAVE) ? TRUE : FALSE);
978 VM_OBJECT_UNLOCK(object);
979 }
980
981#ifdef INVARIANTS
982 VI_LOCK(vp);
983 if ((flags & (V_ALT | V_NORMAL)) == 0 &&
984 (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
985 !TAILQ_EMPTY(&vp->v_cleanblkhd)))
986 panic("vinvalbuf: flush failed");
987 VI_UNLOCK(vp);
988#endif
989 return (0);
990}
991
992/*
993 * Flush out buffers on the specified list.
994 *
995 */
996static int
997flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
998 struct buf *blist;
999 int flags;
1000 struct vnode *vp;
1001 int slpflag, slptimeo;
1002 int *errorp;
1003{
1004 struct buf *bp, *nbp;
1005 int found, error;
1006
1007 ASSERT_VI_LOCKED(vp, "flushbuflist");
1008
1009 for (found = 0, bp = blist; bp; bp = nbp) {
1010 nbp = TAILQ_NEXT(bp, b_vnbufs);
1011 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1012 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1013 continue;
1014 }
1015 found += 1;
1016 error = BUF_TIMELOCK(bp,
1017 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
1018 "flushbuf", slpflag, slptimeo);
1019 if (error) {
1020 if (error != ENOLCK)
1021 *errorp = error;
1022 goto done;
1023 }
1024 /*
1025 * XXX Since there are no node locks for NFS, I
1026 * believe there is a slight chance that a delayed
1027 * write will occur while sleeping just above, so
1028 * check for it. Note that vfs_bio_awrite expects
1029 * buffers to reside on a queue, while BUF_WRITE and
1030 * brelse do not.
1031 */
1032 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1033 (flags & V_SAVE)) {
1034
1035 if (bp->b_vp == vp) {
1036 if (bp->b_flags & B_CLUSTEROK) {
1037 vfs_bio_awrite(bp);
1038 } else {
1039 bremfree(bp);
1040 bp->b_flags |= B_ASYNC;
1041 BUF_WRITE(bp);
1042 }
1043 } else {
1044 bremfree(bp);
1045 (void) BUF_WRITE(bp);
1046 }
1047 goto done;
1048 }
1049 bremfree(bp);
1050 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1051 bp->b_flags &= ~B_ASYNC;
1052 brelse(bp);
1053 VI_LOCK(vp);
1054 }
1055 return (found);
1056done:
1057 VI_LOCK(vp);
1058 return (found);
1059}
1060
1061/*
1062 * Truncate a file's buffers and pages to a specified length. This
1063 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1064 * sync activity.
1065 */
1066int
1067vtruncbuf(vp, cred, td, length, blksize)
1068 register struct vnode *vp;
1069 struct ucred *cred;
1070 struct thread *td;
1071 off_t length;
1072 int blksize;
1073{
1074 register struct buf *bp;
1075 struct buf *nbp;
1076 int anyfreed;
1077 int trunclbn;
1078
1079 /*
1080 * Round up to the *next* lbn.
1081 */
1082 trunclbn = (length + blksize - 1) / blksize;
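	/*
	 * E.g. (illustrative): length 10000 with an 8192-byte block size
	 * gives trunclbn 2, so buffers at lblkno 2 and beyond are tossed
	 * while block 1, which still holds valid data, is kept.
	 */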
1083
1084 ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1085restart:
1086 VI_LOCK(vp);
1087 anyfreed = 1;
1088 for (;anyfreed;) {
1089 anyfreed = 0;
1090 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1091 nbp = TAILQ_NEXT(bp, b_vnbufs);
1092 if (bp->b_lblkno >= trunclbn) {
1093 if (BUF_LOCK(bp,
1094 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1095 VI_MTX(vp)) == ENOLCK)
1096 goto restart;
1097
1098 bremfree(bp);
1099 bp->b_flags |= (B_INVAL | B_RELBUF);
1100 bp->b_flags &= ~B_ASYNC;
1101 brelse(bp);
1102 anyfreed = 1;
1103
1104 if (nbp &&
1105 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1106 (nbp->b_vp != vp) ||
1107 (nbp->b_flags & B_DELWRI))) {
1108 goto restart;
1109 }
1110 VI_LOCK(vp);
1111 }
1112 }
1113
1114 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1115 nbp = TAILQ_NEXT(bp, b_vnbufs);
1116 if (bp->b_lblkno >= trunclbn) {
1117 if (BUF_LOCK(bp,
1118 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1119 VI_MTX(vp)) == ENOLCK)
1120 goto restart;
1121 bremfree(bp);
1122 bp->b_flags |= (B_INVAL | B_RELBUF);
1123 bp->b_flags &= ~B_ASYNC;
1124 brelse(bp);
1125 anyfreed = 1;
1126 if (nbp &&
1127 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1128 (nbp->b_vp != vp) ||
1129 (nbp->b_flags & B_DELWRI) == 0)) {
1130 goto restart;
1131 }
1132 VI_LOCK(vp);
1133 }
1134 }
1135 }
1136
1137 if (length > 0) {
1138restartsync:
1139 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1140 nbp = TAILQ_NEXT(bp, b_vnbufs);
1141 if (bp->b_lblkno > 0)
1142 continue;
1143 /*
1144 * Since we hold the vnode lock this should only
1145 * fail if we're racing with the buf daemon.
1146 */
1147 if (BUF_LOCK(bp,
1148 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1149 VI_MTX(vp)) == ENOLCK) {
1150 goto restart;
1151 }
1152 KASSERT((bp->b_flags & B_DELWRI),
1153 ("buf(%p) on dirty queue without DELWRI", bp));
1154
1155 bremfree(bp);
1156 bawrite(bp);
1157 VI_LOCK(vp);
1158 goto restartsync;
1159 }
1160 }
1161
1162 while (vp->v_numoutput > 0) {
1163 vp->v_iflag |= VI_BWAIT;
1164 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1165 }
1166 VI_UNLOCK(vp);
1167 vnode_pager_setsize(vp, length);
1168
1169 return (0);
1170}
1171
1172/*
1173 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1174 * a vnode.
1175 *
1176 * NOTE: We have to deal with the special case of a background bitmap
1177 * buffer, a situation where two buffers will have the same logical
1178 * block offset. We want (1) only the foreground buffer to be accessed
1179 * in a lookup and (2) to differentiate between the foreground and
1180 * background buffer in the splay tree algorithm because the splay
1181 * tree cannot normally handle multiple entities with the same 'index'.
1182 * We accomplish this by adding differentiating flags to the splay tree's
1183 * numerical domain.
1184 */
1185static
1186struct buf *
1187buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1188{
1189 struct buf dummy;
1190 struct buf *lefttreemax, *righttreemin, *y;
1191
1192 if (root == NULL)
1193 return (NULL);
1194 lefttreemax = righttreemin = &dummy;
1195 for (;;) {
1196 if (lblkno < root->b_lblkno ||
1197 (lblkno == root->b_lblkno &&
1198 (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1199 if ((y = root->b_left) == NULL)
1200 break;
1201 if (lblkno < y->b_lblkno) {
1202 /* Rotate right. */
1203 root->b_left = y->b_right;
1204 y->b_right = root;
1205 root = y;
1206 if ((y = root->b_left) == NULL)
1207 break;
1208 }
1209 /* Link into the new root's right tree. */
1210 righttreemin->b_left = root;
1211 righttreemin = root;
1212 } else if (lblkno > root->b_lblkno ||
1213 (lblkno == root->b_lblkno &&
1214 (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1215 if ((y = root->b_right) == NULL)
1216 break;
1217 if (lblkno > y->b_lblkno) {
1218 /* Rotate left. */
1219 root->b_right = y->b_left;
1220 y->b_left = root;
1221 root = y;
1222 if ((y = root->b_right) == NULL)
1223 break;
1224 }
1225 /* Link into the new root's left tree. */
1226 lefttreemax->b_right = root;
1227 lefttreemax = root;
1228 } else {
1229 break;
1230 }
1231 root = y;
1232 }
1233 /* Assemble the new root. */
1234 lefttreemax->b_right = root->b_left;
1235 righttreemin->b_left = root->b_right;
1236 root->b_left = dummy.b_right;
1237 root->b_right = dummy.b_left;
1238 return (root);
1239}
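/*
 * On return the tree has been restructured so that the closest match to
 * (lblkno, xflags) sits at the root; buf_vlist_add() and
 * buf_vlist_remove() below rely on that to work directly on the root.
 */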
1240
1241static
1242void
1243buf_vlist_remove(struct buf *bp)
1244{
1245 struct vnode *vp = bp->b_vp;
1246 struct buf *root;
1247
1248 ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1249 if (bp->b_xflags & BX_VNDIRTY) {
1250 if (bp != vp->v_dirtyblkroot) {
1251 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1252 vp->v_dirtyblkroot);
1253 KASSERT(root == bp,
1254 ("splay lookup failed during dirty remove"));
1255 }
1256 if (bp->b_left == NULL) {
1257 root = bp->b_right;
1258 } else {
1259 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1260 bp->b_left);
1261 root->b_right = bp->b_right;
1262 }
1263 vp->v_dirtyblkroot = root;
1264 TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1265 vp->v_dirtybufcnt--;
1266 } else {
1267 /* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1268 if (bp != vp->v_cleanblkroot) {
1269 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1270 vp->v_cleanblkroot);
1271 KASSERT(root == bp,
1272 ("splay lookup failed during clean remove"));
1273 }
1274 if (bp->b_left == NULL) {
1275 root = bp->b_right;
1276 } else {
1277 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1278 bp->b_left);
1279 root->b_right = bp->b_right;
1280 }
1281 vp->v_cleanblkroot = root;
1282 TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1283 vp->v_cleanbufcnt--;
1284 }
1285 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1286}
1287
1288/*
1289 * Add the buffer to the sorted clean or dirty block list using a
1290 * splay tree algorithm.
1291 *
1292 * NOTE: xflags is passed as a constant, optimizing this inline function!
1293 */
1294static
1295void
1296buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1297{
1298 struct buf *root;
1299
1300 ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1301 bp->b_xflags |= xflags;
1302 if (xflags & BX_VNDIRTY) {
1303 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1304 if (root == NULL) {
1305 bp->b_left = NULL;
1306 bp->b_right = NULL;
1307 TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1308 } else if (bp->b_lblkno < root->b_lblkno ||
1309 (bp->b_lblkno == root->b_lblkno &&
1310 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1311 bp->b_left = root->b_left;
1312 bp->b_right = root;
1313 root->b_left = NULL;
1314 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1315 } else {
1316 bp->b_right = root->b_right;
1317 bp->b_left = root;
1318 root->b_right = NULL;
1319 TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1320 root, bp, b_vnbufs);
1321 }
1322 vp->v_dirtybufcnt++;
1323 vp->v_dirtyblkroot = bp;
1324 } else {
1325 /* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1326 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1327 if (root == NULL) {
1328 bp->b_left = NULL;
1329 bp->b_right = NULL;
1330 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1331 } else if (bp->b_lblkno < root->b_lblkno ||
1332 (bp->b_lblkno == root->b_lblkno &&
1333 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1334 bp->b_left = root->b_left;
1335 bp->b_right = root;
1336 root->b_left = NULL;
1337 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1338 } else {
1339 bp->b_right = root->b_right;
1340 bp->b_left = root;
1341 root->b_right = NULL;
1342 TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1343 root, bp, b_vnbufs);
1344 }
1345 vp->v_cleanbufcnt++;
1346 vp->v_cleanblkroot = bp;
1347 }
1348}
1349
1350/*
1351 * Lookup a buffer using the splay tree. Note that we specifically avoid
1352 * shadow buffers used in background bitmap writes.
1353 *
1354 * This code isn't quite as efficient as it could be because we are maintaining
1355 * two sorted lists and do not know which list the block resides in.
1356 *
1357 * During a "make buildworld" the desired buffer is found at one of
1358 * the roots more than 60% of the time. Thus, checking both roots
1359 * before performing either splay eliminates unnecessary splays on the
1360 * first tree splayed.
1361 */
1362struct buf *
1363gbincore(struct vnode *vp, daddr_t lblkno)
1364{
1365 struct buf *bp;
1366
1367 GIANT_REQUIRED;
1368
1369 ASSERT_VI_LOCKED(vp, "gbincore");
1370 if ((bp = vp->v_cleanblkroot) != NULL &&
1371 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1372 return (bp);
1373 if ((bp = vp->v_dirtyblkroot) != NULL &&
1374 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1375 return (bp);
1376 if ((bp = vp->v_cleanblkroot) != NULL) {
1377 vp->v_cleanblkroot = bp = buf_splay(lblkno, 0, bp);
1378 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1379 return (bp);
1380 }
1381 if ((bp = vp->v_dirtyblkroot) != NULL) {
1382 vp->v_dirtyblkroot = bp = buf_splay(lblkno, 0, bp);
1383 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1384 return (bp);
1385 }
1386 return (NULL);
1387}
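/*
 * This is the lookup used by incore() and getblk() in vfs_bio.c to find
 * an existing buffer for (vp, lblkno) before deciding to allocate one.
 */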
1388
1389/*
1390 * Associate a buffer with a vnode.
1391 */
1392void
1393bgetvp(vp, bp)
1394 register struct vnode *vp;
1395 register struct buf *bp;
1396{
1397
1398 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1399
1400 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1401 ("bgetvp: bp already attached! %p", bp));
1402
1403 ASSERT_VI_LOCKED(vp, "bgetvp");
1404 vholdl(vp);
1405 bp->b_vp = vp;
1406 bp->b_dev = vn_todev(vp);
1407 /*
1408 * Insert onto list for new vnode.
1409 */
1410 buf_vlist_add(bp, vp, BX_VNCLEAN);
1411}
1412
1413/*
1414 * Disassociate a buffer from a vnode.
1415 */
1416void
1417brelvp(bp)
1418 register struct buf *bp;
1419{
1420 struct vnode *vp;
1421
1422 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1423
1424 /*
1425 * Delete from old vnode list, if on one.
1426 */
1427 vp = bp->b_vp;
1428 VI_LOCK(vp);
1429 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1430 buf_vlist_remove(bp);
1431 if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1432 vp->v_iflag &= ~VI_ONWORKLST;
1433 mtx_lock(&sync_mtx);
1434 LIST_REMOVE(vp, v_synclist);
1435 mtx_unlock(&sync_mtx);
1436 }
1437 vdropl(vp);
1438 bp->b_vp = (struct vnode *) 0;
1439 if (bp->b_object)
1440 bp->b_object = NULL;
1441 VI_UNLOCK(vp);
1442}
1443
1444/*
1445 * Add an item to the syncer work queue.
1446 */
1447static void
1448vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1449{
1450 int slot;
1451
1452 ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1453
1454 mtx_lock(&sync_mtx);
1455 if (vp->v_iflag & VI_ONWORKLST)
1456 LIST_REMOVE(vp, v_synclist);
1457 else
1458 vp->v_iflag |= VI_ONWORKLST;
1459
1460 if (delay > syncer_maxdelay - 2)
1461 delay = syncer_maxdelay - 2;
1462 slot = (syncer_delayno + delay) & syncer_mask;
1463
1464 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1465 mtx_unlock(&sync_mtx);
1466}
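/*
 * The delay argument selects one of the knobs declared above: file data
 * typically uses filedelay (30s), directories dirdelay (29s) and
 * metadata metadelay (28s), so metadata drains slightly ahead of the
 * data that references it.
 */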
1467
1468struct proc *updateproc;
1469static void sched_sync(void);
1470static struct kproc_desc up_kp = {
1471 "syncer",
1472 sched_sync,
1473 &updateproc
1474};
1475SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1476
1477/*
1478 * System filesystem synchronizer daemon.
1479 */
1480static void
1481sched_sync(void)
1482{
1483 struct synclist *next;
1484 struct synclist *slp;
1485 struct vnode *vp;
1486 struct mount *mp;
1487 long starttime;
1488 struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */
1489
1490 mtx_lock(&Giant);
1491
1492 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1493 SHUTDOWN_PRI_LAST);
1494
1495 for (;;) {
1496 kthread_suspend_check(td->td_proc);
1497
1498 starttime = time_second;
1499
1500 /*
1501 * Push files whose dirty time has expired. Be careful
1502 * of interrupt race on slp queue.
1503 */
1504 mtx_lock(&sync_mtx);
1505 slp = &syncer_workitem_pending[syncer_delayno];
1506 syncer_delayno += 1;
1507 if (syncer_delayno == syncer_maxdelay)
1508 syncer_delayno = 0;
1509 next = &syncer_workitem_pending[syncer_delayno];
1510
1511 while ((vp = LIST_FIRST(slp)) != NULL) {
1512 if (VOP_ISLOCKED(vp, NULL) != 0 ||
1513 vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1514 LIST_REMOVE(vp, v_synclist);
1515 LIST_INSERT_HEAD(next, vp, v_synclist);
1516 continue;
1517 }
1518 if (VI_TRYLOCK(vp) == 0) {
1519 LIST_REMOVE(vp, v_synclist);
1520 LIST_INSERT_HEAD(next, vp, v_synclist);
1521 vn_finished_write(mp);
1522 continue;
1523 }
1524 /*
1525 * We use vhold in case the vnode does not
1526 * successfully sync. vhold prevents the vnode from
1527 * going away when we unlock the sync_mtx so that
1528 * we can acquire the vnode interlock.
1529 */
1530 vholdl(vp);
1531 mtx_unlock(&sync_mtx);
1532 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
1533 (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1534 VOP_UNLOCK(vp, 0, td);
1535 vn_finished_write(mp);
1536 VI_LOCK(vp);
1537 if ((vp->v_iflag & VI_ONWORKLST) != 0) {
1538 /*
1539 * Put us back on the worklist. The worklist
1540 * routine will remove us from our current
1541 * position and then add us back in at a later
1542 * position.
1543 */
1544 vn_syncer_add_to_worklist(vp, syncdelay);
1545 }
1546 vdropl(vp);
1547 VI_UNLOCK(vp);
1548 mtx_lock(&sync_mtx);
1549 }
1550 mtx_unlock(&sync_mtx);
1551
1552 /*
1553 * Do soft update processing.
1554 */
1555 if (softdep_process_worklist_hook != NULL)
1556 (*softdep_process_worklist_hook)(NULL);
1557
1558 /*
1559 * The variable rushjob allows the kernel to speed up the
1560 * processing of the filesystem syncer process. A rushjob
1561 * value of N tells the filesystem syncer to process the next
1562 * N seconds worth of work on its queue ASAP. Currently rushjob
1563 * is used by the soft update code to speed up the filesystem
1564 * syncer process when the incore state is getting so far
1565 * ahead of the disk that the kernel memory pool is being
1566 * threatened with exhaustion.
1567 */
1568 mtx_lock(&sync_mtx);
1569 if (rushjob > 0) {
1570 rushjob -= 1;
1571 mtx_unlock(&sync_mtx);
1572 continue;
1573 }
1574 mtx_unlock(&sync_mtx);
1575 /*
1576 * If it has taken us less than a second to process the
1577 * current work, then wait. Otherwise start right over
1578 * again. We can still lose time if any single round
1579 * takes more than two seconds, but it does not really
1580 * matter as we are just trying to generally pace the
1581 * filesystem activity.
1582 */
1583 if (time_second == starttime)
1584 tsleep(&lbolt, PPAUSE, "syncer", 0);
1585 }
1586}
1587
1588/*
1589 * Request the syncer daemon to speed up its work.
1590 * We never push it to speed up more than half of its
1591 * normal turn time, otherwise it could take over the cpu.
1592 * XXXKSE only one update?
1593 */
1594int
1595speedup_syncer()
1596{
1597 struct thread *td;
1598 int ret = 0;
1599
1600 td = FIRST_THREAD_IN_PROC(updateproc);
66#include <sys/stat.h>
67#include <sys/sysctl.h>
68#include <sys/syslog.h>
69#include <sys/vmmeter.h>
70#include <sys/vnode.h>
71
72#include <vm/vm.h>
73#include <vm/vm_object.h>
74#include <vm/vm_extern.h>
75#include <vm/pmap.h>
76#include <vm/vm_map.h>
77#include <vm/vm_page.h>
78#include <vm/vm_kern.h>
79#include <vm/uma.h>
80
81static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
82
83static void addalias(struct vnode *vp, dev_t nvp_rdev);
84static void insmntque(struct vnode *vp, struct mount *mp);
85static void vclean(struct vnode *vp, int flags, struct thread *td);
86static void vlruvp(struct vnode *vp);
87static int flushbuflist(struct buf *blist, int flags, struct vnode *vp,
88 int slpflag, int slptimeo, int *errorp);
89static int vtryrecycle(struct vnode *vp);
90static void vx_lock(struct vnode *vp);
91static void vx_unlock(struct vnode *vp);
92static void vgonechrl(struct vnode *vp, struct thread *td);
93
94
95/*
96 * Number of vnodes in existence. Increased whenever getnewvnode()
97 * allocates a new vnode, never decreased.
98 */
99static unsigned long numvnodes;
100
101SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
102
103/*
104 * Conversion tables for conversion from vnode types to inode formats
105 * and back.
106 */
107enum vtype iftovt_tab[16] = {
108 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
109 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
110};
111int vttoif_tab[9] = {
112 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
113 S_IFSOCK, S_IFIFO, S_IFMT,
114};
115
116/*
117 * List of vnodes that are ready for recycling.
118 */
119static TAILQ_HEAD(freelst, vnode) vnode_free_list;
120
121/*
122 * Minimum number of free vnodes. If there are fewer than this free vnodes,
123 * getnewvnode() will return a newly allocated vnode.
124 */
125static u_long wantfreevnodes = 25;
126SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
127/* Number of vnodes in the free list. */
128static u_long freevnodes;
129SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
130
131/*
132 * Various variables used for debugging the new implementation of
133 * reassignbuf().
134 * XXX these are probably of (very) limited utility now.
135 */
136static int reassignbufcalls;
137SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
138static int nameileafonly;
139SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
140
141/*
142 * Cache for the mount type id assigned to NFS. This is used for
143 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
144 */
145int nfs_mount_type = -1;
146
147/* To keep more than one thread at a time from running vfs_getnewfsid */
148static struct mtx mntid_mtx;
149
150/*
151 * Lock for any access to the following:
152 * vnode_free_list
153 * numvnodes
154 * freevnodes
155 */
156static struct mtx vnode_free_list_mtx;
157
158/*
159 * For any iteration/modification of dev->si_hlist (linked through
160 * v_specnext)
161 */
162static struct mtx spechash_mtx;
163
164/* Publicly exported FS */
165struct nfs_public nfs_pub;
166
167/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
168static uma_zone_t vnode_zone;
169static uma_zone_t vnodepoll_zone;
170
171/* Set to 1 to print out reclaim of active vnodes */
172int prtactive;
173
174/*
175 * The workitem queue.
176 *
177 * It is useful to delay writes of file data and filesystem metadata
178 * for tens of seconds so that quickly created and deleted files need
179 * not waste disk bandwidth being created and removed. To realize this,
180 * we append vnodes to a "workitem" queue. When running with a soft
181 * updates implementation, most pending metadata dependencies should
182 * not wait for more than a few seconds. Thus, mounted on block devices
183 * are delayed only about a half the time that file data is delayed.
184 * Similarly, directory updates are more critical, so are only delayed
185 * about a third the time that file data is delayed. Thus, there are
186 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
187 * one each second (driven off the filesystem syncer process). The
188 * syncer_delayno variable indicates the next queue that is to be processed.
189 * Items that need to be processed soon are placed in this queue:
190 *
191 * syncer_workitem_pending[syncer_delayno]
192 *
193 * A delay of fifteen seconds is done by placing the request fifteen
194 * entries later in the queue:
195 *
196 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
197 *
198 */
199static int syncer_delayno;
200static long syncer_mask;
201LIST_HEAD(synclist, vnode);
202static struct synclist *syncer_workitem_pending;
203/*
204 * The sync_mtx protects:
205 * vp->v_synclist
206 * syncer_delayno
207 * syncer_workitem_pending
208 * rushjob
209 */
210static struct mtx sync_mtx;
211
212#define SYNCER_MAXDELAY 32
213static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
214static int syncdelay = 30; /* max time to delay syncing data */
215static int filedelay = 30; /* time to delay syncing files */
216SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
217static int dirdelay = 29; /* time to delay syncing directories */
218SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
219static int metadelay = 28; /* time to delay syncing metadata */
220SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
221static int rushjob; /* number of slots to run ASAP */
222static int stat_rush_requests; /* number of times I/O speeded up */
223SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
224
225/*
226 * Number of vnodes we want to exist at any one time. This is mostly used
227 * to size hash tables in vnode-related code. It is normally not used in
228 * getnewvnode(), as wantfreevnodes is normally nonzero.)
229 *
230 * XXX desiredvnodes is historical cruft and should not exist.
231 */
232int desiredvnodes;
233SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
234 &desiredvnodes, 0, "Maximum number of vnodes");
235static int minvnodes;
236SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
237 &minvnodes, 0, "Minimum number of vnodes");
238static int vnlru_nowhere;
239SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
240 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
241
242/* Hook for calling soft updates. */
243int (*softdep_process_worklist_hook)(struct mount *);
244
245/*
246 * Initialize the vnode management data structures.
247 */
248static void
249vntblinit(void *dummy __unused)
250{
251
252 /*
253 * Desiredvnodes is a function of the physical memory size and
254 * the kernel's heap size. Specifically, desiredvnodes scales
255 * in proportion to the physical memory size until two fifths
256 * of the kernel's heap size is consumed by vnodes and vm
257 * objects.
258 */
259 desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
260 (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
261 minvnodes = desiredvnodes / 4;
262 mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
263 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
264 mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
265 TAILQ_INIT(&vnode_free_list);
266 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
267 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
268 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
269 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
270 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
271 /*
272 * Initialize the filesystem syncer.
273 */
274 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
275 &syncer_mask);
276 syncer_maxdelay = syncer_mask + 1;
277 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
278}
279SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
280
281
282/*
283 * Mark a mount point as busy. Used to synchronize access and to delay
284 * unmounting. Interlock is not released on failure.
285 */
286int
287vfs_busy(mp, flags, interlkp, td)
288 struct mount *mp;
289 int flags;
290 struct mtx *interlkp;
291 struct thread *td;
292{
293 int lkflags;
294
295 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
296 if (flags & LK_NOWAIT)
297 return (ENOENT);
298 mp->mnt_kern_flag |= MNTK_MWAIT;
299 /*
300 * Since all busy locks are shared except the exclusive
301 * lock granted when unmounting, the only place that a
302 * wakeup needs to be done is at the release of the
303 * exclusive lock at the end of dounmount.
304 */
305 msleep(mp, interlkp, PVFS, "vfs_busy", 0);
306 return (ENOENT);
307 }
308 lkflags = LK_SHARED | LK_NOPAUSE;
309 if (interlkp)
310 lkflags |= LK_INTERLOCK;
311 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
312 panic("vfs_busy: unexpected lock failure");
313 return (0);
314}
315
316/*
317 * Free a busy filesystem.
318 */
319void
320vfs_unbusy(mp, td)
321 struct mount *mp;
322 struct thread *td;
323{
324
325 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
326}
327
328/*
329 * Lookup a mount point by filesystem identifier.
330 */
331struct mount *
332vfs_getvfs(fsid)
333 fsid_t *fsid;
334{
335 register struct mount *mp;
336
337 mtx_lock(&mountlist_mtx);
338 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
339 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
340 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
341 mtx_unlock(&mountlist_mtx);
342 return (mp);
343 }
344 }
345 mtx_unlock(&mountlist_mtx);
346 return ((struct mount *) 0);
347}
348
349/*
350 * Get a new unique fsid. Try to make its val[0] unique, since this value
351 * will be used to create fake device numbers for stat(). Also try (but
352 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
353 * support 16-bit device numbers. We end up with unique val[0]'s for the
354 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
355 *
356 * Keep in mind that several mounts may be running in parallel. Starting
357 * the search one past where the previous search terminated is both a
358 * micro-optimization and a defense against returning the same fsid to
359 * different mounts.
360 */
361void
362vfs_getnewfsid(mp)
363 struct mount *mp;
364{
365 static u_int16_t mntid_base;
366 fsid_t tfsid;
367 int mtype;
368
369 mtx_lock(&mntid_mtx);
370 mtype = mp->mnt_vfc->vfc_typenum;
371 tfsid.val[1] = mtype;
372 mtype = (mtype & 0xFF) << 24;
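	/*
	 * Example (hypothetical values): a vfc_typenum of 5 and a mntid_base
	 * of 0x1234 produce a second makeudev() argument of
	 * 0x05000000 | 0x00120000 | 0x34 == 0x05120034.
	 */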
373 for (;;) {
374 tfsid.val[0] = makeudev(255,
375 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
376 mntid_base++;
377 if (vfs_getvfs(&tfsid) == NULL)
378 break;
379 }
380 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
381 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
382 mtx_unlock(&mntid_mtx);
383}
384
385/*
386 * Knob to control the precision of file timestamps:
387 *
388 * 0 = seconds only; nanoseconds zeroed.
389 * 1 = seconds and nanoseconds, accurate within 1/HZ.
390 * 2 = seconds and nanoseconds, truncated to microseconds.
391 * >=3 = seconds and nanoseconds, maximum precision.
392 */
393enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
394
395static int timestamp_precision = TSP_SEC;
396SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
397 &timestamp_precision, 0, "");
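/*
 * The knob above is exported as the vfs.timestamp_precision sysctl; setting
 * it to 3 (TSP_NSEC), for example, selects full nanosecond resolution.
 */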
398
399/*
400 * Get a current timestamp.
401 */
402void
403vfs_timestamp(tsp)
404 struct timespec *tsp;
405{
406 struct timeval tv;
407
408 switch (timestamp_precision) {
409 case TSP_SEC:
410 tsp->tv_sec = time_second;
411 tsp->tv_nsec = 0;
412 break;
413 case TSP_HZ:
414 getnanotime(tsp);
415 break;
416 case TSP_USEC:
417 microtime(&tv);
418 TIMEVAL_TO_TIMESPEC(&tv, tsp);
419 break;
420 case TSP_NSEC:
421 default:
422 nanotime(tsp);
423 break;
424 }
425}
426
427/*
428 * Set vnode attributes to VNOVAL
429 */
430void
431vattr_null(vap)
432 register struct vattr *vap;
433{
434
435 vap->va_type = VNON;
436 vap->va_size = VNOVAL;
437 vap->va_bytes = VNOVAL;
438 vap->va_mode = VNOVAL;
439 vap->va_nlink = VNOVAL;
440 vap->va_uid = VNOVAL;
441 vap->va_gid = VNOVAL;
442 vap->va_fsid = VNOVAL;
443 vap->va_fileid = VNOVAL;
444 vap->va_blocksize = VNOVAL;
445 vap->va_rdev = VNOVAL;
446 vap->va_atime.tv_sec = VNOVAL;
447 vap->va_atime.tv_nsec = VNOVAL;
448 vap->va_mtime.tv_sec = VNOVAL;
449 vap->va_mtime.tv_nsec = VNOVAL;
450 vap->va_ctime.tv_sec = VNOVAL;
451 vap->va_ctime.tv_nsec = VNOVAL;
452 vap->va_birthtime.tv_sec = VNOVAL;
453 vap->va_birthtime.tv_nsec = VNOVAL;
454 vap->va_flags = VNOVAL;
455 vap->va_gen = VNOVAL;
456 vap->va_vaflags = 0;
457}
458
459/*
460 * This routine is called when we have too many vnodes. It attempts
461 * to free <count> vnodes and will potentially free vnodes that still
462 * have VM backing store (VM backing store is typically the cause
463 * of a vnode blowout so we want to do this). Therefore, this operation
464 * is not considered cheap.
465 *
466 * A number of conditions may prevent a vnode from being reclaimed:
467 * the buffer cache may have references on the vnode, a directory
468 * vnode may still have references due to the namei cache representing
469 * underlying files, or the vnode may be in active use. It is not
470 * desirable to reuse such vnodes. These conditions may cause the
471 * number of vnodes to reach some minimum value regardless of what
472 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
473 */
474static int
475vlrureclaim(struct mount *mp)
476{
477 struct vnode *vp;
478 int done;
479 int trigger;
480 int usevnodes;
481 int count;
482
483 /*
484 * Calculate the trigger point; don't allow user
485 * screwups to blow us up. This prevents us from
486 * recycling vnodes with lots of resident pages. We
487 * aren't trying to free memory, we are trying to
488 * free vnodes.
489 */
490 usevnodes = desiredvnodes;
491 if (usevnodes <= 0)
492 usevnodes = 1;
493 trigger = cnt.v_page_count * 2 / usevnodes;
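	/*
	 * For instance (hypothetical values), 262144 physical pages and a
	 * desiredvnodes of 65536 yield a trigger of 8 resident pages per vnode.
	 */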
494
495 done = 0;
496 MNT_ILOCK(mp);
497 count = mp->mnt_nvnodelistsize / 10 + 1;
498 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
499 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
500 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
501
502 if (vp->v_type != VNON &&
503 vp->v_type != VBAD &&
504 VI_TRYLOCK(vp)) {
505 if (VMIGHTFREE(vp) && /* critical path opt */
506 (vp->v_object == NULL ||
507 vp->v_object->resident_page_count < trigger)) {
508 MNT_IUNLOCK(mp);
509 vgonel(vp, curthread);
510 done++;
511 MNT_ILOCK(mp);
512 } else
513 VI_UNLOCK(vp);
514 }
515 --count;
516 }
517 MNT_IUNLOCK(mp);
518 return done;
519}
520
521/*
522 * Attempt to recycle vnodes in a context that is always safe to block.
523 * Calling vlrureclaim() from the bowels of filesystem code has some
524 * interesting deadlock problems.
525 */
526static struct proc *vnlruproc;
527static int vnlruproc_sig;
528
529static void
530vnlru_proc(void)
531{
532 struct mount *mp, *nmp;
533 int done;
534 struct proc *p = vnlruproc;
535 struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
536
537 mtx_lock(&Giant);
538
539 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
540 SHUTDOWN_PRI_FIRST);
541
542 for (;;) {
543 kthread_suspend_check(p);
544 mtx_lock(&vnode_free_list_mtx);
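		/*
		 * Sleep while the number of in-use vnodes (total minus free)
		 * stays at or below 90% of desiredvnodes.
		 */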
545 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
546 mtx_unlock(&vnode_free_list_mtx);
547 vnlruproc_sig = 0;
548 wakeup(&vnlruproc_sig);
549 tsleep(vnlruproc, PVFS, "vlruwt", hz);
550 continue;
551 }
552 mtx_unlock(&vnode_free_list_mtx);
553 done = 0;
554 mtx_lock(&mountlist_mtx);
555 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
556 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
557 nmp = TAILQ_NEXT(mp, mnt_list);
558 continue;
559 }
560 done += vlrureclaim(mp);
561 mtx_lock(&mountlist_mtx);
562 nmp = TAILQ_NEXT(mp, mnt_list);
563 vfs_unbusy(mp, td);
564 }
565 mtx_unlock(&mountlist_mtx);
566 if (done == 0) {
567#if 0
568 /* These messages are temporary debugging aids */
569 if (vnlru_nowhere < 5)
570 printf("vnlru process getting nowhere..\n");
571 else if (vnlru_nowhere == 5)
572 printf("vnlru process messages stopped.\n");
573#endif
574 vnlru_nowhere++;
575 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
576 }
577 }
578}
579
580static struct kproc_desc vnlru_kp = {
581 "vnlru",
582 vnlru_proc,
583 &vnlruproc
584};
585SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
586
587
588/*
589 * Routines having to do with the management of the vnode table.
590 */
591
592/*
593 * Check to see if a free vnode can be recycled. If it can,
594 * recycle it and return it with the vnode interlock held.
595 */
596static int
597vtryrecycle(struct vnode *vp)
598{
599 struct thread *td = curthread;
600 vm_object_t object;
601 struct mount *vnmp;
602 int error;
603
604 /* Don't recycle if we can't get the interlock */
605 if (!VI_TRYLOCK(vp))
606 return (EWOULDBLOCK);
607 /*
608 * This vnode may be found and locked via some other list; if so, we
609 * can't recycle it yet.
610 */
611 if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
612 return (EWOULDBLOCK);
613 /*
614 * Don't recycle if its filesystem is being suspended.
615 */
616 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
617 error = EBUSY;
618 goto done;
619 }
620
621 /*
622 * Don't recycle if we still have cached pages.
623 */
624 if (VOP_GETVOBJECT(vp, &object) == 0) {
625 VM_OBJECT_LOCK(object);
626 if (object->resident_page_count ||
627 object->ref_count) {
628 VM_OBJECT_UNLOCK(object);
629 error = EBUSY;
630 goto done;
631 }
632 VM_OBJECT_UNLOCK(object);
633 }
634 if (LIST_FIRST(&vp->v_cache_src)) {
635 /*
636 * note: nameileafonly sysctl is temporary,
637 * for debugging only, and will eventually be
638 * removed.
639 */
640 if (nameileafonly > 0) {
641 /*
642 * Do not reuse namei-cached directory
643 * vnodes that have cached
644 * subdirectories.
645 */
646 if (cache_leaf_test(vp) < 0) {
647 error = EISDIR;
648 goto done;
649 }
650 } else if (nameileafonly < 0 ||
651 vmiodirenable == 0) {
652 /*
653 * Do not reuse namei-cached directory
654 * vnodes if nameileafonly is -1 or
655 * if VMIO backing for directories is
656 * turned off (otherwise we reuse them
657 * too quickly).
658 */
659 error = EBUSY;
660 goto done;
661 }
662 }
663 /*
664 * If we got this far, we need to acquire the interlock and see if
665 * anyone picked up this vnode from another list. If not, we will
666 * mark it with XLOCK via vgonel() so that anyone who does find it
667 * will skip over it.
668 */
669 VI_LOCK(vp);
670 if (VSHOULDBUSY(vp) && (vp->v_iflag & VI_XLOCK) == 0) {
671 VI_UNLOCK(vp);
672 error = EBUSY;
673 goto done;
674 }
675 mtx_lock(&vnode_free_list_mtx);
676 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
677 vp->v_iflag &= ~VI_FREE;
678 mtx_unlock(&vnode_free_list_mtx);
679 vp->v_iflag |= VI_DOOMED;
680 if (vp->v_type != VBAD) {
681 VOP_UNLOCK(vp, 0, td);
682 vgonel(vp, td);
683 VI_LOCK(vp);
684 } else
685 VOP_UNLOCK(vp, 0, td);
686 vn_finished_write(vnmp);
687 return (0);
688done:
689 VOP_UNLOCK(vp, 0, td);
690 return (error);
691}
692
693/*
694 * Return the next vnode from the free list.
695 */
696int
697getnewvnode(tag, mp, vops, vpp)
698 const char *tag;
699 struct mount *mp;
700 vop_t **vops;
701 struct vnode **vpp;
702{
703 struct vnode *vp = NULL;
704 struct vpollinfo *pollinfo = NULL;
705
706 mtx_lock(&vnode_free_list_mtx);
707
708 /*
709 * Try to reuse vnodes if we hit the max. This situation only
710 * occurs on certain large-memory (2G+) systems. We cannot
711 * attempt to directly reclaim vnodes due to nasty recursion
712 * problems.
713 */
714 while (numvnodes - freevnodes > desiredvnodes) {
715 if (vnlruproc_sig == 0) {
716 vnlruproc_sig = 1; /* avoid unnecessary wakeups */
717 wakeup(vnlruproc);
718 }
719 mtx_unlock(&vnode_free_list_mtx);
720 tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
721 mtx_lock(&vnode_free_list_mtx);
722 }
723
724 /*
725 * Attempt to reuse a vnode already on the free list, allocating
726 * a new vnode if we can't find one or if we have not reached a
727 * reasonable minimum for good LRU performance.
728 */
729
730 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
731 int error;
732 int count;
733
734 for (count = 0; count < freevnodes; count++) {
735 vp = TAILQ_FIRST(&vnode_free_list);
736
737 KASSERT(vp->v_usecount == 0 &&
738 (vp->v_iflag & VI_DOINGINACT) == 0,
739 ("getnewvnode: free vnode isn't"));
740
741 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
742 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
743 mtx_unlock(&vnode_free_list_mtx);
744 error = vtryrecycle(vp);
745 mtx_lock(&vnode_free_list_mtx);
746 if (error == 0)
747 break;
748 vp = NULL;
749 }
750 }
751 if (vp) {
752 freevnodes--;
753 mtx_unlock(&vnode_free_list_mtx);
754
755#ifdef INVARIANTS
756 {
757 if (vp->v_data)
758 panic("cleaned vnode isn't");
759 if (vp->v_numoutput)
760 panic("Clean vnode has pending I/O's");
761 if (vp->v_writecount != 0)
762 panic("Non-zero write count");
763 }
764#endif
765 if ((pollinfo = vp->v_pollinfo) != NULL) {
766 /*
767 * To avoid lock order reversals, the call to
768 * uma_zfree() must be delayed until the vnode
769 * interlock is released.
770 */
771 vp->v_pollinfo = NULL;
772 }
773#ifdef MAC
774 mac_destroy_vnode(vp);
775#endif
776 vp->v_iflag = 0;
777 vp->v_vflag = 0;
778 vp->v_lastw = 0;
779 vp->v_lasta = 0;
780 vp->v_cstart = 0;
781 vp->v_clen = 0;
782 vp->v_socket = 0;
783 lockdestroy(vp->v_vnlock);
784 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
785 KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
786 KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
787 KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
788 KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
789 } else {
790 numvnodes++;
791 mtx_unlock(&vnode_free_list_mtx);
792
793 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
794 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
795 VI_LOCK(vp);
796 vp->v_dd = vp;
797 vp->v_vnlock = &vp->v_lock;
798 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
799 cache_purge(vp); /* Sets up v_id. */
800 LIST_INIT(&vp->v_cache_src);
801 TAILQ_INIT(&vp->v_cache_dst);
802 }
803
804 TAILQ_INIT(&vp->v_cleanblkhd);
805 TAILQ_INIT(&vp->v_dirtyblkhd);
806 vp->v_type = VNON;
807 vp->v_tag = tag;
808 vp->v_op = vops;
809 *vpp = vp;
810 vp->v_usecount = 1;
811 vp->v_data = 0;
812 vp->v_cachedid = -1;
813 VI_UNLOCK(vp);
814 if (pollinfo != NULL) {
815 mtx_destroy(&pollinfo->vpi_lock);
816 uma_zfree(vnodepoll_zone, pollinfo);
817 }
818#ifdef MAC
819 mac_init_vnode(vp);
820 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
821 mac_associate_vnode_singlelabel(mp, vp);
822#endif
823 insmntque(vp, mp);
824
825 return (0);
826}
827
828/*
829 * Move a vnode from one mount queue to another.
830 */
831static void
832insmntque(vp, mp)
833 register struct vnode *vp;
834 register struct mount *mp;
835{
836
837 /*
838 * Delete from old mount point vnode list, if on one.
839 */
840 if (vp->v_mount != NULL) {
841 MNT_ILOCK(vp->v_mount);
842 KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
843 ("bad mount point vnode list size"));
844 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
845 vp->v_mount->mnt_nvnodelistsize--;
846 MNT_IUNLOCK(vp->v_mount);
847 }
848 /*
849 * Insert into list of vnodes for the new mount point, if available.
850 */
851 if ((vp->v_mount = mp) != NULL) {
852 MNT_ILOCK(vp->v_mount);
853 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
854 mp->mnt_nvnodelistsize++;
855 MNT_IUNLOCK(vp->v_mount);
856 }
857}
858
859/*
860 * Update outstanding I/O count and do wakeup if requested.
861 */
862void
863vwakeup(bp)
864 register struct buf *bp;
865{
866 register struct vnode *vp;
867
868 bp->b_flags &= ~B_WRITEINPROG;
869 if ((vp = bp->b_vp)) {
870 VI_LOCK(vp);
871 vp->v_numoutput--;
872 if (vp->v_numoutput < 0)
873 panic("vwakeup: neg numoutput");
874 if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
875 vp->v_iflag &= ~VI_BWAIT;
876 wakeup(&vp->v_numoutput);
877 }
878 VI_UNLOCK(vp);
879 }
880}
881
882/*
883 * Flush out and invalidate all buffers associated with a vnode.
884 * Called with the underlying object locked.
885 */
886int
887vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
888 struct vnode *vp;
889 int flags;
890 struct ucred *cred;
891 struct thread *td;
892 int slpflag, slptimeo;
893{
894 struct buf *blist;
895 int error;
896 vm_object_t object;
897
898 GIANT_REQUIRED;
899
900 ASSERT_VOP_LOCKED(vp, "vinvalbuf");
901
902 VI_LOCK(vp);
903 if (flags & V_SAVE) {
904 while (vp->v_numoutput) {
905 vp->v_iflag |= VI_BWAIT;
906 error = msleep(&vp->v_numoutput, VI_MTX(vp),
907 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
908 if (error) {
909 VI_UNLOCK(vp);
910 return (error);
911 }
912 }
913 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
914 VI_UNLOCK(vp);
915 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
916 return (error);
917 /*
918 * XXX We could save a lock/unlock if this was only
919 * enabled under INVARIANTS
920 */
921 VI_LOCK(vp);
922 if (vp->v_numoutput > 0 ||
923 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
924 panic("vinvalbuf: dirty bufs");
925 }
926 }
927 /*
928 * If you alter this loop, please note that the interlock is dropped and
929 * reacquired in flushbuflist. Special care is needed to ensure that
930 * no race conditions occur from this.
931 */
932 for (error = 0;;) {
933 if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
934 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
935 if (error)
936 break;
937 continue;
938 }
939 if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
940 flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
941 if (error)
942 break;
943 continue;
944 }
945 break;
946 }
947 if (error) {
948 VI_UNLOCK(vp);
949 return (error);
950 }
951
952 /*
953 * Wait for I/O to complete. XXX needs cleaning up. The vnode can
954 * have write I/O in-progress but if there is a VM object then the
955 * VM object can also have read-I/O in-progress.
956 */
957 do {
958 while (vp->v_numoutput > 0) {
959 vp->v_iflag |= VI_BWAIT;
960 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
961 }
962 VI_UNLOCK(vp);
963 if (VOP_GETVOBJECT(vp, &object) == 0) {
964 VM_OBJECT_LOCK(object);
965 vm_object_pip_wait(object, "vnvlbx");
966 VM_OBJECT_UNLOCK(object);
967 }
968 VI_LOCK(vp);
969 } while (vp->v_numoutput > 0);
970 VI_UNLOCK(vp);
971
972 /*
973 * Destroy the copy in the VM cache, too.
974 */
975 if (VOP_GETVOBJECT(vp, &object) == 0) {
976 VM_OBJECT_LOCK(object);
977 vm_object_page_remove(object, 0, 0,
978 (flags & V_SAVE) ? TRUE : FALSE);
979 VM_OBJECT_UNLOCK(object);
980 }
981
982#ifdef INVARIANTS
983 VI_LOCK(vp);
984 if ((flags & (V_ALT | V_NORMAL)) == 0 &&
985 (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
986 !TAILQ_EMPTY(&vp->v_cleanblkhd)))
987 panic("vinvalbuf: flush failed");
988 VI_UNLOCK(vp);
989#endif
990 return (0);
991}
992
993/*
994 * Flush out buffers on the specified list.
995 *
996 */
997static int
998flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
999 struct buf *blist;
1000 int flags;
1001 struct vnode *vp;
1002 int slpflag, slptimeo;
1003 int *errorp;
1004{
1005 struct buf *bp, *nbp;
1006 int found, error;
1007
1008 ASSERT_VI_LOCKED(vp, "flushbuflist");
1009
1010 for (found = 0, bp = blist; bp; bp = nbp) {
1011 nbp = TAILQ_NEXT(bp, b_vnbufs);
1012 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1013 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1014 continue;
1015 }
1016 found += 1;
1017 error = BUF_TIMELOCK(bp,
1018 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
1019 "flushbuf", slpflag, slptimeo);
1020 if (error) {
1021 if (error != ENOLCK)
1022 *errorp = error;
1023 goto done;
1024 }
1025 /*
1026 * XXX Since there are no node locks for NFS, I
1027 * believe there is a slight chance that a delayed
1028 * write will occur while sleeping just above, so
1029 * check for it. Note that vfs_bio_awrite expects
1030 * buffers to reside on a queue, while BUF_WRITE and
1031 * brelse do not.
1032 */
1033 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1034 (flags & V_SAVE)) {
1035
1036 if (bp->b_vp == vp) {
1037 if (bp->b_flags & B_CLUSTEROK) {
1038 vfs_bio_awrite(bp);
1039 } else {
1040 bremfree(bp);
1041 bp->b_flags |= B_ASYNC;
1042 BUF_WRITE(bp);
1043 }
1044 } else {
1045 bremfree(bp);
1046 (void) BUF_WRITE(bp);
1047 }
1048 goto done;
1049 }
1050 bremfree(bp);
1051 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1052 bp->b_flags &= ~B_ASYNC;
1053 brelse(bp);
1054 VI_LOCK(vp);
1055 }
1056 return (found);
1057done:
1058 VI_LOCK(vp);
1059 return (found);
1060}
1061
1062/*
1063 * Truncate a file's buffer and pages to a specified length. This
1064 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1065 * sync activity.
1066 */
1067int
1068vtruncbuf(vp, cred, td, length, blksize)
1069 register struct vnode *vp;
1070 struct ucred *cred;
1071 struct thread *td;
1072 off_t length;
1073 int blksize;
1074{
1075 register struct buf *bp;
1076 struct buf *nbp;
1077 int anyfreed;
1078 int trunclbn;
1079
1080 /*
1081 * Round up to the *next* lbn.
1082 */
1083 trunclbn = (length + blksize - 1) / blksize;
1084
1085 ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1086restart:
1087 VI_LOCK(vp);
1088 anyfreed = 1;
1089 for (;anyfreed;) {
1090 anyfreed = 0;
1091 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1092 nbp = TAILQ_NEXT(bp, b_vnbufs);
1093 if (bp->b_lblkno >= trunclbn) {
1094 if (BUF_LOCK(bp,
1095 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1096 VI_MTX(vp)) == ENOLCK)
1097 goto restart;
1098
1099 bremfree(bp);
1100 bp->b_flags |= (B_INVAL | B_RELBUF);
1101 bp->b_flags &= ~B_ASYNC;
1102 brelse(bp);
1103 anyfreed = 1;
1104
1105 if (nbp &&
1106 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1107 (nbp->b_vp != vp) ||
1108 (nbp->b_flags & B_DELWRI))) {
1109 goto restart;
1110 }
1111 VI_LOCK(vp);
1112 }
1113 }
1114
1115 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1116 nbp = TAILQ_NEXT(bp, b_vnbufs);
1117 if (bp->b_lblkno >= trunclbn) {
1118 if (BUF_LOCK(bp,
1119 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1120 VI_MTX(vp)) == ENOLCK)
1121 goto restart;
1122 bremfree(bp);
1123 bp->b_flags |= (B_INVAL | B_RELBUF);
1124 bp->b_flags &= ~B_ASYNC;
1125 brelse(bp);
1126 anyfreed = 1;
1127 if (nbp &&
1128 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1129 (nbp->b_vp != vp) ||
1130 (nbp->b_flags & B_DELWRI) == 0)) {
1131 goto restart;
1132 }
1133 VI_LOCK(vp);
1134 }
1135 }
1136 }
1137
1138 if (length > 0) {
1139restartsync:
1140 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1141 nbp = TAILQ_NEXT(bp, b_vnbufs);
1142 if (bp->b_lblkno > 0)
1143 continue;
1144 /*
1145 * Since we hold the vnode lock this should only
1146 * fail if we're racing with the buf daemon.
1147 */
1148 if (BUF_LOCK(bp,
1149 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1150 VI_MTX(vp)) == ENOLCK) {
1151 goto restart;
1152 }
1153 KASSERT((bp->b_flags & B_DELWRI),
1154 ("buf(%p) on dirty queue without DELWRI", bp));
1155
1156 bremfree(bp);
1157 bawrite(bp);
1158 VI_LOCK(vp);
1159 goto restartsync;
1160 }
1161 }
1162
1163 while (vp->v_numoutput > 0) {
1164 vp->v_iflag |= VI_BWAIT;
1165 msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1166 }
1167 VI_UNLOCK(vp);
1168 vnode_pager_setsize(vp, length);
1169
1170 return (0);
1171}
1172
1173/*
1174 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1175 * a vnode.
1176 *
1177 * NOTE: We have to deal with the special case of a background bitmap
1178 * buffer, a situation where two buffers will have the same logical
1179 * block offset. We want (1) only the foreground buffer to be accessed
1180 * in a lookup and (2) must differentiate between the foreground and
1181 * background buffer in the splay tree algorithm because the splay
1182 * tree cannot normally handle multiple entities with the same 'index'.
1183 * We accomplish this by adding differentiating flags to the splay tree's
1184 * numerical domain.
1185 */
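/*
 * In effect the splay key is the pair (b_lblkno, BX_BKGRDMARKER bit): when
 * two buffers share a block number, the buffer without BX_BKGRDMARKER sorts
 * first, so a lookup passing an xflags of 0 lands on the foreground buffer.
 */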
1186static
1187struct buf *
1188buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1189{
1190 struct buf dummy;
1191 struct buf *lefttreemax, *righttreemin, *y;
1192
1193 if (root == NULL)
1194 return (NULL);
1195 lefttreemax = righttreemin = &dummy;
1196 for (;;) {
1197 if (lblkno < root->b_lblkno ||
1198 (lblkno == root->b_lblkno &&
1199 (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1200 if ((y = root->b_left) == NULL)
1201 break;
1202 if (lblkno < y->b_lblkno) {
1203 /* Rotate right. */
1204 root->b_left = y->b_right;
1205 y->b_right = root;
1206 root = y;
1207 if ((y = root->b_left) == NULL)
1208 break;
1209 }
1210 /* Link into the new root's right tree. */
1211 righttreemin->b_left = root;
1212 righttreemin = root;
1213 } else if (lblkno > root->b_lblkno ||
1214 (lblkno == root->b_lblkno &&
1215 (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1216 if ((y = root->b_right) == NULL)
1217 break;
1218 if (lblkno > y->b_lblkno) {
1219 /* Rotate left. */
1220 root->b_right = y->b_left;
1221 y->b_left = root;
1222 root = y;
1223 if ((y = root->b_right) == NULL)
1224 break;
1225 }
1226 /* Link into the new root's left tree. */
1227 lefttreemax->b_right = root;
1228 lefttreemax = root;
1229 } else {
1230 break;
1231 }
1232 root = y;
1233 }
1234 /* Assemble the new root. */
1235 lefttreemax->b_right = root->b_left;
1236 righttreemin->b_left = root->b_right;
1237 root->b_left = dummy.b_right;
1238 root->b_right = dummy.b_left;
1239 return (root);
1240}
1241
1242static
1243void
1244buf_vlist_remove(struct buf *bp)
1245{
1246 struct vnode *vp = bp->b_vp;
1247 struct buf *root;
1248
1249 ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1250 if (bp->b_xflags & BX_VNDIRTY) {
1251 if (bp != vp->v_dirtyblkroot) {
1252 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1253 vp->v_dirtyblkroot);
1254 KASSERT(root == bp,
1255 ("splay lookup failed during dirty remove"));
1256 }
1257 if (bp->b_left == NULL) {
1258 root = bp->b_right;
1259 } else {
1260 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1261 bp->b_left);
1262 root->b_right = bp->b_right;
1263 }
1264 vp->v_dirtyblkroot = root;
1265 TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1266 vp->v_dirtybufcnt--;
1267 } else {
1268 /* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1269 if (bp != vp->v_cleanblkroot) {
1270 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1271 vp->v_cleanblkroot);
1272 KASSERT(root == bp,
1273 ("splay lookup failed during clean remove"));
1274 }
1275 if (bp->b_left == NULL) {
1276 root = bp->b_right;
1277 } else {
1278 root = buf_splay(bp->b_lblkno, bp->b_xflags,
1279 bp->b_left);
1280 root->b_right = bp->b_right;
1281 }
1282 vp->v_cleanblkroot = root;
1283 TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1284 vp->v_cleanbufcnt--;
1285 }
1286 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1287}
1288
1289/*
1290 * Add the buffer to the sorted clean or dirty block list using a
1291 * splay tree algorithm.
1292 *
1293 * NOTE: xflags is passed as a constant, optimizing this inline function!
1294 */
1295static
1296void
1297buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1298{
1299 struct buf *root;
1300
1301 ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1302 bp->b_xflags |= xflags;
1303 if (xflags & BX_VNDIRTY) {
1304 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1305 if (root == NULL) {
1306 bp->b_left = NULL;
1307 bp->b_right = NULL;
1308 TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1309 } else if (bp->b_lblkno < root->b_lblkno ||
1310 (bp->b_lblkno == root->b_lblkno &&
1311 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1312 bp->b_left = root->b_left;
1313 bp->b_right = root;
1314 root->b_left = NULL;
1315 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1316 } else {
1317 bp->b_right = root->b_right;
1318 bp->b_left = root;
1319 root->b_right = NULL;
1320 TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1321 root, bp, b_vnbufs);
1322 }
1323 vp->v_dirtybufcnt++;
1324 vp->v_dirtyblkroot = bp;
1325 } else {
1326 /* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1327 root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1328 if (root == NULL) {
1329 bp->b_left = NULL;
1330 bp->b_right = NULL;
1331 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1332 } else if (bp->b_lblkno < root->b_lblkno ||
1333 (bp->b_lblkno == root->b_lblkno &&
1334 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1335 bp->b_left = root->b_left;
1336 bp->b_right = root;
1337 root->b_left = NULL;
1338 TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1339 } else {
1340 bp->b_right = root->b_right;
1341 bp->b_left = root;
1342 root->b_right = NULL;
1343 TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1344 root, bp, b_vnbufs);
1345 }
1346 vp->v_cleanbufcnt++;
1347 vp->v_cleanblkroot = bp;
1348 }
1349}
1350
1351/*
1352 * Lookup a buffer using the splay tree. Note that we specifically avoid
1353 * shadow buffers used in background bitmap writes.
1354 *
1355 * This code isn't quite as efficient as it could be because we are maintaining
1356 * two sorted lists and do not know which list the block resides in.
1357 *
1358 * During a "make buildworld" the desired buffer is found at one of
1359 * the roots more than 60% of the time. Thus, checking both roots
1360 * before performing either splay eliminates unnecessary splays on the
1361 * first tree splayed.
1362 */
1363struct buf *
1364gbincore(struct vnode *vp, daddr_t lblkno)
1365{
1366 struct buf *bp;
1367
1368 GIANT_REQUIRED;
1369
1370 ASSERT_VI_LOCKED(vp, "gbincore");
1371 if ((bp = vp->v_cleanblkroot) != NULL &&
1372 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1373 return (bp);
1374 if ((bp = vp->v_dirtyblkroot) != NULL &&
1375 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1376 return (bp);
1377 if ((bp = vp->v_cleanblkroot) != NULL) {
1378 vp->v_cleanblkroot = bp = buf_splay(lblkno, 0, bp);
1379 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1380 return (bp);
1381 }
1382 if ((bp = vp->v_dirtyblkroot) != NULL) {
1383 vp->v_dirtyblkroot = bp = buf_splay(lblkno, 0, bp);
1384 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1385 return (bp);
1386 }
1387 return (NULL);
1388}
1389
1390/*
1391 * Associate a buffer with a vnode.
1392 */
1393void
1394bgetvp(vp, bp)
1395 register struct vnode *vp;
1396 register struct buf *bp;
1397{
1398
1399 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1400
1401 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1402 ("bgetvp: bp already attached! %p", bp));
1403
1404 ASSERT_VI_LOCKED(vp, "bgetvp");
1405 vholdl(vp);
1406 bp->b_vp = vp;
1407 bp->b_dev = vn_todev(vp);
1408 /*
1409 * Insert onto list for new vnode.
1410 */
1411 buf_vlist_add(bp, vp, BX_VNCLEAN);
1412}
1413
1414/*
1415 * Disassociate a buffer from a vnode.
1416 */
1417void
1418brelvp(bp)
1419 register struct buf *bp;
1420{
1421 struct vnode *vp;
1422
1423 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1424
1425 /*
1426 * Delete from old vnode list, if on one.
1427 */
1428 vp = bp->b_vp;
1429 VI_LOCK(vp);
1430 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1431 buf_vlist_remove(bp);
1432 if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1433 vp->v_iflag &= ~VI_ONWORKLST;
1434 mtx_lock(&sync_mtx);
1435 LIST_REMOVE(vp, v_synclist);
1436 mtx_unlock(&sync_mtx);
1437 }
1438 vdropl(vp);
1439 bp->b_vp = (struct vnode *) 0;
1440 if (bp->b_object)
1441 bp->b_object = NULL;
1442 VI_UNLOCK(vp);
1443}
1444
1445/*
1446 * Add an item to the syncer work queue.
1447 */
1448static void
1449vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1450{
1451 int slot;
1452
1453 ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1454
1455 mtx_lock(&sync_mtx);
1456 if (vp->v_iflag & VI_ONWORKLST)
1457 LIST_REMOVE(vp, v_synclist);
1458 else
1459 vp->v_iflag |= VI_ONWORKLST;
1460
1461 if (delay > syncer_maxdelay - 2)
1462 delay = syncer_maxdelay - 2;
1463 slot = (syncer_delayno + delay) & syncer_mask;
1464
1465 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1466 mtx_unlock(&sync_mtx);
1467}
1468
1469struct proc *updateproc;
1470static void sched_sync(void);
1471static struct kproc_desc up_kp = {
1472 "syncer",
1473 sched_sync,
1474 &updateproc
1475};
1476SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1477
1478/*
1479 * System filesystem synchronizer daemon.
1480 */
1481static void
1482sched_sync(void)
1483{
1484 struct synclist *next;
1485 struct synclist *slp;
1486 struct vnode *vp;
1487 struct mount *mp;
1488 long starttime;
1489 struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */
1490
1491 mtx_lock(&Giant);
1492
1493 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1494 SHUTDOWN_PRI_LAST);
1495
1496 for (;;) {
1497 kthread_suspend_check(td->td_proc);
1498
1499 starttime = time_second;
1500
1501 /*
1502 * Push files whose dirty time has expired. Be careful
1503 * of interrupt race on slp queue.
1504 */
1505 mtx_lock(&sync_mtx);
1506 slp = &syncer_workitem_pending[syncer_delayno];
1507 syncer_delayno += 1;
1508 if (syncer_delayno == syncer_maxdelay)
1509 syncer_delayno = 0;
1510 next = &syncer_workitem_pending[syncer_delayno];
1511
1512 while ((vp = LIST_FIRST(slp)) != NULL) {
1513 if (VOP_ISLOCKED(vp, NULL) != 0 ||
1514 vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1515 LIST_REMOVE(vp, v_synclist);
1516 LIST_INSERT_HEAD(next, vp, v_synclist);
1517 continue;
1518 }
1519 if (VI_TRYLOCK(vp) == 0) {
1520 LIST_REMOVE(vp, v_synclist);
1521 LIST_INSERT_HEAD(next, vp, v_synclist);
1522 vn_finished_write(mp);
1523 continue;
1524 }
1525 /*
1526 * We use vhold in case the vnode does not
1527 * successfully sync. vhold prevents the vnode from
1528 * going away when we unlock the sync_mtx so that
1529 * we can acquire the vnode interlock.
1530 */
1531 vholdl(vp);
1532 mtx_unlock(&sync_mtx);
1533 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
1534 (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1535 VOP_UNLOCK(vp, 0, td);
1536 vn_finished_write(mp);
1537 VI_LOCK(vp);
1538 if ((vp->v_iflag & VI_ONWORKLST) != 0) {
1539 /*
1540 * Put us back on the worklist. The worklist
1541 * routine will remove us from our current
1542 * position and then add us back in at a later
1543 * position.
1544 */
1545 vn_syncer_add_to_worklist(vp, syncdelay);
1546 }
1547 vdropl(vp);
1548 VI_UNLOCK(vp);
1549 mtx_lock(&sync_mtx);
1550 }
1551 mtx_unlock(&sync_mtx);
1552
1553 /*
1554 * Do soft update processing.
1555 */
1556 if (softdep_process_worklist_hook != NULL)
1557 (*softdep_process_worklist_hook)(NULL);
1558
1559 /*
1560 * The variable rushjob allows the kernel to speed up the
1561 * processing of the filesystem syncer process. A rushjob
1562 * value of N tells the filesystem syncer to process the next
1563 * N seconds worth of work on its queue ASAP. Currently rushjob
1564 * is used by the soft update code to speed up the filesystem
1565 * syncer process when the incore state is getting so far
1566 * ahead of the disk that the kernel memory pool is being
1567 * threatened with exhaustion.
1568 */
1569 mtx_lock(&sync_mtx);
1570 if (rushjob > 0) {
1571 rushjob -= 1;
1572 mtx_unlock(&sync_mtx);
1573 continue;
1574 }
1575 mtx_unlock(&sync_mtx);
1576 /*
1577 * If it has taken us less than a second to process the
1578 * current work, then wait. Otherwise start right over
1579 * again. We can still lose time if any single round
1580 * takes more than two seconds, but it does not really
1581 * matter as we are just trying to generally pace the
1582 * filesystem activity.
1583 */
1584 if (time_second == starttime)
1585 tsleep(&lbolt, PPAUSE, "syncer", 0);
1586 }
1587}
1588
1589/*
1590 * Request the syncer daemon to speed up its work.
1591 * We never push it to speed up more than half of its
1592 * normal turn time; otherwise it could take over the cpu.
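 * With the default syncdelay of 30, this caps rushjob at 15.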
1593 * XXXKSE only one update?
1594 */
1595int
1596speedup_syncer()
1597{
1598 struct thread *td;
1599 int ret = 0;
1600
1601 td = FIRST_THREAD_IN_PROC(updateproc);
1602 sleepq_remove(td, &lbolt);
1608 mtx_lock(&sync_mtx);
1609 if (rushjob < syncdelay / 2) {
1610 rushjob += 1;
1611 stat_rush_requests += 1;
1612 ret = 1;
1613 }
1614 mtx_unlock(&sync_mtx);
1615 return (ret);
1616}
1617
1618/*
1619 * Associate a p-buffer with a vnode.
1620 *
1621 * Also sets the B_PAGING flag to indicate that the vnode is not fully
1622 * associated with the buffer, i.e., the bp has not been linked into the
1623 * vnode or ref-counted.
1624 */
1625void
1626pbgetvp(vp, bp)
1627 register struct vnode *vp;
1628 register struct buf *bp;
1629{
1630
1631 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1632
1633 bp->b_vp = vp;
1634 bp->b_object = vp->v_object;
1635 bp->b_flags |= B_PAGING;
1636 bp->b_dev = vn_todev(vp);
1637}
1638
1639/*
1640 * Disassociate a p-buffer from a vnode.
1641 */
1642void
1643pbrelvp(bp)
1644 register struct buf *bp;
1645{
1646
1647 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1648
1649 /* XXX REMOVE ME */
1650 VI_LOCK(bp->b_vp);
1651 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1652 panic(
1653 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1654 bp,
1655 (int)bp->b_flags
1656 );
1657 }
1658 VI_UNLOCK(bp->b_vp);
1659 bp->b_vp = (struct vnode *) 0;
1660 bp->b_object = NULL;
1661 bp->b_flags &= ~B_PAGING;
1662}
1663
1664/*
1665 * Reassign a buffer from one vnode to another.
1666 * Used to assign file specific control information
1667 * (indirect blocks) to the vnode to which they belong.
1668 */
1669void
1670reassignbuf(bp, newvp)
1671 register struct buf *bp;
1672 register struct vnode *newvp;
1673{
1674 struct vnode *vp;
1675 int delay;
1676
1677 if (newvp == NULL) {
1678 printf("reassignbuf: NULL");
1679 return;
1680 }
1681 vp = bp->b_vp;
1682 ++reassignbufcalls;
1683
1684 /*
1685 * B_PAGING flagged buffers cannot be reassigned because their vp
1686 * is not fully linked in.
1687 */
1688 if (bp->b_flags & B_PAGING)
1689 panic("cannot reassign paging buffer");
1690
1691 /*
1692 * Delete from old vnode list, if on one.
1693 */
1694 VI_LOCK(vp);
1695 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1696 buf_vlist_remove(bp);
1697 if (vp != newvp) {
1698 vdropl(bp->b_vp);
1699 bp->b_vp = NULL; /* for clarification */
1700 }
1701 }
1702 if (vp != newvp) {
1703 VI_UNLOCK(vp);
1704 VI_LOCK(newvp);
1705 }
1706 /*
1707 * If dirty, put on list of dirty buffers; otherwise insert onto list
1708 * of clean buffers.
1709 */
1710 if (bp->b_flags & B_DELWRI) {
1711 if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1712 switch (newvp->v_type) {
1713 case VDIR:
1714 delay = dirdelay;
1715 break;
1716 case VCHR:
1717 if (newvp->v_rdev->si_mountpoint != NULL) {
1718 delay = metadelay;
1719 break;
1720 }
1721 /* FALLTHROUGH */
1722 default:
1723 delay = filedelay;
1724 }
1725 vn_syncer_add_to_worklist(newvp, delay);
1726 }
1727 buf_vlist_add(bp, newvp, BX_VNDIRTY);
1728 } else {
1729 buf_vlist_add(bp, newvp, BX_VNCLEAN);
1730
1731 if ((newvp->v_iflag & VI_ONWORKLST) &&
1732 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1733 mtx_lock(&sync_mtx);
1734 LIST_REMOVE(newvp, v_synclist);
1735 mtx_unlock(&sync_mtx);
1736 newvp->v_iflag &= ~VI_ONWORKLST;
1737 }
1738 }
1739 if (bp->b_vp != newvp) {
1740 bp->b_vp = newvp;
1741 vholdl(bp->b_vp);
1742 }
1743 VI_UNLOCK(newvp);
1744}
1745
1746/*
1747 * Create a vnode for a device.
1748 * Used for mounting the root filesystem.
1749 */
1750int
1751bdevvp(dev, vpp)
1752 dev_t dev;
1753 struct vnode **vpp;
1754{
1755 register struct vnode *vp;
1756 struct vnode *nvp;
1757 int error;
1758
1759 if (dev == NODEV) {
1760 *vpp = NULLVP;
1761 return (ENXIO);
1762 }
1763 if (vfinddev(dev, VCHR, vpp))
1764 return (0);
1765 error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1766 if (error) {
1767 *vpp = NULLVP;
1768 return (error);
1769 }
1770 vp = nvp;
1771 vp->v_type = VCHR;
1772 addalias(vp, dev);
1773 *vpp = vp;
1774 return (0);
1775}
1776
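/*
 * Adjust the use count of a vnode by delta, keeping the aliased device's
 * si_usecount in step for character devices. The vnode interlock is
 * expected to be held by the caller.
 */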
1777static void
1778v_incr_usecount(struct vnode *vp, int delta)
1779{
1780
1781 vp->v_usecount += delta;
1782 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1783 mtx_lock(&spechash_mtx);
1784 vp->v_rdev->si_usecount += delta;
1785 mtx_unlock(&spechash_mtx);
1786 }
1787}
1788
1789/*
1790 * Add vnode to the alias list hung off the dev_t.
1791 *
1792 * The reason for this gunk is that multiple vnodes can reference
1793 * the same physical device, so checking vp->v_usecount to see
1794 * how many users there are is inadequate; the v_usecount for
1795 * the vnodes needs to be accumulated. vcount() does that.
1796 */
1797struct vnode *
1798addaliasu(nvp, nvp_rdev)
1799 struct vnode *nvp;
1800 udev_t nvp_rdev;
1801{
1802 struct vnode *ovp;
1803 vop_t **ops;
1804 dev_t dev;
1805
1806 if (nvp->v_type == VBLK)
1807 return (nvp);
1808 if (nvp->v_type != VCHR)
1809 panic("addaliasu on non-special vnode");
1810 dev = udev2dev(nvp_rdev);
1811 if (dev == NODEV)
1812 return (nvp);
1813 /*
1814 * Check to see if we have a bdevvp vnode with no associated
1815 * filesystem. If so, we want to associate the filesystem of
1816 * the newly created vnode with the bdevvp vnode and
1817 * discard the newly created vnode rather than leaving the
1818 * bdevvp vnode lying around with no associated filesystem.
1819 */
1820 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1821 addalias(nvp, dev);
1822 return (nvp);
1823 }
1824 /*
1825 * Discard unneeded vnode, but save its node specific data.
1826 * Note that if there is a lock, it is carried over in the
1827 * node specific data to the replacement vnode.
1828 */
1829 vref(ovp);
1830 ovp->v_data = nvp->v_data;
1831 ovp->v_tag = nvp->v_tag;
1832 nvp->v_data = NULL;
1833 lockdestroy(ovp->v_vnlock);
1834 lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
1835 nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
1836 ops = ovp->v_op;
1837 ovp->v_op = nvp->v_op;
1838 if (VOP_ISLOCKED(nvp, curthread)) {
1839 VOP_UNLOCK(nvp, 0, curthread);
1840 vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1841 }
1842 nvp->v_op = ops;
1843 insmntque(ovp, nvp->v_mount);
1844 vrele(nvp);
1845 vgone(nvp);
1846 return (ovp);
1847}
1848
1849/* This is a local helper function that does the same as addaliasu, but for a
1850 * dev_t instead of a udev_t. */
1851static void
1852addalias(nvp, dev)
1853 struct vnode *nvp;
1854 dev_t dev;
1855{
1856
1857 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1858 dev_ref(dev);
1859 nvp->v_rdev = dev;
1860 VI_LOCK(nvp);
1861 mtx_lock(&spechash_mtx);
1862 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1863 dev->si_usecount += nvp->v_usecount;
1864 mtx_unlock(&spechash_mtx);
1865 VI_UNLOCK(nvp);
1866}
1867
1868/*
1869 * Grab a particular vnode from the free list, increment its
1870 * reference count and lock it. The vnode lock bit is set if the
1871 * vnode is being eliminated in vgone. The process is awakened
1872 * when the transition is completed, and an error returned to
1873 * indicate that the vnode is no longer usable (possibly having
1874 * been changed to a new filesystem type).
1875 */
1876int
1877vget(vp, flags, td)
1878 register struct vnode *vp;
1879 int flags;
1880 struct thread *td;
1881{
1882 int error;
1883
1884 /*
1885 * If the vnode is in the process of being cleaned out for
1886 * another use, we wait for the cleaning to finish and then
1887 * return failure. Cleaning is determined by checking that
1888 * the VI_XLOCK flag is set.
1889 */
1890 if ((flags & LK_INTERLOCK) == 0)
1891 VI_LOCK(vp);
1892 if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
1893 if ((flags & LK_NOWAIT) == 0) {
1894 vp->v_iflag |= VI_XWANT;
1895 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
1896 return (ENOENT);
1897 }
1898 VI_UNLOCK(vp);
1899 return (EBUSY);
1900 }
1901
1902 v_incr_usecount(vp, 1);
1903
1904 if (VSHOULDBUSY(vp))
1905 vbusy(vp);
1906 if (flags & LK_TYPE_MASK) {
1907 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1908 /*
1909 * must expand vrele here because we do not want
1910 * to call VOP_INACTIVE if the reference count
1911 * drops back to zero since it was never really
1912 * active. We must remove it from the free list
1913 * before sleeping so that multiple processes do
1914 * not try to recycle it.
1915 */
1916 VI_LOCK(vp);
1917 v_incr_usecount(vp, -1);
1918 if (VSHOULDFREE(vp))
1919 vfree(vp);
1920 else
1921 vlruvp(vp);
1922 VI_UNLOCK(vp);
1923 }
1924 return (error);
1925 }
1926 VI_UNLOCK(vp);
1927 return (0);
1928}
1929
1930/*
1931 * Increase the reference count of a vnode.
1932 */
1933void
1934vref(struct vnode *vp)
1935{
1936
1937 VI_LOCK(vp);
1938 v_incr_usecount(vp, 1);
1939 VI_UNLOCK(vp);
1940}
1941
1942/*
1943 * Return reference count of a vnode.
1944 *
1945 * The results of this call are only guaranteed when some mechanism other
1946 * than the VI lock is used to stop other processes from gaining references
1947 * to the vnode. This may be the case if the caller holds the only reference.
1948 * This is also useful when stale data is acceptable as race conditions may
1949 * be accounted for by some other means.
1950 */
1951int
1952vrefcnt(struct vnode *vp)
1953{
1954 int usecnt;
1955
1956 VI_LOCK(vp);
1957 usecnt = vp->v_usecount;
1958 VI_UNLOCK(vp);
1959
1960 return (usecnt);
1961}
1962
1963
1964/*
1965 * Vnode put/release.
1966 * If count drops to zero, call inactive routine and return to freelist.
1967 */
1968void
1969vrele(vp)
1970 struct vnode *vp;
1971{
1972 struct thread *td = curthread; /* XXX */
1973
1974 KASSERT(vp != NULL, ("vrele: null vp"));
1975
1976 VI_LOCK(vp);
1977
1978 /* Skip this v_writecount check if we're going to panic below. */
1979 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1980 ("vrele: missed vn_close"));
1981
1982 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
1983 vp->v_usecount == 1)) {
1984 v_incr_usecount(vp, -1);
1985 VI_UNLOCK(vp);
1986
1987 return;
1988 }
1989
1990 if (vp->v_usecount == 1) {
1991 v_incr_usecount(vp, -1);
1992 /*
1993 * We must call VOP_INACTIVE with the node locked. Mark
1994 * as VI_DOINGINACT to avoid recursion.
1995 */
1996 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
1997 VI_LOCK(vp);
1998 vp->v_iflag |= VI_DOINGINACT;
1999 VI_UNLOCK(vp);
2000 VOP_INACTIVE(vp, td);
2001 VI_LOCK(vp);
2002 KASSERT(vp->v_iflag & VI_DOINGINACT,
2003 ("vrele: lost VI_DOINGINACT"));
2004 vp->v_iflag &= ~VI_DOINGINACT;
2005 } else
2006 VI_LOCK(vp);
2007 if (VSHOULDFREE(vp))
2008 vfree(vp);
2009 else
2010 vlruvp(vp);
2011 VI_UNLOCK(vp);
2012
2013 } else {
2014#ifdef DIAGNOSTIC
2015 vprint("vrele: negative ref count", vp);
2016#endif
2017 VI_UNLOCK(vp);
2018 panic("vrele: negative ref cnt");
2019 }
2020}
2021
2022/*
2023 * Release an already locked vnode. This gives the same effect as
2024 * unlock+vrele(), but takes less time and avoids releasing and
2025 * re-acquiring the lock (as vrele() acquires the lock internally).
2026 */
2027void
2028vput(vp)
2029 struct vnode *vp;
2030{
2031 struct thread *td = curthread; /* XXX */
2032
2033 GIANT_REQUIRED;
2034
2035 KASSERT(vp != NULL, ("vput: null vp"));
2036 VI_LOCK(vp);
2037 /* Skip this v_writecount check if we're going to panic below. */
2038 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2039 ("vput: missed vn_close"));
2040
2041 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2042 vp->v_usecount == 1)) {
2043 v_incr_usecount(vp, -1);
2044 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2045 return;
2046 }
2047
2048 if (vp->v_usecount == 1) {
2049 v_incr_usecount(vp, -1);
2050 /*
2051 * We must call VOP_INACTIVE with the node locked, so
2052 * we just need to release the vnode mutex. Mark as
2053 * VI_DOINGINACT to avoid recursion.
2054 */
2055 vp->v_iflag |= VI_DOINGINACT;
2056 VI_UNLOCK(vp);
2057 VOP_INACTIVE(vp, td);
2058 VI_LOCK(vp);
2059 KASSERT(vp->v_iflag & VI_DOINGINACT,
2060 ("vput: lost VI_DOINGINACT"));
2061 vp->v_iflag &= ~VI_DOINGINACT;
2062 if (VSHOULDFREE(vp))
2063 vfree(vp);
2064 else
2065 vlruvp(vp);
2066 VI_UNLOCK(vp);
2067
2068 } else {
2069#ifdef DIAGNOSTIC
2070 vprint("vput: negative ref count", vp);
2071#endif
2072 panic("vput: negative ref cnt");
2073 }
2074}
2075
2076/*
2077 * Somebody doesn't want the vnode recycled.
2078 */
2079void
2080vhold(struct vnode *vp)
2081{
2082
2083 VI_LOCK(vp);
2084 vholdl(vp);
2085 VI_UNLOCK(vp);
2086}
2087
2088void
2089vholdl(vp)
2090 register struct vnode *vp;
2091{
2092
2093 vp->v_holdcnt++;
2094 if (VSHOULDBUSY(vp))
2095 vbusy(vp);
2096}
2097
2098/*
2099 * Note that there is one less holder who cares about this vnode. vdrop() is the
2100 * opposite of vhold().
2101 */
2102void
2103vdrop(struct vnode *vp)
2104{
2105
2106 VI_LOCK(vp);
2107 vdropl(vp);
2108 VI_UNLOCK(vp);
2109}
2110
2111void
2112vdropl(vp)
2113 register struct vnode *vp;
2114{
2115
2116 if (vp->v_holdcnt <= 0)
2117 panic("vdrop: holdcnt");
2118 vp->v_holdcnt--;
2119 if (VSHOULDFREE(vp))
2120 vfree(vp);
2121 else
2122 vlruvp(vp);
2123}
2124
2125/*
2126 * Remove any vnodes in the vnode table belonging to mount point mp.
2127 *
2128 * If FORCECLOSE is not specified, there should not be any active ones,
2129 * return error if any are found (nb: this is a user error, not a
2130 * system error). If FORCECLOSE is specified, detach any active vnodes
2131 * that are found.
2132 *
2133 * If WRITECLOSE is set, only flush out regular file vnodes open for
2134 * writing.
2135 *
2136 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2137 *
2138 * `rootrefs' specifies the base reference count for the root vnode
2139 * of this filesystem. The root vnode is considered busy if its
2140 * v_usecount exceeds this value. On a successful return, vflush()
2141 * will call vrele() on the root vnode exactly rootrefs times.
2142 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2143 * be zero.
2144 */
2145#ifdef DIAGNOSTIC
2146static int busyprt = 0; /* print out busy vnodes */
2147SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2148#endif
2149
2150int
2151vflush(mp, rootrefs, flags)
2152 struct mount *mp;
2153 int rootrefs;
2154 int flags;
2155{
2156 struct thread *td = curthread; /* XXX */
2157 struct vnode *vp, *nvp, *rootvp = NULL;
2158 struct vattr vattr;
2159 int busy = 0, error;
2160
2161 if (rootrefs > 0) {
2162 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2163 ("vflush: bad args"));
2164 /*
2165 * Get the filesystem root vnode. We can vput() it
2166 * immediately, since with rootrefs > 0, it won't go away.
2167 */
2168 if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2169 return (error);
2170 vput(rootvp);
2171
2172 }
2173 MNT_ILOCK(mp);
2174loop:
2175 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2176 /*
2177 * Make sure this vnode wasn't reclaimed in getnewvnode().
2178 * Start over if it has (it won't be on the list anymore).
2179 */
2180 if (vp->v_mount != mp)
2181 goto loop;
2182 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2183
2184 VI_LOCK(vp);
2185 MNT_IUNLOCK(mp);
2186 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2187 if (error) {
2188 MNT_ILOCK(mp);
2189 goto loop;
2190 }
2191 /*
2192 * Skip over any vnodes marked VV_SYSTEM.
2193 */
2194 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2195 VOP_UNLOCK(vp, 0, td);
2196 MNT_ILOCK(mp);
2197 continue;
2198 }
2199 /*
2200 * If WRITECLOSE is set, flush out unlinked but still open
2201 * files (even if open only for reading) and regular file
2202 * vnodes open for writing.
2203 */
2204 if (flags & WRITECLOSE) {
2205 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2206 VI_LOCK(vp);
2207
2208 if ((vp->v_type == VNON ||
2209 (error == 0 && vattr.va_nlink > 0)) &&
2210 (vp->v_writecount == 0 || vp->v_type != VREG)) {
2211 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2212 MNT_ILOCK(mp);
2213 continue;
2214 }
2215 } else
2216 VI_LOCK(vp);
2217
2218 VOP_UNLOCK(vp, 0, td);
2219
2220 /*
2221 * With v_usecount == 0, all we need to do is clear out the
2222 * vnode data structures and we are done.
2223 */
2224 if (vp->v_usecount == 0) {
2225 vgonel(vp, td);
2226 MNT_ILOCK(mp);
2227 continue;
2228 }
2229
2230 /*
2231 * If FORCECLOSE is set, forcibly close the vnode. For block
2232 * or character devices, revert to an anonymous device. For
2233 * all other files, just kill them.
2234 */
2235 if (flags & FORCECLOSE) {
2236 if (vp->v_type != VCHR)
2237 vgonel(vp, td);
2238 else
2239 vgonechrl(vp, td);
2240 MNT_ILOCK(mp);
2241 continue;
2242 }
2243#ifdef DIAGNOSTIC
2244 if (busyprt)
2245 vprint("vflush: busy vnode", vp);
2246#endif
2247 VI_UNLOCK(vp);
2248 MNT_ILOCK(mp);
2249 busy++;
2250 }
2251 MNT_IUNLOCK(mp);
2252 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2253 /*
2254 * If just the root vnode is busy, and if its refcount
2255 * is equal to `rootrefs', then go ahead and kill it.
2256 */
2257 VI_LOCK(rootvp);
2258 KASSERT(busy > 0, ("vflush: not busy"));
2259 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2260 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2261 vgonel(rootvp, td);
2262 busy = 0;
2263 } else
2264 VI_UNLOCK(rootvp);
2265 }
2266 if (busy)
2267 return (EBUSY);
2268 for (; rootrefs > 0; rootrefs--)
2269 vrele(rootvp);
2270 return (0);
2271}
2272
2273/*
2274 * This moves a now (likely recyclable) vnode to the end of the
2275 * mountlist. XXX However, it is temporarily disabled until we
2276 * can clean up ffs_sync() and friends, which have loop restart
2277 * conditions that this code causes to operate in O(N^2) time.
2278 */
2279static void
2280vlruvp(struct vnode *vp)
2281{
2282#if 0
2283 struct mount *mp;
2284
2285 if ((mp = vp->v_mount) != NULL) {
2286 MNT_ILOCK(mp);
2287 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2288 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2289 MNT_IUNLOCK(mp);
2290 }
2291#endif
2292}
2293
2294static void
2295vx_lock(struct vnode *vp)
2296{
2297
2298 ASSERT_VI_LOCKED(vp, "vx_lock");
2299
2300 /*
2301 * Prevent the vnode from being recycled or brought into use while we
2302 * clean it out.
2303 */
2304 if (vp->v_iflag & VI_XLOCK)
2305 panic("vclean: deadlock");
2306 vp->v_iflag |= VI_XLOCK;
2307 vp->v_vxthread = curthread;
2308}
2309
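/*
 * Clear the VI_XLOCK state set by vx_lock() and wake up any threads
 * waiting on the exclusive lock (VI_XWANT). Called with the vnode
 * interlock held.
 */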
2310static void
2311vx_unlock(struct vnode *vp)
2312{
2313 ASSERT_VI_LOCKED(vp, "vx_unlock");
2314 vp->v_iflag &= ~VI_XLOCK;
2315 vp->v_vxthread = NULL;
2316 if (vp->v_iflag & VI_XWANT) {
2317 vp->v_iflag &= ~VI_XWANT;
2318 wakeup(vp);
2319 }
2320}
2321
2322/*
2323 * Disassociate the underlying filesystem from a vnode.
2324 */
2325static void
2326vclean(vp, flags, td)
2327 struct vnode *vp;
2328 int flags;
2329 struct thread *td;
2330{
2331 int active;
2332
2333 ASSERT_VI_LOCKED(vp, "vclean");
2334 /*
2335 * Check to see if the vnode is in use. If so we have to reference it
2336 * before we clean it out so that its count cannot fall to zero and
2337 * generate a race against ourselves to recycle it.
2338 */
2339 if ((active = vp->v_usecount))
2340 v_incr_usecount(vp, 1);
2341
2342 /*
2343 * Even if the count is zero, the VOP_INACTIVE routine may still
2344 * have the object locked while it cleans it out. The VOP_LOCK
2345 * ensures that the VOP_INACTIVE routine is done with its work.
2346 * For active vnodes, it ensures that no other activity can
2347 * occur while the underlying object is being cleaned out.
2348 */
2349 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2350
2351 /*
2352 * Clean out any buffers associated with the vnode.
2353 * If the flush fails, just toss the buffers.
2354 */
2355 if (flags & DOCLOSE) {
2356 struct buf *bp;
2357 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2358 if (bp != NULL)
2359 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2360 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2361 vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2362 }
2363
2364 VOP_DESTROYVOBJECT(vp);
2365
2366 /*
2367 * Any other processes trying to obtain this lock must first
2368 * wait for VXLOCK to clear, then call the new lock operation.
2369 */
2370 VOP_UNLOCK(vp, 0, td);
2371
2372 /*
2373 * If purging an active vnode, it must be closed and
2374 * deactivated before being reclaimed. Note that the
2375 * VOP_INACTIVE will unlock the vnode.
2376 */
2377 if (active) {
2378 if (flags & DOCLOSE)
2379 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2380 VI_LOCK(vp);
2381 if ((vp->v_iflag & VI_DOINGINACT) == 0) {
2382 vp->v_iflag |= VI_DOINGINACT;
2383 VI_UNLOCK(vp);
2384 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2385 panic("vclean: cannot relock.");
2386 VOP_INACTIVE(vp, td);
2387 VI_LOCK(vp);
2388 KASSERT(vp->v_iflag & VI_DOINGINACT,
2389 ("vclean: lost VI_DOINGINACT"));
2390 vp->v_iflag &= ~VI_DOINGINACT;
2391 }
2392 VI_UNLOCK(vp);
2393 }
2394 /*
2395 * Reclaim the vnode.
2396 */
2397 if (VOP_RECLAIM(vp, td))
2398 panic("vclean: cannot reclaim");
2399
2400 if (active) {
2401 /*
2402 * Inline copy of vrele() since VOP_INACTIVE
2403 * has already been called.
2404 */
2405 VI_LOCK(vp);
2406 v_incr_usecount(vp, -1);
2407 if (vp->v_usecount <= 0) {
2408#ifdef INVARIANTS
2409 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2410 vprint("vclean: bad ref count", vp);
2411 panic("vclean: ref cnt");
2412 }
2413#endif
2414 if (VSHOULDFREE(vp))
2415 vfree(vp);
2416 }
2417 VI_UNLOCK(vp);
2418 }
2419 /*
2420 * Delete from old mount point vnode list.
2421 */
2422 if (vp->v_mount != NULL)
2423 insmntque(vp, (struct mount *)0);
2424 cache_purge(vp);
2425 VI_LOCK(vp);
2426 if (VSHOULDFREE(vp))
2427 vfree(vp);
2428
2429 /*
2430 * Done with purge, reset to the standard lock and
2431 * notify sleepers of the grim news.
2432 */
2433 vp->v_vnlock = &vp->v_lock;
2434 vp->v_op = dead_vnodeop_p;
2435 if (vp->v_pollinfo != NULL)
2436 vn_pollgone(vp);
2437 vp->v_tag = "none";
2438}
2439
2440/*
2441 * Eliminate all activity associated with the requested vnode
2442 * and with all vnodes aliased to the requested vnode.
2443 */
2444int
2445vop_revoke(ap)
2446 struct vop_revoke_args /* {
2447 struct vnode *a_vp;
2448 int a_flags;
2449 } */ *ap;
2450{
2451 struct vnode *vp, *vq;
2452 dev_t dev;
2453
2454 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2455 vp = ap->a_vp;
2456 KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2457
2458 VI_LOCK(vp);
2459 /*
2460 * If a vgone (or vclean) is already in progress,
2461 * wait until it is done and return.
2462 */
2463 if (vp->v_iflag & VI_XLOCK) {
2464 vp->v_iflag |= VI_XWANT;
2465 msleep(vp, VI_MTX(vp), PINOD | PDROP,
2466 "vop_revokeall", 0);
2467 return (0);
2468 }
2469 VI_UNLOCK(vp);
2470 dev = vp->v_rdev;
2471 for (;;) {
2472 mtx_lock(&spechash_mtx);
2473 vq = SLIST_FIRST(&dev->si_hlist);
2474 mtx_unlock(&spechash_mtx);
2475 if (vq == NULL)
2476 break;
2477 vgone(vq);
2478 }
2479 return (0);
2480}
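/*
 * Illustrative sketch (hypothetical caller, not taken from this file): the
 * revoke(2) path is the usual consumer of this operation.  Assuming it
 * already holds a referenced VCHR vnode, it would do roughly:
 *
 *	if (vcount(vp) > 1)
 *		VOP_REVOKE(vp, REVOKEALL);
 *	vrele(vp);
 */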
2481
2482/*
2483 * Recycle an unused vnode to the front of the free list.
2484 * Release the passed interlock if the vnode will be recycled.
2485 */
2486int
2487vrecycle(vp, inter_lkp, td)
2488 struct vnode *vp;
2489 struct mtx *inter_lkp;
2490 struct thread *td;
2491{
2492
2493 VI_LOCK(vp);
2494 if (vp->v_usecount == 0) {
2495 if (inter_lkp) {
2496 mtx_unlock(inter_lkp);
2497 }
2498 vgonel(vp, td);
2499 return (1);
2500 }
2501 VI_UNLOCK(vp);
2502 return (0);
2503}
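/*
 * Illustrative sketch (hypothetical filesystem code): a VOP_INACTIVE
 * routine may use vrecycle() to throw away a vnode whose backing object
 * has been removed; the inode field name below is assumed:
 *
 *	if (ip->i_mode == 0)		/* inode has been freed */
 *		vrecycle(vp, NULL, td);
 */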
2504
2505/*
2506 * Eliminate all activity associated with a vnode
2507 * in preparation for reuse.
2508 */
2509void
2510vgone(vp)
2511 register struct vnode *vp;
2512{
2513 struct thread *td = curthread; /* XXX */
2514
2515 VI_LOCK(vp);
2516 vgonel(vp, td);
2517}
2518
2519/*
2520 * Disassociate a character device from its underlying filesystem and
2521 * attach it to specfs. This is for use when the character device is still
2522 * active and the filesystem is going away.
2523 */
2524static void
2525vgonechrl(struct vnode *vp, struct thread *td)
2526{
2527 ASSERT_VI_LOCKED(vp, "vgonechrl");
2528 vx_lock(vp);
2529 /*
2530 * This is a custom version of vclean() which does not tear down
2531 * the bufs or vm objects held by this vnode. This allows filesystems
2532 * to continue using devices which were discovered via another
2533 * filesystem that has been unmounted.
2534 */
2535 if (vp->v_usecount != 0) {
2536 v_incr_usecount(vp, 1);
2537 /*
2538 * Ensure that no other activity can occur while the
2539 * underlying object is being cleaned out.
2540 */
2541 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2542 /*
2543 * Any other processes trying to obtain this lock must first
2544 * wait for VXLOCK to clear, then call the new lock operation.
2545 */
2546 VOP_UNLOCK(vp, 0, td);
2547 vp->v_vnlock = &vp->v_lock;
2548 vp->v_tag = "orphanchr";
2549 vp->v_op = spec_vnodeop_p;
2550 if (vp->v_mount != NULL)
2551 insmntque(vp, (struct mount *)0);
2552 cache_purge(vp);
2553 vrele(vp);
2554 VI_LOCK(vp);
2555 } else
2556 vclean(vp, 0, td);
2557 vp->v_op = spec_vnodeop_p;
2558 vx_unlock(vp);
2559 VI_UNLOCK(vp);
2560}
2561
2562/*
2563 * vgone, with the vp interlock held.
2564 */
2565void
2566vgonel(vp, td)
2567 struct vnode *vp;
2568 struct thread *td;
2569{
2570 /*
2571 * If a vgone (or vclean) is already in progress,
2572 * wait until it is done and return.
2573 */
2574 ASSERT_VI_LOCKED(vp, "vgonel");
2575 if (vp->v_iflag & VI_XLOCK) {
2576 vp->v_iflag |= VI_XWANT;
2577 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2578 return;
2579 }
2580 vx_lock(vp);
2581
2582 /*
2583 * Clean out the filesystem specific data.
2584 */
2585 vclean(vp, DOCLOSE, td);
2586 VI_UNLOCK(vp);
2587
2588 /*
2589 * If special device, remove it from special device alias list
2590 * if it is on one.
2591 */
2592 VI_LOCK(vp);
2593 if (vp->v_type == VCHR && vp->v_rdev != NODEV) {
2594 mtx_lock(&spechash_mtx);
2595 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2596 vp->v_rdev->si_usecount -= vp->v_usecount;
2597 mtx_unlock(&spechash_mtx);
2598 dev_rel(vp->v_rdev);
2599 vp->v_rdev = NULL;
2600 }
2601
2602 /*
2603 * If it is on the freelist and not already at the head,
2604 * move it to the head of the list. The test of the
2605 * VI_DOOMED flag and the reference count of zero is because
2606 * it will be removed from the free list by getnewvnode,
2607 * but will not have its reference count incremented until
2608 * after calling vgone. If the reference count were
2609 * incremented first, vgone would (incorrectly) try to
2610 * close the previous instance of the underlying object.
2611 */
2612 if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2613 mtx_lock(&vnode_free_list_mtx);
2614 if (vp->v_iflag & VI_FREE) {
2615 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2616 } else {
2617 vp->v_iflag |= VI_FREE;
2618 freevnodes++;
2619 }
2620 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2621 mtx_unlock(&vnode_free_list_mtx);
2622 }
2623
2624 vp->v_type = VBAD;
2625 vx_unlock(vp);
2626 VI_UNLOCK(vp);
2627}
2628
2629/*
2630 * Lookup a vnode by device number.
2631 */
2632int
2633vfinddev(dev, type, vpp)
2634 dev_t dev;
2635 enum vtype type;
2636 struct vnode **vpp;
2637{
2638 struct vnode *vp;
2639
2640 mtx_lock(&spechash_mtx);
2641 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2642 if (type == vp->v_type) {
2643 *vpp = vp;
2644 mtx_unlock(&spechash_mtx);
2645 return (1);
2646 }
2647 }
2648 mtx_unlock(&spechash_mtx);
2649 return (0);
2650}
2651
2652/*
2653 * Calculate the total number of references to a special device.
2654 */
2655int
2656vcount(vp)
2657 struct vnode *vp;
2658{
2659 int count;
2660
2661 mtx_lock(&spechash_mtx);
2662 count = vp->v_rdev->si_usecount;
2663 mtx_unlock(&spechash_mtx);
2664 return (count);
2665}
2666
2667/*
2668 * Same as above, but using the dev_t as argument
2669 */
2670int
2671count_dev(dev)
2672 dev_t dev;
2673{
2674 int count;
2675
2676 mtx_lock(&spechash_mtx);
2677 count = dev->si_usecount;
2678 mtx_unlock(&spechash_mtx);
2679 return(count);
2680}
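/*
 * Illustrative sketch: since several VCHR vnodes may alias one device,
 * code that must refuse an operation while the device is still open
 * consults the aggregate count rather than any single v_usecount, e.g.
 *
 *	if (count_dev(dev) > 0)
 *		return (EBUSY);
 */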
2681
2682/*
2683 * Print out a description of a vnode.
2684 */
2685static char *typename[] =
2686{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2687
2688void
2689vprint(label, vp)
2690 char *label;
2691 struct vnode *vp;
2692{
2693 char buf[96];
2694
2695 if (label != NULL)
2696 printf("%s: %p: ", label, (void *)vp);
2697 else
2698 printf("%p: ", (void *)vp);
2699 printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2700 vp->v_tag, typename[vp->v_type], vp->v_usecount,
2701 vp->v_writecount, vp->v_holdcnt);
2702 buf[0] = '\0';
2703 if (vp->v_vflag & VV_ROOT)
2704 strcat(buf, "|VV_ROOT");
2705 if (vp->v_vflag & VV_TEXT)
2706 strcat(buf, "|VV_TEXT");
2707 if (vp->v_vflag & VV_SYSTEM)
2708 strcat(buf, "|VV_SYSTEM");
2709 if (vp->v_iflag & VI_XLOCK)
2710 strcat(buf, "|VI_XLOCK");
2711 if (vp->v_iflag & VI_XWANT)
2712 strcat(buf, "|VI_XWANT");
2713 if (vp->v_iflag & VI_BWAIT)
2714 strcat(buf, "|VI_BWAIT");
2715 if (vp->v_iflag & VI_DOOMED)
2716 strcat(buf, "|VI_DOOMED");
2717 if (vp->v_iflag & VI_FREE)
2718 strcat(buf, "|VI_FREE");
2719 if (vp->v_vflag & VV_OBJBUF)
2720 strcat(buf, "|VV_OBJBUF");
2721 if (buf[0] != '\0')
2722 printf(" flags (%s),", &buf[1]);
2723 lockmgr_printinfo(vp->v_vnlock);
2724 printf("\n");
2725 if (vp->v_data != NULL)
2726 VOP_PRINT(vp);
2727}
2728
2729#ifdef DDB
2730#include <ddb/ddb.h>
2731/*
2732 * List all of the locked vnodes in the system.
2733 * Called when debugging the kernel.
2734 */
2735DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2736{
2737 struct mount *mp, *nmp;
2738 struct vnode *vp;
2739
2740 /*
2741 * Note: because this is DDB, we can't obey the locking semantics
2742 * for these structures, which means we could catch an inconsistent
2743 * state and dereference a nasty pointer. Not much to be done
2744 * about that.
2745 */
2746 printf("Locked vnodes\n");
2747 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2748 nmp = TAILQ_NEXT(mp, mnt_list);
2749 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2750 if (VOP_ISLOCKED(vp, NULL))
2751 vprint(NULL, vp);
2752 }
2753 nmp = TAILQ_NEXT(mp, mnt_list);
2754 }
2755}
2756#endif
2757
2758/*
2759 * Fill in a struct xvfsconf based on a struct vfsconf.
2760 */
2761static void
2762vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2763{
2764
2765 strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2766 xvfsp->vfc_typenum = vfsp->vfc_typenum;
2767 xvfsp->vfc_refcount = vfsp->vfc_refcount;
2768 xvfsp->vfc_flags = vfsp->vfc_flags;
2769 /*
2770 * These are unused in userland; we keep them
2771 * only to preserve binary compatibility.
2772 */
2773 xvfsp->vfc_vfsops = NULL;
2774 xvfsp->vfc_next = NULL;
2775}
2776
2777static int
2778sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2779{
2780 struct vfsconf *vfsp;
2781 struct xvfsconf *xvfsp;
2782 int cnt, error, i;
2783
2784 cnt = 0;
2785 for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2786 cnt++;
2787 xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2788 /*
2789 * Handle the race that we will have here when struct vfsconf
2790 * will be locked down by using both cnt and checking vfc_next
2791 * against NULL to determine the end of the loop. The race will
2792 * happen because we will have to unlock before calling malloc().
2793 * We are protected by Giant for now.
2794 */
2795 i = 0;
2796 for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2797 vfsconf2x(vfsp, xvfsp + i);
2798 i++;
2799 }
2800 error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2801 free(xvfsp, M_TEMP);
2802 return (error);
2803}
2804
2805SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2806 "S,xvfsconf", "List of all configured filesystems");
2807
2808/*
2809 * Top level filesystem related information gathering.
2810 */
2811static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2812
2813static int
2814vfs_sysctl(SYSCTL_HANDLER_ARGS)
2815{
2816 int *name = (int *)arg1 - 1; /* XXX */
2817 u_int namelen = arg2 + 1; /* XXX */
2818 struct vfsconf *vfsp;
2819 struct xvfsconf xvfsp;
2820
2821 printf("WARNING: userland calling deprecated sysctl, "
2822 "please rebuild world\n");
2823
2824#if 1 || defined(COMPAT_PRELITE2)
2825 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2826 if (namelen == 1)
2827 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2828#endif
2829
2830 switch (name[1]) {
2831 case VFS_MAXTYPENUM:
2832 if (namelen != 2)
2833 return (ENOTDIR);
2834 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2835 case VFS_CONF:
2836 if (namelen != 3)
2837 return (ENOTDIR); /* overloaded */
2838 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2839 if (vfsp->vfc_typenum == name[2])
2840 break;
2841 if (vfsp == NULL)
2842 return (EOPNOTSUPP);
2843 vfsconf2x(vfsp, &xvfsp);
2844 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2845 }
2846 return (EOPNOTSUPP);
2847}
2848
2849SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2850 "Generic filesystem");
2851
2852#if 1 || defined(COMPAT_PRELITE2)
2853
2854static int
2855sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2856{
2857 int error;
2858 struct vfsconf *vfsp;
2859 struct ovfsconf ovfs;
2860
2861 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2862 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2863 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2864 ovfs.vfc_index = vfsp->vfc_typenum;
2865 ovfs.vfc_refcount = vfsp->vfc_refcount;
2866 ovfs.vfc_flags = vfsp->vfc_flags;
2867 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2868 if (error)
2869 return error;
2870 }
2871 return 0;
2872}
2873
2874#endif /* 1 || COMPAT_PRELITE2 */
2875
2876#define KINFO_VNODESLOP 10
2877#ifdef notyet
2878/*
2879 * Dump vnode list (via sysctl).
2880 */
2881/* ARGSUSED */
2882static int
2883sysctl_vnode(SYSCTL_HANDLER_ARGS)
2884{
2885 struct xvnode *xvn;
2886 struct thread *td = req->td;
2887 struct mount *mp;
2888 struct vnode *vp;
2889 int error, len, n;
2890
2891 /*
2892 * Stale numvnodes access is not fatal here.
2893 */
2894 req->lock = 0;
2895 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2896 if (!req->oldptr)
2897 /* Make an estimate */
2898 return (SYSCTL_OUT(req, 0, len));
2899
2900 error = sysctl_wire_old_buffer(req, 0);
2901 if (error != 0)
2902 return (error);
2903 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2904 n = 0;
2905 mtx_lock(&mountlist_mtx);
2906 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2907 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2908 continue;
2909 MNT_ILOCK(mp);
2910 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2911 if (n == len)
2912 break;
2913 vref(vp);
2914 xvn[n].xv_size = sizeof *xvn;
2915 xvn[n].xv_vnode = vp;
2916#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2917 XV_COPY(usecount);
2918 XV_COPY(writecount);
2919 XV_COPY(holdcnt);
2920 XV_COPY(id);
2921 XV_COPY(mount);
2922 XV_COPY(numoutput);
2923 XV_COPY(type);
2924#undef XV_COPY
2925 xvn[n].xv_flag = vp->v_vflag;
2926
2927 switch (vp->v_type) {
2928 case VREG:
2929 case VDIR:
2930 case VLNK:
2931 xvn[n].xv_dev = vp->v_cachedfs;
2932 xvn[n].xv_ino = vp->v_cachedid;
2933 break;
2934 case VBLK:
2935 case VCHR:
2936 if (vp->v_rdev == NULL) {
2937 vrele(vp);
2938 continue;
2939 }
2940 xvn[n].xv_dev = dev2udev(vp->v_rdev);
2941 break;
2942 case VSOCK:
2943 xvn[n].xv_socket = vp->v_socket;
2944 break;
2945 case VFIFO:
2946 xvn[n].xv_fifo = vp->v_fifoinfo;
2947 break;
2948 case VNON:
2949 case VBAD:
2950 default:
2951 /* shouldn't happen? */
2952 vrele(vp);
2953 continue;
2954 }
2955 vrele(vp);
2956 ++n;
2957 }
2958 MNT_IUNLOCK(mp);
2959 mtx_lock(&mountlist_mtx);
2960 vfs_unbusy(mp, td);
2961 if (n == len)
2962 break;
2963 }
2964 mtx_unlock(&mountlist_mtx);
2965
2966 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2967 free(xvn, M_TEMP);
2968 return (error);
2969}
2970
2971SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2972 0, 0, sysctl_vnode, "S,xvnode", "");
2973#endif
2974
2975/*
2976 * Check to see if a filesystem is mounted on a block device.
2977 */
2978int
2979vfs_mountedon(vp)
2980 struct vnode *vp;
2981{
2982
2983 if (vp->v_rdev->si_mountpoint != NULL)
2984 return (EBUSY);
2985 return (0);
2986}
2987
2988/*
2989 * Unmount all filesystems. The list is traversed in reverse order
2990 * of mounting to avoid dependencies.
2991 */
2992void
2993vfs_unmountall()
2994{
2995 struct mount *mp;
2996 struct thread *td;
2997 int error;
2998
2999 if (curthread != NULL)
3000 td = curthread;
3001 else
3002 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
3003 /*
3004 * Since this only runs when rebooting, it is not interlocked.
3005 */
3006 while(!TAILQ_EMPTY(&mountlist)) {
3007 mp = TAILQ_LAST(&mountlist, mntlist);
3008 error = dounmount(mp, MNT_FORCE, td);
3009 if (error) {
3010 TAILQ_REMOVE(&mountlist, mp, mnt_list);
3011 printf("unmount of %s failed (",
3012 mp->mnt_stat.f_mntonname);
3013 if (error == EBUSY)
3014 printf("BUSY)\n");
3015 else
3016 printf("%d)\n", error);
3017 } else {
3018 /* The unmount has removed mp from the mountlist */
3019 }
3020 }
3021}
3022
3023/*
3024 * Perform msync on all vnodes under a mount point.
3025 * The mount point must be locked.
3026 */
3027void
3028vfs_msync(struct mount *mp, int flags)
3029{
3030 struct vnode *vp, *nvp;
3031 struct vm_object *obj;
3032 int tries;
3033
3034 GIANT_REQUIRED;
3035
3036 tries = 5;
3037 MNT_ILOCK(mp);
3038loop:
3039 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
3040 if (vp->v_mount != mp) {
3041 if (--tries > 0)
3042 goto loop;
3043 break;
3044 }
3045 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
3046
3047 VI_LOCK(vp);
3048 if (vp->v_iflag & VI_XLOCK) {
3049 VI_UNLOCK(vp);
3050 continue;
3051 }
3052
3053 if ((vp->v_iflag & VI_OBJDIRTY) &&
3054 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3055 MNT_IUNLOCK(mp);
3056 if (!vget(vp,
3057 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3058 curthread)) {
3059 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
3060 vput(vp);
3061 MNT_ILOCK(mp);
3062 continue;
3063 }
3064
3065 if (VOP_GETVOBJECT(vp, &obj) == 0) {
3066 VM_OBJECT_LOCK(obj);
3067 vm_object_page_clean(obj, 0, 0,
3068 flags == MNT_WAIT ?
3069 OBJPC_SYNC : OBJPC_NOSYNC);
3070 VM_OBJECT_UNLOCK(obj);
3071 }
3072 vput(vp);
3073 }
3074 MNT_ILOCK(mp);
3075 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3076 if (--tries > 0)
3077 goto loop;
3078 break;
3079 }
3080 } else
3081 VI_UNLOCK(vp);
3082 }
3083 MNT_IUNLOCK(mp);
3084}
3085
3086/*
3087 * Create the VM object needed for VMIO and mmap support. This
3088 * is done for all VREG files in the system. Some filesystems can also
3089 * gain the additional metadata buffering capability of the
3090 * VMIO code by making their device nodes VMIO mode as well.
3091 *
3092 * vp must be locked when vfs_object_create is called.
3093 */
3094int
3095vfs_object_create(vp, td, cred)
3096 struct vnode *vp;
3097 struct thread *td;
3098 struct ucred *cred;
3099{
3100
3101 GIANT_REQUIRED;
3102 return (VOP_CREATEVOBJECT(vp, cred, td));
3103}
3104
3105/*
3106 * Mark a vnode as free, putting it up for recycling.
3107 */
3108void
3109vfree(vp)
3110 struct vnode *vp;
3111{
3112
3113 ASSERT_VI_LOCKED(vp, "vfree");
3114 mtx_lock(&vnode_free_list_mtx);
3115 KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3116 if (vp->v_iflag & VI_AGE) {
3117 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3118 } else {
3119 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3120 }
3121 freevnodes++;
3122 mtx_unlock(&vnode_free_list_mtx);
3123 vp->v_iflag &= ~VI_AGE;
3124 vp->v_iflag |= VI_FREE;
3125}
3126
3127/*
3128 * Opposite of vfree() - mark a vnode as in use.
3129 */
3130void
3131vbusy(vp)
3132 struct vnode *vp;
3133{
3134
3135 ASSERT_VI_LOCKED(vp, "vbusy");
3136 KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3137
3138 mtx_lock(&vnode_free_list_mtx);
3139 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3140 freevnodes--;
3141 mtx_unlock(&vnode_free_list_mtx);
3142
3143 vp->v_iflag &= ~(VI_FREE|VI_AGE);
3144}
3145
3146/*
3147 * Initalize per-vnode helper structure to hold poll-related state.
3148 */
3149void
3150v_addpollinfo(struct vnode *vp)
3151{
3152
3153 vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
3154 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3155}
3156
3157/*
3158 * Record a process's interest in events which might happen to
3159 * a vnode. Because poll uses the historic select-style interface
3160 * internally, this routine serves as both the ``check for any
3161 * pending events'' and the ``record my interest in future events''
3162 * functions. (These are done together, while the lock is held,
3163 * to avoid race conditions.)
3164 */
3165int
3166vn_pollrecord(vp, td, events)
3167 struct vnode *vp;
3168 struct thread *td;
3169 short events;
3170{
3171
3172 if (vp->v_pollinfo == NULL)
3173 v_addpollinfo(vp);
3174 mtx_lock(&vp->v_pollinfo->vpi_lock);
3175 if (vp->v_pollinfo->vpi_revents & events) {
3176 /*
3177 * This leaves events we are not interested
3178 * in available for the other process which
3179 * presumably had requested them
3180 * (otherwise they would never have been
3181 * recorded).
3182 */
3183 events &= vp->v_pollinfo->vpi_revents;
3184 vp->v_pollinfo->vpi_revents &= ~events;
3185
3186 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3187 return events;
3188 }
3189 vp->v_pollinfo->vpi_events |= events;
3190 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3191 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3192 return 0;
3193}
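/*
 * Illustrative sketch (hypothetical filesystem code): a default-style
 * VOP_POLL handler defers any non-standard events to vn_pollrecord()
 * and claims the ordinary read/write events immediately, roughly:
 *
 *	if (ap->a_events & ~POLLSTANDARD)
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 */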
3194
3195/*
3196 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
3197 * it is possible for us to miss an event due to race conditions, but
3198 * that condition is expected to be rare, so for the moment it is the
3199 * preferred interface.
3200 */
3201void
3202vn_pollevent(vp, events)
3203 struct vnode *vp;
3204 short events;
3205{
3206
3207 if (vp->v_pollinfo == NULL)
3208 v_addpollinfo(vp);
3209 mtx_lock(&vp->v_pollinfo->vpi_lock);
3210 if (vp->v_pollinfo->vpi_events & events) {
3211 /*
3212 * We clear vpi_events so that we don't
3213 * call selwakeup() twice if two events are
3214 * posted before the polling process(es) is
3215 * awakened. This also ensures that we take at
3216 * most one selwakeup() if the polling process
3217 * is no longer interested. However, it does
3218 * mean that only one event can be noticed at
3219 * a time. (Perhaps we should only clear those
3220 * event bits which we note?) XXX
3221 */
3222 vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */
3223 vp->v_pollinfo->vpi_revents |= events;
3224 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3225 }
3226 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3227}
3228
3229/*
3230 * Wake up anyone polling on vp because it is being revoked.
3231 * This depends on dead_poll() returning POLLHUP for correct
3232 * behavior.
3233 */
3234void
3235vn_pollgone(vp)
3236 struct vnode *vp;
3237{
3238
3239 mtx_lock(&vp->v_pollinfo->vpi_lock);
3240 VN_KNOTE(vp, NOTE_REVOKE);
3241 if (vp->v_pollinfo->vpi_events) {
3242 vp->v_pollinfo->vpi_events = 0;
3243 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3244 }
3245 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3246}
3247
3248
3249
3250/*
3251 * Routine to create and manage a filesystem syncer vnode.
3252 */
3253#define sync_close ((int (*)(struct vop_close_args *))nullop)
3254static int sync_fsync(struct vop_fsync_args *);
3255static int sync_inactive(struct vop_inactive_args *);
3256static int sync_reclaim(struct vop_reclaim_args *);
3257
3258static vop_t **sync_vnodeop_p;
3259static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3260 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
3261 { &vop_close_desc, (vop_t *) sync_close }, /* close */
3262 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
3263 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
3264 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
3265 { &vop_lock_desc, (vop_t *) vop_stdlock }, /* lock */
3266 { &vop_unlock_desc, (vop_t *) vop_stdunlock }, /* unlock */
3267 { &vop_islocked_desc, (vop_t *) vop_stdislocked }, /* islocked */
3268 { NULL, NULL }
3269};
3270static struct vnodeopv_desc sync_vnodeop_opv_desc =
3271 { &sync_vnodeop_p, sync_vnodeop_entries };
3272
3273VNODEOP_SET(sync_vnodeop_opv_desc);
3274
3275/*
3276 * Create a new filesystem syncer vnode for the specified mount point.
3277 */
3278int
3279vfs_allocate_syncvnode(mp)
3280 struct mount *mp;
3281{
3282 struct vnode *vp;
3283 static long start, incr, next;
3284 int error;
3285
3286 /* Allocate a new vnode */
3287 if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3288 mp->mnt_syncer = NULL;
3289 return (error);
3290 }
3291 vp->v_type = VNON;
3292 /*
3293 * Place the vnode onto the syncer worklist. We attempt to
3294 * scatter them about on the list so that they will go off
3295 * at evenly distributed times even if all the filesystems
3296 * are mounted at once.
3297 */
3298 next += incr;
3299 if (next == 0 || next > syncer_maxdelay) {
3300 start /= 2;
3301 incr /= 2;
3302 if (start == 0) {
3303 start = syncer_maxdelay / 2;
3304 incr = syncer_maxdelay;
3305 }
3306 next = start;
3307 }
3308 VI_LOCK(vp);
3309 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3310 VI_UNLOCK(vp);
3311 mp->mnt_syncer = vp;
3312 return (0);
3313}
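/*
 * Worked example of the scatter logic above, assuming syncer_maxdelay
 * is 32: successive mounts compute next = 16, 8, 24, 4, 12, 20, 28,
 * 2, ... (the worklist slot used is then next % syncdelay), so each new
 * syncer vnode lands roughly halfway between slots already in use and
 * the per-second sync work stays evenly spread.
 */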
3314
3315/*
3316 * Do a lazy sync of the filesystem.
3317 */
3318static int
3319sync_fsync(ap)
3320 struct vop_fsync_args /* {
3321 struct vnode *a_vp;
3322 struct ucred *a_cred;
3323 int a_waitfor;
3324 struct thread *a_td;
3325 } */ *ap;
3326{
3327 struct vnode *syncvp = ap->a_vp;
3328 struct mount *mp = syncvp->v_mount;
3329 struct thread *td = ap->a_td;
3330 int error, asyncflag;
3331
3332 /*
3333 * We only need to do something if this is a lazy evaluation.
3334 */
3335 if (ap->a_waitfor != MNT_LAZY)
3336 return (0);
3337
3338 /*
3339 * Move ourselves to the back of the sync list.
3340 */
3341 VI_LOCK(syncvp);
3342 vn_syncer_add_to_worklist(syncvp, syncdelay);
3343 VI_UNLOCK(syncvp);
3344
3345 /*
3346 * Walk the list of vnodes pushing all that are dirty and
3347 * not already on the sync list.
3348 */
3349 mtx_lock(&mountlist_mtx);
3350 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3351 mtx_unlock(&mountlist_mtx);
3352 return (0);
3353 }
3354 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3355 vfs_unbusy(mp, td);
3356 return (0);
3357 }
3358 asyncflag = mp->mnt_flag & MNT_ASYNC;
3359 mp->mnt_flag &= ~MNT_ASYNC;
3360 vfs_msync(mp, MNT_NOWAIT);
3361 error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3362 if (asyncflag)
3363 mp->mnt_flag |= MNT_ASYNC;
3364 vn_finished_write(mp);
3365 vfs_unbusy(mp, td);
3366 return (error);
3367}
3368
3369/*
3370 * The syncer vnode is no longer referenced.
3371 */
3372static int
3373sync_inactive(ap)
3374 struct vop_inactive_args /* {
3375 struct vnode *a_vp;
3376 struct thread *a_td;
3377 } */ *ap;
3378{
3379
3380 VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3381 vgone(ap->a_vp);
3382 return (0);
3383}
3384
3385/*
3386 * The syncer vnode is no longer needed and is being decommissioned.
3387 *
3388 * Modifications to the worklist must be protected by sync_mtx.
3389 */
3390static int
3391sync_reclaim(ap)
3392 struct vop_reclaim_args /* {
3393 struct vnode *a_vp;
3394 } */ *ap;
3395{
3396 struct vnode *vp = ap->a_vp;
3397
3398 VI_LOCK(vp);
3399 vp->v_mount->mnt_syncer = NULL;
3400 if (vp->v_iflag & VI_ONWORKLST) {
3401 mtx_lock(&sync_mtx);
3402 LIST_REMOVE(vp, v_synclist);
3403 mtx_unlock(&sync_mtx);
3404 vp->v_iflag &= ~VI_ONWORKLST;
3405 }
3406 VI_UNLOCK(vp);
3407
3408 return (0);
3409}
3410
3411/*
3412 * Extract the dev_t from a VCHR vnode.
3413 */
3414dev_t
3415vn_todev(vp)
3416 struct vnode *vp;
3417{
3418
3419 if (vp->v_type != VCHR)
3420 return (NODEV);
3421 return (vp->v_rdev);
3422}
3423
3424/*
3425 * Check if vnode represents a disk device
3426 */
3427int
3428vn_isdisk(vp, errp)
3429 struct vnode *vp;
3430 int *errp;
3431{
3432 int error;
3433
3434 error = 0;
3435 if (vp->v_type != VCHR)
3436 error = ENOTBLK;
3437 else if (vp->v_rdev == NULL)
3438 error = ENXIO;
3439 else if (!(devsw(vp->v_rdev)->d_flags & D_DISK))
3440 error = ENOTBLK;
3441 if (errp != NULL)
3442 *errp = error;
3443 return (error == 0);
3444}
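/*
 * Illustrative sketch (hypothetical caller): mount code that requires a
 * real disk device typically rejects anything else with
 *
 *	if (!vn_isdisk(devvp, &error)) {
 *		vrele(devvp);
 *		return (error);
 *	}
 */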
3445
3446/*
3447 * Free data allocated by namei(); see namei(9) for details.
3448 */
3449void
3450NDFREE(ndp, flags)
3451 struct nameidata *ndp;
3452 const u_int flags;
3453{
3454
3455 if (!(flags & NDF_NO_FREE_PNBUF) &&
3456 (ndp->ni_cnd.cn_flags & HASBUF)) {
3457 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3458 ndp->ni_cnd.cn_flags &= ~HASBUF;
3459 }
3460 if (!(flags & NDF_NO_DVP_UNLOCK) &&
3461 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3462 ndp->ni_dvp != ndp->ni_vp)
3463 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3464 if (!(flags & NDF_NO_DVP_RELE) &&
3465 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3466 vrele(ndp->ni_dvp);
3467 ndp->ni_dvp = NULL;
3468 }
3469 if (!(flags & NDF_NO_VP_UNLOCK) &&
3470 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3471 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3472 if (!(flags & NDF_NO_VP_RELE) &&
3473 ndp->ni_vp) {
3474 vrele(ndp->ni_vp);
3475 ndp->ni_vp = NULL;
3476 }
3477 if (!(flags & NDF_NO_STARTDIR_RELE) &&
3478 (ndp->ni_cnd.cn_flags & SAVESTART)) {
3479 vrele(ndp->ni_startdir);
3480 ndp->ni_startdir = NULL;
3481 }
3482}
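/*
 * Illustrative sketch (hypothetical caller): a lookup that only needs
 * the path buffer released afterwards commonly looks like
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	...
 *	vput(nd.ni_vp);
 */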
3483
3484/*
3485 * Common filesystem object access control check routine. Accepts a
3486 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3487 * and optional call-by-reference privused argument allowing vaccess()
3488 * to indicate to the caller whether privilege was used to satisfy the
3489 * request (obsoleted). Returns 0 on success, or an errno on failure.
3490 */
3491int
3492vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3493 enum vtype type;
3494 mode_t file_mode;
3495 uid_t file_uid;
3496 gid_t file_gid;
3497 mode_t acc_mode;
3498 struct ucred *cred;
3499 int *privused;
3500{
3501 mode_t dac_granted;
3502#ifdef CAPABILITIES
3503 mode_t cap_granted;
3504#endif
3505
3506 /*
3507 * Look for a normal, non-privileged way to access the file/directory
3508 * as requested. If it exists, go with that.
3509 */
3510
3511 if (privused != NULL)
3512 *privused = 0;
3513
3514 dac_granted = 0;
3515
3516 /* Check the owner. */
3517 if (cred->cr_uid == file_uid) {
3518 dac_granted |= VADMIN;
3519 if (file_mode & S_IXUSR)
3520 dac_granted |= VEXEC;
3521 if (file_mode & S_IRUSR)
3522 dac_granted |= VREAD;
3523 if (file_mode & S_IWUSR)
3524 dac_granted |= (VWRITE | VAPPEND);
3525
3526 if ((acc_mode & dac_granted) == acc_mode)
3527 return (0);
3528
3529 goto privcheck;
3530 }
3531
3532 /* Otherwise, check the groups (first match) */
3533 if (groupmember(file_gid, cred)) {
3534 if (file_mode & S_IXGRP)
3535 dac_granted |= VEXEC;
3536 if (file_mode & S_IRGRP)
3537 dac_granted |= VREAD;
3538 if (file_mode & S_IWGRP)
3539 dac_granted |= (VWRITE | VAPPEND);
3540
3541 if ((acc_mode & dac_granted) == acc_mode)
3542 return (0);
3543
3544 goto privcheck;
3545 }
3546
3547 /* Otherwise, check everyone else. */
3548 if (file_mode & S_IXOTH)
3549 dac_granted |= VEXEC;
3550 if (file_mode & S_IROTH)
3551 dac_granted |= VREAD;
3552 if (file_mode & S_IWOTH)
3553 dac_granted |= (VWRITE | VAPPEND);
3554 if ((acc_mode & dac_granted) == acc_mode)
3555 return (0);
3556
3557privcheck:
3558 if (!suser_cred(cred, PRISON_ROOT)) {
3559 /* XXX audit: privilege used */
3560 if (privused != NULL)
3561 *privused = 1;
3562 return (0);
3563 }
3564
3565#ifdef CAPABILITIES
3566 /*
3567 * Build a capability mask to determine if the set of capabilities
3568 * satisfies the requirements when combined with the granted mask
3569 * from above.
3570 * For each capability, if the capability is required, bitwise
3571 * or the request type onto the cap_granted mask.
3572 */
3573 cap_granted = 0;
3574
3575 if (type == VDIR) {
3576 /*
3577 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3578 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3579 */
3580 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3581 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3582 cap_granted |= VEXEC;
3583 } else {
3584 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3585 !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3586 cap_granted |= VEXEC;
3587 }
3588
3589 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3590 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3591 cap_granted |= VREAD;
3592
3593 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3594 !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3595 cap_granted |= (VWRITE | VAPPEND);
3596
3597 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3598 !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3599 cap_granted |= VADMIN;
3600
3601 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3602 /* XXX audit: privilege used */
3603 if (privused != NULL)
3604 *privused = 1;
3605 return (0);
3606 }
3607#endif
3608
3609 return ((acc_mode & VADMIN) ? EPERM : EACCES);
3610}
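/*
 * Illustrative sketch (hypothetical filesystem code): a VOP_ACCESS
 * implementation normally finishes by handing its per-file data to
 * vaccess(); the inode field names here are assumed:
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */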
3611
3612/*
3613 * Credential check based on process requesting service, and per-attribute
3614 * permissions.
3615 */
3616int
3617extattr_check_cred(struct vnode *vp, int attrnamespace,
3618 struct ucred *cred, struct thread *td, int access)
3619{
3620
3621 /*
3622 * Kernel-invoked always succeeds.
3623 */
3624 if (cred == NOCRED)
3625 return (0);
3626
3627 /*
3628 * Do not allow privileged processes in jail to directly
3629 * manipulate system attributes.
3630 *
3631 * XXX What capability should apply here?
3632 * Probably CAP_SYS_SETFFLAG.
3633 */
3634 switch (attrnamespace) {
3635 case EXTATTR_NAMESPACE_SYSTEM:
3636 /* Potentially should be: return (EPERM); */
3637 return (suser_cred(cred, 0));
3638 case EXTATTR_NAMESPACE_USER:
3639 return (VOP_ACCESS(vp, access, cred, td));
3640 default:
3641 return (EPERM);
3642 }
3643}
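/*
 * Illustrative sketch (hypothetical filesystem code): extended
 * attribute operations would gate themselves with something like
 *
 *	error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE);
 *	if (error)
 *		return (error);
 *
 * using VREAD instead of VWRITE for retrieval.
 */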
3644
3645#ifdef DEBUG_VFS_LOCKS
3646/*
3647 * This only exists to suppress warnings from unlocked specfs accesses. It is
3648 * no longer ok to have an unlocked VFS.
3649 */
3650#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3651
3652int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
3653int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
3654int vfs_badlock_print = 1; /* Print lock violations. */
3655
3656static void
3657vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3658{
3659
3660 if (vfs_badlock_print)
3661 printf("%s: %p %s\n", str, (void *)vp, msg);
3662 if (vfs_badlock_ddb)
3663 Debugger("lock violation");
3664}
3665
3666void
3667assert_vi_locked(struct vnode *vp, const char *str)
3668{
3669
3670 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3671 vfs_badlock("interlock is not locked but should be", str, vp);
3672}
3673
3674void
3675assert_vi_unlocked(struct vnode *vp, const char *str)
3676{
3677
3678 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3679 vfs_badlock("interlock is locked but should not be", str, vp);
3680}
3681
3682void
3683assert_vop_locked(struct vnode *vp, const char *str)
3684{
3685
3686 if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
3687 vfs_badlock("is not locked but should be", str, vp);
3688}
3689
3690void
3691assert_vop_unlocked(struct vnode *vp, const char *str)
3692{
3693
3694 if (vp && !IGNORE_LOCK(vp) &&
3695 VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
3696 vfs_badlock("is locked but should not be", str, vp);
3697}
3698
3699#if 0
3700void
3701assert_vop_elocked(struct vnode *vp, const char *str)
3702{
3703
3704 if (vp && !IGNORE_LOCK(vp) &&
3705 VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
3706 vfs_badlock("is not exclusive locked but should be", str, vp);
3707}
3708
3709void
3710assert_vop_elocked_other(struct vnode *vp, const char *str)
3711{
3712
3713 if (vp && !IGNORE_LOCK(vp) &&
3714 VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
3715 vfs_badlock("is not exclusive locked by another thread",
3716 str, vp);
3717}
3718
3719void
3720assert_vop_slocked(struct vnode *vp, const char *str)
3721{
3722
3723 if (vp && !IGNORE_LOCK(vp) &&
3724 VOP_ISLOCKED(vp, curthread) != LK_SHARED)
3725 vfs_badlock("is not locked shared but should be", str, vp);
3726}
3727#endif /* 0 */
3728
3729void
3730vop_rename_pre(void *ap)
3731{
3732 struct vop_rename_args *a = ap;
3733
3734 if (a->a_tvp)
3735 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3736 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3737 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3738 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3739
3740 /* Check the source (from). */
3741 if (a->a_tdvp != a->a_fdvp)
3742 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3743 if (a->a_tvp != a->a_fvp)
3744 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3745
3746 /* Check the target. */
3747 if (a->a_tvp)
3748 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3749 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3750}
3751
3752void
3753vop_strategy_pre(void *ap)
3754{
3755 struct vop_strategy_args *a;
3756 struct buf *bp;
3757
3758 a = ap;
3759 bp = a->a_bp;
3760
3761 /*
3762 * Cluster ops lock their component buffers but not the IO container.
3763 */
3764 if ((bp->b_flags & B_CLUSTER) != 0)
3765 return;
3766
3767 if (BUF_REFCNT(bp) < 1) {
3768 if (vfs_badlock_print)
3769 printf(
3770 "VOP_STRATEGY: bp is not locked but should be\n");
3771 if (vfs_badlock_ddb)
3772 Debugger("lock violation");
3773 }
3774}
3775
3776void
3777vop_lookup_pre(void *ap)
3778{
3779 struct vop_lookup_args *a;
3780 struct vnode *dvp;
3781
3782 a = ap;
3783 dvp = a->a_dvp;
3784 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3785 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3786}
3787
3788void
3789vop_lookup_post(void *ap, int rc)
3790{
3791 struct vop_lookup_args *a;
3792 struct componentname *cnp;
3793 struct vnode *dvp;
3794 struct vnode *vp;
3795 int flags;
3796
3797 a = ap;
3798 dvp = a->a_dvp;
3799 cnp = a->a_cnp;
3800 vp = *(a->a_vpp);
3801 flags = cnp->cn_flags;
3802
3803 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3804
3805 /*
3806 * If this is the last path component for this lookup and LOCKPARENT
3807 * is set, OR if there is an error, the directory has to be locked.
3808 */
3809 if ((flags & LOCKPARENT) && (flags & ISLASTCN))
3810 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
3811 else if (rc != 0)
3812 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
3813 else if (dvp != vp)
3814 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
3815 if (flags & PDIRUNLOCK)
3816 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
3817}
3818
3819void
3820vop_lock_pre(void *ap)
3821{
3822 struct vop_lock_args *a = ap;
3823
3824 if ((a->a_flags & LK_INTERLOCK) == 0)
3825 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3826 else
3827 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3828}
3829
3830void
3831vop_lock_post(void *ap, int rc)
3832{
3833 struct vop_lock_args *a = ap;
3834
3835 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3836 if (rc == 0)
3837 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3838}
3839
3840void
3841vop_unlock_pre(void *ap)
3842{
3843 struct vop_unlock_args *a = ap;
3844
3845 if (a->a_flags & LK_INTERLOCK)
3846 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3847 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3848}
3849
3850void
3851vop_unlock_post(void *ap, int rc)
3852{
3853 struct vop_unlock_args *a = ap;
3854
3855 if (a->a_flags & LK_INTERLOCK)
3856 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3857}
3858#endif /* DEBUG_VFS_LOCKS */
1603 mtx_lock(&sync_mtx);
1604 if (rushjob < syncdelay / 2) {
1605 rushjob += 1;
1606 stat_rush_requests += 1;
1607 ret = 1;
1608 }
1609 mtx_unlock(&sync_mtx);
1610 return (ret);
1611}
1612
1613/*
1614 * Associate a p-buffer with a vnode.
1615 *
1616 * Also sets the B_PAGING flag to indicate that the vnode is not fully
1617 * associated with the buffer, i.e. the bp has not been linked into the
1618 * vnode or ref-counted.
1619 */
1620void
1621pbgetvp(vp, bp)
1622 register struct vnode *vp;
1623 register struct buf *bp;
1624{
1625
1626 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1627
1628 bp->b_vp = vp;
1629 bp->b_object = vp->v_object;
1630 bp->b_flags |= B_PAGING;
1631 bp->b_dev = vn_todev(vp);
1632}
1633
1634/*
1635 * Disassociate a p-buffer from a vnode.
1636 */
1637void
1638pbrelvp(bp)
1639 register struct buf *bp;
1640{
1641
1642 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1643
1644 /* XXX REMOVE ME */
1645 VI_LOCK(bp->b_vp);
1646 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1647 panic(
1648 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1649 bp,
1650 (int)bp->b_flags
1651 );
1652 }
1653 VI_UNLOCK(bp->b_vp);
1654 bp->b_vp = (struct vnode *) 0;
1655 bp->b_object = NULL;
1656 bp->b_flags &= ~B_PAGING;
1657}
1658
1659/*
1660 * Reassign a buffer from one vnode to another.
1661 * Used to assign file specific control information
1662 * (indirect blocks) to the vnode to which they belong.
1663 */
1664void
1665reassignbuf(bp, newvp)
1666 register struct buf *bp;
1667 register struct vnode *newvp;
1668{
1669 struct vnode *vp;
1670 int delay;
1671
1672 if (newvp == NULL) {
1673 printf("reassignbuf: NULL");
1674 return;
1675 }
1676 vp = bp->b_vp;
1677 ++reassignbufcalls;
1678
1679 /*
1680 * B_PAGING flagged buffers cannot be reassigned because their vp
1681 * is not fully linked in.
1682 */
1683 if (bp->b_flags & B_PAGING)
1684 panic("cannot reassign paging buffer");
1685
1686 /*
1687 * Delete from old vnode list, if on one.
1688 */
1689 VI_LOCK(vp);
1690 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1691 buf_vlist_remove(bp);
1692 if (vp != newvp) {
1693 vdropl(bp->b_vp);
1694 bp->b_vp = NULL; /* for clarification */
1695 }
1696 }
1697 if (vp != newvp) {
1698 VI_UNLOCK(vp);
1699 VI_LOCK(newvp);
1700 }
1701 /*
1702 * If dirty, put on list of dirty buffers; otherwise insert onto list
1703 * of clean buffers.
1704 */
1705 if (bp->b_flags & B_DELWRI) {
1706 if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1707 switch (newvp->v_type) {
1708 case VDIR:
1709 delay = dirdelay;
1710 break;
1711 case VCHR:
1712 if (newvp->v_rdev->si_mountpoint != NULL) {
1713 delay = metadelay;
1714 break;
1715 }
1716 /* FALLTHROUGH */
1717 default:
1718 delay = filedelay;
1719 }
1720 vn_syncer_add_to_worklist(newvp, delay);
1721 }
1722 buf_vlist_add(bp, newvp, BX_VNDIRTY);
1723 } else {
1724 buf_vlist_add(bp, newvp, BX_VNCLEAN);
1725
1726 if ((newvp->v_iflag & VI_ONWORKLST) &&
1727 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1728 mtx_lock(&sync_mtx);
1729 LIST_REMOVE(newvp, v_synclist);
1730 mtx_unlock(&sync_mtx);
1731 newvp->v_iflag &= ~VI_ONWORKLST;
1732 }
1733 }
1734 if (bp->b_vp != newvp) {
1735 bp->b_vp = newvp;
1736 vholdl(bp->b_vp);
1737 }
1738 VI_UNLOCK(newvp);
1739}
1740
1741/*
1742 * Create a vnode for a device.
1743 * Used for mounting the root filesystem.
1744 */
1745int
1746bdevvp(dev, vpp)
1747 dev_t dev;
1748 struct vnode **vpp;
1749{
1750 register struct vnode *vp;
1751 struct vnode *nvp;
1752 int error;
1753
1754 if (dev == NODEV) {
1755 *vpp = NULLVP;
1756 return (ENXIO);
1757 }
1758 if (vfinddev(dev, VCHR, vpp))
1759 return (0);
1760 error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1761 if (error) {
1762 *vpp = NULLVP;
1763 return (error);
1764 }
1765 vp = nvp;
1766 vp->v_type = VCHR;
1767 addalias(vp, dev);
1768 *vpp = vp;
1769 return (0);
1770}
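/*
 * Illustrative sketch (hypothetical caller): root mounting code has
 * traditionally obtained its device vnode with something like
 *
 *	if ((error = bdevvp(rootdev, &rootvp)) != 0)
 *		panic("cannot get root device vnode");
 */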
1771
1772static void
1773v_incr_usecount(struct vnode *vp, int delta)
1774{
1775
1776 vp->v_usecount += delta;
1777 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1778 mtx_lock(&spechash_mtx);
1779 vp->v_rdev->si_usecount += delta;
1780 mtx_unlock(&spechash_mtx);
1781 }
1782}
1783
1784/*
1785 * Add vnode to the alias list hung off the dev_t.
1786 *
1787 * The reason for this gunk is that multiple vnodes can reference
1788 * the same physical device, so checking vp->v_usecount to see
1789 * how many users there are is inadequate; the v_usecount values of
1790 * all the vnodes need to be accumulated. vcount() does that.
1791 */
1792struct vnode *
1793addaliasu(nvp, nvp_rdev)
1794 struct vnode *nvp;
1795 udev_t nvp_rdev;
1796{
1797 struct vnode *ovp;
1798 vop_t **ops;
1799 dev_t dev;
1800
1801 if (nvp->v_type == VBLK)
1802 return (nvp);
1803 if (nvp->v_type != VCHR)
1804 panic("addaliasu on non-special vnode");
1805 dev = udev2dev(nvp_rdev);
1806 if (dev == NODEV)
1807 return (nvp);
1808 /*
1809 * Check to see if we have a bdevvp vnode with no associated
1810 * filesystem. If so, we want to associate the filesystem of
1811 * the newly created vnode with the bdevvp vnode and
1812 * discard the newly created vnode rather than leaving the
1813 * bdevvp vnode lying around with no associated filesystem.
1814 */
1815 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1816 addalias(nvp, dev);
1817 return (nvp);
1818 }
1819 /*
1820 * Discard unneeded vnode, but save its node specific data.
1821 * Note that if there is a lock, it is carried over in the
1822 * node specific data to the replacement vnode.
1823 */
1824 vref(ovp);
1825 ovp->v_data = nvp->v_data;
1826 ovp->v_tag = nvp->v_tag;
1827 nvp->v_data = NULL;
1828 lockdestroy(ovp->v_vnlock);
1829 lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
1830 nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
1831 ops = ovp->v_op;
1832 ovp->v_op = nvp->v_op;
1833 if (VOP_ISLOCKED(nvp, curthread)) {
1834 VOP_UNLOCK(nvp, 0, curthread);
1835 vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1836 }
1837 nvp->v_op = ops;
1838 insmntque(ovp, nvp->v_mount);
1839 vrele(nvp);
1840 vgone(nvp);
1841 return (ovp);
1842}
1843
1844/* This is a local helper function that does the same as addaliasu(), but
1845 * for a dev_t instead of a udev_t. */
1846static void
1847addalias(nvp, dev)
1848 struct vnode *nvp;
1849 dev_t dev;
1850{
1851
1852 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1853 dev_ref(dev);
1854 nvp->v_rdev = dev;
1855 VI_LOCK(nvp);
1856 mtx_lock(&spechash_mtx);
1857 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1858 dev->si_usecount += nvp->v_usecount;
1859 mtx_unlock(&spechash_mtx);
1860 VI_UNLOCK(nvp);
1861}
1862
1863/*
1864 * Grab a particular vnode from the free list, increment its
1865 * reference count and lock it. The vnode lock bit is set if the
1866 * vnode is being eliminated in vgone. The process is awakened
1867 * when the transition is completed, and an error returned to
1868 * indicate that the vnode is no longer usable (possibly having
1869 * been changed to a new filesystem type).
1870 */
1871int
1872vget(vp, flags, td)
1873 register struct vnode *vp;
1874 int flags;
1875 struct thread *td;
1876{
1877 int error;
1878
1879 /*
1880 * If the vnode is in the process of being cleaned out for
1881 * another use, we wait for the cleaning to finish and then
1882 * return failure. Cleaning is determined by checking that
1883 * the VI_XLOCK flag is set.
1884 */
1885 if ((flags & LK_INTERLOCK) == 0)
1886 VI_LOCK(vp);
1887 if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
1888 if ((flags & LK_NOWAIT) == 0) {
1889 vp->v_iflag |= VI_XWANT;
1890 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
1891 return (ENOENT);
1892 }
1893 VI_UNLOCK(vp);
1894 return (EBUSY);
1895 }
1896
1897 v_incr_usecount(vp, 1);
1898
1899 if (VSHOULDBUSY(vp))
1900 vbusy(vp);
1901 if (flags & LK_TYPE_MASK) {
1902 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1903 /*
1904 * must expand vrele here because we do not want
1905 * to call VOP_INACTIVE if the reference count
1906 * drops back to zero since it was never really
1907 * active. We must remove it from the free list
1908 * before sleeping so that multiple processes do
1909 * not try to recycle it.
1910 */
1911 VI_LOCK(vp);
1912 v_incr_usecount(vp, -1);
1913 if (VSHOULDFREE(vp))
1914 vfree(vp);
1915 else
1916 vlruvp(vp);
1917 VI_UNLOCK(vp);
1918 }
1919 return (error);
1920 }
1921 VI_UNLOCK(vp);
1922 return (0);
1923}
1924
1925/*
1926 * Increase the reference count of a vnode.
1927 */
1928void
1929vref(struct vnode *vp)
1930{
1931
1932 VI_LOCK(vp);
1933 v_incr_usecount(vp, 1);
1934 VI_UNLOCK(vp);
1935}
1936
1937/*
1938 * Return reference count of a vnode.
1939 *
1940 * The results of this call are only guaranteed when some mechanism other
1941 * than the VI lock is used to stop other processes from gaining references
1942 * to the vnode. This may be the case if the caller holds the only reference.
1943 * This is also useful when stale data is acceptable as race conditions may
1944 * be accounted for by some other means.
1945 */
1946int
1947vrefcnt(struct vnode *vp)
1948{
1949 int usecnt;
1950
1951 VI_LOCK(vp);
1952 usecnt = vp->v_usecount;
1953 VI_UNLOCK(vp);
1954
1955 return (usecnt);
1956}
1957
1958
1959/*
1960 * Vnode put/release.
1961 * If count drops to zero, call inactive routine and return to freelist.
1962 */
1963void
1964vrele(vp)
1965 struct vnode *vp;
1966{
1967 struct thread *td = curthread; /* XXX */
1968
1969 KASSERT(vp != NULL, ("vrele: null vp"));
1970
1971 VI_LOCK(vp);
1972
1973 /* Skip this v_writecount check if we're going to panic below. */
1974 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1975 ("vrele: missed vn_close"));
1976
1977 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
1978 vp->v_usecount == 1)) {
1979 v_incr_usecount(vp, -1);
1980 VI_UNLOCK(vp);
1981
1982 return;
1983 }
1984
1985 if (vp->v_usecount == 1) {
1986 v_incr_usecount(vp, -1);
1987 /*
1988 * We must call VOP_INACTIVE with the node locked. Mark
1989 * as VI_DOINGINACT to avoid recursion.
1990 */
1991 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
1992 VI_LOCK(vp);
1993 vp->v_iflag |= VI_DOINGINACT;
1994 VI_UNLOCK(vp);
1995 VOP_INACTIVE(vp, td);
1996 VI_LOCK(vp);
1997 KASSERT(vp->v_iflag & VI_DOINGINACT,
1998 ("vrele: lost VI_DOINGINACT"));
1999 vp->v_iflag &= ~VI_DOINGINACT;
2000 } else
2001 VI_LOCK(vp);
2002 if (VSHOULDFREE(vp))
2003 vfree(vp);
2004 else
2005 vlruvp(vp);
2006 VI_UNLOCK(vp);
2007
2008 } else {
2009#ifdef DIAGNOSTIC
2010 vprint("vrele: negative ref count", vp);
2011#endif
2012 VI_UNLOCK(vp);
2013 panic("vrele: negative ref cnt");
2014 }
2015}
2016
2017/*
2018 * Release an already locked vnode. This gives the same effect as
2019 * unlock+vrele(), but takes less time and avoids releasing and
2020 * re-acquiring the lock (as vrele() acquires the lock internally).
2021 */
2022void
2023vput(vp)
2024 struct vnode *vp;
2025{
2026 struct thread *td = curthread; /* XXX */
2027
2028 GIANT_REQUIRED;
2029
2030 KASSERT(vp != NULL, ("vput: null vp"));
2031 VI_LOCK(vp);
2032 /* Skip this v_writecount check if we're going to panic below. */
2033 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2034 ("vput: missed vn_close"));
2035
2036 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2037 vp->v_usecount == 1)) {
2038 v_incr_usecount(vp, -1);
2039 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2040 return;
2041 }
2042
2043 if (vp->v_usecount == 1) {
2044 v_incr_usecount(vp, -1);
2045 /*
2046 * We must call VOP_INACTIVE with the node locked, so
2047 * we just need to release the vnode mutex. Mark
2048 * as VI_DOINGINACT to avoid recursion.
2049 */
2050 vp->v_iflag |= VI_DOINGINACT;
2051 VI_UNLOCK(vp);
2052 VOP_INACTIVE(vp, td);
2053 VI_LOCK(vp);
2054 KASSERT(vp->v_iflag & VI_DOINGINACT,
2055 ("vput: lost VI_DOINGINACT"));
2056 vp->v_iflag &= ~VI_DOINGINACT;
2057 if (VSHOULDFREE(vp))
2058 vfree(vp);
2059 else
2060 vlruvp(vp);
2061 VI_UNLOCK(vp);
2062
2063 } else {
2064#ifdef DIAGNOSTIC
2065 vprint("vput: negative ref count", vp);
2066#endif
2067 panic("vput: negative ref cnt");
2068 }
2069}
2070
2071/*
2072 * Somebody doesn't want the vnode recycled.
2073 */
2074void
2075vhold(struct vnode *vp)
2076{
2077
2078 VI_LOCK(vp);
2079 vholdl(vp);
2080 VI_UNLOCK(vp);
2081}
2082
2083void
2084vholdl(vp)
2085 register struct vnode *vp;
2086{
2087
2088 vp->v_holdcnt++;
2089 if (VSHOULDBUSY(vp))
2090 vbusy(vp);
2091}
2092
2093/*
2094 * Note that there is one less who cares about this vnode. vdrop() is the
2095 * opposite of vhold().
2096 */
2097void
2098vdrop(struct vnode *vp)
2099{
2100
2101 VI_LOCK(vp);
2102 vdropl(vp);
2103 VI_UNLOCK(vp);
2104}
2105
2106void
2107vdropl(vp)
2108 register struct vnode *vp;
2109{
2110
2111 if (vp->v_holdcnt <= 0)
2112 panic("vdrop: holdcnt");
2113 vp->v_holdcnt--;
2114 if (VSHOULDFREE(vp))
2115 vfree(vp);
2116 else
2117 vlruvp(vp);
2118}
2119
2120/*
2121 * Remove any vnodes in the vnode table belonging to mount point mp.
2122 *
2123 * If FORCECLOSE is not specified, there should not be any active ones,
2124 * return error if any are found (nb: this is a user error, not a
2125 * system error). If FORCECLOSE is specified, detach any active vnodes
2126 * that are found.
2127 *
2128 * If WRITECLOSE is set, only flush out regular file vnodes open for
2129 * writing.
2130 *
2131 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2132 *
2133 * `rootrefs' specifies the base reference count for the root vnode
2134 * of this filesystem. The root vnode is considered busy if its
2135 * v_usecount exceeds this value. On a successful return, vflush()
2136 * will call vrele() on the root vnode exactly rootrefs times.
2137 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2138 * be zero.
2139 */
2140#ifdef DIAGNOSTIC
2141static int busyprt = 0; /* print out busy vnodes */
2142SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2143#endif
2144
2145int
2146vflush(mp, rootrefs, flags)
2147 struct mount *mp;
2148 int rootrefs;
2149 int flags;
2150{
2151 struct thread *td = curthread; /* XXX */
2152 struct vnode *vp, *nvp, *rootvp = NULL;
2153 struct vattr vattr;
2154 int busy = 0, error;
2155
2156 if (rootrefs > 0) {
2157 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2158 ("vflush: bad args"));
2159 /*
2160 * Get the filesystem root vnode. We can vput() it
2161 * immediately, since with rootrefs > 0, it won't go away.
2162 */
2163 if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2164 return (error);
2165 vput(rootvp);
2166
2167 }
2168 MNT_ILOCK(mp);
2169loop:
2170 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2171 /*
2172 * Make sure this vnode wasn't reclaimed in getnewvnode().
2173 * Start over if it was (it won't be on the list anymore).
2174 */
2175 if (vp->v_mount != mp)
2176 goto loop;
2177 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2178
2179 VI_LOCK(vp);
2180 MNT_IUNLOCK(mp);
2181 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2182 if (error) {
2183 MNT_ILOCK(mp);
2184 goto loop;
2185 }
2186 /*
2187 * Skip over vnodes marked VV_SYSTEM.
2188 */
2189 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2190 VOP_UNLOCK(vp, 0, td);
2191 MNT_ILOCK(mp);
2192 continue;
2193 }
2194 /*
2195 * If WRITECLOSE is set, flush out unlinked but still open
2196 * files (even if open only for reading) and regular file
2197 * vnodes open for writing.
2198 */
2199 if (flags & WRITECLOSE) {
2200 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2201 VI_LOCK(vp);
2202
2203 if ((vp->v_type == VNON ||
2204 (error == 0 && vattr.va_nlink > 0)) &&
2205 (vp->v_writecount == 0 || vp->v_type != VREG)) {
2206 VOP_UNLOCK(vp, LK_INTERLOCK, td);
2207 MNT_ILOCK(mp);
2208 continue;
2209 }
2210 } else
2211 VI_LOCK(vp);
2212
2213 VOP_UNLOCK(vp, 0, td);
2214
2215 /*
2216 * With v_usecount == 0, all we need to do is clear out the
2217 * vnode data structures and we are done.
2218 */
2219 if (vp->v_usecount == 0) {
2220 vgonel(vp, td);
2221 MNT_ILOCK(mp);
2222 continue;
2223 }
2224
2225 /*
2226 * If FORCECLOSE is set, forcibly close the vnode. For block
2227 * or character devices, revert to an anonymous device. For
2228 * all other files, just kill them.
2229 */
2230 if (flags & FORCECLOSE) {
2231 if (vp->v_type != VCHR)
2232 vgonel(vp, td);
2233 else
2234 vgonechrl(vp, td);
2235 MNT_ILOCK(mp);
2236 continue;
2237 }
2238#ifdef DIAGNOSTIC
2239 if (busyprt)
2240 vprint("vflush: busy vnode", vp);
2241#endif
2242 VI_UNLOCK(vp);
2243 MNT_ILOCK(mp);
2244 busy++;
2245 }
2246 MNT_IUNLOCK(mp);
2247 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2248 /*
2249 * If just the root vnode is busy, and if its refcount
2250 * is equal to `rootrefs', then go ahead and kill it.
2251 */
2252 VI_LOCK(rootvp);
2253 KASSERT(busy > 0, ("vflush: not busy"));
2254 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2255 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2256 vgonel(rootvp, td);
2257 busy = 0;
2258 } else
2259 VI_UNLOCK(rootvp);
2260 }
2261 if (busy)
2262 return (EBUSY);
2263 for (; rootrefs > 0; rootrefs--)
2264 vrele(rootvp);
2265 return (0);
2266}
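/*
 * Illustrative sketch (hypothetical filesystem code): an unmount
 * routine typically flushes its vnodes with
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	if ((error = vflush(mp, 0, flags)) != 0)
 *		return (error);
 *
 * passing a non-zero rootrefs only when the filesystem itself keeps
 * references on its root vnode.
 */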
2267
2268/*
2269 * This moves a now (likely recyclable) vnode to the end of the
2270 * mountlist. XXX However, it is temporarily disabled until we
2271 * can clean up ffs_sync() and friends, which have loop restart
2272 * conditions that this code causes to operate in O(N^2) time.
2273 */
2274static void
2275vlruvp(struct vnode *vp)
2276{
2277#if 0
2278 struct mount *mp;
2279
2280 if ((mp = vp->v_mount) != NULL) {
2281 MNT_ILOCK(mp);
2282 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2283 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2284 MNT_IUNLOCK(mp);
2285 }
2286#endif
2287}
2288
2289static void
2290vx_lock(struct vnode *vp)
2291{
2292
2293 ASSERT_VI_LOCKED(vp, "vx_lock");
2294
2295 /*
2296 * Prevent the vnode from being recycled or brought into use while we
2297 * clean it out.
2298 */
2299 if (vp->v_iflag & VI_XLOCK)
2300 panic("vclean: deadlock");
2301 vp->v_iflag |= VI_XLOCK;
2302 vp->v_vxthread = curthread;
2303}
2304
2305static void
2306vx_unlock(struct vnode *vp)
2307{
2308 ASSERT_VI_LOCKED(vp, "vx_unlock");
2309 vp->v_iflag &= ~VI_XLOCK;
2310 vp->v_vxthread = NULL;
2311 if (vp->v_iflag & VI_XWANT) {
2312 vp->v_iflag &= ~VI_XWANT;
2313 wakeup(vp);
2314 }
2315}
2316
2317/*
2318 * Disassociate the underlying filesystem from a vnode.
2319 */
2320static void
2321vclean(vp, flags, td)
2322 struct vnode *vp;
2323 int flags;
2324 struct thread *td;
2325{
2326 int active;
2327
2328 ASSERT_VI_LOCKED(vp, "vclean");
2329 /*
2330 * Check to see if the vnode is in use. If so we have to reference it
2331 * before we clean it out so that its count cannot fall to zero and
2332 * generate a race against ourselves to recycle it.
2333 */
2334 if ((active = vp->v_usecount))
2335 v_incr_usecount(vp, 1);
2336
2337 /*
2338 * Even if the count is zero, the VOP_INACTIVE routine may still
2339 * have the object locked while it cleans it out. The VOP_LOCK
2340 * ensures that the VOP_INACTIVE routine is done with its work.
2341 * For active vnodes, it ensures that no other activity can
2342 * occur while the underlying object is being cleaned out.
2343 */
2344 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2345
2346 /*
2347 * Clean out any buffers associated with the vnode.
2348 * If the flush fails, just toss the buffers.
2349 */
2350 if (flags & DOCLOSE) {
2351 struct buf *bp;
2352 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2353 if (bp != NULL)
2354 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2355 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2356 vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2357 }
2358
2359 VOP_DESTROYVOBJECT(vp);
2360
2361 /*
2362 * Any other processes trying to obtain this lock must first
2363	 * wait for VI_XLOCK to clear, then call the new lock operation.
2364 */
2365 VOP_UNLOCK(vp, 0, td);
2366
2367 /*
2368 * If purging an active vnode, it must be closed and
2369 * deactivated before being reclaimed. Note that the
2370 * VOP_INACTIVE will unlock the vnode.
2371 */
2372 if (active) {
2373 if (flags & DOCLOSE)
2374 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2375 VI_LOCK(vp);
2376 if ((vp->v_iflag & VI_DOINGINACT) == 0) {
2377 vp->v_iflag |= VI_DOINGINACT;
2378 VI_UNLOCK(vp);
2379 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2380 panic("vclean: cannot relock.");
2381 VOP_INACTIVE(vp, td);
2382 VI_LOCK(vp);
2383 KASSERT(vp->v_iflag & VI_DOINGINACT,
2384 ("vclean: lost VI_DOINGINACT"));
2385 vp->v_iflag &= ~VI_DOINGINACT;
2386 }
2387 VI_UNLOCK(vp);
2388 }
2389 /*
2390 * Reclaim the vnode.
2391 */
2392 if (VOP_RECLAIM(vp, td))
2393 panic("vclean: cannot reclaim");
2394
2395 if (active) {
2396 /*
2397 * Inline copy of vrele() since VOP_INACTIVE
2398 * has already been called.
2399 */
2400 VI_LOCK(vp);
2401 v_incr_usecount(vp, -1);
2402 if (vp->v_usecount <= 0) {
2403#ifdef INVARIANTS
2404 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2405 vprint("vclean: bad ref count", vp);
2406 panic("vclean: ref cnt");
2407 }
2408#endif
2409 if (VSHOULDFREE(vp))
2410 vfree(vp);
2411 }
2412 VI_UNLOCK(vp);
2413 }
2414 /*
2415 * Delete from old mount point vnode list.
2416 */
2417 if (vp->v_mount != NULL)
2418 insmntque(vp, (struct mount *)0);
2419 cache_purge(vp);
2420 VI_LOCK(vp);
2421 if (VSHOULDFREE(vp))
2422 vfree(vp);
2423
2424 /*
2425 * Done with purge, reset to the standard lock and
2426 * notify sleepers of the grim news.
2427 */
2428 vp->v_vnlock = &vp->v_lock;
2429 vp->v_op = dead_vnodeop_p;
2430 if (vp->v_pollinfo != NULL)
2431 vn_pollgone(vp);
2432 vp->v_tag = "none";
2433}
2434
2435/*
2436 * Eliminate all activity associated with the requested vnode
2437 * and with all vnodes aliased to the requested vnode.
2438 */
2439int
2440vop_revoke(ap)
2441 struct vop_revoke_args /* {
2442 struct vnode *a_vp;
2443 int a_flags;
2444 } */ *ap;
2445{
2446 struct vnode *vp, *vq;
2447 dev_t dev;
2448
2449 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2450 vp = ap->a_vp;
2451 KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2452
2453 VI_LOCK(vp);
2454 /*
2455 * If a vgone (or vclean) is already in progress,
2456 * wait until it is done and return.
2457 */
2458 if (vp->v_iflag & VI_XLOCK) {
2459 vp->v_iflag |= VI_XWANT;
2460 msleep(vp, VI_MTX(vp), PINOD | PDROP,
2461 "vop_revokeall", 0);
2462 return (0);
2463 }
2464 VI_UNLOCK(vp);
2465 dev = vp->v_rdev;
2466 for (;;) {
2467 mtx_lock(&spechash_mtx);
2468 vq = SLIST_FIRST(&dev->si_hlist);
2469 mtx_unlock(&spechash_mtx);
2470 if (vq == NULL)
2471 break;
2472 vgone(vq);
2473 }
2474 return (0);
2475}
2476
2477/*
2478 * Recycle an unused vnode to the front of the free list.
2479 * Release the passed interlock if the vnode will be recycled.
2480 */
2481int
2482vrecycle(vp, inter_lkp, td)
2483 struct vnode *vp;
2484 struct mtx *inter_lkp;
2485 struct thread *td;
2486{
2487
2488 VI_LOCK(vp);
2489 if (vp->v_usecount == 0) {
2490 if (inter_lkp) {
2491 mtx_unlock(inter_lkp);
2492 }
2493 vgonel(vp, td);
2494 return (1);
2495 }
2496 VI_UNLOCK(vp);
2497 return (0);
2498}
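/*
 * Editorial sketch, not part of the original file: a filesystem's
 * VOP_INACTIVE routine may call vrecycle() once it knows the underlying
 * object is gone, so the vnode can be reused immediately.  This is
 * modeled loosely on what a UFS-style inactive routine does; "ip" and
 * the i_mode test are illustrative assumptions.
 *
 *	VOP_UNLOCK(vp, 0, td);
 *	if (ip->i_mode == 0)
 *		(void) vrecycle(vp, NULL, td);
 *	return (error);
 */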
2499
2500/*
2501 * Eliminate all activity associated with a vnode
2502 * in preparation for reuse.
2503 */
2504void
2505vgone(vp)
2506 register struct vnode *vp;
2507{
2508 struct thread *td = curthread; /* XXX */
2509
2510 VI_LOCK(vp);
2511 vgonel(vp, td);
2512}
2513
2514/*
2515 * Disassociate a character device from its underlying filesystem and
2516 * attach it to spec. This is for use when the character device is still
2517 * active and the filesystem is going away.
2518 */
2519static void
2520vgonechrl(struct vnode *vp, struct thread *td)
2521{
2522 ASSERT_VI_LOCKED(vp, "vgonechrl");
2523 vx_lock(vp);
2524 /*
2525	 * This is a custom version of vclean() which does not tear down
2526 * the bufs or vm objects held by this vnode. This allows filesystems
2527 * to continue using devices which were discovered via another
2528 * filesystem that has been unmounted.
2529 */
2530 if (vp->v_usecount != 0) {
2531 v_incr_usecount(vp, 1);
2532 /*
2533 * Ensure that no other activity can occur while the
2534 * underlying object is being cleaned out.
2535 */
2536 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2537 /*
2538 * Any other processes trying to obtain this lock must first
2539		 * wait for VI_XLOCK to clear, then call the new lock operation.
2540 */
2541 VOP_UNLOCK(vp, 0, td);
2542 vp->v_vnlock = &vp->v_lock;
2543 vp->v_tag = "orphanchr";
2544 vp->v_op = spec_vnodeop_p;
2545 if (vp->v_mount != NULL)
2546 insmntque(vp, (struct mount *)0);
2547 cache_purge(vp);
2548 vrele(vp);
2549 VI_LOCK(vp);
2550 } else
2551 vclean(vp, 0, td);
2552 vp->v_op = spec_vnodeop_p;
2553 vx_unlock(vp);
2554 VI_UNLOCK(vp);
2555}
2556
2557/*
2558 * vgone, with the vp interlock held.
2559 */
2560void
2561vgonel(vp, td)
2562 struct vnode *vp;
2563 struct thread *td;
2564{
2565 /*
2566 * If a vgone (or vclean) is already in progress,
2567 * wait until it is done and return.
2568 */
2569 ASSERT_VI_LOCKED(vp, "vgonel");
2570 if (vp->v_iflag & VI_XLOCK) {
2571 vp->v_iflag |= VI_XWANT;
2572 msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2573 return;
2574 }
2575 vx_lock(vp);
2576
2577 /*
2578 * Clean out the filesystem specific data.
2579 */
2580 vclean(vp, DOCLOSE, td);
2581 VI_UNLOCK(vp);
2582
2583 /*
2584 * If special device, remove it from special device alias list
2585 * if it is on one.
2586 */
2587 VI_LOCK(vp);
2588 if (vp->v_type == VCHR && vp->v_rdev != NODEV) {
2589 mtx_lock(&spechash_mtx);
2590 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2591 vp->v_rdev->si_usecount -= vp->v_usecount;
2592 mtx_unlock(&spechash_mtx);
2593 dev_rel(vp->v_rdev);
2594 vp->v_rdev = NULL;
2595 }
2596
2597 /*
2598 * If it is on the freelist and not already at the head,
2599 * move it to the head of the list. The test of the
2600	 * VI_DOOMED flag and the reference count of zero is because
2601 * it will be removed from the free list by getnewvnode,
2602 * but will not have its reference count incremented until
2603 * after calling vgone. If the reference count were
2604 * incremented first, vgone would (incorrectly) try to
2605 * close the previous instance of the underlying object.
2606 */
2607 if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2608 mtx_lock(&vnode_free_list_mtx);
2609 if (vp->v_iflag & VI_FREE) {
2610 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2611 } else {
2612 vp->v_iflag |= VI_FREE;
2613 freevnodes++;
2614 }
2615 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2616 mtx_unlock(&vnode_free_list_mtx);
2617 }
2618
2619 vp->v_type = VBAD;
2620 vx_unlock(vp);
2621 VI_UNLOCK(vp);
2622}
2623
2624/*
2625 * Lookup a vnode by device number.
2626 */
2627int
2628vfinddev(dev, type, vpp)
2629 dev_t dev;
2630 enum vtype type;
2631 struct vnode **vpp;
2632{
2633 struct vnode *vp;
2634
2635 mtx_lock(&spechash_mtx);
2636 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2637 if (type == vp->v_type) {
2638 *vpp = vp;
2639 mtx_unlock(&spechash_mtx);
2640 return (1);
2641 }
2642 }
2643 mtx_unlock(&spechash_mtx);
2644 return (0);
2645}
2646
2647/*
2648 * Calculate the total number of references to a special device.
2649 */
2650int
2651vcount(vp)
2652 struct vnode *vp;
2653{
2654 int count;
2655
2656 mtx_lock(&spechash_mtx);
2657 count = vp->v_rdev->si_usecount;
2658 mtx_unlock(&spechash_mtx);
2659 return (count);
2660}
2661
2662/*
2663 * Same as above, but using a dev_t as the argument.
2664 */
2665int
2666count_dev(dev)
2667 dev_t dev;
2668{
2669 int count;
2670
2671 mtx_lock(&spechash_mtx);
2672 count = dev->si_usecount;
2673 mtx_unlock(&spechash_mtx);
2674 return(count);
2675}
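/*
 * Editorial note: for a VCHR vnode the two helpers above report the same
 * quantity, since si_usecount aggregates the usecounts of all vnodes
 * aliased to the device, e.g.:
 *
 *	if (vp->v_type == VCHR)
 *		KASSERT(vcount(vp) == count_dev(vp->v_rdev),
 *		    ("per-device usecount mismatch"));
 */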
2676
2677/*
2678 * Print out a description of a vnode.
2679 */
2680static char *typename[] =
2681{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2682
2683void
2684vprint(label, vp)
2685 char *label;
2686 struct vnode *vp;
2687{
2688 char buf[96];
2689
2690 if (label != NULL)
2691 printf("%s: %p: ", label, (void *)vp);
2692 else
2693 printf("%p: ", (void *)vp);
2694 printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2695 vp->v_tag, typename[vp->v_type], vp->v_usecount,
2696 vp->v_writecount, vp->v_holdcnt);
2697 buf[0] = '\0';
2698 if (vp->v_vflag & VV_ROOT)
2699 strcat(buf, "|VV_ROOT");
2700 if (vp->v_vflag & VV_TEXT)
2701 strcat(buf, "|VV_TEXT");
2702 if (vp->v_vflag & VV_SYSTEM)
2703 strcat(buf, "|VV_SYSTEM");
2704 if (vp->v_iflag & VI_XLOCK)
2705 strcat(buf, "|VI_XLOCK");
2706 if (vp->v_iflag & VI_XWANT)
2707 strcat(buf, "|VI_XWANT");
2708 if (vp->v_iflag & VI_BWAIT)
2709 strcat(buf, "|VI_BWAIT");
2710 if (vp->v_iflag & VI_DOOMED)
2711 strcat(buf, "|VI_DOOMED");
2712 if (vp->v_iflag & VI_FREE)
2713 strcat(buf, "|VI_FREE");
2714 if (vp->v_vflag & VV_OBJBUF)
2715 strcat(buf, "|VV_OBJBUF");
2716 if (buf[0] != '\0')
2717 printf(" flags (%s),", &buf[1]);
2718 lockmgr_printinfo(vp->v_vnlock);
2719 printf("\n");
2720 if (vp->v_data != NULL)
2721 VOP_PRINT(vp);
2722}
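/*
 * Editorial note: given the printf formats above, vprint() output looks
 * roughly like the following (the values are illustrative only):
 *
 *	vflush: busy vnode: 0xc23a1b40: tag ufs, type VREG, usecount 1,
 *	writecount 1, refcount 3, flags (VV_OBJBUF)
 */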
2723
2724#ifdef DDB
2725#include <ddb/ddb.h>
2726/*
2727 * List all of the locked vnodes in the system.
2728 * Called when debugging the kernel.
2729 */
2730DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2731{
2732 struct mount *mp, *nmp;
2733 struct vnode *vp;
2734
2735 /*
2736 * Note: because this is DDB, we can't obey the locking semantics
2737 * for these structures, which means we could catch an inconsistent
2738 * state and dereference a nasty pointer. Not much to be done
2739 * about that.
2740 */
2741 printf("Locked vnodes\n");
2742 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2743 nmp = TAILQ_NEXT(mp, mnt_list);
2744 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2745 if (VOP_ISLOCKED(vp, NULL))
2746 vprint(NULL, vp);
2747 }
2749 }
2750}
2751#endif
2752
2753/*
2754 * Fill in a struct xvfsconf based on a struct vfsconf.
2755 */
2756static void
2757vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2758{
2759
2760 strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2761 xvfsp->vfc_typenum = vfsp->vfc_typenum;
2762 xvfsp->vfc_refcount = vfsp->vfc_refcount;
2763 xvfsp->vfc_flags = vfsp->vfc_flags;
2764 /*
2765 * These are unused in userland, we keep them
2766 * to not break binary compatibility.
2767 */
2768 xvfsp->vfc_vfsops = NULL;
2769 xvfsp->vfc_next = NULL;
2770}
2771
2772static int
2773sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2774{
2775 struct vfsconf *vfsp;
2776 struct xvfsconf *xvfsp;
2777 int cnt, error, i;
2778
2779 cnt = 0;
2780 for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2781 cnt++;
2782 xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2783 /*
2784	 * Handle the race we will have here once struct vfsconf is
2785	 * locked down, by using both cnt and a check of vfc_next
2786	 * against NULL to determine the end of the loop. The race
2787	 * arises because we will have to unlock before calling malloc().
2788	 * For now we are protected by Giant.
2789 */
2790 i = 0;
2791 for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2792 vfsconf2x(vfsp, xvfsp + i);
2793 i++;
2794 }
2795 error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2796 free(xvfsp, M_TEMP);
2797 return (error);
2798}
2799
2800SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2801 "S,xvfsconf", "List of all configured filesystems");
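/*
 * Editorial sketch, not part of the original file: from userland the list
 * exported above can be read with sysctl(3); getvfsbyname(3) issues a
 * query much like this one.  Error handling is elided for brevity.
 *
 *	struct xvfsconf *xvfsp;
 *	size_t buflen;
 *	u_int i;
 *
 *	sysctlbyname("vfs.conflist", NULL, &buflen, NULL, 0);
 *	xvfsp = malloc(buflen);
 *	sysctlbyname("vfs.conflist", xvfsp, &buflen, NULL, 0);
 *	for (i = 0; i < buflen / sizeof(*xvfsp); i++)
 *		printf("%s\n", xvfsp[i].vfc_name);
 *	free(xvfsp);
 */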
2802
2803/*
2804 * Top-level filesystem-related information gathering.
2805 */
2806static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2807
2808static int
2809vfs_sysctl(SYSCTL_HANDLER_ARGS)
2810{
2811 int *name = (int *)arg1 - 1; /* XXX */
2812 u_int namelen = arg2 + 1; /* XXX */
2813 struct vfsconf *vfsp;
2814 struct xvfsconf xvfsp;
2815
2816 printf("WARNING: userland calling deprecated sysctl, "
2817 "please rebuild world\n");
2818
2819#if 1 || defined(COMPAT_PRELITE2)
2820 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2821 if (namelen == 1)
2822 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2823#endif
2824
2825 switch (name[1]) {
2826 case VFS_MAXTYPENUM:
2827 if (namelen != 2)
2828 return (ENOTDIR);
2829 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2830 case VFS_CONF:
2831 if (namelen != 3)
2832 return (ENOTDIR); /* overloaded */
2833 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2834 if (vfsp->vfc_typenum == name[2])
2835 break;
2836 if (vfsp == NULL)
2837 return (EOPNOTSUPP);
2838 vfsconf2x(vfsp, &xvfsp);
2839 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2840 }
2841 return (EOPNOTSUPP);
2842}
2843
2844SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2845 "Generic filesystem");
2846
2847#if 1 || defined(COMPAT_PRELITE2)
2848
2849static int
2850sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2851{
2852 int error;
2853 struct vfsconf *vfsp;
2854 struct ovfsconf ovfs;
2855
2856 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2857 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2858 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2859 ovfs.vfc_index = vfsp->vfc_typenum;
2860 ovfs.vfc_refcount = vfsp->vfc_refcount;
2861 ovfs.vfc_flags = vfsp->vfc_flags;
2862 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2863 if (error)
2864 return error;
2865 }
2866 return 0;
2867}
2868
2869#endif /* 1 || COMPAT_PRELITE2 */
2870
2871#define KINFO_VNODESLOP 10
2872#ifdef notyet
2873/*
2874 * Dump vnode list (via sysctl).
2875 */
2876/* ARGSUSED */
2877static int
2878sysctl_vnode(SYSCTL_HANDLER_ARGS)
2879{
2880 struct xvnode *xvn;
2881 struct thread *td = req->td;
2882 struct mount *mp;
2883 struct vnode *vp;
2884 int error, len, n;
2885
2886 /*
2887 * Stale numvnodes access is not fatal here.
2888 */
2889 req->lock = 0;
2890 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2891 if (!req->oldptr)
2892 /* Make an estimate */
2893 return (SYSCTL_OUT(req, 0, len));
2894
2895 error = sysctl_wire_old_buffer(req, 0);
2896 if (error != 0)
2897 return (error);
2898 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2899 n = 0;
2900 mtx_lock(&mountlist_mtx);
2901 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2902 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2903 continue;
2904 MNT_ILOCK(mp);
2905 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2906 if (n == len)
2907 break;
2908 vref(vp);
2909 xvn[n].xv_size = sizeof *xvn;
2910 xvn[n].xv_vnode = vp;
2911#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2912 XV_COPY(usecount);
2913 XV_COPY(writecount);
2914 XV_COPY(holdcnt);
2915 XV_COPY(id);
2916 XV_COPY(mount);
2917 XV_COPY(numoutput);
2918 XV_COPY(type);
2919#undef XV_COPY
2920 xvn[n].xv_flag = vp->v_vflag;
2921
2922 switch (vp->v_type) {
2923 case VREG:
2924 case VDIR:
2925 case VLNK:
2926 xvn[n].xv_dev = vp->v_cachedfs;
2927 xvn[n].xv_ino = vp->v_cachedid;
2928 break;
2929 case VBLK:
2930 case VCHR:
2931 if (vp->v_rdev == NULL) {
2932 vrele(vp);
2933 continue;
2934 }
2935 xvn[n].xv_dev = dev2udev(vp->v_rdev);
2936 break;
2937 case VSOCK:
2938 xvn[n].xv_socket = vp->v_socket;
2939 break;
2940 case VFIFO:
2941 xvn[n].xv_fifo = vp->v_fifoinfo;
2942 break;
2943 case VNON:
2944 case VBAD:
2945 default:
2946 /* shouldn't happen? */
2947 vrele(vp);
2948 continue;
2949 }
2950 vrele(vp);
2951 ++n;
2952 }
2953 MNT_IUNLOCK(mp);
2954 mtx_lock(&mountlist_mtx);
2955 vfs_unbusy(mp, td);
2956 if (n == len)
2957 break;
2958 }
2959 mtx_unlock(&mountlist_mtx);
2960
2961 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2962 free(xvn, M_TEMP);
2963 return (error);
2964}
2965
2966SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2967 0, 0, sysctl_vnode, "S,xvnode", "");
2968#endif
2969
2970/*
2971 * Check to see if a filesystem is mounted on a block device.
2972 */
2973int
2974vfs_mountedon(vp)
2975 struct vnode *vp;
2976{
2977
2978 if (vp->v_rdev->si_mountpoint != NULL)
2979 return (EBUSY);
2980 return (0);
2981}
2982
2983/*
2984 * Unmount all filesystems. The list is traversed in reverse order
2985 * of mounting to avoid dependencies.
2986 */
2987void
2988vfs_unmountall()
2989{
2990 struct mount *mp;
2991 struct thread *td;
2992 int error;
2993
2994 if (curthread != NULL)
2995 td = curthread;
2996 else
2997 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
2998 /*
2999 * Since this only runs when rebooting, it is not interlocked.
3000 */
3001	while (!TAILQ_EMPTY(&mountlist)) {
3002 mp = TAILQ_LAST(&mountlist, mntlist);
3003 error = dounmount(mp, MNT_FORCE, td);
3004 if (error) {
3005 TAILQ_REMOVE(&mountlist, mp, mnt_list);
3006 printf("unmount of %s failed (",
3007 mp->mnt_stat.f_mntonname);
3008 if (error == EBUSY)
3009 printf("BUSY)\n");
3010 else
3011 printf("%d)\n", error);
3012 } else {
3013 /* The unmount has removed mp from the mountlist */
3014 }
3015 }
3016}
3017
3018/*
3019 * Perform msync on all vnodes under a mount point.
3020 * The mount point must be locked.
3021 */
3022void
3023vfs_msync(struct mount *mp, int flags)
3024{
3025 struct vnode *vp, *nvp;
3026 struct vm_object *obj;
3027 int tries;
3028
3029 GIANT_REQUIRED;
3030
3031 tries = 5;
3032 MNT_ILOCK(mp);
3033loop:
3034 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
3035 if (vp->v_mount != mp) {
3036 if (--tries > 0)
3037 goto loop;
3038 break;
3039 }
3040 nvp = TAILQ_NEXT(vp, v_nmntvnodes);
3041
3042 VI_LOCK(vp);
3043 if (vp->v_iflag & VI_XLOCK) {
3044 VI_UNLOCK(vp);
3045 continue;
3046 }
3047
3048 if ((vp->v_iflag & VI_OBJDIRTY) &&
3049 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3050 MNT_IUNLOCK(mp);
3051 if (!vget(vp,
3052 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3053 curthread)) {
3054 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
3055 vput(vp);
3056 MNT_ILOCK(mp);
3057 continue;
3058 }
3059
3060 if (VOP_GETVOBJECT(vp, &obj) == 0) {
3061 VM_OBJECT_LOCK(obj);
3062 vm_object_page_clean(obj, 0, 0,
3063 flags == MNT_WAIT ?
3064 OBJPC_SYNC : OBJPC_NOSYNC);
3065 VM_OBJECT_UNLOCK(obj);
3066 }
3067 vput(vp);
3068 }
3069 MNT_ILOCK(mp);
3070 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3071 if (--tries > 0)
3072 goto loop;
3073 break;
3074 }
3075 } else
3076 VI_UNLOCK(vp);
3077 }
3078 MNT_IUNLOCK(mp);
3079}
3080
3081/*
3082 * Create the VM object needed for VMIO and mmap support. This
3083 * is done for all VREG files in the system. Some filesystems might
3084 * gain the additional metadata buffering capability of the
3085 * VMIO code by making the device node be VMIO mode also.
3086 *
3087 * vp must be locked when vfs_object_create is called.
3088 */
3089int
3090vfs_object_create(vp, td, cred)
3091 struct vnode *vp;
3092 struct thread *td;
3093 struct ucred *cred;
3094{
3095
3096 GIANT_REQUIRED;
3097 return (VOP_CREATEVOBJECT(vp, cred, td));
3098}
3099
3100/*
3101 * Mark a vnode as free, putting it up for recycling.
3102 */
3103void
3104vfree(vp)
3105 struct vnode *vp;
3106{
3107
3108 ASSERT_VI_LOCKED(vp, "vfree");
3109 mtx_lock(&vnode_free_list_mtx);
3110 KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3111 if (vp->v_iflag & VI_AGE) {
3112 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3113 } else {
3114 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3115 }
3116 freevnodes++;
3117 mtx_unlock(&vnode_free_list_mtx);
3118 vp->v_iflag &= ~VI_AGE;
3119 vp->v_iflag |= VI_FREE;
3120}
3121
3122/*
3123 * Opposite of vfree() - mark a vnode as in use.
3124 */
3125void
3126vbusy(vp)
3127 struct vnode *vp;
3128{
3129
3130 ASSERT_VI_LOCKED(vp, "vbusy");
3131 KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3132
3133 mtx_lock(&vnode_free_list_mtx);
3134 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3135 freevnodes--;
3136 mtx_unlock(&vnode_free_list_mtx);
3137
3138 vp->v_iflag &= ~(VI_FREE|VI_AGE);
3139}
3140
3141/*
3142 * Initialize per-vnode helper structure to hold poll-related state.
3143 */
3144void
3145v_addpollinfo(struct vnode *vp)
3146{
3147
3148 vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
3149 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3150}
3151
3152/*
3153 * Record a process's interest in events which might happen to
3154 * a vnode. Because poll uses the historic select-style interface
3155 * internally, this routine serves as both the ``check for any
3156 * pending events'' and the ``record my interest in future events''
3157 * functions. (These are done together, while the lock is held,
3158 * to avoid race conditions.)
3159 */
3160int
3161vn_pollrecord(vp, td, events)
3162 struct vnode *vp;
3163 struct thread *td;
3164 short events;
3165{
3166
3167 if (vp->v_pollinfo == NULL)
3168 v_addpollinfo(vp);
3169 mtx_lock(&vp->v_pollinfo->vpi_lock);
3170 if (vp->v_pollinfo->vpi_revents & events) {
3171 /*
3172 * This leaves events we are not interested
3173		 * in available for the other process which
3174		 * presumably had requested them
3175 * (otherwise they would never have been
3176 * recorded).
3177 */
3178 events &= vp->v_pollinfo->vpi_revents;
3179 vp->v_pollinfo->vpi_revents &= ~events;
3180
3181 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3182 return events;
3183 }
3184 vp->v_pollinfo->vpi_events |= events;
3185 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3186 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3187 return 0;
3188}
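/*
 * Editorial sketch, not part of the original file: a filesystem's
 * VOP_POLL implementation typically answers immediately when it can and
 * otherwise records the caller's interest here, along the lines of the
 * stock vop_stdpoll():
 *
 *	if (ap->a_events & ~POLLSTANDARD)
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 */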
3189
3190/*
3191 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
3192 * it is possible for us to miss an event due to race conditions, but
3193 * that condition is expected to be rare, so for the moment it is the
3194 * preferred interface.
3195 */
3196void
3197vn_pollevent(vp, events)
3198 struct vnode *vp;
3199 short events;
3200{
3201
3202 if (vp->v_pollinfo == NULL)
3203 v_addpollinfo(vp);
3204 mtx_lock(&vp->v_pollinfo->vpi_lock);
3205 if (vp->v_pollinfo->vpi_events & events) {
3206 /*
3207 * We clear vpi_events so that we don't
3208 * call selwakeup() twice if two events are
3209 * posted before the polling process(es) is
3210 * awakened. This also ensures that we take at
3211 * most one selwakeup() if the polling process
3212 * is no longer interested. However, it does
3213 * mean that only one event can be noticed at
3214 * a time. (Perhaps we should only clear those
3215 * event bits which we note?) XXX
3216 */
3217 vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */
3218 vp->v_pollinfo->vpi_revents |= events;
3219 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3220 }
3221 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3222}
3223
3224/*
3225 * Wake up anyone polling on vp because it is being revoked.
3226 * This depends on dead_poll() returning POLLHUP for correct
3227 * behavior.
3228 */
3229void
3230vn_pollgone(vp)
3231 struct vnode *vp;
3232{
3233
3234 mtx_lock(&vp->v_pollinfo->vpi_lock);
3235 VN_KNOTE(vp, NOTE_REVOKE);
3236 if (vp->v_pollinfo->vpi_events) {
3237 vp->v_pollinfo->vpi_events = 0;
3238 selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3239 }
3240 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3241}
3242
3243
3244
3245/*
3246 * Routine to create and manage a filesystem syncer vnode.
3247 */
3248#define sync_close ((int (*)(struct vop_close_args *))nullop)
3249static int sync_fsync(struct vop_fsync_args *);
3250static int sync_inactive(struct vop_inactive_args *);
3251static int sync_reclaim(struct vop_reclaim_args *);
3252
3253static vop_t **sync_vnodeop_p;
3254static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3255 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
3256 { &vop_close_desc, (vop_t *) sync_close }, /* close */
3257 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
3258 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
3259 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
3260 { &vop_lock_desc, (vop_t *) vop_stdlock }, /* lock */
3261 { &vop_unlock_desc, (vop_t *) vop_stdunlock }, /* unlock */
3262 { &vop_islocked_desc, (vop_t *) vop_stdislocked }, /* islocked */
3263 { NULL, NULL }
3264};
3265static struct vnodeopv_desc sync_vnodeop_opv_desc =
3266 { &sync_vnodeop_p, sync_vnodeop_entries };
3267
3268VNODEOP_SET(sync_vnodeop_opv_desc);
3269
3270/*
3271 * Create a new filesystem syncer vnode for the specified mount point.
3272 */
3273int
3274vfs_allocate_syncvnode(mp)
3275 struct mount *mp;
3276{
3277 struct vnode *vp;
3278 static long start, incr, next;
3279 int error;
3280
3281 /* Allocate a new vnode */
3282 if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3283 mp->mnt_syncer = NULL;
3284 return (error);
3285 }
3286 vp->v_type = VNON;
3287 /*
3288 * Place the vnode onto the syncer worklist. We attempt to
3289 * scatter them about on the list so that they will go off
3290 * at evenly distributed times even if all the filesystems
3291 * are mounted at once.
3292 */
3293 next += incr;
3294 if (next == 0 || next > syncer_maxdelay) {
3295 start /= 2;
3296 incr /= 2;
3297 if (start == 0) {
3298 start = syncer_maxdelay / 2;
3299 incr = syncer_maxdelay;
3300 }
3301 next = start;
3302 }
3303 VI_LOCK(vp);
3304 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3305 VI_UNLOCK(vp);
3306 mp->mnt_syncer = vp;
3307 return (0);
3308}
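/*
 * Editorial note: with the code above and syncer_maxdelay == 32, the
 * static variable "next" takes the values 16, 8, 24, 4, 12, 20, 28,
 * 2, 6, and so on for successive mounts (a binary subdivision of the
 * delay range); each syncer vnode is queued at slot (next % syncdelay),
 * so their periodic syncs stay spread out in time.
 */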
3309
3310/*
3311 * Do a lazy sync of the filesystem.
3312 */
3313static int
3314sync_fsync(ap)
3315 struct vop_fsync_args /* {
3316 struct vnode *a_vp;
3317 struct ucred *a_cred;
3318 int a_waitfor;
3319 struct thread *a_td;
3320 } */ *ap;
3321{
3322 struct vnode *syncvp = ap->a_vp;
3323 struct mount *mp = syncvp->v_mount;
3324 struct thread *td = ap->a_td;
3325 int error, asyncflag;
3326
3327 /*
3328 * We only need to do something if this is a lazy evaluation.
3329 */
3330 if (ap->a_waitfor != MNT_LAZY)
3331 return (0);
3332
3333 /*
3334 * Move ourselves to the back of the sync list.
3335 */
3336 VI_LOCK(syncvp);
3337 vn_syncer_add_to_worklist(syncvp, syncdelay);
3338 VI_UNLOCK(syncvp);
3339
3340 /*
3341 * Walk the list of vnodes pushing all that are dirty and
3342 * not already on the sync list.
3343 */
3344 mtx_lock(&mountlist_mtx);
3345 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3346 mtx_unlock(&mountlist_mtx);
3347 return (0);
3348 }
3349 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3350 vfs_unbusy(mp, td);
3351 return (0);
3352 }
3353 asyncflag = mp->mnt_flag & MNT_ASYNC;
3354 mp->mnt_flag &= ~MNT_ASYNC;
3355 vfs_msync(mp, MNT_NOWAIT);
3356 error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3357 if (asyncflag)
3358 mp->mnt_flag |= MNT_ASYNC;
3359 vn_finished_write(mp);
3360 vfs_unbusy(mp, td);
3361 return (error);
3362}
3363
3364/*
3365 * The syncer vnode is no longer referenced.
3366 */
3367static int
3368sync_inactive(ap)
3369 struct vop_inactive_args /* {
3370 struct vnode *a_vp;
3371 struct thread *a_td;
3372 } */ *ap;
3373{
3374
3375 VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3376 vgone(ap->a_vp);
3377 return (0);
3378}
3379
3380/*
3381 * The syncer vnode is no longer needed and is being decommissioned.
3382 *
3383 * Modifications to the worklist must be protected by sync_mtx.
3384 */
3385static int
3386sync_reclaim(ap)
3387 struct vop_reclaim_args /* {
3388 struct vnode *a_vp;
3389 } */ *ap;
3390{
3391 struct vnode *vp = ap->a_vp;
3392
3393 VI_LOCK(vp);
3394 vp->v_mount->mnt_syncer = NULL;
3395 if (vp->v_iflag & VI_ONWORKLST) {
3396 mtx_lock(&sync_mtx);
3397 LIST_REMOVE(vp, v_synclist);
3398 mtx_unlock(&sync_mtx);
3399 vp->v_iflag &= ~VI_ONWORKLST;
3400 }
3401 VI_UNLOCK(vp);
3402
3403 return (0);
3404}
3405
3406/*
3407 * Extract the dev_t from a VCHR vnode.
3408 */
3409dev_t
3410vn_todev(vp)
3411 struct vnode *vp;
3412{
3413
3414 if (vp->v_type != VCHR)
3415 return (NODEV);
3416 return (vp->v_rdev);
3417}
3418
3419/*
3420 * Check whether a vnode represents a disk device.
3421 */
3422int
3423vn_isdisk(vp, errp)
3424 struct vnode *vp;
3425 int *errp;
3426{
3427 int error;
3428
3429 error = 0;
3430 if (vp->v_type != VCHR)
3431 error = ENOTBLK;
3432 else if (vp->v_rdev == NULL)
3433 error = ENXIO;
3434 else if (!(devsw(vp->v_rdev)->d_flags & D_DISK))
3435 error = ENOTBLK;
3436 if (errp != NULL)
3437 *errp = error;
3438 return (error == 0);
3439}
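/*
 * Editorial sketch, not part of the original file: callers that require a
 * disk device typically check the return value and propagate the errno
 * written through errp; the FFS mount path follows essentially this
 * pattern.
 *
 *	if (!vn_isdisk(devvp, &error)) {
 *		vrele(devvp);
 *		return (error);
 *	}
 */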
3440
3441/*
3442 * Free data allocated by namei(); see namei(9) for details.
3443 */
3444void
3445NDFREE(ndp, flags)
3446 struct nameidata *ndp;
3447 const u_int flags;
3448{
3449
3450 if (!(flags & NDF_NO_FREE_PNBUF) &&
3451 (ndp->ni_cnd.cn_flags & HASBUF)) {
3452 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3453 ndp->ni_cnd.cn_flags &= ~HASBUF;
3454 }
3455 if (!(flags & NDF_NO_DVP_UNLOCK) &&
3456 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3457 ndp->ni_dvp != ndp->ni_vp)
3458 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3459 if (!(flags & NDF_NO_DVP_RELE) &&
3460 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3461 vrele(ndp->ni_dvp);
3462 ndp->ni_dvp = NULL;
3463 }
3464 if (!(flags & NDF_NO_VP_UNLOCK) &&
3465 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3466 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3467 if (!(flags & NDF_NO_VP_RELE) &&
3468 ndp->ni_vp) {
3469 vrele(ndp->ni_vp);
3470 ndp->ni_vp = NULL;
3471 }
3472 if (!(flags & NDF_NO_STARTDIR_RELE) &&
3473 (ndp->ni_cnd.cn_flags & SAVESTART)) {
3474 vrele(ndp->ni_startdir);
3475 ndp->ni_startdir = NULL;
3476 }
3477}
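/*
 * Editorial sketch, not part of the original file: most namei() callers
 * only need the pathname buffer released once the lookup has succeeded,
 * which is spelled NDF_ONLY_PNBUF; a typical sequence looks like this.
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);	// keep the vnode, free the name
 *	// work with the locked, referenced nd.ni_vp here
 *	vput(nd.ni_vp);
 */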
3478
3479/*
3480 * Common filesystem object access control check routine. Accepts a
3481 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3482 * and optional call-by-reference privused argument allowing vaccess()
3483 * to indicate to the caller whether privilege was used to satisfy the
3484 * request (obsoleted). Returns 0 on success, or an errno on failure.
3485 */
3486int
3487vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3488 enum vtype type;
3489 mode_t file_mode;
3490 uid_t file_uid;
3491 gid_t file_gid;
3492 mode_t acc_mode;
3493 struct ucred *cred;
3494 int *privused;
3495{
3496 mode_t dac_granted;
3497#ifdef CAPABILITIES
3498 mode_t cap_granted;
3499#endif
3500
3501 /*
3502 * Look for a normal, non-privileged way to access the file/directory
3503 * as requested. If it exists, go with that.
3504 */
3505
3506 if (privused != NULL)
3507 *privused = 0;
3508
3509 dac_granted = 0;
3510
3511 /* Check the owner. */
3512 if (cred->cr_uid == file_uid) {
3513 dac_granted |= VADMIN;
3514 if (file_mode & S_IXUSR)
3515 dac_granted |= VEXEC;
3516 if (file_mode & S_IRUSR)
3517 dac_granted |= VREAD;
3518 if (file_mode & S_IWUSR)
3519 dac_granted |= (VWRITE | VAPPEND);
3520
3521 if ((acc_mode & dac_granted) == acc_mode)
3522 return (0);
3523
3524 goto privcheck;
3525 }
3526
3527 /* Otherwise, check the groups (first match) */
3528 if (groupmember(file_gid, cred)) {
3529 if (file_mode & S_IXGRP)
3530 dac_granted |= VEXEC;
3531 if (file_mode & S_IRGRP)
3532 dac_granted |= VREAD;
3533 if (file_mode & S_IWGRP)
3534 dac_granted |= (VWRITE | VAPPEND);
3535
3536 if ((acc_mode & dac_granted) == acc_mode)
3537 return (0);
3538
3539 goto privcheck;
3540 }
3541
3542 /* Otherwise, check everyone else. */
3543 if (file_mode & S_IXOTH)
3544 dac_granted |= VEXEC;
3545 if (file_mode & S_IROTH)
3546 dac_granted |= VREAD;
3547 if (file_mode & S_IWOTH)
3548 dac_granted |= (VWRITE | VAPPEND);
3549 if ((acc_mode & dac_granted) == acc_mode)
3550 return (0);
3551
3552privcheck:
3553 if (!suser_cred(cred, PRISON_ROOT)) {
3554 /* XXX audit: privilege used */
3555 if (privused != NULL)
3556 *privused = 1;
3557 return (0);
3558 }
3559
3560#ifdef CAPABILITIES
3561 /*
3562 * Build a capability mask to determine if the set of capabilities
3563 * satisfies the requirements when combined with the granted mask
3564 * from above.
3565 * For each capability, if the capability is required, bitwise
3566 * or the request type onto the cap_granted mask.
3567 */
3568 cap_granted = 0;
3569
3570 if (type == VDIR) {
3571 /*
3572 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3573 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3574 */
3575 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3576 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3577 cap_granted |= VEXEC;
3578 } else {
3579 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3580 !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3581 cap_granted |= VEXEC;
3582 }
3583
3584 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3585 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3586 cap_granted |= VREAD;
3587
3588 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3589 !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3590 cap_granted |= (VWRITE | VAPPEND);
3591
3592 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3593 !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3594 cap_granted |= VADMIN;
3595
3596 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3597 /* XXX audit: privilege used */
3598 if (privused != NULL)
3599 *privused = 1;
3600 return (0);
3601 }
3602#endif
3603
3604 return ((acc_mode & VADMIN) ? EPERM : EACCES);
3605}
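/*
 * Editorial worked example of the DAC logic above: for a file with mode
 * 0640 owned by uid 100 / gid 100, a credential with cr_uid == 100 asking
 * for VREAD|VWRITE computes dac_granted = VADMIN|VREAD|VWRITE|VAPPEND
 * from the S_IRUSR/S_IWUSR bits and returns 0 without consulting
 * privilege.  A mere group member asking for VWRITE collects only VREAD
 * from S_IRGRP, falls through to privcheck:, and succeeds only if
 * suser_cred() (or, with CAPABILITIES, CAP_DAC_WRITE) allows it;
 * otherwise EACCES is returned, since VADMIN was not requested.
 */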
3606
3607/*
3608 * Credential check based on process requesting service, and per-attribute
3609 * permissions.
3610 */
3611int
3612extattr_check_cred(struct vnode *vp, int attrnamespace,
3613 struct ucred *cred, struct thread *td, int access)
3614{
3615
3616 /*
3617 * Kernel-invoked always succeeds.
3618 */
3619 if (cred == NOCRED)
3620 return (0);
3621
3622 /*
3623 * Do not allow privileged processes in jail to directly
3624 * manipulate system attributes.
3625 *
3626 * XXX What capability should apply here?
3627 * Probably CAP_SYS_SETFFLAG.
3628 */
3629 switch (attrnamespace) {
3630 case EXTATTR_NAMESPACE_SYSTEM:
3631 /* Potentially should be: return (EPERM); */
3632 return (suser_cred(cred, 0));
3633 case EXTATTR_NAMESPACE_USER:
3634 return (VOP_ACCESS(vp, access, cred, td));
3635 default:
3636 return (EPERM);
3637 }
3638}
3639
3640#ifdef DEBUG_VFS_LOCKS
3641/*
3642 * This only exists to suppress warnings from unlocked specfs accesses. It is
3643 * no longer ok to have an unlocked VFS.
3644 */
3645#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3646
3647int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
3648int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
3649int vfs_badlock_print = 1; /* Print lock violations. */
3650
3651static void
3652vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3653{
3654
3655 if (vfs_badlock_print)
3656 printf("%s: %p %s\n", str, (void *)vp, msg);
3657 if (vfs_badlock_ddb)
3658 Debugger("lock violation");
3659}
3660
3661void
3662assert_vi_locked(struct vnode *vp, const char *str)
3663{
3664
3665 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3666 vfs_badlock("interlock is not locked but should be", str, vp);
3667}
3668
3669void
3670assert_vi_unlocked(struct vnode *vp, const char *str)
3671{
3672
3673 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3674 vfs_badlock("interlock is locked but should not be", str, vp);
3675}
3676
3677void
3678assert_vop_locked(struct vnode *vp, const char *str)
3679{
3680
3681 if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
3682 vfs_badlock("is not locked but should be", str, vp);
3683}
3684
3685void
3686assert_vop_unlocked(struct vnode *vp, const char *str)
3687{
3688
3689 if (vp && !IGNORE_LOCK(vp) &&
3690 VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
3691 vfs_badlock("is locked but should not be", str, vp);
3692}
3693
3694#if 0
3695void
3696assert_vop_elocked(struct vnode *vp, const char *str)
3697{
3698
3699 if (vp && !IGNORE_LOCK(vp) &&
3700 VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
3701 vfs_badlock("is not exclusive locked but should be", str, vp);
3702}
3703
3704void
3705assert_vop_elocked_other(struct vnode *vp, const char *str)
3706{
3707
3708 if (vp && !IGNORE_LOCK(vp) &&
3709 VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
3710 vfs_badlock("is not exclusive locked by another thread",
3711 str, vp);
3712}
3713
3714void
3715assert_vop_slocked(struct vnode *vp, const char *str)
3716{
3717
3718 if (vp && !IGNORE_LOCK(vp) &&
3719 VOP_ISLOCKED(vp, curthread) != LK_SHARED)
3720 vfs_badlock("is not locked shared but should be", str, vp);
3721}
3722#endif /* 0 */
3723
3724void
3725vop_rename_pre(void *ap)
3726{
3727 struct vop_rename_args *a = ap;
3728
3729 if (a->a_tvp)
3730 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3731 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3732 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3733 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3734
3735 /* Check the source (from). */
3736 if (a->a_tdvp != a->a_fdvp)
3737 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3738 if (a->a_tvp != a->a_fvp)
3739		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3740
3741 /* Check the target. */
3742 if (a->a_tvp)
3743 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3744 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3745}
3746
3747void
3748vop_strategy_pre(void *ap)
3749{
3750 struct vop_strategy_args *a;
3751 struct buf *bp;
3752
3753 a = ap;
3754 bp = a->a_bp;
3755
3756 /*
3757 * Cluster ops lock their component buffers but not the IO container.
3758 */
3759 if ((bp->b_flags & B_CLUSTER) != 0)
3760 return;
3761
3762 if (BUF_REFCNT(bp) < 1) {
3763 if (vfs_badlock_print)
3764 printf(
3765 "VOP_STRATEGY: bp is not locked but should be\n");
3766 if (vfs_badlock_ddb)
3767 Debugger("lock violation");
3768 }
3769}
3770
3771void
3772vop_lookup_pre(void *ap)
3773{
3774 struct vop_lookup_args *a;
3775 struct vnode *dvp;
3776
3777 a = ap;
3778 dvp = a->a_dvp;
3779 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3780 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3781}
3782
3783void
3784vop_lookup_post(void *ap, int rc)
3785{
3786 struct vop_lookup_args *a;
3787 struct componentname *cnp;
3788 struct vnode *dvp;
3789 struct vnode *vp;
3790 int flags;
3791
3792 a = ap;
3793 dvp = a->a_dvp;
3794 cnp = a->a_cnp;
3795 vp = *(a->a_vpp);
3796 flags = cnp->cn_flags;
3797
3798 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3799
3800 /*
3801 * If this is the last path component for this lookup and LOCKPARENT
3802	 * is set, or if there is an error, the directory has to be locked.
3803 */
3804 if ((flags & LOCKPARENT) && (flags & ISLASTCN))
3805 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
3806 else if (rc != 0)
3807 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
3808 else if (dvp != vp)
3809 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
3810 if (flags & PDIRUNLOCK)
3811 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
3812}
3813
3814void
3815vop_lock_pre(void *ap)
3816{
3817 struct vop_lock_args *a = ap;
3818
3819 if ((a->a_flags & LK_INTERLOCK) == 0)
3820 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3821 else
3822 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3823}
3824
3825void
3826vop_lock_post(void *ap, int rc)
3827{
3828 struct vop_lock_args *a = ap;
3829
3830 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3831 if (rc == 0)
3832 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3833}
3834
3835void
3836vop_unlock_pre(void *ap)
3837{
3838 struct vop_unlock_args *a = ap;
3839
3840 if (a->a_flags & LK_INTERLOCK)
3841 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3842 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3843}
3844
3845void
3846vop_unlock_post(void *ap, int rc)
3847{
3848 struct vop_unlock_args *a = ap;
3849
3850 if (a->a_flags & LK_INTERLOCK)
3851 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3852}
3853#endif /* DEBUG_VFS_LOCKS */