vfs_export.c (58349) vs. vfs_export.c (60041)
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_export.c 58349 2000-03-20 11:29:10Z phk $
39 * $FreeBSD: head/sys/kern/vfs_export.c 60041 2000-05-05 09:59:14Z phk $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/dirent.h>
52#include <sys/domain.h>
53#include <sys/eventhandler.h>
54#include <sys/fcntl.h>
55#include <sys/kernel.h>
56#include <sys/kthread.h>
57#include <sys/malloc.h>
58#include <sys/mount.h>
59#include <sys/namei.h>
60#include <sys/proc.h>
61#include <sys/reboot.h>
62#include <sys/socket.h>
63#include <sys/stat.h>
64#include <sys/sysctl.h>
65#include <sys/vmmeter.h>
66#include <sys/vnode.h>
67
68#include <machine/limits.h>
69
70#include <vm/vm.h>
71#include <vm/vm_object.h>
72#include <vm/vm_extern.h>
73#include <vm/pmap.h>
74#include <vm/vm_map.h>
75#include <vm/vm_page.h>
76#include <vm/vm_pager.h>
77#include <vm/vnode_pager.h>
78#include <vm/vm_zone.h>
79
80static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
81
82static void insmntque __P((struct vnode *vp, struct mount *mp));
83static void vclean __P((struct vnode *vp, int flags, struct proc *p));
84static void vfree __P((struct vnode *));
85static unsigned long numvnodes;
86SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
87
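/*
 * Translation tables between vnode types and the S_IFMT type bits of a
 * mode_t.  The IFTOVT() and VTTOIF() macros in <sys/vnode.h> are
 * conventionally defined in terms of these arrays, roughly as
 *
 *	IFTOVT(mode) == iftovt_tab[((mode) & S_IFMT) >> 12]
 *	VTTOIF(indx) == vttoif_tab[(int)(indx)]
 *
 * (illustrative sketch only; the authoritative definitions live in the
 * header, not in this file).
 */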
88enum vtype iftovt_tab[16] = {
89 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
90 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
91};
92int vttoif_tab[9] = {
93 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
94 S_IFSOCK, S_IFIFO, S_IFMT,
95};
96
97static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
98struct tobefreelist vnode_tobefree_list; /* vnode free list */
99
100static u_long wantfreevnodes = 25;
101SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
102static u_long freevnodes = 0;
103SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
104
105static int reassignbufcalls;
106SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
107static int reassignbufloops;
108SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
109static int reassignbufsortgood;
110SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
111static int reassignbufsortbad;
112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
113static int reassignbufmethod = 1;
114SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
115
116#ifdef ENABLE_VFS_IOOPT
117int vfs_ioopt = 0;
118SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
119#endif
120
121struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
122struct simplelock mountlist_slock;
123struct simplelock mntvnode_slock;
124int nfs_mount_type = -1;
125#ifndef NULL_SIMPLELOCKS
126static struct simplelock mntid_slock;
127static struct simplelock vnode_free_list_slock;
128static struct simplelock spechash_slock;
129#endif
130struct nfs_public nfs_pub; /* publicly exported FS */
131static vm_zone_t vnode_zone;
132
133/*
134 * The workitem queue.
135 */
136#define SYNCER_MAXDELAY 32
137static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
138time_t syncdelay = 30; /* max time to delay syncing data */
139time_t filedelay = 30; /* time to delay syncing files */
140SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
141time_t dirdelay = 29; /* time to delay syncing directories */
142SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
143time_t metadelay = 28; /* time to delay syncing metadata */
144SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
145static int rushjob; /* number of slots to run ASAP */
146static int stat_rush_requests; /* number of times I/O speeded up */
147SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
148
149static int syncer_delayno = 0;
150static long syncer_mask;
151LIST_HEAD(synclist, vnode);
152static struct synclist *syncer_workitem_pending;
153
154int desiredvnodes;
155SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
156 &desiredvnodes, 0, "Maximum number of vnodes");
157
158static void vfs_free_addrlist __P((struct netexport *nep));
159static int vfs_free_netcred __P((struct radix_node *rn, void *w));
160static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
161 struct export_args *argp));
162
163/*
164 * Initialize the vnode management data structures.
165 */
166void
167vntblinit()
168{
169
170 desiredvnodes = maxproc + cnt.v_page_count / 4;
171 simple_lock_init(&mntvnode_slock);
172 simple_lock_init(&mntid_slock);
173 simple_lock_init(&spechash_slock);
174 TAILQ_INIT(&vnode_free_list);
175 TAILQ_INIT(&vnode_tobefree_list);
176 simple_lock_init(&vnode_free_list_slock);
177 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
178 /*
179 * Initialize the filesystem syncer.
180 */
181 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
182 &syncer_mask);
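	/*
	 * hashinit() allocates a power-of-two sized table and stores
	 * size - 1 in syncer_mask, so syncer_maxdelay is re-derived
	 * below to match the number of buckets actually allocated.
	 */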
183 syncer_maxdelay = syncer_mask + 1;
184}
185
186/*
187 * Mark a mount point as busy. Used to synchronize access and to delay
188 * unmounting. Interlock is not released on failure.
189 */
190int
191vfs_busy(mp, flags, interlkp, p)
192 struct mount *mp;
193 int flags;
194 struct simplelock *interlkp;
195 struct proc *p;
196{
197 int lkflags;
198
199 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
200 if (flags & LK_NOWAIT)
201 return (ENOENT);
202 mp->mnt_kern_flag |= MNTK_MWAIT;
203 if (interlkp) {
204 simple_unlock(interlkp);
205 }
206 /*
207 * Since all busy locks are shared except the exclusive
208 * lock granted when unmounting, the only place that a
209 * wakeup needs to be done is at the release of the
210 * exclusive lock at the end of dounmount.
211 */
212 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
213 if (interlkp) {
214 simple_lock(interlkp);
215 }
216 return (ENOENT);
217 }
218 lkflags = LK_SHARED | LK_NOPAUSE;
219 if (interlkp)
220 lkflags |= LK_INTERLOCK;
221 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
222 panic("vfs_busy: unexpected lock failure");
223 return (0);
224}
225
226/*
227 * Free a busy filesystem.
228 */
229void
230vfs_unbusy(mp, p)
231 struct mount *mp;
232 struct proc *p;
233{
234
235 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
236}
237
238/*
239 * Lookup a filesystem type, and if found allocate and initialize
240 * a mount structure for it.
241 *
242 * Devname is usually updated by mount(8) after booting.
243 */
244int
245vfs_rootmountalloc(fstypename, devname, mpp)
246 char *fstypename;
247 char *devname;
248 struct mount **mpp;
249{
250 struct proc *p = curproc; /* XXX */
251 struct vfsconf *vfsp;
252 struct mount *mp;
253
254 if (fstypename == NULL)
255 return (ENODEV);
256 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
257 if (!strcmp(vfsp->vfc_name, fstypename))
258 break;
259 if (vfsp == NULL)
260 return (ENODEV);
261 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
262 bzero((char *)mp, (u_long)sizeof(struct mount));
263 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
264 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
265 LIST_INIT(&mp->mnt_vnodelist);
266 mp->mnt_vfc = vfsp;
267 mp->mnt_op = vfsp->vfc_vfsops;
268 mp->mnt_flag = MNT_RDONLY;
269 mp->mnt_vnodecovered = NULLVP;
270 vfsp->vfc_refcount++;
271 mp->mnt_iosize_max = DFLTPHYS;
272 mp->mnt_stat.f_type = vfsp->vfc_typenum;
273 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
274 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
275 mp->mnt_stat.f_mntonname[0] = '/';
276 mp->mnt_stat.f_mntonname[1] = 0;
277 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
278 *mpp = mp;
279 return (0);
280}
281
282/*
283 * Find an appropriate filesystem to use for the root. If a filesystem
284 * has not been preselected, walk through the list of known filesystems
285 * trying those that have mountroot routines, and try them until one
286 * works or we have tried them all.
287 */
288#ifdef notdef /* XXX JH */
289int
290lite2_vfs_mountroot()
291{
292 struct vfsconf *vfsp;
293 extern int (*lite2_mountroot) __P((void));
294 int error;
295
296 if (lite2_mountroot != NULL)
297 return ((*lite2_mountroot)());
298 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
299 if (vfsp->vfc_mountroot == NULL)
300 continue;
301 if ((error = (*vfsp->vfc_mountroot)()) == 0)
302 return (0);
303 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
304 }
305 return (ENODEV);
306}
307#endif
308
309/*
310 * Lookup a mount point by filesystem identifier.
311 */
312struct mount *
313vfs_getvfs(fsid)
314 fsid_t *fsid;
315{
316 register struct mount *mp;
317
318 simple_lock(&mountlist_slock);
319 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
320 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
321 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
322 simple_unlock(&mountlist_slock);
323 return (mp);
324 }
325 }
326 simple_unlock(&mountlist_slock);
327 return ((struct mount *) 0);
328}
329
330/*
331 * Get a new unique fsid. Try to make its val[0] unique, since this value
332 * will be used to create fake device numbers for stat(). Also try (but
 333 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
334 * support 16-bit device numbers. We end up with unique val[0]'s for the
335 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
336 *
337 * Keep in mind that several mounts may be running in parallel. Starting
338 * the search one past where the previous search terminated is both a
339 * micro-optimization and a defense against returning the same fsid to
340 * different mounts.
341 */
342void
343vfs_getnewfsid(mp)
344 struct mount *mp;
345{
346 static u_int16_t mntid_base;
347 fsid_t tfsid;
348 int mtype;
349
350 simple_lock(&mntid_slock);
351 mtype = mp->mnt_vfc->vfc_typenum;
352 tfsid.val[1] = mtype;
353 mtype = (mtype & 0xFF) << 16;
354 for (;;) {
355 tfsid.val[0] = makeudev(255, mtype | mntid_base++);
356 if (vfs_getvfs(&tfsid) == NULL)
357 break;
358 }
359 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
360 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
361 simple_unlock(&mntid_slock);
362}
363
364/*
365 * Knob to control the precision of file timestamps:
366 *
367 * 0 = seconds only; nanoseconds zeroed.
368 * 1 = seconds and nanoseconds, accurate within 1/HZ.
369 * 2 = seconds and nanoseconds, truncated to microseconds.
370 * >=3 = seconds and nanoseconds, maximum precision.
371 */
372enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
373
374static int timestamp_precision = TSP_SEC;
375SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
376 &timestamp_precision, 0, "");
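/*
 * The knob above is exported as the vfs.timestamp_precision sysctl.
 * For example (typical usage, not part of this file),
 *
 *	sysctl vfs.timestamp_precision=3
 *
 * selects full nanosecond resolution via nanotime() in vfs_timestamp()
 * below.
 */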
377
378/*
379 * Get a current timestamp.
380 */
381void
382vfs_timestamp(tsp)
383 struct timespec *tsp;
384{
385 struct timeval tv;
386
387 switch (timestamp_precision) {
388 case TSP_SEC:
389 tsp->tv_sec = time_second;
390 tsp->tv_nsec = 0;
391 break;
392 case TSP_HZ:
393 getnanotime(tsp);
394 break;
395 case TSP_USEC:
396 microtime(&tv);
397 TIMEVAL_TO_TIMESPEC(&tv, tsp);
398 break;
399 case TSP_NSEC:
400 default:
401 nanotime(tsp);
402 break;
403 }
404}
405
406/*
407 * Set vnode attributes to VNOVAL
408 */
409void
410vattr_null(vap)
411 register struct vattr *vap;
412{
413
414 vap->va_type = VNON;
415 vap->va_size = VNOVAL;
416 vap->va_bytes = VNOVAL;
417 vap->va_mode = VNOVAL;
418 vap->va_nlink = VNOVAL;
419 vap->va_uid = VNOVAL;
420 vap->va_gid = VNOVAL;
421 vap->va_fsid = VNOVAL;
422 vap->va_fileid = VNOVAL;
423 vap->va_blocksize = VNOVAL;
424 vap->va_rdev = VNOVAL;
425 vap->va_atime.tv_sec = VNOVAL;
426 vap->va_atime.tv_nsec = VNOVAL;
427 vap->va_mtime.tv_sec = VNOVAL;
428 vap->va_mtime.tv_nsec = VNOVAL;
429 vap->va_ctime.tv_sec = VNOVAL;
430 vap->va_ctime.tv_nsec = VNOVAL;
431 vap->va_flags = VNOVAL;
432 vap->va_gen = VNOVAL;
433 vap->va_vaflags = 0;
434}
435
436/*
437 * Routines having to do with the management of the vnode table.
438 */
439extern vop_t **dead_vnodeop_p;
440
441/*
442 * Return the next vnode from the free list.
443 */
444int
445getnewvnode(tag, mp, vops, vpp)
446 enum vtagtype tag;
447 struct mount *mp;
448 vop_t **vops;
449 struct vnode **vpp;
450{
451 int s;
452 struct proc *p = curproc; /* XXX */
453 struct vnode *vp, *tvp, *nvp;
454 vm_object_t object;
455 TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
456
457 /*
458 * We take the least recently used vnode from the freelist
459 * if we can get it and it has no cached pages, and no
460 * namecache entries are relative to it.
461 * Otherwise we allocate a new vnode
462 */
463
464 s = splbio();
465 simple_lock(&vnode_free_list_slock);
466 TAILQ_INIT(&vnode_tmp_list);
467
468 for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
469 nvp = TAILQ_NEXT(vp, v_freelist);
470 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
471 if (vp->v_flag & VAGE) {
472 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
473 } else {
474 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
475 }
476 vp->v_flag &= ~(VTBFREE|VAGE);
477 vp->v_flag |= VFREE;
478 if (vp->v_usecount)
479 panic("tobe free vnode isn't");
480 freevnodes++;
481 }
482
483 if (wantfreevnodes && freevnodes < wantfreevnodes) {
484 vp = NULL;
485 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
486 /*
487 * XXX: this is only here to be backwards compatible
488 */
489 vp = NULL;
490 } else {
491 for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
492 nvp = TAILQ_NEXT(vp, v_freelist);
493 if (!simple_lock_try(&vp->v_interlock))
494 continue;
495 if (vp->v_usecount)
496 panic("free vnode isn't");
497
498 object = vp->v_object;
499 if (object && (object->resident_page_count || object->ref_count)) {
 500 printf("object inconsistent state: RPC: %d, RC: %d\n",
501 object->resident_page_count, object->ref_count);
502 /* Don't recycle if it's caching some pages */
503 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
504 TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
505 continue;
506 } else if (LIST_FIRST(&vp->v_cache_src)) {
507 /* Don't recycle if active in the namecache */
508 simple_unlock(&vp->v_interlock);
509 continue;
510 } else {
511 break;
512 }
513 }
514 }
515
516 for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
517 nvp = TAILQ_NEXT(tvp, v_freelist);
518 TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
519 TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
520 simple_unlock(&tvp->v_interlock);
521 }
522
523 if (vp) {
524 vp->v_flag |= VDOOMED;
525 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
526 freevnodes--;
527 simple_unlock(&vnode_free_list_slock);
528 cache_purge(vp);
529 vp->v_lease = NULL;
530 if (vp->v_type != VBAD) {
531 vgonel(vp, p);
532 } else {
533 simple_unlock(&vp->v_interlock);
534 }
535
536#ifdef INVARIANTS
537 {
538 int s;
539
540 if (vp->v_data)
541 panic("cleaned vnode isn't");
542 s = splbio();
543 if (vp->v_numoutput)
544 panic("Clean vnode has pending I/O's");
545 splx(s);
546 }
547#endif
548 vp->v_flag = 0;
549 vp->v_lastw = 0;
550 vp->v_lasta = 0;
551 vp->v_cstart = 0;
552 vp->v_clen = 0;
553 vp->v_socket = 0;
554 vp->v_writecount = 0; /* XXX */
555 } else {
556 simple_unlock(&vnode_free_list_slock);
557 vp = (struct vnode *) zalloc(vnode_zone);
558 bzero((char *) vp, sizeof *vp);
559 simple_lock_init(&vp->v_interlock);
560 vp->v_dd = vp;
561 cache_purge(vp);
562 LIST_INIT(&vp->v_cache_src);
563 TAILQ_INIT(&vp->v_cache_dst);
564 numvnodes++;
565 }
566
567 TAILQ_INIT(&vp->v_cleanblkhd);
568 TAILQ_INIT(&vp->v_dirtyblkhd);
569 vp->v_type = VNON;
570 vp->v_tag = tag;
571 vp->v_op = vops;
572 insmntque(vp, mp);
573 *vpp = vp;
574 vp->v_usecount = 1;
575 vp->v_data = 0;
576 splx(s);
577
578 vfs_object_create(vp, p, p->p_ucred);
579 return (0);
580}
581
582/*
583 * Move a vnode from one mount queue to another.
584 */
585static void
586insmntque(vp, mp)
587 register struct vnode *vp;
588 register struct mount *mp;
589{
590
591 simple_lock(&mntvnode_slock);
592 /*
593 * Delete from old mount point vnode list, if on one.
594 */
595 if (vp->v_mount != NULL)
596 LIST_REMOVE(vp, v_mntvnodes);
597 /*
598 * Insert into list of vnodes for the new mount point, if available.
599 */
600 if ((vp->v_mount = mp) == NULL) {
601 simple_unlock(&mntvnode_slock);
602 return;
603 }
604 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
605 simple_unlock(&mntvnode_slock);
606}
607
608/*
609 * Update outstanding I/O count and do wakeup if requested.
610 */
611void
612vwakeup(bp)
613 register struct buf *bp;
614{
615 register struct vnode *vp;
616
617 bp->b_flags &= ~B_WRITEINPROG;
618 if ((vp = bp->b_vp)) {
619 vp->v_numoutput--;
620 if (vp->v_numoutput < 0)
621 panic("vwakeup: neg numoutput");
622 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
623 vp->v_flag &= ~VBWAIT;
624 wakeup((caddr_t) &vp->v_numoutput);
625 }
626 }
627}
628
629/*
630 * Flush out and invalidate all buffers associated with a vnode.
631 * Called with the underlying object locked.
632 */
633int
634vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
635 register struct vnode *vp;
636 int flags;
637 struct ucred *cred;
638 struct proc *p;
639 int slpflag, slptimeo;
640{
641 register struct buf *bp;
642 struct buf *nbp, *blist;
643 int s, error;
644 vm_object_t object;
645
646 if (flags & V_SAVE) {
647 s = splbio();
648 while (vp->v_numoutput) {
649 vp->v_flag |= VBWAIT;
650 error = tsleep((caddr_t)&vp->v_numoutput,
651 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
652 if (error) {
653 splx(s);
654 return (error);
655 }
656 }
657 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
658 splx(s);
659 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
660 return (error);
661 s = splbio();
662 if (vp->v_numoutput > 0 ||
663 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
664 panic("vinvalbuf: dirty bufs");
665 }
666 splx(s);
667 }
668 s = splbio();
669 for (;;) {
670 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
671 if (!blist)
672 blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
673 if (!blist)
674 break;
675
676 for (bp = blist; bp; bp = nbp) {
677 nbp = TAILQ_NEXT(bp, b_vnbufs);
678 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
679 error = BUF_TIMELOCK(bp,
680 LK_EXCLUSIVE | LK_SLEEPFAIL,
681 "vinvalbuf", slpflag, slptimeo);
682 if (error == ENOLCK)
683 break;
684 splx(s);
685 return (error);
686 }
687 /*
688 * XXX Since there are no node locks for NFS, I
689 * believe there is a slight chance that a delayed
690 * write will occur while sleeping just above, so
691 * check for it. Note that vfs_bio_awrite expects
692 * buffers to reside on a queue, while VOP_BWRITE and
693 * brelse do not.
694 */
695 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
696 (flags & V_SAVE)) {
697
698 if (bp->b_vp == vp) {
699 if (bp->b_flags & B_CLUSTEROK) {
700 BUF_UNLOCK(bp);
701 vfs_bio_awrite(bp);
702 } else {
703 bremfree(bp);
704 bp->b_flags |= B_ASYNC;
705 BUF_WRITE(bp);
706 }
707 } else {
708 bremfree(bp);
709 (void) BUF_WRITE(bp);
710 }
711 break;
712 }
713 bremfree(bp);
714 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
715 bp->b_flags &= ~B_ASYNC;
716 brelse(bp);
717 }
718 }
719
720 while (vp->v_numoutput > 0) {
721 vp->v_flag |= VBWAIT;
722 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
723 }
724
725 splx(s);
726
727 /*
728 * Destroy the copy in the VM cache, too.
729 */
730 simple_lock(&vp->v_interlock);
731 object = vp->v_object;
732 if (object != NULL) {
733 vm_object_page_remove(object, 0, 0,
734 (flags & V_SAVE) ? TRUE : FALSE);
735 }
736 simple_unlock(&vp->v_interlock);
737
738 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
739 panic("vinvalbuf: flush failed");
740 return (0);
741}
742
743/*
744 * Truncate a file's buffer and pages to a specified length. This
745 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
746 * sync activity.
747 */
748int
749vtruncbuf(vp, cred, p, length, blksize)
750 register struct vnode *vp;
751 struct ucred *cred;
752 struct proc *p;
753 off_t length;
754 int blksize;
755{
756 register struct buf *bp;
757 struct buf *nbp;
758 int s, anyfreed;
759 int trunclbn;
760
761 /*
762 * Round up to the *next* lbn.
763 */
764 trunclbn = (length + blksize - 1) / blksize;
765
766 s = splbio();
767restart:
768 anyfreed = 1;
769 for (;anyfreed;) {
770 anyfreed = 0;
771 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
772 nbp = TAILQ_NEXT(bp, b_vnbufs);
773 if (bp->b_lblkno >= trunclbn) {
774 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
775 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
776 goto restart;
777 } else {
778 bremfree(bp);
779 bp->b_flags |= (B_INVAL | B_RELBUF);
780 bp->b_flags &= ~B_ASYNC;
781 brelse(bp);
782 anyfreed = 1;
783 }
784 if (nbp &&
785 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
786 (nbp->b_vp != vp) ||
787 (nbp->b_flags & B_DELWRI))) {
788 goto restart;
789 }
790 }
791 }
792
793 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
794 nbp = TAILQ_NEXT(bp, b_vnbufs);
795 if (bp->b_lblkno >= trunclbn) {
796 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
797 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
798 goto restart;
799 } else {
800 bremfree(bp);
801 bp->b_flags |= (B_INVAL | B_RELBUF);
802 bp->b_flags &= ~B_ASYNC;
803 brelse(bp);
804 anyfreed = 1;
805 }
806 if (nbp &&
807 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
808 (nbp->b_vp != vp) ||
809 (nbp->b_flags & B_DELWRI) == 0)) {
810 goto restart;
811 }
812 }
813 }
814 }
815
816 if (length > 0) {
817restartsync:
818 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
819 nbp = TAILQ_NEXT(bp, b_vnbufs);
820 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
821 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
822 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
823 goto restart;
824 } else {
825 bremfree(bp);
826 if (bp->b_vp == vp) {
827 bp->b_flags |= B_ASYNC;
828 } else {
829 bp->b_flags &= ~B_ASYNC;
830 }
831 BUF_WRITE(bp);
832 }
833 goto restartsync;
834 }
835
836 }
837 }
838
839 while (vp->v_numoutput > 0) {
840 vp->v_flag |= VBWAIT;
841 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
842 }
843
844 splx(s);
845
846 vnode_pager_setsize(vp, length);
847
848 return (0);
849}
850
851/*
852 * Associate a buffer with a vnode.
853 */
854void
855bgetvp(vp, bp)
856 register struct vnode *vp;
857 register struct buf *bp;
858{
859 int s;
860
861 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
862
863 vhold(vp);
864 bp->b_vp = vp;
865 bp->b_dev = vn_todev(vp);
866 /*
867 * Insert onto list for new vnode.
868 */
869 s = splbio();
870 bp->b_xflags |= BX_VNCLEAN;
871 bp->b_xflags &= ~BX_VNDIRTY;
872 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
873 splx(s);
874}
875
876/*
877 * Disassociate a buffer from a vnode.
878 */
879void
880brelvp(bp)
881 register struct buf *bp;
882{
883 struct vnode *vp;
884 struct buflists *listheadp;
885 int s;
886
887 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
888
889 /*
890 * Delete from old vnode list, if on one.
891 */
892 vp = bp->b_vp;
893 s = splbio();
894 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
895 if (bp->b_xflags & BX_VNDIRTY)
896 listheadp = &vp->v_dirtyblkhd;
897 else
898 listheadp = &vp->v_cleanblkhd;
899 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
900 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
901 }
902 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
903 vp->v_flag &= ~VONWORKLST;
904 LIST_REMOVE(vp, v_synclist);
905 }
906 splx(s);
907 bp->b_vp = (struct vnode *) 0;
908 vdrop(vp);
909}
910
911/*
912 * The workitem queue.
913 *
914 * It is useful to delay writes of file data and filesystem metadata
915 * for tens of seconds so that quickly created and deleted files need
916 * not waste disk bandwidth being created and removed. To realize this,
917 * we append vnodes to a "workitem" queue. When running with a soft
918 * updates implementation, most pending metadata dependencies should
 919 * not wait for more than a few seconds. Thus, metadata for mounted
 920 * block devices is delayed only about half the time that file data
 921 * is delayed. Similarly, directory updates are more critical, so they
 922 * are delayed only about a third of the file data delay. Thus, there are
923 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
924 * one each second (driven off the filesystem syncer process). The
925 * syncer_delayno variable indicates the next queue that is to be processed.
926 * Items that need to be processed soon are placed in this queue:
927 *
928 * syncer_workitem_pending[syncer_delayno]
929 *
930 * A delay of fifteen seconds is done by placing the request fifteen
931 * entries later in the queue:
932 *
933 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
934 *
935 */
936
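/*
 * Worked example (illustrative only): with a 32-slot ring and
 * syncer_delayno currently 5, a request for a 15 second delay hashes
 * to slot (5 + 15) & 31 == 20.  sched_sync() below advances
 * syncer_delayno by one slot per second, so it reaches slot 20, and
 * therefore that vnode, roughly 15 seconds later.
 */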
937/*
938 * Add an item to the syncer work queue.
939 */
940static void
941vn_syncer_add_to_worklist(struct vnode *vp, int delay)
942{
943 int s, slot;
944
945 s = splbio();
946
947 if (vp->v_flag & VONWORKLST) {
948 LIST_REMOVE(vp, v_synclist);
949 }
950
951 if (delay > syncer_maxdelay - 2)
952 delay = syncer_maxdelay - 2;
953 slot = (syncer_delayno + delay) & syncer_mask;
954
955 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
956 vp->v_flag |= VONWORKLST;
957 splx(s);
958}
959
960struct proc *updateproc;
961static void sched_sync __P((void));
962static struct kproc_desc up_kp = {
963 "syncer",
964 sched_sync,
965 &updateproc
966};
967SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
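/*
 * The SYSINIT above has kproc_start() create the "syncer" kernel
 * process at boot; it runs sched_sync() below, and its proc pointer is
 * published in updateproc for use by speedup_syncer().
 */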
968
969/*
970 * System filesystem synchronizer daemon.
971 */
972void
973sched_sync(void)
974{
975 struct synclist *slp;
976 struct vnode *vp;
977 long starttime;
978 int s;
979 struct proc *p = updateproc;
980
981 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
982 SHUTDOWN_PRI_LAST);
983
984 for (;;) {
985 kproc_suspend_loop(p);
986
987 starttime = time_second;
988
989 /*
990 * Push files whose dirty time has expired. Be careful
991 * of interrupt race on slp queue.
992 */
993 s = splbio();
994 slp = &syncer_workitem_pending[syncer_delayno];
995 syncer_delayno += 1;
996 if (syncer_delayno == syncer_maxdelay)
997 syncer_delayno = 0;
998 splx(s);
999
1000 while ((vp = LIST_FIRST(slp)) != NULL) {
1001 if (VOP_ISLOCKED(vp, NULL) == 0) {
1002 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1003 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
1004 VOP_UNLOCK(vp, 0, p);
1005 }
1006 s = splbio();
1007 if (LIST_FIRST(slp) == vp) {
1008 /*
1009 * Note: v_tag VT_VFS vps can remain on the
1010 * worklist too with no dirty blocks, but
 1011 * since sync_fsync() moves them to a different
 1012 * slot we are safe.
1013 */
1014 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1015 !vn_isdisk(vp, NULL))
1016 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1017 /*
1018 * Put us back on the worklist. The worklist
1019 * routine will remove us from our current
1020 * position and then add us back in at a later
1021 * position.
1022 */
1023 vn_syncer_add_to_worklist(vp, syncdelay);
1024 }
1025 splx(s);
1026 }
1027
1028 /*
1029 * Do soft update processing.
1030 */
1031 if (bioops.io_sync)
1032 (*bioops.io_sync)(NULL);
1033
1034 /*
1035 * The variable rushjob allows the kernel to speed up the
1036 * processing of the filesystem syncer process. A rushjob
1037 * value of N tells the filesystem syncer to process the next
1038 * N seconds worth of work on its queue ASAP. Currently rushjob
1039 * is used by the soft update code to speed up the filesystem
1040 * syncer process when the incore state is getting so far
1041 * ahead of the disk that the kernel memory pool is being
1042 * threatened with exhaustion.
1043 */
1044 if (rushjob > 0) {
1045 rushjob -= 1;
1046 continue;
1047 }
1048 /*
1049 * If it has taken us less than a second to process the
1050 * current work, then wait. Otherwise start right over
1051 * again. We can still lose time if any single round
1052 * takes more than two seconds, but it does not really
1053 * matter as we are just trying to generally pace the
1054 * filesystem activity.
1055 */
1056 if (time_second == starttime)
1057 tsleep(&lbolt, PPAUSE, "syncer", 0);
1058 }
1059}
1060
1061/*
1062 * Request the syncer daemon to speed up its work.
1063 * We never push it to speed up more than half of its
1064 * normal turn time, otherwise it could take over the cpu.
1065 */
1066int
1067speedup_syncer()
1068{
1069 int s;
1070
1071 s = splhigh();
1072 if (updateproc->p_wchan == &lbolt)
1073 setrunnable(updateproc);
1074 splx(s);
1075 if (rushjob < syncdelay / 2) {
1076 rushjob += 1;
1077 stat_rush_requests += 1;
1078 return (1);
1079 }
1080 return(0);
1081}
1082
1083/*
1084 * Associate a p-buffer with a vnode.
1085 *
1086 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1087 * with the buffer. i.e. the bp has not been linked into the vnode or
1088 * ref-counted.
1089 */
1090void
1091pbgetvp(vp, bp)
1092 register struct vnode *vp;
1093 register struct buf *bp;
1094{
1095
1096 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1097
1098 bp->b_vp = vp;
1099 bp->b_flags |= B_PAGING;
1100 bp->b_dev = vn_todev(vp);
1101}
1102
1103/*
1104 * Disassociate a p-buffer from a vnode.
1105 */
1106void
1107pbrelvp(bp)
1108 register struct buf *bp;
1109{
1110
1111 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1112
1113 /* XXX REMOVE ME */
1114 if (bp->b_vnbufs.tqe_next != NULL) {
1115 panic(
1116 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1117 bp,
1118 (int)bp->b_flags
1119 );
1120 }
1121 bp->b_vp = (struct vnode *) 0;
1122 bp->b_flags &= ~B_PAGING;
1123}
1124
1125void
1126pbreassignbuf(bp, newvp)
1127 struct buf *bp;
1128 struct vnode *newvp;
1129{
1130 if ((bp->b_flags & B_PAGING) == 0) {
1131 panic(
1132 "pbreassignbuf() on non phys bp %p",
1133 bp
1134 );
1135 }
1136 bp->b_vp = newvp;
1137}
1138
1139/*
1140 * Reassign a buffer from one vnode to another.
1141 * Used to assign file specific control information
1142 * (indirect blocks) to the vnode to which they belong.
1143 */
1144void
1145reassignbuf(bp, newvp)
1146 register struct buf *bp;
1147 register struct vnode *newvp;
1148{
1149 struct buflists *listheadp;
1150 int delay;
1151 int s;
1152
1153 if (newvp == NULL) {
1154 printf("reassignbuf: NULL");
1155 return;
1156 }
1157 ++reassignbufcalls;
1158
1159 /*
1160 * B_PAGING flagged buffers cannot be reassigned because their vp
1161 * is not fully linked in.
1162 */
1163 if (bp->b_flags & B_PAGING)
1164 panic("cannot reassign paging buffer");
1165
1166 s = splbio();
1167 /*
1168 * Delete from old vnode list, if on one.
1169 */
1170 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1171 if (bp->b_xflags & BX_VNDIRTY)
1172 listheadp = &bp->b_vp->v_dirtyblkhd;
1173 else
1174 listheadp = &bp->b_vp->v_cleanblkhd;
1175 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1176 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1177 if (bp->b_vp != newvp) {
1178 vdrop(bp->b_vp);
1179 bp->b_vp = NULL; /* for clarification */
1180 }
1181 }
1182 /*
1183 * If dirty, put on list of dirty buffers; otherwise insert onto list
1184 * of clean buffers.
1185 */
1186 if (bp->b_flags & B_DELWRI) {
1187 struct buf *tbp;
1188
1189 listheadp = &newvp->v_dirtyblkhd;
1190 if ((newvp->v_flag & VONWORKLST) == 0) {
1191 switch (newvp->v_type) {
1192 case VDIR:
1193 delay = dirdelay;
1194 break;
1195 case VCHR:
1196 case VBLK:
1197 if (newvp->v_specmountpoint != NULL) {
1198 delay = metadelay;
1199 break;
1200 }
1201 /* fall through */
1202 default:
1203 delay = filedelay;
1204 }
1205 vn_syncer_add_to_worklist(newvp, delay);
1206 }
1207 bp->b_xflags |= BX_VNDIRTY;
1208 tbp = TAILQ_FIRST(listheadp);
1209 if (tbp == NULL ||
1210 bp->b_lblkno == 0 ||
1211 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1212 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1213 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1214 ++reassignbufsortgood;
1215 } else if (bp->b_lblkno < 0) {
1216 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1217 ++reassignbufsortgood;
1218 } else if (reassignbufmethod == 1) {
1219 /*
1220 * New sorting algorithm, only handle sequential case,
1221 * otherwise append to end (but before metadata)
1222 */
1223 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1224 (tbp->b_xflags & BX_VNDIRTY)) {
1225 /*
1226 * Found the best place to insert the buffer
1227 */
1228 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1229 ++reassignbufsortgood;
1230 } else {
1231 /*
1232 * Missed, append to end, but before meta-data.
1233 * We know that the head buffer in the list is
1234 * not meta-data due to prior conditionals.
1235 *
1236 * Indirect effects: NFS second stage write
1237 * tends to wind up here, giving maximum
1238 * distance between the unstable write and the
1239 * commit rpc.
1240 */
1241 tbp = TAILQ_LAST(listheadp, buflists);
1242 while (tbp && tbp->b_lblkno < 0)
1243 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1244 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1245 ++reassignbufsortbad;
1246 }
1247 } else {
1248 /*
1249 * Old sorting algorithm, scan queue and insert
1250 */
1251 struct buf *ttbp;
1252 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1253 (ttbp->b_lblkno < bp->b_lblkno)) {
1254 ++reassignbufloops;
1255 tbp = ttbp;
1256 }
1257 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1258 }
1259 } else {
1260 bp->b_xflags |= BX_VNCLEAN;
1261 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1262 if ((newvp->v_flag & VONWORKLST) &&
1263 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1264 newvp->v_flag &= ~VONWORKLST;
1265 LIST_REMOVE(newvp, v_synclist);
1266 }
1267 }
1268 if (bp->b_vp != newvp) {
1269 bp->b_vp = newvp;
1270 vhold(bp->b_vp);
1271 }
1272 splx(s);
1273}
1274
1275/*
1276 * Create a vnode for a block device.
1277 * Used for mounting the root file system.
1278 */
1279int
1280bdevvp(dev, vpp)
1281 dev_t dev;
1282 struct vnode **vpp;
1283{
1284 register struct vnode *vp;
1285 struct vnode *nvp;
1286 int error;
1287
1288 if (dev == NODEV) {
1289 *vpp = NULLVP;
1290 return (ENXIO);
1291 }
1292 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1293 if (error) {
1294 *vpp = NULLVP;
1295 return (error);
1296 }
1297 vp = nvp;
1298 vp->v_type = VBLK;
1299 addalias(vp, dev);
1300 *vpp = vp;
1301 return (0);
1302}
1303
1304/*
1305 * Add vnode to the alias list hung off the dev_t.
1306 *
1307 * The reason for this gunk is that multiple vnodes can reference
1308 * the same physical device, so checking vp->v_usecount to see
 1309 * how many users there are is inadequate; the v_usecount values for
 1310 * the vnodes need to be accumulated. vcount() does that.
1311 */
1312void
1313addaliasu(nvp, nvp_rdev)
1314 struct vnode *nvp;
1315 udev_t nvp_rdev;
1316{
1317
1318 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1319 panic("addaliasu on non-special vnode");
1320 addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
1321}
1322
1323void
1324addalias(nvp, dev)
1325 struct vnode *nvp;
1326 dev_t dev;
1327{
1328
1329 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1330 panic("addalias on non-special vnode");
1331
1332 nvp->v_rdev = dev;
1333 simple_lock(&spechash_slock);
1334 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1335 simple_unlock(&spechash_slock);
1336}
1337
1338/*
1339 * Grab a particular vnode from the free list, increment its
1340 * reference count and lock it. The vnode lock bit is set if the
1341 * vnode is being eliminated in vgone. The process is awakened
1342 * when the transition is completed, and an error returned to
1343 * indicate that the vnode is no longer usable (possibly having
1344 * been changed to a new file system type).
1345 */
1346int
1347vget(vp, flags, p)
1348 register struct vnode *vp;
1349 int flags;
1350 struct proc *p;
1351{
1352 int error;
1353
1354 /*
1355 * If the vnode is in the process of being cleaned out for
1356 * another use, we wait for the cleaning to finish and then
1357 * return failure. Cleaning is determined by checking that
1358 * the VXLOCK flag is set.
1359 */
1360 if ((flags & LK_INTERLOCK) == 0) {
1361 simple_lock(&vp->v_interlock);
1362 }
1363 if (vp->v_flag & VXLOCK) {
1364 vp->v_flag |= VXWANT;
1365 simple_unlock(&vp->v_interlock);
1366 tsleep((caddr_t)vp, PINOD, "vget", 0);
1367 return (ENOENT);
1368 }
1369
1370 vp->v_usecount++;
1371
1372 if (VSHOULDBUSY(vp))
1373 vbusy(vp);
1374 if (flags & LK_TYPE_MASK) {
1375 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1376 /*
1377 * must expand vrele here because we do not want
1378 * to call VOP_INACTIVE if the reference count
1379 * drops back to zero since it was never really
1380 * active. We must remove it from the free list
1381 * before sleeping so that multiple processes do
1382 * not try to recycle it.
1383 */
1384 simple_lock(&vp->v_interlock);
1385 vp->v_usecount--;
1386 if (VSHOULDFREE(vp))
1387 vfree(vp);
1388 simple_unlock(&vp->v_interlock);
1389 }
1390 return (error);
1391 }
1392 simple_unlock(&vp->v_interlock);
1393 return (0);
1394}
1395
1396void
1397vref(struct vnode *vp)
1398{
1399 simple_lock(&vp->v_interlock);
1400 vp->v_usecount++;
1401 simple_unlock(&vp->v_interlock);
1402}
1403
1404/*
1405 * Vnode put/release.
1406 * If count drops to zero, call inactive routine and return to freelist.
1407 */
1408void
1409vrele(vp)
1410 struct vnode *vp;
1411{
1412 struct proc *p = curproc; /* XXX */
1413
1414 KASSERT(vp != NULL, ("vrele: null vp"));
1415
1416 simple_lock(&vp->v_interlock);
1417
1418 if (vp->v_usecount > 1) {
1419
1420 vp->v_usecount--;
1421 simple_unlock(&vp->v_interlock);
1422
1423 return;
1424 }
1425
1426 if (vp->v_usecount == 1) {
1427
1428 vp->v_usecount--;
1429 if (VSHOULDFREE(vp))
1430 vfree(vp);
1431 /*
1432 * If we are doing a vput, the node is already locked, and we must
1433 * call VOP_INACTIVE with the node locked. So, in the case of
1434 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1435 */
1436 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1437 VOP_INACTIVE(vp, p);
1438 }
1439
1440 } else {
1441#ifdef DIAGNOSTIC
1442 vprint("vrele: negative ref count", vp);
1443 simple_unlock(&vp->v_interlock);
1444#endif
1445 panic("vrele: negative ref cnt");
1446 }
1447}
1448
1449void
1450vput(vp)
1451 struct vnode *vp;
1452{
1453 struct proc *p = curproc; /* XXX */
1454
1455 KASSERT(vp != NULL, ("vput: null vp"));
1456
1457 simple_lock(&vp->v_interlock);
1458
1459 if (vp->v_usecount > 1) {
1460
1461 vp->v_usecount--;
1462 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1463 return;
1464
1465 }
1466
1467 if (vp->v_usecount == 1) {
1468
1469 vp->v_usecount--;
1470 if (VSHOULDFREE(vp))
1471 vfree(vp);
1472 /*
1473 * If we are doing a vput, the node is already locked, and we must
1474 * call VOP_INACTIVE with the node locked. So, in the case of
1475 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1476 */
1477 simple_unlock(&vp->v_interlock);
1478 VOP_INACTIVE(vp, p);
1479
1480 } else {
1481#ifdef DIAGNOSTIC
1482 vprint("vput: negative ref count", vp);
1483#endif
1484 panic("vput: negative ref cnt");
1485 }
1486}
1487
1488/*
1489 * Somebody doesn't want the vnode recycled.
1490 */
1491void
1492vhold(vp)
1493 register struct vnode *vp;
1494{
1495 int s;
1496
1497 s = splbio();
1498 vp->v_holdcnt++;
1499 if (VSHOULDBUSY(vp))
1500 vbusy(vp);
1501 splx(s);
1502}
1503
1504/*
1505 * One less who cares about this vnode.
1506 */
1507void
1508vdrop(vp)
1509 register struct vnode *vp;
1510{
1511 int s;
1512
1513 s = splbio();
1514 if (vp->v_holdcnt <= 0)
1515 panic("vdrop: holdcnt");
1516 vp->v_holdcnt--;
1517 if (VSHOULDFREE(vp))
1518 vfree(vp);
1519 splx(s);
1520}
1521
1522/*
1523 * Remove any vnodes in the vnode table belonging to mount point mp.
1524 *
1525 * If MNT_NOFORCE is specified, there should not be any active ones,
1526 * return error if any are found (nb: this is a user error, not a
1527 * system error). If MNT_FORCE is specified, detach any active vnodes
1528 * that are found.
1529 */
1530#ifdef DIAGNOSTIC
1531static int busyprt = 0; /* print out busy vnodes */
1532SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1533#endif
1534
1535int
1536vflush(mp, skipvp, flags)
1537 struct mount *mp;
1538 struct vnode *skipvp;
1539 int flags;
1540{
1541 struct proc *p = curproc; /* XXX */
1542 struct vnode *vp, *nvp;
1543 int busy = 0;
1544
1545 simple_lock(&mntvnode_slock);
1546loop:
1547 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1548 /*
1549 * Make sure this vnode wasn't reclaimed in getnewvnode().
 1550 * Start over if it was (it won't be on the list anymore).
1551 */
1552 if (vp->v_mount != mp)
1553 goto loop;
1554 nvp = LIST_NEXT(vp, v_mntvnodes);
1555 /*
1556 * Skip over a selected vnode.
1557 */
1558 if (vp == skipvp)
1559 continue;
1560
1561 simple_lock(&vp->v_interlock);
1562 /*
 1563 * Skip over vnodes marked VSYSTEM.
1564 */
1565 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1566 simple_unlock(&vp->v_interlock);
1567 continue;
1568 }
1569 /*
1570 * If WRITECLOSE is set, only flush out regular file vnodes
1571 * open for writing.
1572 */
1573 if ((flags & WRITECLOSE) &&
1574 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1575 simple_unlock(&vp->v_interlock);
1576 continue;
1577 }
1578
1579 /*
1580 * With v_usecount == 0, all we need to do is clear out the
1581 * vnode data structures and we are done.
1582 */
1583 if (vp->v_usecount == 0) {
1584 simple_unlock(&mntvnode_slock);
1585 vgonel(vp, p);
1586 simple_lock(&mntvnode_slock);
1587 continue;
1588 }
1589
1590 /*
1591 * If FORCECLOSE is set, forcibly close the vnode. For block
1592 * or character devices, revert to an anonymous device. For
1593 * all other files, just kill them.
1594 */
1595 if (flags & FORCECLOSE) {
1596 simple_unlock(&mntvnode_slock);
1597 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1598 vgonel(vp, p);
1599 } else {
1600 vclean(vp, 0, p);
1601 vp->v_op = spec_vnodeop_p;
1602 insmntque(vp, (struct mount *) 0);
1603 }
1604 simple_lock(&mntvnode_slock);
1605 continue;
1606 }
1607#ifdef DIAGNOSTIC
1608 if (busyprt)
1609 vprint("vflush: busy vnode", vp);
1610#endif
1611 simple_unlock(&vp->v_interlock);
1612 busy++;
1613 }
1614 simple_unlock(&mntvnode_slock);
1615 if (busy)
1616 return (EBUSY);
1617 return (0);
1618}
1619
1620/*
1621 * Disassociate the underlying file system from a vnode.
1622 */
1623static void
1624vclean(vp, flags, p)
1625 struct vnode *vp;
1626 int flags;
1627 struct proc *p;
1628{
1629 int active;
1630 vm_object_t obj;
1631
1632 /*
1633 * Check to see if the vnode is in use. If so we have to reference it
1634 * before we clean it out so that its count cannot fall to zero and
1635 * generate a race against ourselves to recycle it.
1636 */
1637 if ((active = vp->v_usecount))
1638 vp->v_usecount++;
1639
1640 /*
1641 * Prevent the vnode from being recycled or brought into use while we
1642 * clean it out.
1643 */
1644 if (vp->v_flag & VXLOCK)
1645 panic("vclean: deadlock");
1646 vp->v_flag |= VXLOCK;
1647 /*
1648 * Even if the count is zero, the VOP_INACTIVE routine may still
1649 * have the object locked while it cleans it out. The VOP_LOCK
1650 * ensures that the VOP_INACTIVE routine is done with its work.
1651 * For active vnodes, it ensures that no other activity can
1652 * occur while the underlying object is being cleaned out.
1653 */
1654 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1655
1656 /*
1657 * Clean out any buffers associated with the vnode.
1658 */
1659 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1660 if ((obj = vp->v_object) != NULL) {
1661 if (obj->ref_count == 0) {
1662 /*
1663 * vclean() may be called twice. The first time removes the
1664 * primary reference to the object, the second time goes
1665 * one further and is a special-case to terminate the object.
1666 */
1667 vm_object_terminate(obj);
1668 } else {
1669 /*
1670 * Woe to the process that tries to page now :-).
1671 */
1672 vm_pager_deallocate(obj);
1673 }
1674 }
1675
1676 /*
1677 * If purging an active vnode, it must be closed and
1678 * deactivated before being reclaimed. Note that the
1679 * VOP_INACTIVE will unlock the vnode.
1680 */
1681 if (active) {
1682 if (flags & DOCLOSE)
1683 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1684 VOP_INACTIVE(vp, p);
1685 } else {
1686 /*
1687 * Any other processes trying to obtain this lock must first
1688 * wait for VXLOCK to clear, then call the new lock operation.
1689 */
1690 VOP_UNLOCK(vp, 0, p);
1691 }
1692 /*
1693 * Reclaim the vnode.
1694 */
1695 if (VOP_RECLAIM(vp, p))
1696 panic("vclean: cannot reclaim");
1697
1698 if (active) {
1699 /*
1700 * Inline copy of vrele() since VOP_INACTIVE
1701 * has already been called.
1702 */
1703 simple_lock(&vp->v_interlock);
1704 if (--vp->v_usecount <= 0) {
1705#ifdef DIAGNOSTIC
1706 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1707 vprint("vclean: bad ref count", vp);
1708 panic("vclean: ref cnt");
1709 }
1710#endif
1711 vfree(vp);
1712 }
1713 simple_unlock(&vp->v_interlock);
1714 }
1715
1716 cache_purge(vp);
1717 if (vp->v_vnlock) {
1718 FREE(vp->v_vnlock, M_VNODE);
1719 vp->v_vnlock = NULL;
1720 }
1721
1722 if (VSHOULDFREE(vp))
1723 vfree(vp);
1724
1725 /*
1726 * Done with purge, notify sleepers of the grim news.
1727 */
1728 vp->v_op = dead_vnodeop_p;
1729 vn_pollgone(vp);
1730 vp->v_tag = VT_NON;
1731 vp->v_flag &= ~VXLOCK;
1732 if (vp->v_flag & VXWANT) {
1733 vp->v_flag &= ~VXWANT;
1734 wakeup((caddr_t) vp);
1735 }
1736}
1737
1738/*
1739 * Eliminate all activity associated with the requested vnode
1740 * and with all vnodes aliased to the requested vnode.
1741 */
1742int
1743vop_revoke(ap)
1744 struct vop_revoke_args /* {
1745 struct vnode *a_vp;
1746 int a_flags;
1747 } */ *ap;
1748{
1749 struct vnode *vp, *vq;
1750 dev_t dev;
1751
1752 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1753
1754 vp = ap->a_vp;
1755 /*
1756 * If a vgone (or vclean) is already in progress,
1757 * wait until it is done and return.
1758 */
1759 if (vp->v_flag & VXLOCK) {
1760 vp->v_flag |= VXWANT;
1761 simple_unlock(&vp->v_interlock);
1762 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1763 return (0);
1764 }
1765 dev = vp->v_rdev;
1766 for (;;) {
1767 simple_lock(&spechash_slock);
1768 vq = SLIST_FIRST(&dev->si_hlist);
1769 simple_unlock(&spechash_slock);
1770 if (!vq)
1771 break;
1772 vgone(vq);
1773 }
1774 return (0);
1775}
1776
1777/*
1778 * Recycle an unused vnode to the front of the free list.
1779 * Release the passed interlock if the vnode will be recycled.
1780 */
1781int
1782vrecycle(vp, inter_lkp, p)
1783 struct vnode *vp;
1784 struct simplelock *inter_lkp;
1785 struct proc *p;
1786{
1787
1788 simple_lock(&vp->v_interlock);
1789 if (vp->v_usecount == 0) {
1790 if (inter_lkp) {
1791 simple_unlock(inter_lkp);
1792 }
1793 vgonel(vp, p);
1794 return (1);
1795 }
1796 simple_unlock(&vp->v_interlock);
1797 return (0);
1798}
1799
1800/*
1801 * Eliminate all activity associated with a vnode
1802 * in preparation for reuse.
1803 */
1804void
1805vgone(vp)
1806 register struct vnode *vp;
1807{
1808 struct proc *p = curproc; /* XXX */
1809
1810 simple_lock(&vp->v_interlock);
1811 vgonel(vp, p);
1812}
1813
1814/*
1815 * vgone, with the vp interlock held.
1816 */
1817void
1818vgonel(vp, p)
1819 struct vnode *vp;
1820 struct proc *p;
1821{
1822 int s;
1823
1824 /*
1825 * If a vgone (or vclean) is already in progress,
1826 * wait until it is done and return.
1827 */
1828 if (vp->v_flag & VXLOCK) {
1829 vp->v_flag |= VXWANT;
1830 simple_unlock(&vp->v_interlock);
1831 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1832 return;
1833 }
1834
1835 /*
1836 * Clean out the filesystem specific data.
1837 */
1838 vclean(vp, DOCLOSE, p);
1839 simple_lock(&vp->v_interlock);
1840
1841 /*
1842 * Delete from old mount point vnode list, if on one.
1843 */
1844 if (vp->v_mount != NULL)
1845 insmntque(vp, (struct mount *)0);
1846 /*
1847 * If special device, remove it from special device alias list
1848 * if it is on one.
1849 */
1850 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1851 simple_lock(&spechash_slock);
1852 SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
1853 freedev(vp->v_rdev);
1854 simple_unlock(&spechash_slock);
1855 vp->v_rdev = NULL;
1856 }
1857
1858 /*
1859 * If it is on the freelist and not already at the head,
1860 * move it to the head of the list. The test of the back
1861 * pointer and the reference count of zero is because
1862 * it will be removed from the free list by getnewvnode,
1863 * but will not have its reference count incremented until
1864 * after calling vgone. If the reference count were
1865 * incremented first, vgone would (incorrectly) try to
1866 * close the previous instance of the underlying object.
1867 */
1868 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1869 s = splbio();
1870 simple_lock(&vnode_free_list_slock);
1871 if (vp->v_flag & VFREE) {
1872 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1873 } else if (vp->v_flag & VTBFREE) {
1874 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1875 vp->v_flag &= ~VTBFREE;
1876 freevnodes++;
1877 } else
1878 freevnodes++;
1879 vp->v_flag |= VFREE;
1880 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1881 simple_unlock(&vnode_free_list_slock);
1882 splx(s);
1883 }
1884
1885 vp->v_type = VBAD;
1886 simple_unlock(&vp->v_interlock);
1887}
1888
1889/*
1890 * Lookup a vnode by device number.
1891 */
1892int
1893vfinddev(dev, type, vpp)
1894 dev_t dev;
1895 enum vtype type;
1896 struct vnode **vpp;
1897{
1898 struct vnode *vp;
1899
1900 simple_lock(&spechash_slock);
1901 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1902 if (type == vp->v_type) {
1903 *vpp = vp;
1904 simple_unlock(&spechash_slock);
1905 return (1);
1906 }
1907 }
1908 simple_unlock(&spechash_slock);
1909 return (0);
1910}
1911
1912/*
1913 * Calculate the total number of references to a special device.
1914 */
1915int
1916vcount(vp)
1917 struct vnode *vp;
1918{
1919 struct vnode *vq;
1920 int count;
1921
1922 count = 0;
1923 simple_lock(&spechash_slock);
1924 SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1925 count += vq->v_usecount;
1926 simple_unlock(&spechash_slock);
1927 return (count);
1928}
1929
1930/*
1931 * Same as above, but using the dev_t as argument
1932 */
1933
1934int
1935count_dev(dev)
1936 dev_t dev;
1937{
1938 struct vnode *vp;
1939
1940 vp = SLIST_FIRST(&dev->si_hlist);
1941 if (vp == NULL)
1942 return (0);
1943 return(vcount(vp));
1944}
1945
1946/*
1947 * Print out a description of a vnode.
1948 */
1949static char *typename[] =
1950{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1951
1952void
1953vprint(label, vp)
1954 char *label;
1955 struct vnode *vp;
1956{
1957 char buf[96];
1958
1959 if (label != NULL)
1960 printf("%s: %p: ", label, (void *)vp);
1961 else
1962 printf("%p: ", (void *)vp);
1963 printf("type %s, usecount %d, writecount %d, refcount %d,",
1964 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1965 vp->v_holdcnt);
1966 buf[0] = '\0';
1967 if (vp->v_flag & VROOT)
1968 strcat(buf, "|VROOT");
1969 if (vp->v_flag & VTEXT)
1970 strcat(buf, "|VTEXT");
1971 if (vp->v_flag & VSYSTEM)
1972 strcat(buf, "|VSYSTEM");
1973 if (vp->v_flag & VXLOCK)
1974 strcat(buf, "|VXLOCK");
1975 if (vp->v_flag & VXWANT)
1976 strcat(buf, "|VXWANT");
1977 if (vp->v_flag & VBWAIT)
1978 strcat(buf, "|VBWAIT");
1979 if (vp->v_flag & VDOOMED)
1980 strcat(buf, "|VDOOMED");
1981 if (vp->v_flag & VFREE)
1982 strcat(buf, "|VFREE");
1983 if (vp->v_flag & VOBJBUF)
1984 strcat(buf, "|VOBJBUF");
1985 if (buf[0] != '\0')
1986 printf(" flags (%s)", &buf[1]);
1987 if (vp->v_data == NULL) {
1988 printf("\n");
1989 } else {
1990 printf("\n\t");
1991 VOP_PRINT(vp);
1992 }
1993}
1994
1995#ifdef DDB
1996#include <ddb/ddb.h>
1997/*
1998 * List all of the locked vnodes in the system.
1999 * Called when debugging the kernel.
2000 */
2001DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2002{
2003 struct proc *p = curproc; /* XXX */
2004 struct mount *mp, *nmp;
2005 struct vnode *vp;
2006
2007 printf("Locked vnodes\n");
2008 simple_lock(&mountlist_slock);
2009 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2010 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2011 nmp = TAILQ_NEXT(mp, mnt_list);
2012 continue;
2013 }
2014 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2015 if (VOP_ISLOCKED(vp, NULL))
2016 vprint((char *)0, vp);
2017 }
2018 simple_lock(&mountlist_slock);
2019 nmp = TAILQ_NEXT(mp, mnt_list);
2020 vfs_unbusy(mp, p);
2021 }
2022 simple_unlock(&mountlist_slock);
2023}
2024#endif
2025
2026/*
2027 * Top level filesystem related information gathering.
2028 */
2029static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
2030
2031static int
2032vfs_sysctl SYSCTL_HANDLER_ARGS
2033{
2034 int *name = (int *)arg1 - 1; /* XXX */
2035 u_int namelen = arg2 + 1; /* XXX */
2036 struct vfsconf *vfsp;
2037
2038#if 1 || defined(COMPAT_PRELITE2)
2039 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2040 if (namelen == 1)
2041 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2042#endif
2043
2044#ifdef notyet
2045 /* all sysctl names at this level are at least name and field */
2046 if (namelen < 2)
2047 return (ENOTDIR); /* overloaded */
2048 if (name[0] != VFS_GENERIC) {
2049 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2050 if (vfsp->vfc_typenum == name[0])
2051 break;
2052 if (vfsp == NULL)
2053 return (EOPNOTSUPP);
2054 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2055 oldp, oldlenp, newp, newlen, p));
2056 }
2057#endif
2058 switch (name[1]) {
2059 case VFS_MAXTYPENUM:
2060 if (namelen != 2)
2061 return (ENOTDIR);
2062 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2063 case VFS_CONF:
2064 if (namelen != 3)
2065 return (ENOTDIR); /* overloaded */
2066 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2067 if (vfsp->vfc_typenum == name[2])
2068 break;
2069 if (vfsp == NULL)
2070 return (EOPNOTSUPP);
2071 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2072 }
2073 return (EOPNOTSUPP);
2074}
2075
2076SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2077 "Generic filesystem");
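/*
 * Illustrative sketch (not part of the original source): a userland
 * program could walk the node registered above through sysctl(3),
 * first reading VFS_MAXTYPENUM and then requesting the VFS_CONF entry
 * for each type number, mirroring the two cases handled by vfs_sysctl()
 * above.  Error handling is mostly omitted for brevity.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <stdio.h>

static void
example_list_vfsconf(void)
{
	int name[4], maxtype, i;
	size_t len;
	struct vfsconf vfc;

	name[0] = CTL_VFS;
	name[1] = VFS_GENERIC;
	name[2] = VFS_MAXTYPENUM;
	len = sizeof(maxtype);
	(void)sysctl(name, 3, &maxtype, &len, NULL, 0);

	name[2] = VFS_CONF;
	for (i = 0; i < maxtype; i++) {
		name[3] = i;
		len = sizeof(vfc);
		if (sysctl(name, 4, &vfc, &len, NULL, 0) == 0)
			printf("%s (type %d)\n", vfc.vfc_name, vfc.vfc_typenum);
	}
}
#endif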
2078
2079#if 1 || defined(COMPAT_PRELITE2)
2080
2081static int
2082sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2083{
2084 int error;
2085 struct vfsconf *vfsp;
2086 struct ovfsconf ovfs;
2087
2088 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2089 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2090 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2091 ovfs.vfc_index = vfsp->vfc_typenum;
2092 ovfs.vfc_refcount = vfsp->vfc_refcount;
2093 ovfs.vfc_flags = vfsp->vfc_flags;
2094 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2095 if (error)
2096 return error;
2097 }
2098 return 0;
2099}
2100
2101#endif /* 1 || COMPAT_PRELITE2 */
2102
2103#if 0
2104#define KINFO_VNODESLOP 10
2105/*
2106 * Dump vnode list (via sysctl).
2107 * Copyout address of vnode followed by vnode.
2108 */
2109/* ARGSUSED */
2110static int
2111sysctl_vnode SYSCTL_HANDLER_ARGS
2112{
2113 struct proc *p = curproc; /* XXX */
2114 struct mount *mp, *nmp;
2115 struct vnode *nvp, *vp;
2116 int error;
2117
2118#define VPTRSZ sizeof (struct vnode *)
2119#define VNODESZ sizeof (struct vnode)
2120
2121 req->lock = 0;
2122 if (!req->oldptr) /* Make an estimate */
2123 return (SYSCTL_OUT(req, 0,
2124 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2125
2126 simple_lock(&mountlist_slock);
2127 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2128 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2129 nmp = TAILQ_NEXT(mp, mnt_list);
2130 continue;
2131 }
2132again:
2133 simple_lock(&mntvnode_slock);
2134 for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2135 vp != NULL;
2136 vp = nvp) {
2137 /*
2138 * Check that the vp is still associated with
2139 * this filesystem. RACE: could have been
2140 * recycled onto the same filesystem.
2141 */
2142 if (vp->v_mount != mp) {
2143 simple_unlock(&mntvnode_slock);
2144 goto again;
2145 }
2146 nvp = LIST_NEXT(vp, v_mntvnodes);
2147 simple_unlock(&mntvnode_slock);
2148 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2149 (error = SYSCTL_OUT(req, vp, VNODESZ)))
2150 return (error);
2151 simple_lock(&mntvnode_slock);
2152 }
2153 simple_unlock(&mntvnode_slock);
2154 simple_lock(&mountlist_slock);
2155 nmp = TAILQ_NEXT(mp, mnt_list);
2156 vfs_unbusy(mp, p);
2157 }
2158 simple_unlock(&mountlist_slock);
2159
2160 return (0);
2161}
2162#endif
2163
2164/*
2165 * XXX
2166 * Exporting the vnode list on large systems causes them to crash.
2167 * Exporting the vnode list on medium systems causes sysctl to coredump.
2168 */
2169#if 0
2170SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2171 0, 0, sysctl_vnode, "S,vnode", "");
2172#endif
2173
2174/*
2175 * Check to see if a filesystem is mounted on a block device.
2176 */
2177int
2178vfs_mountedon(vp)
2179 struct vnode *vp;
2180{
2181
2182 if (vp->v_specmountpoint != NULL)
2183 return (EBUSY);
2184 return (0);
2185}
2186
2187/*
2188 * Unmount all filesystems. The list is traversed in reverse order
2189 * of mounting to avoid dependencies.
2190 */
2191void
2192vfs_unmountall()
2193{
2194 struct mount *mp;
2195 struct proc *p;
2196 int error;
2197
2198 if (curproc != NULL)
2199 p = curproc;
2200 else
2201 p = initproc; /* XXX XXX should this be proc0? */
2202 /*
2203 * Since this only runs when rebooting, it is not interlocked.
2204 */
2205 while(!TAILQ_EMPTY(&mountlist)) {
2206 mp = TAILQ_LAST(&mountlist, mntlist);
2207 error = dounmount(mp, MNT_FORCE, p);
2208 if (error) {
2209 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2210 printf("unmount of %s failed (",
2211 mp->mnt_stat.f_mntonname);
2212 if (error == EBUSY)
2213 printf("BUSY)\n");
2214 else
2215 printf("%d)\n", error);
2216 } else {
2217 /* The unmount has removed mp from the mountlist */
2218 }
2219 }
2220}
2221
2222/*
2223 * Build hash lists of net addresses and hang them off the mount point.
2224 * Called by ufs_mount() to set up the lists of export addresses.
2225 */
2226static int
2227vfs_hang_addrlist(mp, nep, argp)
2228 struct mount *mp;
2229 struct netexport *nep;
2230 struct export_args *argp;
2231{
2232 register struct netcred *np;
2233 register struct radix_node_head *rnh;
2234 register int i;
2235 struct radix_node *rn;
2236 struct sockaddr *saddr, *smask = 0;
2237 struct domain *dom;
2238 int error;
2239
2240 if (argp->ex_addrlen == 0) {
2241 if (mp->mnt_flag & MNT_DEFEXPORTED)
2242 return (EPERM);
2243 np = &nep->ne_defexported;
2244 np->netc_exflags = argp->ex_flags;
2245 np->netc_anon = argp->ex_anon;
2246 np->netc_anon.cr_ref = 1;
2247 mp->mnt_flag |= MNT_DEFEXPORTED;
2248 return (0);
2249 }
2250 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2251 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2252 bzero((caddr_t) np, i);
2253 saddr = (struct sockaddr *) (np + 1);
2254 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2255 goto out;
2256 if (saddr->sa_len > argp->ex_addrlen)
2257 saddr->sa_len = argp->ex_addrlen;
2258 if (argp->ex_masklen) {
2259 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2260 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2261 if (error)
2262 goto out;
2263 if (smask->sa_len > argp->ex_masklen)
2264 smask->sa_len = argp->ex_masklen;
2265 }
2266 i = saddr->sa_family;
2267 if ((rnh = nep->ne_rtable[i]) == 0) {
2268 /*
2269 * Seems silly to initialize every AF when most are not used,
2270		 * so do it on demand here
2271 */
2272 for (dom = domains; dom; dom = dom->dom_next)
2273 if (dom->dom_family == i && dom->dom_rtattach) {
2274 dom->dom_rtattach((void **) &nep->ne_rtable[i],
2275 dom->dom_rtoffset);
2276 break;
2277 }
2278 if ((rnh = nep->ne_rtable[i]) == 0) {
2279 error = ENOBUFS;
2280 goto out;
2281 }
2282 }
2283 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2284 np->netc_rnodes);
2285 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
2286 error = EPERM;
2287 goto out;
2288 }
2289 np->netc_exflags = argp->ex_flags;
2290 np->netc_anon = argp->ex_anon;
2291 np->netc_anon.cr_ref = 1;
2292 return (0);
2293out:
2294 free(np, M_NETADDR);
2295 return (error);
2296}
2297
2298/* ARGSUSED */
2299static int
2300vfs_free_netcred(rn, w)
2301 struct radix_node *rn;
2302 void *w;
2303{
2304 register struct radix_node_head *rnh = (struct radix_node_head *) w;
2305
2306 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2307 free((caddr_t) rn, M_NETADDR);
2308 return (0);
2309}
2310
2311/*
2312 * Free the net address hash lists that are hanging off the mount points.
2313 */
2314static void
2315vfs_free_addrlist(nep)
2316 struct netexport *nep;
2317{
2318 register int i;
2319 register struct radix_node_head *rnh;
2320
2321 for (i = 0; i <= AF_MAX; i++)
2322 if ((rnh = nep->ne_rtable[i])) {
2323 (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2324 (caddr_t) rnh);
2325 free((caddr_t) rnh, M_RTABLE);
2326 nep->ne_rtable[i] = 0;
2327 }
2328}
2329
2330int
2331vfs_export(mp, nep, argp)
2332 struct mount *mp;
2333 struct netexport *nep;
2334 struct export_args *argp;
2335{
2336 int error;
2337
2338 if (argp->ex_flags & MNT_DELEXPORT) {
2339 if (mp->mnt_flag & MNT_EXPUBLIC) {
2340 vfs_setpublicfs(NULL, NULL, NULL);
2341 mp->mnt_flag &= ~MNT_EXPUBLIC;
2342 }
2343 vfs_free_addrlist(nep);
2344 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2345 }
2346 if (argp->ex_flags & MNT_EXPORTED) {
2347 if (argp->ex_flags & MNT_EXPUBLIC) {
2348 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2349 return (error);
2350 mp->mnt_flag |= MNT_EXPUBLIC;
2351 }
2352 if ((error = vfs_hang_addrlist(mp, nep, argp)))
2353 return (error);
2354 mp->mnt_flag |= MNT_EXPORTED;
2355 }
2356 return (0);
2357}
2358
2359
2360/*
2361 * Set the publicly exported filesystem (WebNFS). Currently, only
2362 * one public filesystem is possible in the spec (RFC 2054 and 2055)
2363 */
2364int
2365vfs_setpublicfs(mp, nep, argp)
2366 struct mount *mp;
2367 struct netexport *nep;
2368 struct export_args *argp;
2369{
2370 int error;
2371 struct vnode *rvp;
2372 char *cp;
2373
2374 /*
2375 * mp == NULL -> invalidate the current info, the FS is
2376 * no longer exported. May be called from either vfs_export
2377 * or unmount, so check if it hasn't already been done.
2378 */
2379 if (mp == NULL) {
2380 if (nfs_pub.np_valid) {
2381 nfs_pub.np_valid = 0;
2382 if (nfs_pub.np_index != NULL) {
2383 FREE(nfs_pub.np_index, M_TEMP);
2384 nfs_pub.np_index = NULL;
2385 }
2386 }
2387 return (0);
2388 }
2389
2390 /*
2391 * Only one allowed at a time.
2392 */
2393 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2394 return (EBUSY);
2395
2396 /*
2397 * Get real filehandle for root of exported FS.
2398 */
2399 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2400 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2401
2402 if ((error = VFS_ROOT(mp, &rvp)))
2403 return (error);
2404
2405 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2406 return (error);
2407
2408 vput(rvp);
2409
2410 /*
2411 * If an indexfile was specified, pull it in.
2412 */
2413 if (argp->ex_indexfile != NULL) {
2414 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2415 M_WAITOK);
2416 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2417 MAXNAMLEN, (size_t *)0);
2418 if (!error) {
2419 /*
2420 * Check for illegal filenames.
2421 */
2422 for (cp = nfs_pub.np_index; *cp; cp++) {
2423 if (*cp == '/') {
2424 error = EINVAL;
2425 break;
2426 }
2427 }
2428 }
2429 if (error) {
2430 FREE(nfs_pub.np_index, M_TEMP);
2431 return (error);
2432 }
2433 }
2434
2435 nfs_pub.np_mount = mp;
2436 nfs_pub.np_valid = 1;
2437 return (0);
2438}
2439
2440struct netcred *
2441vfs_export_lookup(mp, nep, nam)
2442 register struct mount *mp;
2443 struct netexport *nep;
2444 struct sockaddr *nam;
2445{
2446 register struct netcred *np;
2447 register struct radix_node_head *rnh;
2448 struct sockaddr *saddr;
2449
2450 np = NULL;
2451 if (mp->mnt_flag & MNT_EXPORTED) {
2452 /*
2453 * Lookup in the export list first.
2454 */
2455 if (nam != NULL) {
2456 saddr = nam;
2457 rnh = nep->ne_rtable[saddr->sa_family];
2458 if (rnh != NULL) {
2459 np = (struct netcred *)
2460 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2461 rnh);
2462 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2463 np = NULL;
2464 }
2465 }
2466 /*
2467 * If no address match, use the default if it exists.
2468 */
2469 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2470 np = &nep->ne_defexported;
2471 }
2472 return (np);
2473}
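/*
 * Illustrative sketch (not part of the original source): how an
 * exporting filesystem's export-check hook would typically consume
 * vfs_export_lookup().  In a real filesystem the struct netexport lives
 * in the fs-specific mount structure; it is passed as a parameter here
 * only to keep the sketch self-contained, and the function name is
 * hypothetical.
 */
#if 0
static int
example_checkexp(struct mount *mp, struct netexport *nep,
    struct sockaddr *nam, int *exflagsp, struct ucred **credanonp)
{
	struct netcred *np;

	/* NULL means the client address is not covered by any export. */
	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);
	*exflagsp = np->netc_exflags;
	*credanonp = &np->netc_anon;
	return (0);
}
#endif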
2474
2475/*
2476 * perform msync on all vnodes under a mount point
2477 * the mount point must be locked.
2478 */
2479void
2480vfs_msync(struct mount *mp, int flags) {
2481 struct vnode *vp, *nvp;
2482 struct vm_object *obj;
2483 int anyio, tries;
2484
2485 tries = 5;
2486loop:
2487 anyio = 0;
2488 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2489
2490 nvp = LIST_NEXT(vp, v_mntvnodes);
2491
2492 if (vp->v_mount != mp) {
2493 goto loop;
2494 }
2495
2496 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
2497 continue;
2498
2499 if (flags != MNT_WAIT) {
2500 obj = vp->v_object;
2501 if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2502 continue;
2503 if (VOP_ISLOCKED(vp, NULL))
2504 continue;
2505 }
2506
2507 simple_lock(&vp->v_interlock);
2508 if (vp->v_object &&
2509 (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2510 if (!vget(vp,
2511 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2512 if (vp->v_object) {
2513 vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2514 anyio = 1;
2515 }
2516 vput(vp);
2517 }
2518 } else {
2519 simple_unlock(&vp->v_interlock);
2520 }
2521 }
2522 if (anyio && (--tries > 0))
2523 goto loop;
2524}
2525
2526/*
2527 * Create the VM object needed for VMIO and mmap support. This
2528 * is done for all VREG files in the system.  Some filesystems may
2529 * take advantage of the additional metadata buffering capability of
2530 * the VMIO code by making the device node VMIO mode as well.
2531 *
2532 * vp must be locked when vfs_object_create is called.
2533 */
2534int
2535vfs_object_create(vp, p, cred)
2536 struct vnode *vp;
2537 struct proc *p;
2538 struct ucred *cred;
2539{
2540 struct vattr vat;
2541 vm_object_t object;
2542 int error = 0;
2543
2544 if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
2545 return 0;
2546
2547retry:
2548 if ((object = vp->v_object) == NULL) {
2549 if (vp->v_type == VREG || vp->v_type == VDIR) {
2550 if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2551 goto retn;
2552 object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2553 } else if (devsw(vp->v_rdev) != NULL) {
2554 /*
2555 * This simply allocates the biggest object possible
2556 * for a disk vnode. This should be fixed, but doesn't
2557 * cause any problems (yet).
2558 */
2559 object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2560 } else {
2561 goto retn;
2562 }
2563 /*
2564 * Dereference the reference we just created. This assumes
2565 * that the object is associated with the vp.
2566 */
2567 object->ref_count--;
2568 vp->v_usecount--;
2569 } else {
2570 if (object->flags & OBJ_DEAD) {
2571 VOP_UNLOCK(vp, 0, p);
2572 tsleep(object, PVM, "vodead", 0);
2573 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2574 goto retry;
2575 }
2576 }
2577
2578 KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2579 vp->v_flag |= VOBJBUF;
2580
2581retn:
2582 return error;
2583}
2584
2585static void
2586vfree(vp)
2587 struct vnode *vp;
2588{
2589 int s;
2590
2591 s = splbio();
2592 simple_lock(&vnode_free_list_slock);
2593 if (vp->v_flag & VTBFREE) {
2594 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2595 vp->v_flag &= ~VTBFREE;
2596 }
2597 if (vp->v_flag & VAGE) {
2598 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2599 } else {
2600 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2601 }
2602 freevnodes++;
2603 simple_unlock(&vnode_free_list_slock);
2604 vp->v_flag &= ~VAGE;
2605 vp->v_flag |= VFREE;
2606 splx(s);
2607}
2608
2609void
2610vbusy(vp)
2611 struct vnode *vp;
2612{
2613 int s;
2614
2615 s = splbio();
2616 simple_lock(&vnode_free_list_slock);
2617 if (vp->v_flag & VTBFREE) {
2618 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2619 vp->v_flag &= ~VTBFREE;
2620 } else {
2621 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2622 freevnodes--;
2623 }
2624 simple_unlock(&vnode_free_list_slock);
2625 vp->v_flag &= ~(VFREE|VAGE);
2626 splx(s);
2627}
2628
2629/*
2630 * Record a process's interest in events which might happen to
2631 * a vnode. Because poll uses the historic select-style interface
2632 * internally, this routine serves as both the ``check for any
2633 * pending events'' and the ``record my interest in future events''
2634 * functions. (These are done together, while the lock is held,
2635 * to avoid race conditions.)
2636 */
2637int
2638vn_pollrecord(vp, p, events)
2639 struct vnode *vp;
2640 struct proc *p;
2641 short events;
2642{
2643 simple_lock(&vp->v_pollinfo.vpi_lock);
2644 if (vp->v_pollinfo.vpi_revents & events) {
2645 /*
2646 * This leaves events we are not interested
2647 * in available for the other process which
2648 * which presumably had requested them
2649 * (otherwise they would never have been
2650 * recorded).
2651 */
2652 events &= vp->v_pollinfo.vpi_revents;
2653 vp->v_pollinfo.vpi_revents &= ~events;
2654
2655 simple_unlock(&vp->v_pollinfo.vpi_lock);
2656 return events;
2657 }
2658 vp->v_pollinfo.vpi_events |= events;
2659 selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2660 simple_unlock(&vp->v_pollinfo.vpi_lock);
2661 return 0;
2662}
2663
2664/*
2665 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
2666 * it is possible for us to miss an event due to race conditions, but
2667 * that condition is expected to be rare, so for the moment it is the
2668 * preferred interface.
2669 */
2670void
2671vn_pollevent(vp, events)
2672 struct vnode *vp;
2673 short events;
2674{
2675 simple_lock(&vp->v_pollinfo.vpi_lock);
2676 if (vp->v_pollinfo.vpi_events & events) {
2677 /*
2678 * We clear vpi_events so that we don't
2679 * call selwakeup() twice if two events are
2680 * posted before the polling process(es) is
2681 * awakened. This also ensures that we take at
2682 * most one selwakeup() if the polling process
2683 * is no longer interested. However, it does
2684 * mean that only one event can be noticed at
2685 * a time. (Perhaps we should only clear those
2686 * event bits which we note?) XXX
2687 */
2688 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
2689 vp->v_pollinfo.vpi_revents |= events;
2690 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2691 }
2692 simple_unlock(&vp->v_pollinfo.vpi_lock);
2693}
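/*
 * Illustrative sketch (not part of the original source): the two halves
 * of the vnode polling protocol.  A filesystem's VOP_POLL implementation
 * records interest with vn_pollrecord(), and any code path that changes
 * the vnode's state reports it with the VN_POLLEVENT() macro, which only
 * drops into vn_pollevent() when someone has registered interest.  The
 * function name below is hypothetical.
 */
#if 0
static int
example_vop_poll(struct vop_poll_args *ap)
{
	/* Check for pending events and register interest in future ones. */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}

/* ...and in a write path, after appending data to the vnode: */
/*	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);	*/
#endif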
2694
2695/*
2696 * Wake up anyone polling on vp because it is being revoked.
2697 * This depends on dead_poll() returning POLLHUP for correct
2698 * behavior.
2699 */
2700void
2701vn_pollgone(vp)
2702 struct vnode *vp;
2703{
2704 simple_lock(&vp->v_pollinfo.vpi_lock);
2705 if (vp->v_pollinfo.vpi_events) {
2706 vp->v_pollinfo.vpi_events = 0;
2707 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2708 }
2709 simple_unlock(&vp->v_pollinfo.vpi_lock);
2710}
2711
2712
2713
2714/*
2715 * Routine to create and manage a filesystem syncer vnode.
2716 */
2717#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
2718static int sync_fsync __P((struct vop_fsync_args *));
2719static int sync_inactive __P((struct vop_inactive_args *));
2720static int sync_reclaim __P((struct vop_reclaim_args *));
2721#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
2722#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
2723static int sync_print __P((struct vop_print_args *));
2724#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2725
2726static vop_t **sync_vnodeop_p;
2727static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2728 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
2729 { &vop_close_desc, (vop_t *) sync_close }, /* close */
2730 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
2731 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
2732 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
2733 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
2734 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
2735 { &vop_print_desc, (vop_t *) sync_print }, /* print */
2736 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
2737 { NULL, NULL }
2738};
2739static struct vnodeopv_desc sync_vnodeop_opv_desc =
2740 { &sync_vnodeop_p, sync_vnodeop_entries };
2741
2742VNODEOP_SET(sync_vnodeop_opv_desc);
2743
2744/*
2745 * Create a new filesystem syncer vnode for the specified mount point.
2746 */
2747int
2748vfs_allocate_syncvnode(mp)
2749 struct mount *mp;
2750{
2751 struct vnode *vp;
2752 static long start, incr, next;
2753 int error;
2754
2755 /* Allocate a new vnode */
2756 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2757 mp->mnt_syncer = NULL;
2758 return (error);
2759 }
2760 vp->v_type = VNON;
2761 /*
2762 * Place the vnode onto the syncer worklist. We attempt to
2763 * scatter them about on the list so that they will go off
2764 * at evenly distributed times even if all the filesystems
2765 * are mounted at once.
2766 */
2767 next += incr;
2768 if (next == 0 || next > syncer_maxdelay) {
2769 start /= 2;
2770 incr /= 2;
2771 if (start == 0) {
2772 start = syncer_maxdelay / 2;
2773 incr = syncer_maxdelay;
2774 }
2775 next = start;
2776 }
2777 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2778 mp->mnt_syncer = vp;
2779 return (0);
2780}
2781
2782/*
2783 * Do a lazy sync of the filesystem.
2784 */
2785static int
2786sync_fsync(ap)
2787 struct vop_fsync_args /* {
2788 struct vnode *a_vp;
2789 struct ucred *a_cred;
2790 int a_waitfor;
2791 struct proc *a_p;
2792 } */ *ap;
2793{
2794 struct vnode *syncvp = ap->a_vp;
2795 struct mount *mp = syncvp->v_mount;
2796 struct proc *p = ap->a_p;
2797 int asyncflag;
2798
2799 /*
2800 * We only need to do something if this is a lazy evaluation.
2801 */
2802 if (ap->a_waitfor != MNT_LAZY)
2803 return (0);
2804
2805 /*
2806 * Move ourselves to the back of the sync list.
2807 */
2808 vn_syncer_add_to_worklist(syncvp, syncdelay);
2809
2810 /*
2811 * Walk the list of vnodes pushing all that are dirty and
2812 * not already on the sync list.
2813 */
2814 simple_lock(&mountlist_slock);
2815 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2816 simple_unlock(&mountlist_slock);
2817 return (0);
2818 }
2819 asyncflag = mp->mnt_flag & MNT_ASYNC;
2820 mp->mnt_flag &= ~MNT_ASYNC;
2821 vfs_msync(mp, MNT_NOWAIT);
2822 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2823 if (asyncflag)
2824 mp->mnt_flag |= MNT_ASYNC;
2825 vfs_unbusy(mp, p);
2826 return (0);
2827}
2828
2829/*
2830 * The syncer vnode is no longer referenced.
2831 */
2832static int
2833sync_inactive(ap)
2834 struct vop_inactive_args /* {
2835 struct vnode *a_vp;
2836 struct proc *a_p;
2837 } */ *ap;
2838{
2839
2840 vgone(ap->a_vp);
2841 return (0);
2842}
2843
2844/*
2845 * The syncer vnode is no longer needed and is being decommissioned.
2846 *
2847 * Modifications to the worklist must be protected at splbio().
2848 */
2849static int
2850sync_reclaim(ap)
2851 struct vop_reclaim_args /* {
2852 struct vnode *a_vp;
2853 } */ *ap;
2854{
2855 struct vnode *vp = ap->a_vp;
2856 int s;
2857
2858 s = splbio();
2859 vp->v_mount->mnt_syncer = NULL;
2860 if (vp->v_flag & VONWORKLST) {
2861 LIST_REMOVE(vp, v_synclist);
2862 vp->v_flag &= ~VONWORKLST;
2863 }
2864 splx(s);
2865
2866 return (0);
2867}
2868
2869/*
2870 * Print out a syncer vnode.
2871 */
2872static int
2873sync_print(ap)
2874 struct vop_print_args /* {
2875 struct vnode *a_vp;
2876 } */ *ap;
2877{
2878 struct vnode *vp = ap->a_vp;
2879
2880 printf("syncer vnode");
2881 if (vp->v_vnlock != NULL)
2882 lockmgr_printinfo(vp->v_vnlock);
2883 printf("\n");
2884 return (0);
2885}
2886
2887/*
2888 * extract the dev_t from a VBLK or VCHR
2889 */
2890dev_t
2891vn_todev(vp)
2892 struct vnode *vp;
2893{
2894 if (vp->v_type != VBLK && vp->v_type != VCHR)
2895 return (NODEV);
2896 return (vp->v_rdev);
2897}
2898
2899/*
2900 * Check if vnode represents a disk device
2901 */
2902int
2903vn_isdisk(vp, errp)
2904 struct vnode *vp;
2905 int *errp;
2906{
2907 if (vp->v_type != VBLK && vp->v_type != VCHR) {
2908 if (errp != NULL)
2909 *errp = ENOTBLK;
2910 return (0);
2911 }
2912 if (vp->v_rdev == NULL) {
2913 if (errp != NULL)
2914 *errp = ENXIO;
2915 return (0);
2916 }
2917 if (!devsw(vp->v_rdev)) {
2918 if (errp != NULL)
2919 *errp = ENXIO;
2920 return (0);
2921 }
2922 if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
2923 if (errp != NULL)
2924 *errp = ENOTBLK;
2925 return (0);
2926 }
2927 if (errp != NULL)
2928 *errp = 0;
2929 return (1);
2930}
2931
2932void
2933NDFREE(ndp, flags)
2934 struct nameidata *ndp;
2935 const uint flags;
2936{
2937 if (!(flags & NDF_NO_FREE_PNBUF) &&
2938 (ndp->ni_cnd.cn_flags & HASBUF)) {
2939 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2940 ndp->ni_cnd.cn_flags &= ~HASBUF;
2941 }
2942 if (!(flags & NDF_NO_DVP_UNLOCK) &&
2943 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2944 ndp->ni_dvp != ndp->ni_vp)
2945 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2946 if (!(flags & NDF_NO_DVP_RELE) &&
2947 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2948 vrele(ndp->ni_dvp);
2949 ndp->ni_dvp = NULL;
2950 }
2951 if (!(flags & NDF_NO_VP_UNLOCK) &&
2952 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2953 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
2954 if (!(flags & NDF_NO_VP_RELE) &&
2955 ndp->ni_vp) {
2956 vrele(ndp->ni_vp);
2957 ndp->ni_vp = NULL;
2958 }
2959 if (!(flags & NDF_NO_STARTDIR_RELE) &&
2960 (ndp->ni_cnd.cn_flags & SAVESTART)) {
2961 vrele(ndp->ni_startdir);
2962 ndp->ni_startdir = NULL;
2963 }
2964}
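/*
 * Illustrative sketch (not part of the original source): a typical
 * namei()/NDFREE() pairing as it might appear in a syscall-style path
 * lookup.  It assumes the NDF_ONLY_PNBUF convenience flag (free only the
 * pathname buffer, keep the locked vnode) is available in this era of the
 * namei interface; the function name is hypothetical.
 */
#if 0
static int
example_lookup(struct proc *p, char *upath)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, upath, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	/* Free the pathname buffer but keep nd.ni_vp locked and referenced. */
	NDFREE(&nd, NDF_ONLY_PNBUF);

	/* ... operate on nd.ni_vp here ... */

	vput(nd.ni_vp);
	return (0);
}
#endif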
50#include <sys/buf.h>
51#include <sys/conf.h>
52#include <sys/dirent.h>
53#include <sys/domain.h>
54#include <sys/eventhandler.h>
55#include <sys/fcntl.h>
56#include <sys/kernel.h>
57#include <sys/kthread.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/namei.h>
61#include <sys/proc.h>
62#include <sys/reboot.h>
63#include <sys/socket.h>
64#include <sys/stat.h>
65#include <sys/sysctl.h>
66#include <sys/vmmeter.h>
67#include <sys/vnode.h>
68
69#include <machine/limits.h>
70
71#include <vm/vm.h>
72#include <vm/vm_object.h>
73#include <vm/vm_extern.h>
74#include <vm/pmap.h>
75#include <vm/vm_map.h>
76#include <vm/vm_page.h>
77#include <vm/vm_pager.h>
78#include <vm/vnode_pager.h>
79#include <vm/vm_zone.h>
80
81static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
82
83static void insmntque __P((struct vnode *vp, struct mount *mp));
84static void vclean __P((struct vnode *vp, int flags, struct proc *p));
85static void vfree __P((struct vnode *));
86static unsigned long numvnodes;
87SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
88
89enum vtype iftovt_tab[16] = {
90 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
91 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
92};
93int vttoif_tab[9] = {
94 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
95 S_IFSOCK, S_IFIFO, S_IFMT,
96};
97
98static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
99struct tobefreelist vnode_tobefree_list;	/* vnodes to be freed */
100
101static u_long wantfreevnodes = 25;
102SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
103static u_long freevnodes = 0;
104SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
105
106static int reassignbufcalls;
107SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
108static int reassignbufloops;
109SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
110static int reassignbufsortgood;
111SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
112static int reassignbufsortbad;
113SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
114static int reassignbufmethod = 1;
115SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
116
117#ifdef ENABLE_VFS_IOOPT
118int vfs_ioopt = 0;
119SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
120#endif
121
122struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
123struct simplelock mountlist_slock;
124struct simplelock mntvnode_slock;
125int nfs_mount_type = -1;
126#ifndef NULL_SIMPLELOCKS
127static struct simplelock mntid_slock;
128static struct simplelock vnode_free_list_slock;
129static struct simplelock spechash_slock;
130#endif
131struct nfs_public nfs_pub; /* publicly exported FS */
132static vm_zone_t vnode_zone;
133
134/*
135 * The workitem queue.
136 */
137#define SYNCER_MAXDELAY 32
138static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
139time_t syncdelay = 30; /* max time to delay syncing data */
140time_t filedelay = 30; /* time to delay syncing files */
141SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
142time_t dirdelay = 29; /* time to delay syncing directories */
143SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
144time_t metadelay = 28; /* time to delay syncing metadata */
145SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
146static int rushjob; /* number of slots to run ASAP */
147static int stat_rush_requests; /* number of times I/O speeded up */
148SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
149
150static int syncer_delayno = 0;
151static long syncer_mask;
152LIST_HEAD(synclist, vnode);
153static struct synclist *syncer_workitem_pending;
154
155int desiredvnodes;
156SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
157 &desiredvnodes, 0, "Maximum number of vnodes");
158
159static void vfs_free_addrlist __P((struct netexport *nep));
160static int vfs_free_netcred __P((struct radix_node *rn, void *w));
161static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
162 struct export_args *argp));
163
164/*
165 * Initialize the vnode management data structures.
166 */
167void
168vntblinit()
169{
170
171 desiredvnodes = maxproc + cnt.v_page_count / 4;
172 simple_lock_init(&mntvnode_slock);
173 simple_lock_init(&mntid_slock);
174 simple_lock_init(&spechash_slock);
175 TAILQ_INIT(&vnode_free_list);
176 TAILQ_INIT(&vnode_tobefree_list);
177 simple_lock_init(&vnode_free_list_slock);
178 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
179 /*
180 * Initialize the filesystem syncer.
181 */
182 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
183 &syncer_mask);
184 syncer_maxdelay = syncer_mask + 1;
185}
186
187/*
188 * Mark a mount point as busy. Used to synchronize access and to delay
189 * unmounting. Interlock is not released on failure.
190 */
191int
192vfs_busy(mp, flags, interlkp, p)
193 struct mount *mp;
194 int flags;
195 struct simplelock *interlkp;
196 struct proc *p;
197{
198 int lkflags;
199
200 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
201 if (flags & LK_NOWAIT)
202 return (ENOENT);
203 mp->mnt_kern_flag |= MNTK_MWAIT;
204 if (interlkp) {
205 simple_unlock(interlkp);
206 }
207 /*
208 * Since all busy locks are shared except the exclusive
209 * lock granted when unmounting, the only place that a
210 * wakeup needs to be done is at the release of the
211 * exclusive lock at the end of dounmount.
212 */
213 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
214 if (interlkp) {
215 simple_lock(interlkp);
216 }
217 return (ENOENT);
218 }
219 lkflags = LK_SHARED | LK_NOPAUSE;
220 if (interlkp)
221 lkflags |= LK_INTERLOCK;
222 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
223 panic("vfs_busy: unexpected lock failure");
224 return (0);
225}
226
227/*
228 * Free a busy filesystem.
229 */
230void
231vfs_unbusy(mp, p)
232 struct mount *mp;
233 struct proc *p;
234{
235
236 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
237}
238
239/*
240 * Lookup a filesystem type, and if found allocate and initialize
241 * a mount structure for it.
242 *
243 * Devname is usually updated by mount(8) after booting.
244 */
245int
246vfs_rootmountalloc(fstypename, devname, mpp)
247 char *fstypename;
248 char *devname;
249 struct mount **mpp;
250{
251 struct proc *p = curproc; /* XXX */
252 struct vfsconf *vfsp;
253 struct mount *mp;
254
255 if (fstypename == NULL)
256 return (ENODEV);
257 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
258 if (!strcmp(vfsp->vfc_name, fstypename))
259 break;
260 if (vfsp == NULL)
261 return (ENODEV);
262 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
263 bzero((char *)mp, (u_long)sizeof(struct mount));
264 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
265 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
266 LIST_INIT(&mp->mnt_vnodelist);
267 mp->mnt_vfc = vfsp;
268 mp->mnt_op = vfsp->vfc_vfsops;
269 mp->mnt_flag = MNT_RDONLY;
270 mp->mnt_vnodecovered = NULLVP;
271 vfsp->vfc_refcount++;
272 mp->mnt_iosize_max = DFLTPHYS;
273 mp->mnt_stat.f_type = vfsp->vfc_typenum;
274 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
275 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
276 mp->mnt_stat.f_mntonname[0] = '/';
277 mp->mnt_stat.f_mntonname[1] = 0;
278 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
279 *mpp = mp;
280 return (0);
281}
282
283/*
284 * Find an appropriate filesystem to use for the root. If a filesystem
285 * has not been preselected, walk through the list of known filesystems
286 * trying those that have mountroot routines, and try them until one
287 * works or we have tried them all.
288 */
289#ifdef notdef /* XXX JH */
290int
291lite2_vfs_mountroot()
292{
293 struct vfsconf *vfsp;
294 extern int (*lite2_mountroot) __P((void));
295 int error;
296
297 if (lite2_mountroot != NULL)
298 return ((*lite2_mountroot)());
299 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
300 if (vfsp->vfc_mountroot == NULL)
301 continue;
302 if ((error = (*vfsp->vfc_mountroot)()) == 0)
303 return (0);
304 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
305 }
306 return (ENODEV);
307}
308#endif
309
310/*
311 * Lookup a mount point by filesystem identifier.
312 */
313struct mount *
314vfs_getvfs(fsid)
315 fsid_t *fsid;
316{
317 register struct mount *mp;
318
319 simple_lock(&mountlist_slock);
320 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
321 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
322 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
323 simple_unlock(&mountlist_slock);
324 return (mp);
325 }
326 }
327 simple_unlock(&mountlist_slock);
328 return ((struct mount *) 0);
329}
330
331/*
332 * Get a new unique fsid. Try to make its val[0] unique, since this value
333 * will be used to create fake device numbers for stat().  Also try (but
334 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
335 * support 16-bit device numbers. We end up with unique val[0]'s for the
336 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
337 *
338 * Keep in mind that several mounts may be running in parallel. Starting
339 * the search one past where the previous search terminated is both a
340 * micro-optimization and a defense against returning the same fsid to
341 * different mounts.
342 */
343void
344vfs_getnewfsid(mp)
345 struct mount *mp;
346{
347 static u_int16_t mntid_base;
348 fsid_t tfsid;
349 int mtype;
350
351 simple_lock(&mntid_slock);
352 mtype = mp->mnt_vfc->vfc_typenum;
353 tfsid.val[1] = mtype;
354 mtype = (mtype & 0xFF) << 16;
355 for (;;) {
356 tfsid.val[0] = makeudev(255, mtype | mntid_base++);
357 if (vfs_getvfs(&tfsid) == NULL)
358 break;
359 }
360 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
361 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
362 simple_unlock(&mntid_slock);
363}
364
365/*
366 * Knob to control the precision of file timestamps:
367 *
368 * 0 = seconds only; nanoseconds zeroed.
369 * 1 = seconds and nanoseconds, accurate within 1/HZ.
370 * 2 = seconds and nanoseconds, truncated to microseconds.
371 * >=3 = seconds and nanoseconds, maximum precision.
372 */
373enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
374
375static int timestamp_precision = TSP_SEC;
376SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
377 &timestamp_precision, 0, "");
378
379/*
380 * Get a current timestamp.
381 */
382void
383vfs_timestamp(tsp)
384 struct timespec *tsp;
385{
386 struct timeval tv;
387
388 switch (timestamp_precision) {
389 case TSP_SEC:
390 tsp->tv_sec = time_second;
391 tsp->tv_nsec = 0;
392 break;
393 case TSP_HZ:
394 getnanotime(tsp);
395 break;
396 case TSP_USEC:
397 microtime(&tv);
398 TIMEVAL_TO_TIMESPEC(&tv, tsp);
399 break;
400 case TSP_NSEC:
401 default:
402 nanotime(tsp);
403 break;
404 }
405}
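/*
 * Illustrative sketch (not part of the original source): a filesystem
 * that needs to stamp a file's times would call vfs_timestamp() rather
 * than reading the clock directly, so the vfs.timestamp_precision knob
 * above is honored.  The inode structure and field names are
 * hypothetical.
 */
#if 0
struct myinode {			/* hypothetical in-core inode */
	struct timespec i_mtime;
	struct timespec i_ctime;
};

static void
example_mark_modified(struct myinode *ip)
{
	struct timespec ts;

	vfs_timestamp(&ts);	/* precision chosen by timestamp_precision */
	ip->i_mtime = ts;
	ip->i_ctime = ts;
}
#endif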
406
407/*
408 * Set vnode attributes to VNOVAL
409 */
410void
411vattr_null(vap)
412 register struct vattr *vap;
413{
414
415 vap->va_type = VNON;
416 vap->va_size = VNOVAL;
417 vap->va_bytes = VNOVAL;
418 vap->va_mode = VNOVAL;
419 vap->va_nlink = VNOVAL;
420 vap->va_uid = VNOVAL;
421 vap->va_gid = VNOVAL;
422 vap->va_fsid = VNOVAL;
423 vap->va_fileid = VNOVAL;
424 vap->va_blocksize = VNOVAL;
425 vap->va_rdev = VNOVAL;
426 vap->va_atime.tv_sec = VNOVAL;
427 vap->va_atime.tv_nsec = VNOVAL;
428 vap->va_mtime.tv_sec = VNOVAL;
429 vap->va_mtime.tv_nsec = VNOVAL;
430 vap->va_ctime.tv_sec = VNOVAL;
431 vap->va_ctime.tv_nsec = VNOVAL;
432 vap->va_flags = VNOVAL;
433 vap->va_gen = VNOVAL;
434 vap->va_vaflags = 0;
435}
436
437/*
438 * Routines having to do with the management of the vnode table.
439 */
440extern vop_t **dead_vnodeop_p;
441
442/*
443 * Return the next vnode from the free list.
444 */
445int
446getnewvnode(tag, mp, vops, vpp)
447 enum vtagtype tag;
448 struct mount *mp;
449 vop_t **vops;
450 struct vnode **vpp;
451{
452 int s;
453 struct proc *p = curproc; /* XXX */
454 struct vnode *vp, *tvp, *nvp;
455 vm_object_t object;
456 TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
457
458 /*
459 * We take the least recently used vnode from the freelist
460 * if we can get it and it has no cached pages, and no
461 * namecache entries are relative to it.
462 * Otherwise we allocate a new vnode
463 */
464
465 s = splbio();
466 simple_lock(&vnode_free_list_slock);
467 TAILQ_INIT(&vnode_tmp_list);
468
469 for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
470 nvp = TAILQ_NEXT(vp, v_freelist);
471 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
472 if (vp->v_flag & VAGE) {
473 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
474 } else {
475 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
476 }
477 vp->v_flag &= ~(VTBFREE|VAGE);
478 vp->v_flag |= VFREE;
479 if (vp->v_usecount)
480 panic("tobe free vnode isn't");
481 freevnodes++;
482 }
483
484 if (wantfreevnodes && freevnodes < wantfreevnodes) {
485 vp = NULL;
486 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
487 /*
488 * XXX: this is only here to be backwards compatible
489 */
490 vp = NULL;
491 } else {
492 for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
493 nvp = TAILQ_NEXT(vp, v_freelist);
494 if (!simple_lock_try(&vp->v_interlock))
495 continue;
496 if (vp->v_usecount)
497 panic("free vnode isn't");
498
499 object = vp->v_object;
500 if (object && (object->resident_page_count || object->ref_count)) {
501				printf("object inconsistent state: RPC: %d, RC: %d\n",
502 object->resident_page_count, object->ref_count);
503 /* Don't recycle if it's caching some pages */
504 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
505 TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
506 continue;
507 } else if (LIST_FIRST(&vp->v_cache_src)) {
508 /* Don't recycle if active in the namecache */
509 simple_unlock(&vp->v_interlock);
510 continue;
511 } else {
512 break;
513 }
514 }
515 }
516
517 for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
518 nvp = TAILQ_NEXT(tvp, v_freelist);
519 TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
520 TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
521 simple_unlock(&tvp->v_interlock);
522 }
523
524 if (vp) {
525 vp->v_flag |= VDOOMED;
526 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
527 freevnodes--;
528 simple_unlock(&vnode_free_list_slock);
529 cache_purge(vp);
530 vp->v_lease = NULL;
531 if (vp->v_type != VBAD) {
532 vgonel(vp, p);
533 } else {
534 simple_unlock(&vp->v_interlock);
535 }
536
537#ifdef INVARIANTS
538 {
539 int s;
540
541 if (vp->v_data)
542 panic("cleaned vnode isn't");
543 s = splbio();
544 if (vp->v_numoutput)
545 panic("Clean vnode has pending I/O's");
546 splx(s);
547 }
548#endif
549 vp->v_flag = 0;
550 vp->v_lastw = 0;
551 vp->v_lasta = 0;
552 vp->v_cstart = 0;
553 vp->v_clen = 0;
554 vp->v_socket = 0;
555 vp->v_writecount = 0; /* XXX */
556 } else {
557 simple_unlock(&vnode_free_list_slock);
558 vp = (struct vnode *) zalloc(vnode_zone);
559 bzero((char *) vp, sizeof *vp);
560 simple_lock_init(&vp->v_interlock);
561 vp->v_dd = vp;
562 cache_purge(vp);
563 LIST_INIT(&vp->v_cache_src);
564 TAILQ_INIT(&vp->v_cache_dst);
565 numvnodes++;
566 }
567
568 TAILQ_INIT(&vp->v_cleanblkhd);
569 TAILQ_INIT(&vp->v_dirtyblkhd);
570 vp->v_type = VNON;
571 vp->v_tag = tag;
572 vp->v_op = vops;
573 insmntque(vp, mp);
574 *vpp = vp;
575 vp->v_usecount = 1;
576 vp->v_data = 0;
577 splx(s);
578
579 vfs_object_create(vp, p, p->p_ucred);
580 return (0);
581}
582
583/*
584 * Move a vnode from one mount queue to another.
585 */
586static void
587insmntque(vp, mp)
588 register struct vnode *vp;
589 register struct mount *mp;
590{
591
592 simple_lock(&mntvnode_slock);
593 /*
594 * Delete from old mount point vnode list, if on one.
595 */
596 if (vp->v_mount != NULL)
597 LIST_REMOVE(vp, v_mntvnodes);
598 /*
599 * Insert into list of vnodes for the new mount point, if available.
600 */
601 if ((vp->v_mount = mp) == NULL) {
602 simple_unlock(&mntvnode_slock);
603 return;
604 }
605 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
606 simple_unlock(&mntvnode_slock);
607}
608
609/*
610 * Update outstanding I/O count and do wakeup if requested.
611 */
612void
613vwakeup(bp)
614 register struct buf *bp;
615{
616 register struct vnode *vp;
617
618 bp->b_flags &= ~B_WRITEINPROG;
619 if ((vp = bp->b_vp)) {
620 vp->v_numoutput--;
621 if (vp->v_numoutput < 0)
622 panic("vwakeup: neg numoutput");
623 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
624 vp->v_flag &= ~VBWAIT;
625 wakeup((caddr_t) &vp->v_numoutput);
626 }
627 }
628}
629
630/*
631 * Flush out and invalidate all buffers associated with a vnode.
632 * Called with the underlying object locked.
633 */
634int
635vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
636 register struct vnode *vp;
637 int flags;
638 struct ucred *cred;
639 struct proc *p;
640 int slpflag, slptimeo;
641{
642 register struct buf *bp;
643 struct buf *nbp, *blist;
644 int s, error;
645 vm_object_t object;
646
647 if (flags & V_SAVE) {
648 s = splbio();
649 while (vp->v_numoutput) {
650 vp->v_flag |= VBWAIT;
651 error = tsleep((caddr_t)&vp->v_numoutput,
652 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
653 if (error) {
654 splx(s);
655 return (error);
656 }
657 }
658 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
659 splx(s);
660 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
661 return (error);
662 s = splbio();
663 if (vp->v_numoutput > 0 ||
664 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
665 panic("vinvalbuf: dirty bufs");
666 }
667 splx(s);
668 }
669 s = splbio();
670 for (;;) {
671 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
672 if (!blist)
673 blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
674 if (!blist)
675 break;
676
677 for (bp = blist; bp; bp = nbp) {
678 nbp = TAILQ_NEXT(bp, b_vnbufs);
679 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
680 error = BUF_TIMELOCK(bp,
681 LK_EXCLUSIVE | LK_SLEEPFAIL,
682 "vinvalbuf", slpflag, slptimeo);
683 if (error == ENOLCK)
684 break;
685 splx(s);
686 return (error);
687 }
688 /*
689 * XXX Since there are no node locks for NFS, I
690 * believe there is a slight chance that a delayed
691 * write will occur while sleeping just above, so
692 * check for it. Note that vfs_bio_awrite expects
693 * buffers to reside on a queue, while VOP_BWRITE and
694 * brelse do not.
695 */
696 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
697 (flags & V_SAVE)) {
698
699 if (bp->b_vp == vp) {
700 if (bp->b_flags & B_CLUSTEROK) {
701 BUF_UNLOCK(bp);
702 vfs_bio_awrite(bp);
703 } else {
704 bremfree(bp);
705 bp->b_flags |= B_ASYNC;
706 BUF_WRITE(bp);
707 }
708 } else {
709 bremfree(bp);
710 (void) BUF_WRITE(bp);
711 }
712 break;
713 }
714 bremfree(bp);
715 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
716 bp->b_flags &= ~B_ASYNC;
717 brelse(bp);
718 }
719 }
720
721 while (vp->v_numoutput > 0) {
722 vp->v_flag |= VBWAIT;
723 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
724 }
725
726 splx(s);
727
728 /*
729 * Destroy the copy in the VM cache, too.
730 */
731 simple_lock(&vp->v_interlock);
732 object = vp->v_object;
733 if (object != NULL) {
734 vm_object_page_remove(object, 0, 0,
735 (flags & V_SAVE) ? TRUE : FALSE);
736 }
737 simple_unlock(&vp->v_interlock);
738
739 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
740 panic("vinvalbuf: flush failed");
741 return (0);
742}
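/*
 * Illustrative sketch (not part of the original source): the common
 * calling pattern for vinvalbuf().  A path that wants dirty data
 * preserved passes V_SAVE so buffers are flushed before being
 * invalidated, while a path discarding the file's contents passes 0.
 * The function name is hypothetical.
 */
#if 0
static int
example_flush_vnode(struct vnode *vp, struct proc *p, int discard)
{
	/* V_SAVE: write dirty buffers first; 0: just toss everything. */
	return (vinvalbuf(vp, discard ? 0 : V_SAVE, NOCRED, p, 0, 0));
}
#endif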
743
744/*
745 * Truncate a file's buffers and pages to a specified length.  This
746 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
747 * sync activity.
748 */
749int
750vtruncbuf(vp, cred, p, length, blksize)
751 register struct vnode *vp;
752 struct ucred *cred;
753 struct proc *p;
754 off_t length;
755 int blksize;
756{
757 register struct buf *bp;
758 struct buf *nbp;
759 int s, anyfreed;
760 int trunclbn;
761
762 /*
763 * Round up to the *next* lbn.
764 */
765 trunclbn = (length + blksize - 1) / blksize;
766
767 s = splbio();
768restart:
769 anyfreed = 1;
770 for (;anyfreed;) {
771 anyfreed = 0;
772 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
773 nbp = TAILQ_NEXT(bp, b_vnbufs);
774 if (bp->b_lblkno >= trunclbn) {
775 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
776 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
777 goto restart;
778 } else {
779 bremfree(bp);
780 bp->b_flags |= (B_INVAL | B_RELBUF);
781 bp->b_flags &= ~B_ASYNC;
782 brelse(bp);
783 anyfreed = 1;
784 }
785 if (nbp &&
786 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
787 (nbp->b_vp != vp) ||
788 (nbp->b_flags & B_DELWRI))) {
789 goto restart;
790 }
791 }
792 }
793
794 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
795 nbp = TAILQ_NEXT(bp, b_vnbufs);
796 if (bp->b_lblkno >= trunclbn) {
797 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
798 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
799 goto restart;
800 } else {
801 bremfree(bp);
802 bp->b_flags |= (B_INVAL | B_RELBUF);
803 bp->b_flags &= ~B_ASYNC;
804 brelse(bp);
805 anyfreed = 1;
806 }
807 if (nbp &&
808 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
809 (nbp->b_vp != vp) ||
810 (nbp->b_flags & B_DELWRI) == 0)) {
811 goto restart;
812 }
813 }
814 }
815 }
816
817 if (length > 0) {
818restartsync:
819 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
820 nbp = TAILQ_NEXT(bp, b_vnbufs);
821 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
822 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
823 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
824 goto restart;
825 } else {
826 bremfree(bp);
827 if (bp->b_vp == vp) {
828 bp->b_flags |= B_ASYNC;
829 } else {
830 bp->b_flags &= ~B_ASYNC;
831 }
832 BUF_WRITE(bp);
833 }
834 goto restartsync;
835 }
836
837 }
838 }
839
840 while (vp->v_numoutput > 0) {
841 vp->v_flag |= VBWAIT;
842 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
843 }
844
845 splx(s);
846
847 vnode_pager_setsize(vp, length);
848
849 return (0);
850}
851
852/*
853 * Associate a buffer with a vnode.
854 */
855void
856bgetvp(vp, bp)
857 register struct vnode *vp;
858 register struct buf *bp;
859{
860 int s;
861
862 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
863
864 vhold(vp);
865 bp->b_vp = vp;
866 bp->b_dev = vn_todev(vp);
867 /*
868 * Insert onto list for new vnode.
869 */
870 s = splbio();
871 bp->b_xflags |= BX_VNCLEAN;
872 bp->b_xflags &= ~BX_VNDIRTY;
873 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
874 splx(s);
875}
876
877/*
878 * Disassociate a buffer from a vnode.
879 */
880void
881brelvp(bp)
882 register struct buf *bp;
883{
884 struct vnode *vp;
885 struct buflists *listheadp;
886 int s;
887
888 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
889
890 /*
891 * Delete from old vnode list, if on one.
892 */
893 vp = bp->b_vp;
894 s = splbio();
895 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
896 if (bp->b_xflags & BX_VNDIRTY)
897 listheadp = &vp->v_dirtyblkhd;
898 else
899 listheadp = &vp->v_cleanblkhd;
900 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
901 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
902 }
903 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
904 vp->v_flag &= ~VONWORKLST;
905 LIST_REMOVE(vp, v_synclist);
906 }
907 splx(s);
908 bp->b_vp = (struct vnode *) 0;
909 vdrop(vp);
910}
911
912/*
913 * The workitem queue.
914 *
915 * It is useful to delay writes of file data and filesystem metadata
916 * for tens of seconds so that quickly created and deleted files need
917 * not waste disk bandwidth being created and removed. To realize this,
918 * we append vnodes to a "workitem" queue. When running with a soft
919 * updates implementation, most pending metadata dependencies should
920 * not wait for more than a few seconds. Thus, metadata on mounted block
921 * devices is delayed only about half the time that file data is delayed.
922 * Similarly, directory updates are more critical, so they are only delayed
923 * about a third of the time that file data is delayed. Thus, there are
924 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
925 * one each second (driven off the filesystem syncer process). The
926 * syncer_delayno variable indicates the next queue that is to be processed.
927 * Items that need to be processed soon are placed in this queue:
928 *
929 * syncer_workitem_pending[syncer_delayno]
930 *
931 * A delay of fifteen seconds is done by placing the request fifteen
932 * entries later in the queue:
933 *
934 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
935 *
936 */
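/*
 * Worked example (added for illustration, not part of the original
 * comment): with SYNCER_MAXDELAY of 32, hashinit() below yields a
 * 32-entry table, so syncer_mask == 31.  If syncer_delayno is currently
 * 28 and a vnode is queued with a delay of 15 seconds, it lands in slot
 * (28 + 15) & 31 == 11, i.e. the request wraps around the ring and is
 * picked up 15 ticks of the syncer later.
 */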
937
938/*
939 * Add an item to the syncer work queue.
940 */
941static void
942vn_syncer_add_to_worklist(struct vnode *vp, int delay)
943{
944 int s, slot;
945
946 s = splbio();
947
948 if (vp->v_flag & VONWORKLST) {
949 LIST_REMOVE(vp, v_synclist);
950 }
951
952 if (delay > syncer_maxdelay - 2)
953 delay = syncer_maxdelay - 2;
954 slot = (syncer_delayno + delay) & syncer_mask;
955
956 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
957 vp->v_flag |= VONWORKLST;
958 splx(s);
959}
960
961struct proc *updateproc;
962static void sched_sync __P((void));
963static struct kproc_desc up_kp = {
964 "syncer",
965 sched_sync,
966 &updateproc
967};
968SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
969
970/*
971 * System filesystem synchronizer daemon.
972 */
973void
974sched_sync(void)
975{
976 struct synclist *slp;
977 struct vnode *vp;
978 long starttime;
979 int s;
980 struct proc *p = updateproc;
981
982 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
983 SHUTDOWN_PRI_LAST);
984
985 for (;;) {
986 kproc_suspend_loop(p);
987
988 starttime = time_second;
989
990 /*
991 * Push files whose dirty time has expired. Be careful
992 * of interrupt race on slp queue.
993 */
994 s = splbio();
995 slp = &syncer_workitem_pending[syncer_delayno];
996 syncer_delayno += 1;
997 if (syncer_delayno == syncer_maxdelay)
998 syncer_delayno = 0;
999 splx(s);
1000
1001 while ((vp = LIST_FIRST(slp)) != NULL) {
1002 if (VOP_ISLOCKED(vp, NULL) == 0) {
1003 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1004 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
1005 VOP_UNLOCK(vp, 0, p);
1006 }
1007 s = splbio();
1008 if (LIST_FIRST(slp) == vp) {
1009 /*
1010 * Note: v_tag VT_VFS vps can remain on the
1011 * worklist too with no dirty blocks, but
1012				 * since sync_fsync() moves them to a different
1013 * slot we are safe.
1014 */
1015 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1016 !vn_isdisk(vp, NULL))
1017 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1018 /*
1019 * Put us back on the worklist. The worklist
1020 * routine will remove us from our current
1021 * position and then add us back in at a later
1022 * position.
1023 */
1024 vn_syncer_add_to_worklist(vp, syncdelay);
1025 }
1026 splx(s);
1027 }
1028
1029 /*
1030 * Do soft update processing.
1031 */
1032 if (bioops.io_sync)
1033 (*bioops.io_sync)(NULL);
1034
1035 /*
1036 * The variable rushjob allows the kernel to speed up the
1037 * processing of the filesystem syncer process. A rushjob
1038 * value of N tells the filesystem syncer to process the next
1039 * N seconds worth of work on its queue ASAP. Currently rushjob
1040 * is used by the soft update code to speed up the filesystem
1041 * syncer process when the incore state is getting so far
1042 * ahead of the disk that the kernel memory pool is being
1043 * threatened with exhaustion.
1044 */
1045 if (rushjob > 0) {
1046 rushjob -= 1;
1047 continue;
1048 }
1049 /*
1050 * If it has taken us less than a second to process the
1051 * current work, then wait. Otherwise start right over
1052 * again. We can still lose time if any single round
1053 * takes more than two seconds, but it does not really
1054 * matter as we are just trying to generally pace the
1055 * filesystem activity.
1056 */
1057 if (time_second == starttime)
1058 tsleep(&lbolt, PPAUSE, "syncer", 0);
1059 }
1060}
1061
1062/*
1063 * Request the syncer daemon to speed up its work.
1064 * We never push it to speed up more than half of its
1065 * normal turn time; otherwise it could take over the cpu.
1066 */
1067int
1068speedup_syncer()
1069{
1070 int s;
1071
1072 s = splhigh();
1073 if (updateproc->p_wchan == &lbolt)
1074 setrunnable(updateproc);
1075 splx(s);
1076 if (rushjob < syncdelay / 2) {
1077 rushjob += 1;
1078 stat_rush_requests += 1;
1079 return (1);
1080 }
1081 return(0);
1082}
1083
1084/*
1085 * Associate a p-buffer with a vnode.
1086 *
1087 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1088 * with the buffer. i.e. the bp has not been linked into the vnode or
1089 * ref-counted.
1090 */
1091void
1092pbgetvp(vp, bp)
1093 register struct vnode *vp;
1094 register struct buf *bp;
1095{
1096
1097 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1098
1099 bp->b_vp = vp;
1100 bp->b_flags |= B_PAGING;
1101 bp->b_dev = vn_todev(vp);
1102}
1103
1104/*
1105 * Disassociate a p-buffer from a vnode.
1106 */
1107void
1108pbrelvp(bp)
1109 register struct buf *bp;
1110{
1111
1112 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1113
1114 /* XXX REMOVE ME */
1115 if (bp->b_vnbufs.tqe_next != NULL) {
1116 panic(
1117 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1118 bp,
1119 (int)bp->b_flags
1120 );
1121 }
1122 bp->b_vp = (struct vnode *) 0;
1123 bp->b_flags &= ~B_PAGING;
1124}
1125
1126void
1127pbreassignbuf(bp, newvp)
1128 struct buf *bp;
1129 struct vnode *newvp;
1130{
1131 if ((bp->b_flags & B_PAGING) == 0) {
1132 panic(
1133 "pbreassignbuf() on non phys bp %p",
1134 bp
1135 );
1136 }
1137 bp->b_vp = newvp;
1138}
1139
1140/*
1141 * Reassign a buffer from one vnode to another.
1142 * Used to assign file specific control information
1143 * (indirect blocks) to the vnode to which they belong.
1144 */
1145void
1146reassignbuf(bp, newvp)
1147 register struct buf *bp;
1148 register struct vnode *newvp;
1149{
1150 struct buflists *listheadp;
1151 int delay;
1152 int s;
1153
1154 if (newvp == NULL) {
1155		printf("reassignbuf: NULL\n");
1156 return;
1157 }
1158 ++reassignbufcalls;
1159
1160 /*
1161 * B_PAGING flagged buffers cannot be reassigned because their vp
1162 * is not fully linked in.
1163 */
1164 if (bp->b_flags & B_PAGING)
1165 panic("cannot reassign paging buffer");
1166
1167 s = splbio();
1168 /*
1169 * Delete from old vnode list, if on one.
1170 */
1171 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1172 if (bp->b_xflags & BX_VNDIRTY)
1173 listheadp = &bp->b_vp->v_dirtyblkhd;
1174 else
1175 listheadp = &bp->b_vp->v_cleanblkhd;
1176 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1177 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1178 if (bp->b_vp != newvp) {
1179 vdrop(bp->b_vp);
1180 bp->b_vp = NULL; /* for clarification */
1181 }
1182 }
1183 /*
1184 * If dirty, put on list of dirty buffers; otherwise insert onto list
1185 * of clean buffers.
1186 */
1187 if (bp->b_flags & B_DELWRI) {
1188 struct buf *tbp;
1189
1190 listheadp = &newvp->v_dirtyblkhd;
1191 if ((newvp->v_flag & VONWORKLST) == 0) {
1192 switch (newvp->v_type) {
1193 case VDIR:
1194 delay = dirdelay;
1195 break;
1196 case VCHR:
1197 case VBLK:
1198 if (newvp->v_specmountpoint != NULL) {
1199 delay = metadelay;
1200 break;
1201 }
1202 /* fall through */
1203 default:
1204 delay = filedelay;
1205 }
1206 vn_syncer_add_to_worklist(newvp, delay);
1207 }
1208 bp->b_xflags |= BX_VNDIRTY;
1209 tbp = TAILQ_FIRST(listheadp);
1210 if (tbp == NULL ||
1211 bp->b_lblkno == 0 ||
1212 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1213 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1214 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1215 ++reassignbufsortgood;
1216 } else if (bp->b_lblkno < 0) {
1217 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1218 ++reassignbufsortgood;
1219 } else if (reassignbufmethod == 1) {
1220 /*
1221 * New sorting algorithm, only handle sequential case,
1222 * otherwise append to end (but before metadata)
1223 */
1224 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1225 (tbp->b_xflags & BX_VNDIRTY)) {
1226 /*
1227 * Found the best place to insert the buffer
1228 */
1229 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1230 ++reassignbufsortgood;
1231 } else {
1232 /*
1233 * Missed, append to end, but before meta-data.
1234 * We know that the head buffer in the list is
1235 * not meta-data due to prior conditionals.
1236 *
1237 * Indirect effects: NFS second stage write
1238 * tends to wind up here, giving maximum
1239 * distance between the unstable write and the
1240 * commit rpc.
1241 */
1242 tbp = TAILQ_LAST(listheadp, buflists);
1243 while (tbp && tbp->b_lblkno < 0)
1244 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1245 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1246 ++reassignbufsortbad;
1247 }
1248 } else {
1249 /*
1250 * Old sorting algorithm, scan queue and insert
1251 */
1252 struct buf *ttbp;
1253 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1254 (ttbp->b_lblkno < bp->b_lblkno)) {
1255 ++reassignbufloops;
1256 tbp = ttbp;
1257 }
1258 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1259 }
1260 } else {
1261 bp->b_xflags |= BX_VNCLEAN;
1262 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1263 if ((newvp->v_flag & VONWORKLST) &&
1264 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1265 newvp->v_flag &= ~VONWORKLST;
1266 LIST_REMOVE(newvp, v_synclist);
1267 }
1268 }
1269 if (bp->b_vp != newvp) {
1270 bp->b_vp = newvp;
1271 vhold(bp->b_vp);
1272 }
1273 splx(s);
1274}
1275
1276/*
1277 * Create a vnode for a block device.
1278 * Used for mounting the root file system.
1279 */
1280int
1281bdevvp(dev, vpp)
1282 dev_t dev;
1283 struct vnode **vpp;
1284{
1285 register struct vnode *vp;
1286 struct vnode *nvp;
1287 int error;
1288
1289 if (dev == NODEV) {
1290 *vpp = NULLVP;
1291 return (ENXIO);
1292 }
1293 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1294 if (error) {
1295 *vpp = NULLVP;
1296 return (error);
1297 }
1298 vp = nvp;
1299 vp->v_type = VBLK;
1300 addalias(vp, dev);
1301 *vpp = vp;
1302 return (0);
1303}
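
/*
 * Illustrative sketch: obtaining the root device vnode with bdevvp().
 * "rootdev" is the global dev_t chosen by the configuration/boot code;
 * the wrapper function itself is hypothetical.
 */
#if 0
static int
example_get_rootvp(struct vnode **vpp)
{

	/* bdevvp() returns ENXIO itself if rootdev is NODEV */
	return (bdevvp(rootdev, vpp));
}
#endif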
1304
1305/*
1306 * Add vnode to the alias list hung off the dev_t.
1307 *
1308 * The reason for this gunk is that multiple vnodes can reference
1309 * the same physical device, so checking vp->v_usecount to see
1310 * how many users there are is inadequate; the v_usecounts of all
1311 * the aliased vnodes need to be accumulated. vcount() does that.
1312 */
1313void
1314addaliasu(nvp, nvp_rdev)
1315 struct vnode *nvp;
1316 udev_t nvp_rdev;
1317{
1318
1319 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1320 panic("addaliasu on non-special vnode");
1321 addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
1322}
1323
1324void
1325addalias(nvp, dev)
1326 struct vnode *nvp;
1327 dev_t dev;
1328{
1329
1330 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1331 panic("addalias on non-special vnode");
1332
1333 nvp->v_rdev = dev;
1334 simple_lock(&spechash_slock);
1335 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1336 simple_unlock(&spechash_slock);
1337}
1338
1339/*
1340 * Grab a particular vnode from the free list, increment its
1341 * reference count and lock it. The vnode lock bit is set if the
1342 * vnode is being eliminated in vgone. The process is awakened
1343 * when the transition is completed, and an error returned to
1344 * indicate that the vnode is no longer usable (possibly having
1345 * been changed to a new file system type).
1346 */
1347int
1348vget(vp, flags, p)
1349 register struct vnode *vp;
1350 int flags;
1351 struct proc *p;
1352{
1353 int error;
1354
1355 /*
1356 * If the vnode is in the process of being cleaned out for
1357 * another use, we wait for the cleaning to finish and then
1358 * return failure. Cleaning is determined by checking that
1359 * the VXLOCK flag is set.
1360 */
1361 if ((flags & LK_INTERLOCK) == 0) {
1362 simple_lock(&vp->v_interlock);
1363 }
1364 if (vp->v_flag & VXLOCK) {
1365 vp->v_flag |= VXWANT;
1366 simple_unlock(&vp->v_interlock);
1367 tsleep((caddr_t)vp, PINOD, "vget", 0);
1368 return (ENOENT);
1369 }
1370
1371 vp->v_usecount++;
1372
1373 if (VSHOULDBUSY(vp))
1374 vbusy(vp);
1375 if (flags & LK_TYPE_MASK) {
1376 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1377 /*
1378 * must expand vrele here because we do not want
1379 * to call VOP_INACTIVE if the reference count
1380 * drops back to zero since it was never really
1381 * active. We must remove it from the free list
1382 * before sleeping so that multiple processes do
1383 * not try to recycle it.
1384 */
1385 simple_lock(&vp->v_interlock);
1386 vp->v_usecount--;
1387 if (VSHOULDFREE(vp))
1388 vfree(vp);
1389 simple_unlock(&vp->v_interlock);
1390 }
1391 return (error);
1392 }
1393 simple_unlock(&vp->v_interlock);
1394 return (0);
1395}
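
/*
 * Illustrative sketch of the common vget()/vput() pattern for temporarily
 * using a vnode found on some list: acquire a reference and the vnode
 * lock, operate, then unlock and release.  The flag combination shown is
 * one this file itself uses; the wrapper is hypothetical.
 */
#if 0
static int
example_use_vnode(struct vnode *vp, struct proc *p)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE | LK_RETRY, p)) != 0)
		return (error);		/* being cleaned out, look elsewhere */
	/* ... operate on the referenced, locked vnode ... */
	vput(vp);			/* unlock and drop the reference */
	return (0);
}
#endif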
1396
1397void
1398vref(struct vnode *vp)
1399{
1400 simple_lock(&vp->v_interlock);
1401 vp->v_usecount++;
1402 simple_unlock(&vp->v_interlock);
1403}
1404
1405/*
1406 * Vnode put/release.
1407 * If count drops to zero, call inactive routine and return to freelist.
1408 */
1409void
1410vrele(vp)
1411 struct vnode *vp;
1412{
1413 struct proc *p = curproc; /* XXX */
1414
1415 KASSERT(vp != NULL, ("vrele: null vp"));
1416
1417 simple_lock(&vp->v_interlock);
1418
1419 if (vp->v_usecount > 1) {
1420
1421 vp->v_usecount--;
1422 simple_unlock(&vp->v_interlock);
1423
1424 return;
1425 }
1426
1427 if (vp->v_usecount == 1) {
1428
1429 vp->v_usecount--;
1430 if (VSHOULDFREE(vp))
1431 vfree(vp);
1432 /*
1433 * If we are doing a vput, the node is already locked, and we must
1434 * call VOP_INACTIVE with the node locked. So, in the case of
1435 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1436 */
1437 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1438 VOP_INACTIVE(vp, p);
1439 }
1440
1441 } else {
1442#ifdef DIAGNOSTIC
1443 vprint("vrele: negative ref count", vp);
1444 simple_unlock(&vp->v_interlock);
1445#endif
1446 panic("vrele: negative ref cnt");
1447 }
1448}
1449
1450void
1451vput(vp)
1452 struct vnode *vp;
1453{
1454 struct proc *p = curproc; /* XXX */
1455
1456 KASSERT(vp != NULL, ("vput: null vp"));
1457
1458 simple_lock(&vp->v_interlock);
1459
1460 if (vp->v_usecount > 1) {
1461
1462 vp->v_usecount--;
1463 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1464 return;
1465
1466 }
1467
1468 if (vp->v_usecount == 1) {
1469
1470 vp->v_usecount--;
1471 if (VSHOULDFREE(vp))
1472 vfree(vp);
1473		/*
1474		 * In the vput case the vnode is already locked, so we can
1475		 * release the interlock and call VOP_INACTIVE with the
1476		 * vnode locked (vrele, above, must lock it explicitly first).
1477		 */
1478 simple_unlock(&vp->v_interlock);
1479 VOP_INACTIVE(vp, p);
1480
1481 } else {
1482#ifdef DIAGNOSTIC
1483 vprint("vput: negative ref count", vp);
1484#endif
1485 panic("vput: negative ref cnt");
1486 }
1487}
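
/*
 * Illustrative sketch contrasting vref()/vrele() with vget()/vput():
 * the former bracket a long-lived reference without holding the vnode
 * lock in between.  The wrapper is hypothetical.
 */
#if 0
static void
example_hold_reference(struct vnode *vp)
{

	vref(vp);		/* keep vp from being reused while remembered */
	/* ... stash vp somewhere, drop all locks, come back later ... */
	vrele(vp);		/* may inactivate vp if this was the last ref */
}
#endif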
1488
1489/*
1490 * Somebody doesn't want the vnode recycled.
1491 */
1492void
1493vhold(vp)
1494 register struct vnode *vp;
1495{
1496 int s;
1497
1498 s = splbio();
1499 vp->v_holdcnt++;
1500 if (VSHOULDBUSY(vp))
1501 vbusy(vp);
1502 splx(s);
1503}
1504
1505/*
1506 * One less who cares about this vnode.
1507 */
1508void
1509vdrop(vp)
1510 register struct vnode *vp;
1511{
1512 int s;
1513
1514 s = splbio();
1515 if (vp->v_holdcnt <= 0)
1516 panic("vdrop: holdcnt");
1517 vp->v_holdcnt--;
1518 if (VSHOULDFREE(vp))
1519 vfree(vp);
1520 splx(s);
1521}
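
/*
 * Illustrative sketch: vhold()/vdrop() keep a vnode off the free list
 * without counting as a real use, the way reassignbuf() above holds the
 * vnode for as long as a buffer points at it.  The wrapper is
 * hypothetical.
 */
#if 0
static void
example_pin_vnode(struct vnode *vp)
{

	vhold(vp);		/* vnode may not be recycled for now */
	/* ... keep auxiliary data (e.g. a buffer) pointing at vp ... */
	vdrop(vp);		/* allow it to be freed again */
}
#endif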
1522
1523/*
1524 * Remove any vnodes in the vnode table belonging to mount point mp.
1525 *
1526 * If MNT_NOFORCE is specified, there should not be any active ones;
1527 * an error is returned if any are found (nb: this is a user error, not a
1528 * system error). If MNT_FORCE is specified, detach any active vnodes
1529 * that are found.
1530 */
1531#ifdef DIAGNOSTIC
1532static int busyprt = 0; /* print out busy vnodes */
1533SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1534#endif
1535
1536int
1537vflush(mp, skipvp, flags)
1538 struct mount *mp;
1539 struct vnode *skipvp;
1540 int flags;
1541{
1542 struct proc *p = curproc; /* XXX */
1543 struct vnode *vp, *nvp;
1544 int busy = 0;
1545
1546 simple_lock(&mntvnode_slock);
1547loop:
1548 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1549 /*
1550 * Make sure this vnode wasn't reclaimed in getnewvnode().
1551		 * Start over if it was (it won't be on the list anymore).
1552 */
1553 if (vp->v_mount != mp)
1554 goto loop;
1555 nvp = LIST_NEXT(vp, v_mntvnodes);
1556 /*
1557 * Skip over a selected vnode.
1558 */
1559 if (vp == skipvp)
1560 continue;
1561
1562 simple_lock(&vp->v_interlock);
1563 /*
1564		 * Skip over vnodes marked VSYSTEM.
1565 */
1566 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1567 simple_unlock(&vp->v_interlock);
1568 continue;
1569 }
1570 /*
1571 * If WRITECLOSE is set, only flush out regular file vnodes
1572 * open for writing.
1573 */
1574 if ((flags & WRITECLOSE) &&
1575 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1576 simple_unlock(&vp->v_interlock);
1577 continue;
1578 }
1579
1580 /*
1581 * With v_usecount == 0, all we need to do is clear out the
1582 * vnode data structures and we are done.
1583 */
1584 if (vp->v_usecount == 0) {
1585 simple_unlock(&mntvnode_slock);
1586 vgonel(vp, p);
1587 simple_lock(&mntvnode_slock);
1588 continue;
1589 }
1590
1591 /*
1592 * If FORCECLOSE is set, forcibly close the vnode. For block
1593 * or character devices, revert to an anonymous device. For
1594 * all other files, just kill them.
1595 */
1596 if (flags & FORCECLOSE) {
1597 simple_unlock(&mntvnode_slock);
1598 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1599 vgonel(vp, p);
1600 } else {
1601 vclean(vp, 0, p);
1602 vp->v_op = spec_vnodeop_p;
1603 insmntque(vp, (struct mount *) 0);
1604 }
1605 simple_lock(&mntvnode_slock);
1606 continue;
1607 }
1608#ifdef DIAGNOSTIC
1609 if (busyprt)
1610 vprint("vflush: busy vnode", vp);
1611#endif
1612 simple_unlock(&vp->v_interlock);
1613 busy++;
1614 }
1615 simple_unlock(&mntvnode_slock);
1616 if (busy)
1617 return (EBUSY);
1618 return (0);
1619}
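
/*
 * Illustrative sketch of how a filesystem unmount routine typically uses
 * vflush(): skip its own root vnode and flush everything else, forcibly
 * if MNT_FORCE was given.  The flag translation and the wrapper are
 * assumptions, not taken from this file.
 */
#if 0
static int
example_unmount_flush(struct mount *mp, struct vnode *rootvp, int mntflags)
{
	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	return (vflush(mp, rootvp, flags));
}
#endif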
1620
1621/*
1622 * Disassociate the underlying file system from a vnode.
1623 */
1624static void
1625vclean(vp, flags, p)
1626 struct vnode *vp;
1627 int flags;
1628 struct proc *p;
1629{
1630 int active;
1631 vm_object_t obj;
1632
1633 /*
1634 * Check to see if the vnode is in use. If so we have to reference it
1635 * before we clean it out so that its count cannot fall to zero and
1636 * generate a race against ourselves to recycle it.
1637 */
1638 if ((active = vp->v_usecount))
1639 vp->v_usecount++;
1640
1641 /*
1642 * Prevent the vnode from being recycled or brought into use while we
1643 * clean it out.
1644 */
1645 if (vp->v_flag & VXLOCK)
1646 panic("vclean: deadlock");
1647 vp->v_flag |= VXLOCK;
1648 /*
1649 * Even if the count is zero, the VOP_INACTIVE routine may still
1650 * have the object locked while it cleans it out. The VOP_LOCK
1651 * ensures that the VOP_INACTIVE routine is done with its work.
1652 * For active vnodes, it ensures that no other activity can
1653 * occur while the underlying object is being cleaned out.
1654 */
1655 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1656
1657 /*
1658 * Clean out any buffers associated with the vnode.
1659 */
1660 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1661 if ((obj = vp->v_object) != NULL) {
1662 if (obj->ref_count == 0) {
1663 /*
1664 * vclean() may be called twice. The first time removes the
1665 * primary reference to the object, the second time goes
1666			 * one step further and is a special case that terminates the object.
1667 */
1668 vm_object_terminate(obj);
1669 } else {
1670 /*
1671 * Woe to the process that tries to page now :-).
1672 */
1673 vm_pager_deallocate(obj);
1674 }
1675 }
1676
1677 /*
1678 * If purging an active vnode, it must be closed and
1679 * deactivated before being reclaimed. Note that the
1680 * VOP_INACTIVE will unlock the vnode.
1681 */
1682 if (active) {
1683 if (flags & DOCLOSE)
1684 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1685 VOP_INACTIVE(vp, p);
1686 } else {
1687 /*
1688 * Any other processes trying to obtain this lock must first
1689 * wait for VXLOCK to clear, then call the new lock operation.
1690 */
1691 VOP_UNLOCK(vp, 0, p);
1692 }
1693 /*
1694 * Reclaim the vnode.
1695 */
1696 if (VOP_RECLAIM(vp, p))
1697 panic("vclean: cannot reclaim");
1698
1699 if (active) {
1700 /*
1701 * Inline copy of vrele() since VOP_INACTIVE
1702 * has already been called.
1703 */
1704 simple_lock(&vp->v_interlock);
1705 if (--vp->v_usecount <= 0) {
1706#ifdef DIAGNOSTIC
1707 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1708 vprint("vclean: bad ref count", vp);
1709 panic("vclean: ref cnt");
1710 }
1711#endif
1712 vfree(vp);
1713 }
1714 simple_unlock(&vp->v_interlock);
1715 }
1716
1717 cache_purge(vp);
1718 if (vp->v_vnlock) {
1719 FREE(vp->v_vnlock, M_VNODE);
1720 vp->v_vnlock = NULL;
1721 }
1722
1723 if (VSHOULDFREE(vp))
1724 vfree(vp);
1725
1726 /*
1727 * Done with purge, notify sleepers of the grim news.
1728 */
1729 vp->v_op = dead_vnodeop_p;
1730 vn_pollgone(vp);
1731 vp->v_tag = VT_NON;
1732 vp->v_flag &= ~VXLOCK;
1733 if (vp->v_flag & VXWANT) {
1734 vp->v_flag &= ~VXWANT;
1735 wakeup((caddr_t) vp);
1736 }
1737}
1738
1739/*
1740 * Eliminate all activity associated with the requested vnode
1741 * and with all vnodes aliased to the requested vnode.
1742 */
1743int
1744vop_revoke(ap)
1745 struct vop_revoke_args /* {
1746 struct vnode *a_vp;
1747 int a_flags;
1748 } */ *ap;
1749{
1750 struct vnode *vp, *vq;
1751 dev_t dev;
1752
1753 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1754
1755 vp = ap->a_vp;
1756 /*
1757 * If a vgone (or vclean) is already in progress,
1758 * wait until it is done and return.
1759 */
1760 if (vp->v_flag & VXLOCK) {
1761 vp->v_flag |= VXWANT;
1762 simple_unlock(&vp->v_interlock);
1763 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1764 return (0);
1765 }
1766 dev = vp->v_rdev;
1767 for (;;) {
1768 simple_lock(&spechash_slock);
1769 vq = SLIST_FIRST(&dev->si_hlist);
1770 simple_unlock(&spechash_slock);
1771 if (!vq)
1772 break;
1773 vgone(vq);
1774 }
1775 return (0);
1776}
1777
1778/*
1779 * Recycle an unused vnode to the front of the free list.
1780 * Release the passed interlock if the vnode will be recycled.
1781 */
1782int
1783vrecycle(vp, inter_lkp, p)
1784 struct vnode *vp;
1785 struct simplelock *inter_lkp;
1786 struct proc *p;
1787{
1788
1789 simple_lock(&vp->v_interlock);
1790 if (vp->v_usecount == 0) {
1791 if (inter_lkp) {
1792 simple_unlock(inter_lkp);
1793 }
1794 vgonel(vp, p);
1795 return (1);
1796 }
1797 simple_unlock(&vp->v_interlock);
1798 return (0);
1799}
1800
1801/*
1802 * Eliminate all activity associated with a vnode
1803 * in preparation for reuse.
1804 */
1805void
1806vgone(vp)
1807 register struct vnode *vp;
1808{
1809 struct proc *p = curproc; /* XXX */
1810
1811 simple_lock(&vp->v_interlock);
1812 vgonel(vp, p);
1813}
1814
1815/*
1816 * vgone, with the vp interlock held.
1817 */
1818void
1819vgonel(vp, p)
1820 struct vnode *vp;
1821 struct proc *p;
1822{
1823 int s;
1824
1825 /*
1826 * If a vgone (or vclean) is already in progress,
1827 * wait until it is done and return.
1828 */
1829 if (vp->v_flag & VXLOCK) {
1830 vp->v_flag |= VXWANT;
1831 simple_unlock(&vp->v_interlock);
1832 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1833 return;
1834 }
1835
1836 /*
1837 * Clean out the filesystem specific data.
1838 */
1839 vclean(vp, DOCLOSE, p);
1840 simple_lock(&vp->v_interlock);
1841
1842 /*
1843 * Delete from old mount point vnode list, if on one.
1844 */
1845 if (vp->v_mount != NULL)
1846 insmntque(vp, (struct mount *)0);
1847 /*
1848 * If special device, remove it from special device alias list
1849 * if it is on one.
1850 */
1851 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1852 simple_lock(&spechash_slock);
1853 SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
1854 freedev(vp->v_rdev);
1855 simple_unlock(&spechash_slock);
1856 vp->v_rdev = NULL;
1857 }
1858
1859 /*
1860 * If it is on the freelist and not already at the head,
1861 * move it to the head of the list. The test of the back
1862 * pointer and the reference count of zero is because
1863 * it will be removed from the free list by getnewvnode,
1864 * but will not have its reference count incremented until
1865 * after calling vgone. If the reference count were
1866 * incremented first, vgone would (incorrectly) try to
1867 * close the previous instance of the underlying object.
1868 */
1869 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1870 s = splbio();
1871 simple_lock(&vnode_free_list_slock);
1872 if (vp->v_flag & VFREE) {
1873 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1874 } else if (vp->v_flag & VTBFREE) {
1875 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1876 vp->v_flag &= ~VTBFREE;
1877 freevnodes++;
1878 } else
1879 freevnodes++;
1880 vp->v_flag |= VFREE;
1881 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1882 simple_unlock(&vnode_free_list_slock);
1883 splx(s);
1884 }
1885
1886 vp->v_type = VBAD;
1887 simple_unlock(&vp->v_interlock);
1888}
1889
1890/*
1891 * Lookup a vnode by device number.
1892 */
1893int
1894vfinddev(dev, type, vpp)
1895 dev_t dev;
1896 enum vtype type;
1897 struct vnode **vpp;
1898{
1899 struct vnode *vp;
1900
1901 simple_lock(&spechash_slock);
1902 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1903 if (type == vp->v_type) {
1904 *vpp = vp;
1905 simple_unlock(&spechash_slock);
1906 return (1);
1907 }
1908 }
1909 simple_unlock(&spechash_slock);
1910 return (0);
1911}
1912
1913/*
1914 * Calculate the total number of references to a special device.
1915 */
1916int
1917vcount(vp)
1918 struct vnode *vp;
1919{
1920 struct vnode *vq;
1921 int count;
1922
1923 count = 0;
1924 simple_lock(&spechash_slock);
1925 SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1926 count += vq->v_usecount;
1927 simple_unlock(&spechash_slock);
1928 return (count);
1929}
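
/*
 * Illustrative sketch of why vcount() is used instead of v_usecount:
 * a device close routine wants the reference total across all aliased
 * vnodes before deciding it is handling the last close.  The helper is
 * hypothetical.
 */
#if 0
static int
example_is_last_close(struct vnode *vp)
{

	return (vcount(vp) == 1);	/* only our own reference remains */
}
#endif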
1930
1931/*
1932 * Same as above, but using the dev_t as the argument.
1933 */
1934
1935int
1936count_dev(dev)
1937 dev_t dev;
1938{
1939 struct vnode *vp;
1940
1941 vp = SLIST_FIRST(&dev->si_hlist);
1942 if (vp == NULL)
1943 return (0);
1944 return(vcount(vp));
1945}
1946
1947/*
1948 * Print out a description of a vnode.
1949 */
1950static char *typename[] =
1951{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1952
1953void
1954vprint(label, vp)
1955 char *label;
1956 struct vnode *vp;
1957{
1958 char buf[96];
1959
1960 if (label != NULL)
1961 printf("%s: %p: ", label, (void *)vp);
1962 else
1963 printf("%p: ", (void *)vp);
1964 printf("type %s, usecount %d, writecount %d, refcount %d,",
1965 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1966 vp->v_holdcnt);
1967 buf[0] = '\0';
1968 if (vp->v_flag & VROOT)
1969 strcat(buf, "|VROOT");
1970 if (vp->v_flag & VTEXT)
1971 strcat(buf, "|VTEXT");
1972 if (vp->v_flag & VSYSTEM)
1973 strcat(buf, "|VSYSTEM");
1974 if (vp->v_flag & VXLOCK)
1975 strcat(buf, "|VXLOCK");
1976 if (vp->v_flag & VXWANT)
1977 strcat(buf, "|VXWANT");
1978 if (vp->v_flag & VBWAIT)
1979 strcat(buf, "|VBWAIT");
1980 if (vp->v_flag & VDOOMED)
1981 strcat(buf, "|VDOOMED");
1982 if (vp->v_flag & VFREE)
1983 strcat(buf, "|VFREE");
1984 if (vp->v_flag & VOBJBUF)
1985 strcat(buf, "|VOBJBUF");
1986 if (buf[0] != '\0')
1987 printf(" flags (%s)", &buf[1]);
1988 if (vp->v_data == NULL) {
1989 printf("\n");
1990 } else {
1991 printf("\n\t");
1992 VOP_PRINT(vp);
1993 }
1994}
1995
1996#ifdef DDB
1997#include <ddb/ddb.h>
1998/*
1999 * List all of the locked vnodes in the system.
2000 * Called when debugging the kernel.
2001 */
2002DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2003{
2004 struct proc *p = curproc; /* XXX */
2005 struct mount *mp, *nmp;
2006 struct vnode *vp;
2007
2008 printf("Locked vnodes\n");
2009 simple_lock(&mountlist_slock);
2010 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2011 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2012 nmp = TAILQ_NEXT(mp, mnt_list);
2013 continue;
2014 }
2015 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2016 if (VOP_ISLOCKED(vp, NULL))
2017 vprint((char *)0, vp);
2018 }
2019 simple_lock(&mountlist_slock);
2020 nmp = TAILQ_NEXT(mp, mnt_list);
2021 vfs_unbusy(mp, p);
2022 }
2023 simple_unlock(&mountlist_slock);
2024}
2025#endif
2026
2027/*
2028 * Top level filesystem related information gathering.
2029 */
2030static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
2031
2032static int
2033vfs_sysctl SYSCTL_HANDLER_ARGS
2034{
2035 int *name = (int *)arg1 - 1; /* XXX */
2036 u_int namelen = arg2 + 1; /* XXX */
2037 struct vfsconf *vfsp;
2038
2039#if 1 || defined(COMPAT_PRELITE2)
2040 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2041 if (namelen == 1)
2042 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2043#endif
2044
2045#ifdef notyet
2046 /* all sysctl names at this level are at least name and field */
2047 if (namelen < 2)
2048 return (ENOTDIR); /* overloaded */
2049 if (name[0] != VFS_GENERIC) {
2050 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2051 if (vfsp->vfc_typenum == name[0])
2052 break;
2053 if (vfsp == NULL)
2054 return (EOPNOTSUPP);
2055 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2056 oldp, oldlenp, newp, newlen, p));
2057 }
2058#endif
2059 switch (name[1]) {
2060 case VFS_MAXTYPENUM:
2061 if (namelen != 2)
2062 return (ENOTDIR);
2063 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2064 case VFS_CONF:
2065 if (namelen != 3)
2066 return (ENOTDIR); /* overloaded */
2067 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2068 if (vfsp->vfc_typenum == name[2])
2069 break;
2070 if (vfsp == NULL)
2071 return (EOPNOTSUPP);
2072 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2073 }
2074 return (EOPNOTSUPP);
2075}
2076
2077SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2078 "Generic filesystem");
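
/*
 * Illustrative sketch (userland, not kernel code): querying the handler
 * above through sysctl(3) for the highest filesystem type number,
 * i.e. the mib vfs.generic.maxtypenum.
 */
#if 0
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/sysctl.h>

static int
example_vfs_maxtypenum(void)
{
	int mib[3], maxtype;
	size_t len = sizeof(maxtype);

	mib[0] = CTL_VFS;
	mib[1] = VFS_GENERIC;
	mib[2] = VFS_MAXTYPENUM;
	if (sysctl(mib, 3, &maxtype, &len, NULL, 0) == -1)
		return (-1);
	return (maxtype);
}
#endif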
2079
2080#if 1 || defined(COMPAT_PRELITE2)
2081
2082static int
2083sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2084{
2085 int error;
2086 struct vfsconf *vfsp;
2087 struct ovfsconf ovfs;
2088
2089 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2090 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2091 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2092 ovfs.vfc_index = vfsp->vfc_typenum;
2093 ovfs.vfc_refcount = vfsp->vfc_refcount;
2094 ovfs.vfc_flags = vfsp->vfc_flags;
2095 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2096 if (error)
2097 return error;
2098 }
2099 return 0;
2100}
2101
2102#endif /* 1 || COMPAT_PRELITE2 */
2103
2104#if 0
2105#define KINFO_VNODESLOP 10
2106/*
2107 * Dump vnode list (via sysctl).
2108 * Copyout address of vnode followed by vnode.
2109 */
2110/* ARGSUSED */
2111static int
2112sysctl_vnode SYSCTL_HANDLER_ARGS
2113{
2114 struct proc *p = curproc; /* XXX */
2115 struct mount *mp, *nmp;
2116 struct vnode *nvp, *vp;
2117 int error;
2118
2119#define VPTRSZ sizeof (struct vnode *)
2120#define VNODESZ sizeof (struct vnode)
2121
2122 req->lock = 0;
2123 if (!req->oldptr) /* Make an estimate */
2124 return (SYSCTL_OUT(req, 0,
2125 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2126
2127 simple_lock(&mountlist_slock);
2128 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2129 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2130 nmp = TAILQ_NEXT(mp, mnt_list);
2131 continue;
2132 }
2133again:
2134 simple_lock(&mntvnode_slock);
2135 for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2136 vp != NULL;
2137 vp = nvp) {
2138 /*
2139 * Check that the vp is still associated with
2140 * this filesystem. RACE: could have been
2141 * recycled onto the same filesystem.
2142 */
2143 if (vp->v_mount != mp) {
2144 simple_unlock(&mntvnode_slock);
2145 goto again;
2146 }
2147 nvp = LIST_NEXT(vp, v_mntvnodes);
2148 simple_unlock(&mntvnode_slock);
2149 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2150 (error = SYSCTL_OUT(req, vp, VNODESZ)))
2151 return (error);
2152 simple_lock(&mntvnode_slock);
2153 }
2154 simple_unlock(&mntvnode_slock);
2155 simple_lock(&mountlist_slock);
2156 nmp = TAILQ_NEXT(mp, mnt_list);
2157 vfs_unbusy(mp, p);
2158 }
2159 simple_unlock(&mountlist_slock);
2160
2161 return (0);
2162}
2163#endif
2164
2165/*
2166 * XXX
2167 * Exporting the vnode list on large systems causes them to crash.
2168 * Exporting the vnode list on medium systems causes sysctl to coredump.
2169 */
2170#if 0
2171SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2172 0, 0, sysctl_vnode, "S,vnode", "");
2173#endif
2174
2175/*
2176 * Check to see if a filesystem is mounted on a block device.
2177 */
2178int
2179vfs_mountedon(vp)
2180 struct vnode *vp;
2181{
2182
2183 if (vp->v_specmountpoint != NULL)
2184 return (EBUSY);
2185 return (0);
2186}
2187
2188/*
2189 * Unmount all filesystems. The list is traversed in reverse order
2190 * of mounting to avoid dependencies.
2191 */
2192void
2193vfs_unmountall()
2194{
2195 struct mount *mp;
2196 struct proc *p;
2197 int error;
2198
2199 if (curproc != NULL)
2200 p = curproc;
2201 else
2202 p = initproc; /* XXX XXX should this be proc0? */
2203 /*
2204 * Since this only runs when rebooting, it is not interlocked.
2205 */
2206 while(!TAILQ_EMPTY(&mountlist)) {
2207 mp = TAILQ_LAST(&mountlist, mntlist);
2208 error = dounmount(mp, MNT_FORCE, p);
2209 if (error) {
2210 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2211 printf("unmount of %s failed (",
2212 mp->mnt_stat.f_mntonname);
2213 if (error == EBUSY)
2214 printf("BUSY)\n");
2215 else
2216 printf("%d)\n", error);
2217 } else {
2218 /* The unmount has removed mp from the mountlist */
2219 }
2220 }
2221}
2222
2223/*
2224 * Build hash lists of net addresses and hang them off the mount point.
2225 * Called by ufs_mount() to set up the lists of export addresses.
2226 */
2227static int
2228vfs_hang_addrlist(mp, nep, argp)
2229 struct mount *mp;
2230 struct netexport *nep;
2231 struct export_args *argp;
2232{
2233 register struct netcred *np;
2234 register struct radix_node_head *rnh;
2235 register int i;
2236 struct radix_node *rn;
2237 struct sockaddr *saddr, *smask = 0;
2238 struct domain *dom;
2239 int error;
2240
2241 if (argp->ex_addrlen == 0) {
2242 if (mp->mnt_flag & MNT_DEFEXPORTED)
2243 return (EPERM);
2244 np = &nep->ne_defexported;
2245 np->netc_exflags = argp->ex_flags;
2246 np->netc_anon = argp->ex_anon;
2247 np->netc_anon.cr_ref = 1;
2248 mp->mnt_flag |= MNT_DEFEXPORTED;
2249 return (0);
2250 }
2251 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2252 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2253 bzero((caddr_t) np, i);
2254 saddr = (struct sockaddr *) (np + 1);
2255 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2256 goto out;
2257 if (saddr->sa_len > argp->ex_addrlen)
2258 saddr->sa_len = argp->ex_addrlen;
2259 if (argp->ex_masklen) {
2260 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2261 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2262 if (error)
2263 goto out;
2264 if (smask->sa_len > argp->ex_masklen)
2265 smask->sa_len = argp->ex_masklen;
2266 }
2267 i = saddr->sa_family;
2268 if ((rnh = nep->ne_rtable[i]) == 0) {
2269 /*
2270		 * Seems silly to initialize every AF when most are not used;
2271		 * do so on demand here.
2272 */
2273 for (dom = domains; dom; dom = dom->dom_next)
2274 if (dom->dom_family == i && dom->dom_rtattach) {
2275 dom->dom_rtattach((void **) &nep->ne_rtable[i],
2276 dom->dom_rtoffset);
2277 break;
2278 }
2279 if ((rnh = nep->ne_rtable[i]) == 0) {
2280 error = ENOBUFS;
2281 goto out;
2282 }
2283 }
2284 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2285 np->netc_rnodes);
2286 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
2287 error = EPERM;
2288 goto out;
2289 }
2290 np->netc_exflags = argp->ex_flags;
2291 np->netc_anon = argp->ex_anon;
2292 np->netc_anon.cr_ref = 1;
2293 return (0);
2294out:
2295 free(np, M_NETADDR);
2296 return (error);
2297}
2298
2299/* ARGSUSED */
2300static int
2301vfs_free_netcred(rn, w)
2302 struct radix_node *rn;
2303 void *w;
2304{
2305 register struct radix_node_head *rnh = (struct radix_node_head *) w;
2306
2307 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2308 free((caddr_t) rn, M_NETADDR);
2309 return (0);
2310}
2311
2312/*
2313 * Free the net address hash lists that are hanging off the mount points.
2314 */
2315static void
2316vfs_free_addrlist(nep)
2317 struct netexport *nep;
2318{
2319 register int i;
2320 register struct radix_node_head *rnh;
2321
2322 for (i = 0; i <= AF_MAX; i++)
2323 if ((rnh = nep->ne_rtable[i])) {
2324 (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2325 (caddr_t) rnh);
2326 free((caddr_t) rnh, M_RTABLE);
2327 nep->ne_rtable[i] = 0;
2328 }
2329}
2330
2331int
2332vfs_export(mp, nep, argp)
2333 struct mount *mp;
2334 struct netexport *nep;
2335 struct export_args *argp;
2336{
2337 int error;
2338
2339 if (argp->ex_flags & MNT_DELEXPORT) {
2340 if (mp->mnt_flag & MNT_EXPUBLIC) {
2341 vfs_setpublicfs(NULL, NULL, NULL);
2342 mp->mnt_flag &= ~MNT_EXPUBLIC;
2343 }
2344 vfs_free_addrlist(nep);
2345 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2346 }
2347 if (argp->ex_flags & MNT_EXPORTED) {
2348 if (argp->ex_flags & MNT_EXPUBLIC) {
2349 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2350 return (error);
2351 mp->mnt_flag |= MNT_EXPUBLIC;
2352 }
2353 if ((error = vfs_hang_addrlist(mp, nep, argp)))
2354 return (error);
2355 mp->mnt_flag |= MNT_EXPORTED;
2356 }
2357 return (0);
2358}
2359
2360
2361/*
2362 * Set the publicly exported filesystem (WebNFS). Currently, only
2363 * one public filesystem is possible in the spec (RFC 2054 and RFC 2055).
2364 */
2365int
2366vfs_setpublicfs(mp, nep, argp)
2367 struct mount *mp;
2368 struct netexport *nep;
2369 struct export_args *argp;
2370{
2371 int error;
2372 struct vnode *rvp;
2373 char *cp;
2374
2375 /*
2376 * mp == NULL -> invalidate the current info, the FS is
2377 * no longer exported. May be called from either vfs_export
2378 * or unmount, so check if it hasn't already been done.
2379 */
2380 if (mp == NULL) {
2381 if (nfs_pub.np_valid) {
2382 nfs_pub.np_valid = 0;
2383 if (nfs_pub.np_index != NULL) {
2384 FREE(nfs_pub.np_index, M_TEMP);
2385 nfs_pub.np_index = NULL;
2386 }
2387 }
2388 return (0);
2389 }
2390
2391 /*
2392 * Only one allowed at a time.
2393 */
2394 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2395 return (EBUSY);
2396
2397 /*
2398 * Get real filehandle for root of exported FS.
2399 */
2400 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2401 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2402
2403 if ((error = VFS_ROOT(mp, &rvp)))
2404 return (error);
2405
2406 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2407 return (error);
2408
2409 vput(rvp);
2410
2411 /*
2412 * If an indexfile was specified, pull it in.
2413 */
2414 if (argp->ex_indexfile != NULL) {
2415 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2416 M_WAITOK);
2417 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2418 MAXNAMLEN, (size_t *)0);
2419 if (!error) {
2420 /*
2421 * Check for illegal filenames.
2422 */
2423 for (cp = nfs_pub.np_index; *cp; cp++) {
2424 if (*cp == '/') {
2425 error = EINVAL;
2426 break;
2427 }
2428 }
2429 }
2430 if (error) {
2431 FREE(nfs_pub.np_index, M_TEMP);
2432 return (error);
2433 }
2434 }
2435
2436 nfs_pub.np_mount = mp;
2437 nfs_pub.np_valid = 1;
2438 return (0);
2439}
2440
2441struct netcred *
2442vfs_export_lookup(mp, nep, nam)
2443 register struct mount *mp;
2444 struct netexport *nep;
2445 struct sockaddr *nam;
2446{
2447 register struct netcred *np;
2448 register struct radix_node_head *rnh;
2449 struct sockaddr *saddr;
2450
2451 np = NULL;
2452 if (mp->mnt_flag & MNT_EXPORTED) {
2453 /*
2454 * Lookup in the export list first.
2455 */
2456 if (nam != NULL) {
2457 saddr = nam;
2458 rnh = nep->ne_rtable[saddr->sa_family];
2459 if (rnh != NULL) {
2460 np = (struct netcred *)
2461 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2462 rnh);
2463 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2464 np = NULL;
2465 }
2466 }
2467 /*
2468 * If no address match, use the default if it exists.
2469 */
2470 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2471 np = &nep->ne_defexported;
2472 }
2473 return (np);
2474}
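
/*
 * Illustrative sketch of how an NFS-style server consumes
 * vfs_export_lookup(): reject the request unless the client address
 * matches an export entry, then use that entry's anonymous credentials.
 * The wrapper and the EACCES policy are assumptions.
 */
#if 0
static int
example_check_export(struct mount *mp, struct netexport *nep,
    struct sockaddr *nam, struct ucred **credp)
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* not exported to this host/net */
	*credp = &np->netc_anon;	/* mapped credentials for this net */
	return (0);
}
#endif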
2475
2476/*
2477 * Perform msync on all vnodes under a mount point.
2478 * The mount point must be locked.
2479 */
2480void
2481vfs_msync(struct mount *mp, int flags) {
2482 struct vnode *vp, *nvp;
2483 struct vm_object *obj;
2484 int anyio, tries;
2485
2486 tries = 5;
2487loop:
2488 anyio = 0;
2489 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2490
2491 nvp = LIST_NEXT(vp, v_mntvnodes);
2492
2493 if (vp->v_mount != mp) {
2494 goto loop;
2495 }
2496
2497 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
2498 continue;
2499
2500 if (flags != MNT_WAIT) {
2501 obj = vp->v_object;
2502 if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2503 continue;
2504 if (VOP_ISLOCKED(vp, NULL))
2505 continue;
2506 }
2507
2508 simple_lock(&vp->v_interlock);
2509 if (vp->v_object &&
2510 (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2511 if (!vget(vp,
2512 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2513 if (vp->v_object) {
2514 vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2515 anyio = 1;
2516 }
2517 vput(vp);
2518 }
2519 } else {
2520 simple_unlock(&vp->v_interlock);
2521 }
2522 }
2523 if (anyio && (--tries > 0))
2524 goto loop;
2525}
2526
2527/*
2528 * Create the VM object needed for VMIO and mmap support. This
2529 * is done for all VREG files in the system. Some filesystems might
2530 * take advantage of the additional metadata buffering capability of the
2531 * VMIO code by also making the device node VMIO-backed.
2532 *
2533 * vp must be locked when vfs_object_create is called.
2534 */
2535int
2536vfs_object_create(vp, p, cred)
2537 struct vnode *vp;
2538 struct proc *p;
2539 struct ucred *cred;
2540{
2541 struct vattr vat;
2542 vm_object_t object;
2543 int error = 0;
2544
2545 if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
2546 return 0;
2547
2548retry:
2549 if ((object = vp->v_object) == NULL) {
2550 if (vp->v_type == VREG || vp->v_type == VDIR) {
2551 if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2552 goto retn;
2553 object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2554 } else if (devsw(vp->v_rdev) != NULL) {
2555 /*
2556 * This simply allocates the biggest object possible
2557 * for a disk vnode. This should be fixed, but doesn't
2558 * cause any problems (yet).
2559 */
2560 object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2561 } else {
2562 goto retn;
2563 }
2564 /*
2565 * Dereference the reference we just created. This assumes
2566 * that the object is associated with the vp.
2567 */
2568 object->ref_count--;
2569 vp->v_usecount--;
2570 } else {
2571 if (object->flags & OBJ_DEAD) {
2572 VOP_UNLOCK(vp, 0, p);
2573 tsleep(object, PVM, "vodead", 0);
2574 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2575 goto retry;
2576 }
2577 }
2578
2579 KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2580 vp->v_flag |= VOBJBUF;
2581
2582retn:
2583 return error;
2584}
2585
2586static void
2587vfree(vp)
2588 struct vnode *vp;
2589{
2590 int s;
2591
2592 s = splbio();
2593 simple_lock(&vnode_free_list_slock);
2594 if (vp->v_flag & VTBFREE) {
2595 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2596 vp->v_flag &= ~VTBFREE;
2597 }
2598 if (vp->v_flag & VAGE) {
2599 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2600 } else {
2601 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2602 }
2603 freevnodes++;
2604 simple_unlock(&vnode_free_list_slock);
2605 vp->v_flag &= ~VAGE;
2606 vp->v_flag |= VFREE;
2607 splx(s);
2608}
2609
2610void
2611vbusy(vp)
2612 struct vnode *vp;
2613{
2614 int s;
2615
2616 s = splbio();
2617 simple_lock(&vnode_free_list_slock);
2618 if (vp->v_flag & VTBFREE) {
2619 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2620 vp->v_flag &= ~VTBFREE;
2621 } else {
2622 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2623 freevnodes--;
2624 }
2625 simple_unlock(&vnode_free_list_slock);
2626 vp->v_flag &= ~(VFREE|VAGE);
2627 splx(s);
2628}
2629
2630/*
2631 * Record a process's interest in events which might happen to
2632 * a vnode. Because poll uses the historic select-style interface
2633 * internally, this routine serves as both the ``check for any
2634 * pending events'' and the ``record my interest in future events''
2635 * functions. (These are done together, while the lock is held,
2636 * to avoid race conditions.)
2637 */
2638int
2639vn_pollrecord(vp, p, events)
2640 struct vnode *vp;
2641 struct proc *p;
2642 short events;
2643{
2644 simple_lock(&vp->v_pollinfo.vpi_lock);
2645 if (vp->v_pollinfo.vpi_revents & events) {
2646 /*
2647 * This leaves events we are not interested
2648		 * in available for the other process which
2649		 * presumably had requested them
2650 * (otherwise they would never have been
2651 * recorded).
2652 */
2653 events &= vp->v_pollinfo.vpi_revents;
2654 vp->v_pollinfo.vpi_revents &= ~events;
2655
2656 simple_unlock(&vp->v_pollinfo.vpi_lock);
2657 return events;
2658 }
2659 vp->v_pollinfo.vpi_events |= events;
2660 selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2661 simple_unlock(&vp->v_pollinfo.vpi_lock);
2662 return 0;
2663}
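
/*
 * Illustrative sketch: a filesystem poll routine with no pending events
 * of its own can simply defer to vn_pollrecord().  The vop_poll_args
 * member names used here are the conventional ones and are assumed.
 */
#if 0
static int
example_vop_poll(struct vop_poll_args *ap)
{

	/* nothing ready now; record interest, report nothing */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}
#endif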
2664
2665/*
2666 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
2667 * it is possible for us to miss an event due to race conditions, but
2668 * that condition is expected to be rare, so for the moment it is the
2669 * preferred interface.
2670 */
2671void
2672vn_pollevent(vp, events)
2673 struct vnode *vp;
2674 short events;
2675{
2676 simple_lock(&vp->v_pollinfo.vpi_lock);
2677 if (vp->v_pollinfo.vpi_events & events) {
2678 /*
2679 * We clear vpi_events so that we don't
2680 * call selwakeup() twice if two events are
2681 * posted before the polling process(es) is
2682 * awakened. This also ensures that we take at
2683 * most one selwakeup() if the polling process
2684 * is no longer interested. However, it does
2685 * mean that only one event can be noticed at
2686 * a time. (Perhaps we should only clear those
2687 * event bits which we note?) XXX
2688 */
2689 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
2690 vp->v_pollinfo.vpi_revents |= events;
2691 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2692 }
2693 simple_unlock(&vp->v_pollinfo.vpi_lock);
2694}
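
/*
 * Illustrative sketch of the producer side: after appending data, a
 * filesystem (or fifo/pipe layer) would note readability.  The event
 * mask and the call site are assumptions.
 */
#if 0
static void
example_note_readable(struct vnode *vp)
{

	vn_pollevent(vp, POLLIN | POLLRDNORM);
}
#endif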
2695
2696/*
2697 * Wake up anyone polling on vp because it is being revoked.
2698 * This depends on dead_poll() returning POLLHUP for correct
2699 * behavior.
2700 */
2701void
2702vn_pollgone(vp)
2703 struct vnode *vp;
2704{
2705 simple_lock(&vp->v_pollinfo.vpi_lock);
2706 if (vp->v_pollinfo.vpi_events) {
2707 vp->v_pollinfo.vpi_events = 0;
2708 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2709 }
2710 simple_unlock(&vp->v_pollinfo.vpi_lock);
2711}
2712
2713
2714
2715/*
2716 * Routine to create and manage a filesystem syncer vnode.
2717 */
2718#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
2719static int sync_fsync __P((struct vop_fsync_args *));
2720static int sync_inactive __P((struct vop_inactive_args *));
2721static int sync_reclaim __P((struct vop_reclaim_args *));
2722#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
2723#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
2724static int sync_print __P((struct vop_print_args *));
2725#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2726
2727static vop_t **sync_vnodeop_p;
2728static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2729 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
2730 { &vop_close_desc, (vop_t *) sync_close }, /* close */
2731 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
2732 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
2733 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
2734 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
2735 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
2736 { &vop_print_desc, (vop_t *) sync_print }, /* print */
2737 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
2738 { NULL, NULL }
2739};
2740static struct vnodeopv_desc sync_vnodeop_opv_desc =
2741 { &sync_vnodeop_p, sync_vnodeop_entries };
2742
2743VNODEOP_SET(sync_vnodeop_opv_desc);
2744
2745/*
2746 * Create a new filesystem syncer vnode for the specified mount point.
2747 */
2748int
2749vfs_allocate_syncvnode(mp)
2750 struct mount *mp;
2751{
2752 struct vnode *vp;
2753 static long start, incr, next;
2754 int error;
2755
2756 /* Allocate a new vnode */
2757 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2758 mp->mnt_syncer = NULL;
2759 return (error);
2760 }
2761 vp->v_type = VNON;
2762 /*
2763 * Place the vnode onto the syncer worklist. We attempt to
2764 * scatter them about on the list so that they will go off
2765 * at evenly distributed times even if all the filesystems
2766 * are mounted at once.
2767 */
2768 next += incr;
2769 if (next == 0 || next > syncer_maxdelay) {
2770 start /= 2;
2771 incr /= 2;
2772 if (start == 0) {
2773 start = syncer_maxdelay / 2;
2774 incr = syncer_maxdelay;
2775 }
2776 next = start;
2777 }
2778 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2779 mp->mnt_syncer = vp;
2780 return (0);
2781}
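
/*
 * Illustrative sketch of the mount-time policy for the syncer vnode:
 * read-write mounts that do not already have one get one.  The wrapper
 * is hypothetical; the surrounding mount logic is not shown.
 */
#if 0
static int
example_setup_syncer(struct mount *mp)
{

	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
		return (vfs_allocate_syncvnode(mp));
	return (0);
}
#endif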
2782
2783/*
2784 * Do a lazy sync of the filesystem.
2785 */
2786static int
2787sync_fsync(ap)
2788 struct vop_fsync_args /* {
2789 struct vnode *a_vp;
2790 struct ucred *a_cred;
2791 int a_waitfor;
2792 struct proc *a_p;
2793 } */ *ap;
2794{
2795 struct vnode *syncvp = ap->a_vp;
2796 struct mount *mp = syncvp->v_mount;
2797 struct proc *p = ap->a_p;
2798 int asyncflag;
2799
2800 /*
2801 * We only need to do something if this is a lazy evaluation.
2802 */
2803 if (ap->a_waitfor != MNT_LAZY)
2804 return (0);
2805
2806 /*
2807 * Move ourselves to the back of the sync list.
2808 */
2809 vn_syncer_add_to_worklist(syncvp, syncdelay);
2810
2811 /*
2812 * Walk the list of vnodes pushing all that are dirty and
2813 * not already on the sync list.
2814 */
2815 simple_lock(&mountlist_slock);
2816 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2817 simple_unlock(&mountlist_slock);
2818 return (0);
2819 }
2820 asyncflag = mp->mnt_flag & MNT_ASYNC;
2821 mp->mnt_flag &= ~MNT_ASYNC;
2822 vfs_msync(mp, MNT_NOWAIT);
2823 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2824 if (asyncflag)
2825 mp->mnt_flag |= MNT_ASYNC;
2826 vfs_unbusy(mp, p);
2827 return (0);
2828}
2829
2830/*
2831 * The syncer vnode is no longer referenced.
2832 */
2833static int
2834sync_inactive(ap)
2835 struct vop_inactive_args /* {
2836 struct vnode *a_vp;
2837 struct proc *a_p;
2838 } */ *ap;
2839{
2840
2841 vgone(ap->a_vp);
2842 return (0);
2843}
2844
2845/*
2846 * The syncer vnode is no longer needed and is being decommissioned.
2847 *
2848 * Modifications to the worklist must be protected at splbio().
2849 */
2850static int
2851sync_reclaim(ap)
2852 struct vop_reclaim_args /* {
2853 struct vnode *a_vp;
2854 } */ *ap;
2855{
2856 struct vnode *vp = ap->a_vp;
2857 int s;
2858
2859 s = splbio();
2860 vp->v_mount->mnt_syncer = NULL;
2861 if (vp->v_flag & VONWORKLST) {
2862 LIST_REMOVE(vp, v_synclist);
2863 vp->v_flag &= ~VONWORKLST;
2864 }
2865 splx(s);
2866
2867 return (0);
2868}
2869
2870/*
2871 * Print out a syncer vnode.
2872 */
2873static int
2874sync_print(ap)
2875 struct vop_print_args /* {
2876 struct vnode *a_vp;
2877 } */ *ap;
2878{
2879 struct vnode *vp = ap->a_vp;
2880
2881 printf("syncer vnode");
2882 if (vp->v_vnlock != NULL)
2883 lockmgr_printinfo(vp->v_vnlock);
2884 printf("\n");
2885 return (0);
2886}
2887
2888/*
2889 * Extract the dev_t from a VBLK or VCHR vnode.
2890 */
2891dev_t
2892vn_todev(vp)
2893 struct vnode *vp;
2894{
2895 if (vp->v_type != VBLK && vp->v_type != VCHR)
2896 return (NODEV);
2897 return (vp->v_rdev);
2898}
2899
2900/*
2901 * Check if the vnode represents a disk device.
2902 */
2903int
2904vn_isdisk(vp, errp)
2905 struct vnode *vp;
2906 int *errp;
2907{
2908 if (vp->v_type != VBLK && vp->v_type != VCHR) {
2909 if (errp != NULL)
2910 *errp = ENOTBLK;
2911 return (0);
2912 }
2913 if (vp->v_rdev == NULL) {
2914 if (errp != NULL)
2915 *errp = ENXIO;
2916 return (0);
2917 }
2918 if (!devsw(vp->v_rdev)) {
2919 if (errp != NULL)
2920 *errp = ENXIO;
2921 return (0);
2922 }
2923 if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
2924 if (errp != NULL)
2925 *errp = ENOTBLK;
2926 return (0);
2927 }
2928 if (errp != NULL)
2929 *errp = 0;
2930 return (1);
2931}
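
/*
 * Illustrative sketch of typical vn_isdisk() use by code that only
 * accepts disk devices (e.g. a swap or mount path), propagating the
 * errno it reports.  The wrapper is hypothetical.
 */
#if 0
static int
example_require_disk(struct vnode *vp)
{
	int error;

	if (!vn_isdisk(vp, &error))
		return (error);		/* ENOTBLK or ENXIO from above */
	return (0);
}
#endif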
2932
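/*
 * Free up the components of a nameidata structure that the caller no
 * longer needs, as selected by the NDF_NO_* flags: the pathname buffer,
 * the parent and leaf vnode locks and references, and the saved start
 * directory.
 */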
2933void
2934NDFREE(ndp, flags)
2935 struct nameidata *ndp;
2936 const uint flags;
2937{
2938 if (!(flags & NDF_NO_FREE_PNBUF) &&
2939 (ndp->ni_cnd.cn_flags & HASBUF)) {
2940 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2941 ndp->ni_cnd.cn_flags &= ~HASBUF;
2942 }
2943 if (!(flags & NDF_NO_DVP_UNLOCK) &&
2944 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2945 ndp->ni_dvp != ndp->ni_vp)
2946 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2947 if (!(flags & NDF_NO_DVP_RELE) &&
2948 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2949 vrele(ndp->ni_dvp);
2950 ndp->ni_dvp = NULL;
2951 }
2952 if (!(flags & NDF_NO_VP_UNLOCK) &&
2953 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2954 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
2955 if (!(flags & NDF_NO_VP_RELE) &&
2956 ndp->ni_vp) {
2957 vrele(ndp->ni_vp);
2958 ndp->ni_vp = NULL;
2959 }
2960 if (!(flags & NDF_NO_STARTDIR_RELE) &&
2961 (ndp->ni_cnd.cn_flags & SAVESTART)) {
2962 vrele(ndp->ni_startdir);
2963 ndp->ni_startdir = NULL;
2964 }
2965}
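
/*
 * Illustrative sketch of a caller pairing namei() with NDFREE(): look a
 * path up just to see whether it exists, then let NDFREE() with no
 * NDF_NO_* flags release everything namei() left held.  The wrapper is
 * hypothetical.
 */
#if 0
static int
example_path_exists(char *path, struct proc *p)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, 0);		/* release the vnode and any saved state */
	return (0);
}
#endif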