sys/kern/vfs_subr.c: revision 65770 (deleted lines) → revision 66067 (added lines)
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 65770 2000-09-12 09:49:08Z bp $
39 * $FreeBSD: head/sys/kern/vfs_subr.c 66067 2000-09-19 10:28:44Z phk $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/domain.h>
55#include <sys/eventhandler.h>
56#include <sys/fcntl.h>
57#include <sys/kernel.h>
58#include <sys/kthread.h>
59#include <sys/ktr.h>
60#include <sys/malloc.h>
61#include <sys/mount.h>
62#include <sys/namei.h>
63#include <sys/proc.h>
64#include <sys/reboot.h>
65#include <sys/socket.h>
66#include <sys/stat.h>
67#include <sys/sysctl.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70
71#include <machine/limits.h>
72#include <machine/mutex.h>
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_extern.h>
77#include <vm/pmap.h>
78#include <vm/vm_map.h>
79#include <vm/vm_page.h>
80#include <vm/vm_pager.h>
81#include <vm/vnode_pager.h>
82#include <vm/vm_zone.h>
83
84static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
85
86static void insmntque __P((struct vnode *vp, struct mount *mp));
87static void vclean __P((struct vnode *vp, int flags, struct proc *p));
88static unsigned long numvnodes;
89SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
90
91enum vtype iftovt_tab[16] = {
92 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97 S_IFSOCK, S_IFIFO, S_IFMT,
98};
99
100static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
101
102static u_long wantfreevnodes = 25;
103SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
104static u_long freevnodes = 0;
105SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
106
107static int reassignbufcalls;
108SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
109static int reassignbufloops;
110SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
111static int reassignbufsortgood;
112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
113static int reassignbufsortbad;
114SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
115static int reassignbufmethod = 1;
116SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
117
118#ifdef ENABLE_VFS_IOOPT
119int vfs_ioopt = 0;
120SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
121#endif
122
123struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
124struct simplelock mountlist_slock;
125struct simplelock mntvnode_slock;
126int nfs_mount_type = -1;
127#ifndef NULL_SIMPLELOCKS
128static struct simplelock mntid_slock;
129static struct simplelock vnode_free_list_slock;
130static struct simplelock spechash_slock;
131#endif
132struct nfs_public nfs_pub; /* publicly exported FS */
133static vm_zone_t vnode_zone;
134int prtactive = 0; /* 1 => print out reclaim of active vnodes */
135
136/*
137 * The workitem queue.
138 */
139#define SYNCER_MAXDELAY 32
140static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
141time_t syncdelay = 30; /* max time to delay syncing data */
142time_t filedelay = 30; /* time to delay syncing files */
143SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
144time_t dirdelay = 29; /* time to delay syncing directories */
145SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
146time_t metadelay = 28; /* time to delay syncing metadata */
147SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
148static int rushjob; /* number of slots to run ASAP */
149static int stat_rush_requests; /* number of times I/O speeded up */
150SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
151
152static int syncer_delayno = 0;
153static long syncer_mask;
154LIST_HEAD(synclist, vnode);
155static struct synclist *syncer_workitem_pending;
156
157int desiredvnodes;
158SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
159 &desiredvnodes, 0, "Maximum number of vnodes");
160
161static void vfs_free_addrlist __P((struct netexport *nep));
162static int vfs_free_netcred __P((struct radix_node *rn, void *w));
163static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
164 struct export_args *argp));
165
166/*
167 * Initialize the vnode management data structures.
168 */
169void
170vntblinit()
171{
172
173 desiredvnodes = maxproc + cnt.v_page_count / 4;
174 simple_lock_init(&mntvnode_slock);
175 simple_lock_init(&mntid_slock);
176 simple_lock_init(&spechash_slock);
177 TAILQ_INIT(&vnode_free_list);
178 simple_lock_init(&vnode_free_list_slock);
179 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
180 /*
181 * Initialize the filesystem syncer.
182 */
183 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
184 &syncer_mask);
185 syncer_maxdelay = syncer_mask + 1;
186}
187
188/*
189 * Mark a mount point as busy. Used to synchronize access and to delay
190 * unmounting. Interlock is not released on failure.
191 */
192int
193vfs_busy(mp, flags, interlkp, p)
194 struct mount *mp;
195 int flags;
196 struct simplelock *interlkp;
197 struct proc *p;
198{
199 int lkflags;
200
201 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
202 if (flags & LK_NOWAIT)
203 return (ENOENT);
204 mp->mnt_kern_flag |= MNTK_MWAIT;
205 if (interlkp) {
206 simple_unlock(interlkp);
207 }
208 /*
209 * Since all busy locks are shared except the exclusive
210 * lock granted when unmounting, the only place that a
211 * wakeup needs to be done is at the release of the
212 * exclusive lock at the end of dounmount.
213 */
214 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
215 if (interlkp) {
216 simple_lock(interlkp);
217 }
218 return (ENOENT);
219 }
220 lkflags = LK_SHARED | LK_NOPAUSE;
221 if (interlkp)
222 lkflags |= LK_INTERLOCK;
223 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
224 panic("vfs_busy: unexpected lock failure");
225 return (0);
226}
227
228/*
229 * Free a busy filesystem.
230 */
231void
232vfs_unbusy(mp, p)
233 struct mount *mp;
234 struct proc *p;
235{
236
237 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
238}
239
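
A minimal sketch (not part of this file) of how callers such as the sync() path typically pair vfs_busy() and vfs_unbusy() while walking the mount list, handing in mountlist_slock as the interlock; p is the calling process supplied by the caller's context:

	struct mount *mp, *nmp;

	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			/* Failure leaves the interlock held; just skip it. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... operate on the busied mount (interlock was dropped) ... */
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
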
240/*
241 * Lookup a filesystem type, and if found allocate and initialize
242 * a mount structure for it.
243 *
244 * Devname is usually updated by mount(8) after booting.
245 */
246int
247vfs_rootmountalloc(fstypename, devname, mpp)
248 char *fstypename;
249 char *devname;
250 struct mount **mpp;
251{
252 struct proc *p = curproc; /* XXX */
253 struct vfsconf *vfsp;
254 struct mount *mp;
255
256 if (fstypename == NULL)
257 return (ENODEV);
258 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
259 if (!strcmp(vfsp->vfc_name, fstypename))
260 break;
261 if (vfsp == NULL)
262 return (ENODEV);
263 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
264 bzero((char *)mp, (u_long)sizeof(struct mount));
265 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
266 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
267 LIST_INIT(&mp->mnt_vnodelist);
268 mp->mnt_vfc = vfsp;
269 mp->mnt_op = vfsp->vfc_vfsops;
270 mp->mnt_flag = MNT_RDONLY;
271 mp->mnt_vnodecovered = NULLVP;
272 vfsp->vfc_refcount++;
273 mp->mnt_iosize_max = DFLTPHYS;
274 mp->mnt_stat.f_type = vfsp->vfc_typenum;
275 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
276 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
277 mp->mnt_stat.f_mntonname[0] = '/';
278 mp->mnt_stat.f_mntonname[1] = 0;
279 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
280 *mpp = mp;
281 return (0);
282}
283
284/*
285 * Find an appropriate filesystem to use for the root. If a filesystem
286 * has not been preselected, walk through the list of known filesystems
287 * trying those that have mountroot routines, and try them until one
288 * works or we have tried them all.
289 */
290#ifdef notdef /* XXX JH */
291int
292lite2_vfs_mountroot()
293{
294 struct vfsconf *vfsp;
295 extern int (*lite2_mountroot) __P((void));
296 int error;
297
298 if (lite2_mountroot != NULL)
299 return ((*lite2_mountroot)());
300 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
301 if (vfsp->vfc_mountroot == NULL)
302 continue;
303 if ((error = (*vfsp->vfc_mountroot)()) == 0)
304 return (0);
305 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
306 }
307 return (ENODEV);
308}
309#endif
310
311/*
312 * Lookup a mount point by filesystem identifier.
313 */
314struct mount *
315vfs_getvfs(fsid)
316 fsid_t *fsid;
317{
318 register struct mount *mp;
319
320 simple_lock(&mountlist_slock);
321 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
322 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
323 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
324 simple_unlock(&mountlist_slock);
325 return (mp);
326 }
327 }
328 simple_unlock(&mountlist_slock);
329 return ((struct mount *) 0);
330}
331
332/*
333 * Get a new unique fsid. Try to make its val[0] unique, since this value
334 * will be used to create fake device numbers for stat(). Also try (but
335 * not so hard) make its val[0] unique mod 2^16, since some emulators only
336 * support 16-bit device numbers. We end up with unique val[0]'s for the
337 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
338 *
339 * Keep in mind that several mounts may be running in parallel. Starting
340 * the search one past where the previous search terminated is both a
341 * micro-optimization and a defense against returning the same fsid to
342 * different mounts.
343 */
344void
345vfs_getnewfsid(mp)
346 struct mount *mp;
347{
348 static u_int16_t mntid_base;
349 fsid_t tfsid;
350 int mtype;
351
352 simple_lock(&mntid_slock);
353 mtype = mp->mnt_vfc->vfc_typenum;
354 tfsid.val[1] = mtype;
355 mtype = (mtype & 0xFF) << 24;
356 for (;;) {
357 tfsid.val[0] = makeudev(255,
358 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
359 mntid_base++;
360 if (vfs_getvfs(&tfsid) == NULL)
361 break;
362 }
363 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
364 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
365 simple_unlock(&mntid_slock);
366}
367
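
To make the bit layout above concrete: for a filesystem with vfc_typenum 1 and mntid_base currently 0x1234, val[1] is set to 1 and the minor number handed to makeudev(255, ...) evaluates to 0x01000000 | 0x00120000 | 0x00000034 = 0x01120034: the type sits in bits 24-31, the high byte of mntid_base in bits 16-23 and its low byte in bits 0-7, leaving bits 8-15 clear for the major number. That layout is why val[0] stays unique mod 2^16 only for the first 2^8 calls (the low byte of mntid_base) but fully unique for the first 2^16 calls.
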
368/*
369 * Knob to control the precision of file timestamps:
370 *
371 * 0 = seconds only; nanoseconds zeroed.
372 * 1 = seconds and nanoseconds, accurate within 1/HZ.
373 * 2 = seconds and nanoseconds, truncated to microseconds.
374 * >=3 = seconds and nanoseconds, maximum precision.
375 */
376enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
377
378static int timestamp_precision = TSP_SEC;
379SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
380 &timestamp_precision, 0, "");
381
382/*
383 * Get a current timestamp.
384 */
385void
386vfs_timestamp(tsp)
387 struct timespec *tsp;
388{
389 struct timeval tv;
390
391 switch (timestamp_precision) {
392 case TSP_SEC:
393 tsp->tv_sec = time_second;
394 tsp->tv_nsec = 0;
395 break;
396 case TSP_HZ:
397 getnanotime(tsp);
398 break;
399 case TSP_USEC:
400 microtime(&tv);
401 TIMEVAL_TO_TIMESPEC(&tv, tsp);
402 break;
403 case TSP_NSEC:
404 default:
405 nanotime(tsp);
406 break;
407 }
408}
409
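
As a hedged illustration of the consumer side (the vattr variable va below is hypothetical, not from this file), a filesystem stamping an inode update would simply do:

	struct timespec ts;

	vfs_timestamp(&ts);
	va.va_mtime = ts;	/* precision governed by the vfs.timestamp_precision sysctl */
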
410/*
411 * Set vnode attributes to VNOVAL
412 */
413void
414vattr_null(vap)
415 register struct vattr *vap;
416{
417
418 vap->va_type = VNON;
419 vap->va_size = VNOVAL;
420 vap->va_bytes = VNOVAL;
421 vap->va_mode = VNOVAL;
422 vap->va_nlink = VNOVAL;
423 vap->va_uid = VNOVAL;
424 vap->va_gid = VNOVAL;
425 vap->va_fsid = VNOVAL;
426 vap->va_fileid = VNOVAL;
427 vap->va_blocksize = VNOVAL;
428 vap->va_rdev = VNOVAL;
429 vap->va_atime.tv_sec = VNOVAL;
430 vap->va_atime.tv_nsec = VNOVAL;
431 vap->va_mtime.tv_sec = VNOVAL;
432 vap->va_mtime.tv_nsec = VNOVAL;
433 vap->va_ctime.tv_sec = VNOVAL;
434 vap->va_ctime.tv_nsec = VNOVAL;
435 vap->va_flags = VNOVAL;
436 vap->va_gen = VNOVAL;
437 vap->va_vaflags = 0;
438}
439
440/*
441 * Routines having to do with the management of the vnode table.
442 */
443
444/*
445 * Return the next vnode from the free list.
446 */
447int
448getnewvnode(tag, mp, vops, vpp)
449 enum vtagtype tag;
450 struct mount *mp;
451 vop_t **vops;
452 struct vnode **vpp;
453{
454 int s, count;
455 struct proc *p = curproc; /* XXX */
456 struct vnode *vp = NULL;
457 struct mount *vnmp;
458 vm_object_t object;
459
460 /*
461 * We take the least recently used vnode from the freelist
462 * if we can get it and it has no cached pages, and no
463 * namecache entries are relative to it.
464 * Otherwise we allocate a new vnode
465 */
466
467 s = splbio();
468 simple_lock(&vnode_free_list_slock);
469
470 if (wantfreevnodes && freevnodes < wantfreevnodes) {
471 vp = NULL;
472 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
473 /*
474 * XXX: this is only here to be backwards compatible
475 */
476 vp = NULL;
477 } else for (count = 0; count < freevnodes; count++) {
478 vp = TAILQ_FIRST(&vnode_free_list);
479 if (vp == NULL || vp->v_usecount)
480 panic("getnewvnode: free vnode isn't");
481 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
482 /*
483 * Don't recycle if active in the namecache or
484 * if it still has cached pages or we cannot get
485 * its interlock.
486 */
487 if (LIST_FIRST(&vp->v_cache_src) != NULL ||
488 (VOP_GETVOBJECT(vp, &object) == 0 &&
489 (object->resident_page_count || object->ref_count)) ||
490 !simple_lock_try(&vp->v_interlock)) {
491 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
492 vp = NULL;
493 continue;
494 }
495 /*
496 * Skip over it if its filesystem is being suspended.
497 */
498 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
499 break;
500 simple_unlock(&vp->v_interlock);
501 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
502 vp = NULL;
503 }
504 if (vp) {
505 vp->v_flag |= VDOOMED;
506 freevnodes--;
507 simple_unlock(&vnode_free_list_slock);
508 cache_purge(vp);
509 vp->v_lease = NULL;
510 if (vp->v_type != VBAD) {
511 vgonel(vp, p);
512 } else {
513 simple_unlock(&vp->v_interlock);
514 }
515 vn_finished_write(vnmp);
516
517#ifdef INVARIANTS
518 {
519 int s;
520
521 if (vp->v_data)
522 panic("cleaned vnode isn't");
523 s = splbio();
524 if (vp->v_numoutput)
525 panic("Clean vnode has pending I/O's");
526 splx(s);
527 if (vp->v_writecount != 0)
528 panic("Non-zero write count");
529 }
530#endif
531 vp->v_flag = 0;
532 vp->v_lastw = 0;
533 vp->v_lasta = 0;
534 vp->v_cstart = 0;
535 vp->v_clen = 0;
536 vp->v_socket = 0;
537 } else {
538 simple_unlock(&vnode_free_list_slock);
539 vp = (struct vnode *) zalloc(vnode_zone);
540 bzero((char *) vp, sizeof *vp);
541 simple_lock_init(&vp->v_interlock);
542 vp->v_dd = vp;
543 cache_purge(vp);
544 LIST_INIT(&vp->v_cache_src);
545 TAILQ_INIT(&vp->v_cache_dst);
546 numvnodes++;
547 }
548
549 TAILQ_INIT(&vp->v_cleanblkhd);
550 TAILQ_INIT(&vp->v_dirtyblkhd);
551 vp->v_type = VNON;
552 vp->v_tag = tag;
553 vp->v_op = vops;
554 insmntque(vp, mp);
555 *vpp = vp;
556 vp->v_usecount = 1;
557 vp->v_data = 0;
558 splx(s);
559
560 vfs_object_create(vp, p, p->p_ucred);
561 return (0);
562}
563
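
An illustrative sketch of a typical caller, loosely modeled on an FFS-style vget routine (ip, vpp and the final vnode type are assumptions from the caller's context, not part of this diff):

	struct vnode *vp;
	int error;

	error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp->v_data = ip;	/* hang the filesystem's private data off the vnode */
	vp->v_type = VREG;	/* set once the on-disk type is known */
	*vpp = vp;
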
564/*
565 * Move a vnode from one mount queue to another.
566 */
567static void
568insmntque(vp, mp)
569 register struct vnode *vp;
570 register struct mount *mp;
571{
572
573 simple_lock(&mntvnode_slock);
574 /*
575 * Delete from old mount point vnode list, if on one.
576 */
577 if (vp->v_mount != NULL)
578 LIST_REMOVE(vp, v_mntvnodes);
579 /*
580 * Insert into list of vnodes for the new mount point, if available.
581 */
582 if ((vp->v_mount = mp) == NULL) {
583 simple_unlock(&mntvnode_slock);
584 return;
585 }
586 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
587 simple_unlock(&mntvnode_slock);
588}
589
590/*
591 * Update outstanding I/O count and do wakeup if requested.
592 */
593void
594vwakeup(bp)
595 register struct buf *bp;
596{
597 register struct vnode *vp;
598
599 bp->b_flags &= ~B_WRITEINPROG;
600 if ((vp = bp->b_vp)) {
601 vp->v_numoutput--;
602 if (vp->v_numoutput < 0)
603 panic("vwakeup: neg numoutput");
604 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
605 vp->v_flag &= ~VBWAIT;
606 wakeup((caddr_t) &vp->v_numoutput);
607 }
608 }
609}
610
611/*
612 * Flush out and invalidate all buffers associated with a vnode.
613 * Called with the underlying object locked.
614 */
615int
616vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
617 register struct vnode *vp;
618 int flags;
619 struct ucred *cred;
620 struct proc *p;
621 int slpflag, slptimeo;
622{
623 register struct buf *bp;
624 struct buf *nbp, *blist;
625 int s, error;
626 vm_object_t object;
627
628 if (flags & V_SAVE) {
629 s = splbio();
630 while (vp->v_numoutput) {
631 vp->v_flag |= VBWAIT;
632 error = tsleep((caddr_t)&vp->v_numoutput,
633 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
634 if (error) {
635 splx(s);
636 return (error);
637 }
638 }
639 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
640 splx(s);
641 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
642 return (error);
643 s = splbio();
644 if (vp->v_numoutput > 0 ||
645 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
646 panic("vinvalbuf: dirty bufs");
647 }
648 splx(s);
649 }
650 s = splbio();
651 for (;;) {
652 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
653 if (!blist)
654 blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
655 if (!blist)
656 break;
657
658 for (bp = blist; bp; bp = nbp) {
659 nbp = TAILQ_NEXT(bp, b_vnbufs);
660 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
661 error = BUF_TIMELOCK(bp,
662 LK_EXCLUSIVE | LK_SLEEPFAIL,
663 "vinvalbuf", slpflag, slptimeo);
664 if (error == ENOLCK)
665 break;
666 splx(s);
667 return (error);
668 }
669 /*
670 * XXX Since there are no node locks for NFS, I
671 * believe there is a slight chance that a delayed
672 * write will occur while sleeping just above, so
673 * check for it. Note that vfs_bio_awrite expects
674 * buffers to reside on a queue, while VOP_BWRITE and
675 * brelse do not.
676 */
677 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
678 (flags & V_SAVE)) {
679
680 if (bp->b_vp == vp) {
681 if (bp->b_flags & B_CLUSTEROK) {
682 BUF_UNLOCK(bp);
683 vfs_bio_awrite(bp);
684 } else {
685 bremfree(bp);
686 bp->b_flags |= B_ASYNC;
687 BUF_WRITE(bp);
688 }
689 } else {
690 bremfree(bp);
691 (void) BUF_WRITE(bp);
692 }
693 break;
694 }
695 bremfree(bp);
696 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
697 bp->b_flags &= ~B_ASYNC;
698 brelse(bp);
699 }
700 }
701
702 while (vp->v_numoutput > 0) {
703 vp->v_flag |= VBWAIT;
704 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
705 }
706
707 splx(s);
708
709 /*
710 * Destroy the copy in the VM cache, too.
711 */
712 simple_lock(&vp->v_interlock);
713 if (VOP_GETVOBJECT(vp, &object) == 0) {
714 vm_object_page_remove(object, 0, 0,
715 (flags & V_SAVE) ? TRUE : FALSE);
716 }
717 simple_unlock(&vp->v_interlock);
718
719 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
720 panic("vinvalbuf: flush failed");
721 return (0);
722}
723
724/*
725 * Truncate a file's buffer and pages to a specified length. This
726 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
727 * sync activity.
728 */
729int
730vtruncbuf(vp, cred, p, length, blksize)
731 register struct vnode *vp;
732 struct ucred *cred;
733 struct proc *p;
734 off_t length;
735 int blksize;
736{
737 register struct buf *bp;
738 struct buf *nbp;
739 int s, anyfreed;
740 int trunclbn;
741
742 /*
743 * Round up to the *next* lbn.
744 */
745 trunclbn = (length + blksize - 1) / blksize;
746
747 s = splbio();
748restart:
749 anyfreed = 1;
750 for (;anyfreed;) {
751 anyfreed = 0;
752 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
753 nbp = TAILQ_NEXT(bp, b_vnbufs);
754 if (bp->b_lblkno >= trunclbn) {
755 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
756 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
757 goto restart;
758 } else {
759 bremfree(bp);
760 bp->b_flags |= (B_INVAL | B_RELBUF);
761 bp->b_flags &= ~B_ASYNC;
762 brelse(bp);
763 anyfreed = 1;
764 }
765 if (nbp &&
766 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
767 (nbp->b_vp != vp) ||
768 (nbp->b_flags & B_DELWRI))) {
769 goto restart;
770 }
771 }
772 }
773
774 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
775 nbp = TAILQ_NEXT(bp, b_vnbufs);
776 if (bp->b_lblkno >= trunclbn) {
777 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
778 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
779 goto restart;
780 } else {
781 bremfree(bp);
782 bp->b_flags |= (B_INVAL | B_RELBUF);
783 bp->b_flags &= ~B_ASYNC;
784 brelse(bp);
785 anyfreed = 1;
786 }
787 if (nbp &&
788 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
789 (nbp->b_vp != vp) ||
790 (nbp->b_flags & B_DELWRI) == 0)) {
791 goto restart;
792 }
793 }
794 }
795 }
796
797 if (length > 0) {
798restartsync:
799 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
800 nbp = TAILQ_NEXT(bp, b_vnbufs);
801 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
802 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
803 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
804 goto restart;
805 } else {
806 bremfree(bp);
807 if (bp->b_vp == vp) {
808 bp->b_flags |= B_ASYNC;
809 } else {
810 bp->b_flags &= ~B_ASYNC;
811 }
812 BUF_WRITE(bp);
813 }
814 goto restartsync;
815 }
816
817 }
818 }
819
820 while (vp->v_numoutput > 0) {
821 vp->v_flag |= VBWAIT;
822 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
823 }
824
825 splx(s);
826
827 vnode_pager_setsize(vp, length);
828
829 return (0);
830}
831
832/*
833 * Associate a buffer with a vnode.
834 */
835void
836bgetvp(vp, bp)
837 register struct vnode *vp;
838 register struct buf *bp;
839{
840 int s;
841
842 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
843
844 vhold(vp);
845 bp->b_vp = vp;
846 bp->b_dev = vn_todev(vp);
847 /*
848 * Insert onto list for new vnode.
849 */
850 s = splbio();
851 bp->b_xflags |= BX_VNCLEAN;
852 bp->b_xflags &= ~BX_VNDIRTY;
853 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
854 splx(s);
855}
856
857/*
858 * Disassociate a buffer from a vnode.
859 */
860void
861brelvp(bp)
862 register struct buf *bp;
863{
864 struct vnode *vp;
865 struct buflists *listheadp;
866 int s;
867
868 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
869
870 /*
871 * Delete from old vnode list, if on one.
872 */
873 vp = bp->b_vp;
874 s = splbio();
875 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
876 if (bp->b_xflags & BX_VNDIRTY)
877 listheadp = &vp->v_dirtyblkhd;
878 else
879 listheadp = &vp->v_cleanblkhd;
880 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
881 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
882 }
883 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
884 vp->v_flag &= ~VONWORKLST;
885 LIST_REMOVE(vp, v_synclist);
886 }
887 splx(s);
888 bp->b_vp = (struct vnode *) 0;
889 vdrop(vp);
890}
891
892/*
893 * The workitem queue.
894 *
895 * It is useful to delay writes of file data and filesystem metadata
896 * for tens of seconds so that quickly created and deleted files need
897 * not waste disk bandwidth being created and removed. To realize this,
898 * we append vnodes to a "workitem" queue. When running with a soft
899 * updates implementation, most pending metadata dependencies should
900 * not wait for more than a few seconds. Thus, mounted on block devices
901 * are delayed only about a half the time that file data is delayed.
902 * Similarly, directory updates are more critical, so are only delayed
903 * about a third the time that file data is delayed. Thus, there are
904 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
905 * one each second (driven off the filesystem syncer process). The
906 * syncer_delayno variable indicates the next queue that is to be processed.
907 * Items that need to be processed soon are placed in this queue:
908 *
909 * syncer_workitem_pending[syncer_delayno]
910 *
911 * A delay of fifteen seconds is done by placing the request fifteen
912 * entries later in the queue:
913 *
914 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
915 *
916 */
917
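
For example, with the default SYNCER_MAXDELAY of 32, hashinit() leaves syncer_mask at 31; if syncer_delayno currently stands at 20, a vnode queued with a delay of 15 lands in slot (20 + 15) & 31 = 3 and, since the syncer advances one slot per second, is visited roughly fifteen seconds later.
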
918/*
919 * Add an item to the syncer work queue.
920 */
921static void
922vn_syncer_add_to_worklist(struct vnode *vp, int delay)
923{
924 int s, slot;
925
926 s = splbio();
927
928 if (vp->v_flag & VONWORKLST) {
929 LIST_REMOVE(vp, v_synclist);
930 }
931
932 if (delay > syncer_maxdelay - 2)
933 delay = syncer_maxdelay - 2;
934 slot = (syncer_delayno + delay) & syncer_mask;
935
936 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
937 vp->v_flag |= VONWORKLST;
938 splx(s);
939}
940
941struct proc *updateproc;
942static void sched_sync __P((void));
943static struct kproc_desc up_kp = {
944 "syncer",
945 sched_sync,
946 &updateproc
947};
948SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
949
950/*
951 * System filesystem synchronizer daemon.
952 */
953void
954sched_sync(void)
955{
956 struct synclist *slp;
957 struct vnode *vp;
958 struct mount *mp;
959 long starttime;
960 int s;
961 struct proc *p = updateproc;
962
963 mtx_enter(&Giant, MTX_DEF);
964
965 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
966 SHUTDOWN_PRI_LAST);
967
968 for (;;) {
969 kproc_suspend_loop(p);
970
971 starttime = time_second;
972
973 /*
974 * Push files whose dirty time has expired. Be careful
975 * of interrupt race on slp queue.
976 */
977 s = splbio();
978 slp = &syncer_workitem_pending[syncer_delayno];
979 syncer_delayno += 1;
980 if (syncer_delayno == syncer_maxdelay)
981 syncer_delayno = 0;
982 splx(s);
983
984 while ((vp = LIST_FIRST(slp)) != NULL) {
985 if (VOP_ISLOCKED(vp, NULL) == 0 &&
986 vn_start_write(vp, &mp, V_NOWAIT) == 0) {
987 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
988 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
989 VOP_UNLOCK(vp, 0, p);
990 vn_finished_write(mp);
991 }
992 s = splbio();
993 if (LIST_FIRST(slp) == vp) {
994 /*
995 * Note: v_tag VT_VFS vps can remain on the
996 * worklist too with no dirty blocks, but
997 * since sync_fsync() moves it to a different
998 * slot we are safe.
999 */
1000 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1001 !vn_isdisk(vp, NULL))
1002 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1003 /*
1004 * Put us back on the worklist. The worklist
1005 * routine will remove us from our current
1006 * position and then add us back in at a later
1007 * position.
1008 */
1009 vn_syncer_add_to_worklist(vp, syncdelay);
1010 }
1011 splx(s);
1012 }
1013
1014 /*
1015 * Do soft update processing.
1016 */
1017#ifdef SOFTUPDATES
1018 softdep_process_worklist(NULL);
1019#endif
1020
1021 /*
1022 * The variable rushjob allows the kernel to speed up the
1023 * processing of the filesystem syncer process. A rushjob
1024 * value of N tells the filesystem syncer to process the next
1025 * N seconds worth of work on its queue ASAP. Currently rushjob
1026 * is used by the soft update code to speed up the filesystem
1027 * syncer process when the incore state is getting so far
1028 * ahead of the disk that the kernel memory pool is being
1029 * threatened with exhaustion.
1030 */
1031 if (rushjob > 0) {
1032 rushjob -= 1;
1033 continue;
1034 }
1035 /*
1036 * If it has taken us less than a second to process the
1037 * current work, then wait. Otherwise start right over
1038 * again. We can still lose time if any single round
1039 * takes more than two seconds, but it does not really
1040 * matter as we are just trying to generally pace the
1041 * filesystem activity.
1042 */
1043 if (time_second == starttime)
1044 tsleep(&lbolt, PPAUSE, "syncer", 0);
1045 }
1046}
1047
1048/*
1049 * Request the syncer daemon to speed up its work.
1050 * We never push it to speed up more than half of its
1051 * normal turn time, otherwise it could take over the cpu.
1052 */
1053int
1054speedup_syncer()
1055{
1056 int s;
1057
1058 s = splhigh();
1059 if (updateproc->p_wchan == &lbolt)
1060 setrunnable(updateproc);
1061 splx(s);
1062 if (rushjob < syncdelay / 2) {
1063 rushjob += 1;
1064 stat_rush_requests += 1;
1065 return (1);
1066 }
1067 return(0);
1068}
1069
1070/*
1071 * Associate a p-buffer with a vnode.
1072 *
1073 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1074 * with the buffer. i.e. the bp has not been linked into the vnode or
1075 * ref-counted.
1076 */
1077void
1078pbgetvp(vp, bp)
1079 register struct vnode *vp;
1080 register struct buf *bp;
1081{
1082
1083 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1084
1085 bp->b_vp = vp;
1086 bp->b_flags |= B_PAGING;
1087 bp->b_dev = vn_todev(vp);
1088}
1089
1090/*
1091 * Disassociate a p-buffer from a vnode.
1092 */
1093void
1094pbrelvp(bp)
1095 register struct buf *bp;
1096{
1097
1098 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1099
1100 /* XXX REMOVE ME */
1101 if (bp->b_vnbufs.tqe_next != NULL) {
1102 panic(
1103 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1104 bp,
1105 (int)bp->b_flags
1106 );
1107 }
1108 bp->b_vp = (struct vnode *) 0;
1109 bp->b_flags &= ~B_PAGING;
1110}
1111
1112void
1113pbreassignbuf(bp, newvp)
1114 struct buf *bp;
1115 struct vnode *newvp;
1116{
1117 if ((bp->b_flags & B_PAGING) == 0) {
1118 panic(
1119 "pbreassignbuf() on non phys bp %p",
1120 bp
1121 );
1122 }
1123 bp->b_vp = newvp;
1124}
1125
1126/*
1127 * Reassign a buffer from one vnode to another.
1128 * Used to assign file specific control information
1129 * (indirect blocks) to the vnode to which they belong.
1130 */
1131void
1132reassignbuf(bp, newvp)
1133 register struct buf *bp;
1134 register struct vnode *newvp;
1135{
1136 struct buflists *listheadp;
1137 int delay;
1138 int s;
1139
1140 if (newvp == NULL) {
1141 printf("reassignbuf: NULL");
1142 return;
1143 }
1144 ++reassignbufcalls;
1145
1146 /*
1147 * B_PAGING flagged buffers cannot be reassigned because their vp
1148 * is not fully linked in.
1149 */
1150 if (bp->b_flags & B_PAGING)
1151 panic("cannot reassign paging buffer");
1152
1153 s = splbio();
1154 /*
1155 * Delete from old vnode list, if on one.
1156 */
1157 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1158 if (bp->b_xflags & BX_VNDIRTY)
1159 listheadp = &bp->b_vp->v_dirtyblkhd;
1160 else
1161 listheadp = &bp->b_vp->v_cleanblkhd;
1162 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1163 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1164 if (bp->b_vp != newvp) {
1165 vdrop(bp->b_vp);
1166 bp->b_vp = NULL; /* for clarification */
1167 }
1168 }
1169 /*
1170 * If dirty, put on list of dirty buffers; otherwise insert onto list
1171 * of clean buffers.
1172 */
1173 if (bp->b_flags & B_DELWRI) {
1174 struct buf *tbp;
1175
1176 listheadp = &newvp->v_dirtyblkhd;
1177 if ((newvp->v_flag & VONWORKLST) == 0) {
1178 switch (newvp->v_type) {
1179 case VDIR:
1180 delay = dirdelay;
1181 break;
1182 case VCHR:
1183 case VBLK:
1184 if (newvp->v_specmountpoint != NULL) {
1185 delay = metadelay;
1186 break;
1187 }
1188 /* fall through */
1189 default:
1190 delay = filedelay;
1191 }
1192 vn_syncer_add_to_worklist(newvp, delay);
1193 }
1194 bp->b_xflags |= BX_VNDIRTY;
1195 tbp = TAILQ_FIRST(listheadp);
1196 if (tbp == NULL ||
1197 bp->b_lblkno == 0 ||
1198 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1199 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1200 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1201 ++reassignbufsortgood;
1202 } else if (bp->b_lblkno < 0) {
1203 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1204 ++reassignbufsortgood;
1205 } else if (reassignbufmethod == 1) {
1206 /*
1207 * New sorting algorithm, only handle sequential case,
1208 * otherwise append to end (but before metadata)
1209 */
1210 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1211 (tbp->b_xflags & BX_VNDIRTY)) {
1212 /*
1213 * Found the best place to insert the buffer
1214 */
1215 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1216 ++reassignbufsortgood;
1217 } else {
1218 /*
1219 * Missed, append to end, but before meta-data.
1220 * We know that the head buffer in the list is
1221 * not meta-data due to prior conditionals.
1222 *
1223 * Indirect effects: NFS second stage write
1224 * tends to wind up here, giving maximum
1225 * distance between the unstable write and the
1226 * commit rpc.
1227 */
1228 tbp = TAILQ_LAST(listheadp, buflists);
1229 while (tbp && tbp->b_lblkno < 0)
1230 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1231 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1232 ++reassignbufsortbad;
1233 }
1234 } else {
1235 /*
1236 * Old sorting algorithm, scan queue and insert
1237 */
1238 struct buf *ttbp;
1239 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1240 (ttbp->b_lblkno < bp->b_lblkno)) {
1241 ++reassignbufloops;
1242 tbp = ttbp;
1243 }
1244 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1245 }
1246 } else {
1247 bp->b_xflags |= BX_VNCLEAN;
1248 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1249 if ((newvp->v_flag & VONWORKLST) &&
1250 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1251 newvp->v_flag &= ~VONWORKLST;
1252 LIST_REMOVE(newvp, v_synclist);
1253 }
1254 }
1255 if (bp->b_vp != newvp) {
1256 bp->b_vp = newvp;
1257 vhold(bp->b_vp);
1258 }
1259 splx(s);
1260}
1261
1262/*
1263 * Create a vnode for a block device.
1264 * Used for mounting the root file system.
1265 * XXX: This now changed to a VCHR due to the block/char merging.
1266 */
1267int
1268bdevvp(dev, vpp)
1269 dev_t dev;
1270 struct vnode **vpp;
1271{
1272 register struct vnode *vp;
1273 struct vnode *nvp;
1274 int error;
1275
1276 if (dev == NODEV) {
1277 *vpp = NULLVP;
1278 return (ENXIO);
1279 }
1280 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1281 if (error) {
1282 *vpp = NULLVP;
1283 return (error);
1284 }
1285 vp = nvp;
1286 vp->v_type = VCHR;
1287 addalias(vp, dev);
1288 *vpp = vp;
1289 return (0);
1290}
1291
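
Illustrative only: the root-mount path is essentially the sole caller, doing roughly the following with the rootdev and rootvp globals (error handling abbreviated):

	error = bdevvp(rootdev, &rootvp);
	if (error) {
		printf("bdevvp failed for root device\n");
		return (error);
	}
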
1292/*
1293 * Add vnode to the alias list hung off the dev_t.
1294 *
1295 * The reason for this gunk is that multiple vnodes can reference
1296 * the same physical device, so checking vp->v_usecount to see
1297 * how many users there are is inadequate; the v_usecount for
1298 * the vnodes need to be accumulated. vcount() does that.
1299 */
1300struct vnode *
1301addaliasu(nvp, nvp_rdev)
1302 struct vnode *nvp;
1303 udev_t nvp_rdev;
1304{
1305 struct vnode *ovp;
1306 vop_t **ops;
1307 dev_t dev;
1308
1309 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1310 panic("addaliasu on non-special vnode");
1311 dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
1312 /*
1313 * Check to see if we have a bdevvp vnode with no associated
1314 * filesystem. If so, we want to associate the filesystem of
1315 * the new newly instigated vnode with the bdevvp vnode and
1316 * discard the newly created vnode rather than leaving the
1317 * bdevvp vnode lying around with no associated filesystem.
1318 */
1319 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1320 addalias(nvp, dev);
1321 return (nvp);
1322 }
1323 /*
1324 * Discard unneeded vnode, but save its node specific data.
1325 * Note that if there is a lock, it is carried over in the
1326 * node specific data to the replacement vnode.
1327 */
1328 vref(ovp);
1329 ovp->v_data = nvp->v_data;
1330 ovp->v_tag = nvp->v_tag;
1331 nvp->v_data = NULL;
1332 ops = nvp->v_op;
1333 nvp->v_op = ovp->v_op;
1334 ovp->v_op = ops;
1335 insmntque(ovp, nvp->v_mount);
1336 vrele(nvp);
1337 vgone(nvp);
1338 return (ovp);
1339}
1340
1341void
1342addalias(nvp, dev)
1343 struct vnode *nvp;
1344 dev_t dev;
1345{
1346
1347 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1348 panic("addalias on non-special vnode");
1349
1350 nvp->v_rdev = dev;
1351 simple_lock(&spechash_slock);
1352 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1353 simple_unlock(&spechash_slock);
1354}
1355
1356/*
1357 * Grab a particular vnode from the free list, increment its
1358 * reference count and lock it. The vnode lock bit is set if the
1359 * vnode is being eliminated in vgone. The process is awakened
1360 * when the transition is completed, and an error returned to
1361 * indicate that the vnode is no longer usable (possibly having
1362 * been changed to a new file system type).
1363 */
1364int
1365vget(vp, flags, p)
1366 register struct vnode *vp;
1367 int flags;
1368 struct proc *p;
1369{
1370 int error;
1371
1372 /*
1373 * If the vnode is in the process of being cleaned out for
1374 * another use, we wait for the cleaning to finish and then
1375 * return failure. Cleaning is determined by checking that
1376 * the VXLOCK flag is set.
1377 */
1378 if ((flags & LK_INTERLOCK) == 0) {
1379 simple_lock(&vp->v_interlock);
1380 }
1381 if (vp->v_flag & VXLOCK) {
1382 vp->v_flag |= VXWANT;
1383 simple_unlock(&vp->v_interlock);
1384 tsleep((caddr_t)vp, PINOD, "vget", 0);
1385 return (ENOENT);
1386 }
1387
1388 vp->v_usecount++;
1389
1390 if (VSHOULDBUSY(vp))
1391 vbusy(vp);
1392 if (flags & LK_TYPE_MASK) {
1393 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1394 /*
1395 * must expand vrele here because we do not want
1396 * to call VOP_INACTIVE if the reference count
1397 * drops back to zero since it was never really
1398 * active. We must remove it from the free list
1399 * before sleeping so that multiple processes do
1400 * not try to recycle it.
1401 */
1402 simple_lock(&vp->v_interlock);
1403 vp->v_usecount--;
1404 if (VSHOULDFREE(vp))
1405 vfree(vp);
1406 simple_unlock(&vp->v_interlock);
1407 }
1408 return (error);
1409 }
1410 simple_unlock(&vp->v_interlock);
1411 return (0);
1412}
1413
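
A sketch of the usual calling convention (the goto target and p come from the caller, not this file): list walkers take the interlock themselves, let vget() consume it via LK_INTERLOCK, and rescan when ENOENT reports that the vnode was reclaimed underneath them:

	simple_lock(&vp->v_interlock);
	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) != 0)
		goto loop;		/* vnode was being cleaned out; rescan */
	/* ... use the referenced, locked vnode ... */
	vput(vp);
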
1414void
1415vref(struct vnode *vp)
1416{
1417 simple_lock(&vp->v_interlock);
1418 vp->v_usecount++;
1419 simple_unlock(&vp->v_interlock);
1420}
1421
1422/*
1423 * Vnode put/release.
1424 * If count drops to zero, call inactive routine and return to freelist.
1425 */
1426void
1427vrele(vp)
1428 struct vnode *vp;
1429{
1430 struct proc *p = curproc; /* XXX */
1431
1432 KASSERT(vp != NULL, ("vrele: null vp"));
1433 KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1434
1435 simple_lock(&vp->v_interlock);
1436
1437 if (vp->v_usecount > 1) {
1438
1439 vp->v_usecount--;
1440 simple_unlock(&vp->v_interlock);
1441
1442 return;
1443 }
1444
1445 if (vp->v_usecount == 1) {
1446
1447 vp->v_usecount--;
1448 if (VSHOULDFREE(vp))
1449 vfree(vp);
1450 /*
1451 * If we are doing a vput, the node is already locked, and we must
1452 * call VOP_INACTIVE with the node locked. So, in the case of
1453 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1454 */
1455 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1456 VOP_INACTIVE(vp, p);
1457 }
1458
1459 } else {
1460#ifdef DIAGNOSTIC
1461 vprint("vrele: negative ref count", vp);
1462 simple_unlock(&vp->v_interlock);
1463#endif
1464 panic("vrele: negative ref cnt");
1465 }
1466}
1467
1468void
1469vput(vp)
1470 struct vnode *vp;
1471{
1472 struct proc *p = curproc; /* XXX */
1473
1474 KASSERT(vp != NULL, ("vput: null vp"));
1475 KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1476
1477 simple_lock(&vp->v_interlock);
1478
1479 if (vp->v_usecount > 1) {
1480
1481 vp->v_usecount--;
1482 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1483 return;
1484
1485 }
1486
1487 if (vp->v_usecount == 1) {
1488
1489 vp->v_usecount--;
1490 if (VSHOULDFREE(vp))
1491 vfree(vp);
1492 /*
1493 * If we are doing a vput, the node is already locked, and we must
1494 * call VOP_INACTIVE with the node locked. So, in the case of
1495 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1496 */
1497 simple_unlock(&vp->v_interlock);
1498 VOP_INACTIVE(vp, p);
1499
1500 } else {
1501#ifdef DIAGNOSTIC
1502 vprint("vput: negative ref count", vp);
1503#endif
1504 panic("vput: negative ref cnt");
1505 }
1506}
1507
1508/*
1509 * Somebody doesn't want the vnode recycled.
1510 */
1511void
1512vhold(vp)
1513 register struct vnode *vp;
1514{
1515 int s;
1516
1517 s = splbio();
1518 vp->v_holdcnt++;
1519 if (VSHOULDBUSY(vp))
1520 vbusy(vp);
1521 splx(s);
1522}
1523
1524/*
1525 * One less who cares about this vnode.
1526 */
1527void
1528vdrop(vp)
1529 register struct vnode *vp;
1530{
1531 int s;
1532
1533 s = splbio();
1534 if (vp->v_holdcnt <= 0)
1535 panic("vdrop: holdcnt");
1536 vp->v_holdcnt--;
1537 if (VSHOULDFREE(vp))
1538 vfree(vp);
1539 splx(s);
1540}
1541
1542/*
1543 * Remove any vnodes in the vnode table belonging to mount point mp.
1544 *
1545 * If MNT_NOFORCE is specified, there should not be any active ones,
1546 * return error if any are found (nb: this is a user error, not a
1547 * system error). If MNT_FORCE is specified, detach any active vnodes
1548 * that are found.
1549 */
1550#ifdef DIAGNOSTIC
1551static int busyprt = 0; /* print out busy vnodes */
1552SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1553#endif
1554
1555int
1556vflush(mp, skipvp, flags)
1557 struct mount *mp;
1558 struct vnode *skipvp;
1559 int flags;
1560{
1561 struct proc *p = curproc; /* XXX */
1562 struct vnode *vp, *nvp;
1563 int busy = 0;
1564
1565 simple_lock(&mntvnode_slock);
1566loop:
1567 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1568 /*
1569 * Make sure this vnode wasn't reclaimed in getnewvnode().
1570 * Start over if it has (it won't be on the list anymore).
1571 */
1572 if (vp->v_mount != mp)
1573 goto loop;
1574 nvp = LIST_NEXT(vp, v_mntvnodes);
1575 /*
1576 * Skip over a selected vnode.
1577 */
1578 if (vp == skipvp)
1579 continue;
1580
1581 simple_lock(&vp->v_interlock);
1582 /*
1583 * Skip over a vnodes marked VSYSTEM.
1584 */
1585 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1586 simple_unlock(&vp->v_interlock);
1587 continue;
1588 }
1589 /*
1590 * If WRITECLOSE is set, only flush out regular file vnodes
1591 * open for writing.
1592 */
1593 if ((flags & WRITECLOSE) &&
1594 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1595 simple_unlock(&vp->v_interlock);
1596 continue;
1597 }
1598
1599 /*
1600 * With v_usecount == 0, all we need to do is clear out the
1601 * vnode data structures and we are done.
1602 */
1603 if (vp->v_usecount == 0) {
1604 simple_unlock(&mntvnode_slock);
1605 vgonel(vp, p);
1606 simple_lock(&mntvnode_slock);
1607 continue;
1608 }
1609
1610 /*
1611 * If FORCECLOSE is set, forcibly close the vnode. For block
1612 * or character devices, revert to an anonymous device. For
1613 * all other files, just kill them.
1614 */
1615 if (flags & FORCECLOSE) {
1616 simple_unlock(&mntvnode_slock);
1617 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1618 vgonel(vp, p);
1619 } else {
1620 vclean(vp, 0, p);
1621 vp->v_op = spec_vnodeop_p;
1622 insmntque(vp, (struct mount *) 0);
1623 }
1624 simple_lock(&mntvnode_slock);
1625 continue;
1626 }
1627#ifdef DIAGNOSTIC
1628 if (busyprt)
1629 vprint("vflush: busy vnode", vp);
1630#endif
1631 simple_unlock(&vp->v_interlock);
1632 busy++;
1633 }
1634 simple_unlock(&mntvnode_slock);
1635 if (busy)
1636 return (EBUSY);
1637 return (0);
1638}
1639
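
For context, a hedged sketch of the classic consumer, a filesystem's unmount path (mntflags and error come from the caller):

	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	error = vflush(mp, NULLVP, flags);
	if (error)
		return (error);
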
1640/*
1641 * Disassociate the underlying file system from a vnode.
1642 */
1643static void
1644vclean(vp, flags, p)
1645 struct vnode *vp;
1646 int flags;
1647 struct proc *p;
1648{
1649 int active;
1650
1651 /*
1652 * Check to see if the vnode is in use. If so we have to reference it
1653 * before we clean it out so that its count cannot fall to zero and
1654 * generate a race against ourselves to recycle it.
1655 */
1656 if ((active = vp->v_usecount))
1657 vp->v_usecount++;
1658
1659 /*
1660 * Prevent the vnode from being recycled or brought into use while we
1661 * clean it out.
1662 */
1663 if (vp->v_flag & VXLOCK)
1664 panic("vclean: deadlock");
1665 vp->v_flag |= VXLOCK;
1666 /*
1667 * Even if the count is zero, the VOP_INACTIVE routine may still
1668 * have the object locked while it cleans it out. The VOP_LOCK
1669 * ensures that the VOP_INACTIVE routine is done with its work.
1670 * For active vnodes, it ensures that no other activity can
1671 * occur while the underlying object is being cleaned out.
1672 */
1673 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1674
1675 /*
1676 * Clean out any buffers associated with the vnode.
1677 * If the flush fails, just toss the buffers.
1678 */
1679 if (flags & DOCLOSE) {
1680 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1681 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1682 if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1683 vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1684 }
1685
1686 VOP_DESTROYVOBJECT(vp);
1687
1688 /*
1689 * If purging an active vnode, it must be closed and
1690 * deactivated before being reclaimed. Note that the
1691 * VOP_INACTIVE will unlock the vnode.
1692 */
1693 if (active) {
1694 if (flags & DOCLOSE)
1695 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1696 VOP_INACTIVE(vp, p);
1697 } else {
1698 /*
1699 * Any other processes trying to obtain this lock must first
1700 * wait for VXLOCK to clear, then call the new lock operation.
1701 */
1702 VOP_UNLOCK(vp, 0, p);
1703 }
1704 /*
1705 * Reclaim the vnode.
1706 */
1707 if (VOP_RECLAIM(vp, p))
1708 panic("vclean: cannot reclaim");
1709
1710 if (active) {
1711 /*
1712 * Inline copy of vrele() since VOP_INACTIVE
1713 * has already been called.
1714 */
1715 simple_lock(&vp->v_interlock);
1716 if (--vp->v_usecount <= 0) {
1717#ifdef DIAGNOSTIC
1718 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1719 vprint("vclean: bad ref count", vp);
1720 panic("vclean: ref cnt");
1721 }
1722#endif
1723 vfree(vp);
1724 }
1725 simple_unlock(&vp->v_interlock);
1726 }
1727
1728 cache_purge(vp);
1729 if (vp->v_vnlock) {
1730 FREE(vp->v_vnlock, M_VNODE);
1731 vp->v_vnlock = NULL;
1732 }
1733
1734 if (VSHOULDFREE(vp))
1735 vfree(vp);
1736
1737 /*
1738 * Done with purge, notify sleepers of the grim news.
1739 */
1740 vp->v_op = dead_vnodeop_p;
1741 vn_pollgone(vp);
1742 vp->v_tag = VT_NON;
1743 vp->v_flag &= ~VXLOCK;
1744 if (vp->v_flag & VXWANT) {
1745 vp->v_flag &= ~VXWANT;
1746 wakeup((caddr_t) vp);
1747 }
1748}
1749
1750/*
1751 * Eliminate all activity associated with the requested vnode
1752 * and with all vnodes aliased to the requested vnode.
1753 */
1754int
1755vop_revoke(ap)
1756 struct vop_revoke_args /* {
1757 struct vnode *a_vp;
1758 int a_flags;
1759 } */ *ap;
1760{
1761 struct vnode *vp, *vq;
1762 dev_t dev;
1763
1764 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1765
1766 vp = ap->a_vp;
1767 /*
1768 * If a vgone (or vclean) is already in progress,
1769 * wait until it is done and return.
1770 */
1771 if (vp->v_flag & VXLOCK) {
1772 vp->v_flag |= VXWANT;
1773 simple_unlock(&vp->v_interlock);
1774 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1775 return (0);
1776 }
1777 dev = vp->v_rdev;
1778 for (;;) {
1779 simple_lock(&spechash_slock);
1780 vq = SLIST_FIRST(&dev->si_hlist);
1781 simple_unlock(&spechash_slock);
1782 if (!vq)
1783 break;
1784 vgone(vq);
1785 }
1786 return (0);
1787}
1788
1789/*
1790 * Recycle an unused vnode to the front of the free list.
1791 * Release the passed interlock if the vnode will be recycled.
1792 */
1793int
1794vrecycle(vp, inter_lkp, p)
1795 struct vnode *vp;
1796 struct simplelock *inter_lkp;
1797 struct proc *p;
1798{
1799
1800 simple_lock(&vp->v_interlock);
1801 if (vp->v_usecount == 0) {
1802 if (inter_lkp) {
1803 simple_unlock(inter_lkp);
1804 }
1805 vgonel(vp, p);
1806 return (1);
1807 }
1808 simple_unlock(&vp->v_interlock);
1809 return (0);
1810}
1811
1812/*
1813 * Eliminate all activity associated with a vnode
1814 * in preparation for reuse.
1815 */
1816void
1817vgone(vp)
1818 register struct vnode *vp;
1819{
1820 struct proc *p = curproc; /* XXX */
1821
1822 simple_lock(&vp->v_interlock);
1823 vgonel(vp, p);
1824}
1825
1826/*
1827 * vgone, with the vp interlock held.
1828 */
1829void
1830vgonel(vp, p)
1831 struct vnode *vp;
1832 struct proc *p;
1833{
1834 int s;
1835
1836 /*
1837 * If a vgone (or vclean) is already in progress,
1838 * wait until it is done and return.
1839 */
1840 if (vp->v_flag & VXLOCK) {
1841 vp->v_flag |= VXWANT;
1842 simple_unlock(&vp->v_interlock);
1843 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1844 return;
1845 }
1846
1847 /*
1848 * Clean out the filesystem specific data.
1849 */
1850 vclean(vp, DOCLOSE, p);
1851 simple_lock(&vp->v_interlock);
1852
1853 /*
1854 * Delete from old mount point vnode list, if on one.
1855 */
1856 if (vp->v_mount != NULL)
1857 insmntque(vp, (struct mount *)0);
1858 /*
1859 * If special device, remove it from special device alias list
1860 * if it is on one.
1861 */
1862 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1863 simple_lock(&spechash_slock);
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/domain.h>
55#include <sys/eventhandler.h>
56#include <sys/fcntl.h>
57#include <sys/kernel.h>
58#include <sys/kthread.h>
59#include <sys/ktr.h>
60#include <sys/malloc.h>
61#include <sys/mount.h>
62#include <sys/namei.h>
63#include <sys/proc.h>
64#include <sys/reboot.h>
65#include <sys/socket.h>
66#include <sys/stat.h>
67#include <sys/sysctl.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70
71#include <machine/limits.h>
72#include <machine/mutex.h>
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_extern.h>
77#include <vm/pmap.h>
78#include <vm/vm_map.h>
79#include <vm/vm_page.h>
80#include <vm/vm_pager.h>
81#include <vm/vnode_pager.h>
82#include <vm/vm_zone.h>
83
84static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
85
86static void insmntque __P((struct vnode *vp, struct mount *mp));
87static void vclean __P((struct vnode *vp, int flags, struct proc *p));
88static unsigned long numvnodes;
89SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
90
91enum vtype iftovt_tab[16] = {
92 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97 S_IFSOCK, S_IFIFO, S_IFMT,
98};
99
100static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
101
102static u_long wantfreevnodes = 25;
103SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
104static u_long freevnodes = 0;
105SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
106
107static int reassignbufcalls;
108SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
109static int reassignbufloops;
110SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
111static int reassignbufsortgood;
112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
113static int reassignbufsortbad;
114SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
115static int reassignbufmethod = 1;
116SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
117
118#ifdef ENABLE_VFS_IOOPT
119int vfs_ioopt = 0;
120SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
121#endif
122
123struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
124struct simplelock mountlist_slock;
125struct simplelock mntvnode_slock;
126int nfs_mount_type = -1;
127#ifndef NULL_SIMPLELOCKS
128static struct simplelock mntid_slock;
129static struct simplelock vnode_free_list_slock;
130static struct simplelock spechash_slock;
131#endif
132struct nfs_public nfs_pub; /* publicly exported FS */
133static vm_zone_t vnode_zone;
134int prtactive = 0; /* 1 => print out reclaim of active vnodes */
135
136/*
137 * The workitem queue.
138 */
139#define SYNCER_MAXDELAY 32
140static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
141time_t syncdelay = 30; /* max time to delay syncing data */
142time_t filedelay = 30; /* time to delay syncing files */
143SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
144time_t dirdelay = 29; /* time to delay syncing directories */
145SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
146time_t metadelay = 28; /* time to delay syncing metadata */
147SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
148static int rushjob; /* number of slots to run ASAP */
149static int stat_rush_requests; /* number of times I/O speeded up */
150SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
151
152static int syncer_delayno = 0;
153static long syncer_mask;
154LIST_HEAD(synclist, vnode);
155static struct synclist *syncer_workitem_pending;
156
157int desiredvnodes;
158SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
159 &desiredvnodes, 0, "Maximum number of vnodes");
160
161static void vfs_free_addrlist __P((struct netexport *nep));
162static int vfs_free_netcred __P((struct radix_node *rn, void *w));
163static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
164 struct export_args *argp));
165
166/*
167 * Initialize the vnode management data structures.
168 */
169void
170vntblinit()
171{
172
173 desiredvnodes = maxproc + cnt.v_page_count / 4;
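	/*
	 * For example (illustrative arithmetic only): with maxproc at 532
	 * and 16384 physical pages of 4K each, this yields 532 + 4096 =
	 * 4628 vnodes.  The value can be raised at run time through the
	 * kern.maxvnodes sysctl declared above.
	 */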
174 simple_lock_init(&mntvnode_slock);
175 simple_lock_init(&mntid_slock);
176 simple_lock_init(&spechash_slock);
177 TAILQ_INIT(&vnode_free_list);
178 simple_lock_init(&vnode_free_list_slock);
179 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
180 /*
181 * Initialize the filesystem syncer.
182 */
183 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
184 &syncer_mask);
185 syncer_maxdelay = syncer_mask + 1;
186}
187
188/*
189 * Mark a mount point as busy. Used to synchronize access and to delay
190 * unmounting. Interlock is not released on failure.
191 */
192int
193vfs_busy(mp, flags, interlkp, p)
194 struct mount *mp;
195 int flags;
196 struct simplelock *interlkp;
197 struct proc *p;
198{
199 int lkflags;
200
201 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
202 if (flags & LK_NOWAIT)
203 return (ENOENT);
204 mp->mnt_kern_flag |= MNTK_MWAIT;
205 if (interlkp) {
206 simple_unlock(interlkp);
207 }
208 /*
209 * Since all busy locks are shared except the exclusive
210 * lock granted when unmounting, the only place that a
211 * wakeup needs to be done is at the release of the
212 * exclusive lock at the end of dounmount.
213 */
214 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
215 if (interlkp) {
216 simple_lock(interlkp);
217 }
218 return (ENOENT);
219 }
220 lkflags = LK_SHARED | LK_NOPAUSE;
221 if (interlkp)
222 lkflags |= LK_INTERLOCK;
223 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
224 panic("vfs_busy: unexpected lock failure");
225 return (0);
226}
227
228/*
229 * Free a busy filesystem.
230 */
231void
232vfs_unbusy(mp, p)
233 struct mount *mp;
234 struct proc *p;
235{
236
237 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
238}
239
240/*
241 * Lookup a filesystem type, and if found allocate and initialize
242 * a mount structure for it.
243 *
244 * Devname is usually updated by mount(8) after booting.
245 */
246int
247vfs_rootmountalloc(fstypename, devname, mpp)
248 char *fstypename;
249 char *devname;
250 struct mount **mpp;
251{
252 struct proc *p = curproc; /* XXX */
253 struct vfsconf *vfsp;
254 struct mount *mp;
255
256 if (fstypename == NULL)
257 return (ENODEV);
258 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
259 if (!strcmp(vfsp->vfc_name, fstypename))
260 break;
261 if (vfsp == NULL)
262 return (ENODEV);
263 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
264 bzero((char *)mp, (u_long)sizeof(struct mount));
265 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
266 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
267 LIST_INIT(&mp->mnt_vnodelist);
268 mp->mnt_vfc = vfsp;
269 mp->mnt_op = vfsp->vfc_vfsops;
270 mp->mnt_flag = MNT_RDONLY;
271 mp->mnt_vnodecovered = NULLVP;
272 vfsp->vfc_refcount++;
273 mp->mnt_iosize_max = DFLTPHYS;
274 mp->mnt_stat.f_type = vfsp->vfc_typenum;
275 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
276 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
277 mp->mnt_stat.f_mntonname[0] = '/';
278 mp->mnt_stat.f_mntonname[1] = 0;
279 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
280 *mpp = mp;
281 return (0);
282}
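
/*
 * A hypothetical sketch of how a root-mount path might use the routine
 * above (the filesystem name and device name here are examples only):
 *
 *	struct mount *mp;
 *	int error;
 *
 *	if ((error = vfs_rootmountalloc("ufs", "da0s1a", &mp)) == 0)
 *		error = VFS_MOUNT(mp, NULL, NULL, NULL, p);
 */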
283
284/*
285 * Find an appropriate filesystem to use for the root. If a filesystem
286 * has not been preselected, walk through the list of known filesystems,
287 * trying those that have mountroot routines, until one
288 * works or we have tried them all.
289 */
290#ifdef notdef /* XXX JH */
291int
292lite2_vfs_mountroot()
293{
294 struct vfsconf *vfsp;
295 extern int (*lite2_mountroot) __P((void));
296 int error;
297
298 if (lite2_mountroot != NULL)
299 return ((*lite2_mountroot)());
300 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
301 if (vfsp->vfc_mountroot == NULL)
302 continue;
303 if ((error = (*vfsp->vfc_mountroot)()) == 0)
304 return (0);
305 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
306 }
307 return (ENODEV);
308}
309#endif
310
311/*
312 * Lookup a mount point by filesystem identifier.
313 */
314struct mount *
315vfs_getvfs(fsid)
316 fsid_t *fsid;
317{
318 register struct mount *mp;
319
320 simple_lock(&mountlist_slock);
321 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
322 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
323 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
324 simple_unlock(&mountlist_slock);
325 return (mp);
326 }
327 }
328 simple_unlock(&mountlist_slock);
329 return ((struct mount *) 0);
330}
331
332/*
333 * Get a new unique fsid. Try to make its val[0] unique, since this value
334 * will be used to create fake device numbers for stat(). Also try (but
335 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
336 * support 16-bit device numbers. We end up with unique val[0]'s for the
337 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
338 *
339 * Keep in mind that several mounts may be running in parallel. Starting
340 * the search one past where the previous search terminated is both a
341 * micro-optimization and a defense against returning the same fsid to
342 * different mounts.
343 */
344void
345vfs_getnewfsid(mp)
346 struct mount *mp;
347{
348 static u_int16_t mntid_base;
349 fsid_t tfsid;
350 int mtype;
351
352 simple_lock(&mntid_slock);
353 mtype = mp->mnt_vfc->vfc_typenum;
354 tfsid.val[1] = mtype;
355 mtype = (mtype & 0xFF) << 24;
356 for (;;) {
357 tfsid.val[0] = makeudev(255,
358 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
359 mntid_base++;
360 if (vfs_getvfs(&tfsid) == NULL)
361 break;
362 }
363 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
364 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
365 simple_unlock(&mntid_slock);
366}
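
/*
 * For illustration, the second argument handed to makeudev() above is
 * laid out as follows (a sketch of the arithmetic, not used by the code):
 *
 *	bits 24-31	low byte of the filesystem type number
 *	bits 16-23	high byte of mntid_base
 *	bits  0- 7	low byte of mntid_base
 *
 * so a type number of 5 and an mntid_base of 0x1234, for example, give
 * the value 0x05120034.
 */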
367
368/*
369 * Knob to control the precision of file timestamps:
370 *
371 * 0 = seconds only; nanoseconds zeroed.
372 * 1 = seconds and nanoseconds, accurate within 1/HZ.
373 * 2 = seconds and nanoseconds, truncated to microseconds.
374 * >=3 = seconds and nanoseconds, maximum precision.
375 */
376enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
377
378static int timestamp_precision = TSP_SEC;
379SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
380 &timestamp_precision, 0, "");
381
382/*
383 * Get a current timestamp.
384 */
385void
386vfs_timestamp(tsp)
387 struct timespec *tsp;
388{
389 struct timeval tv;
390
391 switch (timestamp_precision) {
392 case TSP_SEC:
393 tsp->tv_sec = time_second;
394 tsp->tv_nsec = 0;
395 break;
396 case TSP_HZ:
397 getnanotime(tsp);
398 break;
399 case TSP_USEC:
400 microtime(&tv);
401 TIMEVAL_TO_TIMESPEC(&tv, tsp);
402 break;
403 case TSP_NSEC:
404 default:
405 nanotime(tsp);
406 break;
407 }
408}
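
/*
 * A typical use, sketched for illustration ("ip" is a hypothetical
 * per-filesystem inode, not something defined in this file):
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts;
 *
 * The precision of the result is governed by vfs.timestamp_precision.
 */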
409
410/*
411 * Set vnode attributes to VNOVAL
412 */
413void
414vattr_null(vap)
415 register struct vattr *vap;
416{
417
418 vap->va_type = VNON;
419 vap->va_size = VNOVAL;
420 vap->va_bytes = VNOVAL;
421 vap->va_mode = VNOVAL;
422 vap->va_nlink = VNOVAL;
423 vap->va_uid = VNOVAL;
424 vap->va_gid = VNOVAL;
425 vap->va_fsid = VNOVAL;
426 vap->va_fileid = VNOVAL;
427 vap->va_blocksize = VNOVAL;
428 vap->va_rdev = VNOVAL;
429 vap->va_atime.tv_sec = VNOVAL;
430 vap->va_atime.tv_nsec = VNOVAL;
431 vap->va_mtime.tv_sec = VNOVAL;
432 vap->va_mtime.tv_nsec = VNOVAL;
433 vap->va_ctime.tv_sec = VNOVAL;
434 vap->va_ctime.tv_nsec = VNOVAL;
435 vap->va_flags = VNOVAL;
436 vap->va_gen = VNOVAL;
437 vap->va_vaflags = 0;
438}
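
/*
 * Callers normally reach this through the VATTR_NULL() macro so that a
 * subsequent VOP_SETATTR() only acts on the fields explicitly filled in,
 * e.g. (an illustrative sketch):
 *
 *	struct vattr va;
 *
 *	VATTR_NULL(&va);
 *	va.va_size = length;
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */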
439
440/*
441 * Routines having to do with the management of the vnode table.
442 */
443
444/*
445 * Return the next vnode from the free list.
446 */
447int
448getnewvnode(tag, mp, vops, vpp)
449 enum vtagtype tag;
450 struct mount *mp;
451 vop_t **vops;
452 struct vnode **vpp;
453{
454 int s, count;
455 struct proc *p = curproc; /* XXX */
456 struct vnode *vp = NULL;
457 struct mount *vnmp;
458 vm_object_t object;
459
460 /*
461 * We take the least recently used vnode from the freelist
462 * if we can get it and it has no cached pages, and no
463 * namecache entries are relative to it.
464 * Otherwise we allocate a new vnode.
465 */
466
467 s = splbio();
468 simple_lock(&vnode_free_list_slock);
469
470 if (wantfreevnodes && freevnodes < wantfreevnodes) {
471 vp = NULL;
472 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
473 /*
474 * XXX: this is only here to be backwards compatible
475 */
476 vp = NULL;
477 } else for (count = 0; count < freevnodes; count++) {
478 vp = TAILQ_FIRST(&vnode_free_list);
479 if (vp == NULL || vp->v_usecount)
480 panic("getnewvnode: free vnode isn't");
481 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
482 /*
483 * Don't recycle if active in the namecache or
484 * if it still has cached pages or we cannot get
485 * its interlock.
486 */
487 if (LIST_FIRST(&vp->v_cache_src) != NULL ||
488 (VOP_GETVOBJECT(vp, &object) == 0 &&
489 (object->resident_page_count || object->ref_count)) ||
490 !simple_lock_try(&vp->v_interlock)) {
491 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
492 vp = NULL;
493 continue;
494 }
495 /*
496 * Skip over it if its filesystem is being suspended.
497 */
498 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
499 break;
500 simple_unlock(&vp->v_interlock);
501 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
502 vp = NULL;
503 }
504 if (vp) {
505 vp->v_flag |= VDOOMED;
506 freevnodes--;
507 simple_unlock(&vnode_free_list_slock);
508 cache_purge(vp);
509 vp->v_lease = NULL;
510 if (vp->v_type != VBAD) {
511 vgonel(vp, p);
512 } else {
513 simple_unlock(&vp->v_interlock);
514 }
515 vn_finished_write(vnmp);
516
517#ifdef INVARIANTS
518 {
519 int s;
520
521 if (vp->v_data)
522 panic("cleaned vnode isn't");
523 s = splbio();
524 if (vp->v_numoutput)
525 panic("Clean vnode has pending I/O's");
526 splx(s);
527 if (vp->v_writecount != 0)
528 panic("Non-zero write count");
529 }
530#endif
531 vp->v_flag = 0;
532 vp->v_lastw = 0;
533 vp->v_lasta = 0;
534 vp->v_cstart = 0;
535 vp->v_clen = 0;
536 vp->v_socket = 0;
537 } else {
538 simple_unlock(&vnode_free_list_slock);
539 vp = (struct vnode *) zalloc(vnode_zone);
540 bzero((char *) vp, sizeof *vp);
541 simple_lock_init(&vp->v_interlock);
542 vp->v_dd = vp;
543 cache_purge(vp);
544 LIST_INIT(&vp->v_cache_src);
545 TAILQ_INIT(&vp->v_cache_dst);
546 numvnodes++;
547 }
548
549 TAILQ_INIT(&vp->v_cleanblkhd);
550 TAILQ_INIT(&vp->v_dirtyblkhd);
551 vp->v_type = VNON;
552 vp->v_tag = tag;
553 vp->v_op = vops;
554 insmntque(vp, mp);
555 *vpp = vp;
556 vp->v_usecount = 1;
557 vp->v_data = 0;
558 splx(s);
559
560 vfs_object_create(vp, p, p->p_ucred);
561 return (0);
562}
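
/*
 * An illustrative caller sketch (the vnodeop vector and "ip" below are
 * hypothetical, not defined in this file): a filesystem's vget-style
 * routine typically allocates and initializes a vnode like
 *
 *	error = getnewvnode(VT_UFS, mp, foofs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;
 *
 * before handing it back to its caller.
 */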
563
564/*
565 * Move a vnode from one mount queue to another.
566 */
567static void
568insmntque(vp, mp)
569 register struct vnode *vp;
570 register struct mount *mp;
571{
572
573 simple_lock(&mntvnode_slock);
574 /*
575 * Delete from old mount point vnode list, if on one.
576 */
577 if (vp->v_mount != NULL)
578 LIST_REMOVE(vp, v_mntvnodes);
579 /*
580 * Insert into list of vnodes for the new mount point, if available.
581 */
582 if ((vp->v_mount = mp) == NULL) {
583 simple_unlock(&mntvnode_slock);
584 return;
585 }
586 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
587 simple_unlock(&mntvnode_slock);
588}
589
590/*
591 * Update outstanding I/O count and do wakeup if requested.
592 */
593void
594vwakeup(bp)
595 register struct buf *bp;
596{
597 register struct vnode *vp;
598
599 bp->b_flags &= ~B_WRITEINPROG;
600 if ((vp = bp->b_vp)) {
601 vp->v_numoutput--;
602 if (vp->v_numoutput < 0)
603 panic("vwakeup: neg numoutput");
604 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
605 vp->v_flag &= ~VBWAIT;
606 wakeup((caddr_t) &vp->v_numoutput);
607 }
608 }
609}
610
611/*
612 * Flush out and invalidate all buffers associated with a vnode.
613 * Called with the underlying object locked.
614 */
615int
616vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
617 register struct vnode *vp;
618 int flags;
619 struct ucred *cred;
620 struct proc *p;
621 int slpflag, slptimeo;
622{
623 register struct buf *bp;
624 struct buf *nbp, *blist;
625 int s, error;
626 vm_object_t object;
627
628 if (flags & V_SAVE) {
629 s = splbio();
630 while (vp->v_numoutput) {
631 vp->v_flag |= VBWAIT;
632 error = tsleep((caddr_t)&vp->v_numoutput,
633 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
634 if (error) {
635 splx(s);
636 return (error);
637 }
638 }
639 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
640 splx(s);
641 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
642 return (error);
643 s = splbio();
644 if (vp->v_numoutput > 0 ||
645 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
646 panic("vinvalbuf: dirty bufs");
647 }
648 splx(s);
649 }
650 s = splbio();
651 for (;;) {
652 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
653 if (!blist)
654 blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
655 if (!blist)
656 break;
657
658 for (bp = blist; bp; bp = nbp) {
659 nbp = TAILQ_NEXT(bp, b_vnbufs);
660 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
661 error = BUF_TIMELOCK(bp,
662 LK_EXCLUSIVE | LK_SLEEPFAIL,
663 "vinvalbuf", slpflag, slptimeo);
664 if (error == ENOLCK)
665 break;
666 splx(s);
667 return (error);
668 }
669 /*
670 * XXX Since there are no node locks for NFS, I
671 * believe there is a slight chance that a delayed
672 * write will occur while sleeping just above, so
673 * check for it. Note that vfs_bio_awrite expects
674 * buffers to reside on a queue, while VOP_BWRITE and
675 * brelse do not.
676 */
677 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
678 (flags & V_SAVE)) {
679
680 if (bp->b_vp == vp) {
681 if (bp->b_flags & B_CLUSTEROK) {
682 BUF_UNLOCK(bp);
683 vfs_bio_awrite(bp);
684 } else {
685 bremfree(bp);
686 bp->b_flags |= B_ASYNC;
687 BUF_WRITE(bp);
688 }
689 } else {
690 bremfree(bp);
691 (void) BUF_WRITE(bp);
692 }
693 break;
694 }
695 bremfree(bp);
696 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
697 bp->b_flags &= ~B_ASYNC;
698 brelse(bp);
699 }
700 }
701
702 while (vp->v_numoutput > 0) {
703 vp->v_flag |= VBWAIT;
704 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
705 }
706
707 splx(s);
708
709 /*
710 * Destroy the copy in the VM cache, too.
711 */
712 simple_lock(&vp->v_interlock);
713 if (VOP_GETVOBJECT(vp, &object) == 0) {
714 vm_object_page_remove(object, 0, 0,
715 (flags & V_SAVE) ? TRUE : FALSE);
716 }
717 simple_unlock(&vp->v_interlock);
718
719 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
720 panic("vinvalbuf: flush failed");
721 return (0);
722}
723
724/*
725 * Truncate a file's buffer and pages to a specified length. This
726 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
727 * sync activity.
728 */
729int
730vtruncbuf(vp, cred, p, length, blksize)
731 register struct vnode *vp;
732 struct ucred *cred;
733 struct proc *p;
734 off_t length;
735 int blksize;
736{
737 register struct buf *bp;
738 struct buf *nbp;
739 int s, anyfreed;
740 int trunclbn;
741
742 /*
743 * Round up to the *next* lbn.
744 */
745 trunclbn = (length + blksize - 1) / blksize;
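	/*
	 * e.g. with a blksize of 512, a length of 1 gives trunclbn 1:
	 * block 0, which still holds that byte, is kept, while blocks
	 * 1 and up are invalidated by the loops below.
	 */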
746
747 s = splbio();
748restart:
749 anyfreed = 1;
750 for (;anyfreed;) {
751 anyfreed = 0;
752 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
753 nbp = TAILQ_NEXT(bp, b_vnbufs);
754 if (bp->b_lblkno >= trunclbn) {
755 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
756 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
757 goto restart;
758 } else {
759 bremfree(bp);
760 bp->b_flags |= (B_INVAL | B_RELBUF);
761 bp->b_flags &= ~B_ASYNC;
762 brelse(bp);
763 anyfreed = 1;
764 }
765 if (nbp &&
766 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
767 (nbp->b_vp != vp) ||
768 (nbp->b_flags & B_DELWRI))) {
769 goto restart;
770 }
771 }
772 }
773
774 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
775 nbp = TAILQ_NEXT(bp, b_vnbufs);
776 if (bp->b_lblkno >= trunclbn) {
777 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
778 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
779 goto restart;
780 } else {
781 bremfree(bp);
782 bp->b_flags |= (B_INVAL | B_RELBUF);
783 bp->b_flags &= ~B_ASYNC;
784 brelse(bp);
785 anyfreed = 1;
786 }
787 if (nbp &&
788 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
789 (nbp->b_vp != vp) ||
790 (nbp->b_flags & B_DELWRI) == 0)) {
791 goto restart;
792 }
793 }
794 }
795 }
796
797 if (length > 0) {
798restartsync:
799 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
800 nbp = TAILQ_NEXT(bp, b_vnbufs);
801 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
802 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
803 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
804 goto restart;
805 } else {
806 bremfree(bp);
807 if (bp->b_vp == vp) {
808 bp->b_flags |= B_ASYNC;
809 } else {
810 bp->b_flags &= ~B_ASYNC;
811 }
812 BUF_WRITE(bp);
813 }
814 goto restartsync;
815 }
816
817 }
818 }
819
820 while (vp->v_numoutput > 0) {
821 vp->v_flag |= VBWAIT;
822 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
823 }
824
825 splx(s);
826
827 vnode_pager_setsize(vp, length);
828
829 return (0);
830}
831
832/*
833 * Associate a buffer with a vnode.
834 */
835void
836bgetvp(vp, bp)
837 register struct vnode *vp;
838 register struct buf *bp;
839{
840 int s;
841
842 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
843
844 vhold(vp);
845 bp->b_vp = vp;
846 bp->b_dev = vn_todev(vp);
847 /*
848 * Insert onto list for new vnode.
849 */
850 s = splbio();
851 bp->b_xflags |= BX_VNCLEAN;
852 bp->b_xflags &= ~BX_VNDIRTY;
853 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
854 splx(s);
855}
856
857/*
858 * Disassociate a buffer from a vnode.
859 */
860void
861brelvp(bp)
862 register struct buf *bp;
863{
864 struct vnode *vp;
865 struct buflists *listheadp;
866 int s;
867
868 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
869
870 /*
871 * Delete from old vnode list, if on one.
872 */
873 vp = bp->b_vp;
874 s = splbio();
875 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
876 if (bp->b_xflags & BX_VNDIRTY)
877 listheadp = &vp->v_dirtyblkhd;
878 else
879 listheadp = &vp->v_cleanblkhd;
880 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
881 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
882 }
883 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
884 vp->v_flag &= ~VONWORKLST;
885 LIST_REMOVE(vp, v_synclist);
886 }
887 splx(s);
888 bp->b_vp = (struct vnode *) 0;
889 vdrop(vp);
890}
891
892/*
893 * The workitem queue.
894 *
895 * It is useful to delay writes of file data and filesystem metadata
896 * for tens of seconds so that quickly created and deleted files need
897 * not waste disk bandwidth being created and removed. To realize this,
898 * we append vnodes to a "workitem" queue. When running with a soft
899 * updates implementation, most pending metadata dependencies should
900 * not wait for more than a few seconds. Thus, mounted-on block devices
901 * are delayed only about half the time that file data is delayed.
902 * Similarly, directory updates are more critical, so are only delayed
903 * about a third of the time that file data is delayed. Thus, there are
904 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
905 * one each second (driven off the filesystem syncer process). The
906 * syncer_delayno variable indicates the next queue that is to be processed.
907 * Items that need to be processed soon are placed in this queue:
908 *
909 * syncer_workitem_pending[syncer_delayno]
910 *
911 * A delay of fifteen seconds is done by placing the request fifteen
912 * entries later in the queue:
913 *
914 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
915 *
916 */
917
918/*
919 * Add an item to the syncer work queue.
920 */
921static void
922vn_syncer_add_to_worklist(struct vnode *vp, int delay)
923{
924 int s, slot;
925
926 s = splbio();
927
928 if (vp->v_flag & VONWORKLST) {
929 LIST_REMOVE(vp, v_synclist);
930 }
931
932 if (delay > syncer_maxdelay - 2)
933 delay = syncer_maxdelay - 2;
934 slot = (syncer_delayno + delay) & syncer_mask;
935
936 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
937 vp->v_flag |= VONWORKLST;
938 splx(s);
939}
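
/*
 * A worked example of the slot arithmetic above, assuming the default
 * 32-entry table (so syncer_mask is 31): with syncer_delayno at 20, a
 * requested delay of 15 seconds lands in slot (20 + 15) & 31 == 3, i.e.
 * the request wraps around the ring of queues.
 */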
940
941struct proc *updateproc;
942static void sched_sync __P((void));
943static struct kproc_desc up_kp = {
944 "syncer",
945 sched_sync,
946 &updateproc
947};
948SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
949
950/*
951 * System filesystem synchronizer daemon.
952 */
953void
954sched_sync(void)
955{
956 struct synclist *slp;
957 struct vnode *vp;
958 struct mount *mp;
959 long starttime;
960 int s;
961 struct proc *p = updateproc;
962
963 mtx_enter(&Giant, MTX_DEF);
964
965 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
966 SHUTDOWN_PRI_LAST);
967
968 for (;;) {
969 kproc_suspend_loop(p);
970
971 starttime = time_second;
972
973 /*
974 * Push files whose dirty time has expired. Be careful
975 * of interrupt race on slp queue.
976 */
977 s = splbio();
978 slp = &syncer_workitem_pending[syncer_delayno];
979 syncer_delayno += 1;
980 if (syncer_delayno == syncer_maxdelay)
981 syncer_delayno = 0;
982 splx(s);
983
984 while ((vp = LIST_FIRST(slp)) != NULL) {
985 if (VOP_ISLOCKED(vp, NULL) == 0 &&
986 vn_start_write(vp, &mp, V_NOWAIT) == 0) {
987 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
988 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
989 VOP_UNLOCK(vp, 0, p);
990 vn_finished_write(mp);
991 }
992 s = splbio();
993 if (LIST_FIRST(slp) == vp) {
994 /*
995 * Note: v_tag VT_VFS vps can remain on the
996 * worklist too with no dirty blocks, but
997 * since sync_fsync() moves them to a different
998 * slot we are safe.
999 */
1000 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1001 !vn_isdisk(vp, NULL))
1002 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1003 /*
1004 * Put us back on the worklist. The worklist
1005 * routine will remove us from our current
1006 * position and then add us back in at a later
1007 * position.
1008 */
1009 vn_syncer_add_to_worklist(vp, syncdelay);
1010 }
1011 splx(s);
1012 }
1013
1014 /*
1015 * Do soft update processing.
1016 */
1017#ifdef SOFTUPDATES
1018 softdep_process_worklist(NULL);
1019#endif
1020
1021 /*
1022 * The variable rushjob allows the kernel to speed up the
1023 * processing of the filesystem syncer process. A rushjob
1024 * value of N tells the filesystem syncer to process the next
1025 * N seconds worth of work on its queue ASAP. Currently rushjob
1026 * is used by the soft update code to speed up the filesystem
1027 * syncer process when the incore state is getting so far
1028 * ahead of the disk that the kernel memory pool is being
1029 * threatened with exhaustion.
1030 */
1031 if (rushjob > 0) {
1032 rushjob -= 1;
1033 continue;
1034 }
1035 /*
1036 * If it has taken us less than a second to process the
1037 * current work, then wait. Otherwise start right over
1038 * again. We can still lose time if any single round
1039 * takes more than two seconds, but it does not really
1040 * matter as we are just trying to generally pace the
1041 * filesystem activity.
1042 */
1043 if (time_second == starttime)
1044 tsleep(&lbolt, PPAUSE, "syncer", 0);
1045 }
1046}
1047
1048/*
1049 * Request the syncer daemon to speed up its work.
1050 * We never push it to speed up more than half of its
1051 * normal turn time, otherwise it could take over the cpu.
1052 */
1053int
1054speedup_syncer()
1055{
1056 int s;
1057
1058 s = splhigh();
1059 if (updateproc->p_wchan == &lbolt)
1060 setrunnable(updateproc);
1061 splx(s);
1062 if (rushjob < syncdelay / 2) {
1063 rushjob += 1;
1064 stat_rush_requests += 1;
1065 return (1);
1066 }
1067 return(0);
1068}
1069
1070/*
1071 * Associate a p-buffer with a vnode.
1072 *
1073 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1074 * with the buffer, i.e. the bp has not been linked into the vnode or
1075 * ref-counted.
1076 */
1077void
1078pbgetvp(vp, bp)
1079 register struct vnode *vp;
1080 register struct buf *bp;
1081{
1082
1083 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1084
1085 bp->b_vp = vp;
1086 bp->b_flags |= B_PAGING;
1087 bp->b_dev = vn_todev(vp);
1088}
1089
1090/*
1091 * Disassociate a p-buffer from a vnode.
1092 */
1093void
1094pbrelvp(bp)
1095 register struct buf *bp;
1096{
1097
1098 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1099
1100 /* XXX REMOVE ME */
1101 if (bp->b_vnbufs.tqe_next != NULL) {
1102 panic(
1103 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1104 bp,
1105 (int)bp->b_flags
1106 );
1107 }
1108 bp->b_vp = (struct vnode *) 0;
1109 bp->b_flags &= ~B_PAGING;
1110}
1111
1112void
1113pbreassignbuf(bp, newvp)
1114 struct buf *bp;
1115 struct vnode *newvp;
1116{
1117 if ((bp->b_flags & B_PAGING) == 0) {
1118 panic(
1119 "pbreassignbuf() on non phys bp %p",
1120 bp
1121 );
1122 }
1123 bp->b_vp = newvp;
1124}
1125
1126/*
1127 * Reassign a buffer from one vnode to another.
1128 * Used to assign file specific control information
1129 * (indirect blocks) to the vnode to which they belong.
1130 */
1131void
1132reassignbuf(bp, newvp)
1133 register struct buf *bp;
1134 register struct vnode *newvp;
1135{
1136 struct buflists *listheadp;
1137 int delay;
1138 int s;
1139
1140 if (newvp == NULL) {
1141 printf("reassignbuf: NULL");
1142 return;
1143 }
1144 ++reassignbufcalls;
1145
1146 /*
1147 * B_PAGING flagged buffers cannot be reassigned because their vp
1148 * is not fully linked in.
1149 */
1150 if (bp->b_flags & B_PAGING)
1151 panic("cannot reassign paging buffer");
1152
1153 s = splbio();
1154 /*
1155 * Delete from old vnode list, if on one.
1156 */
1157 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1158 if (bp->b_xflags & BX_VNDIRTY)
1159 listheadp = &bp->b_vp->v_dirtyblkhd;
1160 else
1161 listheadp = &bp->b_vp->v_cleanblkhd;
1162 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1163 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1164 if (bp->b_vp != newvp) {
1165 vdrop(bp->b_vp);
1166 bp->b_vp = NULL; /* for clarification */
1167 }
1168 }
1169 /*
1170 * If dirty, put on list of dirty buffers; otherwise insert onto list
1171 * of clean buffers.
1172 */
1173 if (bp->b_flags & B_DELWRI) {
1174 struct buf *tbp;
1175
1176 listheadp = &newvp->v_dirtyblkhd;
1177 if ((newvp->v_flag & VONWORKLST) == 0) {
1178 switch (newvp->v_type) {
1179 case VDIR:
1180 delay = dirdelay;
1181 break;
1182 case VCHR:
1183 case VBLK:
1184 if (newvp->v_specmountpoint != NULL) {
1185 delay = metadelay;
1186 break;
1187 }
1188 /* fall through */
1189 default:
1190 delay = filedelay;
1191 }
1192 vn_syncer_add_to_worklist(newvp, delay);
1193 }
1194 bp->b_xflags |= BX_VNDIRTY;
1195 tbp = TAILQ_FIRST(listheadp);
1196 if (tbp == NULL ||
1197 bp->b_lblkno == 0 ||
1198 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1199 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1200 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1201 ++reassignbufsortgood;
1202 } else if (bp->b_lblkno < 0) {
1203 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1204 ++reassignbufsortgood;
1205 } else if (reassignbufmethod == 1) {
1206 /*
1207 * New sorting algorithm, only handle sequential case,
1208 * otherwise append to end (but before metadata)
1209 */
1210 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1211 (tbp->b_xflags & BX_VNDIRTY)) {
1212 /*
1213 * Found the best place to insert the buffer
1214 */
1215 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1216 ++reassignbufsortgood;
1217 } else {
1218 /*
1219 * Missed, append to end, but before meta-data.
1220 * We know that the head buffer in the list is
1221 * not meta-data due to prior conditionals.
1222 *
1223 * Indirect effects: NFS second stage write
1224 * tends to wind up here, giving maximum
1225 * distance between the unstable write and the
1226 * commit rpc.
1227 */
1228 tbp = TAILQ_LAST(listheadp, buflists);
1229 while (tbp && tbp->b_lblkno < 0)
1230 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1231 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1232 ++reassignbufsortbad;
1233 }
1234 } else {
1235 /*
1236 * Old sorting algorithm, scan queue and insert
1237 */
1238 struct buf *ttbp;
1239 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1240 (ttbp->b_lblkno < bp->b_lblkno)) {
1241 ++reassignbufloops;
1242 tbp = ttbp;
1243 }
1244 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1245 }
1246 } else {
1247 bp->b_xflags |= BX_VNCLEAN;
1248 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1249 if ((newvp->v_flag & VONWORKLST) &&
1250 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1251 newvp->v_flag &= ~VONWORKLST;
1252 LIST_REMOVE(newvp, v_synclist);
1253 }
1254 }
1255 if (bp->b_vp != newvp) {
1256 bp->b_vp = newvp;
1257 vhold(bp->b_vp);
1258 }
1259 splx(s);
1260}
1261
1262/*
1263 * Create a vnode for a block device.
1264 * Used for mounting the root file system.
1265 * XXX: This now changed to a VCHR due to the block/char merging.
1266 */
1267int
1268bdevvp(dev, vpp)
1269 dev_t dev;
1270 struct vnode **vpp;
1271{
1272 register struct vnode *vp;
1273 struct vnode *nvp;
1274 int error;
1275
1276 if (dev == NODEV) {
1277 *vpp = NULLVP;
1278 return (ENXIO);
1279 }
1280 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1281 if (error) {
1282 *vpp = NULLVP;
1283 return (error);
1284 }
1285 vp = nvp;
1286 vp->v_type = VCHR;
1287 addalias(vp, dev);
1288 *vpp = vp;
1289 return (0);
1290}
1291
1292/*
1293 * Add vnode to the alias list hung off the dev_t.
1294 *
1295 * The reason for this gunk is that multiple vnodes can reference
1296 * the same physical device, so checking vp->v_usecount to see
1297 * how many users there are is inadequate; the v_usecounts of
1298 * all the vnodes need to be accumulated. vcount() does that.
1299 */
1300struct vnode *
1301addaliasu(nvp, nvp_rdev)
1302 struct vnode *nvp;
1303 udev_t nvp_rdev;
1304{
1305 struct vnode *ovp;
1306 vop_t **ops;
1307 dev_t dev;
1308
1309 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1310 panic("addaliasu on non-special vnode");
1311 dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
1312 /*
1313 * Check to see if we have a bdevvp vnode with no associated
1314 * filesystem. If so, we want to associate the filesystem of
1315 * the newly created vnode with the bdevvp vnode and
1316 * discard the newly created vnode rather than leaving the
1317 * bdevvp vnode lying around with no associated filesystem.
1318 */
1319 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1320 addalias(nvp, dev);
1321 return (nvp);
1322 }
1323 /*
1324 * Discard unneeded vnode, but save its node specific data.
1325 * Note that if there is a lock, it is carried over in the
1326 * node specific data to the replacement vnode.
1327 */
1328 vref(ovp);
1329 ovp->v_data = nvp->v_data;
1330 ovp->v_tag = nvp->v_tag;
1331 nvp->v_data = NULL;
1332 ops = nvp->v_op;
1333 nvp->v_op = ovp->v_op;
1334 ovp->v_op = ops;
1335 insmntque(ovp, nvp->v_mount);
1336 vrele(nvp);
1337 vgone(nvp);
1338 return (ovp);
1339}
1340
1341void
1342addalias(nvp, dev)
1343 struct vnode *nvp;
1344 dev_t dev;
1345{
1346
1347 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1348 panic("addalias on non-special vnode");
1349
1350 nvp->v_rdev = dev;
1351 simple_lock(&spechash_slock);
1352 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1353 simple_unlock(&spechash_slock);
1354}
1355
1356/*
1357 * Grab a particular vnode from the free list, increment its
1358 * reference count and lock it. The vnode lock bit is set if the
1359 * vnode is being eliminated in vgone. The process is awakened
1360 * when the transition is completed, and an error returned to
1361 * indicate that the vnode is no longer usable (possibly having
1362 * been changed to a new file system type).
1363 */
1364int
1365vget(vp, flags, p)
1366 register struct vnode *vp;
1367 int flags;
1368 struct proc *p;
1369{
1370 int error;
1371
1372 /*
1373 * If the vnode is in the process of being cleaned out for
1374 * another use, we wait for the cleaning to finish and then
1375 * return failure. Cleaning is determined by checking that
1376 * the VXLOCK flag is set.
1377 */
1378 if ((flags & LK_INTERLOCK) == 0) {
1379 simple_lock(&vp->v_interlock);
1380 }
1381 if (vp->v_flag & VXLOCK) {
1382 vp->v_flag |= VXWANT;
1383 simple_unlock(&vp->v_interlock);
1384 tsleep((caddr_t)vp, PINOD, "vget", 0);
1385 return (ENOENT);
1386 }
1387
1388 vp->v_usecount++;
1389
1390 if (VSHOULDBUSY(vp))
1391 vbusy(vp);
1392 if (flags & LK_TYPE_MASK) {
1393 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1394 /*
1395 * must expand vrele here because we do not want
1396 * to call VOP_INACTIVE if the reference count
1397 * drops back to zero since it was never really
1398 * active. We must remove it from the free list
1399 * before sleeping so that multiple processes do
1400 * not try to recycle it.
1401 */
1402 simple_lock(&vp->v_interlock);
1403 vp->v_usecount--;
1404 if (VSHOULDFREE(vp))
1405 vfree(vp);
1406 simple_unlock(&vp->v_interlock);
1407 }
1408 return (error);
1409 }
1410 simple_unlock(&vp->v_interlock);
1411 return (0);
1412}
1413
1414void
1415vref(struct vnode *vp)
1416{
1417 simple_lock(&vp->v_interlock);
1418 vp->v_usecount++;
1419 simple_unlock(&vp->v_interlock);
1420}
1421
1422/*
1423 * Vnode put/release.
1424 * If count drops to zero, call inactive routine and return to freelist.
1425 */
1426void
1427vrele(vp)
1428 struct vnode *vp;
1429{
1430 struct proc *p = curproc; /* XXX */
1431
1432 KASSERT(vp != NULL, ("vrele: null vp"));
1433 KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1434
1435 simple_lock(&vp->v_interlock);
1436
1437 if (vp->v_usecount > 1) {
1438
1439 vp->v_usecount--;
1440 simple_unlock(&vp->v_interlock);
1441
1442 return;
1443 }
1444
1445 if (vp->v_usecount == 1) {
1446
1447 vp->v_usecount--;
1448 if (VSHOULDFREE(vp))
1449 vfree(vp);
1450 /*
1451 * If we are doing a vput, the node is already locked, and we must
1452 * call VOP_INACTIVE with the node locked. So, in the case of
1453 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1454 */
1455 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1456 VOP_INACTIVE(vp, p);
1457 }
1458
1459 } else {
1460#ifdef DIAGNOSTIC
1461 vprint("vrele: negative ref count", vp);
1462 simple_unlock(&vp->v_interlock);
1463#endif
1464 panic("vrele: negative ref cnt");
1465 }
1466}
1467
1468void
1469vput(vp)
1470 struct vnode *vp;
1471{
1472 struct proc *p = curproc; /* XXX */
1473
1474 KASSERT(vp != NULL, ("vput: null vp"));
1475 KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1476
1477 simple_lock(&vp->v_interlock);
1478
1479 if (vp->v_usecount > 1) {
1480
1481 vp->v_usecount--;
1482 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1483 return;
1484
1485 }
1486
1487 if (vp->v_usecount == 1) {
1488
1489 vp->v_usecount--;
1490 if (VSHOULDFREE(vp))
1491 vfree(vp);
1492 /*
1493 * If we are doing a vput, the node is already locked, and we must
1494 * call VOP_INACTIVE with the node locked. So, in the case of
1495 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1496 */
1497 simple_unlock(&vp->v_interlock);
1498 VOP_INACTIVE(vp, p);
1499
1500 } else {
1501#ifdef DIAGNOSTIC
1502 vprint("vput: negative ref count", vp);
1503#endif
1504 panic("vput: negative ref cnt");
1505 }
1506}
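
/*
 * The usual pairing, sketched for illustration: a caller needing a
 * locked, referenced vnode does something like
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
 *		return (error);
 *	...
 *	vput(vp);
 *
 * while a caller holding only a reference (from vref() or a lock-less
 * vget()) drops it with vrele() instead.
 */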
1507
1508/*
1509 * Somebody doesn't want the vnode recycled.
1510 */
1511void
1512vhold(vp)
1513 register struct vnode *vp;
1514{
1515 int s;
1516
1517 s = splbio();
1518 vp->v_holdcnt++;
1519 if (VSHOULDBUSY(vp))
1520 vbusy(vp);
1521 splx(s);
1522}
1523
1524/*
1525 * One less who cares about this vnode.
1526 */
1527void
1528vdrop(vp)
1529 register struct vnode *vp;
1530{
1531 int s;
1532
1533 s = splbio();
1534 if (vp->v_holdcnt <= 0)
1535 panic("vdrop: holdcnt");
1536 vp->v_holdcnt--;
1537 if (VSHOULDFREE(vp))
1538 vfree(vp);
1539 splx(s);
1540}
1541
1542/*
1543 * Remove any vnodes in the vnode table belonging to mount point mp.
1544 *
1545 * If MNT_NOFORCE is specified, there should not be any active ones,
1546 * return error if any are found (nb: this is a user error, not a
1547 * system error). If MNT_FORCE is specified, detach any active vnodes
1548 * that are found.
1549 */
1550#ifdef DIAGNOSTIC
1551static int busyprt = 0; /* print out busy vnodes */
1552SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1553#endif
1554
1555int
1556vflush(mp, skipvp, flags)
1557 struct mount *mp;
1558 struct vnode *skipvp;
1559 int flags;
1560{
1561 struct proc *p = curproc; /* XXX */
1562 struct vnode *vp, *nvp;
1563 int busy = 0;
1564
1565 simple_lock(&mntvnode_slock);
1566loop:
1567 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1568 /*
1569 * Make sure this vnode wasn't reclaimed in getnewvnode().
1570 * Start over if it has (it won't be on the list anymore).
1571 */
1572 if (vp->v_mount != mp)
1573 goto loop;
1574 nvp = LIST_NEXT(vp, v_mntvnodes);
1575 /*
1576 * Skip over a selected vnode.
1577 */
1578 if (vp == skipvp)
1579 continue;
1580
1581 simple_lock(&vp->v_interlock);
1582 /*
1583 * Skip over vnodes marked VSYSTEM.
1584 */
1585 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1586 simple_unlock(&vp->v_interlock);
1587 continue;
1588 }
1589 /*
1590 * If WRITECLOSE is set, only flush out regular file vnodes
1591 * open for writing.
1592 */
1593 if ((flags & WRITECLOSE) &&
1594 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1595 simple_unlock(&vp->v_interlock);
1596 continue;
1597 }
1598
1599 /*
1600 * With v_usecount == 0, all we need to do is clear out the
1601 * vnode data structures and we are done.
1602 */
1603 if (vp->v_usecount == 0) {
1604 simple_unlock(&mntvnode_slock);
1605 vgonel(vp, p);
1606 simple_lock(&mntvnode_slock);
1607 continue;
1608 }
1609
1610 /*
1611 * If FORCECLOSE is set, forcibly close the vnode. For block
1612 * or character devices, revert to an anonymous device. For
1613 * all other files, just kill them.
1614 */
1615 if (flags & FORCECLOSE) {
1616 simple_unlock(&mntvnode_slock);
1617 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1618 vgonel(vp, p);
1619 } else {
1620 vclean(vp, 0, p);
1621 vp->v_op = spec_vnodeop_p;
1622 insmntque(vp, (struct mount *) 0);
1623 }
1624 simple_lock(&mntvnode_slock);
1625 continue;
1626 }
1627#ifdef DIAGNOSTIC
1628 if (busyprt)
1629 vprint("vflush: busy vnode", vp);
1630#endif
1631 simple_unlock(&vp->v_interlock);
1632 busy++;
1633 }
1634 simple_unlock(&mntvnode_slock);
1635 if (busy)
1636 return (EBUSY);
1637 return (0);
1638}
1639
1640/*
1641 * Disassociate the underlying file system from a vnode.
1642 */
1643static void
1644vclean(vp, flags, p)
1645 struct vnode *vp;
1646 int flags;
1647 struct proc *p;
1648{
1649 int active;
1650
1651 /*
1652 * Check to see if the vnode is in use. If so we have to reference it
1653 * before we clean it out so that its count cannot fall to zero and
1654 * generate a race against ourselves to recycle it.
1655 */
1656 if ((active = vp->v_usecount))
1657 vp->v_usecount++;
1658
1659 /*
1660 * Prevent the vnode from being recycled or brought into use while we
1661 * clean it out.
1662 */
1663 if (vp->v_flag & VXLOCK)
1664 panic("vclean: deadlock");
1665 vp->v_flag |= VXLOCK;
1666 /*
1667 * Even if the count is zero, the VOP_INACTIVE routine may still
1668 * have the object locked while it cleans it out. The VOP_LOCK
1669 * ensures that the VOP_INACTIVE routine is done with its work.
1670 * For active vnodes, it ensures that no other activity can
1671 * occur while the underlying object is being cleaned out.
1672 */
1673 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1674
1675 /*
1676 * Clean out any buffers associated with the vnode.
1677 * If the flush fails, just toss the buffers.
1678 */
1679 if (flags & DOCLOSE) {
1680 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1681 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1682 if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1683 vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1684 }
1685
1686 VOP_DESTROYVOBJECT(vp);
1687
1688 /*
1689 * If purging an active vnode, it must be closed and
1690 * deactivated before being reclaimed. Note that the
1691 * VOP_INACTIVE will unlock the vnode.
1692 */
1693 if (active) {
1694 if (flags & DOCLOSE)
1695 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1696 VOP_INACTIVE(vp, p);
1697 } else {
1698 /*
1699 * Any other processes trying to obtain this lock must first
1700 * wait for VXLOCK to clear, then call the new lock operation.
1701 */
1702 VOP_UNLOCK(vp, 0, p);
1703 }
1704 /*
1705 * Reclaim the vnode.
1706 */
1707 if (VOP_RECLAIM(vp, p))
1708 panic("vclean: cannot reclaim");
1709
1710 if (active) {
1711 /*
1712 * Inline copy of vrele() since VOP_INACTIVE
1713 * has already been called.
1714 */
1715 simple_lock(&vp->v_interlock);
1716 if (--vp->v_usecount <= 0) {
1717#ifdef DIAGNOSTIC
1718 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1719 vprint("vclean: bad ref count", vp);
1720 panic("vclean: ref cnt");
1721 }
1722#endif
1723 vfree(vp);
1724 }
1725 simple_unlock(&vp->v_interlock);
1726 }
1727
1728 cache_purge(vp);
1729 if (vp->v_vnlock) {
1730 FREE(vp->v_vnlock, M_VNODE);
1731 vp->v_vnlock = NULL;
1732 }
1733
1734 if (VSHOULDFREE(vp))
1735 vfree(vp);
1736
1737 /*
1738 * Done with purge, notify sleepers of the grim news.
1739 */
1740 vp->v_op = dead_vnodeop_p;
1741 vn_pollgone(vp);
1742 vp->v_tag = VT_NON;
1743 vp->v_flag &= ~VXLOCK;
1744 if (vp->v_flag & VXWANT) {
1745 vp->v_flag &= ~VXWANT;
1746 wakeup((caddr_t) vp);
1747 }
1748}
1749
1750/*
1751 * Eliminate all activity associated with the requested vnode
1752 * and with all vnodes aliased to the requested vnode.
1753 */
1754int
1755vop_revoke(ap)
1756 struct vop_revoke_args /* {
1757 struct vnode *a_vp;
1758 int a_flags;
1759 } */ *ap;
1760{
1761 struct vnode *vp, *vq;
1762 dev_t dev;
1763
1764 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1765
1766 vp = ap->a_vp;
1767 /*
1768 * If a vgone (or vclean) is already in progress,
1769 * wait until it is done and return.
1770 */
1771 if (vp->v_flag & VXLOCK) {
1772 vp->v_flag |= VXWANT;
1773 simple_unlock(&vp->v_interlock);
1774 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1775 return (0);
1776 }
1777 dev = vp->v_rdev;
1778 for (;;) {
1779 simple_lock(&spechash_slock);
1780 vq = SLIST_FIRST(&dev->si_hlist);
1781 simple_unlock(&spechash_slock);
1782 if (!vq)
1783 break;
1784 vgone(vq);
1785 }
1786 return (0);
1787}
1788
1789/*
1790 * Recycle an unused vnode to the front of the free list.
1791 * Release the passed interlock if the vnode will be recycled.
1792 */
1793int
1794vrecycle(vp, inter_lkp, p)
1795 struct vnode *vp;
1796 struct simplelock *inter_lkp;
1797 struct proc *p;
1798{
1799
1800 simple_lock(&vp->v_interlock);
1801 if (vp->v_usecount == 0) {
1802 if (inter_lkp) {
1803 simple_unlock(inter_lkp);
1804 }
1805 vgonel(vp, p);
1806 return (1);
1807 }
1808 simple_unlock(&vp->v_interlock);
1809 return (0);
1810}
1811
1812/*
1813 * Eliminate all activity associated with a vnode
1814 * in preparation for reuse.
1815 */
1816void
1817vgone(vp)
1818 register struct vnode *vp;
1819{
1820 struct proc *p = curproc; /* XXX */
1821
1822 simple_lock(&vp->v_interlock);
1823 vgonel(vp, p);
1824}
1825
1826/*
1827 * vgone, with the vp interlock held.
1828 */
1829void
1830vgonel(vp, p)
1831 struct vnode *vp;
1832 struct proc *p;
1833{
1834 int s;
1835
1836 /*
1837 * If a vgone (or vclean) is already in progress,
1838 * wait until it is done and return.
1839 */
1840 if (vp->v_flag & VXLOCK) {
1841 vp->v_flag |= VXWANT;
1842 simple_unlock(&vp->v_interlock);
1843 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1844 return;
1845 }
1846
1847 /*
1848 * Clean out the filesystem specific data.
1849 */
1850 vclean(vp, DOCLOSE, p);
1851 simple_lock(&vp->v_interlock);
1852
1853 /*
1854 * Delete from old mount point vnode list, if on one.
1855 */
1856 if (vp->v_mount != NULL)
1857 insmntque(vp, (struct mount *)0);
1858 /*
1859 * If special device, remove it from special device alias list
1860 * if it is on one.
1861 */
1862 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1863 simple_lock(&spechash_slock);
1864 SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
1864 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
1865 freedev(vp->v_rdev);
1866 simple_unlock(&spechash_slock);
1867 vp->v_rdev = NULL;
1868 }
1869
1870 /*
1871 * If it is on the freelist and not already at the head,
1872 * move it to the head of the list. The test of the
1873 * VDOOMED flag and the reference count of zero is because
1874 * it will be removed from the free list by getnewvnode,
1875 * but will not have its reference count incremented until
1876 * after calling vgone. If the reference count were
1877 * incremented first, vgone would (incorrectly) try to
1878 * close the previous instance of the underlying object.
1879 */
1880 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1881 s = splbio();
1882 simple_lock(&vnode_free_list_slock);
1883 if (vp->v_flag & VFREE)
1884 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1885 else
1886 freevnodes++;
1887 vp->v_flag |= VFREE;
1888 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1889 simple_unlock(&vnode_free_list_slock);
1890 splx(s);
1891 }
1892
1893 vp->v_type = VBAD;
1894 simple_unlock(&vp->v_interlock);
1895}
1896
1897/*
1898 * Lookup a vnode by device number.
1899 */
1900int
1901vfinddev(dev, type, vpp)
1902 dev_t dev;
1903 enum vtype type;
1904 struct vnode **vpp;
1905{
1906 struct vnode *vp;
1907
1908 simple_lock(&spechash_slock);
1909 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1910 if (type == vp->v_type) {
1911 *vpp = vp;
1912 simple_unlock(&spechash_slock);
1913 return (1);
1914 }
1915 }
1916 simple_unlock(&spechash_slock);
1917 return (0);
1918}
1919
1920/*
1921 * Calculate the total number of references to a special device.
1922 */
1923int
1924vcount(vp)
1925 struct vnode *vp;
1926{
1927 struct vnode *vq;
1928 int count;
1929
1930 count = 0;
1931 simple_lock(&spechash_slock);
1932 SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1932 SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
1933 count += vq->v_usecount;
1934 simple_unlock(&spechash_slock);
1935 return (count);
1936}
1937
1938/*
1939 * Same as above, but using the dev_t as argument
1940 */
1941
1942int
1943count_dev(dev)
1944 dev_t dev;
1945{
1946 struct vnode *vp;
1947
1948 vp = SLIST_FIRST(&dev->si_hlist);
1949 if (vp == NULL)
1950 return (0);
1951 return(vcount(vp));
1952}
1953
1954/*
1955 * Print out a description of a vnode.
1956 */
1957static char *typename[] =
1958{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1959
1960void
1961vprint(label, vp)
1962 char *label;
1963 struct vnode *vp;
1964{
1965 char buf[96];
1966
1967 if (label != NULL)
1968 printf("%s: %p: ", label, (void *)vp);
1969 else
1970 printf("%p: ", (void *)vp);
1971 printf("type %s, usecount %d, writecount %d, refcount %d,",
1972 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1973 vp->v_holdcnt);
1974 buf[0] = '\0';
1975 if (vp->v_flag & VROOT)
1976 strcat(buf, "|VROOT");
1977 if (vp->v_flag & VTEXT)
1978 strcat(buf, "|VTEXT");
1979 if (vp->v_flag & VSYSTEM)
1980 strcat(buf, "|VSYSTEM");
1981 if (vp->v_flag & VXLOCK)
1982 strcat(buf, "|VXLOCK");
1983 if (vp->v_flag & VXWANT)
1984 strcat(buf, "|VXWANT");
1985 if (vp->v_flag & VBWAIT)
1986 strcat(buf, "|VBWAIT");
1987 if (vp->v_flag & VDOOMED)
1988 strcat(buf, "|VDOOMED");
1989 if (vp->v_flag & VFREE)
1990 strcat(buf, "|VFREE");
1991 if (vp->v_flag & VOBJBUF)
1992 strcat(buf, "|VOBJBUF");
1993 if (buf[0] != '\0')
1994 printf(" flags (%s)", &buf[1]);
1995 if (vp->v_data == NULL) {
1996 printf("\n");
1997 } else {
1998 printf("\n\t");
1999 VOP_PRINT(vp);
2000 }
2001}
2002
2003#ifdef DDB
2004#include <ddb/ddb.h>
2005/*
2006 * List all of the locked vnodes in the system.
2007 * Called when debugging the kernel.
2008 */
2009DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2010{
2011 struct proc *p = curproc; /* XXX */
2012 struct mount *mp, *nmp;
2013 struct vnode *vp;
2014
2015 printf("Locked vnodes\n");
2016 simple_lock(&mountlist_slock);
2017 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2018 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2019 nmp = TAILQ_NEXT(mp, mnt_list);
2020 continue;
2021 }
2022 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2023 if (VOP_ISLOCKED(vp, NULL))
2024 vprint((char *)0, vp);
2025 }
2026 simple_lock(&mountlist_slock);
2027 nmp = TAILQ_NEXT(mp, mnt_list);
2028 vfs_unbusy(mp, p);
2029 }
2030 simple_unlock(&mountlist_slock);
2031}
2032#endif
2033
2034/*
2035 * Top level filesystem related information gathering.
2036 */
2037static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
2038
2039static int
2040vfs_sysctl(SYSCTL_HANDLER_ARGS)
2041{
2042 int *name = (int *)arg1 - 1; /* XXX */
2043 u_int namelen = arg2 + 1; /* XXX */
2044 struct vfsconf *vfsp;
2045
2046#if 1 || defined(COMPAT_PRELITE2)
2047 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2048 if (namelen == 1)
2049 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2050#endif
2051
2052#ifdef notyet
2053 /* all sysctl names at this level are at least name and field */
2054 if (namelen < 2)
2055 return (ENOTDIR); /* overloaded */
2056 if (name[0] != VFS_GENERIC) {
2057 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2058 if (vfsp->vfc_typenum == name[0])
2059 break;
2060 if (vfsp == NULL)
2061 return (EOPNOTSUPP);
2062 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2063 oldp, oldlenp, newp, newlen, p));
2064 }
2065#endif
2066 switch (name[1]) {
2067 case VFS_MAXTYPENUM:
2068 if (namelen != 2)
2069 return (ENOTDIR);
2070 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2071 case VFS_CONF:
2072 if (namelen != 3)
2073 return (ENOTDIR); /* overloaded */
2074 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2075 if (vfsp->vfc_typenum == name[2])
2076 break;
2077 if (vfsp == NULL)
2078 return (EOPNOTSUPP);
2079 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2080 }
2081 return (EOPNOTSUPP);
2082}
2083
2084SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2085 "Generic filesystem");
2086
2087#if 1 || defined(COMPAT_PRELITE2)
2088
2089static int
2090sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2091{
2092 int error;
2093 struct vfsconf *vfsp;
2094 struct ovfsconf ovfs;
2095
2096 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2097 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2098 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2099 ovfs.vfc_index = vfsp->vfc_typenum;
2100 ovfs.vfc_refcount = vfsp->vfc_refcount;
2101 ovfs.vfc_flags = vfsp->vfc_flags;
2102 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2103 if (error)
2104 return error;
2105 }
2106 return 0;
2107}
2108
2109#endif /* 1 || COMPAT_PRELITE2 */
2110
2111#if 0
2112#define KINFO_VNODESLOP 10
2113/*
2114 * Dump vnode list (via sysctl).
2115 * Copyout address of vnode followed by vnode.
2116 */
2117/* ARGSUSED */
2118static int
2119sysctl_vnode(SYSCTL_HANDLER_ARGS)
2120{
2121 struct proc *p = curproc; /* XXX */
2122 struct mount *mp, *nmp;
2123 struct vnode *nvp, *vp;
2124 int error;
2125
2126#define VPTRSZ sizeof (struct vnode *)
2127#define VNODESZ sizeof (struct vnode)
2128
2129 req->lock = 0;
2130 if (!req->oldptr) /* Make an estimate */
2131 return (SYSCTL_OUT(req, 0,
2132 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2133
2134 simple_lock(&mountlist_slock);
2135 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2136 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2137 nmp = TAILQ_NEXT(mp, mnt_list);
2138 continue;
2139 }
2140again:
2141 simple_lock(&mntvnode_slock);
2142 for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2143 vp != NULL;
2144 vp = nvp) {
2145 /*
2146 * Check that the vp is still associated with
2147 * this filesystem. RACE: could have been
2148 * recycled onto the same filesystem.
2149 */
2150 if (vp->v_mount != mp) {
2151 simple_unlock(&mntvnode_slock);
2152 goto again;
2153 }
2154 nvp = LIST_NEXT(vp, v_mntvnodes);
2155 simple_unlock(&mntvnode_slock);
2156 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2157 (error = SYSCTL_OUT(req, vp, VNODESZ)))
2158 return (error);
2159 simple_lock(&mntvnode_slock);
2160 }
2161 simple_unlock(&mntvnode_slock);
2162 simple_lock(&mountlist_slock);
2163 nmp = TAILQ_NEXT(mp, mnt_list);
2164 vfs_unbusy(mp, p);
2165 }
2166 simple_unlock(&mountlist_slock);
2167
2168 return (0);
2169}
2170#endif
2171
2172/*
2173 * XXX
2174 * Exporting the vnode list on large systems causes them to crash.
2175 * Exporting the vnode list on medium systems causes sysctl to coredump.
2176 */
2177#if 0
2178SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2179 0, 0, sysctl_vnode, "S,vnode", "");
2180#endif
2181
2182/*
2183 * Check to see if a filesystem is mounted on a block device.
2184 */
2185int
2186vfs_mountedon(vp)
2187 struct vnode *vp;
2188{
2189
2190 if (vp->v_specmountpoint != NULL)
2191 return (EBUSY);
2192 return (0);
2193}
2194
2195/*
2196 * Unmount all filesystems. The list is traversed in reverse order
2197 * of mounting to avoid dependencies.
2198 */
2199void
2200vfs_unmountall()
2201{
2202 struct mount *mp;
2203 struct proc *p;
2204 int error;
2205
2206 if (curproc != NULL)
2207 p = curproc;
2208 else
2209 p = initproc; /* XXX XXX should this be proc0? */
2210 /*
2211 * Since this only runs when rebooting, it is not interlocked.
2212 */
2213 while(!TAILQ_EMPTY(&mountlist)) {
2214 mp = TAILQ_LAST(&mountlist, mntlist);
2215 error = dounmount(mp, MNT_FORCE, p);
2216 if (error) {
2217 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2218 printf("unmount of %s failed (",
2219 mp->mnt_stat.f_mntonname);
2220 if (error == EBUSY)
2221 printf("BUSY)\n");
2222 else
2223 printf("%d)\n", error);
2224 } else {
2225 /* The unmount has removed mp from the mountlist */
2226 }
2227 }
2228}
2229
2230/*
2231 * Build hash lists of net addresses and hang them off the mount point.
2232 * Called by ufs_mount() to set up the lists of export addresses.
2233 */
2234static int
2235vfs_hang_addrlist(mp, nep, argp)
2236 struct mount *mp;
2237 struct netexport *nep;
2238 struct export_args *argp;
2239{
2240 register struct netcred *np;
2241 register struct radix_node_head *rnh;
2242 register int i;
2243 struct radix_node *rn;
2244 struct sockaddr *saddr, *smask = 0;
2245 struct domain *dom;
2246 int error;
2247
2248 if (argp->ex_addrlen == 0) {
2249 if (mp->mnt_flag & MNT_DEFEXPORTED)
2250 return (EPERM);
2251 np = &nep->ne_defexported;
2252 np->netc_exflags = argp->ex_flags;
2253 np->netc_anon = argp->ex_anon;
2254 np->netc_anon.cr_ref = 1;
2255 mp->mnt_flag |= MNT_DEFEXPORTED;
2256 return (0);
2257 }
2258 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2259 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2260 bzero((caddr_t) np, i);
2261 saddr = (struct sockaddr *) (np + 1);
2262 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2263 goto out;
2264 if (saddr->sa_len > argp->ex_addrlen)
2265 saddr->sa_len = argp->ex_addrlen;
2266 if (argp->ex_masklen) {
2267 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2268 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2269 if (error)
2270 goto out;
2271 if (smask->sa_len > argp->ex_masklen)
2272 smask->sa_len = argp->ex_masklen;
2273 }
2274 i = saddr->sa_family;
2275 if ((rnh = nep->ne_rtable[i]) == 0) {
2276 /*
2277 * Seems silly to initialize every AF when most are not used,
2278 * do so on demand here
2279 */
2280 for (dom = domains; dom; dom = dom->dom_next)
2281 if (dom->dom_family == i && dom->dom_rtattach) {
2282 dom->dom_rtattach((void **) &nep->ne_rtable[i],
2283 dom->dom_rtoffset);
2284 break;
2285 }
2286 if ((rnh = nep->ne_rtable[i]) == 0) {
2287 error = ENOBUFS;
2288 goto out;
2289 }
2290 }
2291 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2292 np->netc_rnodes);
2293 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
2294 error = EPERM;
2295 goto out;
2296 }
2297 np->netc_exflags = argp->ex_flags;
2298 np->netc_anon = argp->ex_anon;
2299 np->netc_anon.cr_ref = 1;
2300 return (0);
2301out:
2302 free(np, M_NETADDR);
2303 return (error);
2304}
2305
2306/* ARGSUSED */
2307static int
2308vfs_free_netcred(rn, w)
2309 struct radix_node *rn;
2310 void *w;
2311{
2312 register struct radix_node_head *rnh = (struct radix_node_head *) w;
2313
2314 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2315 free((caddr_t) rn, M_NETADDR);
2316 return (0);
2317}
2318
2319/*
2320 * Free the net address hash lists that are hanging off the mount points.
2321 */
2322static void
2323vfs_free_addrlist(nep)
2324 struct netexport *nep;
2325{
2326 register int i;
2327 register struct radix_node_head *rnh;
2328
2329 for (i = 0; i <= AF_MAX; i++)
2330 if ((rnh = nep->ne_rtable[i])) {
2331 (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2332 (caddr_t) rnh);
2333 free((caddr_t) rnh, M_RTABLE);
2334 nep->ne_rtable[i] = 0;
2335 }
2336}
2337
2338int
2339vfs_export(mp, nep, argp)
2340 struct mount *mp;
2341 struct netexport *nep;
2342 struct export_args *argp;
2343{
2344 int error;
2345
2346 if (argp->ex_flags & MNT_DELEXPORT) {
2347 if (mp->mnt_flag & MNT_EXPUBLIC) {
2348 vfs_setpublicfs(NULL, NULL, NULL);
2349 mp->mnt_flag &= ~MNT_EXPUBLIC;
2350 }
2351 vfs_free_addrlist(nep);
2352 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2353 }
2354 if (argp->ex_flags & MNT_EXPORTED) {
2355 if (argp->ex_flags & MNT_EXPUBLIC) {
2356 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2357 return (error);
2358 mp->mnt_flag |= MNT_EXPUBLIC;
2359 }
2360 if ((error = vfs_hang_addrlist(mp, nep, argp)))
2361 return (error);
2362 mp->mnt_flag |= MNT_EXPORTED;
2363 }
2364 return (0);
2365}
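/*
 * Sketch of how a filesystem's MNT_UPDATE path might hand user-supplied
 * export arguments to vfs_export().  "struct mymount" and mm_export stand
 * in for the filesystem's private mount data; they are hypothetical, not
 * part of this file.
 */
struct mymount {				/* hypothetical per-fs mount data */
	struct netexport mm_export;		/* export lists hang off this */
	/* ... filesystem specific fields ... */
};

static int
myfs_do_export(mp, argp)
	struct mount *mp;
	struct export_args *argp;
{
	struct mymount *mmp = (struct mymount *)mp->mnt_data;

	/*
	 * vfs_export() interprets MNT_DELEXPORT, MNT_EXPORTED and
	 * MNT_EXPUBLIC in argp->ex_flags and updates mp->mnt_flag.
	 */
	return (vfs_export(mp, &mmp->mm_export, argp));
}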
2366
2367
2368/*
2369 * Set the publicly exported filesystem (WebNFS). Currently, only
2370 * one public filesystem is possible in the spec (RFC 2054 and RFC 2055).
2371 */
2372int
2373vfs_setpublicfs(mp, nep, argp)
2374 struct mount *mp;
2375 struct netexport *nep;
2376 struct export_args *argp;
2377{
2378 int error;
2379 struct vnode *rvp;
2380 char *cp;
2381
2382 /*
2383 * mp == NULL -> invalidate the current info, the FS is
2384 * no longer exported. May be called from either vfs_export
2385 * or unmount, so check if it hasn't already been done.
2386 */
2387 if (mp == NULL) {
2388 if (nfs_pub.np_valid) {
2389 nfs_pub.np_valid = 0;
2390 if (nfs_pub.np_index != NULL) {
2391 FREE(nfs_pub.np_index, M_TEMP);
2392 nfs_pub.np_index = NULL;
2393 }
2394 }
2395 return (0);
2396 }
2397
2398 /*
2399 * Only one allowed at a time.
2400 */
2401 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2402 return (EBUSY);
2403
2404 /*
2405 * Get real filehandle for root of exported FS.
2406 */
2407 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2408 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2409
2410 if ((error = VFS_ROOT(mp, &rvp)))
2411 return (error);
2412
2413 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2414 return (error);
2415
2416 vput(rvp);
2417
2418 /*
2419 * If an indexfile was specified, pull it in.
2420 */
2421 if (argp->ex_indexfile != NULL) {
2422 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2423 M_WAITOK);
2424 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2425 MAXNAMLEN, (size_t *)0);
2426 if (!error) {
2427 /*
2428 * Check for illegal filenames.
2429 */
2430 for (cp = nfs_pub.np_index; *cp; cp++) {
2431 if (*cp == '/') {
2432 error = EINVAL;
2433 break;
2434 }
2435 }
2436 }
2437 if (error) {
2438 FREE(nfs_pub.np_index, M_TEMP);
2439 return (error);
2440 }
2441 }
2442
2443 nfs_pub.np_mount = mp;
2444 nfs_pub.np_valid = 1;
2445 return (0);
2446}
2447
2448struct netcred *
2449vfs_export_lookup(mp, nep, nam)
2450 register struct mount *mp;
2451 struct netexport *nep;
2452 struct sockaddr *nam;
2453{
2454 register struct netcred *np;
2455 register struct radix_node_head *rnh;
2456 struct sockaddr *saddr;
2457
2458 np = NULL;
2459 if (mp->mnt_flag & MNT_EXPORTED) {
2460 /*
2461 * Lookup in the export list first.
2462 */
2463 if (nam != NULL) {
2464 saddr = nam;
2465 rnh = nep->ne_rtable[saddr->sa_family];
2466 if (rnh != NULL) {
2467 np = (struct netcred *)
2468 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2469 rnh);
2470 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2471 np = NULL;
2472 }
2473 }
2474 /*
2475 * If no address match, use the default if it exists.
2476 */
2477 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2478 np = &nep->ne_defexported;
2479 }
2480 return (np);
2481}
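/*
 * Sketch of a per-request export check as an NFS-style server might do
 * it; the helper and its argument layout are hypothetical.  nep normally
 * lives in the filesystem's private mount data (see the sketch above).
 */
static int
example_check_export(mp, nep, nam, credp, exflagsp)
	struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;		/* client address */
	struct ucred **credp;
	int *exflagsp;
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* host not in any export list */
	*credp = &np->netc_anon;	/* anonymous-mapping credentials */
	*exflagsp = np->netc_exflags;	/* MNT_EXRDONLY and friends */
	return (0);
}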
2482
2483/*
2484 * Perform msync on all vnodes under a mount point.
2485 * The mount point must be locked.
2486 */
2487void
2488vfs_msync(struct mount *mp, int flags) {
2489 struct vnode *vp, *nvp;
2490 struct vm_object *obj;
2491 int anyio, tries;
2492
2493 tries = 5;
2494loop:
2495 anyio = 0;
2496 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2497
2498 nvp = LIST_NEXT(vp, v_mntvnodes);
2499
2500 if (vp->v_mount != mp) {
2501 goto loop;
2502 }
2503
2504 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
2505 continue;
2506
2507 if (flags != MNT_WAIT) {
2508 if (VOP_GETVOBJECT(vp, &obj) != 0 ||
2509 (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2510 continue;
2511 if (VOP_ISLOCKED(vp, NULL))
2512 continue;
2513 }
2514
2515 simple_lock(&vp->v_interlock);
2516 if (VOP_GETVOBJECT(vp, &obj) == 0 &&
2517 (obj->flags & OBJ_MIGHTBEDIRTY)) {
2518 if (!vget(vp,
2519 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2520 if (VOP_GETVOBJECT(vp, &obj) == 0) {
2521 vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2522 anyio = 1;
2523 }
2524 vput(vp);
2525 }
2526 } else {
2527 simple_unlock(&vp->v_interlock);
2528 }
2529 }
2530 if (anyio && (--tries > 0))
2531 goto loop;
2532}
2533
2534/*
2535 * Create the VM object needed for VMIO and mmap support. This
2536 * is done for all VREG files in the system. Some filesystems may
2537 * also take advantage of the additional metadata buffering capability
2538 * of the VMIO code by making the device node VMIO mode as well.
2539 *
2540 * vp must be locked when vfs_object_create is called.
2541 */
2542int
2543vfs_object_create(vp, p, cred)
2544 struct vnode *vp;
2545 struct proc *p;
2546 struct ucred *cred;
2547{
2548 return (VOP_CREATEVOBJECT(vp, cred, p));
2549}
2550
2551void
2552vfree(vp)
2553 struct vnode *vp;
2554{
2555 int s;
2556
2557 s = splbio();
2558 simple_lock(&vnode_free_list_slock);
2559 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2560 if (vp->v_flag & VAGE) {
2561 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2562 } else {
2563 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2564 }
2565 freevnodes++;
2566 simple_unlock(&vnode_free_list_slock);
2567 vp->v_flag &= ~VAGE;
2568 vp->v_flag |= VFREE;
2569 splx(s);
2570}
2571
2572void
2573vbusy(vp)
2574 struct vnode *vp;
2575{
2576 int s;
2577
2578 s = splbio();
2579 simple_lock(&vnode_free_list_slock);
2580 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2581 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2582 freevnodes--;
2583 simple_unlock(&vnode_free_list_slock);
2584 vp->v_flag &= ~(VFREE|VAGE);
2585 splx(s);
2586}
2587
2588/*
2589 * Record a process's interest in events which might happen to
2590 * a vnode. Because poll uses the historic select-style interface
2591 * internally, this routine serves as both the ``check for any
2592 * pending events'' and the ``record my interest in future events''
2593 * functions. (These are done together, while the lock is held,
2594 * to avoid race conditions.)
2595 */
2596int
2597vn_pollrecord(vp, p, events)
2598 struct vnode *vp;
2599 struct proc *p;
2600 short events;
2601{
2602 simple_lock(&vp->v_pollinfo.vpi_lock);
2603 if (vp->v_pollinfo.vpi_revents & events) {
2604 /*
2605 * This leaves events we are not interested
2606 * in available for the other process which
2607 * presumably had requested them
2608 * (otherwise they would never have been
2609 * recorded).
2610 */
2611 events &= vp->v_pollinfo.vpi_revents;
2612 vp->v_pollinfo.vpi_revents &= ~events;
2613
2614 simple_unlock(&vp->v_pollinfo.vpi_lock);
2615 return events;
2616 }
2617 vp->v_pollinfo.vpi_events |= events;
2618 selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2619 simple_unlock(&vp->v_pollinfo.vpi_lock);
2620 return 0;
2621}
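/*
 * Sketch of a filesystem VOP_POLL that simply defers to vn_pollrecord(),
 * much as a generic poll routine would.  Hypothetical example, not part
 * of this file.
 */
static int
example_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{

	/* Report any pending events and/or record interest in future ones. */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}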
2622
2623/*
2624 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
2625 * it is possible for us to miss an event due to race conditions, but
2626 * that condition is expected to be rare, so for the moment it is the
2627 * preferred interface.
2628 */
2629void
2630vn_pollevent(vp, events)
2631 struct vnode *vp;
2632 short events;
2633{
2634 simple_lock(&vp->v_pollinfo.vpi_lock);
2635 if (vp->v_pollinfo.vpi_events & events) {
2636 /*
2637 * We clear vpi_events so that we don't
2638 * call selwakeup() twice if two events are
2639 * posted before the polling process(es) is
2640 * awakened. This also ensures that we take at
2641 * most one selwakeup() if the polling process
2642 * is no longer interested. However, it does
2643 * mean that only one event can be noticed at
2644 * a time. (Perhaps we should only clear those
2645 * event bits which we note?) XXX
2646 */
2647 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
2648 vp->v_pollinfo.vpi_revents |= events;
2649 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2650 }
2651 simple_unlock(&vp->v_pollinfo.vpi_lock);
2652}
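/*
 * Sketch of the producer side: once new data has been made readable on a
 * vnode, note the event so any poll()ing process is woken.  The
 * VN_POLLEVENT() macro mentioned above typically wraps this call in an
 * inline vpi_events test so the function call is skipped when nobody is
 * polling.  Hypothetical helper, not part of this file.
 */
static void
example_note_readable(vp)
	struct vnode *vp;
{

	vn_pollevent(vp, POLLIN | POLLRDNORM);
}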
2653
2654/*
2655 * Wake up anyone polling on vp because it is being revoked.
2656 * This depends on dead_poll() returning POLLHUP for correct
2657 * behavior.
2658 */
2659void
2660vn_pollgone(vp)
2661 struct vnode *vp;
2662{
2663 simple_lock(&vp->v_pollinfo.vpi_lock);
2664 if (vp->v_pollinfo.vpi_events) {
2665 vp->v_pollinfo.vpi_events = 0;
2666 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2667 }
2668 simple_unlock(&vp->v_pollinfo.vpi_lock);
2669}
2670
2671
2672
2673/*
2674 * Routine to create and manage a filesystem syncer vnode.
2675 */
2676#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
2677static int sync_fsync __P((struct vop_fsync_args *));
2678static int sync_inactive __P((struct vop_inactive_args *));
2679static int sync_reclaim __P((struct vop_reclaim_args *));
2680#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
2681#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
2682static int sync_print __P((struct vop_print_args *));
2683#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2684
2685static vop_t **sync_vnodeop_p;
2686static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2687 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
2688 { &vop_close_desc, (vop_t *) sync_close }, /* close */
2689 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
2690 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
2691 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
2692 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
2693 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
2694 { &vop_print_desc, (vop_t *) sync_print }, /* print */
2695 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
2696 { NULL, NULL }
2697};
2698static struct vnodeopv_desc sync_vnodeop_opv_desc =
2699 { &sync_vnodeop_p, sync_vnodeop_entries };
2700
2701VNODEOP_SET(sync_vnodeop_opv_desc);
2702
2703/*
2704 * Create a new filesystem syncer vnode for the specified mount point.
2705 */
2706int
2707vfs_allocate_syncvnode(mp)
2708 struct mount *mp;
2709{
2710 struct vnode *vp;
2711 static long start, incr, next;
2712 int error;
2713
2714 /* Allocate a new vnode */
2715 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2716 mp->mnt_syncer = NULL;
2717 return (error);
2718 }
2719 vp->v_type = VNON;
2720 /*
2721 * Place the vnode onto the syncer worklist. We attempt to
2722 * scatter them about on the list so that they will go off
2723 * at evenly distributed times even if all the filesystems
2724 * are mounted at once.
2725 */
2726 next += incr;
2727 if (next == 0 || next > syncer_maxdelay) {
2728 start /= 2;
2729 incr /= 2;
2730 if (start == 0) {
2731 start = syncer_maxdelay / 2;
2732 incr = syncer_maxdelay;
2733 }
2734 next = start;
2735 }
2736 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2737 mp->mnt_syncer = vp;
2738 return (0);
2739}
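/*
 * Worked example of the scattering above, assuming syncer_maxdelay is 32
 * and start, incr and next all begin at zero: successive mounts are given
 * next = 16, 8, 24, 4, 12, 20, 28, 2, 6, 10, ...  Whenever next would pass
 * syncer_maxdelay, start and incr are halved and the walk restarts from
 * the new start, so the slots fill in ever finer steps.  The worklist slot
 * actually used is next % syncdelay.
 */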
2740
2741/*
2742 * Do a lazy sync of the filesystem.
2743 */
2744static int
2745sync_fsync(ap)
2746 struct vop_fsync_args /* {
2747 struct vnode *a_vp;
2748 struct ucred *a_cred;
2749 int a_waitfor;
2750 struct proc *a_p;
2751 } */ *ap;
2752{
2753 struct vnode *syncvp = ap->a_vp;
2754 struct mount *mp = syncvp->v_mount;
2755 struct proc *p = ap->a_p;
2756 int asyncflag;
2757
2758 /*
2759 * We only need to do something if this is a lazy evaluation.
2760 */
2761 if (ap->a_waitfor != MNT_LAZY)
2762 return (0);
2763
2764 /*
2765 * Move ourselves to the back of the sync list.
2766 */
2767 vn_syncer_add_to_worklist(syncvp, syncdelay);
2768
2769 /*
2770 * Walk the list of vnodes pushing all that are dirty and
2771 * not already on the sync list.
2772 */
2773 simple_lock(&mountlist_slock);
2774 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2775 simple_unlock(&mountlist_slock);
2776 return (0);
2777 }
2778 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2779 vfs_unbusy(mp, p);
2780 simple_unlock(&mountlist_slock);
2781 return (0);
2782 }
2783 asyncflag = mp->mnt_flag & MNT_ASYNC;
2784 mp->mnt_flag &= ~MNT_ASYNC;
2785 vfs_msync(mp, MNT_NOWAIT);
2786 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2787 if (asyncflag)
2788 mp->mnt_flag |= MNT_ASYNC;
2789 vn_finished_write(mp);
2790 vfs_unbusy(mp, p);
2791 return (0);
2792}
2793
2794/*
2795 * The syncer vnode is no longer referenced.
2796 */
2797static int
2798sync_inactive(ap)
2799 struct vop_inactive_args /* {
2800 struct vnode *a_vp;
2801 struct proc *a_p;
2802 } */ *ap;
2803{
2804
2805 vgone(ap->a_vp);
2806 return (0);
2807}
2808
2809/*
2810 * The syncer vnode is no longer needed and is being decommissioned.
2811 *
2812 * Modifications to the worklist must be protected at splbio().
2813 */
2814static int
2815sync_reclaim(ap)
2816 struct vop_reclaim_args /* {
2817 struct vnode *a_vp;
2818 } */ *ap;
2819{
2820 struct vnode *vp = ap->a_vp;
2821 int s;
2822
2823 s = splbio();
2824 vp->v_mount->mnt_syncer = NULL;
2825 if (vp->v_flag & VONWORKLST) {
2826 LIST_REMOVE(vp, v_synclist);
2827 vp->v_flag &= ~VONWORKLST;
2828 }
2829 splx(s);
2830
2831 return (0);
2832}
2833
2834/*
2835 * Print out a syncer vnode.
2836 */
2837static int
2838sync_print(ap)
2839 struct vop_print_args /* {
2840 struct vnode *a_vp;
2841 } */ *ap;
2842{
2843 struct vnode *vp = ap->a_vp;
2844
2845 printf("syncer vnode");
2846 if (vp->v_vnlock != NULL)
2847 lockmgr_printinfo(vp->v_vnlock);
2848 printf("\n");
2849 return (0);
2850}
2851
2852/*
2853 * Extract the dev_t from a VBLK or VCHR vnode.
2854 */
2855dev_t
2856vn_todev(vp)
2857 struct vnode *vp;
2858{
2859 if (vp->v_type != VBLK && vp->v_type != VCHR)
2860 return (NODEV);
2861 return (vp->v_rdev);
2862}
2863
2864/*
2865 * Check if a vnode represents a disk device.
2866 */
2867int
2868vn_isdisk(vp, errp)
2869 struct vnode *vp;
2870 int *errp;
2871{
2872 struct cdevsw *cdevsw;
2873
2874 if (vp->v_type != VBLK && vp->v_type != VCHR) {
2875 if (errp != NULL)
2876 *errp = ENOTBLK;
2877 return (0);
2878 }
2879 if (vp->v_rdev == NULL) {
2880 if (errp != NULL)
2881 *errp = ENXIO;
2882 return (0);
2883 }
2884 cdevsw = devsw(vp->v_rdev);
2885 if (cdevsw == NULL) {
2886 if (errp != NULL)
2887 *errp = ENXIO;
2888 return (0);
2889 }
2890 if (!(cdevsw->d_flags & D_DISK)) {
2891 if (errp != NULL)
2892 *errp = ENOTBLK;
2893 return (0);
2894 }
2895 if (errp != NULL)
2896 *errp = 0;
2897 return (1);
2898}
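/*
 * Sketch pairing vn_isdisk() with vn_todev(): reject anything that is not
 * a disk device, otherwise hand back its dev_t.  Hypothetical helper, not
 * part of this file.
 */
static dev_t
example_disk_dev(vp, errp)
	struct vnode *vp;
	int *errp;
{

	if (!vn_isdisk(vp, errp))
		return (NODEV);		/* *errp is ENOTBLK or ENXIO */
	return (vn_todev(vp));		/* VBLK/VCHR, so never NODEV here */
}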
2899
2900void
2901NDFREE(ndp, flags)
2902 struct nameidata *ndp;
2903 const uint flags;
2904{
2905 if (!(flags & NDF_NO_FREE_PNBUF) &&
2906 (ndp->ni_cnd.cn_flags & HASBUF)) {
2907 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2908 ndp->ni_cnd.cn_flags &= ~HASBUF;
2909 }
2910 if (!(flags & NDF_NO_DVP_UNLOCK) &&
2911 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2912 ndp->ni_dvp != ndp->ni_vp)
2913 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2914 if (!(flags & NDF_NO_DVP_RELE) &&
2915 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2916 vrele(ndp->ni_dvp);
2917 ndp->ni_dvp = NULL;
2918 }
2919 if (!(flags & NDF_NO_VP_UNLOCK) &&
2920 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2921 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
2922 if (!(flags & NDF_NO_VP_RELE) &&
2923 ndp->ni_vp) {
2924 vrele(ndp->ni_vp);
2925 ndp->ni_vp = NULL;
2926 }
2927 if (!(flags & NDF_NO_STARTDIR_RELE) &&
2928 (ndp->ni_cnd.cn_flags & SAVESTART)) {
2929 vrele(ndp->ni_startdir);
2930 ndp->ni_startdir = NULL;
2931 }
2932}
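/*
 * Sketch of the usual namei()/NDFREE() pairing: keep ni_vp (locked and
 * referenced) but let NDFREE() release the pathname buffer and any parent
 * references.  The wrapper and its arguments are hypothetical, not part
 * of this file.
 */
static int
example_lookup(path, p, vpp)
	char *path;
	struct proc *p;
	struct vnode **vpp;
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	/* Free everything except the vnode itself. */
	NDFREE(&nd, NDF_NO_VP_UNLOCK | NDF_NO_VP_RELE);
	*vpp = nd.ni_vp;
	return (0);
}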
2933
2934int
2935vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
2936 enum vtype type;
2937 mode_t file_mode;
2938 uid_t file_uid;
2939 gid_t file_gid;
2940 mode_t acc_mode;
2941 struct ucred *cred;
2942 int *privused;
2943{
2944 mode_t dac_granted;
2945#ifdef CAPABILITIES
2946 mode_t cap_granted;
2947#endif
2948
2949 /*
2950 * Look for a normal, non-privileged way to access the file/directory
2951 * as requested. If it exists, go with that.
2952 */
2953
2954 if (privused != NULL)
2955 *privused = 0;
2956
2957 dac_granted = 0;
2958
2959 /* Check the owner. */
2960 if (cred->cr_uid == file_uid) {
2961 if (file_mode & S_IXUSR)
2962 dac_granted |= VEXEC;
2963 if (file_mode & S_IRUSR)
2964 dac_granted |= VREAD;
2965 if (file_mode & S_IWUSR)
2966 dac_granted |= VWRITE;
2967
2968 if ((acc_mode & dac_granted) == acc_mode)
2969 return (0);
2970
2971 goto privcheck;
2972 }
2973
2974 /* Otherwise, check the groups (first match) */
2975 if (groupmember(file_gid, cred)) {
2976 if (file_mode & S_IXGRP)
2977 dac_granted |= VEXEC;
2978 if (file_mode & S_IRGRP)
2979 dac_granted |= VREAD;
2980 if (file_mode & S_IWGRP)
2981 dac_granted |= VWRITE;
2982
2983 if ((acc_mode & dac_granted) == acc_mode)
2984 return (0);
2985
2986 goto privcheck;
2987 }
2988
2989 /* Otherwise, check everyone else. */
2990 if (file_mode & S_IXOTH)
2991 dac_granted |= VEXEC;
2992 if (file_mode & S_IROTH)
2993 dac_granted |= VREAD;
2994 if (file_mode & S_IWOTH)
2995 dac_granted |= VWRITE;
2996 if ((acc_mode & dac_granted) == acc_mode)
2997 return (0);
2998
2999privcheck:
3000 if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
3001 /* XXX audit: privilege used */
3002 if (privused != NULL)
3003 *privused = 1;
3004 return (0);
3005 }
3006
3007#ifdef CAPABILITIES
3008 /*
3009 * Build a capability mask to determine if the set of capabilities
3010 * satisfies the requirements when combined with the granted mask
3011 * from above.
3012 * For each capability, if the capability is required, bitwise
3013 * or the request type onto the cap_granted mask.
3014 */
3015 cap_granted = 0;
3016 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3017 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3018 cap_granted |= VEXEC;
3019
3020 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3021 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3022 cap_granted |= VREAD;
3023
3024 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3025 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3026 cap_granted |= VWRITE;
3027
3028 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3029 /* XXX audit: privilege used */
3030 if (privused != NULL)
3031 *privused = 1;
3032 return (0);
3033 }
3034#endif
3035
3036 return (EACCES);
3037}
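/*
 * Sketch of a filesystem VOP_ACCESS built on vaccess().  VTOI(), struct
 * inode and the i_mode/i_uid/i_gid fields stand in for whatever the
 * filesystem really stores; hypothetical, not part of this file.
 */
static int
example_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);	/* hypothetical per-fs node */

	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
	    ap->a_mode, ap->a_cred, NULL));
}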