vnode_pager.c revision 140767
1139825Simp/*-
21541Srgrimes * Copyright (c) 1990 University of Utah.
31549Srgrimes * Copyright (c) 1991 The Regents of the University of California.
41549Srgrimes * All rights reserved.
59507Sdg * Copyright (c) 1993, 1994 John S. Dyson
69507Sdg * Copyright (c) 1995, David Greenman
71541Srgrimes *
81541Srgrimes * This code is derived from software contributed to Berkeley by
91541Srgrimes * the Systems Programming Group of the University of Utah Computer
101541Srgrimes * Science Department.
111541Srgrimes *
121541Srgrimes * Redistribution and use in source and binary forms, with or without
131541Srgrimes * modification, are permitted provided that the following conditions
141541Srgrimes * are met:
151541Srgrimes * 1. Redistributions of source code must retain the above copyright
161541Srgrimes *    notice, this list of conditions and the following disclaimer.
171541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
181541Srgrimes *    notice, this list of conditions and the following disclaimer in the
191541Srgrimes *    documentation and/or other materials provided with the distribution.
201541Srgrimes * 3. All advertising materials mentioning features or use of this software
2158705Scharnier *    must display the following acknowledgement:
221541Srgrimes *	This product includes software developed by the University of
231541Srgrimes *	California, Berkeley and its contributors.
241541Srgrimes * 4. Neither the name of the University nor the names of its contributors
251541Srgrimes *    may be used to endorse or promote products derived from this software
261541Srgrimes *    without specific prior written permission.
271541Srgrimes *
281541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
291541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
301541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
311541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
321541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
331541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
341541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
351541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
361541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
371541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
381541Srgrimes * SUCH DAMAGE.
391541Srgrimes *
401549Srgrimes *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
411541Srgrimes */
421541Srgrimes
431541Srgrimes/*
441541Srgrimes * Page to/from files (vnodes).
451541Srgrimes */
461541Srgrimes
471549Srgrimes/*
481549Srgrimes * TODO:
499507Sdg *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
507695Sdg *	greatly re-simplify the vnode_pager.
511549Srgrimes */
521549Srgrimes
53116226Sobrien#include <sys/cdefs.h>
54116226Sobrien__FBSDID("$FreeBSD: head/sys/vm/vnode_pager.c 140767 2005-01-24 21:21:59Z phk $");
55116226Sobrien
561541Srgrimes#include <sys/param.h>
571541Srgrimes#include <sys/systm.h>
581541Srgrimes#include <sys/proc.h>
591541Srgrimes#include <sys/vnode.h>
601541Srgrimes#include <sys/mount.h>
6160041Sphk#include <sys/bio.h>
629507Sdg#include <sys/buf.h>
6312662Sdg#include <sys/vmmeter.h>
64140767Sphk#include <sys/limits.h>
6551340Sdillon#include <sys/conf.h>
66127926Salc#include <sys/sf_buf.h>
671541Srgrimes
681541Srgrimes#include <vm/vm.h>
6912662Sdg#include <vm/vm_object.h>
701541Srgrimes#include <vm/vm_page.h>
719507Sdg#include <vm/vm_pager.h>
7231853Sdyson#include <vm/vm_map.h>
731541Srgrimes#include <vm/vnode_pager.h>
7412662Sdg#include <vm/vm_extern.h>
751541Srgrimes
7692727Salfredstatic void vnode_pager_init(void);
7792727Salfredstatic vm_offset_t vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
7892727Salfred					 int *run);
7992727Salfredstatic int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
8092727Salfredstatic int vnode_pager_input_old(vm_object_t object, vm_page_t m);
8192727Salfredstatic void vnode_pager_dealloc(vm_object_t);
8292727Salfredstatic int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
8392727Salfredstatic void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
8492727Salfredstatic boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
85140767Sphkstatic vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t);
8611943Sbde
871541Srgrimesstruct pagerops vnodepagerops = {
88118466Sphk	.pgo_init =	vnode_pager_init,
89118466Sphk	.pgo_alloc =	vnode_pager_alloc,
90118466Sphk	.pgo_dealloc =	vnode_pager_dealloc,
91118466Sphk	.pgo_getpages =	vnode_pager_getpages,
92118466Sphk	.pgo_putpages =	vnode_pager_putpages,
93118466Sphk	.pgo_haspage =	vnode_pager_haspage,
941541Srgrimes};
951541Srgrimes
9679127Sjhbint vnode_pbuf_freecnt;
9710556Sdyson
98104094Sphkstatic void
9979127Sjhbvnode_pager_init(void)
10079127Sjhb{
10142957Sdillon
10279127Sjhb	vnode_pbuf_freecnt = nswbuf / 2 + 1;
10379127Sjhb}
10479127Sjhb
105140767Sphk/* Create the VM system backing object for this vnode */
106140767Sphkint
107140767Sphkvnode_create_vobject(struct vnode *vp, size_t isize, struct thread *td)
108140767Sphk{
109140767Sphk	vm_object_t object;
110140767Sphk	vm_ooffset_t size = isize;
111140767Sphk	struct vattr va;
112140767Sphk
113140767Sphk	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
114140767Sphk		return (0);
115140767Sphk
116140767Sphk	while ((object = vp->v_object) != NULL) {
117140767Sphk		VM_OBJECT_LOCK(object);
118140767Sphk		if (!(object->flags & OBJ_DEAD)) {
119140767Sphk			VM_OBJECT_UNLOCK(object);
120140767Sphk			return (0);
121140767Sphk		}
122140767Sphk		VOP_UNLOCK(vp, 0, td);
123140767Sphk		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
124140767Sphk		msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0);
125140767Sphk		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
126140767Sphk	}
127140767Sphk
128140767Sphk	if (size == 0) {
129140767Sphk		if (vn_isdisk(vp, NULL)) {
130140767Sphk			size = IDX_TO_OFF(INT_MAX);
131140767Sphk		} else {
132140767Sphk			if (VOP_GETATTR(vp, &va, td->td_ucred, td) != 0)
133140767Sphk				return (0);
134140767Sphk			size = va.va_size;
135140767Sphk		}
136140767Sphk	}
137140767Sphk
138140767Sphk	object = vnode_pager_alloc(vp, size, 0, 0);
139140767Sphk	/*
140140767Sphk	 * Dereference the reference we just created.  This assumes
141140767Sphk	 * that the object is associated with the vp.
142140767Sphk	 */
143140767Sphk	VM_OBJECT_LOCK(object);
144140767Sphk	object->ref_count--;
145140767Sphk	VM_OBJECT_UNLOCK(object);
146140767Sphk	vrele(vp);
147140767Sphk
148140767Sphk	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
149140767Sphk
150140767Sphk	return (0);
151140767Sphk}
152140767Sphk
1531541Srgrimes/*
1541541Srgrimes * Allocate (or lookup) pager for a vnode.
1551541Srgrimes * Handle is a vnode pointer.
15698604Salc *
15798604Salc * MPSAFE
1581541Srgrimes */
1599507Sdgvm_object_t
16040286Sdgvnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
16128751Sbde		  vm_ooffset_t offset)
1621541Srgrimes{
1639456Sdg	vm_object_t object;
1641541Srgrimes	struct vnode *vp;
1651541Srgrimes
1661541Srgrimes	/*
1671541Srgrimes	 * Pageout to vnode, no can do yet.
1681541Srgrimes	 */
1691541Srgrimes	if (handle == NULL)
1701827Sdg		return (NULL);
1711541Srgrimes
1729411Sdg	vp = (struct vnode *) handle;
1739411Sdg
174103923Sjeff	ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");
175103923Sjeff
1761541Srgrimes	/*
1779411Sdg	 * Prevent race condition when allocating the object. This
1789411Sdg	 * can happen with NFS vnodes since the nfsnode isn't locked.
1791541Srgrimes	 */
180101308Sjeff	VI_LOCK(vp);
181101308Sjeff	while (vp->v_iflag & VI_OLOCK) {
182101308Sjeff		vp->v_iflag |= VI_OWANT;
183101308Sjeff		msleep(vp, VI_MTX(vp), PVM, "vnpobj", 0);
1849411Sdg	}
185101308Sjeff	vp->v_iflag |= VI_OLOCK;
186101308Sjeff	VI_UNLOCK(vp);
1879411Sdg
1889411Sdg	/*
1899411Sdg	 * If the object is being terminated, wait for it to
1909411Sdg	 * go away.
1919411Sdg	 */
192114074Salc	while ((object = vp->v_object) != NULL) {
193114074Salc		VM_OBJECT_LOCK(object);
194114074Salc		if ((object->flags & OBJ_DEAD) == 0)
195114074Salc			break;
196137297Salc		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
197114074Salc		msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0);
1989507Sdg	}
1995455Sdg
20032071Sdyson	if (vp->v_usecount == 0)
20132071Sdyson		panic("vnode_pager_alloc: no vnode reference");
20232071Sdyson
2039507Sdg	if (object == NULL) {
2041541Srgrimes		/*
2051541Srgrimes		 * And an object of the appropriate size
2061541Srgrimes		 */
20740286Sdg		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
2081827Sdg
20940286Sdg		object->un_pager.vnp.vnp_size = size;
2101549Srgrimes
2119507Sdg		object->handle = handle;
2129507Sdg		vp->v_object = object;
2131541Srgrimes	} else {
21432286Sdyson		object->ref_count++;
215114074Salc		VM_OBJECT_UNLOCK(object);
2161541Srgrimes	}
217101308Sjeff	VI_LOCK(vp);
21898604Salc	vp->v_usecount++;
219101308Sjeff	vp->v_iflag &= ~VI_OLOCK;
220101308Sjeff	if (vp->v_iflag & VI_OWANT) {
221101308Sjeff		vp->v_iflag &= ~VI_OWANT;
2229411Sdg		wakeup(vp);
2239411Sdg	}
224101308Sjeff	VI_UNLOCK(vp);
2259507Sdg	return (object);
2261541Srgrimes}
2271541Srgrimes
228114774Salc/*
229114774Salc *	The object must be locked.
230114774Salc */
23112820Sphkstatic void
2329507Sdgvnode_pager_dealloc(object)
2339507Sdg	vm_object_t object;
2341541Srgrimes{
23579242Sdillon	struct vnode *vp = object->handle;
2361541Srgrimes
2379507Sdg	if (vp == NULL)
2389507Sdg		panic("vnode_pager_dealloc: pager already dealloced");
2399507Sdg
240114774Salc	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
24133817Sdyson	vm_object_pip_wait(object, "vnpdea");
2421541Srgrimes
2439507Sdg	object->handle = NULL;
24433109Sdyson	object->type = OBJT_DEAD;
245137297Salc	if (object->flags & OBJ_DISCONNECTWNT) {
246137297Salc		vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
247137297Salc		wakeup(object);
248137297Salc	}
249101308Sjeff	ASSERT_VOP_LOCKED(vp, "vnode_pager_dealloc");
2509507Sdg	vp->v_object = NULL;
251140734Sphk	vp->v_vflag &= ~VV_TEXT;
2521549Srgrimes}
2531541Srgrimes
25412820Sphkstatic boolean_t
25512767Sdysonvnode_pager_haspage(object, pindex, before, after)
2569507Sdg	vm_object_t object;
25712767Sdyson	vm_pindex_t pindex;
2589507Sdg	int *before;
2599507Sdg	int *after;
2601541Srgrimes{
2619507Sdg	struct vnode *vp = object->handle;
26296572Sphk	daddr_t bn;
26312423Sphk	int err;
26410556Sdyson	daddr_t reqblock;
26511701Sdyson	int poff;
26611701Sdyson	int bsize;
26712914Sdyson	int pagesperblock, blocksperpage;
268140723Sjeff	int vfslocked;
2691541Srgrimes
270116695Salc	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
27151340Sdillon	/*
27251340Sdillon	 * If no vp or vp is doomed or marked transparent to VM, we do not
27351340Sdillon	 * have the page.
27451340Sdillon	 */
275101308Sjeff	if (vp == NULL)
27632585Sdyson		return FALSE;
27732585Sdyson
278103923Sjeff	VI_LOCK(vp);
279103923Sjeff	if (vp->v_iflag & VI_DOOMED) {
280103923Sjeff		VI_UNLOCK(vp);
281101308Sjeff		return FALSE;
282103923Sjeff	}
283103923Sjeff	VI_UNLOCK(vp);
2841541Srgrimes	/*
2855455Sdg	 * If filesystem no longer mounted or offset beyond end of file we do
2865455Sdg	 * not have the page.
2871541Srgrimes	 */
28812767Sdyson	if ((vp->v_mount == NULL) ||
28981140Sjhb	    (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
2904797Sdg		return FALSE;
2911541Srgrimes
29211576Sdg	bsize = vp->v_mount->mnt_stat.f_iosize;
29310556Sdyson	pagesperblock = bsize / PAGE_SIZE;
29412914Sdyson	blocksperpage = 0;
29512914Sdyson	if (pagesperblock > 0) {
29612914Sdyson		reqblock = pindex / pagesperblock;
29712914Sdyson	} else {
29812914Sdyson		blocksperpage = (PAGE_SIZE / bsize);
29912914Sdyson		reqblock = pindex * blocksperpage;
30012914Sdyson	}
301116695Salc	VM_OBJECT_UNLOCK(object);
302140723Sjeff	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
303119045Sphk	err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
304140723Sjeff	VFS_UNLOCK_GIANT(vfslocked);
305116695Salc	VM_OBJECT_LOCK(object);
3068876Srgrimes	if (err)
3079507Sdg		return TRUE;
30892029Seivind	if (bn == -1)
30910576Sdyson		return FALSE;
31012914Sdyson	if (pagesperblock > 0) {
31112914Sdyson		poff = pindex - (reqblock * pagesperblock);
31212914Sdyson		if (before) {
31312914Sdyson			*before *= pagesperblock;
31412914Sdyson			*before += poff;
31510669Sdyson		}
31612914Sdyson		if (after) {
31712914Sdyson			int numafter;
31812914Sdyson			*after *= pagesperblock;
31912914Sdyson			numafter = pagesperblock - (poff + 1);
32099211Srobert			if (IDX_TO_OFF(pindex + numafter) >
32199211Srobert			    object->un_pager.vnp.vnp_size) {
32299211Srobert				numafter =
32399211Srobert		    		    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
32499211Srobert				    pindex;
32512914Sdyson			}
32612914Sdyson			*after += numafter;
32712914Sdyson		}
32812914Sdyson	} else {
32912914Sdyson		if (before) {
33012914Sdyson			*before /= blocksperpage;
33112914Sdyson		}
33212914Sdyson
33312914Sdyson		if (after) {
33412914Sdyson			*after /= blocksperpage;
33512914Sdyson		}
33610556Sdyson	}
33710576Sdyson	return TRUE;
3381541Srgrimes}
3391541Srgrimes
3401541Srgrimes/*
3411541Srgrimes * Lets the VM system know about a change in size for a file.
3429507Sdg * We adjust our own internal size and flush any cached pages in
3431541Srgrimes * the associated object that are affected by the size change.
3441541Srgrimes *
3451541Srgrimes * Note: this routine may be invoked as a result of a pager put
3461541Srgrimes * operation (possibly at object termination time), so we must be careful.
3471541Srgrimes */
3481541Srgrimesvoid
3491541Srgrimesvnode_pager_setsize(vp, nsize)
3501541Srgrimes	struct vnode *vp;
35112767Sdyson	vm_ooffset_t nsize;
3521541Srgrimes{
353116167Salc	vm_object_t object;
354116167Salc	vm_page_t m;
35538542Sluoqi	vm_pindex_t nobjsize;
3561541Srgrimes
357116167Salc	if ((object = vp->v_object) == NULL)
3581541Srgrimes		return;
359116167Salc	VM_OBJECT_LOCK(object);
360116167Salc	if (nsize == object->un_pager.vnp.vnp_size) {
361116167Salc		/*
362116167Salc		 * Hasn't changed size
363116167Salc		 */
364116167Salc		VM_OBJECT_UNLOCK(object);
3653374Sdg		return;
366116167Salc	}
36738542Sluoqi	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
3689507Sdg	if (nsize < object->un_pager.vnp.vnp_size) {
369116167Salc		/*
370116167Salc		 * File has shrunk. Toss any cached pages beyond the new EOF.
371116167Salc		 */
372116167Salc		if (nobjsize < object->size)
37338542Sluoqi			vm_object_page_remove(object, nobjsize, object->size,
374116167Salc			    FALSE);
3751827Sdg		/*
3761827Sdg		 * this gets rid of garbage at the end of a page that is now
37787834Sdillon		 * only partially backed by the vnode.
37887834Sdillon		 *
37987834Sdillon		 * XXX for some reason (I don't know yet), if we take a
38087834Sdillon		 * completely invalid page and mark it partially valid
38187834Sdillon		 * it can screw up NFS reads, so we don't allow the case.
3821827Sdg		 */
383116167Salc		if ((nsize & PAGE_MASK) &&
384121230Salc		    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
385121230Salc		    m->valid != 0) {
386121230Salc			int base = (int)nsize & PAGE_MASK;
387121230Salc			int size = PAGE_SIZE - base;
38870374Sdillon
389121230Salc			/*
390121230Salc			 * Clear out partial-page garbage in case
391121230Salc			 * the page has been mapped.
392121230Salc			 */
393121230Salc			pmap_zero_page_area(m, base, size);
39470374Sdillon
395121230Salc			/*
396121230Salc			 * XXX work around SMP data integrity race
397121230Salc			 * by unmapping the page from user processes.
398121230Salc			 * The garbage we just cleared may be mapped
399121230Salc			 * to a user process running on another cpu
400121230Salc			 * and this code is not running through normal
401121230Salc			 * I/O channels which handle SMP issues for
402121230Salc			 * us, so unmap page to synchronize all cpus.
403121230Salc			 *
404121230Salc			 * XXX should vm_pager_unmap_page() have
405121230Salc			 * dealt with this?
406121230Salc			 */
407121230Salc			vm_page_lock_queues();
408121230Salc			pmap_remove_all(m);
40987834Sdillon
410121230Salc			/*
411121230Salc			 * Clear out partial-page dirty bits.  This
412121230Salc			 * has the side effect of setting the valid
413121230Salc			 * bits, but that is ok.  There are a bunch
414121230Salc			 * of places in the VM system where we expected
415121230Salc			 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
416121230Salc			 * case is one of them.  If the page is still
417121230Salc			 * partially dirty, make it fully dirty.
418121230Salc			 *
419121230Salc			 * note that we do not clear out the valid
420121230Salc			 * bits.  This would prevent bogus_page
421121230Salc			 * replacement from working properly.
422121230Salc			 */
423121230Salc			vm_page_set_validclean(m, base, size);
424121230Salc			if (m->dirty != 0)
425121230Salc				m->dirty = VM_PAGE_BITS_ALL;
426116167Salc			vm_page_unlock_queues();
4271827Sdg		}
4281541Srgrimes	}
42912767Sdyson	object->un_pager.vnp.vnp_size = nsize;
43038542Sluoqi	object->size = nobjsize;
431116167Salc	VM_OBJECT_UNLOCK(object);
4321541Srgrimes}
4331541Srgrimes
4341549Srgrimes/*
4351549Srgrimes * calculate the linear (byte) disk address of specified virtual
4361549Srgrimes * file address
4371549Srgrimes */
43812820Sphkstatic vm_offset_t
4396151Sdgvnode_pager_addr(vp, address, run)
4401549Srgrimes	struct vnode *vp;
44112767Sdyson	vm_ooffset_t address;
4426151Sdg	int *run;
4431549Srgrimes{
4445455Sdg	int rtaddress;
4455455Sdg	int bsize;
44696572Sphk	daddr_t block;
4475455Sdg	int err;
44812767Sdyson	daddr_t vblock;
44912767Sdyson	int voffset;
4501549Srgrimes
451138531Salc	if (address < 0)
4525455Sdg		return -1;
4535455Sdg
45411701Sdyson	if (vp->v_mount == NULL)
45511701Sdyson		return -1;
45611701Sdyson
4571549Srgrimes	bsize = vp->v_mount->mnt_stat.f_iosize;
4581549Srgrimes	vblock = address / bsize;
4591549Srgrimes	voffset = address % bsize;
4601549Srgrimes
461119045Sphk	err = VOP_BMAP(vp, vblock, NULL, &block, run, NULL);
4621549Srgrimes
4636151Sdg	if (err || (block == -1))
4641549Srgrimes		rtaddress = -1;
4656151Sdg	else {
4666626Sdg		rtaddress = block + voffset / DEV_BSIZE;
46792029Seivind		if (run) {
4686151Sdg			*run += 1;
4696151Sdg			*run *= bsize/PAGE_SIZE;
4706151Sdg			*run -= voffset/PAGE_SIZE;
4716151Sdg		}
4726151Sdg	}
4731549Srgrimes
4741549Srgrimes	return rtaddress;
4751549Srgrimes}
4761549Srgrimes
4771549Srgrimes/*
47896755Strhodes * small block filesystem vnode pager input
4791549Srgrimes */
48012820Sphkstatic int
4819507Sdgvnode_pager_input_smlfs(object, m)
4829507Sdg	vm_object_t object;
4831549Srgrimes	vm_page_t m;
4841549Srgrimes{
4855455Sdg	int i;
486137726Sphk	struct vnode *vp;
487137726Sphk	struct bufobj *bo;
4881549Srgrimes	struct buf *bp;
489127926Salc	struct sf_buf *sf;
4905455Sdg	int fileaddr;
4911549Srgrimes	vm_offset_t bsize;
4925455Sdg	int error = 0;
4931549Srgrimes
4949507Sdg	vp = object->handle;
49511701Sdyson	if (vp->v_mount == NULL)
49611701Sdyson		return VM_PAGER_BAD;
49711701Sdyson
4981549Srgrimes	bsize = vp->v_mount->mnt_stat.f_iosize;
4991549Srgrimes
500137726Sphk	VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);
5011549Srgrimes
502127926Salc	sf = sf_buf_alloc(m, 0);
5031549Srgrimes
5041827Sdg	for (i = 0; i < PAGE_SIZE / bsize; i++) {
50586092Sdillon		vm_ooffset_t address;
5061827Sdg
50745561Sdt		if (vm_page_bits(i * bsize, bsize) & m->valid)
5085455Sdg			continue;
5091549Srgrimes
51086092Sdillon		address = IDX_TO_OFF(m->pindex) + i * bsize;
51186092Sdillon		if (address >= object->un_pager.vnp.vnp_size) {
51286092Sdillon			fileaddr = -1;
51386092Sdillon		} else {
51486092Sdillon			fileaddr = vnode_pager_addr(vp, address, NULL);
51586092Sdillon		}
5161827Sdg		if (fileaddr != -1) {
51742957Sdillon			bp = getpbuf(&vnode_pbuf_freecnt);
5181549Srgrimes
5191827Sdg			/* build a minimal buffer header */
52058345Sphk			bp->b_iocmd = BIO_READ;
521119092Sphk			bp->b_iodone = bdone;
52284827Sjhb			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
52384827Sjhb			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
52491406Sjhb			bp->b_rcred = crhold(curthread->td_ucred);
52591406Sjhb			bp->b_wcred = crhold(curthread->td_ucred);
526127926Salc			bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
5276626Sdg			bp->b_blkno = fileaddr;
528137726Sphk			pbgetbo(bo, bp);
5291549Srgrimes			bp->b_bcount = bsize;
5301549Srgrimes			bp->b_bufsize = bsize;
53170374Sdillon			bp->b_runningbufspace = bp->b_bufsize;
53270374Sdillon			runningbufspace += bp->b_runningbufspace;
5331827Sdg
5341827Sdg			/* do the input */
535121205Sphk			bp->b_iooffset = dbtob(bp->b_blkno);
536136927Sphk			bstrategy(bp);
5371549Srgrimes
53833758Sdyson			/* we definitely need to be at splvm here */
5391549Srgrimes
540119092Sphk			bwait(bp, PVM, "vnsrd");
541119092Sphk
54258934Sphk			if ((bp->b_ioflags & BIO_ERROR) != 0)
5431549Srgrimes				error = EIO;
5441549Srgrimes
5451827Sdg			/*
5461827Sdg			 * free the buffer header back to the swap buffer pool
5471827Sdg			 */
548137726Sphk			pbrelbo(bp);
54942957Sdillon			relpbuf(bp, &vnode_pbuf_freecnt);
5501827Sdg			if (error)
5511549Srgrimes				break;
5525455Sdg
553121264Salc			VM_OBJECT_LOCK(object);
554107189Salc			vm_page_lock_queues();
55515583Sphk			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
556107189Salc			vm_page_unlock_queues();
557121264Salc			VM_OBJECT_UNLOCK(object);
5581549Srgrimes		} else {
559121264Salc			VM_OBJECT_LOCK(object);
560107189Salc			vm_page_lock_queues();
56115583Sphk			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
562107189Salc			vm_page_unlock_queues();
563121264Salc			VM_OBJECT_UNLOCK(object);
564127926Salc			bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
5651549Srgrimes		}
5661549Srgrimes	}
567127926Salc	sf_buf_free(sf);
568107347Salc	vm_page_lock_queues();
56960755Speter	pmap_clear_modify(m);
570107347Salc	vm_page_unlock_queues();
5711827Sdg	if (error) {
5724207Sdg		return VM_PAGER_ERROR;
5731549Srgrimes	}
5741549Srgrimes	return VM_PAGER_OK;
5751549Srgrimes
5761549Srgrimes}
5771549Srgrimes
5781549Srgrimes
5791549Srgrimes/*
580139296Sphk * old style vnode pager input routine
5811549Srgrimes */
58212820Sphkstatic int
5839507Sdgvnode_pager_input_old(object, m)
5849507Sdg	vm_object_t object;
5851549Srgrimes	vm_page_t m;
5861549Srgrimes{
5871541Srgrimes	struct uio auio;
5881541Srgrimes	struct iovec aiov;
5895455Sdg	int error;
5905455Sdg	int size;
591127926Salc	struct sf_buf *sf;
59277398Sjhb	struct vnode *vp;
5931549Srgrimes
594121495Salc	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
5951549Srgrimes	error = 0;
5961827Sdg
5971549Srgrimes	/*
5981549Srgrimes	 * Return failure if beyond current EOF
5991549Srgrimes	 */
60012767Sdyson	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
6011549Srgrimes		return VM_PAGER_BAD;
6021549Srgrimes	} else {
6031549Srgrimes		size = PAGE_SIZE;
60412767Sdyson		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
60512767Sdyson			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
606121495Salc		vp = object->handle;
607121495Salc		VM_OBJECT_UNLOCK(object);
6087178Sdg
6095455Sdg		/*
6105455Sdg		 * Allocate a kernel virtual address and initialize so that
6115455Sdg		 * we can use VOP_READ/WRITE routines.
6125455Sdg		 */
613127926Salc		sf = sf_buf_alloc(m, 0);
6147178Sdg
615127926Salc		aiov.iov_base = (caddr_t)sf_buf_kva(sf);
6161549Srgrimes		aiov.iov_len = size;
6171549Srgrimes		auio.uio_iov = &aiov;
6181549Srgrimes		auio.uio_iovcnt = 1;
61912767Sdyson		auio.uio_offset = IDX_TO_OFF(m->pindex);
6201549Srgrimes		auio.uio_segflg = UIO_SYSSPACE;
6211549Srgrimes		auio.uio_rw = UIO_READ;
6221549Srgrimes		auio.uio_resid = size;
62383366Sjulian		auio.uio_td = curthread;
6241549Srgrimes
62591406Sjhb		error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
6261549Srgrimes		if (!error) {
62779242Sdillon			int count = size - auio.uio_resid;
6281549Srgrimes
6291549Srgrimes			if (count == 0)
6301549Srgrimes				error = EINVAL;
6311549Srgrimes			else if (count != PAGE_SIZE)
632127926Salc				bzero((caddr_t)sf_buf_kva(sf) + count,
633127926Salc				    PAGE_SIZE - count);
6341549Srgrimes		}
635127926Salc		sf_buf_free(sf);
636121230Salc
637121230Salc		VM_OBJECT_LOCK(object);
6381549Srgrimes	}
639107347Salc	vm_page_lock_queues();
64060755Speter	pmap_clear_modify(m);
64149945Salc	vm_page_undirty(m);
642121230Salc	vm_page_unlock_queues();
64339739Srvb	if (!error)
64439739Srvb		m->valid = VM_PAGE_BITS_ALL;
6454207Sdg	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
6461549Srgrimes}
6471549Srgrimes
6481549Srgrimes/*
6491549Srgrimes * generic vnode pager input routine
6501549Srgrimes */
65110556Sdyson
65233847Smsmith/*
65376827Salfred * Local media VFS's that do not implement their own VOP_GETPAGES
65499211Srobert * should have their VOP_GETPAGES call to vnode_pager_generic_getpages()
65599211Srobert * to implement the previous behaviour.
65633847Smsmith *
65733847Smsmith * All other FS's should use the bypass to get to the local media
65833847Smsmith * backing vp's VOP_GETPAGES.
65933847Smsmith */
66012820Sphkstatic int
6619507Sdgvnode_pager_getpages(object, m, count, reqpage)
6629507Sdg	vm_object_t object;
6631549Srgrimes	vm_page_t *m;
6649507Sdg	int count;
6659507Sdg	int reqpage;
6661549Srgrimes{
66710556Sdyson	int rtval;
66810556Sdyson	struct vnode *vp;
66934403Smsmith	int bytes = count * PAGE_SIZE;
670140723Sjeff	int vfslocked;
67132286Sdyson
67210556Sdyson	vp = object->handle;
673116279Salc	VM_OBJECT_UNLOCK(object);
674140723Sjeff	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
67534403Smsmith	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
67676827Salfred	KASSERT(rtval != EOPNOTSUPP,
67776827Salfred	    ("vnode_pager: FS getpages not implemented\n"));
678140723Sjeff	VFS_UNLOCK_GIANT(vfslocked);
679116279Salc	VM_OBJECT_LOCK(object);
68033847Smsmith	return rtval;
68110556Sdyson}
68210556Sdyson
68333847Smsmith/*
68433847Smsmith * This is now called from local media FS's to operate against their
68533847Smsmith * own vnodes if they fail to implement VOP_GETPAGES.
68633847Smsmith */
68733847Smsmithint
68833847Smsmithvnode_pager_generic_getpages(vp, m, bytecount, reqpage)
68933847Smsmith	struct vnode *vp;
69010556Sdyson	vm_page_t *m;
69133847Smsmith	int bytecount;
69210556Sdyson	int reqpage;
69310556Sdyson{
69433847Smsmith	vm_object_t object;
69512767Sdyson	vm_offset_t kva;
69634206Sdyson	off_t foff, tfoff, nextoff;
697100832Salc	int i, j, size, bsize, first, firstaddr;
698137726Sphk	struct bufobj *bo;
6996151Sdg	int runpg;
7006151Sdg	int runend;
7017178Sdg	struct buf *bp;
70233847Smsmith	int count;
7035455Sdg	int error = 0;
7041549Srgrimes
70533847Smsmith	object = vp->v_object;
70633847Smsmith	count = bytecount / PAGE_SIZE;
70733847Smsmith
708137726Sphk	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
709137726Sphk	    ("vnode_pager_generic_getpages does not support devices"));
71011701Sdyson	if (vp->v_mount == NULL)
71111701Sdyson		return VM_PAGER_BAD;
71211701Sdyson
7131549Srgrimes	bsize = vp->v_mount->mnt_stat.f_iosize;
7141549Srgrimes
7151549Srgrimes	/* get the UNDERLYING device for the file with VOP_BMAP() */
7161827Sdg
7171549Srgrimes	/*
7181827Sdg	 * originally, we did not check for an error return value -- assuming
7191827Sdg	 * an fs always has a bmap entry point -- that assumption is wrong!!!
7201549Srgrimes	 */
72112767Sdyson	foff = IDX_TO_OFF(m[reqpage]->pindex);
7221827Sdg
7231549Srgrimes	/*
7241887Sdg	 * if we can't bmap, use old VOP code
7251549Srgrimes	 */
726137726Sphk	if (VOP_BMAP(vp, 0, &bo, 0, NULL, NULL)) {
727116512Salc		VM_OBJECT_LOCK(object);
728100832Salc		vm_page_lock_queues();
729100832Salc		for (i = 0; i < count; i++)
730100832Salc			if (i != reqpage)
73175692Salfred				vm_page_free(m[i]);
732100832Salc		vm_page_unlock_queues();
7333612Sdg		cnt.v_vnodein++;
7343612Sdg		cnt.v_vnodepgsin++;
735121495Salc		error = vnode_pager_input_old(object, m[reqpage]);
736121495Salc		VM_OBJECT_UNLOCK(object);
737121495Salc		return (error);
7381549Srgrimes
7391827Sdg		/*
7401827Sdg		 * if the blocksize is smaller than a page size, then use
7411827Sdg		 * special small filesystem code.  NFS sometimes has a small
7421827Sdg		 * blocksize, but it can handle large reads itself.
7431827Sdg		 */
7441827Sdg	} else if ((PAGE_SIZE / bsize) > 1 &&
74538866Sbde	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
746116512Salc		VM_OBJECT_LOCK(object);
747100832Salc		vm_page_lock_queues();
748100832Salc		for (i = 0; i < count; i++)
749100832Salc			if (i != reqpage)
75075692Salfred				vm_page_free(m[i]);
751100832Salc		vm_page_unlock_queues();
752116512Salc		VM_OBJECT_UNLOCK(object);
7533612Sdg		cnt.v_vnodein++;
7543612Sdg		cnt.v_vnodepgsin++;
7559507Sdg		return vnode_pager_input_smlfs(object, m[reqpage]);
7561549Srgrimes	}
75745347Sjulian
7581549Srgrimes	/*
75945347Sjulian	 * If we have a completely valid page available to us, we can
76045347Sjulian	 * clean up and return.  Otherwise we have to re-read the
76145347Sjulian	 * media.
7621549Srgrimes	 */
763121227Salc	VM_OBJECT_LOCK(object);
76445347Sjulian	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
765100832Salc		vm_page_lock_queues();
766100832Salc		for (i = 0; i < count; i++)
7675455Sdg			if (i != reqpage)
76875692Salfred				vm_page_free(m[i]);
769100832Salc		vm_page_unlock_queues();
770116512Salc		VM_OBJECT_UNLOCK(object);
7715455Sdg		return VM_PAGER_OK;
7721549Srgrimes	}
77345347Sjulian	m[reqpage]->valid = 0;
774121227Salc	VM_OBJECT_UNLOCK(object);
7757178Sdg
7765455Sdg	/*
7775455Sdg	 * here on direct device I/O
7785455Sdg	 */
77992029Seivind	firstaddr = -1;
7801549Srgrimes
7811549Srgrimes	/*
7826151Sdg	 * calculate the run that includes the required page
7831549Srgrimes	 */
78492029Seivind	for (first = 0, i = 0; i < count; i = runend) {
78512767Sdyson		firstaddr = vnode_pager_addr(vp,
78612767Sdyson			IDX_TO_OFF(m[i]->pindex), &runpg);
7876151Sdg		if (firstaddr == -1) {
788116512Salc			VM_OBJECT_LOCK(object);
7899507Sdg			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
790106603Smux				panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
791106603Smux				    firstaddr, (uintmax_t)(foff >> 32),
792106603Smux				    (uintmax_t)foff,
793106603Smux				    (uintmax_t)
794106603Smux				    (object->un_pager.vnp.vnp_size >> 32),
795106603Smux				    (uintmax_t)object->un_pager.vnp.vnp_size);
7966151Sdg			}
797100832Salc			vm_page_lock_queues();
79875692Salfred			vm_page_free(m[i]);
799100832Salc			vm_page_unlock_queues();
800116512Salc			VM_OBJECT_UNLOCK(object);
8016151Sdg			runend = i + 1;
8026151Sdg			first = runend;
8036151Sdg			continue;
8041549Srgrimes		}
8056151Sdg		runend = i + runpg;
8069507Sdg		if (runend <= reqpage) {
807116512Salc			VM_OBJECT_LOCK(object);
808100832Salc			vm_page_lock_queues();
809100832Salc			for (j = i; j < runend; j++)
81075692Salfred				vm_page_free(m[j]);
811100832Salc			vm_page_unlock_queues();
812116512Salc			VM_OBJECT_UNLOCK(object);
8131549Srgrimes		} else {
8149507Sdg			if (runpg < (count - first)) {
815116512Salc				VM_OBJECT_LOCK(object);
816100832Salc				vm_page_lock_queues();
8179507Sdg				for (i = first + runpg; i < count; i++)
81875692Salfred					vm_page_free(m[i]);
819100832Salc				vm_page_unlock_queues();
820116512Salc				VM_OBJECT_UNLOCK(object);
8216151Sdg				count = first + runpg;
8226151Sdg			}
8236151Sdg			break;
8241549Srgrimes		}
8256151Sdg		first = runend;
8261549Srgrimes	}
8271549Srgrimes
8281549Srgrimes	/*
8291827Sdg	 * the first and last page have been calculated now, move input pages
8301827Sdg	 * to be zero based...
8311549Srgrimes	 */
8321549Srgrimes	if (first != 0) {
8331549Srgrimes		for (i = first; i < count; i++) {
8341549Srgrimes			m[i - first] = m[i];
8351549Srgrimes		}
8361549Srgrimes		count -= first;
8371549Srgrimes		reqpage -= first;
8381549Srgrimes	}
8396151Sdg
8401549Srgrimes	/*
8411549Srgrimes	 * calculate the file virtual address for the transfer
8421549Srgrimes	 */
84312767Sdyson	foff = IDX_TO_OFF(m[0]->pindex);
8441827Sdg
8451549Srgrimes	/*
8461549Srgrimes	 * calculate the size of the transfer
8471549Srgrimes	 */
8481549Srgrimes	size = count * PAGE_SIZE;
849134892Sphk	KASSERT(count > 0, ("zero count"));
8509507Sdg	if ((foff + size) > object->un_pager.vnp.vnp_size)
8519507Sdg		size = object->un_pager.vnp.vnp_size - foff;
852134892Sphk	KASSERT(size > 0, ("zero size"));
8531549Srgrimes
8541549Srgrimes	/*
85551340Sdillon	 * round up physical size for real devices.
8561549Srgrimes	 */
857137726Sphk	if (1) {
858137726Sphk		int secmask = bo->bo_bsize - 1;
859136977Sphk		KASSERT(secmask < PAGE_SIZE && secmask > 0,
860136977Sphk		    ("vnode_pager_generic_getpages: sector size %d too large",
861136977Sphk		    secmask + 1));
86251340Sdillon		size = (size + secmask) & ~secmask;
86351340Sdillon	}
8641549Srgrimes
86542957Sdillon	bp = getpbuf(&vnode_pbuf_freecnt);
8665455Sdg	kva = (vm_offset_t) bp->b_data;
8671887Sdg
8681549Srgrimes	/*
8691549Srgrimes	 * and map the pages to be read into the kva
8701549Srgrimes	 */
8711887Sdg	pmap_qenter(kva, m, count);
8721549Srgrimes
8731549Srgrimes	/* build a minimal buffer header */
87458345Sphk	bp->b_iocmd = BIO_READ;
875119092Sphk	bp->b_iodone = bdone;
87684827Sjhb	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
87784827Sjhb	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
87891406Sjhb	bp->b_rcred = crhold(curthread->td_ucred);
87991406Sjhb	bp->b_wcred = crhold(curthread->td_ucred);
8806626Sdg	bp->b_blkno = firstaddr;
881137726Sphk	pbgetbo(bo, bp);
8821549Srgrimes	bp->b_bcount = size;
8831549Srgrimes	bp->b_bufsize = size;
88470374Sdillon	bp->b_runningbufspace = bp->b_bufsize;
88570374Sdillon	runningbufspace += bp->b_runningbufspace;
8861549Srgrimes
8873612Sdg	cnt.v_vnodein++;
8883612Sdg	cnt.v_vnodepgsin += count;
8893612Sdg
8901549Srgrimes	/* do the input */
891121205Sphk	bp->b_iooffset = dbtob(bp->b_blkno);
892136927Sphk	bstrategy(bp);
8933612Sdg
894119092Sphk	bwait(bp, PVM, "vnread");
8951549Srgrimes
89658934Sphk	if ((bp->b_ioflags & BIO_ERROR) != 0)
8971549Srgrimes		error = EIO;
8981549Srgrimes
8991549Srgrimes	if (!error) {
9001549Srgrimes		if (size != count * PAGE_SIZE)
9011827Sdg			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
9021549Srgrimes	}
9035455Sdg	pmap_qremove(kva, count);
9041549Srgrimes
9051549Srgrimes	/*
9061549Srgrimes	 * free the buffer header back to the swap buffer pool
9071549Srgrimes	 */
908137726Sphk	pbrelbo(bp);
90942957Sdillon	relpbuf(bp, &vnode_pbuf_freecnt);
9101549Srgrimes
911116512Salc	VM_OBJECT_LOCK(object);
912100736Salc	vm_page_lock_queues();
91334206Sdyson	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
91434206Sdyson		vm_page_t mt;
91534206Sdyson
91634206Sdyson		nextoff = tfoff + PAGE_SIZE;
91734206Sdyson		mt = m[i];
91834206Sdyson
91947239Sdt		if (nextoff <= object->un_pager.vnp.vnp_size) {
92045347Sjulian			/*
92145347Sjulian			 * Read filled up entire page.
92245347Sjulian			 */
92334206Sdyson			mt->valid = VM_PAGE_BITS_ALL;
92449945Salc			vm_page_undirty(mt);	/* should be an assert? XXX */
92560755Speter			pmap_clear_modify(mt);
92634206Sdyson		} else {
92745347Sjulian			/*
92845347Sjulian			 * Read did not fill up entire page.  Since this
92945347Sjulian			 * is getpages, the page may be mapped, so we have
93045347Sjulian			 * to zero the invalid portions of the page even
93145347Sjulian			 * though we aren't setting them valid.
93245347Sjulian			 *
93345347Sjulian			 * Currently we do not set the entire page valid,
93445347Sjulian			 * we just try to clear the piece that we couldn't
93545347Sjulian			 * read.
93645347Sjulian			 */
93747239Sdt			vm_page_set_validclean(mt, 0,
93847239Sdt			    object->un_pager.vnp.vnp_size - tfoff);
93946349Salc			/* handled by vm_fault now */
94046349Salc			/* vm_page_zero_invalid(mt, FALSE); */
94134206Sdyson		}
94234206Sdyson
9431549Srgrimes		if (i != reqpage) {
9441827Sdg
9451549Srgrimes			/*
9461827Sdg			 * whether or not to leave the page activated is up in
9471827Sdg			 * the air, but we should put the page on a page queue
9481827Sdg			 * somewhere. (it already is in the object). Result:
94958634Scharnier			 * It appears that empirical results show that
9501827Sdg			 * deactivating pages is best.
9511549Srgrimes			 */
9521827Sdg
9531549Srgrimes			/*
9541827Sdg			 * just in case someone was asking for this page we
9551827Sdg			 * now tell them that it is ok to use
9561549Srgrimes			 */
9571549Srgrimes			if (!error) {
95834206Sdyson				if (mt->flags & PG_WANTED)
95934206Sdyson					vm_page_activate(mt);
96033109Sdyson				else
96134206Sdyson					vm_page_deactivate(mt);
96238799Sdfr				vm_page_wakeup(mt);
9631549Srgrimes			} else {
96475692Salfred				vm_page_free(mt);
9651549Srgrimes			}
9661549Srgrimes		}
9671549Srgrimes	}
968100736Salc	vm_page_unlock_queues();
969116512Salc	VM_OBJECT_UNLOCK(object);
9701549Srgrimes	if (error) {
9719507Sdg		printf("vnode_pager_getpages: I/O read error\n");
9721549Srgrimes	}
9734207Sdg	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
9741549Srgrimes}
9751549Srgrimes
97633847Smsmith/*
97733847Smsmith * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
97833847Smsmith * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to
97933847Smsmith * vnode_pager_generic_putpages() to implement the previous behaviour.
98033847Smsmith *
98133847Smsmith * All other FS's should use the bypass to get to the local media
98233847Smsmith * backing vp's VOP_PUTPAGES.
98333847Smsmith */
98443129Sdillonstatic void
98510556Sdysonvnode_pager_putpages(object, m, count, sync, rtvals)
98610556Sdyson	vm_object_t object;
98710556Sdyson	vm_page_t *m;
98810556Sdyson	int count;
98910556Sdyson	boolean_t sync;
99010556Sdyson	int *rtvals;
99110556Sdyson{
99210556Sdyson	int rtval;
99310556Sdyson	struct vnode *vp;
99462976Smckusick	struct mount *mp;
99534403Smsmith	int bytes = count * PAGE_SIZE;
99618973Sdyson
99744321Salc	/*
99844321Salc	 * Force synchronous operation if we are extremely low on memory
99944321Salc	 * to prevent a low-memory deadlock.  VOP operations often need to
100044321Salc	 * allocate more memory to initiate the I/O ( i.e. do a BMAP
100144321Salc	 * operation ).  The swapper handles the case by limiting the amount
100244321Salc	 * of asynchronous I/O, but that sort of solution doesn't scale well
100344321Salc	 * for the vnode pager without a lot of work.
100444321Salc	 *
100544321Salc	 * Also, the backing vnode's iodone routine may not wake the pageout
100644321Salc	 * daemon up.  This should be probably be addressed XXX.
100744321Salc	 */
100844321Salc
100944321Salc	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
101044321Salc		sync |= OBJPC_SYNC;
101144321Salc
101244321Salc	/*
101344321Salc	 * Call device-specific putpages function
101444321Salc	 */
101510556Sdyson	vp = object->handle;
1016121455Salc	VM_OBJECT_UNLOCK(object);
101762976Smckusick	if (vp->v_type != VREG)
101862976Smckusick		mp = NULL;
101962976Smckusick	(void)vn_start_write(vp, &mp, V_WAIT);
102034403Smsmith	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
102176827Salfred	KASSERT(rtval != EOPNOTSUPP,
102276827Salfred	    ("vnode_pager: stale FS putpages\n"));
102362976Smckusick	vn_finished_write(mp);
1024121455Salc	VM_OBJECT_LOCK(object);
102510556Sdyson}
102610556Sdyson
102733847Smsmith
10281549Srgrimes/*
102933847Smsmith * This is now called from local media FS's to operate against their
103045057Seivind * own vnodes if they fail to implement VOP_PUTPAGES.
103170374Sdillon *
103270374Sdillon * This is typically called indirectly via the pageout daemon and
103370374Sdillon * clustering has already typically occured, so in general we ask the
103470374Sdillon * underlying filesystem to write the data out asynchronously rather
103570374Sdillon * then delayed.
10361549Srgrimes */
103733847Smsmithint
103834206Sdysonvnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
103933847Smsmith	struct vnode *vp;
10401549Srgrimes	vm_page_t *m;
104133847Smsmith	int bytecount;
104234206Sdyson	int flags;
10435455Sdg	int *rtvals;
10441549Srgrimes{
10457695Sdg	int i;
104633847Smsmith	vm_object_t object;
104733847Smsmith	int count;
10481549Srgrimes
10497695Sdg	int maxsize, ncount;
105012767Sdyson	vm_ooffset_t poffset;
10517695Sdg	struct uio auio;
10527695Sdg	struct iovec aiov;
10537695Sdg	int error;
105434206Sdyson	int ioflags;
10551549Srgrimes
105633847Smsmith	object = vp->v_object;
105733847Smsmith	count = bytecount / PAGE_SIZE;
105833847Smsmith
10591827Sdg	for (i = 0; i < count; i++)
10601549Srgrimes		rtvals[i] = VM_PAGER_AGAIN;
10611549Srgrimes
1062138406Salc	if ((int64_t)m[0]->pindex < 0) {
1063119544Smarcel		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
1064119544Smarcel			(long)m[0]->pindex, (u_long)m[0]->dirty);
10657695Sdg		rtvals[0] = VM_PAGER_BAD;
10667695Sdg		return VM_PAGER_BAD;
10675455Sdg	}
10687178Sdg
10697695Sdg	maxsize = count * PAGE_SIZE;
10707695Sdg	ncount = count;
10711549Srgrimes
107212767Sdyson	poffset = IDX_TO_OFF(m[0]->pindex);
107384854Sdillon
107484854Sdillon	/*
107584854Sdillon	 * If the page-aligned write is larger then the actual file we
107684854Sdillon	 * have to invalidate pages occuring beyond the file EOF.  However,
107784854Sdillon	 * there is an edge case where a file may not be page-aligned where
107884854Sdillon	 * the last page is partially invalid.  In this case the filesystem
107984854Sdillon	 * may not properly clear the dirty bits for the entire page (which
108084854Sdillon	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
108184854Sdillon	 * With the page locked we are free to fix-up the dirty bits here.
108287834Sdillon	 *
108387834Sdillon	 * We do not under any circumstances truncate the valid bits, as
108487834Sdillon	 * this will screw up bogus page replacement.
108584854Sdillon	 */
108612767Sdyson	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
108784854Sdillon		if (object->un_pager.vnp.vnp_size > poffset) {
108884854Sdillon			int pgoff;
108984854Sdillon
109012767Sdyson			maxsize = object->un_pager.vnp.vnp_size - poffset;
109184854Sdillon			ncount = btoc(maxsize);
109284854Sdillon			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
1093119370Salc				vm_page_lock_queues();
109484854Sdillon				vm_page_clear_dirty(m[ncount - 1], pgoff,
109584854Sdillon					PAGE_SIZE - pgoff);
1096119370Salc				vm_page_unlock_queues();
109784854Sdillon			}
109884854Sdillon		} else {
10998585Sdg			maxsize = 0;
110084854Sdillon			ncount = 0;
110184854Sdillon		}
11028585Sdg		if (ncount < count) {
11038585Sdg			for (i = ncount; i < count; i++) {
11047695Sdg				rtvals[i] = VM_PAGER_BAD;
11051549Srgrimes			}
11061549Srgrimes		}
11071541Srgrimes	}
11087695Sdg
110970374Sdillon	/*
111070374Sdillon	 * pageouts are already clustered, use IO_ASYNC t o force a bawrite()
111170374Sdillon	 * rather then a bdwrite() to prevent paging I/O from saturating
1112108358Sdillon	 * the buffer cache.  Dummy-up the sequential heuristic to cause
1113108358Sdillon	 * large ranges to cluster.  If neither IO_SYNC or IO_ASYNC is set,
1114108358Sdillon	 * the system decides how to cluster.
111570374Sdillon	 */
111634206Sdyson	ioflags = IO_VMIO;
1117108358Sdillon	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
1118108358Sdillon		ioflags |= IO_SYNC;
1119108358Sdillon	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
1120108358Sdillon		ioflags |= IO_ASYNC;
112134206Sdyson	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
1122108358Sdillon	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
11231827Sdg
11247695Sdg	aiov.iov_base = (caddr_t) 0;
11257695Sdg	aiov.iov_len = maxsize;
11267695Sdg	auio.uio_iov = &aiov;
11277695Sdg	auio.uio_iovcnt = 1;
112812767Sdyson	auio.uio_offset = poffset;
11297695Sdg	auio.uio_segflg = UIO_NOCOPY;
11307695Sdg	auio.uio_rw = UIO_WRITE;
11317695Sdg	auio.uio_resid = maxsize;
113283366Sjulian	auio.uio_td = (struct thread *) 0;
113391406Sjhb	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
11343612Sdg	cnt.v_vnodeout++;
11357695Sdg	cnt.v_vnodepgsout += ncount;
11363612Sdg
11378585Sdg	if (error) {
11389507Sdg		printf("vnode_pager_putpages: I/O error %d\n", error);
11397695Sdg	}
11408585Sdg	if (auio.uio_resid) {
114137555Sbde		printf("vnode_pager_putpages: residual I/O %d at %lu\n",
114237555Sbde		    auio.uio_resid, (u_long)m[0]->pindex);
11437695Sdg	}
114433936Sdyson	for (i = 0; i < ncount; i++) {
114533936Sdyson		rtvals[i] = VM_PAGER_OK;
11467695Sdg	}
11477695Sdg	return rtvals[0];
11487695Sdg}
11491549Srgrimes
11507695Sdgstruct vnode *
1151120183Salcvnode_pager_lock(vm_object_t first_object)
11529507Sdg{
1153120183Salc	struct vnode *vp;
1154120183Salc	vm_object_t backing_object, object;
115522521Sdyson
1156120183Salc	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
1157120183Salc	for (object = first_object; object != NULL; object = backing_object) {
1158120183Salc		if (object->type != OBJT_VNODE) {
1159120183Salc			if ((backing_object = object->backing_object) != NULL)
1160120183Salc				VM_OBJECT_LOCK(backing_object);
1161120183Salc			if (object != first_object)
1162120183Salc				VM_OBJECT_UNLOCK(object);
11637695Sdg			continue;
1164120183Salc		}
1165120183Salc	retry:
116677094Sjhb		if (object->flags & OBJ_DEAD) {
1167120183Salc			if (object != first_object)
1168120183Salc				VM_OBJECT_UNLOCK(object);
116932585Sdyson			return NULL;
117077094Sjhb		}
1171120183Salc		vp = object->handle;
1172120183Salc		VI_LOCK(vp);
1173120183Salc		VM_OBJECT_UNLOCK(object);
1174120183Salc		if (first_object != object)
1175120183Salc			VM_OBJECT_UNLOCK(first_object);
1176120183Salc		if (vget(vp, LK_CANRECURSE | LK_INTERLOCK | LK_NOPAUSE |
1177120183Salc		    LK_RETRY | LK_SHARED, curthread)) {
1178120183Salc			VM_OBJECT_LOCK(first_object);
1179120183Salc			if (object != first_object)
1180120183Salc				VM_OBJECT_LOCK(object);
1181120183Salc			if (object->type != OBJT_VNODE) {
1182120183Salc				if (object != first_object)
1183120183Salc					VM_OBJECT_UNLOCK(object);
118434611Sdyson				return NULL;
1185120183Salc			}
118632585Sdyson			printf("vnode_pager_lock: retrying\n");
1187120183Salc			goto retry;
118832585Sdyson		}
1189120183Salc		VM_OBJECT_LOCK(first_object);
1190120183Salc		return (vp);
11911549Srgrimes	}
11929507Sdg	return NULL;
11937695Sdg}
1194