vnode_pager.c revision 45347
11541Srgrimes/*
21541Srgrimes * Copyright (c) 1990 University of Utah.
31549Srgrimes * Copyright (c) 1991 The Regents of the University of California.
41549Srgrimes * All rights reserved.
59507Sdg * Copyright (c) 1993, 1994 John S. Dyson
69507Sdg * Copyright (c) 1995, David Greenman
71541Srgrimes *
81541Srgrimes * This code is derived from software contributed to Berkeley by
91541Srgrimes * the Systems Programming Group of the University of Utah Computer
101541Srgrimes * Science Department.
111541Srgrimes *
121541Srgrimes * Redistribution and use in source and binary forms, with or without
131541Srgrimes * modification, are permitted provided that the following conditions
141541Srgrimes * are met:
151541Srgrimes * 1. Redistributions of source code must retain the above copyright
161541Srgrimes *    notice, this list of conditions and the following disclaimer.
171541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
181541Srgrimes *    notice, this list of conditions and the following disclaimer in the
191541Srgrimes *    documentation and/or other materials provided with the distribution.
201541Srgrimes * 3. All advertising materials mentioning features or use of this software
211541Srgrimes *    must display the following acknowledgement:
221541Srgrimes *	This product includes software developed by the University of
231541Srgrimes *	California, Berkeley and its contributors.
241541Srgrimes * 4. Neither the name of the University nor the names of its contributors
251541Srgrimes *    may be used to endorse or promote products derived from this software
261541Srgrimes *    without specific prior written permission.
271541Srgrimes *
281541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
291541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
301541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
311541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
321541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
331541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
341541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
351541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
361541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
371541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
381541Srgrimes * SUCH DAMAGE.
391541Srgrimes *
401549Srgrimes *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
4145347Sjulian *	$Id: vnode_pager.c,v 1.105 1999/03/27 02:39:01 eivind Exp $
421541Srgrimes */
431541Srgrimes
441541Srgrimes/*
451541Srgrimes * Page to/from files (vnodes).
461541Srgrimes */
471541Srgrimes
481549Srgrimes/*
491549Srgrimes * TODO:
509507Sdg *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
517695Sdg *	greatly re-simplify the vnode_pager.
521549Srgrimes */
531549Srgrimes
541541Srgrimes#include <sys/param.h>
551541Srgrimes#include <sys/systm.h>
561541Srgrimes#include <sys/proc.h>
571541Srgrimes#include <sys/vnode.h>
581541Srgrimes#include <sys/mount.h>
599507Sdg#include <sys/buf.h>
6012662Sdg#include <sys/vmmeter.h>
611541Srgrimes
621541Srgrimes#include <vm/vm.h>
6312662Sdg#include <vm/vm_prot.h>
6412662Sdg#include <vm/vm_object.h>
651541Srgrimes#include <vm/vm_page.h>
669507Sdg#include <vm/vm_pager.h>
6731853Sdyson#include <vm/vm_map.h>
681541Srgrimes#include <vm/vnode_pager.h>
6912662Sdg#include <vm/vm_extern.h>
701541Srgrimes
7112820Sphkstatic vm_offset_t vnode_pager_addr __P((struct vnode *vp, vm_ooffset_t address,
7211943Sbde					 int *run));
7312820Sphkstatic void vnode_pager_iodone __P((struct buf *bp));
7412820Sphkstatic int vnode_pager_input_smlfs __P((vm_object_t object, vm_page_t m));
7512820Sphkstatic int vnode_pager_input_old __P((vm_object_t object, vm_page_t m));
7612820Sphkstatic void vnode_pager_dealloc __P((vm_object_t));
7712820Sphkstatic int vnode_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
7843129Sdillonstatic void vnode_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
7912820Sphkstatic boolean_t vnode_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *));
8011943Sbde
/*
 * Pager operations vector for vnode-backed (OBJT_VNODE) VM objects.
 * Slot order follows struct pagerops; NULL slots are operations this
 * pager does not provide.  (Slot names are declared in <vm/vm_pager.h>
 * -- NOTE(review): confirm the ordering against that header.)
 */
811541Srgrimesstruct pagerops vnodepagerops = {
	829507Sdg	NULL,
	831541Srgrimes	vnode_pager_alloc,
	841541Srgrimes	vnode_pager_dealloc,
	859507Sdg	vnode_pager_getpages,
	869507Sdg	vnode_pager_putpages,
	879507Sdg	vnode_pager_haspage,
	889507Sdg	NULL
891541Srgrimes};
901541Srgrimes
/*
 * Number of physical buffers (pbufs) the vnode pager may hold at once.
 * -1 means "not yet initialized"; it is set lazily to nswbuf / 2 + 1
 * on the first call to vnode_pager_alloc() (see the XXX hack there).
 */
9142957Sdillonint vnode_pbuf_freecnt = -1;	/* start out unlimited */
9210556Sdyson
9342957Sdillon
941541Srgrimes/*
951541Srgrimes * Allocate (or lookup) pager for a vnode.
961541Srgrimes * Handle is a vnode pointer.
971541Srgrimes */
989507Sdgvm_object_t
9940286Sdgvnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
10028751Sbde		  vm_ooffset_t offset)
1011541Srgrimes{
1029456Sdg	vm_object_t object;
1031541Srgrimes	struct vnode *vp;
1041541Srgrimes
1051541Srgrimes	/*
1061541Srgrimes	 * Pageout to vnode, no can do yet.
1071541Srgrimes	 */
1081541Srgrimes	if (handle == NULL)
1091827Sdg		return (NULL);
1101541Srgrimes
11142957Sdillon	/*
11242957Sdillon	 * XXX hack - This initialization should be put somewhere else.
11342957Sdillon	 */
11442957Sdillon	if (vnode_pbuf_freecnt < 0) {
11542957Sdillon	    vnode_pbuf_freecnt = nswbuf / 2 + 1;
11642957Sdillon	}
11742957Sdillon
1189411Sdg	vp = (struct vnode *) handle;
1199411Sdg
1201541Srgrimes	/*
1219411Sdg	 * Prevent race condition when allocating the object. This
1229411Sdg	 * can happen with NFS vnodes since the nfsnode isn't locked.
1231541Srgrimes	 */
	/*
	 * Hand-rolled sleep lock: VOLOCK is the lock bit, VOWANT marks
	 * that someone is sleeping on it in the loop below.
	 */
1249411Sdg	while (vp->v_flag & VOLOCK) {
1259411Sdg		vp->v_flag |= VOWANT;
1269411Sdg		tsleep(vp, PVM, "vnpobj", 0);
1279411Sdg	}
1289411Sdg	vp->v_flag |= VOLOCK;
1299411Sdg
1309411Sdg	/*
1319411Sdg	 * If the object is being terminated, wait for it to
1329411Sdg	 * go away.
1339411Sdg	 */
13413490Sdyson	while (((object = vp->v_object) != NULL) &&
13513490Sdyson		(object->flags & OBJ_DEAD)) {
1369356Sdg		tsleep(object, PVM, "vadead", 0);
1379507Sdg	}
1385455Sdg
	/* The caller must already hold a use reference on the vnode. */
13932071Sdyson	if (vp->v_usecount == 0)
14032071Sdyson		panic("vnode_pager_alloc: no vnode reference");
14132071Sdyson
1429507Sdg	if (object == NULL) {
1431541Srgrimes		/*
1441541Srgrimes		 * And an object of the appropriate size
1451541Srgrimes		 */
14640286Sdg		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
14732286Sdyson		object->flags = 0;
1481827Sdg
14940286Sdg		object->un_pager.vnp.vnp_size = size;
1501549Srgrimes
1519507Sdg		object->handle = handle;
1529507Sdg		vp->v_object = object;
15332286Sdyson		vp->v_usecount++;	/* the new object holds a vnode reference */
1541541Srgrimes	} else {
15532286Sdyson		object->ref_count++;
15632286Sdyson		vp->v_usecount++;	/* one vnode reference per object reference */
1571541Srgrimes	}
1589411Sdg
	/* Drop the allocation lock and wake anyone queued on it. */
1599411Sdg	vp->v_flag &= ~VOLOCK;
1609411Sdg	if (vp->v_flag & VOWANT) {
1619411Sdg		vp->v_flag &= ~VOWANT;
1629411Sdg		wakeup(vp);
1639411Sdg	}
1649507Sdg	return (object);
1651541Srgrimes}
1661541Srgrimes
/*
 * Tear down the pager<->vnode association when the VM object is being
 * destroyed: wait for in-flight paging I/O to drain, then sever the
 * cross pointers and mark the object dead.  The object itself is not
 * freed here.
 */
16712820Sphkstatic void
1689507Sdgvnode_pager_dealloc(object)
1699507Sdg	vm_object_t object;
1701541Srgrimes{
1719507Sdg	register struct vnode *vp = object->handle;
1721541Srgrimes
1739507Sdg	if (vp == NULL)
1749507Sdg		panic("vnode_pager_dealloc: pager already dealloced");
1759507Sdg
	/* Block until all paging-in-progress operations complete. */
17633817Sdyson	vm_object_pip_wait(object, "vnpdea");
1771541Srgrimes
1789507Sdg	object->handle = NULL;
17933109Sdyson	object->type = OBJT_DEAD;
1809507Sdg	vp->v_object = NULL;
18133109Sdyson	vp->v_flag &= ~(VTEXT | VOBJBUF);
1821549Srgrimes}
1831541Srgrimes
/*
 * Report whether the page at "pindex" has backing store in the file,
 * by asking the filesystem via VOP_BMAP().  When "before"/"after" are
 * non-NULL they are filled with the number of contiguously backed
 * pages preceding/following pindex, converted from the block-run
 * counts that VOP_BMAP() reports (handling both blocks larger and
 * smaller than a page).
 */
18412820Sphkstatic boolean_t
18512767Sdysonvnode_pager_haspage(object, pindex, before, after)
1869507Sdg	vm_object_t object;
18712767Sdyson	vm_pindex_t pindex;
1889507Sdg	int *before;
1899507Sdg	int *after;
1901541Srgrimes{
1919507Sdg	struct vnode *vp = object->handle;
1921541Srgrimes	daddr_t bn;
19312423Sphk	int err;
19410556Sdyson	daddr_t reqblock;
19511701Sdyson	int poff;
19611701Sdyson	int bsize;
19712914Sdyson	int pagesperblock, blocksperpage;
1981541Srgrimes
19932585Sdyson	if ((vp == NULL) || (vp->v_flag & VDOOMED))
20032585Sdyson		return FALSE;
20132585Sdyson
2021541Srgrimes	/*
2035455Sdg	 * If filesystem no longer mounted or offset beyond end of file we do
2045455Sdg	 * not have the page.
2051541Srgrimes	 */
20612767Sdyson	if ((vp->v_mount == NULL) ||
20712767Sdyson		(IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
2084797Sdg		return FALSE;
2091541Srgrimes
21011576Sdg	bsize = vp->v_mount->mnt_stat.f_iosize;
21110556Sdyson	pagesperblock = bsize / PAGE_SIZE;
21212914Sdyson	blocksperpage = 0;
21312914Sdyson	if (pagesperblock > 0) {
21412914Sdyson		reqblock = pindex / pagesperblock;
21512914Sdyson	} else {
21612914Sdyson		blocksperpage = (PAGE_SIZE / bsize);
21712914Sdyson		reqblock = pindex * blocksperpage;
21812914Sdyson	}
21910556Sdyson	err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn,
22010556Sdyson		after, before);
	/*
	 * On a bmap error, claim the page exists; presumably the actual
	 * paging I/O will then surface the real error.  NOTE(review):
	 * confirm this is the intended policy.
	 */
2218876Srgrimes	if (err)
2229507Sdg		return TRUE;
	/* bn == -1 denotes a hole: no backing store for this range. */
22310702Sdyson	if ( bn == -1)
22410576Sdyson		return FALSE;
22512914Sdyson	if (pagesperblock > 0) {
		/* Blocks are >= a page: scale block runs up to page runs. */
22612914Sdyson		poff = pindex - (reqblock * pagesperblock);
22712914Sdyson		if (before) {
22812914Sdyson			*before *= pagesperblock;
22912914Sdyson			*before += poff;
23010669Sdyson		}
23112914Sdyson		if (after) {
23212914Sdyson			int numafter;
23312914Sdyson			*after *= pagesperblock;
23412914Sdyson			numafter = pagesperblock - (poff + 1);
			/* Clip the forward run at end-of-file. */
23512914Sdyson			if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) {
23612914Sdyson				numafter = OFF_TO_IDX((object->un_pager.vnp.vnp_size - IDX_TO_OFF(pindex)));
23712914Sdyson			}
23812914Sdyson			*after += numafter;
23912914Sdyson		}
24012914Sdyson	} else {
		/* Blocks are smaller than a page: scale block runs down. */
24112914Sdyson		if (before) {
24212914Sdyson			*before /= blocksperpage;
24312914Sdyson		}
24412914Sdyson
24512914Sdyson		if (after) {
24612914Sdyson			*after /= blocksperpage;
24712914Sdyson		}
24812914Sdyson	}
24910576Sdyson	return TRUE;
2501541Srgrimes}
2511541Srgrimes
2521541Srgrimes/*
2531541Srgrimes * Lets the VM system know about a change in size for a file.
2549507Sdg * We adjust our own internal size and flush any cached pages in
2551541Srgrimes * the associated object that are affected by the size change.
2561541Srgrimes *
2571541Srgrimes * Note: this routine may be invoked as a result of a pager put
2581541Srgrimes * operation (possibly at object termination time), so we must be careful.
2591541Srgrimes */
2601541Srgrimesvoid
2611541Srgrimesvnode_pager_setsize(vp, nsize)
2621541Srgrimes	struct vnode *vp;
26312767Sdyson	vm_ooffset_t nsize;
2641541Srgrimes{
26538542Sluoqi	vm_pindex_t nobjsize;
2669507Sdg	vm_object_t object = vp->v_object;
2671541Srgrimes
2689507Sdg	if (object == NULL)
2691541Srgrimes		return;
2701827Sdg
2711541Srgrimes	/*
2721541Srgrimes	 * Hasn't changed size
2731541Srgrimes	 */
2749507Sdg	if (nsize == object->un_pager.vnp.vnp_size)
2753374Sdg		return;
2761827Sdg
	/* New object size in pages, rounding the last partial page up. */
27738542Sluoqi	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
27838542Sluoqi
2791541Srgrimes	/*
2801827Sdg	 * File has shrunk. Toss any cached pages beyond the new EOF.
2811541Srgrimes	 */
2829507Sdg	if (nsize < object->un_pager.vnp.vnp_size) {
		/*
		 * Presumably invalidates copy-on-write optimizations
		 * covering the dying range -- NOTE(review): verify
		 * vm_freeze_copyopts() semantics.
		 */
28338542Sluoqi		vm_freeze_copyopts(object, OFF_TO_IDX(nsize), object->size);
28438542Sluoqi		if (nobjsize < object->size) {
28538542Sluoqi			vm_object_page_remove(object, nobjsize, object->size,
28638542Sluoqi				FALSE);
2875455Sdg		}
2881827Sdg		/*
2891827Sdg		 * this gets rid of garbage at the end of a page that is now
2901827Sdg		 * only partially backed by the vnode...
2911827Sdg		 */
2921827Sdg		if (nsize & PAGE_MASK) {
2931827Sdg			vm_offset_t kva;
2941827Sdg			vm_page_t m;
2951827Sdg
29612767Sdyson			m = vm_page_lookup(object, OFF_TO_IDX(nsize));
2971827Sdg			if (m) {
				/* Zero from the new EOF to the page boundary. */
2981827Sdg				kva = vm_pager_map_page(m);
2991827Sdg				bzero((caddr_t) kva + (nsize & PAGE_MASK),
30012767Sdyson				    (int) (round_page(nsize) - nsize));
3011827Sdg				vm_pager_unmap_page(kva);
3021827Sdg			}
3031827Sdg		}
3041541Srgrimes	}
	/* Record the new sizes only after stale pages are gone. */
30512767Sdyson	object->un_pager.vnp.vnp_size = nsize;
30638542Sluoqi	object->size = nobjsize;
3071541Srgrimes}
3081541Srgrimes
3091541Srgrimesvoid
3101549Srgrimesvnode_pager_freepage(m)
3111549Srgrimes	vm_page_t m;
3121541Srgrimes{
3131549Srgrimes	vm_page_free(m);
3141549Srgrimes}
3151549Srgrimes
3161549Srgrimes/*
3171549Srgrimes * calculate the linear (byte) disk address of specified virtual
3181549Srgrimes * file address
3191549Srgrimes */
32012820Sphkstatic vm_offset_t
3216151Sdgvnode_pager_addr(vp, address, run)
3221549Srgrimes	struct vnode *vp;
32312767Sdyson	vm_ooffset_t address;
3246151Sdg	int *run;
3251549Srgrimes{
3265455Sdg	int rtaddress;
3275455Sdg	int bsize;
32812767Sdyson	daddr_t block;
3291549Srgrimes	struct vnode *rtvp;
3305455Sdg	int err;
33112767Sdyson	daddr_t vblock;
33212767Sdyson	int voffset;
3331549Srgrimes
3345455Sdg	if ((int) address < 0)
3355455Sdg		return -1;
3365455Sdg
33711701Sdyson	if (vp->v_mount == NULL)
33811701Sdyson		return -1;
33911701Sdyson
3401549Srgrimes	bsize = vp->v_mount->mnt_stat.f_iosize;
3411549Srgrimes	vblock = address / bsize;
3421549Srgrimes	voffset = address % bsize;
3431549Srgrimes
34410551Sdyson	err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL);
3451549Srgrimes
3466151Sdg	if (err || (block == -1))
3471549Srgrimes		rtaddress = -1;
3486151Sdg	else {
3496626Sdg		rtaddress = block + voffset / DEV_BSIZE;
3506151Sdg		if( run) {
3516151Sdg			*run += 1;
3526151Sdg			*run *= bsize/PAGE_SIZE;
3536151Sdg			*run -= voffset/PAGE_SIZE;
3546151Sdg		}
3556151Sdg	}
3561549Srgrimes
3571549Srgrimes	return rtaddress;
3581549Srgrimes}
3591549Srgrimes
3601549Srgrimes/*
3611549Srgrimes * interrupt routine for I/O completion
3621549Srgrimes */
36312820Sphkstatic void
3641549Srgrimesvnode_pager_iodone(bp)
3651549Srgrimes	struct buf *bp;
3661549Srgrimes{
3671549Srgrimes	bp->b_flags |= B_DONE;
3689507Sdg	wakeup(bp);
3691549Srgrimes}
3701549Srgrimes
3711549Srgrimes/*
3721549Srgrimes * small block file system vnode pager input
3731549Srgrimes */
/*
 * Fill page "m" synchronously, one filesystem block at a time, for
 * filesystems whose block size is smaller than a page.  Sub-ranges of
 * the page that are already valid are skipped; holes are zero-filled.
 * Returns VM_PAGER_OK, VM_PAGER_BAD (unmounted), or VM_PAGER_ERROR.
 */
37412820Sphkstatic int
3759507Sdgvnode_pager_input_smlfs(object, m)
3769507Sdg	vm_object_t object;
3771549Srgrimes	vm_page_t m;
3781549Srgrimes{
3795455Sdg	int i;
3805455Sdg	int s;
3811549Srgrimes	struct vnode *dp, *vp;
3821549Srgrimes	struct buf *bp;
3831549Srgrimes	vm_offset_t kva;
3845455Sdg	int fileaddr;
3851549Srgrimes	vm_offset_t bsize;
3865455Sdg	int error = 0;
3871549Srgrimes
3889507Sdg	vp = object->handle;
38911701Sdyson	if (vp->v_mount == NULL)
39011701Sdyson		return VM_PAGER_BAD;
39111701Sdyson
3921549Srgrimes	bsize = vp->v_mount->mnt_stat.f_iosize;
3931549Srgrimes
3947178Sdg
	/* Resolve the underlying device vnode for the I/O below. */
39510551Sdyson	VOP_BMAP(vp, 0, &dp, 0, NULL, NULL);
3961549Srgrimes
3971549Srgrimes	kva = vm_pager_map_page(m);
3981549Srgrimes
3991827Sdg	for (i = 0; i < PAGE_SIZE / bsize; i++) {
4001827Sdg
		/* Skip block-sized chunks that are already valid. */
40112767Sdyson		if ((vm_page_bits(IDX_TO_OFF(m->pindex) + i * bsize, bsize) & m->valid))
4025455Sdg			continue;
4031549Srgrimes
40412767Sdyson		fileaddr = vnode_pager_addr(vp,
40512767Sdyson			IDX_TO_OFF(m->pindex) + i * bsize, (int *)0);
4061827Sdg		if (fileaddr != -1) {
40742957Sdillon			bp = getpbuf(&vnode_pbuf_freecnt);
4081549Srgrimes
4091827Sdg			/* build a minimal buffer header */
4101549Srgrimes			bp->b_flags = B_BUSY | B_READ | B_CALL;
4111549Srgrimes			bp->b_iodone = vnode_pager_iodone;
4121549Srgrimes			bp->b_proc = curproc;
4131549Srgrimes			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
4141827Sdg			if (bp->b_rcred != NOCRED)
4151549Srgrimes				crhold(bp->b_rcred);
4161827Sdg			if (bp->b_wcred != NOCRED)
4171549Srgrimes				crhold(bp->b_wcred);
41831493Sphk			bp->b_data = (caddr_t) kva + i * bsize;
4196626Sdg			bp->b_blkno = fileaddr;
4205455Sdg			pbgetvp(dp, bp);
4211549Srgrimes			bp->b_bcount = bsize;
4221549Srgrimes			bp->b_bufsize = bsize;
4231827Sdg
4241827Sdg			/* do the input */
42537384Sjulian			VOP_STRATEGY(bp->b_vp, bp);
4261549Srgrimes
42733758Sdyson			/* we definitely need to be at splvm here */
4281549Srgrimes
			/* Wait synchronously for vnode_pager_iodone(). */
42933758Sdyson			s = splvm();
4301549Srgrimes			while ((bp->b_flags & B_DONE) == 0) {
4319356Sdg				tsleep(bp, PVM, "vnsrd", 0);
4321549Srgrimes			}
4331549Srgrimes			splx(s);
4341549Srgrimes			if ((bp->b_flags & B_ERROR) != 0)
4351549Srgrimes				error = EIO;
4361549Srgrimes
4371827Sdg			/*
4381827Sdg			 * free the buffer header back to the swap buffer pool
4391827Sdg			 */
44042957Sdillon			relpbuf(bp, &vnode_pbuf_freecnt);
4411827Sdg			if (error)
4421549Srgrimes				break;
4435455Sdg
44415583Sphk			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
4451549Srgrimes		} else {
			/* Hole: no backing block, supply zeros. */
44615583Sphk			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
4471549Srgrimes			bzero((caddr_t) kva + i * bsize, bsize);
4481549Srgrimes		}
4491549Srgrimes	}
4501549Srgrimes	vm_pager_unmap_page(kva);
45117334Sdyson	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
45238799Sdfr	vm_page_flag_clear(m, PG_ZERO);
4531827Sdg	if (error) {
4544207Sdg		return VM_PAGER_ERROR;
4551549Srgrimes	}
4561549Srgrimes	return VM_PAGER_OK;
4571549Srgrimes
4581549Srgrimes}
4591549Srgrimes
4601549Srgrimes
4611549Srgrimes/*
4621549Srgrimes * old style vnode pager input routine: read one page through VOP_READ
 * into a temporarily mapped kernel address.  Fallback used when the
 * filesystem cannot VOP_BMAP.  Returns VM_PAGER_BAD past EOF,
 * VM_PAGER_ERROR on a failed/empty read, else VM_PAGER_OK.
4631549Srgrimes */
46412820Sphkstatic int
4659507Sdgvnode_pager_input_old(object, m)
4669507Sdg	vm_object_t object;
4671549Srgrimes	vm_page_t m;
4681549Srgrimes{
4691541Srgrimes	struct uio auio;
4701541Srgrimes	struct iovec aiov;
4715455Sdg	int error;
4725455Sdg	int size;
4731549Srgrimes	vm_offset_t kva;
4741549Srgrimes
4751549Srgrimes	error = 0;
4761827Sdg
4771549Srgrimes	/*
4781549Srgrimes	 * Return failure if beyond current EOF
4791549Srgrimes	 */
48012767Sdyson	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
4811549Srgrimes		return VM_PAGER_BAD;
4821549Srgrimes	} else {
4831549Srgrimes		size = PAGE_SIZE;
		/* Clip the read to end-of-file. */
48412767Sdyson		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
48512767Sdyson			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
4867178Sdg
4871549Srgrimes		/*
4881549Srgrimes		 * Allocate a kernel virtual address and initialize so that
4891549Srgrimes		 * we can use VOP_READ/WRITE routines.
4901549Srgrimes		 */
4911549Srgrimes		kva = vm_pager_map_page(m);
4927178Sdg
4931827Sdg		aiov.iov_base = (caddr_t) kva;
4941549Srgrimes		aiov.iov_len = size;
4951549Srgrimes		auio.uio_iov = &aiov;
4961549Srgrimes		auio.uio_iovcnt = 1;
49712767Sdyson		auio.uio_offset = IDX_TO_OFF(m->pindex);
4981549Srgrimes		auio.uio_segflg = UIO_SYSSPACE;
4991549Srgrimes		auio.uio_rw = UIO_READ;
5001549Srgrimes		auio.uio_resid = size;
50141503Srvb		auio.uio_procp = curproc;
5011549Srgrimes
5029507Sdg		error = VOP_READ(object->handle, &auio, 0, curproc->p_ucred);
5031549Srgrimes		if (!error) {
5041549Srgrimes			register int count = size - auio.uio_resid;
5051549Srgrimes
5061549Srgrimes			if (count == 0)
5071549Srgrimes				error = EINVAL;
5081549Srgrimes			else if (count != PAGE_SIZE)
				/* Short read: zero the rest of the page. */
5091827Sdg				bzero((caddr_t) kva + count, PAGE_SIZE - count);
5101549Srgrimes		}
5111549Srgrimes		vm_pager_unmap_page(kva);
5121549Srgrimes	}
51317334Sdyson	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
5135455Sdg	m->dirty = 0;
51438799Sdfr	vm_page_flag_clear(m, PG_ZERO);
51539739Srvb	if (!error)
51839739Srvb		m->valid = VM_PAGE_BITS_ALL;
5194207Sdg	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
5201549Srgrimes}
5211549Srgrimes
5221549Srgrimes/*
5231549Srgrimes * generic vnode pager input routine
5241549Srgrimes */
52510556Sdyson
52633847Smsmith/*
52733847Smsmith * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
52833847Smsmith * implement their own VOP_GETPAGES, their VOP_GETPAGES should call to
52933847Smsmith * vnode_pager_generic_getpages() to implement the previous behaviour.
53033847Smsmith *
53133847Smsmith * All other FS's should use the bypass to get to the local media
53233847Smsmith * backing vp's VOP_GETPAGES.
53333847Smsmith */
/*
 * Pager get interface: hand the request to the filesystem via
 * VOP_GETPAGES, falling back to the generic implementation (with a
 * console warning) for stale filesystems that still return EOPNOTSUPP.
 */
53412820Sphkstatic int
5359507Sdgvnode_pager_getpages(object, m, count, reqpage)
5369507Sdg	vm_object_t object;
5371549Srgrimes	vm_page_t *m;
5389507Sdg	int count;
5399507Sdg	int reqpage;
5401549Srgrimes{
54110556Sdyson	int rtval;
54210556Sdyson	struct vnode *vp;
54334403Smsmith	int bytes = count * PAGE_SIZE;
54432286Sdyson
54510556Sdyson	vp = object->handle;
54633847Smsmith	/*
54733847Smsmith	 * XXX temporary diagnostic message to help track stale FS code,
54833847Smsmith	 * Returning EOPNOTSUPP from here may make things unhappy.
54933847Smsmith	 */
55034403Smsmith	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
55134403Smsmith	if (rtval == EOPNOTSUPP) {
55234403Smsmith	    printf("vnode_pager: *** WARNING *** stale FS getpages\n");
55334403Smsmith	    rtval = vnode_pager_generic_getpages( vp, m, bytes, reqpage);
55434403Smsmith	}
55533847Smsmith	return rtval;
55610556Sdyson}
55710556Sdyson
55833847Smsmith
55933847Smsmith/*
56033847Smsmith * This is now called from local media FS's to operate against their
56133847Smsmith * own vnodes if they fail to implement VOP_GETPAGES.
56233847Smsmith */
56333847Smsmithint
56433847Smsmithvnode_pager_generic_getpages(vp, m, bytecount, reqpage)
56533847Smsmith	struct vnode *vp;
56610556Sdyson	vm_page_t *m;
56733847Smsmith	int bytecount;
56810556Sdyson	int reqpage;
56910556Sdyson{
57033847Smsmith	vm_object_t object;
57112767Sdyson	vm_offset_t kva;
57234206Sdyson	off_t foff, tfoff, nextoff;
5739507Sdg	int i, size, bsize, first, firstaddr;
57433847Smsmith	struct vnode *dp;
5756151Sdg	int runpg;
5766151Sdg	int runend;
5777178Sdg	struct buf *bp;
5785455Sdg	int s;
57933847Smsmith	int count;
5805455Sdg	int error = 0;
5811549Srgrimes
58233847Smsmith	object = vp->v_object;
58333847Smsmith	count = bytecount / PAGE_SIZE;
58433847Smsmith
58511701Sdyson	if (vp->v_mount == NULL)
58611701Sdyson		return VM_PAGER_BAD;
58711701Sdyson
5881549Srgrimes	bsize = vp->v_mount->mnt_stat.f_iosize;
5891549Srgrimes
5901549Srgrimes	/* get the UNDERLYING device for the file with VOP_BMAP() */
5911827Sdg
5921549Srgrimes	/*
5931827Sdg	 * originally, we did not check for an error return value -- assuming
5941827Sdg	 * an fs always has a bmap entry point -- that assumption is wrong!!!
5951549Srgrimes	 */
59612767Sdyson	foff = IDX_TO_OFF(m[reqpage]->pindex);
5971827Sdg
5981549Srgrimes	/*
5991887Sdg	 * if we can't bmap, use old VOP code
6001549Srgrimes	 */
60110551Sdyson	if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) {
6021549Srgrimes		for (i = 0; i < count; i++) {
6031549Srgrimes			if (i != reqpage) {
6041549Srgrimes				vnode_pager_freepage(m[i]);
6051549Srgrimes			}
6061549Srgrimes		}
6073612Sdg		cnt.v_vnodein++;
6083612Sdg		cnt.v_vnodepgsin++;
6099507Sdg		return vnode_pager_input_old(object, m[reqpage]);
6101549Srgrimes
6111827Sdg		/*
6121827Sdg		 * if the blocksize is smaller than a page size, then use
6131827Sdg		 * special small filesystem code.  NFS sometimes has a small
6141827Sdg		 * blocksize, but it can handle large reads itself.
6151827Sdg		 */
6161827Sdg	} else if ((PAGE_SIZE / bsize) > 1 &&
61738866Sbde	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
6181549Srgrimes		for (i = 0; i < count; i++) {
6191549Srgrimes			if (i != reqpage) {
6201549Srgrimes				vnode_pager_freepage(m[i]);
6211549Srgrimes			}
6221549Srgrimes		}
6233612Sdg		cnt.v_vnodein++;
6243612Sdg		cnt.v_vnodepgsin++;
6259507Sdg		return vnode_pager_input_smlfs(object, m[reqpage]);
6261549Srgrimes	}
62745347Sjulian
6281549Srgrimes	/*
62945347Sjulian	 * If we have a completely valid page available to us, we can
63045347Sjulian	 * clean up and return.  Otherwise we have to re-read the
63145347Sjulian	 * media.
6321549Srgrimes	 */
63325930Sdfr
63445347Sjulian	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
6355455Sdg		for (i = 0; i < count; i++) {
6365455Sdg			if (i != reqpage)
6375455Sdg				vnode_pager_freepage(m[i]);
6381549Srgrimes		}
6395455Sdg		return VM_PAGER_OK;
6401549Srgrimes	}
64145347Sjulian	m[reqpage]->valid = 0;
6427178Sdg
6435455Sdg	/*
6445455Sdg	 * here on direct device I/O
6455455Sdg	 */
6461549Srgrimes
6476151Sdg	firstaddr = -1;
6481549Srgrimes	/*
6496151Sdg	 * calculate the run that includes the required page
6501549Srgrimes	 */
6516151Sdg	for(first = 0, i = 0; i < count; i = runend) {
65212767Sdyson		firstaddr = vnode_pager_addr(vp,
65312767Sdyson			IDX_TO_OFF(m[i]->pindex), &runpg);
6546151Sdg		if (firstaddr == -1) {
6559507Sdg			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
65637562Sbde				/* XXX no %qd in kernel. */
65737562Sbde				panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: 0x%lx%08lx, vnp_size: 0x%lx%08lx",
65837562Sbde			   	 firstaddr, (u_long)(foff >> 32),
65937562Sbde			   	 (u_long)(u_int32_t)foff,
66037562Sbde				 (u_long)(u_int32_t)
66137562Sbde				 (object->un_pager.vnp.vnp_size >> 32),
66237562Sbde				 (u_long)(u_int32_t)
66337562Sbde				 object->un_pager.vnp.vnp_size);
6646151Sdg			}
6651549Srgrimes			vnode_pager_freepage(m[i]);
6666151Sdg			runend = i + 1;
6676151Sdg			first = runend;
6686151Sdg			continue;
6691549Srgrimes		}
6706151Sdg		runend = i + runpg;
6719507Sdg		if (runend <= reqpage) {
6726151Sdg			int j;
6739507Sdg			for (j = i; j < runend; j++) {
6746151Sdg				vnode_pager_freepage(m[j]);
6756151Sdg			}
6761549Srgrimes		} else {
6779507Sdg			if (runpg < (count - first)) {
6789507Sdg				for (i = first + runpg; i < count; i++)
6796151Sdg					vnode_pager_freepage(m[i]);
6806151Sdg				count = first + runpg;
6816151Sdg			}
6826151Sdg			break;
6831549Srgrimes		}
6846151Sdg		first = runend;
6851549Srgrimes	}
6861549Srgrimes
6871549Srgrimes	/*
6881827Sdg	 * the first and last page have been calculated now, move input pages
6891827Sdg	 * to be zero based...
6901549Srgrimes	 */
6911549Srgrimes	if (first != 0) {
6921549Srgrimes		for (i = first; i < count; i++) {
6931549Srgrimes			m[i - first] = m[i];
6941549Srgrimes		}
6951549Srgrimes		count -= first;
6961549Srgrimes		reqpage -= first;
6971549Srgrimes	}
6986151Sdg
6991549Srgrimes	/*
7001549Srgrimes	 * calculate the file virtual address for the transfer
7011549Srgrimes	 */
70212767Sdyson	foff = IDX_TO_OFF(m[0]->pindex);
7031827Sdg
7041549Srgrimes	/*
7051549Srgrimes	 * calculate the size of the transfer
7061549Srgrimes	 */
7071549Srgrimes	size = count * PAGE_SIZE;
7089507Sdg	if ((foff + size) > object->un_pager.vnp.vnp_size)
7099507Sdg		size = object->un_pager.vnp.vnp_size - foff;
7101549Srgrimes
7111549Srgrimes	/*
7121549Srgrimes	 * round up physical size for real devices
7131549Srgrimes	 */
7141827Sdg	if (dp->v_type == VBLK || dp->v_type == VCHR)
7151549Srgrimes		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
7161549Srgrimes
71742957Sdillon	bp = getpbuf(&vnode_pbuf_freecnt);
7185455Sdg	kva = (vm_offset_t) bp->b_data;
7191887Sdg
7201549Srgrimes	/*
7211549Srgrimes	 * and map the pages to be read into the kva
7221549Srgrimes	 */
7231887Sdg	pmap_qenter(kva, m, count);
7241549Srgrimes
7251549Srgrimes	/* build a minimal buffer header */
7261549Srgrimes	bp->b_flags = B_BUSY | B_READ | B_CALL;
7271549Srgrimes	bp->b_iodone = vnode_pager_iodone;
7281549Srgrimes	/* B_PHYS is not set, but it is nice to fill this in */
7291549Srgrimes	bp->b_proc = curproc;
7301549Srgrimes	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
7311827Sdg	if (bp->b_rcred != NOCRED)
7321549Srgrimes		crhold(bp->b_rcred);
7331827Sdg	if (bp->b_wcred != NOCRED)
7341549Srgrimes		crhold(bp->b_wcred);
7356626Sdg	bp->b_blkno = firstaddr;
7365455Sdg	pbgetvp(dp, bp);
7371549Srgrimes	bp->b_bcount = size;
7381549Srgrimes	bp->b_bufsize = size;
7391549Srgrimes
7403612Sdg	cnt.v_vnodein++;
7413612Sdg	cnt.v_vnodepgsin += count;
7423612Sdg
7431549Srgrimes	/* do the input */
74437384Sjulian	VOP_STRATEGY(bp->b_vp, bp);
7453612Sdg
74633758Sdyson	s = splvm();
74733758Sdyson	/* we definitely need to be at splvm here */
7481549Srgrimes
7491549Srgrimes	while ((bp->b_flags & B_DONE) == 0) {
7509356Sdg		tsleep(bp, PVM, "vnread", 0);
7511549Srgrimes	}
7521549Srgrimes	splx(s);
7531549Srgrimes	if ((bp->b_flags & B_ERROR) != 0)
7541549Srgrimes		error = EIO;
7551549Srgrimes
7561549Srgrimes	if (!error) {
7571549Srgrimes		if (size != count * PAGE_SIZE)
7581827Sdg			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
7591549Srgrimes	}
7605455Sdg	pmap_qremove(kva, count);
7611549Srgrimes
7621549Srgrimes	/*
7631549Srgrimes	 * free the buffer header back to the swap buffer pool
7641549Srgrimes	 */
76542957Sdillon	relpbuf(bp, &vnode_pbuf_freecnt);
7661549Srgrimes
76734206Sdyson	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
76834206Sdyson		vm_page_t mt;
76934206Sdyson
77034206Sdyson		nextoff = tfoff + PAGE_SIZE;
77134206Sdyson		mt = m[i];
77234206Sdyson
77334206Sdyson		if (nextoff <= size) {
77445347Sjulian			/*
77545347Sjulian			 * Read filled up entire page.
77645347Sjulian			 */
77734206Sdyson			mt->valid = VM_PAGE_BITS_ALL;
77834206Sdyson			mt->dirty = 0;
77934206Sdyson			pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
78034206Sdyson		} else {
78145347Sjulian			/*
78245347Sjulian			 * Read did not fill up entire page.  Since this
78345347Sjulian			 * is getpages, the page may be mapped, so we have
78445347Sjulian			 * to zero the invalid portions of the page even
78545347Sjulian			 * though we aren't setting them valid.
78645347Sjulian			 *
78745347Sjulian			 * Currently we do not set the entire page valid,
78845347Sjulian			 * we just try to clear the piece that we couldn't
78945347Sjulian			 * read.
79045347Sjulian			 */
79145347Sjulian			vm_page_set_validclean(mt, 0, size - tfoff);
79245347Sjulian			vm_page_zero_invalid(mt, FALSE);
79334206Sdyson		}
79434206Sdyson
79538799Sdfr		vm_page_flag_clear(mt, PG_ZERO);
7961549Srgrimes		if (i != reqpage) {
7971827Sdg
7981549Srgrimes			/*
7991827Sdg			 * whether or not to leave the page activated is up in
8001827Sdg			 * the air, but we should put the page on a page queue
8011827Sdg			 * somewhere. (it already is in the object). Result:
8021827Sdg			 * It appears that emperical results show that
8031827Sdg			 * deactivating pages is best.
8041549Srgrimes			 */
8051827Sdg
8061549Srgrimes			/*
8071827Sdg			 * just in case someone was asking for this page we
8081827Sdg			 * now tell them that it is ok to use
8091549Srgrimes			 */
8101549Srgrimes			if (!error) {
81134206Sdyson				if (mt->flags & PG_WANTED)
81234206Sdyson					vm_page_activate(mt);
81333109Sdyson				else
81434206Sdyson					vm_page_deactivate(mt);
81538799Sdfr				vm_page_wakeup(mt);
8161549Srgrimes			} else {
81734206Sdyson				vnode_pager_freepage(mt);
8181549Srgrimes			}
8191549Srgrimes		}
8201549Srgrimes	}
8211549Srgrimes	if (error) {
8229507Sdg		printf("vnode_pager_getpages: I/O read error\n");
8231549Srgrimes	}
8244207Sdg	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
8251549Srgrimes}
8261549Srgrimes
82733847Smsmith/*
82833847Smsmith * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
82933847Smsmith * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to
83033847Smsmith * vnode_pager_generic_putpages() to implement the previous behaviour.
83133847Smsmith *
83233847Smsmith * All other FS's should use the bypass to get to the local media
83333847Smsmith * backing vp's VOP_PUTPAGES.
83433847Smsmith */
/*
 * Pager put interface: hand the pages to the filesystem via
 * VOP_PUTPAGES, falling back to the generic implementation (with a
 * console warning) for stale filesystems that still return EOPNOTSUPP.
 * Per-page status is reported through "rtvals"; the aggregate rtval is
 * computed but discarded because this interface is void.
 */
83543129Sdillonstatic void
83610556Sdysonvnode_pager_putpages(object, m, count, sync, rtvals)
83710556Sdyson	vm_object_t object;
83810556Sdyson	vm_page_t *m;
83910556Sdyson	int count;
84010556Sdyson	boolean_t sync;
84110556Sdyson	int *rtvals;
84210556Sdyson{
84310556Sdyson	int rtval;
84410556Sdyson	struct vnode *vp;
84534403Smsmith	int bytes = count * PAGE_SIZE;
84618973Sdyson
84744321Salc	/*
84844321Salc	 * Force synchronous operation if we are extremely low on memory
84944321Salc	 * to prevent a low-memory deadlock.  VOP operations often need to
85044321Salc	 * allocate more memory to initiate the I/O ( i.e. do a BMAP
85144321Salc	 * operation ).  The swapper handles the case by limiting the amount
85244321Salc	 * of asynchronous I/O, but that sort of solution doesn't scale well
85344321Salc	 * for the vnode pager without a lot of work.
85444321Salc	 *
85544321Salc	 * Also, the backing vnode's iodone routine may not wake the pageout
85644321Salc	 * daemon up.  This should be probably be addressed XXX.
85744321Salc	 */
85844321Salc
85944321Salc	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
86044321Salc		sync |= OBJPC_SYNC;
86144321Salc
86244321Salc	/*
86344321Salc	 * Call device-specific putpages function
86444321Salc	 */
86544321Salc
86610556Sdyson	vp = object->handle;
86734403Smsmith	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
86834403Smsmith	if (rtval == EOPNOTSUPP) {
86934403Smsmith	    printf("vnode_pager: *** WARNING *** stale FS putpages\n");
87034403Smsmith	    rtval = vnode_pager_generic_putpages( vp, m, bytes, sync, rtvals);
87134403Smsmith	}
87210556Sdyson}
87310556Sdyson
87433847Smsmith
8751549Srgrimes/*
87633847Smsmith * This is now called from local media FS's to operate against their
87745057Seivind * own vnodes if they fail to implement VOP_PUTPAGES.
8781549Srgrimes */
87933847Smsmithint
88034206Sdysonvnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
88133847Smsmith	struct vnode *vp;
8821549Srgrimes	vm_page_t *m;
88333847Smsmith	int bytecount;
88434206Sdyson	int flags;
8855455Sdg	int *rtvals;
8861549Srgrimes{
8877695Sdg	int i;
88833847Smsmith	vm_object_t object;
88933847Smsmith	int count;
8901549Srgrimes
8917695Sdg	int maxsize, ncount;
89212767Sdyson	vm_ooffset_t poffset;
8937695Sdg	struct uio auio;
8947695Sdg	struct iovec aiov;
8957695Sdg	int error;
89634206Sdyson	int ioflags;
8971549Srgrimes
89833847Smsmith	object = vp->v_object;
89933847Smsmith	count = bytecount / PAGE_SIZE;
90033847Smsmith
9011827Sdg	for (i = 0; i < count; i++)
9021549Srgrimes		rtvals[i] = VM_PAGER_AGAIN;
9031549Srgrimes
90412767Sdyson	if ((int) m[0]->pindex < 0) {
90534206Sdyson		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%x(%x)\n",
90634206Sdyson			m[0]->pindex, m[0]->dirty);
9077695Sdg		rtvals[0] = VM_PAGER_BAD;
9087695Sdg		return VM_PAGER_BAD;
9095455Sdg	}
9107178Sdg
9117695Sdg	maxsize = count * PAGE_SIZE;
9127695Sdg	ncount = count;
9131549Srgrimes
91412767Sdyson	poffset = IDX_TO_OFF(m[0]->pindex);
91512767Sdyson	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
91612767Sdyson		if (object->un_pager.vnp.vnp_size > poffset)
91712767Sdyson			maxsize = object->un_pager.vnp.vnp_size - poffset;
9188585Sdg		else
9198585Sdg			maxsize = 0;
92015583Sphk		ncount = btoc(maxsize);
9218585Sdg		if (ncount < count) {
9228585Sdg			for (i = ncount; i < count; i++) {
9237695Sdg				rtvals[i] = VM_PAGER_BAD;
9241549Srgrimes			}
9251549Srgrimes		}
9261541Srgrimes	}
9277695Sdg
92834206Sdyson	ioflags = IO_VMIO;
92934206Sdyson	ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: 0;
93034206Sdyson	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
9311827Sdg
9327695Sdg	aiov.iov_base = (caddr_t) 0;
9337695Sdg	aiov.iov_len = maxsize;
9347695Sdg	auio.uio_iov = &aiov;
9357695Sdg	auio.uio_iovcnt = 1;
93612767Sdyson	auio.uio_offset = poffset;
9377695Sdg	auio.uio_segflg = UIO_NOCOPY;
9387695Sdg	auio.uio_rw = UIO_WRITE;
9397695Sdg	auio.uio_resid = maxsize;
9407695Sdg	auio.uio_procp = (struct proc *) 0;
94134206Sdyson	error = VOP_WRITE(vp, &auio, ioflags, curproc->p_ucred);
9423612Sdg	cnt.v_vnodeout++;
9437695Sdg	cnt.v_vnodepgsout += ncount;
9443612Sdg
9458585Sdg	if (error) {
9469507Sdg		printf("vnode_pager_putpages: I/O error %d\n", error);
9477695Sdg	}
9488585Sdg	if (auio.uio_resid) {
94937555Sbde		printf("vnode_pager_putpages: residual I/O %d at %lu\n",
95037555Sbde		    auio.uio_resid, (u_long)m[0]->pindex);
9517695Sdg	}
95233936Sdyson	for (i = 0; i < ncount; i++) {
95333936Sdyson		rtvals[i] = VM_PAGER_OK;
9547695Sdg	}
9557695Sdg	return rtvals[0];
9567695Sdg}
9571549Srgrimes
/*
 * Walk an object shadow chain looking for the backing OBJT_VNODE
 * object and return its vnode locked (shared, via vget()).  Returns
 * NULL if no vnode object exists in the chain or the vnode object is
 * (or becomes) dead.
 */
struct vnode *
vnode_pager_lock(object)
	vm_object_t object;
{
	struct proc *p = curproc;	/* XXX */

	for (; object != NULL; object = object->backing_object) {
		if (object->type != OBJT_VNODE)
			continue;
		if (object->flags & OBJ_DEAD)
			return NULL;

		/*
		 * vget() can fail transiently; retry, but give up if the
		 * object died or changed type while we slept in vget().
		 */
		while (vget(object->handle,
			LK_NOPAUSE | LK_SHARED | LK_RETRY | LK_CANRECURSE, p)) {
			if ((object->flags & OBJ_DEAD) || (object->type != OBJT_VNODE))
				return NULL;
			printf("vnode_pager_lock: retrying\n");
		}
		return object->handle;
	}
	return NULL;
}
980