vnode_pager.c revision 106603
/*
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993, 1994 John S. Dyson
 * Copyright (c) 1995, David Greenman
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
 * $FreeBSD: head/sys/vm/vnode_pager.c 106603 2002-11-07 23:16:22Z mux $
 */

/*
 * Page to/from files (vnodes).
 */

/*
 * TODO:
 *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
 *	greatly re-simplify the vnode_pager.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/stdint.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>

static void vnode_pager_init(void);
static vm_offset_t vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
					 int *run);
static void vnode_pager_iodone(struct buf *bp);
static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);

struct pagerops vnodepagerops = {
	vnode_pager_init,
	vnode_pager_alloc,
	vnode_pager_dealloc,
	vnode_pager_getpages,
	vnode_pager_putpages,
	vnode_pager_haspage,
	NULL
};

int vnode_pbuf_freecnt;

static void
vnode_pager_init(void)
{

	vnode_pbuf_freecnt = nswbuf / 2 + 1;
}

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 *
 * MPSAFE
 */
vm_object_t
vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
		  vm_ooffset_t offset)
{
	vm_object_t object;
	struct vnode *vp;

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	vp = (struct vnode *) handle;

	ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");

	mtx_lock(&Giant);
	/*
	 * Prevent race condition when allocating the object. This
	 * can happen with NFS vnodes since the nfsnode isn't locked.
	 */
	VI_LOCK(vp);
	while (vp->v_iflag & VI_OLOCK) {
		vp->v_iflag |= VI_OWANT;
		msleep(vp, VI_MTX(vp), PVM, "vnpobj", 0);
	}
	vp->v_iflag |= VI_OLOCK;
	VI_UNLOCK(vp);

	/*
	 * If the object is being terminated, wait for it to
	 * go away.
	 */
	while (((object = vp->v_object) != NULL) &&
		(object->flags & OBJ_DEAD)) {
		tsleep(object, PVM, "vadead", 0);
	}

	if (vp->v_usecount == 0)
		panic("vnode_pager_alloc: no vnode reference");

	if (object == NULL) {
		/*
		 * And an object of the appropriate size
		 */
		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));

		object->un_pager.vnp.vnp_size = size;

		object->handle = handle;
		vp->v_object = object;
	} else {
		object->ref_count++;
	}
	VI_LOCK(vp);
	vp->v_usecount++;
	vp->v_iflag &= ~VI_OLOCK;
	if (vp->v_iflag & VI_OWANT) {
		vp->v_iflag &= ~VI_OWANT;
		wakeup(vp);
	}
	VI_UNLOCK(vp);
	mtx_unlock(&Giant);
	return (object);
}
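
/*
 * Illustrative sketch (not part of this file): a caller attaching a VM
 * object to a vnode it has locked and referenced would call the
 * allocator roughly as follows.  "va_size" here is a stand-in for the
 * file size the filesystem already knows (e.g. from its inode); the
 * exact call site varies by filesystem.
 *
 *	vm_object_t obj;
 *
 *	ASSERT_VOP_LOCKED(vp, "example");
 *	obj = vnode_pager_alloc(vp, va_size, 0, 0);
 *
 * A second call for the same vnode returns the existing object with its
 * reference count bumped rather than allocating a new one.
 */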

static void
vnode_pager_dealloc(object)
	vm_object_t object;
{
	struct vnode *vp = object->handle;

	GIANT_REQUIRED;
	if (vp == NULL)
		panic("vnode_pager_dealloc: pager already dealloced");

	vm_object_pip_wait(object, "vnpdea");

	object->handle = NULL;
	object->type = OBJT_DEAD;
	ASSERT_VOP_LOCKED(vp, "vnode_pager_dealloc");
	vp->v_object = NULL;
	vp->v_vflag &= ~(VV_TEXT | VV_OBJBUF);
}

static boolean_t
vnode_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	struct vnode *vp = object->handle;
	daddr_t bn;
	int err;
	daddr_t reqblock;
	int poff;
	int bsize;
	int pagesperblock, blocksperpage;

	GIANT_REQUIRED;
	/*
	 * If no vp or vp is doomed or marked transparent to VM, we do not
	 * have the page.
	 */
	if (vp == NULL)
		return FALSE;

	VI_LOCK(vp);
	if (vp->v_iflag & VI_DOOMED) {
		VI_UNLOCK(vp);
		return FALSE;
	}
	VI_UNLOCK(vp);
	/*
	 * If filesystem no longer mounted or offset beyond end of file we do
	 * not have the page.
	 */
	if ((vp->v_mount == NULL) ||
	    (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
		return FALSE;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	pagesperblock = bsize / PAGE_SIZE;
	blocksperpage = 0;
	if (pagesperblock > 0) {
		reqblock = pindex / pagesperblock;
	} else {
		blocksperpage = (PAGE_SIZE / bsize);
		reqblock = pindex * blocksperpage;
	}
	err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn,
		after, before);
	if (err)
		return TRUE;
	if (bn == -1)
		return FALSE;
	if (pagesperblock > 0) {
		poff = pindex - (reqblock * pagesperblock);
		if (before) {
			*before *= pagesperblock;
			*before += poff;
		}
		if (after) {
			int numafter;
			*after *= pagesperblock;
			numafter = pagesperblock - (poff + 1);
			if (IDX_TO_OFF(pindex + numafter) >
			    object->un_pager.vnp.vnp_size) {
				numafter =
				    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
				    pindex;
			}
			*after += numafter;
		}
	} else {
		if (before) {
			*before /= blocksperpage;
		}

		if (after) {
			*after /= blocksperpage;
		}
	}
	return TRUE;
}
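
/*
 * Worked example for the run arithmetic above (illustrative numbers):
 * with f_iosize 8192 and PAGE_SIZE 4096, pagesperblock is 2.  For
 * pindex 5, reqblock is 2 and poff is 1; if VOP_BMAP() reports one
 * readable block before the request and one after it, *before becomes
 * 1 * 2 + 1 = 3 pages and *after becomes 1 * 2 + (2 - (1 + 1)) = 2
 * pages, with the after-run clipped against vnp_size so it never
 * extends past EOF.
 */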

/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
	struct vnode *vp;
	vm_ooffset_t nsize;
{
	vm_pindex_t nobjsize;
	vm_object_t object = vp->v_object;

	GIANT_REQUIRED;

	if (object == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	if (nsize == object->un_pager.vnp.vnp_size)
		return;

	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);

	/*
	 * File has shrunk. Toss any cached pages beyond the new EOF.
	 */
	if (nsize < object->un_pager.vnp.vnp_size) {
#ifdef ENABLE_VFS_IOOPT
		vm_freeze_copyopts(object, OFF_TO_IDX(nsize), object->size);
#endif
		if (nobjsize < object->size) {
			vm_object_page_remove(object, nobjsize, object->size,
				FALSE);
		}
		/*
		 * this gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode.
		 *
		 * XXX for some reason (I don't know yet), if we take a
		 * completely invalid page and mark it partially valid
		 * it can screw up NFS reads, so we don't allow the case.
		 */
		if (nsize & PAGE_MASK) {
			vm_page_t m;

			m = vm_page_lookup(object, OFF_TO_IDX(nsize));
			if (m && m->valid) {
				int base = (int)nsize & PAGE_MASK;
				int size = PAGE_SIZE - base;

				/*
				 * Clear out partial-page garbage in case
				 * the page has been mapped.
				 */
				pmap_zero_page_area(m, base, size);

				/*
				 * XXX work around SMP data integrity race
				 * by unmapping the page from user processes.
				 * The garbage we just cleared may be mapped
				 * to a user process running on another cpu
				 * and this code is not running through normal
				 * I/O channels which handle SMP issues for
				 * us, so unmap page to synchronize all cpus.
				 *
				 * XXX should vm_pager_unmap_page() have
				 * dealt with this?
				 */
				vm_page_protect(m, VM_PROT_NONE);

				/*
				 * Clear out partial-page dirty bits.  This
				 * has the side effect of setting the valid
				 * bits, but that is ok.  There are a bunch
				 * of places in the VM system where we expected
				 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
				 * case is one of them.  If the page is still
				 * partially dirty, make it fully dirty.
				 *
				 * note that we do not clear out the valid
				 * bits.  This would prevent bogus_page
				 * replacement from working properly.
				 */
				vm_page_set_validclean(m, base, size);
				if (m->dirty != 0)
					m->dirty = VM_PAGE_BITS_ALL;
			}
		}
	}
	object->un_pager.vnp.vnp_size = nsize;
	object->size = nobjsize;
}
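
/*
 * Illustrative sketch (assumption, not taken from this file): a
 * filesystem typically calls this after it has changed the file length,
 * for example at the end of its truncate path, so the object tracks the
 * new EOF.  "ip" and "newlength" are hypothetical names for the
 * filesystem's in-core inode and the post-truncate size:
 *
 *	ip->i_size = newlength;
 *	vnode_pager_setsize(vp, (vm_ooffset_t)newlength);
 */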

/*
 * calculate the underlying device block address (in DEV_BSIZE units)
 * that backs the specified byte offset within the file
 */
static vm_offset_t
vnode_pager_addr(vp, address, run)
	struct vnode *vp;
	vm_ooffset_t address;
	int *run;
{
	int rtaddress;
	int bsize;
	daddr_t block;
	struct vnode *rtvp;
	int err;
	daddr_t vblock;
	int voffset;

	GIANT_REQUIRED;
	if ((int) address < 0)
		return -1;

	if (vp->v_mount == NULL)
		return -1;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	vblock = address / bsize;
	voffset = address % bsize;

	err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL);

	if (err || (block == -1))
		rtaddress = -1;
	else {
		rtaddress = block + voffset / DEV_BSIZE;
		if (run) {
			*run += 1;
			*run *= bsize/PAGE_SIZE;
			*run -= voffset/PAGE_SIZE;
		}
	}

	return rtaddress;
}
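
/*
 * Worked example for the arithmetic above (illustrative numbers): with
 * f_iosize 8192, PAGE_SIZE 4096 and a file offset of 10000, vblock is 1
 * and voffset is 1808.  If VOP_BMAP() maps logical block 1 to device
 * block 64, the result is 64 + 1808 / DEV_BSIZE = 67 in DEV_BSIZE
 * (512 byte) units, and a reported run of 1 extra contiguous block
 * becomes (1 + 1) * 8192 / PAGE_SIZE - 1808 / PAGE_SIZE = 4 pages.
 */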

/*
 * interrupt routine for I/O completion
 */
static void
vnode_pager_iodone(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	wakeup(bp);
}

/*
 * small block filesystem vnode pager input
 */
static int
vnode_pager_input_smlfs(object, m)
	vm_object_t object;
	vm_page_t m;
{
	int i;
	int s;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t kva;
	int fileaddr;
	vm_offset_t bsize;
	int error = 0;

	GIANT_REQUIRED;

	vp = object->handle;
	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	VOP_BMAP(vp, 0, &dp, 0, NULL, NULL);

	kva = vm_pager_map_page(m);

	for (i = 0; i < PAGE_SIZE / bsize; i++) {
		vm_ooffset_t address;

		if (vm_page_bits(i * bsize, bsize) & m->valid)
			continue;

		address = IDX_TO_OFF(m->pindex) + i * bsize;
		if (address >= object->un_pager.vnp.vnp_size) {
			fileaddr = -1;
		} else {
			fileaddr = vnode_pager_addr(vp, address, NULL);
		}
		if (fileaddr != -1) {
			bp = getpbuf(&vnode_pbuf_freecnt);

			/* build a minimal buffer header */
			bp->b_iocmd = BIO_READ;
			bp->b_iodone = vnode_pager_iodone;
			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
			bp->b_rcred = crhold(curthread->td_ucred);
			bp->b_wcred = crhold(curthread->td_ucred);
			bp->b_data = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr;
			pbgetvp(dp, bp);
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;
			bp->b_runningbufspace = bp->b_bufsize;
			runningbufspace += bp->b_runningbufspace;

			/* do the input */
			BUF_STRATEGY(bp);

			/* we definitely need to be at splvm here */

			s = splvm();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep(bp, PVM, "vnsrd", 0);
			}
			splx(s);
			if ((bp->b_ioflags & BIO_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp, &vnode_pbuf_freecnt);
			if (error)
				break;

			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
		} else {
			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
			bzero((caddr_t) kva + i * bsize, bsize);
		}
	}
	vm_pager_unmap_page(kva);
	pmap_clear_modify(m);
	vm_page_flag_clear(m, PG_ZERO);
	if (error) {
		return VM_PAGER_ERROR;
	}
	return VM_PAGER_OK;

}
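
/*
 * Worked example for the small-block loop above (illustrative numbers):
 * with f_iosize 1024 and PAGE_SIZE 4096 there are four filesystem
 * blocks per page.  vm_page_bits(i * bsize, bsize) yields the valid
 * mask covering block i, so a page whose m->valid already covers
 * blocks 0 and 1 only issues reads (or zero fills past EOF) for
 * blocks 2 and 3.
 */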


/*
 * old style vnode pager input routine
 */
static int
vnode_pager_input_old(object, m)
	vm_object_t object;
	vm_page_t m;
{
	struct uio auio;
	struct iovec aiov;
	int error;
	int size;
	vm_offset_t kva;
	struct vnode *vp;

	GIANT_REQUIRED;
	error = 0;

	/*
	 * Return failure if beyond current EOF
	 */
	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);

		/*
		 * Allocate a kernel virtual address and initialize so that
		 * we can use VOP_READ/WRITE routines.
		 */
		kva = vm_pager_map_page(m);

		vp = object->handle;
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = IDX_TO_OFF(m->pindex);
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_READ;
		auio.uio_resid = size;
		auio.uio_td = curthread;

		error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
		if (!error) {
			int count = size - auio.uio_resid;

			if (count == 0)
				error = EINVAL;
			else if (count != PAGE_SIZE)
				bzero((caddr_t) kva + count, PAGE_SIZE - count);
		}
		vm_pager_unmap_page(kva);
	}
	pmap_clear_modify(m);
	vm_page_undirty(m);
	vm_page_flag_clear(m, PG_ZERO);
	if (!error)
		m->valid = VM_PAGE_BITS_ALL;
	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
}

/*
 * generic vnode pager input routine
 */

/*
 * Local media VFS's that do not implement their own VOP_GETPAGES
 * should have their VOP_GETPAGES call vnode_pager_generic_getpages()
 * to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_GETPAGES.
 */
static int
vnode_pager_getpages(object, m, count, reqpage)
	vm_object_t object;
	vm_page_t *m;
	int count;
	int reqpage;
{
	int rtval;
	struct vnode *vp;
	int bytes = count * PAGE_SIZE;

	GIANT_REQUIRED;
	vp = object->handle;
	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
	KASSERT(rtval != EOPNOTSUPP,
	    ("vnode_pager: FS getpages not implemented\n"));
	return rtval;
}

/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
	struct vnode *vp;
	vm_page_t *m;
	int bytecount;
	int reqpage;
{
	vm_object_t object;
	vm_offset_t kva;
	off_t foff, tfoff, nextoff;
	int i, j, size, bsize, first, firstaddr;
	struct vnode *dp;
	int runpg;
	int runend;
	struct buf *bp;
	int s;
	int count;
	int error = 0;

	GIANT_REQUIRED;
	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	/* get the UNDERLYING device for the file with VOP_BMAP() */

	/*
	 * originally, we did not check for an error return value -- assuming
	 * an fs always has a bmap entry point -- that assumption is wrong!!!
	 */
	foff = IDX_TO_OFF(m[reqpage]->pindex);

	/*
	 * if we can't bmap, use old VOP code
	 */
	if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) {
		vm_page_lock_queues();
		for (i = 0; i < count; i++)
			if (i != reqpage)
				vm_page_free(m[i]);
		vm_page_unlock_queues();
		cnt.v_vnodein++;
		cnt.v_vnodepgsin++;
		return vnode_pager_input_old(object, m[reqpage]);

		/*
		 * if the blocksize is smaller than a page size, then use
		 * special small filesystem code.  NFS sometimes has a small
		 * blocksize, but it can handle large reads itself.
		 */
	} else if ((PAGE_SIZE / bsize) > 1 &&
	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
		vm_page_lock_queues();
		for (i = 0; i < count; i++)
			if (i != reqpage)
				vm_page_free(m[i]);
		vm_page_unlock_queues();
		cnt.v_vnodein++;
		cnt.v_vnodepgsin++;
		return vnode_pager_input_smlfs(object, m[reqpage]);
	}

	/*
	 * If we have a completely valid page available to us, we can
	 * clean up and return.  Otherwise we have to re-read the
	 * media.
	 */
	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
		vm_page_lock_queues();
		for (i = 0; i < count; i++)
			if (i != reqpage)
				vm_page_free(m[i]);
		vm_page_unlock_queues();
		return VM_PAGER_OK;
	}
	m[reqpage]->valid = 0;

	/*
	 * here on direct device I/O
	 */
	firstaddr = -1;

	/*
	 * calculate the run that includes the required page
	 */
	for (first = 0, i = 0; i < count; i = runend) {
		firstaddr = vnode_pager_addr(vp,
			IDX_TO_OFF(m[i]->pindex), &runpg);
		if (firstaddr == -1) {
			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
				panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
				    firstaddr, (uintmax_t)(foff >> 32),
				    (uintmax_t)foff,
				    (uintmax_t)
				    (object->un_pager.vnp.vnp_size >> 32),
				    (uintmax_t)object->un_pager.vnp.vnp_size);
			}
			vm_page_lock_queues();
			vm_page_free(m[i]);
			vm_page_unlock_queues();
			runend = i + 1;
			first = runend;
			continue;
		}
		runend = i + runpg;
		if (runend <= reqpage) {
			vm_page_lock_queues();
			for (j = i; j < runend; j++)
				vm_page_free(m[j]);
			vm_page_unlock_queues();
		} else {
			if (runpg < (count - first)) {
				vm_page_lock_queues();
				for (i = first + runpg; i < count; i++)
					vm_page_free(m[i]);
				vm_page_unlock_queues();
				count = first + runpg;
			}
			break;
		}
		first = runend;
	}
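
	/*
	 * Worked example for the loop above (illustrative numbers): with
	 * count = 8 and reqpage = 4, suppose the run starting at page 0 is
	 * only 2 pages long and the run starting at page 2 covers the
	 * remaining 6 pages.  The first run ends before reqpage, so pages
	 * 0-1 are freed and "first" advances to 2; the second run reaches
	 * reqpage, so the loop breaks with count still 8, and the block
	 * below shifts pages 2-7 down to m[0]..m[5], leaving count = 6 and
	 * reqpage = 2.
	 */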

	/*
	 * the first and last page have been calculated now, move input pages
	 * to be zero based...
	 */
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
		}
		count -= first;
		reqpage -= first;
	}

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = IDX_TO_OFF(m[0]->pindex);

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > object->un_pager.vnp.vnp_size)
		size = object->un_pager.vnp.vnp_size - foff;

	/*
	 * round up physical size for real devices.
	 */
	if (dp->v_type == VBLK || dp->v_type == VCHR) {
		int secmask = dp->v_rdev->si_bsize_phys - 1;
		KASSERT(secmask < PAGE_SIZE, ("vnode_pager_generic_getpages: sector size %d too large\n", secmask + 1));
		size = (size + secmask) & ~secmask;
	}
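
	/*
	 * Example of the rounding above (illustrative numbers): for a
	 * device with a 512 byte physical sector size, secmask is 511, so
	 * a transfer size of 10000 bytes is rounded up to
	 * (10000 + 511) & ~511 = 10240 bytes.
	 */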

	bp = getpbuf(&vnode_pbuf_freecnt);
	kva = (vm_offset_t) bp->b_data;

	/*
	 * and map the pages to be read into the kva
	 */
	pmap_qenter(kva, m, count);

	/* build a minimal buffer header */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
	bp->b_rcred = crhold(curthread->td_ucred);
	bp->b_wcred = crhold(curthread->td_ucred);
	bp->b_blkno = firstaddr;
	pbgetvp(dp, bp);
	bp->b_bcount = size;
	bp->b_bufsize = size;
	bp->b_runningbufspace = bp->b_bufsize;
	runningbufspace += bp->b_runningbufspace;

	cnt.v_vnodein++;
	cnt.v_vnodepgsin += count;

	/* do the input */
	BUF_STRATEGY(bp);

	s = splvm();
	/* we definitely need to be at splvm here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep(bp, PVM, "vnread", 0);
	}
	splx(s);
	if ((bp->b_ioflags & BIO_ERROR) != 0)
		error = EIO;

	if (!error) {
		if (size != count * PAGE_SIZE)
			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
	}
	pmap_qremove(kva, count);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp, &vnode_pbuf_freecnt);

	vm_page_lock_queues();
	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
		vm_page_t mt;

		nextoff = tfoff + PAGE_SIZE;
		mt = m[i];

		if (nextoff <= object->un_pager.vnp.vnp_size) {
			/*
			 * Read filled up entire page.
			 */
			mt->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(mt);	/* should be an assert? XXX */
			pmap_clear_modify(mt);
		} else {
			/*
			 * Read did not fill up entire page.  Since this
			 * is getpages, the page may be mapped, so we have
			 * to zero the invalid portions of the page even
			 * though we aren't setting them valid.
			 *
			 * Currently we do not set the entire page valid,
			 * we just try to clear the piece that we couldn't
			 * read.
			 */
			vm_page_set_validclean(mt, 0,
			    object->un_pager.vnp.vnp_size - tfoff);
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(mt, FALSE); */
		}

		vm_page_flag_clear(mt, PG_ZERO);
		if (i != reqpage) {

			/*
			 * whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere. (it already is in the object). Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * just in case someone was asking for this page we
			 * now tell them that it is ok to use
			 */
			if (!error) {
				if (mt->flags & PG_WANTED)
					vm_page_activate(mt);
				else
					vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_free(mt);
			}
		}
	}
	vm_page_unlock_queues();
	if (error) {
		printf("vnode_pager_getpages: I/O read error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}
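
/*
 * Illustrative sketch (assumption, not part of this file): a local media
 * filesystem that relies on the generic path would point its VOP_GETPAGES
 * entry at a trivial wrapper along these lines; "myfs_getpages" and the
 * vop_getpages_args field names are shown as conventionally used, not
 * taken from this file.
 *
 *	static int
 *	myfs_getpages(struct vop_getpages_args *ap)
 *	{
 *
 *		return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 *		    ap->a_count, ap->a_reqpage));
 *	}
 */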

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(object, m, count, sync, rtvals)
	vm_object_t object;
	vm_page_t *m;
	int count;
	boolean_t sync;
	int *rtvals;
{
	int rtval;
	struct vnode *vp;
	struct mount *mp;
	int bytes = count * PAGE_SIZE;

	GIANT_REQUIRED;
	/*
	 * Force synchronous operation if we are extremely low on memory
	 * to prevent a low-memory deadlock.  VOP operations often need to
	 * allocate more memory to initiate the I/O ( i.e. do a BMAP
	 * operation ).  The swapper handles the case by limiting the amount
	 * of asynchronous I/O, but that sort of solution doesn't scale well
	 * for the vnode pager without a lot of work.
	 *
	 * Also, the backing vnode's iodone routine may not wake the pageout
	 * daemon up.  This should probably be addressed XXX.
	 */

	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
		sync |= OBJPC_SYNC;

	/*
	 * Call device-specific putpages function
	 */
	vp = object->handle;
	if (vp->v_type != VREG)
		mp = NULL;
	(void)vn_start_write(vp, &mp, V_WAIT);
	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
	KASSERT(rtval != EOPNOTSUPP,
	    ("vnode_pager: stale FS putpages\n"));
	vn_finished_write(mp);
}


/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has typically already occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
	struct vnode *vp;
	vm_page_t *m;
	int bytecount;
	int flags;
	int *rtvals;
{
	int i;
	vm_object_t object;
	int count;

	int maxsize, ncount;
	vm_ooffset_t poffset;
	struct uio auio;
	struct iovec aiov;
	int error;
	int ioflags;

	GIANT_REQUIRED;
	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	if ((int) m[0]->pindex < 0) {
		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%x)\n",
			(long)m[0]->pindex, m[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return VM_PAGER_BAD;
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(m[0]->pindex);

	/*
	 * If the page-aligned write is larger than the actual file we
	 * have to invalidate pages occurring beyond the file EOF.  However,
	 * there is an edge case where a file may not be page-aligned, in
	 * which case the last page is partially invalid.  In that case the
	 * filesystem may not properly clear the dirty bits for the entire
	 * page (which could be VM_PAGE_BITS_ALL due to the page having been
	 * mmap()d).  With the page locked we are free to fix up the dirty
	 * bits here.
	 *
	 * We do not under any circumstances truncate the valid bits, as
	 * this will screw up bogus page replacement.
	 */
	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > poffset) {
			int pgoff;

			maxsize = object->un_pager.vnp.vnp_size - poffset;
			ncount = btoc(maxsize);
			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
				vm_page_clear_dirty(m[ncount - 1], pgoff,
					PAGE_SIZE - pgoff);
			}
		} else {
			maxsize = 0;
			ncount = 0;
		}
		if (ncount < count) {
			for (i = ncount; i < count; i++) {
				rtvals[i] = VM_PAGER_BAD;
			}
		}
	}
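
	/*
	 * Worked example for the clipping above (illustrative numbers):
	 * with poffset 0, count 3 and vnp_size 10000 (4K pages), maxsize
	 * becomes 10000 and ncount stays 3; pgoff is 10000 & PAGE_MASK
	 * = 1808, so the dirty bits covering bytes 1808-4095 of the last
	 * page are cleared and no rtvals entries are marked VM_PAGER_BAD.
	 */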

	/*
	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
	 * rather than a bdwrite() to prevent paging I/O from saturating
	 * the buffer cache.
	 */
	ioflags = IO_VMIO;
	ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC;
	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;

	aiov.iov_base = (caddr_t) 0;
	aiov.iov_len = maxsize;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = poffset;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_WRITE;
	auio.uio_resid = maxsize;
	auio.uio_td = (struct thread *) 0;
	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += ncount;

	if (error) {
		printf("vnode_pager_putpages: I/O error %d\n", error);
	}
	if (auio.uio_resid) {
		printf("vnode_pager_putpages: residual I/O %d at %lu\n",
		    auio.uio_resid, (u_long)m[0]->pindex);
	}
	for (i = 0; i < ncount; i++) {
		rtvals[i] = VM_PAGER_OK;
	}
	return rtvals[0];
}
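
/*
 * Illustrative sketch (assumption, not part of this file): a filesystem
 * forwarding VOP_PUTPAGES to the generic implementation would use a
 * wrapper like the following; "myfs_putpages" and the vop_putpages_args
 * field names are the conventional ones, not taken from this file.
 *
 *	static int
 *	myfs_putpages(struct vop_putpages_args *ap)
 *	{
 *
 *		return (vnode_pager_generic_putpages(ap->a_vp, ap->a_m,
 *		    ap->a_count, ap->a_sync, ap->a_rtvals));
 *	}
 */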

struct vnode *
vnode_pager_lock(object)
	vm_object_t object;
{
	struct thread *td = curthread;	/* XXX */

	GIANT_REQUIRED;

	for (; object != NULL; object = object->backing_object) {
		if (object->type != OBJT_VNODE)
			continue;
		if (object->flags & OBJ_DEAD) {
			return NULL;
		}

		/* XXX; If object->handle can change, we need to cache it. */
		while (vget(object->handle,
			LK_NOPAUSE | LK_SHARED | LK_RETRY | LK_CANRECURSE, td)){
			if ((object->flags & OBJ_DEAD) || (object->type != OBJT_VNODE))
				return NULL;
			printf("vnode_pager_lock: retrying\n");
		}
		return object->handle;
	}
	return NULL;
}
