vnode_pager.c revision 79127
/*
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993, 1994 John S. Dyson
 * Copyright (c) 1995, David Greenman
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
 * $FreeBSD: head/sys/vm/vnode_pager.c 79127 2001-07-03 07:35:56Z jhb $
 */

/*
 * Page to/from files (vnodes).
 */

/*
 * TODO:
 *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
 *	greatly re-simplify the vnode_pager.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>

static void vnode_pager_init __P((void));
static vm_offset_t vnode_pager_addr __P((struct vnode *vp, vm_ooffset_t address,
					 int *run));
static void vnode_pager_iodone __P((struct buf *bp));
static int vnode_pager_input_smlfs __P((vm_object_t object, vm_page_t m));
static int vnode_pager_input_old __P((vm_object_t object, vm_page_t m));
static void vnode_pager_dealloc __P((vm_object_t));
static int vnode_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void vnode_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
static boolean_t vnode_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *));

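/*
 * Pager operations vector that hooks the vnode pager into the generic
 * pager layer; vm_pager dispatches through these entry points.
 */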
struct pagerops vnodepagerops = {
	vnode_pager_init,
	vnode_pager_alloc,
	vnode_pager_dealloc,
	vnode_pager_getpages,
	vnode_pager_putpages,
	vnode_pager_haspage,
	NULL
};

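/*
 * Number of physical buffers available for vnode pager I/O; set from
 * nswbuf in vnode_pager_init() below.
 */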
int vnode_pbuf_freecnt;

void
vnode_pager_init(void)
{

	vnode_pbuf_freecnt = nswbuf / 2 + 1;
}

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 */
vm_object_t
vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
		  vm_ooffset_t offset)
{
	vm_object_t object;
	struct vnode *vp;

	mtx_assert(&Giant, MA_OWNED);
	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	vp = (struct vnode *) handle;

	/*
	 * Prevent race condition when allocating the object. This
	 * can happen with NFS vnodes since the nfsnode isn't locked.
	 */
	mtx_unlock(&vm_mtx);
	while (vp->v_flag & VOLOCK) {
		vp->v_flag |= VOWANT;
		tsleep(vp, PVM, "vnpobj", 0);
	}
	vp->v_flag |= VOLOCK;
	mtx_lock(&vm_mtx);

	/*
	 * If the object is being terminated, wait for it to
	 * go away.
	 */
	while (((object = vp->v_object) != NULL) &&
		(object->flags & OBJ_DEAD)) {
		msleep(object, &vm_mtx, PVM, "vadead", 0);
	}

	if (vp->v_usecount == 0)
		panic("vnode_pager_alloc: no vnode reference");

	if (object == NULL) {
		/*
		 * Allocate an object of the appropriate size.
		 */
		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
		object->flags = 0;

		object->un_pager.vnp.vnp_size = size;

		object->handle = handle;
		vp->v_object = object;
		vp->v_usecount++;
	} else {
		object->ref_count++;
		vp->v_usecount++;
	}

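	/*
	 * Release vm_mtx while we clear the VOLOCK interlock taken above
	 * and wake up anyone who blocked on it.
	 */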
	mtx_unlock(&vm_mtx);
	vp->v_flag &= ~VOLOCK;
	if (vp->v_flag & VOWANT) {
		vp->v_flag &= ~VOWANT;
		wakeup(vp);
	}
	mtx_lock(&vm_mtx);
	return (object);
}

static void
vnode_pager_dealloc(object)
	vm_object_t object;
{
	register struct vnode *vp = object->handle;

	mtx_assert(&Giant, MA_OWNED);
	if (vp == NULL)
		panic("vnode_pager_dealloc: pager already dealloced");

	vm_object_pip_wait(object, "vnpdea");

	object->handle = NULL;
	object->type = OBJT_DEAD;
	vp->v_object = NULL;
	vp->v_flag &= ~(VTEXT | VOBJBUF);
}

static boolean_t
vnode_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	struct vnode *vp = object->handle;
	daddr_t bn;
	int err;
	daddr_t reqblock;
	int poff;
	int bsize;
	int pagesperblock, blocksperpage;

	mtx_assert(&Giant, MA_OWNED);
	/*
	 * If no vp or vp is doomed or marked transparent to VM, we do not
	 * have the page.
	 */
	if ((vp == NULL) || (vp->v_flag & VDOOMED))
		return FALSE;

	/*
	 * If filesystem no longer mounted or offset beyond end of file we do
	 * not have the page.
	 */
	if ((vp->v_mount == NULL) ||
		(IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
		return FALSE;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	pagesperblock = bsize / PAGE_SIZE;
	blocksperpage = 0;
	if (pagesperblock > 0) {
		reqblock = pindex / pagesperblock;
	} else {
		blocksperpage = (PAGE_SIZE / bsize);
		reqblock = pindex * blocksperpage;
	}
	mtx_unlock(&vm_mtx);
	err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn,
		after, before);
	mtx_lock(&vm_mtx);
	if (err)
		return TRUE;
	if (bn == -1)
		return FALSE;
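	/*
	 * VOP_BMAP() reports the before/after run in filesystem blocks;
	 * convert those counts into pages relative to the requested
	 * page index.
	 */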
	if (pagesperblock > 0) {
		poff = pindex - (reqblock * pagesperblock);
		if (before) {
			*before *= pagesperblock;
			*before += poff;
		}
		if (after) {
			int numafter;
			*after *= pagesperblock;
			numafter = pagesperblock - (poff + 1);
			if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) {
				numafter = OFF_TO_IDX((object->un_pager.vnp.vnp_size - IDX_TO_OFF(pindex)));
			}
			*after += numafter;
		}
	} else {
		if (before) {
			*before /= blocksperpage;
		}

		if (after) {
			*after /= blocksperpage;
		}
	}
	return TRUE;
}

/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
	struct vnode *vp;
	vm_ooffset_t nsize;
{
	vm_pindex_t nobjsize;
	vm_object_t object = vp->v_object;

	if (object == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	if (nsize == object->un_pager.vnp.vnp_size)
		return;

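	/*
	 * Round the new size up to a page boundary and convert it to a
	 * page index for the new object size.
	 */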
	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);

	/*
	 * File has shrunk. Toss any cached pages beyond the new EOF.
	 */
	if (nsize < object->un_pager.vnp.vnp_size) {
		int hadvmlock;

		hadvmlock = mtx_owned(&vm_mtx);
		if (!hadvmlock)
			mtx_lock(&vm_mtx);
		vm_freeze_copyopts(object, OFF_TO_IDX(nsize), object->size);
		if (nobjsize < object->size) {
			vm_object_page_remove(object, nobjsize, object->size,
				FALSE);
		}
		/*
		 * this gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode...
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup(object, OFF_TO_IDX(nsize));
			if (m) {
				int base = (int)nsize & PAGE_MASK;
				int size = PAGE_SIZE - base;

				/*
				 * Clear out partial-page garbage in case
				 * the page has been mapped.
				 */
				kva = vm_pager_map_page(m);
				bzero((caddr_t)kva + base, size);
				vm_pager_unmap_page(kva);

				/*
				 * Clear out partial-page dirty bits.  This
				 * has the side effect of setting the valid
				 * bits, but that is ok.  There are a bunch
				 * of places in the VM system where we expected
				 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
				 * case is one of them.  If the page is still
				 * partially dirty, make it fully dirty.
				 */
				vm_page_set_validclean(m, base, size);
				if (m->dirty != 0)
					m->dirty = VM_PAGE_BITS_ALL;
			}
		}
		if (!hadvmlock)
			mtx_unlock(&vm_mtx);
	}
	object->un_pager.vnp.vnp_size = nsize;
	object->size = nobjsize;
}

/*
 * calculate the disk address (in DEV_BSIZE units) that backs the
 * specified byte offset in the file
 */
static vm_offset_t
vnode_pager_addr(vp, address, run)
	struct vnode *vp;
	vm_ooffset_t address;
	int *run;
{
	int rtaddress;
	int bsize;
	daddr_t block;
	struct vnode *rtvp;
	int err;
	daddr_t vblock;
	int voffset;

	mtx_assert(&Giant, MA_OWNED);
	if ((int) address < 0)
		return -1;

	if (vp->v_mount == NULL)
		return -1;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	vblock = address / bsize;
	voffset = address % bsize;
	mtx_unlock(&vm_mtx);

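	/*
	 * Ask the filesystem to translate the logical block into a device
	 * block and report how many contiguous blocks follow it.
	 */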
	err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL);

	mtx_lock(&vm_mtx);
	if (err || (block == -1))
		rtaddress = -1;
	else {
		rtaddress = block + voffset / DEV_BSIZE;
		if (run) {
			*run += 1;
			*run *= bsize / PAGE_SIZE;
			*run -= voffset / PAGE_SIZE;
		}
	}

	return rtaddress;
}

/*
 * interrupt routine for I/O completion
 */
static void
vnode_pager_iodone(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	wakeup(bp);
}

/*
 * small block file system vnode pager input
 */
static int
vnode_pager_input_smlfs(object, m)
	vm_object_t object;
	vm_page_t m;
{
	int i;
	int s;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t kva;
	int fileaddr;
	vm_offset_t bsize;
	int error = 0;

	mtx_assert(&Giant, MA_OWNED);
	vp = object->handle;
	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	mtx_unlock(&vm_mtx);

	VOP_BMAP(vp, 0, &dp, 0, NULL, NULL);

	mtx_lock(&vm_mtx);
	kva = vm_pager_map_page(m);

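	/*
	 * Read the page one filesystem block at a time, skipping blocks
	 * that are already valid and zero-filling holes in the file.
	 */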
	for (i = 0; i < PAGE_SIZE / bsize; i++) {

		if (vm_page_bits(i * bsize, bsize) & m->valid)
			continue;

		fileaddr = vnode_pager_addr(vp,
			IDX_TO_OFF(m->pindex) + i * bsize, (int *)0);
		if (fileaddr != -1) {
			mtx_unlock(&vm_mtx);
			bp = getpbuf(&vnode_pbuf_freecnt);

			/* build a minimal buffer header */
			bp->b_iocmd = BIO_READ;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_rcred = bp->b_wcred = curproc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_data = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr;
			pbgetvp(dp, bp);
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;
			bp->b_runningbufspace = bp->b_bufsize;
			runningbufspace += bp->b_runningbufspace;

			/* do the input */
			BUF_STRATEGY(bp);

			/* we definitely need to be at splvm here */

			s = splvm();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep(bp, PVM, "vnsrd", 0);
			}
			splx(s);
			if ((bp->b_ioflags & BIO_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp, &vnode_pbuf_freecnt);
			mtx_lock(&vm_mtx);
			if (error)
				break;

			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
		} else {
			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
			bzero((caddr_t) kva + i * bsize, bsize);
		}
	}
	vm_pager_unmap_page(kva);
	pmap_clear_modify(m);
	vm_page_flag_clear(m, PG_ZERO);
	if (error) {
		return VM_PAGER_ERROR;
	}
	return VM_PAGER_OK;

}


/*
 * old style vnode pager input routine
 */
static int
vnode_pager_input_old(object, m)
	vm_object_t object;
	vm_page_t m;
{
	struct uio auio;
	struct iovec aiov;
	int error;
	int size;
	vm_offset_t kva;
	struct vnode *vp;

	mtx_assert(&Giant, MA_OWNED);
	error = 0;

	/*
	 * Return failure if beyond current EOF
	 */
	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);

		/*
		 * Allocate a kernel virtual address and initialize so that
		 * we can use VOP_READ/WRITE routines.
		 */
		kva = vm_pager_map_page(m);

		vp = object->handle;
		mtx_unlock(&vm_mtx);
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = IDX_TO_OFF(m->pindex);
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_READ;
		auio.uio_resid = size;
		auio.uio_procp = curproc;

		error = VOP_READ(vp, &auio, 0, curproc->p_ucred);
		if (!error) {
			register int count = size - auio.uio_resid;

			if (count == 0)
				error = EINVAL;
			else if (count != PAGE_SIZE)
				bzero((caddr_t) kva + count, PAGE_SIZE - count);
		}
		mtx_lock(&vm_mtx);
		vm_pager_unmap_page(kva);
	}
	pmap_clear_modify(m);
	vm_page_undirty(m);
	vm_page_flag_clear(m, PG_ZERO);
	if (!error)
		m->valid = VM_PAGE_BITS_ALL;
	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
}

/*
 * generic vnode pager input routine
 */

/*
 * Local media VFS's that do not implement their own VOP_GETPAGES
 * should have their VOP_GETPAGES call vnode_pager_generic_getpages()
 * to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_GETPAGES.
 */
static int
vnode_pager_getpages(object, m, count, reqpage)
	vm_object_t object;
	vm_page_t *m;
	int count;
	int reqpage;
{
	int rtval;
	struct vnode *vp;
	int bytes = count * PAGE_SIZE;

	mtx_assert(&Giant, MA_OWNED);
	vp = object->handle;
	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
	KASSERT(rtval != EOPNOTSUPP,
	    ("vnode_pager: FS getpages not implemented\n"));
	return rtval;
}


/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
	struct vnode *vp;
	vm_page_t *m;
	int bytecount;
	int reqpage;
{
	vm_object_t object;
	vm_offset_t kva;
	off_t foff, tfoff, nextoff;
	int i, size, bsize, first, firstaddr;
	struct vnode *dp;
	int runpg;
	int runend;
	struct buf *bp;
	int s;
	int count;
	int error = 0;

	mtx_assert(&Giant, MA_OWNED);
	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	/* get the UNDERLYING device for the file with VOP_BMAP() */

	/*
	 * originally, we did not check for an error return value -- assuming
	 * an fs always has a bmap entry point -- that assumption is wrong!!!
	 */
	foff = IDX_TO_OFF(m[reqpage]->pindex);

	/*
	 * if we can't bmap, use old VOP code
	 */
	mtx_unlock(&vm_mtx);
	if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) {
		mtx_lock(&vm_mtx);
		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vm_page_free(m[i]);
			}
		}
		cnt.v_vnodein++;
		cnt.v_vnodepgsin++;
		return vnode_pager_input_old(object, m[reqpage]);

		/*
		 * if the blocksize is smaller than a page size, then use
		 * special small filesystem code.  NFS sometimes has a small
		 * blocksize, but it can handle large reads itself.
		 */
	} else if ((PAGE_SIZE / bsize) > 1 &&
	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
		mtx_lock(&vm_mtx);
		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vm_page_free(m[i]);
			}
		}
		cnt.v_vnodein++;
		cnt.v_vnodepgsin++;
		return vnode_pager_input_smlfs(object, m[reqpage]);
	}
	mtx_lock(&vm_mtx);

	/*
	 * If we have a completely valid page available to us, we can
	 * clean up and return.  Otherwise we have to re-read the
	 * media.
	 */

	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
		for (i = 0; i < count; i++) {
			if (i != reqpage)
				vm_page_free(m[i]);
		}
		return VM_PAGER_OK;
	}
	m[reqpage]->valid = 0;

	/*
	 * here on direct device I/O
	 */

	firstaddr = -1;
	/*
	 * calculate the run that includes the required page
	 */
	for (first = 0, i = 0; i < count; i = runend) {
		firstaddr = vnode_pager_addr(vp,
			IDX_TO_OFF(m[i]->pindex), &runpg);
		if (firstaddr == -1) {
			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
				/* XXX no %qd in kernel. */
				panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: 0x%lx%08lx, vnp_size: 0x%lx%08lx",
				 firstaddr, (u_long)(foff >> 32),
				 (u_long)(u_int32_t)foff,
				 (u_long)(u_int32_t)
				 (object->un_pager.vnp.vnp_size >> 32),
				 (u_long)(u_int32_t)
				 object->un_pager.vnp.vnp_size);
			}
			vm_page_free(m[i]);
			runend = i + 1;
			first = runend;
			continue;
		}
		runend = i + runpg;
		if (runend <= reqpage) {
			int j;
			for (j = i; j < runend; j++) {
				vm_page_free(m[j]);
			}
		} else {
			if (runpg < (count - first)) {
				for (i = first + runpg; i < count; i++)
					vm_page_free(m[i]);
				count = first + runpg;
			}
			break;
		}
		first = runend;
	}

	/*
	 * the first and last page have been calculated now, move input pages
	 * to be zero based...
	 */
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
		}
		count -= first;
		reqpage -= first;
	}

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = IDX_TO_OFF(m[0]->pindex);

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > object->un_pager.vnp.vnp_size)
		size = object->un_pager.vnp.vnp_size - foff;

	/*
	 * round up physical size for real devices.
	 */
	if (dp->v_type == VBLK || dp->v_type == VCHR) {
		int secmask = dp->v_rdev->si_bsize_phys - 1;
		KASSERT(secmask < PAGE_SIZE, ("vnode_pager_generic_getpages: sector size %d too large\n", secmask + 1));
		size = (size + secmask) & ~secmask;
	}

	bp = getpbuf(&vnode_pbuf_freecnt);
	kva = (vm_offset_t) bp->b_data;

	/*
	 * and map the pages to be read into the kva
	 */
	pmap_qenter(kva, m, count);
	mtx_unlock(&vm_mtx);

	/* build a minimal buffer header */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_rcred = bp->b_wcred = curproc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = firstaddr;
	pbgetvp(dp, bp);
	bp->b_bcount = size;
	bp->b_bufsize = size;
	bp->b_runningbufspace = bp->b_bufsize;
	runningbufspace += bp->b_runningbufspace;

	cnt.v_vnodein++;
	cnt.v_vnodepgsin += count;

	/* do the input */
	BUF_STRATEGY(bp);

	s = splvm();
	/* we definitely need to be at splvm here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep(bp, PVM, "vnread", 0);
	}
	splx(s);
	if ((bp->b_ioflags & BIO_ERROR) != 0)
		error = EIO;

	if (!error) {
		if (size != count * PAGE_SIZE)
			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
	}
	mtx_lock(&vm_mtx);
	pmap_qremove(kva, count);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp, &vnode_pbuf_freecnt);

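	/*
	 * Walk the pages just read and fix up their valid/dirty state.
	 * Pages other than the requested one are either placed on a page
	 * queue or, on error, freed.
	 */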
	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
		vm_page_t mt;

		nextoff = tfoff + PAGE_SIZE;
		mt = m[i];

		if (nextoff <= object->un_pager.vnp.vnp_size) {
			/*
			 * Read filled up entire page.
			 */
			mt->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(mt);	/* should be an assert? XXX */
			pmap_clear_modify(mt);
		} else {
			/*
			 * Read did not fill up entire page.  Since this
			 * is getpages, the page may be mapped, so we have
			 * to zero the invalid portions of the page even
			 * though we aren't setting them valid.
			 *
			 * Currently we do not set the entire page valid,
			 * we just try to clear the piece that we couldn't
			 * read.
			 */
			vm_page_set_validclean(mt, 0,
			    object->un_pager.vnp.vnp_size - tfoff);
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(mt, FALSE); */
		}

		vm_page_flag_clear(mt, PG_ZERO);
		if (i != reqpage) {

			/*
			 * whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere. (it already is in the object). Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * just in case someone was asking for this page we
			 * now tell them that it is ok to use
			 */
			if (!error) {
				if (mt->flags & PG_WANTED)
					vm_page_activate(mt);
				else
					vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_free(mt);
			}
		}
	}
	if (error) {
		printf("vnode_pager_getpages: I/O read error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(object, m, count, sync, rtvals)
	vm_object_t object;
	vm_page_t *m;
	int count;
	boolean_t sync;
	int *rtvals;
{
	int rtval;
	struct vnode *vp;
	struct mount *mp;
	int bytes = count * PAGE_SIZE;

	mtx_assert(&Giant, MA_OWNED);
	/*
	 * Force synchronous operation if we are extremely low on memory
	 * to prevent a low-memory deadlock.  VOP operations often need to
	 * allocate more memory to initiate the I/O (i.e. do a BMAP
	 * operation).  The swapper handles the case by limiting the amount
	 * of asynchronous I/O, but that sort of solution doesn't scale well
	 * for the vnode pager without a lot of work.
	 *
	 * Also, the backing vnode's iodone routine may not wake the pageout
	 * daemon up.  This should probably be addressed. XXX
	 */

	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
		sync |= OBJPC_SYNC;

	/*
	 * Call device-specific putpages function
	 */

	vp = object->handle;
	mtx_unlock(&vm_mtx);
	if (vp->v_type != VREG)
		mp = NULL;
	(void)vn_start_write(vp, &mp, V_WAIT);
	mtx_lock(&vm_mtx);
	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
	KASSERT(rtval != EOPNOTSUPP,
	    ("vnode_pager: stale FS putpages\n"));
	mtx_unlock(&vm_mtx);
	vn_finished_write(mp);
	mtx_lock(&vm_mtx);
}


/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has already typically occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
	struct vnode *vp;
	vm_page_t *m;
	int bytecount;
	int flags;
	int *rtvals;
{
	int i;
	vm_object_t object;
	int count;

	int maxsize, ncount;
	vm_ooffset_t poffset;
	struct uio auio;
	struct iovec aiov;
	int error;
	int ioflags;

	mtx_assert(&Giant, MA_OWNED);
	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	if ((int) m[0]->pindex < 0) {
		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%x)\n",
			(long)m[0]->pindex, m[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return VM_PAGER_BAD;
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(m[0]->pindex);
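	/*
	 * Clamp the write so it does not extend past the end of the file;
	 * pages that lie entirely beyond EOF are reported as VM_PAGER_BAD.
	 */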
	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > poffset)
			maxsize = object->un_pager.vnp.vnp_size - poffset;
		else
			maxsize = 0;
		ncount = btoc(maxsize);
		if (ncount < count) {
			for (i = ncount; i < count; i++) {
				rtvals[i] = VM_PAGER_BAD;
			}
		}
	}
	mtx_unlock(&vm_mtx);

	/*
	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
	 * rather than a bdwrite() to prevent paging I/O from saturating
	 * the buffer cache.
	 */
	ioflags = IO_VMIO;
	ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC : IO_ASYNC;
	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL : 0;

	aiov.iov_base = (caddr_t) 0;
	aiov.iov_len = maxsize;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = poffset;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_WRITE;
	auio.uio_resid = maxsize;
	auio.uio_procp = (struct proc *) 0;
	error = VOP_WRITE(vp, &auio, ioflags, curproc->p_ucred);
	mtx_lock(&vm_mtx);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += ncount;

	if (error) {
		printf("vnode_pager_putpages: I/O error %d\n", error);
	}
	if (auio.uio_resid) {
		printf("vnode_pager_putpages: residual I/O %d at %lu\n",
		    auio.uio_resid, (u_long)m[0]->pindex);
	}
	for (i = 0; i < ncount; i++) {
		rtvals[i] = VM_PAGER_OK;
	}
	return rtvals[0];
}

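/*
 * Walk the backing-object chain looking for the underlying vnode
 * object; return its vnode locked via vget(), or NULL if no live
 * vnode object is found.
 */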
struct vnode *
vnode_pager_lock(object)
	vm_object_t object;
{
	struct proc *p = curproc;	/* XXX */

	mtx_assert(&vm_mtx, MA_NOTOWNED);
	mtx_assert(&Giant, MA_OWNED);
	mtx_lock(&vm_mtx);
	for (; object != NULL; object = object->backing_object) {
		if (object->type != OBJT_VNODE)
			continue;
		if (object->flags & OBJ_DEAD) {
			mtx_unlock(&vm_mtx);
			return NULL;
		}

		mtx_unlock(&vm_mtx);
		/* XXX; If object->handle can change, we need to cache it. */
		while (vget(object->handle,
			LK_NOPAUSE | LK_SHARED | LK_RETRY | LK_CANRECURSE, p)) {
			if ((object->flags & OBJ_DEAD) || (object->type != OBJT_VNODE))
				return NULL;
			printf("vnode_pager_lock: retrying\n");
		}
		return object->handle;
	}
	mtx_unlock(&vm_mtx);
	return NULL;
}