vnode_pager.c revision 7162
/*
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993,1994 John S. Dyson
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
 *	$Id: vnode_pager.c,v 1.30 1995/03/16 18:17:34 bde Exp $
 */

/*
 * Page to/from files (vnodes).
 *
 * TODO:
 *	pageouts
 *	fix credential use (uses current process credentials now)
 */

/*
 * MODIFICATIONS:
 * John S. Dyson  08 Dec 93
 *
 * This file, in conjunction with some vm_fault mods, eliminates the
 * performance advantage of using the buffer cache and minimizes memory
 * copies.
 *
 * 1) Supports multiple-block reads
 * 2) Bypasses buffer cache for reads
 *
 * TODO:
 *
 * 1) Totally bypass buffer cache for reads
 *    (Currently will still sometimes use buffer cache for reads)
 * 2) Bypass buffer cache for writes
 *    (Code does not support it, but mods are simple)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/mount.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#include <sys/buf.h>
#include <miscfs/specfs/specdev.h>

int vnode_pager_putmulti();

void vnode_pager_init();
void vnode_pager_dealloc();
int vnode_pager_getpage();
int vnode_pager_getmulti();
int vnode_pager_putpage();
boolean_t vnode_pager_haspage();

struct pagerops vnodepagerops = {
	vnode_pager_init,
	vnode_pager_alloc,
	vnode_pager_dealloc,
	vnode_pager_getpage,
	vnode_pager_getmulti,
	vnode_pager_putpage,
	vnode_pager_putmulti,
	vnode_pager_haspage
};



static int vnode_pager_input(vn_pager_t vnp, vm_page_t * m, int count, int reqpage);
static int vnode_pager_output(vn_pager_t vnp, vm_page_t * m, int count, int *rtvals);

extern vm_map_t pager_map;

struct pagerlst vnode_pager_list;	/* list of managed vnodes */

#define MAXBP (PAGE_SIZE/DEV_BSIZE)

void
vnode_pager_init()
{
	TAILQ_INIT(&vnode_pager_list);
}

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 */
vm_pager_t
vnode_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register vn_pager_t vnp;
	vm_object_t object, tobject;
	struct vattr vattr;
	struct vnode *vp;
	struct proc *p = curproc;	/* XXX */
	int rtval;

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	/*
	 * Vnodes keep a pointer to any associated pager so no need to lookup
	 * with vm_pager_lookup.
	 */
	vp = (struct vnode *) handle;
	while ((object = (vm_object_t) vp->v_vmdata) && (object->flags & OBJ_DEAD))
		tsleep((caddr_t) object, PVM, "vadead", 0);

	pager = NULL;
	if (object != NULL)
		pager = object->pager;
	if (pager == NULL) {

		/*
		 * Allocate pager structures
		 */
		pager = (vm_pager_t) malloc(sizeof *pager, M_VMPAGER, M_WAITOK);
		if (pager == NULL)
			return (NULL);
		vnp = (vn_pager_t) malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK);
		if (vnp == NULL) {
			free((caddr_t) pager, M_VMPAGER);
			return (NULL);
		}
		/*
		 * And an object of the appropriate size
		 */
		if ((rtval = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) == 0) {
			object = vm_object_allocate(round_page(vattr.va_size));
			object->flags = OBJ_CANPERSIST;
			vm_object_enter(object, pager);
			object->pager = pager;
		} else {
			printf("Error in getattr: %d\n", rtval);
			free((caddr_t) vnp, M_VMPGDATA);
			free((caddr_t) pager, M_VMPAGER);
			return (NULL);
		}

		/*
		 * Hold a reference to the vnode and initialize pager data.
		 */
		VREF(vp);
		vnp->vnp_flags = 0;
		vnp->vnp_vp = vp;
		vnp->vnp_size = vattr.va_size;

		TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list);
		pager->pg_handle = handle;
		pager->pg_type = PG_VNODE;
		pager->pg_ops = &vnodepagerops;
		pager->pg_data = (caddr_t) vnp;
		vp->v_vmdata = (caddr_t) object;
	} else {

		/*
		 * vm_object_lookup() will remove the object from the cache if
		 * found and also gain a reference to the object.
		 */
		(void) vm_object_lookup(pager);
	}
	return (pager);
}

void
vnode_pager_dealloc(pager)
	vm_pager_t pager;
{
	register vn_pager_t vnp = (vn_pager_t) pager->pg_data;
	register struct vnode *vp;
	vm_object_t object;

	vp = vnp->vnp_vp;
	if (vp) {
		int s = splbio();

		object = (vm_object_t) vp->v_vmdata;
		if (object) {
			while (object->paging_in_progress) {
				object->flags |= OBJ_PIPWNT;
				tsleep(object, PVM, "vnpdea", 0);
			}
		}
		splx(s);

		vp->v_vmdata = NULL;
		vp->v_flag &= ~(VTEXT | VVMIO);
		vp->v_flag |= VAGE;
		vrele(vp);
	}
	TAILQ_REMOVE(&vnode_pager_list, pager, pg_list);
	free((caddr_t) vnp, M_VMPGDATA);
	free((caddr_t) pager, M_VMPAGER);
}

int
vnode_pager_getmulti(pager, m, count, reqpage, sync)
	vm_pager_t pager;
	vm_page_t *m;
	int count;
	int reqpage;
	boolean_t sync;
{

	return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage);
}

int
vnode_pager_getpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{

	vm_page_t marray[1];

	if (pager == NULL)
		return FALSE;
	marray[0] = m;

	return vnode_pager_input((vn_pager_t) pager->pg_data, marray, 1, 0);
}

boolean_t
vnode_pager_putpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	vm_page_t marray[1];
	int rtvals[1];

	if (pager == NULL)
		return FALSE;
	marray[0] = m;
	vnode_pager_output((vn_pager_t) pager->pg_data, marray, 1, rtvals);
	return rtvals[0];
}

int
vnode_pager_putmulti(pager, m, c, sync, rtvals)
	vm_pager_t pager;
	vm_page_t *m;
	int c;
	boolean_t sync;
	int *rtvals;
{
	return vnode_pager_output((vn_pager_t) pager->pg_data, m, c, rtvals);
}


boolean_t
vnode_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	register vn_pager_t vnp = (vn_pager_t) pager->pg_data;
	register struct vnode *vp = vnp->vnp_vp;
	daddr_t bn;
	int err;
	daddr_t block;

	/*
	 * If filesystem no longer mounted or offset beyond end of file we do
	 * not have the page.
	 */
	if ((vp->v_mount == NULL) || (offset >= vnp->vnp_size))
		return FALSE;

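	/*
	 * e.g. (illustrative values only): with f_iosize 8192, byte
	 * offset 20000 lies in logical file block 20000 / 8192 == 2.
	 */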
	block = offset / vp->v_mount->mnt_stat.f_iosize;
	if (incore(vp, block))
		return TRUE;
	/*
	 * Read the index to find the disk block to read from.  If there is no
	 * block, report that we don't have this data.
	 *
	 * Assumes that the vnode has whole page or nothing.
	 */
	err = VOP_BMAP(vp, block, (struct vnode **) 0, &bn, 0);
	if (err)
		return (TRUE);
	return ((long) bn < 0 ? FALSE : TRUE);
}

/*
 * Lets the VM system know about a change in size for a file.
 * If this vnode is mapped into some address space (i.e. we have a pager
 * for it) we adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
	struct vnode *vp;
	u_long nsize;
{
	register vn_pager_t vnp;
	register vm_object_t object;
	vm_pager_t pager;

	/*
	 * Not a mapped vnode
	 */
	if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	object = (vm_object_t) vp->v_vmdata;
	if (object == NULL)
		return;
	if ((pager = object->pager) == NULL)
		return;
	vnp = (vn_pager_t) pager->pg_data;
	if (nsize == vnp->vnp_size)
		return;

	/*
	 * No object. This can happen during object termination since
	 * vm_object_page_clean is called after the object has been removed
	 * from the hash table, and clean may cause vnode write operations
	 * which can wind up back here.
	 */
	object = vm_object_lookup(pager);
	if (object == NULL)
		return;

	/*
	 * File has shrunk. Toss any cached pages beyond the new EOF.
	 */
	if (nsize < vnp->vnp_size) {
		if (round_page((vm_offset_t) nsize) < vnp->vnp_size) {
			vm_object_lock(object);
			vm_object_page_remove(object,
			    round_page((vm_offset_t) nsize), vnp->vnp_size);
			vm_object_unlock(object);
		}
		/*
		 * this gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode...
		 */
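		/*
		 * For example (illustrative values only): with PAGE_SIZE
		 * 4096 and nsize 10000, the page at trunc_page(10000) ==
		 * 8192 is backed only through offset 10000 & PAGE_MASK ==
		 * 1808 within the page, so the remaining
		 * round_page(10000) - 10000 == 2288 bytes are zeroed.
		 */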
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup(object, trunc_page((vm_offset_t) nsize));
			if (m) {
				kva = vm_pager_map_page(m);
				bzero((caddr_t) kva + (nsize & PAGE_MASK),
				    round_page(nsize) - nsize);
				vm_pager_unmap_page(kva);
			}
		}
	}
	vnp->vnp_size = (vm_offset_t) nsize;
	object->size = round_page(nsize);

	vm_object_deallocate(object);
}

void
vnode_pager_umount(mp)
	register struct mount *mp;
{
	register vm_pager_t pager, npager;
	struct vnode *vp;

	for (pager = vnode_pager_list.tqh_first; pager != NULL; pager = npager) {
		/*
		 * Save the next pointer now since uncaching may terminate the
		 * object and render pager invalid
		 */
		npager = pager->pg_list.tqe_next;
		vp = ((vn_pager_t) pager->pg_data)->vnp_vp;
		if (mp == (struct mount *) 0 || vp->v_mount == mp) {
			VOP_LOCK(vp);
			(void) vnode_pager_uncache(vp);
			VOP_UNLOCK(vp);
		}
	}
}

/*
 * Remove vnode associated object from the object cache.
 * This routine must be called with the vnode locked.
 *
 * XXX unlock the vnode.
 * We must do this since uncaching the object may result in its
 * destruction which may initiate paging activity which may necessitate
 * re-locking the vnode.
 */
boolean_t
vnode_pager_uncache(vp)
	register struct vnode *vp;
{
	register vm_object_t object;
	boolean_t uncached;
	vm_pager_t pager;

	/*
	 * Not a mapped vnode
	 */
	object = (vm_object_t) vp->v_vmdata;
	if (object == NULL)
		return (TRUE);

	pager = object->pager;
	if (pager == NULL)
		return (TRUE);

#ifdef DEBUG
	if (!VOP_ISLOCKED(vp)) {
		extern int (**nfsv2_vnodeop_p)();

		if (vp->v_op != nfsv2_vnodeop_p)
			panic("vnode_pager_uncache: vnode not locked!");
	}
#endif
	/*
	 * Must use vm_object_lookup() as it actually removes the object from
	 * the cache list.
	 */
	object = vm_object_lookup(pager);
	if (object) {
		uncached = (object->ref_count <= 1);
		VOP_UNLOCK(vp);
		pager_cache(object, FALSE);
		VOP_LOCK(vp);
	} else
		uncached = TRUE;
	return (uncached);
}


void
vnode_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * calculate the linear disk address (in DEV_BSIZE units) of the
 * specified virtual file address
 */
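/*
 * Worked example (illustrative values only): with bsize == 8192,
 * PAGE_SIZE == 4096, DEV_BSIZE == 512 and address == 12288, we get
 * vblock == 1 and voffset == 4096.  If VOP_BMAP() maps logical block 1
 * to disk block 64 with one additional contiguous block, then
 * rtaddress == 64 + 4096/512 == 72, and *run becomes
 * (1 + 1) * (8192/4096) - 4096/4096 == 3 pages starting at that address.
 */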
vm_offset_t
vnode_pager_addr(vp, address, run)
	struct vnode *vp;
	vm_offset_t address;
	int *run;
{
	int rtaddress;
	int bsize;
	vm_offset_t block;
	struct vnode *rtvp;
	int err;
	int vblock, voffset;

	if ((int) address < 0)
		return -1;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	vblock = address / bsize;
	voffset = address % bsize;

	err = VOP_BMAP(vp, vblock, &rtvp, &block, run);

	if (err || (block == -1))
		rtaddress = -1;
	else {
		rtaddress = block + voffset / DEV_BSIZE;
		if (run) {
			*run += 1;
			*run *= bsize / PAGE_SIZE;
			*run -= voffset / PAGE_SIZE;
		}
	}

	return rtaddress;
}

/*
 * interrupt routine for I/O completion
 */
void
vnode_pager_iodone(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	wakeup((caddr_t) bp);
	if (bp->b_flags & B_ASYNC) {
		vm_offset_t paddr;
		vm_page_t m;
		vm_object_t obj = 0;
		int i;
		int npages;

		paddr = (vm_offset_t) bp->b_data;
		if (bp->b_bufsize != bp->b_bcount)
			bzero(bp->b_data + bp->b_bcount,
			    bp->b_bufsize - bp->b_bcount);

		npages = (bp->b_bufsize + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			m = PHYS_TO_VM_PAGE(pmap_kextract(paddr + i * PAGE_SIZE));
			if (m) {
				obj = m->object;
				m->dirty = 0;
				m->valid = VM_PAGE_BITS_ALL;
				if (m->flags & PG_WANTED)
					m->flags |= PG_REFERENCED;
				PAGE_WAKEUP(m);
			} else {
				panic("vnode_pager_iodone: page is gone!!!");
			}
		}
		pmap_qremove(paddr, npages);
		if (obj) {
			vm_object_pip_wakeup(obj);
		} else {
			panic("vnode_pager_iodone: object is gone???");
		}
		relpbuf(bp);
	}
}

/*
 * small block file system vnode pager input
 */
int
vnode_pager_input_smlfs(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int i;
	int s;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t kva;
	int fileaddr;
	int block;
	vm_offset_t bsize;
	int error = 0;

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

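	/* this VOP_BMAP() call is used only to find the underlying device vnode */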
	VOP_BMAP(vp, 0, &dp, 0, 0);

	kva = vm_pager_map_page(m);

	for (i = 0; i < PAGE_SIZE / bsize; i++) {

		if ((vm_page_bits(m->offset + i * bsize, bsize) & m->valid))
			continue;

		fileaddr = vnode_pager_addr(vp, m->offset + i * bsize, (int *)0);
		if (fileaddr != -1) {
			bp = getpbuf();

			/* build a minimal buffer header */
			bp->b_flags = B_BUSY | B_READ | B_CALL;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_proc = curproc;
			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_un.b_addr = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr;
			pbgetvp(dp, bp);
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;

			/* do the input */
			VOP_STRATEGY(bp);

			/* we definitely need to be at splbio here */

			s = splbio();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep((caddr_t) bp, PVM, "vnsrd", 0);
			}
			splx(s);
			if ((bp->b_flags & B_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp);
			if (error)
				break;

			vm_page_set_clean(m, i * bsize, bsize);
			vm_page_set_valid(m, i * bsize, bsize);
		} else {
			vm_page_set_clean(m, i * bsize, bsize);
			bzero((caddr_t) kva + i * bsize, bsize);
		}
	}
	vm_pager_unmap_page(kva);
	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	if (error) {
		return VM_PAGER_ERROR;
	}
	return VM_PAGER_OK;

}


/*
 * old style vnode pager input routine
 */
int
vnode_pager_input_old(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	struct uio auio;
	struct iovec aiov;
	int error;
	int size;
	vm_offset_t kva;

	error = 0;

	/*
	 * Return failure if beyond current EOF
	 */
	if (m->offset >= vnp->vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (m->offset + size > vnp->vnp_size)
			size = vnp->vnp_size - m->offset;
		/*
		 * Allocate a kernel virtual address and initialize so that
		 * we can use VOP_READ/WRITE routines.
		 */
		kva = vm_pager_map_page(m);
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = m->offset;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_READ;
		auio.uio_resid = size;
		auio.uio_procp = (struct proc *) 0;

		error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred);
		if (!error) {
			register int count = size - auio.uio_resid;

			if (count == 0)
				error = EINVAL;
			else if (count != PAGE_SIZE)
				bzero((caddr_t) kva + count, PAGE_SIZE - count);
		}
		vm_pager_unmap_page(kva);
	}
	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	m->dirty = 0;
	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
}

/*
 * generic vnode pager input routine
 */
int
vnode_pager_input(vnp, m, count, reqpage)
	register vn_pager_t vnp;
	vm_page_t *m;
	int count, reqpage;
{
	int i;
	vm_offset_t kva, foff;
	int size, sizea;
	vm_object_t object;
	struct vnode *dp, *vp;
	int bsize;

	int first, last;
	int firstaddr;
	int block, offset;
	int runpg;
	int runend;

	struct buf *bp, *bpa;
	int counta;
	int s;
	int failflag;

	int error = 0;

	object = m[reqpage]->object;	/* all vm_page_t items are in same
					 * object */

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	/* get the UNDERLYING device for the file with VOP_BMAP() */

	/*
	 * originally, we did not check for an error return value -- assuming
	 * an fs always has a bmap entry point -- that assumption is wrong!!!
	 */
	foff = m[reqpage]->offset;

	/*
	 * if we can't bmap, use old VOP code
	 */
	if (VOP_BMAP(vp, 0, &dp, 0, 0)) {
		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vnode_pager_freepage(m[i]);
			}
		}
		cnt.v_vnodein++;
		cnt.v_vnodepgsin++;
		return vnode_pager_input_old(vnp, m[reqpage]);

		/*
		 * if the blocksize is smaller than a page size, then use
		 * special small filesystem code.  NFS sometimes has a small
		 * blocksize, but it can handle large reads itself.
		 */
	} else if ((PAGE_SIZE / bsize) > 1 &&
	    (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) {

		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vnode_pager_freepage(m[i]);
			}
		}
		cnt.v_vnodein++;
		cnt.v_vnodepgsin++;
		return vnode_pager_input_smlfs(vnp, m[reqpage]);
	}
	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block
	 * then, the entire page is valid --
	 */
	if (m[reqpage]->valid) {
		m[reqpage]->valid = VM_PAGE_BITS_ALL;
		for (i = 0; i < count; i++) {
			if (i != reqpage)
				vnode_pager_freepage(m[i]);
		}
		return VM_PAGER_OK;
	}
	/*
	 * here on direct device I/O
	 */


	firstaddr = -1;
	/*
	 * calculate the run that includes the required page
	 */
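	/*
	 * For example (illustrative): with count == 5 and reqpage == 2, a
	 * hole at m[0] and a 4-page run starting at m[1] leave first == 1,
	 * so m[1]..m[4] are kept and renumbered below to m[0]..m[3], with
	 * count becoming 4 and reqpage becoming 1.
	 */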
	for (first = 0, i = 0; i < count; i = runend) {
		firstaddr = vnode_pager_addr(vp, m[i]->offset, &runpg);
		if (firstaddr == -1) {
			if (i == reqpage && foff < vnp->vnp_size) {
				printf("vnode_pager_input: unexpected missing page: firstaddr: %d, foff: %d, vnp_size: %d\n",
				    firstaddr, foff, vnp->vnp_size);
				panic("vnode_pager_input:...");
			}
			vnode_pager_freepage(m[i]);
			runend = i + 1;
			first = runend;
			continue;
		}
		runend = i + runpg;
		if (runend <= reqpage) {
			int j;

			for (j = i; j < runend; j++) {
				vnode_pager_freepage(m[j]);
			}
		} else {
			if (runpg < (count - first)) {
				for (i = first + runpg; i < count; i++)
					vnode_pager_freepage(m[i]);
				count = first + runpg;
			}
			break;
		}
		first = runend;
	}

	/*
	 * the first and last page have been calculated now, move input pages
	 * to be zero based...
	 */
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
		}
		count -= first;
		reqpage -= first;
	}

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = m[0]->offset;
#if 0
	printf("foff: 0x%lx, firstaddr: 0x%lx\n",
		foff, firstaddr);
	DELAY(6000000);
#endif

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > vnp->vnp_size)
		size = vnp->vnp_size - foff;

	/*
	 * round up physical size for real devices
	 */
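	/*
	 * e.g. (illustrative values only): with DEV_BSIZE 512, a size of
	 * 4100 is rounded up to 4608 (9 sectors).
	 */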
	if (dp->v_type == VBLK || dp->v_type == VCHR)
		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

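	/*
	 * Pages beyond the requested one may be split off into a second,
	 * asynchronous buffer (read-ahead) when a spare pbuf is available.
	 */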
	counta = 0;
	if (count * PAGE_SIZE > bsize)
		counta = (count - reqpage) - 1;
	bpa = 0;
	sizea = 0;
	bp = getpbuf();
	if (counta) {
		bpa = (struct buf *) trypbuf();
		if (bpa) {
			count -= counta;
			sizea = size - count * PAGE_SIZE;
			size = count * PAGE_SIZE;
		}
	}
	kva = (vm_offset_t) bp->b_data;

	/*
	 * and map the pages to be read into the kva
	 */
	pmap_qenter(kva, m, count);

	/* build a minimal buffer header */
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_proc = curproc;
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = firstaddr;
	pbgetvp(dp, bp);
	bp->b_bcount = size;
	bp->b_bufsize = size;

	cnt.v_vnodein++;
	cnt.v_vnodepgsin += count;

	/* do the input */
	VOP_STRATEGY(bp);

	if (counta) {
		for (i = 0; i < counta; i++) {
			vm_page_deactivate(m[count + i]);
		}
		pmap_qenter((vm_offset_t) bpa->b_data, &m[count], counta);
		++m[count]->object->paging_in_progress;
		bpa->b_flags = B_BUSY | B_READ | B_CALL | B_ASYNC;
		bpa->b_iodone = vnode_pager_iodone;
		/* B_PHYS is not set, but it is nice to fill this in */
		bpa->b_proc = curproc;
		bpa->b_rcred = bpa->b_wcred = bpa->b_proc->p_ucred;
		if (bpa->b_rcred != NOCRED)
			crhold(bpa->b_rcred);
		if (bpa->b_wcred != NOCRED)
			crhold(bpa->b_wcred);
		bpa->b_blkno = firstaddr + count * (PAGE_SIZE / DEV_BSIZE);
		pbgetvp(dp, bpa);
		bpa->b_bcount = sizea;
		bpa->b_bufsize = counta * PAGE_SIZE;

		cnt.v_vnodepgsin += counta;
		VOP_STRATEGY(bpa);
	}
	s = splbio();
	/* we definitely need to be at splbio here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t) bp, PVM, "vnread", 0);
	}
	splx(s);
	if ((bp->b_flags & B_ERROR) != 0)
		error = EIO;

	if (!error) {
		if (size != count * PAGE_SIZE)
			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
	}
	pmap_qremove(kva, count);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp);

	for (i = 0; i < count; i++) {
		pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
		m[i]->dirty = 0;
		m[i]->valid = VM_PAGE_BITS_ALL;
		if (i != reqpage) {

			/*
			 * whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * just in case someone was asking for this page we
			 * now tell them that it is ok to use
			 */
			if (!error) {
				vm_page_deactivate(m[i]);
				PAGE_WAKEUP(m[i]);
			} else {
				vnode_pager_freepage(m[i]);
			}
		}
	}
	if (error) {
		printf("vnode_pager_input: I/O read error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}

/*
 * old-style vnode pager output routine
 */
int
vnode_pager_output_old(vnp, m)
	register vn_pager_t vnp;
	vm_page_t m;
{
	vm_offset_t kva, kva2;
	vm_offset_t size;
	struct iovec aiov;
	struct uio auio;
	struct vnode *vp;
	int error;

	vp = vnp->vnp_vp;

	/*
	 * Don't return failure if beyond current EOF; placate the VM system.
	 */
	if (m->offset >= vnp->vnp_size) {
		return VM_PAGER_OK;
	} else {
		size = PAGE_SIZE;
		if (m->offset + size > vnp->vnp_size)
			size = vnp->vnp_size - m->offset;

		kva2 = kmem_alloc(pager_map, PAGE_SIZE);
		/*
		 * Allocate a kernel virtual address and initialize so that
		 * we can use VOP_WRITE routines.
		 */
		kva = vm_pager_map_page(m);
		bcopy((caddr_t) kva, (caddr_t) kva2, size);
		vm_pager_unmap_page(kva);
		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
		PAGE_WAKEUP(m);

		aiov.iov_base = (caddr_t) kva2;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = m->offset;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_WRITE;
		auio.uio_resid = size;
		auio.uio_procp = (struct proc *) 0;

		error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred);

		kmem_free_wakeup(pager_map, kva2, PAGE_SIZE);
		if (!error) {
			if ((size - auio.uio_resid) == 0) {
				error = EINVAL;
			}
		}
		return error ? VM_PAGER_ERROR : VM_PAGER_OK;
	}
}

/*
 * vnode pager output on a small-block file system
 */
int
vnode_pager_output_smlfs(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int i;
	int s;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t kva;
	int fileaddr;
	vm_offset_t bsize;
	int error = 0;

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	VOP_BMAP(vp, 0, &dp, 0, 0);
	kva = vm_pager_map_page(m);
	for (i = 0; !error && i < (PAGE_SIZE / bsize); i++) {

		if ((vm_page_bits(m->offset + i * bsize, bsize) & m->valid & m->dirty) == 0)
			continue;
		/*
		 * calculate logical block and offset
		 */
		fileaddr = vnode_pager_addr(vp, m->offset + i * bsize, (int *)0);
		if (fileaddr != -1) {

			bp = getpbuf();

			/* build a minimal buffer header */
			bp->b_flags = B_BUSY | B_CALL | B_WRITE;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_proc = curproc;
			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_un.b_addr = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr;
			pbgetvp(dp, bp);
			++dp->v_numoutput;
			/* for NFS */
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = bsize;
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;

			/* do the output */
			VOP_STRATEGY(bp);

			/* we definitely need to be at splbio here */

			s = splbio();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep((caddr_t) bp, PVM, "vnswrt", 0);
			}
			splx(s);
			if ((bp->b_flags & B_ERROR) != 0)
				error = EIO;

			vm_page_set_clean(m, i * bsize, bsize);
			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp);
		}
	}
	vm_pager_unmap_page(kva);
	if (error)
		return VM_PAGER_ERROR;
	else
		return VM_PAGER_OK;
}

/*
 * generic vnode pager output routine
 */
int
vnode_pager_output(vnp, m, count, rtvals)
	vn_pager_t vnp;
	vm_page_t *m;
	int count;
	int *rtvals;
{
	int i, j;
	vm_offset_t kva, foff;
	int size;
	vm_object_t object;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t reqaddr;
	int bsize;
	int s;
	daddr_t block;
	struct timeval tv;
	int runpg;

	int error = 0;

retryoutput:
	object = m[0]->object;	/* all vm_page_t items are in same object */

	vp = vnp->vnp_vp;

	/*
	 * Make sure underlying filesystem is still mounted.
	 */
	if (vp->v_mount == NULL)
		return VM_PAGER_FAIL;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	if ((int) m[0]->offset < 0) {
		printf("vnode_pager_output: attempt to write meta-data!!! -- 0x%x\n", m[0]->offset);
		m[0]->dirty = 0;
		rtvals[0] = VM_PAGER_OK;
		return VM_PAGER_OK;
	}
	/*
	 * if the filesystem does not have a bmap, then use the old code
	 */
	if (VOP_BMAP(vp, (m[0]->offset / bsize), &dp, &block, 0) ||
	    (block == -1)) {

		rtvals[0] = vnode_pager_output_old(vnp, m[0]);

		m[0]->dirty = 0;
		cnt.v_vnodeout++;
		cnt.v_vnodepgsout++;
		return rtvals[0];
	}
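	/* update the file's access/modification times before pushing pages */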
	tv = time;
	VOP_UPDATE(vp, &tv, &tv, 0);

	/*
	 * if the filesystem has a small blocksize, then use the small block
	 * filesystem output code
	 */
	if ((bsize < PAGE_SIZE) &&
	    (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) {

		for (i = 0; i < count; i++) {
			rtvals[i] = vnode_pager_output_smlfs(vnp, m[i]);
			if (rtvals[i] == VM_PAGER_OK) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
			}
		}
		cnt.v_vnodeout++;
		cnt.v_vnodepgsout += count;
		return rtvals[0];
	}
	for (i = 0; i < count; i++) {
		foff = m[i]->offset;
		if (foff >= vnp->vnp_size) {
			for (j = i; j < count; j++)
				rtvals[j] = VM_PAGER_BAD;
			count = i;
			break;
		}
	}
	if (count == 0) {
		return rtvals[0];
	}
	foff = m[0]->offset;
	reqaddr = vnode_pager_addr(vp, foff, &runpg);
	if (runpg < count)
		count = runpg;

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > vnp->vnp_size)
		size = vnp->vnp_size - foff;

	/*
	 * round up physical size for real devices
	 */
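	/*
	 * e.g. (illustrative): with DEV_BSIZE 512, 4100 bytes round up to
	 * 4608, as in vnode_pager_input() above
	 */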
	if (dp->v_type == VBLK || dp->v_type == VCHR)
		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

	bp = getpbuf();
	kva = (vm_offset_t) bp->b_data;
	/*
	 * and map the pages to be written into the kva
	 */
	pmap_qenter(kva, m, count);

	/* build a minimal buffer header */
	bp->b_flags = B_BUSY | B_WRITE | B_CALL;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_proc = curproc;
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;

	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = reqaddr;
	pbgetvp(dp, bp);
	++dp->v_numoutput;

	/* for NFS */
	bp->b_dirtyoff = 0;
	bp->b_dirtyend = size;

	bp->b_bcount = size;
	bp->b_bufsize = size;

	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += count;

	/* do the output */
	VOP_STRATEGY(bp);

	s = splbio();

	/* we definitely need to be at splbio here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t) bp, PVM, "vnwrite", 0);
	}
	splx(s);

	if ((bp->b_flags & B_ERROR) != 0)
		error = EIO;

	pmap_qremove(kva, count);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp);

	if (!error) {
		for (i = 0; i < count; i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
			m[i]->dirty = 0;
			rtvals[i] = VM_PAGER_OK;
		}
	} else if (count != 1) {
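		/*
		 * A multi-page write failed; retry just the first page by
		 * itself, presumably to narrow the transfer and isolate
		 * the failure.
		 */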
		error = 0;
		count = 1;
		goto retryoutput;
	}
	if (error) {
		printf("vnode_pager_output: I/O write error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}
1318