vnode_pager.c revision 1827
/*
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993,1994 John S. Dyson
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
 *	$Id: vnode_pager.c,v 1.2 1994/05/25 09:21:11 rgrimes Exp $
 */

/*
 * Page to/from files (vnodes).
 *
 * TODO:
 *	pageouts
 *	fix credential use (uses current process credentials now)
 */

/*
 * MODIFICATIONS:
 * John S. Dyson  08 Dec 93
 *
 * This file, in conjunction with some vm_fault modifications, eliminates
 * the performance advantage of using the buffer cache and minimizes
 * memory copies.
 *
 * 1) Supports multiple-block reads
 * 2) Bypasses buffer cache for reads
 *
 * TODO:
 *
 * 1) Totally bypass buffer cache for reads
 *    (Currently will still sometimes use buffer cache for reads)
 * 2) Bypass buffer cache for writes
 *    (Code does not support it, but mods are simple)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/mount.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#include <sys/buf.h>
#include <miscfs/specfs/specdev.h>

int     vnode_pager_putmulti();

void    vnode_pager_init();
vm_pager_t vnode_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t);
void    vnode_pager_dealloc();
int     vnode_pager_getpage();
int     vnode_pager_getmulti();
int     vnode_pager_putpage();
boolean_t vnode_pager_haspage();

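/*
 * Pager operations vector.  The VM pager layer dispatches vnode pager
 * requests through these entry points; the pg_ops field of each pager
 * structure allocated below points at this table.
 */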
struct pagerops vnodepagerops = {
	vnode_pager_init,
	vnode_pager_alloc,
	vnode_pager_dealloc,
	vnode_pager_getpage,
	vnode_pager_getmulti,
	vnode_pager_putpage,
	vnode_pager_putmulti,
	vnode_pager_haspage
};

static int vnode_pager_input(vn_pager_t vnp, vm_page_t * m, int count, int reqpage);
static int vnode_pager_output(vn_pager_t vnp, vm_page_t * m, int count, int *rtvals);
struct buf * getpbuf();
void relpbuf(struct buf * bp);

extern vm_map_t pager_map;

struct pagerlst vnode_pager_list;	/* list of managed vnodes */

#define MAXBP (PAGE_SIZE/DEV_BSIZE)

void
vnode_pager_init()
{
	TAILQ_INIT(&vnode_pager_list);
}

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 */
vm_pager_t
vnode_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register vn_pager_t vnp;
	vm_object_t object;
	struct vattr vattr;
	struct vnode *vp;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	/*
	 * Vnodes keep a pointer to any associated pager, so there is no
	 * need to look it up with vm_pager_lookup().
	 */
	vp = (struct vnode *) handle;
	pager = (vm_pager_t) vp->v_vmdata;
	if (pager == NULL) {

		/*
		 * Allocate pager structures
		 */
		pager = (vm_pager_t) malloc(sizeof *pager, M_VMPAGER, M_WAITOK);
		if (pager == NULL)
			return (NULL);
		vnp = (vn_pager_t) malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK);
		if (vnp == NULL) {
			free((caddr_t) pager, M_VMPAGER);
			return (NULL);
		}

		/*
		 * And an object of the appropriate size
		 */
		if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0) {
			object = vm_object_allocate(round_page(vattr.va_size));
			vm_object_enter(object, pager);
			vm_object_setpager(object, pager, 0, TRUE);
		} else {
			free((caddr_t) vnp, M_VMPGDATA);
			free((caddr_t) pager, M_VMPAGER);
			return (NULL);
		}

		/*
		 * Hold a reference to the vnode and initialize pager data.
		 */
		VREF(vp);
		vnp->vnp_flags = 0;
		vnp->vnp_vp = vp;
		vnp->vnp_size = vattr.va_size;

		TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list);
		pager->pg_handle = handle;
		pager->pg_type = PG_VNODE;
		pager->pg_ops = &vnodepagerops;
		pager->pg_data = (caddr_t) vnp;
		vp->v_vmdata = (caddr_t) pager;
	} else {

		/*
		 * vm_object_lookup() will remove the object from the cache if
		 * found and also gain a reference to the object.
		 */
		object = vm_object_lookup(pager);
	}
	return (pager);
}

void
vnode_pager_dealloc(pager)
	vm_pager_t pager;
{
	register vn_pager_t vnp = (vn_pager_t) pager->pg_data;
	register struct vnode *vp;
	struct proc *p = curproc;	/* XXX */

	if (vp = vnp->vnp_vp) {
		vp->v_vmdata = NULL;
		vp->v_flag &= ~VTEXT;
#if 0
		/* can hang if done at reboot on NFS FS */
		(void) VOP_FSYNC(vp, p->p_ucred, p);
#endif
		vrele(vp);
	}
	TAILQ_REMOVE(&vnode_pager_list, pager, pg_list);
	free((caddr_t) vnp, M_VMPGDATA);
	free((caddr_t) pager, M_VMPAGER);
}

int
vnode_pager_getmulti(pager, m, count, reqpage, sync)
	vm_pager_t pager;
	vm_page_t *m;
	int     count;
	int     reqpage;
	boolean_t sync;
{

	return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage);
}

int
vnode_pager_getpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	vm_page_t marray[1];

	if (pager == NULL)
		return FALSE;
	marray[0] = m;

	return vnode_pager_input((vn_pager_t) pager->pg_data, marray, 1, 0);
}

int
vnode_pager_putpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	vm_page_t marray[1];
	int     rtvals[1];

	if (pager == NULL)
		return FALSE;
	marray[0] = m;
	vnode_pager_output((vn_pager_t) pager->pg_data, marray, 1, rtvals);
	return rtvals[0];
}

int
vnode_pager_putmulti(pager, m, c, sync, rtvals)
	vm_pager_t pager;
	vm_page_t *m;
	int     c;
	boolean_t sync;
	int    *rtvals;
{
	return vnode_pager_output((vn_pager_t) pager->pg_data, m, c, rtvals);
}


boolean_t
vnode_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	register vn_pager_t vnp = (vn_pager_t) pager->pg_data;
	daddr_t bn;
	int     err;

	/*
	 * Offset beyond end of file, we do not have the page
	 */
	if (offset >= vnp->vnp_size) {
		return (FALSE);
	}

	/*
	 * Read the index to find the disk block to read from.  If there is no
	 * block, report that we don't have this data.
	 *
	 * Assumes that the vnode has the whole page or nothing.
	 */
	err = VOP_BMAP(vnp->vnp_vp,
		       offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize,
		       (struct vnode **) 0, &bn, 0);
	if (err) {
		return (TRUE);
	}
	return ((long) bn < 0 ? FALSE : TRUE);
}

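/*
 * Usage sketch for vnode_pager_setsize() below (the caller named here is
 * an assumption for illustration, not taken from this file): a filesystem
 * truncate or extend path would call
 *
 *	vnode_pager_setsize(vp, (u_long) newsize);
 *
 * after updating the inode, so that cached pages beyond the new EOF are
 * discarded and any partial page at the boundary is cleaned up.
 */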
/*
 * Lets the VM system know about a change in size for a file.
 * If this vnode is mapped into some address space (i.e. we have a pager
 * for it) we adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
	struct vnode *vp;
	u_long  nsize;
{
	register vn_pager_t vnp;
	register vm_object_t object;
	vm_pager_t pager;

	/*
	 * Not a mapped vnode
	 */
	if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	pager = (vm_pager_t) vp->v_vmdata;
	vnp = (vn_pager_t) pager->pg_data;
	if (nsize == vnp->vnp_size)
		return;

	/*
	 * No object. This can happen during object termination since
	 * vm_object_page_clean is called after the object has been removed
	 * from the hash table, and clean may cause vnode write operations
	 * which can wind up back here.
	 */
	object = vm_object_lookup(pager);
	if (object == NULL)
		return;

	/*
	 * File has shrunk. Toss any cached pages beyond the new EOF.
	 */
	if (nsize < vnp->vnp_size) {
		vm_object_lock(object);
		vm_object_page_remove(object,
			     round_page((vm_offset_t) nsize), vnp->vnp_size);
		vm_object_unlock(object);

		/*
		 * this gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode...
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup(object, trunc_page((vm_offset_t) nsize));
			if (m) {
				kva = vm_pager_map_page(m);
				bzero((caddr_t) kva + (nsize & PAGE_MASK),
				      round_page(nsize) - nsize);
				vm_pager_unmap_page(kva);
			}
		}
	} else {

		/*
		 * this allows the filesystem and VM cache to stay in sync if
		 * the VM page hasn't been modified...  After the page is
		 * removed -- it will be faulted back in from the filesystem
		 * cache.
		 */
		if (vnp->vnp_size & PAGE_MASK) {
			vm_page_t m;

			m = vm_page_lookup(object, trunc_page(vnp->vnp_size));
			if (m && (m->flags & PG_CLEAN)) {
				vm_object_lock(object);
				vm_object_page_remove(object,
					       vnp->vnp_size, vnp->vnp_size);
				vm_object_unlock(object);
			}
		}
	}
	vnp->vnp_size = (vm_offset_t) nsize;
	object->size = round_page(nsize);

	vm_object_deallocate(object);
}

void
vnode_pager_umount(mp)
	register struct mount *mp;
{
	register vm_pager_t pager, npager;
	struct vnode *vp;

	pager = vnode_pager_list.tqh_first;
	while (pager) {

		/*
		 * Save the next pointer now since uncaching may terminate the
		 * object and render the pager invalid
		 */
		vp = ((vn_pager_t) pager->pg_data)->vnp_vp;
		npager = pager->pg_list.tqe_next;
		if (mp == (struct mount *) 0 || vp->v_mount == mp)
			(void) vnode_pager_uncache(vp);
		pager = npager;
	}
}

/*
 * Remove vnode associated object from the object cache.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
boolean_t
vnode_pager_uncache(vp)
	register struct vnode *vp;
{
	register vm_object_t object;
	boolean_t uncached, locked;
	vm_pager_t pager;

	/*
	 * Not a mapped vnode
	 */
	pager = (vm_pager_t) vp->v_vmdata;
	if (pager == NULL)
		return (TRUE);

	/*
	 * Unlock the vnode if it is currently locked. We do this since
	 * uncaching the object may result in its destruction which may
	 * initiate paging activity which may necessitate locking the vnode.
	 */
	locked = VOP_ISLOCKED(vp);
	if (locked)
		VOP_UNLOCK(vp);

	/*
	 * Must use vm_object_lookup() as it actually removes the object from
	 * the cache list.
	 */
	object = vm_object_lookup(pager);
	if (object) {
		uncached = (object->ref_count <= 1);
		pager_cache(object, FALSE);
	} else
		uncached = TRUE;
	if (locked)
		VOP_LOCK(vp);
	return (uncached);
}

void
vnode_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * calculate the linear (byte) disk address of specified virtual
 * file address
 */
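/*
 * Worked example (numbers are illustrative only): with an f_iosize of
 * 8192 and DEV_BSIZE of 512, address 10000 yields vblock 1 and voffset
 * 1808; if VOP_BMAP() maps logical block 1 to device block 64, the
 * result is 64 * 512 + 1808 = 34576.  A return value of -1 means there
 * is no backing disk block (a hole) or the bmap failed.
 */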
vm_offset_t
vnode_pager_addr(vp, address)
	struct vnode *vp;
	vm_offset_t address;
{
	int     rtaddress;
	int     bsize;
	vm_offset_t block;
	struct vnode *rtvp;
	int     err;
	int     vblock, voffset;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	vblock = address / bsize;
	voffset = address % bsize;

	err = VOP_BMAP(vp, vblock, &rtvp, &block, 0);

	if (err || (long) block == -1)	/* bmap failure or hole */
		rtaddress = -1;
	else
		rtaddress = block * DEV_BSIZE + voffset;

	return rtaddress;
}

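/*
 * The raw I/O paths below borrow buffer headers from the swap pager's
 * physical buffer pool via getpbuf()/relpbuf() and set B_CALL so that
 * I/O completion calls vnode_pager_iodone(), which simply marks the
 * buffer done and wakes up the thread sleeping on it.
 */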
/*
 * interrupt routine for I/O completion
 */
void
vnode_pager_iodone(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	wakeup((caddr_t) bp);
}

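/*
 * Strategy for the small-block input case: the page is assembled one
 * filesystem block at a time.  A block that is resident in the buffer
 * cache is copied out of it, a block with a disk address is read raw
 * from the underlying device, and an unallocated block is zero-filled.
 */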
/*
 * small block file system vnode pager input
 */
int
vnode_pager_input_smlfs(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int     i;
	int     s;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t mapsize;
	vm_offset_t foff;
	vm_offset_t kva;
	int     fileaddr;
	int     block;
	vm_offset_t bsize;
	int     error = 0;

	paging_offset = m->object->paging_offset;
	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	foff = m->offset + paging_offset;

	VOP_BMAP(vp, foff, &dp, 0, 0);

	kva = vm_pager_map_page(m);

	for (i = 0; i < PAGE_SIZE / bsize; i++) {

		/*
		 * calculate logical block and offset
		 */
		block = foff / bsize + i;
		s = splbio();
		while (bp = incore(vp, block)) {
			int     amount;

			/*
			 * wait until the buffer is available or gone
			 */
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				tsleep((caddr_t) bp, PVM, "vnwblk", 0);
				continue;
			}
			amount = bsize;
			if ((foff + (i + 1) * bsize) > vnp->vnp_size)
				amount = vnp->vnp_size - (foff + i * bsize);

			/*
			 * make sure that this block is in the buffer
			 */
			if ((amount > 0) && amount <= bp->b_bcount) {
				bp->b_flags |= B_BUSY;
				splx(s);

				/*
				 * copy the data from the buffer
				 */
				bcopy(bp->b_un.b_addr, (caddr_t) kva + i * bsize, amount);
				if (amount < bsize) {
					bzero((caddr_t) kva + i * bsize + amount,
					      bsize - amount);
				}
				bp->b_flags &= ~B_BUSY;
				wakeup((caddr_t) bp);
				goto nextblock;
			}
			break;
		}
		splx(s);
		fileaddr = vnode_pager_addr(vp, foff + i * bsize);
		if (fileaddr != -1) {
			bp = getpbuf();
			VHOLD(vp);

			/* build a minimal buffer header */
			bp->b_flags = B_BUSY | B_READ | B_CALL;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_proc = curproc;
			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_un.b_addr = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr / DEV_BSIZE;
			bgetvp(dp, bp);
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;

			/* do the input */
			VOP_STRATEGY(bp);

			/* we definitely need to be at splbio here */

			s = splbio();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep((caddr_t) bp, PVM, "vnsrd", 0);
			}
			splx(s);
			if ((bp->b_flags & B_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp);
			HOLDRELE(vp);
			if (error)
				break;
		} else {
			bzero((caddr_t) kva + i * bsize, bsize);
		}
nextblock:;
	}
	vm_pager_unmap_page(kva);
	if (error) {
		return VM_PAGER_FAIL;
	}
	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	m->flags |= PG_CLEAN;
	m->flags &= ~PG_LAUNDRY;
	return VM_PAGER_OK;
}

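/*
 * The "old" input path below does an ordinary VOP_READ() into the mapped
 * page, so the data takes the filesystem's normal read path (including
 * the buffer cache for most filesystems) at the cost of an extra copy.
 * It is the fallback when VOP_BMAP() fails or no kva can be obtained.
 */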
/*
 * old style vnode pager input routine
 */
int
vnode_pager_input_old(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	struct uio auio;
	struct iovec aiov;
	int     error;
	int     size;
	vm_offset_t foff;
	vm_offset_t kva;

	error = 0;
	foff = m->offset + m->object->paging_offset;

	/*
	 * Return failure if beyond current EOF
	 */
	if (foff >= vnp->vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (foff + size > vnp->vnp_size)
			size = vnp->vnp_size - foff;
/*
 * Allocate a kernel virtual address and initialize so that
 * we can use VOP_READ/WRITE routines.
 */
		kva = vm_pager_map_page(m);
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = foff;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_READ;
		auio.uio_resid = size;
		auio.uio_procp = (struct proc *) 0;

		error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred);
		if (!error) {
			register int count = size - auio.uio_resid;

			if (count == 0)
				error = EINVAL;
			else if (count != PAGE_SIZE)
				bzero((caddr_t) kva + count, PAGE_SIZE - count);
		}
		vm_pager_unmap_page(kva);
	}
	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	m->flags |= PG_CLEAN;
	m->flags &= ~PG_LAUNDRY;
	return error ? VM_PAGER_FAIL : VM_PAGER_OK;
}

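/*
 * Overview of the generic input path: map up to "count" pages into a
 * kva, satisfy the request from the buffer cache if the requested
 * page's block is already incore, and otherwise trim the page run to
 * the largest device-contiguous span around the requested page and
 * read it with a single raw VOP_STRATEGY() call.  The old or
 * small-block routines above are used when this is not possible.
 */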
/*
 * generic vnode pager input routine
 */
int
vnode_pager_input(vnp, m, count, reqpage)
	register vn_pager_t vnp;
	vm_page_t *m;
	int     count, reqpage;
{
	int     i;
	vm_offset_t kva, foff;
	int     size;
	struct proc *p = curproc;	/* XXX */
	vm_object_t object;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	vm_offset_t mapsize;
	int     bsize;

	int     first, last;
	int     reqaddr, firstaddr;
	int     block, offset;

	struct buf *bp;
	int     s;
	int     failflag;

	int     errtype = 0;	/* 0 is file type otherwise vm type */
	int     error = 0;

	object = m[reqpage]->object;	/* all vm_page_t items are in same
					 * object */
	paging_offset = object->paging_offset;

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	/* get the UNDERLYING device for the file with VOP_BMAP() */

	/*
	 * originally, we did not check for an error return value -- assuming
	 * an fs always has a bmap entry point -- that assumption is wrong!!!
	 */
	kva = 0;
	mapsize = 0;
	foff = m[reqpage]->offset + paging_offset;
	if (!VOP_BMAP(vp, foff, &dp, 0, 0)) {

		/*
		 * Try to get a kva without blocking; if that fails, give
		 * back the read-ahead/read-behind pages and wait for a
		 * single-page kva for the requested page only.
		 */
		kva = kmem_alloc_pageable(pager_map, (mapsize = count * PAGE_SIZE));
		if (!kva) {
			for (i = 0; i < count; i++) {
				if (i != reqpage) {
					vnode_pager_freepage(m[i]);
				}
			}
			m[0] = m[reqpage];
			kva = kmem_alloc_wait(pager_map, mapsize = PAGE_SIZE);
			reqpage = 0;
			count = 1;
		}
	}

	/*
	 * if we can't get a kva or we can't bmap, use old VOP code
	 */
	if (!kva) {
		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vnode_pager_freepage(m[i]);
			}
		}
		return vnode_pager_input_old(vnp, m[reqpage]);

		/*
		 * if the blocksize is smaller than a page size, then use
		 * special small filesystem code.  NFS sometimes has a small
		 * blocksize, but it can handle large reads itself.
		 */
	} else if ((PAGE_SIZE / bsize) > 1 &&
		   (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) {

		kmem_free_wakeup(pager_map, kva, mapsize);

		for (i = 0; i < count; i++) {
			if (i != reqpage) {
				vnode_pager_freepage(m[i]);
			}
		}
		return vnode_pager_input_smlfs(vnp, m[reqpage]);
	}
/*
 * here on direct device I/O
 */


	/*
	 * This pathetic hack gets data from the buffer cache, if it's there.
	 * I believe that this is not really necessary, and the same ends can
	 * be achieved by defaulting to the normal vfs read behavior, but this
	 * might be more efficient, because it will NOT invoke read-aheads,
	 * and one of the purposes of this code is to bypass the buffer cache
	 * and keep from flushing it by reading in a program.
	 */

	/*
	 * calculate logical block and offset
	 */
	block = foff / bsize;
	offset = foff % bsize;
	s = splbio();

	/*
	 * if we have a buffer in core, then try to use it
	 */
	while (bp = incore(vp, block)) {
		int     amount;

		/*
		 * wait until the buffer is available or gone
		 */
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t) bp, PVM, "vnwblk", 0);
			continue;
		}
		amount = PAGE_SIZE;
		if ((foff + amount) > vnp->vnp_size)
			amount = vnp->vnp_size - foff;

		/*
		 * make sure that this page is in the buffer
		 */
		if ((amount > 0) && (offset + amount) <= bp->b_bcount) {
			bp->b_flags |= B_BUSY;
			splx(s);

			/*
			 * map the requested page
			 */
			pmap_kenter(kva, VM_PAGE_TO_PHYS(m[reqpage]));
			pmap_update();

			/*
			 * copy the data from the buffer
			 */
			bcopy(bp->b_un.b_addr + offset, (caddr_t) kva, amount);
			if (amount < PAGE_SIZE) {
				bzero((caddr_t) kva + amount, PAGE_SIZE - amount);
			}

			/*
			 * unmap the page and free the kva
			 */
			pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE);
			kmem_free_wakeup(pager_map, kva, mapsize);

			/*
			 * release the buffer back to the block subsystem
			 */
			bp->b_flags &= ~B_BUSY;
			wakeup((caddr_t) bp);

			/*
			 * we did not have to do any work to get the requested
			 * page, so the read-behind/read-ahead does not justify
			 * a read
			 */
			for (i = 0; i < count; i++) {
				if (i != reqpage) {
					vnode_pager_freepage(m[i]);
				}
			}
			m[0] = m[reqpage];
			reqpage = 0;
			count = 1;

			/*
			 * sorry for the goto
			 */
			goto finishup;
		}

		/*
		 * buffer is nowhere to be found, read from the disk
		 */
		break;
	}
	splx(s);

	reqaddr = vnode_pager_addr(vp, foff);
	s = splbio();

	/*
	 * Make sure that our I/O request is contiguous. Scan backward and
	 * stop for the first discontiguous entry or stop for a page being in
	 * buffer cache.
	 */
	failflag = 0;
	first = reqpage;
	for (i = reqpage - 1; i >= 0; --i) {
		if (failflag ||
		    incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) ||
		    (vnode_pager_addr(vp, m[i]->offset + paging_offset))
		    != reqaddr + (i - reqpage) * PAGE_SIZE) {
			vnode_pager_freepage(m[i]);
			failflag = 1;
		} else {
			first = i;
		}
	}

	/*
	 * Scan forward and stop for the first non-contiguous entry or stop
	 * for a page being in buffer cache.
	 */
	failflag = 0;
	last = reqpage + 1;
	for (i = reqpage + 1; i < count; i++) {
		if (failflag ||
		    incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) ||
		    (vnode_pager_addr(vp, m[i]->offset + paging_offset))
		    != reqaddr + (i - reqpage) * PAGE_SIZE) {
			vnode_pager_freepage(m[i]);
			failflag = 1;
		} else {
			last = i + 1;
		}
	}
	splx(s);

	/*
	 * the first and last page have been calculated now, move input pages
	 * to be zero based...
	 */
	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
		}
		count -= first;
		reqpage -= first;
	}

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = m[0]->offset + paging_offset;

	/*
	 * and get the disk physical address (in bytes)
	 */
	firstaddr = vnode_pager_addr(vp, foff);

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > vnp->vnp_size)
		size = vnp->vnp_size - foff;

	/*
	 * round up physical size for real devices
	 */
	if (dp->v_type == VBLK || dp->v_type == VCHR)
		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

	/*
	 * and map the pages to be read into the kva
	 */
	for (i = 0; i < count; i++)
		pmap_kenter(kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i]));

	pmap_update();
	bp = getpbuf();
	VHOLD(vp);

	/* build a minimal buffer header */
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_proc = curproc;
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = firstaddr / DEV_BSIZE;
	bgetvp(dp, bp);
	bp->b_bcount = size;
	bp->b_bufsize = size;

	/* do the input */
	VOP_STRATEGY(bp);

	s = splbio();
	/* we definitely need to be at splbio here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t) bp, PVM, "vnread", 0);
	}
	splx(s);
	if ((bp->b_flags & B_ERROR) != 0)
		error = EIO;

	if (!error) {
		if (size != count * PAGE_SIZE)
			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
	}
	pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count);
	kmem_free_wakeup(pager_map, kva, mapsize);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp);
	HOLDRELE(vp);

1038	for (i = 0; i < count; i++) {
1039		pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
1040		m[i]->flags |= PG_CLEAN;
1041		m[i]->flags &= ~PG_LAUNDRY;
1042		if (i != reqpage) {
1043
1044			/*
1045			 * whether or not to leave the page activated is up in
1046			 * the air, but we should put the page on a page queue
1047			 * somewhere. (it already is in the object). Result:
1048			 * It appears that emperical results show that
1049			 * deactivating pages is best.
1050			 */
1051
1052			/*
1053			 * just in case someone was asking for this page we
1054			 * now tell them that it is ok to use
1055			 */
1056			if (!error) {
1057				vm_page_deactivate(m[i]);
1058				PAGE_WAKEUP(m[i]);
1059				m[i]->flags &= ~PG_FAKE;
1060			} else {
1061				vnode_pager_freepage(m[i]);
1062			}
1063		}
1064	}
1065	if (error) {
1066		printf("vnode pager read error: %d\n", error);
1067	}
1068	if (errtype)
1069		return error;
1070	return (error ? VM_PAGER_FAIL : VM_PAGER_OK);
1071}
1072
/*
 * old-style vnode pager output routine
 */
int
vnode_pager_output_old(vnp, m)
	register vn_pager_t vnp;
	vm_page_t m;
{
	vm_offset_t foff;
	vm_offset_t kva;
	vm_offset_t size;
	struct iovec aiov;
	struct uio auio;
	struct vnode *vp;
	int     error;

	vp = vnp->vnp_vp;
	foff = m->offset + m->object->paging_offset;

	/*
	 * Return failure if beyond current EOF
	 */
	if (foff >= vnp->vnp_size) {
		return VM_PAGER_BAD;
	} else {
		size = PAGE_SIZE;
		if (foff + size > vnp->vnp_size)
			size = vnp->vnp_size - foff;
/*
 * Allocate a kernel virtual address and initialize so that
 * we can use VOP_WRITE routines.
 */
		kva = vm_pager_map_page(m);
		aiov.iov_base = (caddr_t) kva;
		aiov.iov_len = size;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = foff;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_WRITE;
		auio.uio_resid = size;
		auio.uio_procp = (struct proc *) 0;

		error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred);

		if (!error) {
			if ((size - auio.uio_resid) == 0) {
				error = EINVAL;
			}
		}
		vm_pager_unmap_page(kva);
		return error ? VM_PAGER_FAIL : VM_PAGER_OK;
	}
}

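/*
 * For the small-block output case below, any incore buffer overlapping
 * a block being written is invalidated first, so that the raw device
 * write does not leave a stale copy of the data in the buffer cache.
 */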
/*
 * vnode pager output on a small-block file system
 */
int
vnode_pager_output_smlfs(vnp, m)
	vn_pager_t vnp;
	vm_page_t m;
{
	int     i;
	int     s;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t mapsize;
	vm_offset_t foff;
	vm_offset_t kva;
	int     fileaddr;
	int     block;
	vm_offset_t bsize;
	int     error = 0;

	paging_offset = m->object->paging_offset;
	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	foff = m->offset + paging_offset;

	VOP_BMAP(vp, foff, &dp, 0, 0);
	kva = vm_pager_map_page(m);
	for (i = 0; !error && i < (PAGE_SIZE / bsize); i++) {

		/*
		 * calculate the disk address of this block
		 */
		fileaddr = vnode_pager_addr(vp, foff + i * bsize);
		if (fileaddr != -1) {
			s = splbio();
			if (bp = incore(vp, (foff / bsize) + i)) {
				bp = getblk(vp, (foff / bsize) + i, bp->b_bufsize, 0, 0);
				bp->b_flags |= B_INVAL;
				brelse(bp);
			}
			splx(s);

			bp = getpbuf();
			VHOLD(vp);

			/* build a minimal buffer header */
			bp->b_flags = B_BUSY | B_CALL | B_WRITE;
			bp->b_iodone = vnode_pager_iodone;
			bp->b_proc = curproc;
			bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
			if (bp->b_rcred != NOCRED)
				crhold(bp->b_rcred);
			if (bp->b_wcred != NOCRED)
				crhold(bp->b_wcred);
			bp->b_un.b_addr = (caddr_t) kva + i * bsize;
			bp->b_blkno = fileaddr / DEV_BSIZE;
			bgetvp(dp, bp);
			++dp->v_numoutput;
			/* for NFS */
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = bsize;
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;

			/* do the output */
			VOP_STRATEGY(bp);

			/* we definitely need to be at splbio here */

			s = splbio();
			while ((bp->b_flags & B_DONE) == 0) {
				tsleep((caddr_t) bp, PVM, "vnswrt", 0);
			}
			splx(s);
			if ((bp->b_flags & B_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			relpbuf(bp);
			HOLDRELE(vp);
		}
	}
	vm_pager_unmap_page(kva);
	if (error)
		return VM_PAGER_FAIL;
	else
		return VM_PAGER_OK;
}

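/*
 * Overview of the generic output path: a filesystem without a usable
 * VOP_BMAP() takes the old VOP_WRITE() path; a non-NFS filesystem with
 * a small block size goes through the per-block routine above; otherwise
 * the run of pages is trimmed to a device-contiguous span and written
 * with a single raw VOP_STRATEGY() call, retrying one page at a time on
 * error.
 */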
/*
 * generic vnode pager output routine
 */
int
vnode_pager_output(vnp, m, count, rtvals)
	vn_pager_t vnp;
	vm_page_t *m;
	int     count;
	int    *rtvals;
{
	int     i, j;
	vm_offset_t kva, foff;
	int     size;
	struct proc *p = curproc;	/* XXX */
	vm_object_t object;
	vm_offset_t paging_offset;
	struct vnode *dp, *vp;
	struct buf *bp;
	vm_offset_t mapsize;
	vm_offset_t reqaddr;
	int     bsize;
	int     s;

	int     error = 0;

retryoutput:
	object = m[0]->object;	/* all vm_page_t items are in same object */
	paging_offset = object->paging_offset;

	vp = vnp->vnp_vp;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	/*
	 * if the filesystem does not have a bmap, then use the old code
	 */
	if (VOP_BMAP(vp, m[0]->offset + paging_offset, &dp, 0, 0)) {

		rtvals[0] = vnode_pager_output_old(vnp, m[0]);

		pmap_clear_modify(VM_PAGE_TO_PHYS(m[0]));
		m[0]->flags |= PG_CLEAN;
		m[0]->flags &= ~PG_LAUNDRY;
		return rtvals[0];
	}

	/*
	 * if the filesystem has a small blocksize, then use the small block
	 * filesystem output code
	 */
	if ((bsize < PAGE_SIZE) &&
	    (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) {

		for (i = 0; i < count; i++) {
			rtvals[i] = vnode_pager_output_smlfs(vnp, m[i]);
			if (rtvals[i] == VM_PAGER_OK) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
			}
		}
		return rtvals[0];
	}

	/*
	 * get some kva for the output
	 */
	kva = kmem_alloc_pageable(pager_map, (mapsize = count * PAGE_SIZE));
	if (!kva) {
		kva = kmem_alloc_pageable(pager_map, (mapsize = PAGE_SIZE));
		count = 1;
		if (!kva)
			return rtvals[0];
	}
	for (i = 0; i < count; i++) {
		foff = m[i]->offset + paging_offset;
		if (foff >= vnp->vnp_size) {
			for (j = i; j < count; j++)
				rtvals[j] = VM_PAGER_BAD;
			count = i;
			break;
		}
	}
	if (count == 0) {
		return rtvals[0];
	}
	foff = m[0]->offset + paging_offset;
	reqaddr = vnode_pager_addr(vp, foff);

	/*
	 * Scan forward and stop at the first non-contiguous entry.
	 */
	for (i = 1; i < count; i++) {
		if (vnode_pager_addr(vp, m[i]->offset + paging_offset)
		    != reqaddr + i * PAGE_SIZE) {
			count = i;
			break;
		}
	}

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	if ((foff + size) > vnp->vnp_size)
		size = vnp->vnp_size - foff;

	/*
	 * round up physical size for real devices
	 */
	if (dp->v_type == VBLK || dp->v_type == VCHR)
		size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

	/*
	 * and map the pages to be written into the kva
	 */
	for (i = 0; i < count; i++)
		pmap_kenter(kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i]));
	pmap_update();
/*
	printf("vnode: writing foff: %d, devoff: %d, size: %d\n",
		foff, reqaddr, size);
*/

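	/*
	 * Keep the buffer cache coherent with the raw write below: a
	 * delayed-write buffer no larger than a page is simply discarded,
	 * since the page being written supersedes its contents, while a
	 * larger buffer may hold dirty data outside this page run and is
	 * therefore written out before being invalidated.
	 */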
	/*
	 * next invalidate the incore vfs_bio data
	 */
	for (i = 0; i < count; i++) {
		int     filblock = (foff + i * PAGE_SIZE) / bsize;
		struct buf *fbp;

		s = splbio();
		if (fbp = incore(vp, filblock)) {
			fbp = getblk(vp, filblock, fbp->b_bufsize, 0, 0);
			if (fbp->b_flags & B_DELWRI) {
				if (fbp->b_bufsize <= PAGE_SIZE)
					fbp->b_flags &= ~B_DELWRI;
				else {
					bwrite(fbp);
					fbp = getblk(vp, filblock,
						     fbp->b_bufsize, 0, 0);
				}
			}
			fbp->b_flags |= B_INVAL;
			brelse(fbp);
		}
		splx(s);
	}

	bp = getpbuf();
	VHOLD(vp);
	/* build a minimal buffer header */
	bp->b_flags = B_BUSY | B_WRITE | B_CALL;
	bp->b_iodone = vnode_pager_iodone;
	/* B_PHYS is not set, but it is nice to fill this in */
	bp->b_proc = curproc;
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;

	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr / DEV_BSIZE;
	bgetvp(dp, bp);
	++dp->v_numoutput;

	/* for NFS */
	bp->b_dirtyoff = 0;
	bp->b_dirtyend = size;

	bp->b_bcount = size;
	bp->b_bufsize = size;

	/* do the output */
	VOP_STRATEGY(bp);

	s = splbio();

	/* we definitely need to be at splbio here */

	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t) bp, PVM, "vnwrite", 0);
	}
	splx(s);

	if ((bp->b_flags & B_ERROR) != 0)
		error = EIO;

	pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count);
	kmem_free_wakeup(pager_map, kva, mapsize);

	/*
	 * free the buffer header back to the swap buffer pool
	 */
	relpbuf(bp);
	HOLDRELE(vp);

	if (!error) {
		for (i = 0; i < count; i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
			m[i]->flags |= PG_CLEAN;
			m[i]->flags &= ~PG_LAUNDRY;
			rtvals[i] = VM_PAGER_OK;
		}
	} else if (count != 1) {
		error = 0;
		count = 1;
		goto retryoutput;
	}
	if (error) {
		printf("vnode pager write error: %d\n", error);
	}
	return (error ? VM_PAGER_FAIL : VM_PAGER_OK);
}