/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

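/*
 * ffs_rawread -- support for raw (uncached) reads on FFS.
 *
 * Eligible reads bypass the buffer cache: the user buffer is wired and
 * mapped with vmapbuf() and BIO_READ requests are issued directly against
 * the underlying device vnode, with optional one-buffer readahead.
 */
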
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_rawread.c 144367 2005-03-31 04:37:09Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp,
				 caddr_t udata,
				 off_t offset,
				 size_t len,
				 struct thread *td,
				 struct buf *bp,
				 caddr_t sa);
static int ffs_rawread_main(struct vnode *vp,
			    struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp, struct thread *td);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

static void ffs_rawreadwakeup(struct buf *bp);


SYSCTL_DECL(_vfs_ffs);

static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
	   "Buffers available for raw reads");

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");


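/*
 * Size the pool of pbufs available for raw reads from the number of swap
 * buffers, leaving some headroom for the rest of the system.
 */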
void
ffs_rawread_setup(void)
{
	ffsrawbufcnt = (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}


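/*
 * Make sure the on-disk copy of the file is current before reading it
 * directly: clean dirty mmap()ed pages, wait for pending writes and flush
 * any dirty buffers, upgrading to an exclusive vnode lock if necessary.
 */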
static int
ffs_rawread_sync(struct vnode *vp, struct thread *td)
{
	int spl;
	int error;
	int upgraded;
	struct bufobj *bo;

	GIANT_REQUIRED;
	/* Check for dirty mmap, pending writes and dirty buffers */
	spl = splbio();
	VI_LOCK(vp);
	bo = &vp->v_bufobj;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    (vp->v_iflag & VI_OBJDIRTY) != 0) {
		splx(spl);
		VI_UNLOCK(vp);

		if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock; this might block */
			VOP_LOCK(vp, LK_UPGRADE, td);
		} else
			upgraded = 0;

		/* Attempt to msync mmap() regions to clean dirty mmap */
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
			VI_UNLOCK(vp);
			if (vp->v_object != NULL) {
				VM_OBJECT_LOCK(vp->v_object);
				vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
				VM_OBJECT_UNLOCK(vp->v_object);
			}
			VI_LOCK(vp);
		}

		/* Wait for pending writes to complete */
		spl = splbio();
		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
		if (error != 0) {
			/* XXX: can't happen with a zero timeout ??? */
			splx(spl);
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE, td);
			return (error);
		}
		/* Flush dirty buffers */
		if (bo->bo_dirty.bv_cnt > 0) {
			splx(spl);
			VI_UNLOCK(vp);
			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE, td);
				return (error);
			}
			VI_LOCK(vp);
			spl = splbio();
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("ffs_rawread_sync: dirty bufs");
		}
		splx(spl);
		VI_UNLOCK(vp);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE, td);
	} else {
		splx(spl);
		VI_UNLOCK(vp);
	}
	return 0;
}


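/*
 * Issue one raw read (or readahead) of up to len bytes at offset into the
 * user buffer udata: translate the file offset with ufs_bmaparray(), clip
 * the transfer to the contiguous block run (and the pbuf's KVA window),
 * map the user pages with vmapbuf() and hand the buffer to the device's
 * strategy routine.  Holes are zero-filled without touching the device.
 */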
static int
ffs_rawread_readahead(struct vnode *vp,
		      caddr_t udata,
		      off_t offset,
		      size_t len,
		      struct thread *td,
		      struct buf *bp,
		      caddr_t sa)
{
	int error;
	u_int iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;
	struct inode *ip;
	ufs2_daddr_t blkno;

	GIANT_REQUIRED;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	ip = VTOI(vp);
	dp = ip->i_devvp;

	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = 0;	/* XXX necessary ? */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = ffs_rawreadwakeup;
	bp->b_data = udata;
	bp->b_saveaddr = sa;
	blockno = offset / bsize;
	blockoff = (offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno) {
		return EINVAL; /* blockno overflow */
	}

	bp->b_lblkno = bp->b_blkno = blockno;

	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL);
	if (error != 0)
		return error;
	if (blkno == -1) {

		/* Fill holes with NULs to preserve semantics */

		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;

		if (vmapbuf(bp) < 0)
			return EFAULT;

		if (ticks - PCPU_GET(switchticks) >= hogticks)
			uio_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */

		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}
	bp->b_blkno = blkno + blockoff;
	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;

	if (vmapbuf(bp) < 0)
		return EFAULT;

	BO_STRATEGY(&dp->v_bufobj, bp);
	return 0;
}


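/*
 * Drive the raw read loop: issue the first transfer, optionally keep one
 * readahead transfer in flight, wait for each buffer to complete, unmap it
 * and advance the uio.  Short transfers are retried for the remaining part,
 * and any outstanding readahead is drained before returning.
 */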
static int
ffs_rawread_main(struct vnode *vp,
		 struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	caddr_t sa, nsa, tsa;
	u_int iolen;
	int spl;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	GIANT_REQUIRED;
	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * Keep the process from being swapped.
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;

	bp = NULL;
	nbp = NULL;
	sa = NULL;
	nsa = NULL;

	while (resid > 0) {

		if (bp == NULL) { /* Setup first read */
			/* XXX: Leave some bufs for swap */
			bp = getpbuf(&ffsrawbufcnt);
			sa = bp->b_data;
			bp->b_vp = vp;
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp, sa);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup first readahead */
				/* XXX: Leave bufs for swap */
				if (rawreadahead != 0)
					nbp = trypbuf(&ffsrawbufcnt);
				else
					nbp = NULL;
				if (nbp != NULL) {
					nsa = nbp->b_data;
					nbp->b_vp = vp;

					nerror = ffs_rawread_readahead(vp,
								       udata +
								       bp->b_bufsize,
								       offset +
								       bp->b_bufsize,
								       resid -
								       bp->b_bufsize,
								       td,
								       nbp,
								       nsa);
					if (nerror) {
						relpbuf(nbp, &ffsrawbufcnt);
						nbp = NULL;
					}
				}
			}
		}

		spl = splbio();
		bwait(bp, PRIBIO, "rawrd");
		splx(spl);

		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break; /* EOF */
		}

		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(vp,
						      udata,
						      offset,
						      bp->b_bufsize - iolen,
						      td,
						      bp,
						      sa);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */

			tbp = bp;
			bp = nbp;
			nbp = tbp;

			tsa = sa;
			sa = nsa;
			nsa = tsa;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				relpbuf(nbp, &ffsrawbufcnt);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
							       udata +
							       bp->b_bufsize,
							       offset +
							       bp->b_bufsize,
							       resid -
							       bp->b_bufsize,
							       td,
							       nbp,
							       nsa);
				if (nerror != 0) {
					relpbuf(nbp, &ffsrawbufcnt);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) { /* Deferred readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp, sa);
			if (error != 0)
				break;
		}
	}

	if (bp != NULL)
		relpbuf(bp, &ffsrawbufcnt);
	if (nbp != NULL) {			/* Run down readahead buffer */
		spl = splbio();
		bwait(nbp, PRIBIO, "rawrd");
		splx(spl);
		vunmapbuf(nbp);
		relpbuf(nbp, &ffsrawbufcnt);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}


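/*
 * Top-level check called from the FFS read path.  If the request qualifies
 * (raw reads enabled, a single sector-aligned userspace iovec, no
 * deadlock-avoidance flag set), sync the vnode and perform as much of the
 * read as possible raw.  *workdone is set to 1 when the whole request was
 * satisfied here; otherwise the caller finishes the read through the
 * buffer cache.
 */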
int
ffs_rawread(struct vnode *vp,
	    struct uio *uio,
	    int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
	     TDP_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;

		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ip->i_devvp->v_bufobj.bo_bsize;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {

			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp,
						 (uio->uio_td != NULL) ?
						 uio->uio_td : curthread);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				partialbytes = ((unsigned int) ip->i_size) %
					ip->i_fs->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
						blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}


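/*
 * I/O completion callback: mark the buffer done and wake up the thread
 * sleeping in bwait().
 */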
static void
ffs_rawreadwakeup(struct buf *bp)
{
	bdone(bp);
}