/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_rawread.c 166506 2007-02-04 23:42:02Z tegge $");

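/*
 * Support for "raw" reads from FFS: sector-aligned reads that bypass
 * the buffer cache and transfer data from the underlying device
 * directly into user memory via mapped pbufs.  ffs_rawread() is the
 * entry point and is expected to be called from the FFS read path for
 * direct-I/O style requests when vfs.ffs.allowrawread is enabled.
 */
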
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp,
				 caddr_t udata,
				 off_t offset,
				 size_t len,
				 struct thread *td,
				 struct buf *bp,
				 caddr_t sa);
static int ffs_rawread_main(struct vnode *vp,
			    struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp, struct thread *td);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

SYSCTL_DECL(_vfs_ffs);

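/*
 * Tunables: ffsrawbufcnt bounds the number of pbufs the raw read code
 * may consume (sized from nswbuf in ffs_rawread_setup() and exported
 * read-only), allowrawread enables the facility, and rawreadahead
 * enables issuing a second, overlapping read for long requests.
 */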
static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
	   "Buffers available for raw reads");

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");


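/*
 * Size the pool of pbufs available for raw reads.  Most of nswbuf is
 * made available, but a portion (1/16th, or 8 buffers on small
 * configurations) is kept in reserve for the swap pager.
 */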
void
ffs_rawread_setup(void)
{
	ffsrawbufcnt = (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}


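/*
 * Ensure that the file data about to be read directly from the device
 * is consistent: flush dirty mmap() pages, wait for pending writes and
 * sync out dirty buffers.  The vnode lock is temporarily upgraded to
 * exclusive if the flush requires it, and downgraded again before
 * returning.
 */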
static int
ffs_rawread_sync(struct vnode *vp, struct thread *td)
{
	int spl;
	int error;
	int upgraded;
	struct bufobj *bo;
	struct mount *mp;

	/* Check for dirty mmap, pending writes and dirty buffers */
	spl = splbio();
	VI_LOCK(vp);
	bo = &vp->v_bufobj;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    (vp->v_iflag & VI_OBJDIRTY) != 0) {
		splx(spl);
		VI_UNLOCK(vp);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE)
				upgraded = 1;
			else
				upgraded = 0;
			VOP_UNLOCK(vp, 0, td);
			(void) vn_start_write(vp, &mp, V_WAIT);
			VOP_LOCK(vp, LK_EXCLUSIVE, td);
		} else if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			VOP_LOCK(vp, LK_UPGRADE, td);
		} else
			upgraded = 0;

		VI_LOCK(vp);
		/* Check if vnode was reclaimed while unlocked. */
		if ((vp->v_iflag & VI_DOOMED) != 0) {
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE, td);
			vn_finished_write(mp);
			return (EIO);
		}
		/* Attempt to msync mmap() regions to clean dirty mmap */
		if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
			VI_UNLOCK(vp);
			if (vp->v_object != NULL) {
				VM_OBJECT_LOCK(vp->v_object);
				vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
				VM_OBJECT_UNLOCK(vp->v_object);
			}
			VI_LOCK(vp);
		}

		/* Wait for pending writes to complete */
		spl = splbio();
		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
		if (error != 0) {
			/* XXX: can't happen with a zero timeout ??? */
			splx(spl);
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE, td);
			vn_finished_write(mp);
			return (error);
		}
		/* Flush dirty buffers */
		if (bo->bo_dirty.bv_cnt > 0) {
			splx(spl);
			VI_UNLOCK(vp);
			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE, td);
				vn_finished_write(mp);
				return (error);
			}
			VI_LOCK(vp);
			spl = splbio();
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("ffs_rawread_sync: dirty bufs");
		}
		splx(spl);
		VI_UNLOCK(vp);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE, td);
		vn_finished_write(mp);
	} else {
		splx(spl);
		VI_UNLOCK(vp);
	}
	return 0;
}


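/*
 * Issue a single asynchronous BIO_READ from the backing device directly
 * into the user buffer described by udata/len, using the supplied pbuf.
 * The transfer length is clipped to the pbuf's KVA size and to the run
 * of contiguous blocks reported by ufs_bmaparray().  Holes in the file
 * are zero-filled in place instead of being read from the device.
 */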
static int
ffs_rawread_readahead(struct vnode *vp,
		      caddr_t udata,
		      off_t offset,
		      size_t len,
		      struct thread *td,
		      struct buf *bp,
		      caddr_t sa)
{
	int error;
	u_int iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;
	struct inode *ip;
	ufs2_daddr_t blkno;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	ip = VTOI(vp);
	dp = ip->i_devvp;

	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = 0;	/* XXX necessary ? */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = bdone;
	bp->b_data = udata;
	bp->b_saveaddr = sa;
	blockno = offset / bsize;
	blockoff = (offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno) {
		return EINVAL; /* blockno overflow */
	}

	bp->b_lblkno = bp->b_blkno = blockno;

	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL);
	if (error != 0)
		return error;
	if (blkno == -1) {

		/* Fill holes with NULs to preserve semantics */

		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;

		if (vmapbuf(bp) < 0)
			return EFAULT;

		if (ticks - PCPU_GET(switchticks) >= hogticks)
			uio_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */

		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}
	bp->b_blkno = blkno + blockoff;
	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;

	if (vmapbuf(bp) < 0)
		return EFAULT;

	BO_STRATEGY(&dp->v_bufobj, bp);
	return 0;
}


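/*
 * Main raw read loop.  The request is carved into pbuf-sized chunks;
 * while one chunk is in flight a second pbuf may be used for readahead
 * of the following chunk, and the two buffers are swapped as each I/O
 * completes.  Short transfers are retried for the remaining bytes, and
 * a zero-length transfer without an error is treated as EOF.
 */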
static int
ffs_rawread_main(struct vnode *vp,
		 struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	caddr_t sa, nsa, tsa;
	u_int iolen;
	int spl;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * keep the process from being swapped
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;

	bp = NULL;
	nbp = NULL;
	sa = NULL;
	nsa = NULL;

	while (resid > 0) {

		if (bp == NULL) { /* Setup first read */
			/* XXX: Leave some bufs for swap */
			bp = getpbuf(&ffsrawbufcnt);
			sa = bp->b_data;
			pbgetvp(vp, bp);
			error = ffs_rawread_readahead(vp, udata, offset,
						     resid, td, bp, sa);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup first readahead */
				/* XXX: Leave bufs for swap */
				if (rawreadahead != 0)
					nbp = trypbuf(&ffsrawbufcnt);
				else
					nbp = NULL;
				if (nbp != NULL) {
					nsa = nbp->b_data;
					pbgetvp(vp, nbp);

					nerror = ffs_rawread_readahead(vp,
								       udata +
								       bp->b_bufsize,
								       offset +
								       bp->b_bufsize,
								       resid -
								       bp->b_bufsize,
								       td,
								       nbp,
								       nsa);
					if (nerror) {
						pbrelvp(nbp);
						relpbuf(nbp, &ffsrawbufcnt);
						nbp = NULL;
					}
				}
			}
		}

		spl = splbio();
		bwait(bp, PRIBIO, "rawrd");
		splx(spl);

		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break; /* EOF */
		}

		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(vp,
						      udata,
						      offset,
						      bp->b_bufsize - iolen,
						      td,
						      bp,
						      sa);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */

			tbp = bp;
			bp = nbp;
			nbp = tbp;

			tsa = sa;
			sa = nsa;
			nsa = tsa;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				pbrelvp(nbp);
				relpbuf(nbp, &ffsrawbufcnt);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
							       udata +
							       bp->b_bufsize,
							       offset +
							       bp->b_bufsize,
							       resid -
							       bp->b_bufsize,
							       td,
							       nbp,
							       nsa);
				if (nerror != 0) {
					pbrelvp(nbp);
					relpbuf(nbp, &ffsrawbufcnt);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) { /* Deferred readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp, sa);
			if (error != 0)
				break;
		}
	}

	if (bp != NULL) {
		pbrelvp(bp);
		relpbuf(bp, &ffsrawbufcnt);
	}
	if (nbp != NULL) {			/* Run down readahead buffer */
		spl = splbio();
		bwait(nbp, PRIBIO, "rawrd");
		splx(spl);
		vunmapbuf(nbp);
		pbrelvp(nbp);
		relpbuf(nbp, &ffsrawbufcnt);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}


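/*
 * Entry point for raw reads.  The fast path is taken only for
 * sector-aligned reads of a single userspace iovec on vnodes whose
 * data has been synced to disk; anything else (including the tail of a
 * file that ends in a partial block) is left to the normal buffered
 * read path.  *workdone reports whether the request was fully handled
 * here.
 */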
int
ffs_rawread(struct vnode *vp,
	    struct uio *uio,
	    int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
	     TDP_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;

		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ip->i_devvp->v_bufobj.bo_bsize;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {

			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp,
						 (uio->uio_td != NULL) ?
						 uio->uio_td : curthread);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				partialbytes = ((unsigned int) ip->i_size) %
					ip->i_fs->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
						blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}