ffs_rawread.c revision 166506
1112694Stegge/*- 2112694Stegge * Copyright (c) 2000-2003 Tor Egge 3112694Stegge * All rights reserved. 4112694Stegge * 5112694Stegge * Redistribution and use in source and binary forms, with or without 6112694Stegge * modification, are permitted provided that the following conditions 7112694Stegge * are met: 8112694Stegge * 1. Redistributions of source code must retain the above copyright 9112694Stegge * notice, this list of conditions and the following disclaimer. 10112694Stegge * 2. Redistributions in binary form must reproduce the above copyright 11112694Stegge * notice, this list of conditions and the following disclaimer in the 12112694Stegge * documentation and/or other materials provided with the distribution. 13112694Stegge * 14112694Stegge * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15112694Stegge * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16112694Stegge * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17112694Stegge * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18112694Stegge * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19112694Stegge * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20112694Stegge * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21112694Stegge * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22112694Stegge * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23112694Stegge * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24112694Stegge * SUCH DAMAGE. 25112694Stegge */ 26112694Stegge 27116192Sobrien#include <sys/cdefs.h> 28116192Sobrien__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_rawread.c 166506 2007-02-04 23:42:02Z tegge $"); 29116192Sobrien 30112694Stegge#include <sys/param.h> 31112694Stegge#include <sys/systm.h> 32112694Stegge#include <sys/fcntl.h> 33112694Stegge#include <sys/file.h> 34112694Stegge#include <sys/stat.h> 35112694Stegge#include <sys/proc.h> 36114216Skan#include <sys/limits.h> 37112694Stegge#include <sys/mount.h> 38112694Stegge#include <sys/namei.h> 39112694Stegge#include <sys/vnode.h> 40112694Stegge#include <sys/conf.h> 41112694Stegge#include <sys/filio.h> 42112694Stegge#include <sys/ttycom.h> 43112694Stegge#include <sys/bio.h> 44112694Stegge#include <sys/buf.h> 45118986Salc#include <ufs/ufs/extattr.h> 46112694Stegge#include <ufs/ufs/quota.h> 47112694Stegge#include <ufs/ufs/inode.h> 48118986Salc#include <ufs/ufs/ufsmount.h> 49119088Salc#include <ufs/ufs/ufs_extern.h> 50112694Stegge#include <ufs/ffs/fs.h> 51141522Sphk#include <ufs/ffs/ffs_extern.h> 52112694Stegge 53112694Stegge#include <vm/vm.h> 54112694Stegge#include <vm/vm_extern.h> 55112694Stegge#include <vm/vm_object.h> 56112694Stegge#include <sys/kernel.h> 57112694Stegge#include <sys/sysctl.h> 58112694Stegge 59112694Steggestatic int ffs_rawread_readahead(struct vnode *vp, 60112694Stegge caddr_t udata, 61112694Stegge off_t offset, 62112694Stegge size_t len, 63112694Stegge struct thread *td, 64112694Stegge struct buf *bp, 65112694Stegge caddr_t sa); 66112694Steggestatic int ffs_rawread_main(struct vnode *vp, 67112694Stegge struct uio *uio); 68112694Stegge 69112694Steggestatic int ffs_rawread_sync(struct vnode *vp, struct thread *td); 70112694Stegge 71112694Steggeint ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 72112694Stegge 73112694Steggevoid ffs_rawread_setup(void); 74112694Stegge 75112694SteggeSYSCTL_DECL(_vfs_ffs); 76112694Stegge 77112694Steggestatic int ffsrawbufcnt = 4; 78112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 79112694Stegge "Buffers available for raw reads"); 80112694Stegge 81112694Steggestatic int allowrawread = 1; 82112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 83112694Stegge "Flag to enable raw reads"); 84112694Stegge 85112694Steggestatic int rawreadahead = 1; 86112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 87112694Stegge "Flag to enable readahead for long raw reads"); 88112694Stegge 89112694Stegge 90112694Steggevoid 91112694Steggeffs_rawread_setup(void) 92112694Stegge{ 93112694Stegge ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 94112694Stegge} 95112694Stegge 96112694Stegge 97112694Steggestatic int 98112694Steggeffs_rawread_sync(struct vnode *vp, struct thread *td) 99112694Stegge{ 100112694Stegge int spl; 101112694Stegge int error; 102112694Stegge int upgraded; 103136943Sphk struct bufobj *bo; 104156225Stegge struct mount *mp; 105112694Stegge 106112694Stegge /* Check for dirty mmap, pending writes and dirty buffers */ 107112694Stegge spl = splbio(); 108112694Stegge VI_LOCK(vp); 109136943Sphk bo = &vp->v_bufobj; 110136943Sphk if (bo->bo_numoutput > 0 || 111136968Sphk bo->bo_dirty.bv_cnt > 0 || 112112694Stegge (vp->v_iflag & VI_OBJDIRTY) != 0) { 113112694Stegge splx(spl); 114112694Stegge VI_UNLOCK(vp); 115112694Stegge 116156225Stegge if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 117156225Stegge if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) 118156225Stegge upgraded = 1; 119156225Stegge else 120156225Stegge upgraded = 0; 121156225Stegge VOP_UNLOCK(vp, 0, td); 122156225Stegge (void) vn_start_write(vp, &mp, V_WAIT); 123156225Stegge VOP_LOCK(vp, LK_EXCLUSIVE, td); 124156225Stegge } else if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) { 125112694Stegge upgraded = 1; 126112694Stegge /* Upgrade to exclusive lock, this might block */ 127144367Sjeff VOP_LOCK(vp, LK_UPGRADE, td); 128112694Stegge } else 129112694Stegge upgraded = 0; 130112694Stegge 131112694Stegge 132158325Stegge VI_LOCK(vp); 133158325Stegge /* Check if vnode was reclaimed while unlocked. */ 134158325Stegge if ((vp->v_iflag & VI_DOOMED) != 0) { 135158325Stegge VI_UNLOCK(vp); 136158325Stegge if (upgraded != 0) 137158325Stegge VOP_LOCK(vp, LK_DOWNGRADE, td); 138158325Stegge vn_finished_write(mp); 139158325Stegge return (EIO); 140158325Stegge } 141112694Stegge /* Attempt to msync mmap() regions to clean dirty mmap */ 142112694Stegge if ((vp->v_iflag & VI_OBJDIRTY) != 0) { 143112694Stegge VI_UNLOCK(vp); 144140782Sphk if (vp->v_object != NULL) { 145140782Sphk VM_OBJECT_LOCK(vp->v_object); 146140782Sphk vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); 147140782Sphk VM_OBJECT_UNLOCK(vp->v_object); 148115145Salc } 149112694Stegge VI_LOCK(vp); 150112694Stegge } 151112694Stegge 152112694Stegge /* Wait for pending writes to complete */ 153112694Stegge spl = splbio(); 154136943Sphk error = bufobj_wwait(&vp->v_bufobj, 0, 0); 155136943Sphk if (error != 0) { 156136943Sphk /* XXX: can't happen with a zero timeout ??? */ 157136943Sphk splx(spl); 158136943Sphk VI_UNLOCK(vp); 159136943Sphk if (upgraded != 0) 160136943Sphk VOP_LOCK(vp, LK_DOWNGRADE, td); 161158325Stegge vn_finished_write(mp); 162136943Sphk return (error); 163112694Stegge } 164112694Stegge /* Flush dirty buffers */ 165136968Sphk if (bo->bo_dirty.bv_cnt > 0) { 166112694Stegge splx(spl); 167112694Stegge VI_UNLOCK(vp); 168141522Sphk if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) { 169112694Stegge if (upgraded != 0) 170112694Stegge VOP_LOCK(vp, LK_DOWNGRADE, td); 171158325Stegge vn_finished_write(mp); 172112694Stegge return (error); 173112694Stegge } 174112694Stegge VI_LOCK(vp); 175112694Stegge spl = splbio(); 176136968Sphk if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 177112694Stegge panic("ffs_rawread_sync: dirty bufs"); 178112694Stegge } 179112694Stegge splx(spl); 180112694Stegge VI_UNLOCK(vp); 181112694Stegge if (upgraded != 0) 182112694Stegge VOP_LOCK(vp, LK_DOWNGRADE, td); 183156225Stegge vn_finished_write(mp); 184112694Stegge } else { 185112694Stegge splx(spl); 186112694Stegge VI_UNLOCK(vp); 187112694Stegge } 188112694Stegge return 0; 189112694Stegge} 190112694Stegge 191112694Stegge 192112694Steggestatic int 193112694Steggeffs_rawread_readahead(struct vnode *vp, 194112694Stegge caddr_t udata, 195112694Stegge off_t offset, 196112694Stegge size_t len, 197112694Stegge struct thread *td, 198112694Stegge struct buf *bp, 199112694Stegge caddr_t sa) 200112694Stegge{ 201112694Stegge int error; 202112694Stegge u_int iolen; 203112694Stegge off_t blockno; 204112694Stegge int blockoff; 205112694Stegge int bsize; 206112694Stegge struct vnode *dp; 207112694Stegge int bforwards; 208119049Sphk struct inode *ip; 209119049Sphk ufs2_daddr_t blkno; 210112694Stegge 211112694Stegge bsize = vp->v_mount->mnt_stat.f_iosize; 212112694Stegge 213119049Sphk ip = VTOI(vp); 214119049Sphk dp = ip->i_devvp; 215119049Sphk 216112694Stegge iolen = ((vm_offset_t) udata) & PAGE_MASK; 217112694Stegge bp->b_bcount = len; 218112694Stegge if (bp->b_bcount + iolen > bp->b_kvasize) { 219112694Stegge bp->b_bcount = bp->b_kvasize; 220112694Stegge if (iolen != 0) 221112694Stegge bp->b_bcount -= PAGE_SIZE; 222112694Stegge } 223122747Sphk bp->b_flags = 0; /* XXX necessary ? */ 224112694Stegge bp->b_iocmd = BIO_READ; 225145702Sjeff bp->b_iodone = bdone; 226112694Stegge bp->b_data = udata; 227112694Stegge bp->b_saveaddr = sa; 228121354Stegge blockno = offset / bsize; 229121354Stegge blockoff = (offset % bsize) / DEV_BSIZE; 230112694Stegge if ((daddr_t) blockno != blockno) { 231112694Stegge return EINVAL; /* blockno overflow */ 232112694Stegge } 233112694Stegge 234112694Stegge bp->b_lblkno = bp->b_blkno = blockno; 235112694Stegge 236119049Sphk error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 237119049Sphk if (error != 0) 238112694Stegge return error; 239121354Stegge if (blkno == -1) { 240112694Stegge 241112694Stegge /* Fill holes with NULs to preserve semantics */ 242112694Stegge 243112694Stegge if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 244112694Stegge bp->b_bcount = bsize - blockoff * DEV_BSIZE; 245112694Stegge bp->b_bufsize = bp->b_bcount; 246112694Stegge 247112694Stegge if (vmapbuf(bp) < 0) 248112694Stegge return EFAULT; 249112694Stegge 250112694Stegge if (ticks - PCPU_GET(switchticks) >= hogticks) 251112694Stegge uio_yield(); 252112694Stegge bzero(bp->b_data, bp->b_bufsize); 253112694Stegge 254112694Stegge /* Mark operation completed (similar to bufdone()) */ 255112694Stegge 256112694Stegge bp->b_resid = 0; 257112694Stegge bp->b_flags |= B_DONE; 258112694Stegge return 0; 259112694Stegge } 260121354Stegge bp->b_blkno = blkno + blockoff; 261121354Stegge bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 262112694Stegge 263112694Stegge if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 264112694Stegge bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 265112694Stegge bp->b_bufsize = bp->b_bcount; 266112694Stegge 267112724Stegge if (vmapbuf(bp) < 0) 268112694Stegge return EFAULT; 269112694Stegge 270140051Sphk BO_STRATEGY(&dp->v_bufobj, bp); 271112694Stegge return 0; 272112694Stegge} 273112694Stegge 274112694Stegge 275112694Steggestatic int 276112694Steggeffs_rawread_main(struct vnode *vp, 277112694Stegge struct uio *uio) 278112694Stegge{ 279112694Stegge int error, nerror; 280112694Stegge struct buf *bp, *nbp, *tbp; 281112694Stegge caddr_t sa, nsa, tsa; 282112694Stegge u_int iolen; 283112694Stegge int spl; 284112694Stegge caddr_t udata; 285112694Stegge long resid; 286112694Stegge off_t offset; 287112694Stegge struct thread *td; 288112694Stegge 289112694Stegge td = uio->uio_td ? uio->uio_td : curthread; 290112694Stegge udata = uio->uio_iov->iov_base; 291112694Stegge resid = uio->uio_resid; 292112694Stegge offset = uio->uio_offset; 293112694Stegge 294112694Stegge /* 295112694Stegge * keep the process from being swapped 296112694Stegge */ 297112694Stegge PHOLD(td->td_proc); 298112694Stegge 299112694Stegge error = 0; 300112694Stegge nerror = 0; 301112694Stegge 302112694Stegge bp = NULL; 303112694Stegge nbp = NULL; 304112694Stegge sa = NULL; 305112694Stegge nsa = NULL; 306112694Stegge 307112694Stegge while (resid > 0) { 308112694Stegge 309112694Stegge if (bp == NULL) { /* Setup first read */ 310112694Stegge /* XXX: Leave some bufs for swap */ 311112694Stegge bp = getpbuf(&ffsrawbufcnt); 312112694Stegge sa = bp->b_data; 313166506Stegge pbgetvp(vp, bp); 314112694Stegge error = ffs_rawread_readahead(vp, udata, offset, 315112694Stegge resid, td, bp, sa); 316112694Stegge if (error != 0) 317112694Stegge break; 318112694Stegge 319112694Stegge if (resid > bp->b_bufsize) { /* Setup fist readahead */ 320112694Stegge /* XXX: Leave bufs for swap */ 321112694Stegge if (rawreadahead != 0) 322112694Stegge nbp = trypbuf(&ffsrawbufcnt); 323112694Stegge else 324112694Stegge nbp = NULL; 325112694Stegge if (nbp != NULL) { 326112694Stegge nsa = nbp->b_data; 327166506Stegge pbgetvp(vp, nbp); 328112694Stegge 329112694Stegge nerror = ffs_rawread_readahead(vp, 330112694Stegge udata + 331112694Stegge bp->b_bufsize, 332112694Stegge offset + 333112694Stegge bp->b_bufsize, 334112694Stegge resid - 335112694Stegge bp->b_bufsize, 336112694Stegge td, 337112694Stegge nbp, 338112694Stegge nsa); 339112694Stegge if (nerror) { 340166506Stegge pbrelvp(nbp); 341112694Stegge relpbuf(nbp, &ffsrawbufcnt); 342112694Stegge nbp = NULL; 343112694Stegge } 344112694Stegge } 345112694Stegge } 346112694Stegge } 347112694Stegge 348112694Stegge spl = splbio(); 349112718Stegge bwait(bp, PRIBIO, "rawrd"); 350112694Stegge splx(spl); 351112694Stegge 352112694Stegge vunmapbuf(bp); 353112694Stegge 354112694Stegge iolen = bp->b_bcount - bp->b_resid; 355112694Stegge if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 356112694Stegge nerror = 0; /* Ignore possible beyond EOF error */ 357112694Stegge break; /* EOF */ 358112694Stegge } 359112694Stegge 360112694Stegge if ((bp->b_ioflags & BIO_ERROR) != 0) { 361112694Stegge error = bp->b_error; 362112694Stegge break; 363112694Stegge } 364112694Stegge resid -= iolen; 365112694Stegge udata += iolen; 366112694Stegge offset += iolen; 367112694Stegge if (iolen < bp->b_bufsize) { 368112694Stegge /* Incomplete read. Try to read remaining part */ 369112694Stegge error = ffs_rawread_readahead(vp, 370112694Stegge udata, 371112694Stegge offset, 372112694Stegge bp->b_bufsize - iolen, 373112694Stegge td, 374112694Stegge bp, 375112694Stegge sa); 376112694Stegge if (error != 0) 377112694Stegge break; 378112694Stegge } else if (nbp != NULL) { /* Complete read with readahead */ 379112694Stegge 380112694Stegge tbp = bp; 381112694Stegge bp = nbp; 382112694Stegge nbp = tbp; 383112694Stegge 384112694Stegge tsa = sa; 385112694Stegge sa = nsa; 386112694Stegge nsa = tsa; 387112694Stegge 388112694Stegge if (resid <= bp->b_bufsize) { /* No more readaheads */ 389166506Stegge pbrelvp(nbp); 390112694Stegge relpbuf(nbp, &ffsrawbufcnt); 391112694Stegge nbp = NULL; 392112694Stegge } else { /* Setup next readahead */ 393112694Stegge nerror = ffs_rawread_readahead(vp, 394112694Stegge udata + 395112694Stegge bp->b_bufsize, 396112694Stegge offset + 397112694Stegge bp->b_bufsize, 398112694Stegge resid - 399112694Stegge bp->b_bufsize, 400112694Stegge td, 401112694Stegge nbp, 402112694Stegge nsa); 403112694Stegge if (nerror != 0) { 404166506Stegge pbrelvp(nbp); 405112694Stegge relpbuf(nbp, &ffsrawbufcnt); 406112694Stegge nbp = NULL; 407112694Stegge } 408112694Stegge } 409112694Stegge } else if (nerror != 0) {/* Deferred Readahead error */ 410112694Stegge break; 411112694Stegge } else if (resid > 0) { /* More to read, no readahead */ 412112694Stegge error = ffs_rawread_readahead(vp, udata, offset, 413112694Stegge resid, td, bp, sa); 414112694Stegge if (error != 0) 415112694Stegge break; 416112694Stegge } 417112694Stegge } 418112694Stegge 419166506Stegge if (bp != NULL) { 420166506Stegge pbrelvp(bp); 421112694Stegge relpbuf(bp, &ffsrawbufcnt); 422166506Stegge } 423112694Stegge if (nbp != NULL) { /* Run down readahead buffer */ 424112694Stegge spl = splbio(); 425112718Stegge bwait(nbp, PRIBIO, "rawrd"); 426112694Stegge splx(spl); 427112694Stegge vunmapbuf(nbp); 428166506Stegge pbrelvp(nbp); 429112694Stegge relpbuf(nbp, &ffsrawbufcnt); 430112694Stegge } 431112694Stegge 432112694Stegge if (error == 0) 433112694Stegge error = nerror; 434112694Stegge PRELE(td->td_proc); 435112694Stegge uio->uio_iov->iov_base = udata; 436112694Stegge uio->uio_resid = resid; 437112694Stegge uio->uio_offset = offset; 438112694Stegge return error; 439112694Stegge} 440112694Stegge 441112694Stegge 442112694Steggeint 443112694Steggeffs_rawread(struct vnode *vp, 444112694Stegge struct uio *uio, 445112694Stegge int *workdone) 446112694Stegge{ 447112694Stegge if (allowrawread != 0 && 448112694Stegge uio->uio_iovcnt == 1 && 449112694Stegge uio->uio_segflg == UIO_USERSPACE && 450112694Stegge uio->uio_resid == uio->uio_iov->iov_len && 451130023Stjr (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 452130023Stjr TDP_DEADLKTREAT) == 0) { 453112694Stegge int secsize; /* Media sector size */ 454112694Stegge off_t filebytes; /* Bytes left of file */ 455112694Stegge int blockbytes; /* Bytes left of file in full blocks */ 456112694Stegge int partialbytes; /* Bytes in last partial block */ 457112694Stegge int skipbytes; /* Bytes not to read in ffs_rawread */ 458112694Stegge struct inode *ip; 459112694Stegge int error; 460112694Stegge 461112694Stegge 462112694Stegge /* Only handle sector aligned reads */ 463112694Stegge ip = VTOI(vp); 464136966Sphk secsize = ip->i_devvp->v_bufobj.bo_bsize; 465112694Stegge if ((uio->uio_offset & (secsize - 1)) == 0 && 466112694Stegge (uio->uio_resid & (secsize - 1)) == 0) { 467112694Stegge 468112694Stegge /* Sync dirty pages and buffers if needed */ 469112694Stegge error = ffs_rawread_sync(vp, 470112694Stegge (uio->uio_td != NULL) ? 471112694Stegge uio->uio_td : curthread); 472112694Stegge if (error != 0) 473112694Stegge return error; 474112694Stegge 475112694Stegge /* Check for end of file */ 476112694Stegge if (ip->i_size > uio->uio_offset) { 477112694Stegge filebytes = ip->i_size - uio->uio_offset; 478112694Stegge 479112694Stegge /* No special eof handling needed ? */ 480112694Stegge if (uio->uio_resid <= filebytes) { 481112694Stegge *workdone = 1; 482112694Stegge return ffs_rawread_main(vp, uio); 483112694Stegge } 484112694Stegge 485112694Stegge partialbytes = ((unsigned int) ip->i_size) % 486112694Stegge ip->i_fs->fs_bsize; 487112694Stegge blockbytes = (int) filebytes - partialbytes; 488112694Stegge if (blockbytes > 0) { 489112694Stegge skipbytes = uio->uio_resid - 490112694Stegge blockbytes; 491112694Stegge uio->uio_resid = blockbytes; 492112694Stegge error = ffs_rawread_main(vp, uio); 493112694Stegge uio->uio_resid += skipbytes; 494112694Stegge if (error != 0) 495112694Stegge return error; 496112694Stegge /* Read remaining part using buffer */ 497112694Stegge } 498112694Stegge } 499112694Stegge } 500112694Stegge } 501112694Stegge *workdone = 0; 502112694Stegge return 0; 503112694Stegge} 504