/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25112694Stegge */ 26112694Stegge 27116192Sobrien#include <sys/cdefs.h> 28116192Sobrien__FBSDID("$FreeBSD$"); 29116192Sobrien 30112694Stegge#include <sys/param.h> 31112694Stegge#include <sys/systm.h> 32112694Stegge#include <sys/fcntl.h> 33112694Stegge#include <sys/file.h> 34112694Stegge#include <sys/stat.h> 35112694Stegge#include <sys/proc.h> 36114216Skan#include <sys/limits.h> 37112694Stegge#include <sys/mount.h> 38112694Stegge#include <sys/namei.h> 39112694Stegge#include <sys/vnode.h> 40112694Stegge#include <sys/conf.h> 41112694Stegge#include <sys/filio.h> 42112694Stegge#include <sys/ttycom.h> 43112694Stegge#include <sys/bio.h> 44112694Stegge#include <sys/buf.h> 45248084Sattilio#include <sys/rwlock.h> 46118986Salc#include <ufs/ufs/extattr.h> 47112694Stegge#include <ufs/ufs/quota.h> 48112694Stegge#include <ufs/ufs/inode.h> 49118986Salc#include <ufs/ufs/ufsmount.h> 50119088Salc#include <ufs/ufs/ufs_extern.h> 51112694Stegge#include <ufs/ffs/fs.h> 52141522Sphk#include <ufs/ffs/ffs_extern.h> 53112694Stegge 54112694Stegge#include <vm/vm.h> 55112694Stegge#include <vm/vm_extern.h> 56112694Stegge#include <vm/vm_object.h> 57112694Stegge#include <sys/kernel.h> 58112694Stegge#include <sys/sysctl.h> 59112694Stegge 60112694Steggestatic int ffs_rawread_readahead(struct vnode *vp, 61112694Stegge caddr_t udata, 62112694Stegge off_t offset, 63112694Stegge size_t len, 64112694Stegge struct thread *td, 65112694Stegge struct buf *bp, 66112694Stegge caddr_t sa); 67112694Steggestatic int ffs_rawread_main(struct vnode *vp, 68112694Stegge struct uio *uio); 69112694Stegge 70176559Sattiliostatic int ffs_rawread_sync(struct vnode *vp); 71112694Stegge 72112694Steggeint ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 73112694Stegge 74112694SteggeSYSCTL_DECL(_vfs_ffs); 75112694Stegge 76112694Steggestatic int ffsrawbufcnt = 4; 77112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 78112694Stegge "Buffers available for raw reads"); 79112694Stegge 
80112694Steggestatic int allowrawread = 1; 81112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 82112694Stegge "Flag to enable raw reads"); 83112694Stegge 84112694Steggestatic int rawreadahead = 1; 85112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 86112694Stegge "Flag to enable readahead for long raw reads"); 87112694Stegge 88267494Skibstatic void 89267494Skibffs_rawread_setup(void *arg __unused) 90267494Skib{ 91112694Stegge 92112694Stegge ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 93112694Stegge} 94267494SkibSYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 95112694Stegge 96112694Steggestatic int 97176559Sattilioffs_rawread_sync(struct vnode *vp) 98112694Stegge{ 99112694Stegge int error; 100112694Stegge int upgraded; 101136943Sphk struct bufobj *bo; 102156225Stegge struct mount *mp; 103200770Skib vm_object_t obj; 104112694Stegge 105112694Stegge /* Check for dirty mmap, pending writes and dirty buffers */ 106177493Sjeff bo = &vp->v_bufobj; 107177493Sjeff BO_LOCK(bo); 108112694Stegge VI_LOCK(vp); 109136943Sphk if (bo->bo_numoutput > 0 || 110136968Sphk bo->bo_dirty.bv_cnt > 0 || 111200770Skib ((obj = vp->v_object) != NULL && 112200770Skib (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) { 113112694Stegge VI_UNLOCK(vp); 114177493Sjeff BO_UNLOCK(bo); 115112694Stegge 116156225Stegge if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 117176559Sattilio if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 118156225Stegge upgraded = 1; 119156225Stegge else 120156225Stegge upgraded = 0; 121175294Sattilio VOP_UNLOCK(vp, 0); 122156225Stegge (void) vn_start_write(vp, &mp, V_WAIT); 123175294Sattilio VOP_LOCK(vp, LK_EXCLUSIVE); 124176559Sattilio } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 125112694Stegge upgraded = 1; 126112694Stegge /* Upgrade to exclusive lock, this might block */ 127175294Sattilio VOP_LOCK(vp, LK_UPGRADE); 128112694Stegge } else 129112694Stegge upgraded = 0; 
130112694Stegge 131112694Stegge 132158325Stegge VI_LOCK(vp); 133158325Stegge /* Check if vnode was reclaimed while unlocked. */ 134158325Stegge if ((vp->v_iflag & VI_DOOMED) != 0) { 135158325Stegge VI_UNLOCK(vp); 136158325Stegge if (upgraded != 0) 137175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 138158325Stegge vn_finished_write(mp); 139158325Stegge return (EIO); 140158325Stegge } 141112694Stegge /* Attempt to msync mmap() regions to clean dirty mmap */ 142200770Skib if ((obj = vp->v_object) != NULL && 143200770Skib (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 144112694Stegge VI_UNLOCK(vp); 145248084Sattilio VM_OBJECT_WLOCK(obj); 146200770Skib vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 147248084Sattilio VM_OBJECT_WUNLOCK(obj); 148177493Sjeff } else 149177493Sjeff VI_UNLOCK(vp); 150112694Stegge 151112694Stegge /* Wait for pending writes to complete */ 152177493Sjeff BO_LOCK(bo); 153136943Sphk error = bufobj_wwait(&vp->v_bufobj, 0, 0); 154136943Sphk if (error != 0) { 155136943Sphk /* XXX: can't happen with a zero timeout ??? 
*/ 156177493Sjeff BO_UNLOCK(bo); 157136943Sphk if (upgraded != 0) 158175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 159158325Stegge vn_finished_write(mp); 160136943Sphk return (error); 161112694Stegge } 162112694Stegge /* Flush dirty buffers */ 163136968Sphk if (bo->bo_dirty.bv_cnt > 0) { 164177493Sjeff BO_UNLOCK(bo); 165233438Smckusick if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 166112694Stegge if (upgraded != 0) 167175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 168158325Stegge vn_finished_write(mp); 169112694Stegge return (error); 170112694Stegge } 171177493Sjeff BO_LOCK(bo); 172136968Sphk if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 173112694Stegge panic("ffs_rawread_sync: dirty bufs"); 174112694Stegge } 175177493Sjeff BO_UNLOCK(bo); 176112694Stegge if (upgraded != 0) 177175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 178156225Stegge vn_finished_write(mp); 179112694Stegge } else { 180112694Stegge VI_UNLOCK(vp); 181177493Sjeff BO_UNLOCK(bo); 182112694Stegge } 183112694Stegge return 0; 184112694Stegge} 185112694Stegge 186112694Stegge 187112694Steggestatic int 188112694Steggeffs_rawread_readahead(struct vnode *vp, 189112694Stegge caddr_t udata, 190112694Stegge off_t offset, 191112694Stegge size_t len, 192112694Stegge struct thread *td, 193112694Stegge struct buf *bp, 194112694Stegge caddr_t sa) 195112694Stegge{ 196112694Stegge int error; 197112694Stegge u_int iolen; 198112694Stegge off_t blockno; 199112694Stegge int blockoff; 200112694Stegge int bsize; 201112694Stegge struct vnode *dp; 202112694Stegge int bforwards; 203119049Sphk struct inode *ip; 204119049Sphk ufs2_daddr_t blkno; 205112694Stegge 206112694Stegge bsize = vp->v_mount->mnt_stat.f_iosize; 207112694Stegge 208119049Sphk ip = VTOI(vp); 209119049Sphk dp = ip->i_devvp; 210119049Sphk 211112694Stegge iolen = ((vm_offset_t) udata) & PAGE_MASK; 212112694Stegge bp->b_bcount = len; 213112694Stegge if (bp->b_bcount + iolen > bp->b_kvasize) { 214112694Stegge bp->b_bcount = bp->b_kvasize; 215112694Stegge if 
(iolen != 0) 216112694Stegge bp->b_bcount -= PAGE_SIZE; 217112694Stegge } 218122747Sphk bp->b_flags = 0; /* XXX necessary ? */ 219112694Stegge bp->b_iocmd = BIO_READ; 220145702Sjeff bp->b_iodone = bdone; 221112694Stegge bp->b_data = udata; 222112694Stegge bp->b_saveaddr = sa; 223121354Stegge blockno = offset / bsize; 224121354Stegge blockoff = (offset % bsize) / DEV_BSIZE; 225112694Stegge if ((daddr_t) blockno != blockno) { 226112694Stegge return EINVAL; /* blockno overflow */ 227112694Stegge } 228112694Stegge 229112694Stegge bp->b_lblkno = bp->b_blkno = blockno; 230112694Stegge 231119049Sphk error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 232119049Sphk if (error != 0) 233112694Stegge return error; 234121354Stegge if (blkno == -1) { 235112694Stegge 236112694Stegge /* Fill holes with NULs to preserve semantics */ 237112694Stegge 238112694Stegge if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 239112694Stegge bp->b_bcount = bsize - blockoff * DEV_BSIZE; 240112694Stegge bp->b_bufsize = bp->b_bcount; 241112694Stegge 242248515Skib if (vmapbuf(bp, 1) < 0) 243112694Stegge return EFAULT; 244112694Stegge 245218195Smdf maybe_yield(); 246112694Stegge bzero(bp->b_data, bp->b_bufsize); 247112694Stegge 248112694Stegge /* Mark operation completed (similar to bufdone()) */ 249112694Stegge 250112694Stegge bp->b_resid = 0; 251112694Stegge bp->b_flags |= B_DONE; 252112694Stegge return 0; 253112694Stegge } 254121354Stegge bp->b_blkno = blkno + blockoff; 255121354Stegge bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 256112694Stegge 257112694Stegge if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 258112694Stegge bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 259112694Stegge bp->b_bufsize = bp->b_bcount; 260112694Stegge 261248515Skib if (vmapbuf(bp, 1) < 0) 262112694Stegge return EFAULT; 263112694Stegge 264140051Sphk BO_STRATEGY(&dp->v_bufobj, bp); 265112694Stegge return 0; 266112694Stegge} 267112694Stegge 
268112694Stegge 269112694Steggestatic int 270112694Steggeffs_rawread_main(struct vnode *vp, 271112694Stegge struct uio *uio) 272112694Stegge{ 273112694Stegge int error, nerror; 274112694Stegge struct buf *bp, *nbp, *tbp; 275112694Stegge caddr_t sa, nsa, tsa; 276112694Stegge u_int iolen; 277112694Stegge int spl; 278112694Stegge caddr_t udata; 279112694Stegge long resid; 280112694Stegge off_t offset; 281112694Stegge struct thread *td; 282112694Stegge 283112694Stegge td = uio->uio_td ? uio->uio_td : curthread; 284112694Stegge udata = uio->uio_iov->iov_base; 285112694Stegge resid = uio->uio_resid; 286112694Stegge offset = uio->uio_offset; 287112694Stegge 288112694Stegge /* 289112694Stegge * keep the process from being swapped 290112694Stegge */ 291112694Stegge PHOLD(td->td_proc); 292112694Stegge 293112694Stegge error = 0; 294112694Stegge nerror = 0; 295112694Stegge 296112694Stegge bp = NULL; 297112694Stegge nbp = NULL; 298112694Stegge sa = NULL; 299112694Stegge nsa = NULL; 300112694Stegge 301112694Stegge while (resid > 0) { 302112694Stegge 303112694Stegge if (bp == NULL) { /* Setup first read */ 304112694Stegge /* XXX: Leave some bufs for swap */ 305112694Stegge bp = getpbuf(&ffsrawbufcnt); 306112694Stegge sa = bp->b_data; 307166506Stegge pbgetvp(vp, bp); 308112694Stegge error = ffs_rawread_readahead(vp, udata, offset, 309112694Stegge resid, td, bp, sa); 310112694Stegge if (error != 0) 311112694Stegge break; 312112694Stegge 313112694Stegge if (resid > bp->b_bufsize) { /* Setup fist readahead */ 314112694Stegge /* XXX: Leave bufs for swap */ 315112694Stegge if (rawreadahead != 0) 316112694Stegge nbp = trypbuf(&ffsrawbufcnt); 317112694Stegge else 318112694Stegge nbp = NULL; 319112694Stegge if (nbp != NULL) { 320112694Stegge nsa = nbp->b_data; 321166506Stegge pbgetvp(vp, nbp); 322112694Stegge 323112694Stegge nerror = ffs_rawread_readahead(vp, 324112694Stegge udata + 325112694Stegge bp->b_bufsize, 326112694Stegge offset + 327112694Stegge bp->b_bufsize, 328112694Stegge 
resid - 329112694Stegge bp->b_bufsize, 330112694Stegge td, 331112694Stegge nbp, 332112694Stegge nsa); 333112694Stegge if (nerror) { 334166506Stegge pbrelvp(nbp); 335112694Stegge relpbuf(nbp, &ffsrawbufcnt); 336112694Stegge nbp = NULL; 337112694Stegge } 338112694Stegge } 339112694Stegge } 340112694Stegge } 341112694Stegge 342112694Stegge spl = splbio(); 343112718Stegge bwait(bp, PRIBIO, "rawrd"); 344112694Stegge splx(spl); 345112694Stegge 346112694Stegge vunmapbuf(bp); 347112694Stegge 348112694Stegge iolen = bp->b_bcount - bp->b_resid; 349112694Stegge if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 350112694Stegge nerror = 0; /* Ignore possible beyond EOF error */ 351112694Stegge break; /* EOF */ 352112694Stegge } 353112694Stegge 354112694Stegge if ((bp->b_ioflags & BIO_ERROR) != 0) { 355112694Stegge error = bp->b_error; 356112694Stegge break; 357112694Stegge } 358112694Stegge resid -= iolen; 359112694Stegge udata += iolen; 360112694Stegge offset += iolen; 361112694Stegge if (iolen < bp->b_bufsize) { 362112694Stegge /* Incomplete read. 
Try to read remaining part */ 363112694Stegge error = ffs_rawread_readahead(vp, 364112694Stegge udata, 365112694Stegge offset, 366112694Stegge bp->b_bufsize - iolen, 367112694Stegge td, 368112694Stegge bp, 369112694Stegge sa); 370112694Stegge if (error != 0) 371112694Stegge break; 372112694Stegge } else if (nbp != NULL) { /* Complete read with readahead */ 373112694Stegge 374112694Stegge tbp = bp; 375112694Stegge bp = nbp; 376112694Stegge nbp = tbp; 377112694Stegge 378112694Stegge tsa = sa; 379112694Stegge sa = nsa; 380112694Stegge nsa = tsa; 381112694Stegge 382112694Stegge if (resid <= bp->b_bufsize) { /* No more readaheads */ 383166506Stegge pbrelvp(nbp); 384112694Stegge relpbuf(nbp, &ffsrawbufcnt); 385112694Stegge nbp = NULL; 386112694Stegge } else { /* Setup next readahead */ 387112694Stegge nerror = ffs_rawread_readahead(vp, 388112694Stegge udata + 389112694Stegge bp->b_bufsize, 390112694Stegge offset + 391112694Stegge bp->b_bufsize, 392112694Stegge resid - 393112694Stegge bp->b_bufsize, 394112694Stegge td, 395112694Stegge nbp, 396112694Stegge nsa); 397112694Stegge if (nerror != 0) { 398166506Stegge pbrelvp(nbp); 399112694Stegge relpbuf(nbp, &ffsrawbufcnt); 400112694Stegge nbp = NULL; 401112694Stegge } 402112694Stegge } 403112694Stegge } else if (nerror != 0) {/* Deferred Readahead error */ 404112694Stegge break; 405112694Stegge } else if (resid > 0) { /* More to read, no readahead */ 406112694Stegge error = ffs_rawread_readahead(vp, udata, offset, 407112694Stegge resid, td, bp, sa); 408112694Stegge if (error != 0) 409112694Stegge break; 410112694Stegge } 411112694Stegge } 412112694Stegge 413166506Stegge if (bp != NULL) { 414166506Stegge pbrelvp(bp); 415112694Stegge relpbuf(bp, &ffsrawbufcnt); 416166506Stegge } 417112694Stegge if (nbp != NULL) { /* Run down readahead buffer */ 418112694Stegge spl = splbio(); 419112718Stegge bwait(nbp, PRIBIO, "rawrd"); 420112694Stegge splx(spl); 421112694Stegge vunmapbuf(nbp); 422166506Stegge pbrelvp(nbp); 423112694Stegge 
relpbuf(nbp, &ffsrawbufcnt); 424112694Stegge } 425112694Stegge 426112694Stegge if (error == 0) 427112694Stegge error = nerror; 428112694Stegge PRELE(td->td_proc); 429112694Stegge uio->uio_iov->iov_base = udata; 430112694Stegge uio->uio_resid = resid; 431112694Stegge uio->uio_offset = offset; 432112694Stegge return error; 433112694Stegge} 434112694Stegge 435112694Stegge 436112694Steggeint 437112694Steggeffs_rawread(struct vnode *vp, 438112694Stegge struct uio *uio, 439112694Stegge int *workdone) 440112694Stegge{ 441112694Stegge if (allowrawread != 0 && 442112694Stegge uio->uio_iovcnt == 1 && 443112694Stegge uio->uio_segflg == UIO_USERSPACE && 444112694Stegge uio->uio_resid == uio->uio_iov->iov_len && 445130023Stjr (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 446130023Stjr TDP_DEADLKTREAT) == 0) { 447112694Stegge int secsize; /* Media sector size */ 448112694Stegge off_t filebytes; /* Bytes left of file */ 449112694Stegge int blockbytes; /* Bytes left of file in full blocks */ 450112694Stegge int partialbytes; /* Bytes in last partial block */ 451112694Stegge int skipbytes; /* Bytes not to read in ffs_rawread */ 452112694Stegge struct inode *ip; 453112694Stegge int error; 454112694Stegge 455112694Stegge 456112694Stegge /* Only handle sector aligned reads */ 457112694Stegge ip = VTOI(vp); 458136966Sphk secsize = ip->i_devvp->v_bufobj.bo_bsize; 459112694Stegge if ((uio->uio_offset & (secsize - 1)) == 0 && 460112694Stegge (uio->uio_resid & (secsize - 1)) == 0) { 461112694Stegge 462112694Stegge /* Sync dirty pages and buffers if needed */ 463176559Sattilio error = ffs_rawread_sync(vp); 464112694Stegge if (error != 0) 465112694Stegge return error; 466112694Stegge 467112694Stegge /* Check for end of file */ 468112694Stegge if (ip->i_size > uio->uio_offset) { 469112694Stegge filebytes = ip->i_size - uio->uio_offset; 470112694Stegge 471112694Stegge /* No special eof handling needed ? 
*/ 472112694Stegge if (uio->uio_resid <= filebytes) { 473112694Stegge *workdone = 1; 474112694Stegge return ffs_rawread_main(vp, uio); 475112694Stegge } 476112694Stegge 477112694Stegge partialbytes = ((unsigned int) ip->i_size) % 478112694Stegge ip->i_fs->fs_bsize; 479112694Stegge blockbytes = (int) filebytes - partialbytes; 480112694Stegge if (blockbytes > 0) { 481112694Stegge skipbytes = uio->uio_resid - 482112694Stegge blockbytes; 483112694Stegge uio->uio_resid = blockbytes; 484112694Stegge error = ffs_rawread_main(vp, uio); 485112694Stegge uio->uio_resid += skipbytes; 486112694Stegge if (error != 0) 487112694Stegge return error; 488112694Stegge /* Read remaining part using buffer */ 489112694Stegge } 490112694Stegge } 491112694Stegge } 492112694Stegge } 493112694Stegge *workdone = 0; 494112694Stegge return 0; 495112694Stegge} 496