1112694Stegge/*- 2112694Stegge * Copyright (c) 2000-2003 Tor Egge 3112694Stegge * All rights reserved. 4112694Stegge * 5112694Stegge * Redistribution and use in source and binary forms, with or without 6112694Stegge * modification, are permitted provided that the following conditions 7112694Stegge * are met: 8112694Stegge * 1. Redistributions of source code must retain the above copyright 9112694Stegge * notice, this list of conditions and the following disclaimer. 10112694Stegge * 2. Redistributions in binary form must reproduce the above copyright 11112694Stegge * notice, this list of conditions and the following disclaimer in the 12112694Stegge * documentation and/or other materials provided with the distribution. 13112694Stegge * 14112694Stegge * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15112694Stegge * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16112694Stegge * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17112694Stegge * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18112694Stegge * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19112694Stegge * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20112694Stegge * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21112694Stegge * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22112694Stegge * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23112694Stegge * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24112694Stegge * SUCH DAMAGE. 25112694Stegge */ 26112694Stegge 27116192Sobrien#include <sys/cdefs.h> 28116192Sobrien__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_rawread.c 318267 2017-05-14 12:00:00Z kib $"); 29116192Sobrien 30112694Stegge#include <sys/param.h> 31112694Stegge#include <sys/systm.h> 32112694Stegge#include <sys/fcntl.h> 33112694Stegge#include <sys/file.h> 34112694Stegge#include <sys/stat.h> 35112694Stegge#include <sys/proc.h> 36114216Skan#include <sys/limits.h> 37112694Stegge#include <sys/mount.h> 38112694Stegge#include <sys/namei.h> 39112694Stegge#include <sys/vnode.h> 40112694Stegge#include <sys/conf.h> 41112694Stegge#include <sys/filio.h> 42112694Stegge#include <sys/ttycom.h> 43112694Stegge#include <sys/bio.h> 44112694Stegge#include <sys/buf.h> 45248084Sattilio#include <sys/rwlock.h> 46118986Salc#include <ufs/ufs/extattr.h> 47112694Stegge#include <ufs/ufs/quota.h> 48112694Stegge#include <ufs/ufs/inode.h> 49118986Salc#include <ufs/ufs/ufsmount.h> 50119088Salc#include <ufs/ufs/ufs_extern.h> 51112694Stegge#include <ufs/ffs/fs.h> 52141522Sphk#include <ufs/ffs/ffs_extern.h> 53112694Stegge 54112694Stegge#include <vm/vm.h> 55112694Stegge#include <vm/vm_extern.h> 56112694Stegge#include <vm/vm_object.h> 57112694Stegge#include <sys/kernel.h> 58112694Stegge#include <sys/sysctl.h> 59112694Stegge 60112694Steggestatic int ffs_rawread_readahead(struct vnode *vp, 61112694Stegge caddr_t udata, 62112694Stegge off_t offset, 63112694Stegge size_t len, 64112694Stegge struct thread *td, 65112694Stegge struct buf *bp, 66112694Stegge caddr_t sa); 67112694Steggestatic int ffs_rawread_main(struct vnode *vp, 68112694Stegge struct uio *uio); 69112694Stegge 70176559Sattiliostatic int ffs_rawread_sync(struct vnode *vp); 71112694Stegge 72112694Steggeint ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 73112694Stegge 74112694SteggeSYSCTL_DECL(_vfs_ffs); 75112694Stegge 76112694Steggestatic int ffsrawbufcnt = 4; 77112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 78112694Stegge "Buffers available for raw reads"); 79112694Stegge 80112694Steggestatic int allowrawread = 1; 81112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 82112694Stegge "Flag to enable raw reads"); 83112694Stegge 84112694Steggestatic int rawreadahead = 1; 85112694SteggeSYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 86112694Stegge "Flag to enable readahead for long raw reads"); 87112694Stegge 88267494Skibstatic void 89267494Skibffs_rawread_setup(void *arg __unused) 90267494Skib{ 91112694Stegge 92112694Stegge ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 93112694Stegge} 94267494SkibSYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 95112694Stegge 96112694Steggestatic int 97176559Sattilioffs_rawread_sync(struct vnode *vp) 98112694Stegge{ 99112694Stegge int error; 100112694Stegge int upgraded; 101136943Sphk struct bufobj *bo; 102156225Stegge struct mount *mp; 103200770Skib vm_object_t obj; 104112694Stegge 105112694Stegge /* Check for dirty mmap, pending writes and dirty buffers */ 106177493Sjeff bo = &vp->v_bufobj; 107177493Sjeff BO_LOCK(bo); 108112694Stegge VI_LOCK(vp); 109136943Sphk if (bo->bo_numoutput > 0 || 110136968Sphk bo->bo_dirty.bv_cnt > 0 || 111200770Skib ((obj = vp->v_object) != NULL && 112200770Skib (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) { 113112694Stegge VI_UNLOCK(vp); 114177493Sjeff BO_UNLOCK(bo); 115112694Stegge 116156225Stegge if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 117176559Sattilio if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 118156225Stegge upgraded = 1; 119156225Stegge else 120156225Stegge upgraded = 0; 121175294Sattilio VOP_UNLOCK(vp, 0); 122156225Stegge (void) vn_start_write(vp, &mp, V_WAIT); 123175294Sattilio VOP_LOCK(vp, LK_EXCLUSIVE); 124176559Sattilio } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 125112694Stegge upgraded = 1; 126112694Stegge /* Upgrade to exclusive lock, this might block */ 127175294Sattilio VOP_LOCK(vp, LK_UPGRADE); 128112694Stegge } else 129112694Stegge upgraded = 0; 130112694Stegge 131112694Stegge 132158325Stegge VI_LOCK(vp); 133158325Stegge /* Check if vnode was reclaimed while unlocked. */ 134158325Stegge if ((vp->v_iflag & VI_DOOMED) != 0) { 135158325Stegge VI_UNLOCK(vp); 136158325Stegge if (upgraded != 0) 137175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 138158325Stegge vn_finished_write(mp); 139158325Stegge return (EIO); 140158325Stegge } 141112694Stegge /* Attempt to msync mmap() regions to clean dirty mmap */ 142200770Skib if ((obj = vp->v_object) != NULL && 143200770Skib (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 144112694Stegge VI_UNLOCK(vp); 145248084Sattilio VM_OBJECT_WLOCK(obj); 146200770Skib vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 147248084Sattilio VM_OBJECT_WUNLOCK(obj); 148177493Sjeff } else 149177493Sjeff VI_UNLOCK(vp); 150112694Stegge 151112694Stegge /* Wait for pending writes to complete */ 152177493Sjeff BO_LOCK(bo); 153136943Sphk error = bufobj_wwait(&vp->v_bufobj, 0, 0); 154136943Sphk if (error != 0) { 155136943Sphk /* XXX: can't happen with a zero timeout ??? */ 156177493Sjeff BO_UNLOCK(bo); 157136943Sphk if (upgraded != 0) 158175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 159158325Stegge vn_finished_write(mp); 160136943Sphk return (error); 161112694Stegge } 162112694Stegge /* Flush dirty buffers */ 163136968Sphk if (bo->bo_dirty.bv_cnt > 0) { 164177493Sjeff BO_UNLOCK(bo); 165233438Smckusick if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 166112694Stegge if (upgraded != 0) 167175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 168158325Stegge vn_finished_write(mp); 169112694Stegge return (error); 170112694Stegge } 171177493Sjeff BO_LOCK(bo); 172136968Sphk if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 173112694Stegge panic("ffs_rawread_sync: dirty bufs"); 174112694Stegge } 175177493Sjeff BO_UNLOCK(bo); 176112694Stegge if (upgraded != 0) 177175294Sattilio VOP_LOCK(vp, LK_DOWNGRADE); 178156225Stegge vn_finished_write(mp); 179112694Stegge } else { 180112694Stegge VI_UNLOCK(vp); 181177493Sjeff BO_UNLOCK(bo); 182112694Stegge } 183112694Stegge return 0; 184112694Stegge} 185112694Stegge 186112694Stegge 187112694Steggestatic int 188112694Steggeffs_rawread_readahead(struct vnode *vp, 189112694Stegge caddr_t udata, 190112694Stegge off_t offset, 191112694Stegge size_t len, 192112694Stegge struct thread *td, 193112694Stegge struct buf *bp, 194112694Stegge caddr_t sa) 195112694Stegge{ 196112694Stegge int error; 197112694Stegge u_int iolen; 198112694Stegge off_t blockno; 199112694Stegge int blockoff; 200112694Stegge int bsize; 201112694Stegge struct vnode *dp; 202112694Stegge int bforwards; 203119049Sphk struct inode *ip; 204119049Sphk ufs2_daddr_t blkno; 205112694Stegge 206112694Stegge bsize = vp->v_mount->mnt_stat.f_iosize; 207112694Stegge 208119049Sphk ip = VTOI(vp); 209119049Sphk dp = ip->i_devvp; 210119049Sphk 211112694Stegge iolen = ((vm_offset_t) udata) & PAGE_MASK; 212112694Stegge bp->b_bcount = len; 213112694Stegge if (bp->b_bcount + iolen > bp->b_kvasize) { 214112694Stegge bp->b_bcount = bp->b_kvasize; 215112694Stegge if (iolen != 0) 216112694Stegge bp->b_bcount -= PAGE_SIZE; 217112694Stegge } 218122747Sphk bp->b_flags = 0; /* XXX necessary ? */ 219112694Stegge bp->b_iocmd = BIO_READ; 220145702Sjeff bp->b_iodone = bdone; 221112694Stegge bp->b_data = udata; 222112694Stegge bp->b_saveaddr = sa; 223121354Stegge blockno = offset / bsize; 224121354Stegge blockoff = (offset % bsize) / DEV_BSIZE; 225112694Stegge if ((daddr_t) blockno != blockno) { 226112694Stegge return EINVAL; /* blockno overflow */ 227112694Stegge } 228112694Stegge 229112694Stegge bp->b_lblkno = bp->b_blkno = blockno; 230112694Stegge 231119049Sphk error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 232119049Sphk if (error != 0) 233112694Stegge return error; 234121354Stegge if (blkno == -1) { 235112694Stegge 236112694Stegge /* Fill holes with NULs to preserve semantics */ 237112694Stegge 238112694Stegge if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 239112694Stegge bp->b_bcount = bsize - blockoff * DEV_BSIZE; 240112694Stegge bp->b_bufsize = bp->b_bcount; 241112694Stegge 242248515Skib if (vmapbuf(bp, 1) < 0) 243112694Stegge return EFAULT; 244112694Stegge 245218195Smdf maybe_yield(); 246112694Stegge bzero(bp->b_data, bp->b_bufsize); 247112694Stegge 248112694Stegge /* Mark operation completed (similar to bufdone()) */ 249112694Stegge 250112694Stegge bp->b_resid = 0; 251112694Stegge bp->b_flags |= B_DONE; 252112694Stegge return 0; 253112694Stegge } 254121354Stegge bp->b_blkno = blkno + blockoff; 255121354Stegge bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 256112694Stegge 257112694Stegge if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 258112694Stegge bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 259112694Stegge bp->b_bufsize = bp->b_bcount; 260112694Stegge 261248515Skib if (vmapbuf(bp, 1) < 0) 262112694Stegge return EFAULT; 263112694Stegge 264140051Sphk BO_STRATEGY(&dp->v_bufobj, bp); 265112694Stegge return 0; 266112694Stegge} 267112694Stegge 268112694Stegge 269112694Steggestatic int 270112694Steggeffs_rawread_main(struct vnode *vp, 271112694Stegge struct uio *uio) 272112694Stegge{ 273112694Stegge int error, nerror; 274112694Stegge struct buf *bp, *nbp, *tbp; 275112694Stegge caddr_t sa, nsa, tsa; 276112694Stegge u_int iolen; 277112694Stegge caddr_t udata; 278112694Stegge long resid; 279112694Stegge off_t offset; 280112694Stegge struct thread *td; 281112694Stegge 282112694Stegge td = uio->uio_td ? uio->uio_td : curthread; 283112694Stegge udata = uio->uio_iov->iov_base; 284112694Stegge resid = uio->uio_resid; 285112694Stegge offset = uio->uio_offset; 286112694Stegge 287112694Stegge /* 288112694Stegge * keep the process from being swapped 289112694Stegge */ 290112694Stegge PHOLD(td->td_proc); 291112694Stegge 292112694Stegge error = 0; 293112694Stegge nerror = 0; 294112694Stegge 295112694Stegge bp = NULL; 296112694Stegge nbp = NULL; 297112694Stegge sa = NULL; 298112694Stegge nsa = NULL; 299112694Stegge 300112694Stegge while (resid > 0) { 301112694Stegge 302112694Stegge if (bp == NULL) { /* Setup first read */ 303112694Stegge /* XXX: Leave some bufs for swap */ 304112694Stegge bp = getpbuf(&ffsrawbufcnt); 305112694Stegge sa = bp->b_data; 306166506Stegge pbgetvp(vp, bp); 307112694Stegge error = ffs_rawread_readahead(vp, udata, offset, 308112694Stegge resid, td, bp, sa); 309112694Stegge if (error != 0) 310112694Stegge break; 311112694Stegge 312112694Stegge if (resid > bp->b_bufsize) { /* Setup fist readahead */ 313112694Stegge /* XXX: Leave bufs for swap */ 314112694Stegge if (rawreadahead != 0) 315112694Stegge nbp = trypbuf(&ffsrawbufcnt); 316112694Stegge else 317112694Stegge nbp = NULL; 318112694Stegge if (nbp != NULL) { 319112694Stegge nsa = nbp->b_data; 320166506Stegge pbgetvp(vp, nbp); 321112694Stegge 322112694Stegge nerror = ffs_rawread_readahead(vp, 323112694Stegge udata + 324112694Stegge bp->b_bufsize, 325112694Stegge offset + 326112694Stegge bp->b_bufsize, 327112694Stegge resid - 328112694Stegge bp->b_bufsize, 329112694Stegge td, 330112694Stegge nbp, 331112694Stegge nsa); 332112694Stegge if (nerror) { 333166506Stegge pbrelvp(nbp); 334112694Stegge relpbuf(nbp, &ffsrawbufcnt); 335112694Stegge nbp = NULL; 336112694Stegge } 337112694Stegge } 338112694Stegge } 339112694Stegge } 340112694Stegge 341112718Stegge bwait(bp, PRIBIO, "rawrd"); 342112694Stegge vunmapbuf(bp); 343112694Stegge 344112694Stegge iolen = bp->b_bcount - bp->b_resid; 345112694Stegge if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 346112694Stegge nerror = 0; /* Ignore possible beyond EOF error */ 347112694Stegge break; /* EOF */ 348112694Stegge } 349112694Stegge 350112694Stegge if ((bp->b_ioflags & BIO_ERROR) != 0) { 351112694Stegge error = bp->b_error; 352112694Stegge break; 353112694Stegge } 354112694Stegge resid -= iolen; 355112694Stegge udata += iolen; 356112694Stegge offset += iolen; 357112694Stegge if (iolen < bp->b_bufsize) { 358112694Stegge /* Incomplete read. Try to read remaining part */ 359112694Stegge error = ffs_rawread_readahead(vp, 360112694Stegge udata, 361112694Stegge offset, 362112694Stegge bp->b_bufsize - iolen, 363112694Stegge td, 364112694Stegge bp, 365112694Stegge sa); 366112694Stegge if (error != 0) 367112694Stegge break; 368112694Stegge } else if (nbp != NULL) { /* Complete read with readahead */ 369112694Stegge 370112694Stegge tbp = bp; 371112694Stegge bp = nbp; 372112694Stegge nbp = tbp; 373112694Stegge 374112694Stegge tsa = sa; 375112694Stegge sa = nsa; 376112694Stegge nsa = tsa; 377112694Stegge 378112694Stegge if (resid <= bp->b_bufsize) { /* No more readaheads */ 379166506Stegge pbrelvp(nbp); 380112694Stegge relpbuf(nbp, &ffsrawbufcnt); 381112694Stegge nbp = NULL; 382112694Stegge } else { /* Setup next readahead */ 383112694Stegge nerror = ffs_rawread_readahead(vp, 384112694Stegge udata + 385112694Stegge bp->b_bufsize, 386112694Stegge offset + 387112694Stegge bp->b_bufsize, 388112694Stegge resid - 389112694Stegge bp->b_bufsize, 390112694Stegge td, 391112694Stegge nbp, 392112694Stegge nsa); 393112694Stegge if (nerror != 0) { 394166506Stegge pbrelvp(nbp); 395112694Stegge relpbuf(nbp, &ffsrawbufcnt); 396112694Stegge nbp = NULL; 397112694Stegge } 398112694Stegge } 399112694Stegge } else if (nerror != 0) {/* Deferred Readahead error */ 400112694Stegge break; 401112694Stegge } else if (resid > 0) { /* More to read, no readahead */ 402112694Stegge error = ffs_rawread_readahead(vp, udata, offset, 403112694Stegge resid, td, bp, sa); 404112694Stegge if (error != 0) 405112694Stegge break; 406112694Stegge } 407112694Stegge } 408112694Stegge 409166506Stegge if (bp != NULL) { 410166506Stegge pbrelvp(bp); 411112694Stegge relpbuf(bp, &ffsrawbufcnt); 412166506Stegge } 413112694Stegge if (nbp != NULL) { /* Run down readahead buffer */ 414112718Stegge bwait(nbp, PRIBIO, "rawrd"); 415112694Stegge vunmapbuf(nbp); 416166506Stegge pbrelvp(nbp); 417112694Stegge relpbuf(nbp, &ffsrawbufcnt); 418112694Stegge } 419112694Stegge 420112694Stegge if (error == 0) 421112694Stegge error = nerror; 422112694Stegge PRELE(td->td_proc); 423112694Stegge uio->uio_iov->iov_base = udata; 424112694Stegge uio->uio_resid = resid; 425112694Stegge uio->uio_offset = offset; 426112694Stegge return error; 427112694Stegge} 428112694Stegge 429112694Stegge 430112694Steggeint 431112694Steggeffs_rawread(struct vnode *vp, 432112694Stegge struct uio *uio, 433112694Stegge int *workdone) 434112694Stegge{ 435112694Stegge if (allowrawread != 0 && 436112694Stegge uio->uio_iovcnt == 1 && 437112694Stegge uio->uio_segflg == UIO_USERSPACE && 438112694Stegge uio->uio_resid == uio->uio_iov->iov_len && 439130023Stjr (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 440130023Stjr TDP_DEADLKTREAT) == 0) { 441112694Stegge int secsize; /* Media sector size */ 442112694Stegge off_t filebytes; /* Bytes left of file */ 443112694Stegge int blockbytes; /* Bytes left of file in full blocks */ 444112694Stegge int partialbytes; /* Bytes in last partial block */ 445112694Stegge int skipbytes; /* Bytes not to read in ffs_rawread */ 446112694Stegge struct inode *ip; 447112694Stegge int error; 448112694Stegge 449112694Stegge 450112694Stegge /* Only handle sector aligned reads */ 451112694Stegge ip = VTOI(vp); 452136966Sphk secsize = ip->i_devvp->v_bufobj.bo_bsize; 453112694Stegge if ((uio->uio_offset & (secsize - 1)) == 0 && 454112694Stegge (uio->uio_resid & (secsize - 1)) == 0) { 455112694Stegge 456112694Stegge /* Sync dirty pages and buffers if needed */ 457176559Sattilio error = ffs_rawread_sync(vp); 458112694Stegge if (error != 0) 459112694Stegge return error; 460112694Stegge 461112694Stegge /* Check for end of file */ 462112694Stegge if (ip->i_size > uio->uio_offset) { 463112694Stegge filebytes = ip->i_size - uio->uio_offset; 464112694Stegge 465112694Stegge /* No special eof handling needed ? */ 466112694Stegge if (uio->uio_resid <= filebytes) { 467112694Stegge *workdone = 1; 468112694Stegge return ffs_rawread_main(vp, uio); 469112694Stegge } 470112694Stegge 471112694Stegge partialbytes = ((unsigned int) ip->i_size) % 472112694Stegge ip->i_fs->fs_bsize; 473112694Stegge blockbytes = (int) filebytes - partialbytes; 474112694Stegge if (blockbytes > 0) { 475112694Stegge skipbytes = uio->uio_resid - 476112694Stegge blockbytes; 477112694Stegge uio->uio_resid = blockbytes; 478112694Stegge error = ffs_rawread_main(vp, uio); 479112694Stegge uio->uio_resid += skipbytes; 480112694Stegge if (error != 0) 481112694Stegge return error; 482112694Stegge /* Read remaining part using buffer */ 483112694Stegge } 484112694Stegge } 485112694Stegge } 486112694Stegge } 487112694Stegge *workdone = 0; 488112694Stegge return 0; 489112694Stegge} 490