/* ffs_rawread.c revision 140782 */
/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Raw-read support for FFS: for sufficiently aligned reads on clean
 * files, DMA directly from the underlying device into the user's
 * buffer, bypassing the buffer cache.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_rawread.c 140782 2005-01-25 00:40:01Z phk $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp,
				 caddr_t udata,
				 off_t offset,
				 size_t len,
				 struct thread *td,
				 struct buf *bp,
				 caddr_t sa);
static int ffs_rawread_main(struct vnode *vp,
			    struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp, struct thread *td);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

static void ffs_rawreadwakeup(struct buf *bp);


SYSCTL_DECL(_vfs_ffs);

/* Number of pbufs reserved for raw reads; set from nswbuf at setup time. */
static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
	   "Buffers available for raw reads");

/* Global on/off switch for the raw-read fast path. */
static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

/* When set, a second buffer is kept in flight ahead of the current one. */
static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");


/*
 * Size the raw-read buffer pool from the system pbuf count (nswbuf),
 * leaving a share of pbufs available for swap pageouts.
 */
void
ffs_rawread_setup(void)
{
	ffsrawbufcnt = (nswbuf > 100 ) ?
		(nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}


/*
 * Bring the vnode into a state where a raw (cache-bypassing) read is
 * safe: clean dirty mmap()ed pages, wait out pending writes and flush
 * dirty buffers.  Upgrades the vnode lock to exclusive for the flush if
 * needed and downgrades it again before returning.
 * Returns 0 on success or an errno from bufobj_wwait()/VOP_FSYNC().
 */
static int
ffs_rawread_sync(struct vnode *vp, struct thread *td)
{
	int spl;
	int error;
	int upgraded;
	struct bufobj *bo;

	GIANT_REQUIRED;
	/* Check for dirty mmap, pending writes and dirty buffers */
	spl = splbio();
	VI_LOCK(vp);
	bo = &vp->v_bufobj;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    (vp->v_iflag & VI_OBJDIRTY) != 0) {
		splx(spl);
		VI_UNLOCK(vp);

		if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			VOP_LOCK(vp, LK_UPGRADE | LK_NOPAUSE, td);
		} else
			upgraded = 0;


		/* Attempt to msync mmap() regions to clean dirty mmap */
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
			VI_UNLOCK(vp);
			if (vp->v_object != NULL) {
				VM_OBJECT_LOCK(vp->v_object);
				vm_object_page_clean(vp->v_object, 0, 0,
						     OBJPC_SYNC);
				VM_OBJECT_UNLOCK(vp->v_object);
			}
			VI_LOCK(vp);
		}

		/* Wait for pending writes to complete */
		spl = splbio();
		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
		if (error != 0) {
			/* XXX: can't happen with a zero timeout ??? */
			splx(spl);
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE, td);
			return (error);
		}
		/* Flush dirty buffers */
		if (bo->bo_dirty.bv_cnt > 0) {
			splx(spl);
			VI_UNLOCK(vp);
			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE, td);
				return (error);
			}
			VI_LOCK(vp);
			spl = splbio();
			/* After a MNT_WAIT fsync nothing may remain dirty. */
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("ffs_rawread_sync: dirty bufs");
		}
		splx(spl);
		VI_UNLOCK(vp);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE, td);
	} else {
		splx(spl);
		VI_UNLOCK(vp);
	}
	return 0;
}


/*
 * Set up buffer bp to read up to len bytes at file offset 'offset'
 * directly into the user address udata and start the I/O (or, for a
 * hole, zero-fill the mapped user pages and mark the buffer done).
 *
 * bp->b_bcount is clipped to the buffer's KVA size (minus one page when
 * udata is not page aligned, so the mapping still fits) and to the
 * contiguous on-disk run reported by ufs_bmaparray().
 *
 * Returns 0 when the I/O has been started or completed, EINVAL on
 * logical block number overflow, EFAULT when the user pages cannot be
 * wired, or an error from ufs_bmaparray().
 */
static int
ffs_rawread_readahead(struct vnode *vp,
		      caddr_t udata,
		      off_t offset,
		      size_t len,
		      struct thread *td,
		      struct buf *bp,
		      caddr_t sa)
{
	int error;
	u_int iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;
	struct inode *ip;
	ufs2_daddr_t blkno;

	GIANT_REQUIRED;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	ip = VTOI(vp);
	dp = ip->i_devvp;

	/* Page offset of the user buffer; nonzero costs one page of KVA. */
	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = 0;	/* XXX necessary ? */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = ffs_rawreadwakeup;
	bp->b_data = udata;
	bp->b_saveaddr = sa;
	blockno = offset / bsize;
	blockoff = (offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno) {
		return EINVAL; /* blockno overflow */
	}

	bp->b_lblkno = bp->b_blkno = blockno;

	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards,
			      NULL);
	if (error != 0)
		return error;
	if (blkno == -1) {
		/* Fill holes with NULs to preserve semantics */
		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;

		if (vmapbuf(bp) < 0)
			return EFAULT;

		/* Be polite: yield the CPU if we have been running long. */
		if (ticks - PCPU_GET(switchticks) >= hogticks)
			uio_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */
		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}
	bp->b_blkno = blkno + blockoff;
	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

	/* Clip to the contiguous run of blocks found by bmap. */
	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;

	if (vmapbuf(bp) < 0)
		return EFAULT;

	BO_STRATEGY(&dp->v_bufobj, bp);
	return 0;
}


/*
 * Main raw-read loop: issue reads (with one optional readahead buffer
 * in flight) until uio->uio_resid is satisfied, an error occurs, or EOF
 * is reached.  Updates uio's iov_base/resid/offset to reflect the bytes
 * transferred.  Readahead errors (nerror) are deferred until the
 * already-started buffer has been consumed.
 */
static int
ffs_rawread_main(struct vnode *vp,
		 struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	caddr_t sa, nsa, tsa;
	u_int iolen;
	int spl;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	GIANT_REQUIRED;
	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * keep the process from being swapped
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;

	bp = NULL;		/* buffer currently being waited on */
	nbp = NULL;		/* readahead buffer in flight */
	sa = NULL;
	nsa = NULL;

	while (resid > 0) {

		if (bp == NULL) { /* Setup first read */
			/* XXX: Leave some bufs for swap */
			bp = getpbuf(&ffsrawbufcnt);
			sa = bp->b_data;
			bp->b_vp = vp;
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp, sa);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup fist readahead */
				/* XXX: Leave bufs for swap */
				if (rawreadahead != 0)
					nbp = trypbuf(&ffsrawbufcnt);
				else
					nbp = NULL;
				if (nbp != NULL) {
					nsa = nbp->b_data;
					nbp->b_vp = vp;

					nerror = ffs_rawread_readahead(vp,
								       udata +
								       bp->b_bufsize,
								       offset +
								       bp->b_bufsize,
								       resid -
								       bp->b_bufsize,
								       td,
								       nbp,
								       nsa);
					if (nerror) {
						relpbuf(nbp, &ffsrawbufcnt);
						nbp = NULL;
					}
				}
			}
		}

		spl = splbio();
		bwait(bp, PRIBIO, "rawrd");
		splx(spl);

		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break; /* EOF */
		}

		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(vp,
						      udata,
						      offset,
						      bp->b_bufsize - iolen,
						      td,
						      bp,
						      sa);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */

			/* Swap current and readahead buffers. */
			tbp = bp;
			bp = nbp;
			nbp = tbp;

			tsa = sa;
			sa = nsa;
			nsa = tsa;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				relpbuf(nbp, &ffsrawbufcnt);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
							       udata +
							       bp->b_bufsize,
							       offset +
							       bp->b_bufsize,
							       resid -
							       bp->b_bufsize,
							       td,
							       nbp,
							       nsa);
				if (nerror != 0) {
					relpbuf(nbp, &ffsrawbufcnt);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) {/* Deferred Readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp, sa);
			if (error != 0)
				break;
		}
	}

	if (bp != NULL)
		relpbuf(bp, &ffsrawbufcnt);
	if (nbp != NULL) {			/* Run down readahead buffer */
		spl = splbio();
		bwait(nbp, PRIBIO, "rawrd");
		splx(spl);
		vunmapbuf(nbp);
		relpbuf(nbp, &ffsrawbufcnt);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}


/*
 * Entry point from ffs_read().  Decide whether this uio qualifies for
 * the raw-read fast path (enabled, single user-space iov, full-iov
 * read, no deadlock-treatment flag, sector-aligned offset and length)
 * and, if so, sync the vnode and perform the read.
 *
 * *workdone is set to 1 when the whole request was handled here; when 0
 * the caller must fall back to the normal buffered path (possibly for a
 * remaining tail that spans the last partial block or extends past
 * EOF).
 */
int
ffs_rawread(struct vnode *vp,
	    struct uio *uio,
	    int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
	     TDP_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;


		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ip->i_devvp->v_bufobj.bo_bsize;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {

			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp,
						 (uio->uio_td != NULL) ?
						 uio->uio_td : curthread);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				/*
				 * NOTE(review): truncating i_size to 32 bits
				 * before the modulo appears to rely on
				 * fs_bsize being a power of two — confirm.
				 */
				partialbytes = ((unsigned int) ip->i_size) %
					ip->i_fs->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
						blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}


/*
 * I/O completion callback: wake up the thread sleeping in bwait().
 */
static void
ffs_rawreadwakeup(struct buf *bp)
{
	bdone(bp);
}