ffs_rawread.c revision 116192
/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Raw read support for FFS: service large, sector-aligned reads of
 * regular files by issuing BIO_READ requests straight at the underlying
 * device into the caller's user buffer, bypassing the buffer cache.
 * Transfers are staged through pbufs borrowed from the swap buffer pool
 * and overlapped with a single optional readahead buffer.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_rawread.c 116192 2003-06-11 06:34:30Z obrien $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ffs/fs.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp,
				 caddr_t udata,
				 off_t offset,
				 size_t len,
				 struct thread *td,
				 struct buf *bp,
				 caddr_t sa);
static int ffs_rawread_main(struct vnode *vp,
			    struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp, struct thread *td);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

static void ffs_rawreadwakeup(struct buf *bp);


SYSCTL_DECL(_vfs_ffs);

/* Pbufs reserved for raw reads; recomputed from nswbuf in ffs_rawread_setup(). */
static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
	   "Buffers available for raw reads");

/* Global on/off switch consulted by ffs_rawread() before doing anything. */
static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

/* When nonzero, ffs_rawread_main() keeps one extra request in flight. */
static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");


/*
 * Size the raw-read pbuf pool from the system swap-buffer count:
 * 15/16 of nswbuf when nswbuf exceeds 100, otherwise all but 8,
 * leaving the remainder available for swap (see XXX notes at the
 * getpbuf()/trypbuf() call sites below).
 */
void
ffs_rawread_setup(void)
{
	ffsrawbufcnt = (nswbuf > 100 ) ?
		(nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}


/*
 * Bring the file's cached state in sync with the disk before the cache
 * is bypassed: clean dirty mmap() pages, wait for in-flight writes, and
 * flush dirty buffers.  If work is needed and the vnode lock is not
 * already exclusive, it is upgraded for the duration (which may block)
 * and downgraded again on every exit path.  Returns 0 or an errno from
 * msleep()/VOP_FSYNC().
 */
static int
ffs_rawread_sync(struct vnode *vp, struct thread *td)
{
	int spl;
	int error;
	int upgraded;

	GIANT_REQUIRED;
	/* Check for dirty mmap, pending writes and dirty buffers */
	spl = splbio();
	VI_LOCK(vp);
	if (vp->v_numoutput > 0 ||
	    !TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
	    (vp->v_iflag & VI_OBJDIRTY) != 0) {
		splx(spl);
		VI_UNLOCK(vp);

		if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			VOP_LOCK(vp, LK_UPGRADE | LK_NOPAUSE, td);
		} else
			upgraded = 0;


		/* Attempt to msync mmap() regions to clean dirty mmap */
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
			struct vm_object *obj;
			VI_UNLOCK(vp);
			if (VOP_GETVOBJECT(vp, &obj) == 0) {
				VM_OBJECT_LOCK(obj);
				vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
				VM_OBJECT_UNLOCK(obj);
			}
			VI_LOCK(vp);
		}

		/* Wait for pending writes to complete */
		spl = splbio();
		while (vp->v_numoutput) {
			/* VI_BWAIT asks the write-completion path to wake us. */
			vp->v_iflag |= VI_BWAIT;
			error = msleep((caddr_t)&vp->v_numoutput,
				       VI_MTX(vp),
				       PRIBIO + 1,
				       "rawrdfls", 0);
			if (error != 0) {
				splx(spl);
				VI_UNLOCK(vp);
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE, td);
				return (error);
			}
		}
		/* Flush dirty buffers */
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(spl);
			VI_UNLOCK(vp);
			if ((error = VOP_FSYNC(vp, NOCRED, MNT_WAIT, td)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE, td);
				return (error);
			}
			VI_LOCK(vp);
			spl = splbio();
			/*
			 * MNT_WAIT fsync must have left nothing dirty and
			 * nothing in flight; anything else is a logic error.
			 */
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("ffs_rawread_sync: dirty bufs");
		}
		splx(spl);
		VI_UNLOCK(vp);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE, td);
	} else {
		splx(spl);
		VI_UNLOCK(vp);
	}
	return 0;
}


/*
 * Prepare the pbuf 'bp' for up to 'len' bytes at file offset 'offset'
 * into user address 'udata' and start the I/O.  The transfer is clipped
 * to the buf's KVA window and to the contiguous on-disk run reported by
 * VOP_BMAP(); holes (b_blkno == -1) are satisfied by zero-filling the
 * user buffer and marking the buf done without touching the device.
 * Returns 0 once the I/O is in flight (or the hole is filled), or an
 * errno (EINVAL on logical block number overflow, EFAULT if the user
 * pages cannot be wired, or a VOP_BMAP() error).  The caller collects
 * completion via bwait() (ffs_rawreadwakeup() calls bdone()).
 */
static int
ffs_rawread_readahead(struct vnode *vp,
		      caddr_t udata,
		      off_t offset,
		      size_t len,
		      struct thread *td,
		      struct buf *bp,
		      caddr_t sa)
{
	int error;
	u_int iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;

	GIANT_REQUIRED;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	/*
	 * iolen is (for now) the user buffer's offset within its first
	 * page; mapping a misaligned buffer costs up to one extra page
	 * of KVA, so shrink b_bcount accordingly to fit b_kvasize.
	 */
	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = B_PHYS;
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = ffs_rawreadwakeup;
	bp->b_data = udata;
	bp->b_saveaddr = sa;
	bp->b_offset = offset;
	blockno = bp->b_offset / bsize;
	/* Sector offset of 'offset' within its filesystem block. */
	blockoff = (bp->b_offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno) {
		return EINVAL; /* blockno overflow */
	}

	bp->b_lblkno = bp->b_blkno = blockno;

	/* Translate to a device block; bforwards counts extra contiguous blocks. */
	error = VOP_BMAP(vp, bp->b_lblkno, &dp, &bp->b_blkno, &bforwards,
			 NULL);
	if (error != 0) {
		return error;
	}
	if (bp->b_blkno == -1) {

		/* Fill holes with NULs to preserve semantics */

		/* A hole is at most one filesystem block; clip to it. */
		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;

		if (vmapbuf(bp) < 0)
			return EFAULT;

		/* Yield if we have hogged the CPU before the long bzero. */
		if (ticks - PCPU_GET(switchticks) >= hogticks)
			uio_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */

		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}

	/* Clip the transfer to the contiguous run found by VOP_BMAP(). */
	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;
	bp->b_blkno += blockoff;
	bp->b_dev = dp->v_rdev;

	/* Wire the user pages into kernel VA for the transfer. */
	if (vmapbuf(bp) < 0)
		return EFAULT;

	if (dp->v_type == VCHR)
		(void) VOP_SPECSTRATEGY(dp, bp);
	else
		(void) VOP_STRATEGY(dp, bp);
	return 0;
}


/*
 * Drive the raw read described by 'uio' to completion using one active
 * pbuf ('bp') and, when rawreadahead is enabled and resid warrants it,
 * one readahead pbuf ('nbp') kept in flight ahead of the consumer; the
 * two are swapped as each chunk completes.  Readahead failures are held
 * in 'nerror' and only reported if the main stream reaches that point.
 * On return the uio is advanced past the bytes actually transferred and
 * 0 or an errno is returned.
 */
static int
ffs_rawread_main(struct vnode *vp,
		 struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	caddr_t sa, nsa, tsa;
	u_int iolen;
	int spl;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	GIANT_REQUIRED;
	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * keep the process from being swapped
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;

	bp = NULL;
	nbp = NULL;
	sa = NULL;
	nsa = NULL;

	while (resid > 0) {

		if (bp == NULL) { /* Setup first read */
			/* XXX: Leave some bufs for swap */
			bp = getpbuf(&ffsrawbufcnt);
			sa = bp->b_data;
			bp->b_vp = vp;
			error = ffs_rawread_readahead(vp, udata, offset,
						     resid, td, bp, sa);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup fist readahead */
				/* XXX: Leave bufs for swap */
				if (rawreadahead != 0)
					nbp = trypbuf(&ffsrawbufcnt);
				else
					nbp = NULL;
				if (nbp != NULL) {
					nsa = nbp->b_data;
					nbp->b_vp = vp;

					nerror = ffs_rawread_readahead(vp,
								       udata +
								       bp->b_bufsize,
								       offset +
								       bp->b_bufsize,
								       resid -
								       bp->b_bufsize,
								       td,
								       nbp,
								       nsa);
					if (nerror) {
						relpbuf(nbp, &ffsrawbufcnt);
						nbp = NULL;
					}
				}
			}
		}

		/* Wait for the active buffer's I/O to complete. */
		spl = splbio();
		bwait(bp, PRIBIO, "rawrd");
		splx(spl);

		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break; /* EOF */
		}

		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(vp,
						      udata,
						      offset,
						      bp->b_bufsize - iolen,
						      td,
						      bp,
						      sa);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */

			/* Promote the readahead buffer to active. */
			tbp = bp;
			bp = nbp;
			nbp = tbp;

			tsa = sa;
			sa = nsa;
			nsa = tsa;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				relpbuf(nbp, &ffsrawbufcnt);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
							       udata +
							       bp->b_bufsize,
							       offset +
							       bp->b_bufsize,
							       resid -
							       bp->b_bufsize,
							       td,
							       nbp,
							       nsa);
				if (nerror != 0) {
					relpbuf(nbp, &ffsrawbufcnt);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) {/* Deferred Readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp, sa);
			if (error != 0)
				break;
		}
	}

	if (bp != NULL)
		relpbuf(bp, &ffsrawbufcnt);
	if (nbp != NULL) {			/* Run down readahead buffer */
		spl = splbio();
		bwait(nbp, PRIBIO, "rawrd");
		splx(spl);
		vunmapbuf(nbp);
		relpbuf(nbp, &ffsrawbufcnt);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	/* Report consumed bytes back to the caller's uio. */
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}


/*
 * Entry point called from the FFS read path.  Decide whether 'uio' can
 * be serviced as a raw read: raw reads must be enabled, the request must
 * be a single user-space iovec, and offset and length must be aligned to
 * the device's physical sector size.  When eligible, sync cached state
 * and hand off to ffs_rawread_main(); reads that would cross EOF are
 * trimmed to whole filesystem blocks, with the tail left for the normal
 * buffered path.  *workdone is set to 1 only when the transfer has been
 * fully handled here; otherwise it is set to 0 and the caller proceeds
 * through the buffer cache.
 */
int
ffs_rawread(struct vnode *vp,
	    struct uio *uio,
	    int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_flags &
	     TDF_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;


		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ip->i_devvp->v_rdev->si_bsize_phys;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {

			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp,
						 (uio->uio_td != NULL) ?
						 uio->uio_td : curthread);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				partialbytes = ((unsigned int) ip->i_size) %
					ip->i_fs->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					/*
					 * Raw-read only the whole-block
					 * portion; restore resid so the
					 * buffered path reads the tail.
					 */
					skipbytes = uio->uio_resid -
						blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}


/*
 * b_iodone hook: signal bwait()ers in ffs_rawread_main() that this
 * buffer's I/O has completed.
 */
static void
ffs_rawreadwakeup(struct buf *bp)
{
	bdone(bp);
}