vfs_bio.c revision 9706
1243789Sdim/* 2243789Sdim * Copyright (c) 1994 John S. Dyson 3243789Sdim * All rights reserved. 4243789Sdim * 5243789Sdim * Redistribution and use in source and binary forms, with or without 6243789Sdim * modification, are permitted provided that the following conditions 7243789Sdim * are met: 8243789Sdim * 1. Redistributions of source code must retain the above copyright 9243789Sdim * notice immediately at the beginning of the file, without modification, 10243789Sdim * this list of conditions, and the following disclaimer. 11243789Sdim * 2. Redistributions in binary form must reproduce the above copyright 12249423Sdim * notice, this list of conditions and the following disclaimer in the 13243789Sdim * documentation and/or other materials provided with the distribution. 14243789Sdim * 3. Absolutely no warranty of function or purpose is made by the author 15249423Sdim * John S. Dyson. 16243789Sdim * 4. This work was done expressly for inclusion into FreeBSD. Other use 17243789Sdim * is allowed if this notation is included. 18243789Sdim * 5. Modifications may be freely made to this file if the above conditions 19243789Sdim * are met. 20243789Sdim * 21243789Sdim * $Id: vfs_bio.c,v 1.53 1995/07/24 03:16:41 davidg Exp $ 22243789Sdim */ 23249423Sdim 24243789Sdim/* 25243789Sdim * this file contains a new buffer I/O scheme implementing a coherent 26243789Sdim * VM object and buffer cache scheme. Pains have been taken to make 27243789Sdim * sure that the performance degradation associated with schemes such 28249423Sdim * as this is not realized. 29243789Sdim * 30243789Sdim * Author: John S. Dyson 31243789Sdim * Significant help during the development and debugging phases 32243789Sdim * had been provided by David Greenman, also of the FreeBSD core team. 
 */

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>	/* NOTE(review): duplicate of the earlier <sys/proc.h> include */

#include <miscfs/specfs/specdev.h>

struct buf *buf;	/* buffer header pool */
int nbuf;		/* number of buffer headers calculated
			 * elsewhere */
struct swqueue bswlist;

/* forward declarations for helpers defined later in this file */
void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);

/* set when a thread sleeps waiting for a free buffer; brelse() wakes it */
int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */
caddr_t buffers_kva;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad. it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
vm_offset_t bogus_offset;

/* current / maximum bytes of buffer KVA in use */
int bufspace, maxbufspace;

/*
 * advisory minimum for size of LRU queue or VMIO queue
 */
int minbuf;

/*
 * Initialize buffer headers and related structures.
 * Called once at startup: builds the buffer hash table and free-list
 * queues, reserves one MAXBSIZE KVA window per buffer header, and
 * allocates the "bogus" page used for partial-buffer I/O.
 */
void
bufinit()
{
    struct buf *bp;
    int i;

    TAILQ_INIT(&bswlist);
    LIST_INIT(&invalhash);

    /* first, make a null hash table */
    for (i = 0; i < BUFHSZ; i++)
        LIST_INIT(&bufhashtbl[i]);

    /* next, make a null set of free lists */
    for (i = 0; i < BUFFER_QUEUES; i++)
        TAILQ_INIT(&bufqueues[i]);

    /* one contiguous pageable KVA range, MAXBSIZE bytes per buffer */
    buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
    /* finally, initialize each buffer header and stick on empty q */
    for (i = 0; i < nbuf; i++) {
        bp = &buf[i];
        bzero(bp, sizeof *bp);
        bp->b_flags = B_INVAL;	/* we're just an empty header */
        bp->b_dev = NODEV;
        bp->b_rcred = NOCRED;
        bp->b_wcred = NOCRED;
        bp->b_qindex = QUEUE_EMPTY;
        bp->b_vnbufs.le_next = NOLIST;
        bp->b_data = buffers_kva + i * MAXBSIZE;
        TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
        LIST_INSERT_HEAD(&invalhash, bp, b_hash);
    }
/*
 * maxbufspace is currently calculated to support all filesystem blocks
 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
 * cache is still the same as it would be for 8K filesystems.  This
 * keeps the size of the buffer cache "in check" for big block filesystems.
 */
    minbuf = nbuf / 3;
    maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;

    bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
    bogus_page = vm_page_alloc(kernel_object,
        bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);

}

/*
 * remove the buffer from the appropriate free list
 * Panics if the buffer is not currently on any queue.
 */
void
bremfree(struct buf * bp)
{
    int s = splbio();	/* mask block-I/O interrupts while editing the queue */

    if (bp->b_qindex != QUEUE_NONE) {
        TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
        bp->b_qindex = QUEUE_NONE;
    } else {
        panic("bremfree: removing a buffer when not on a queue");
    }
    splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 * On a cache miss the read is issued and waited for synchronously;
 * returns the biowait() status.  On a cache hit returns 0.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
    struct buf *bp;

    bp = getblk(vp, blkno, size, 0, 0);
    *bpp = bp;

    /* if not found in cache, do some I/O */
    if ((bp->b_flags & B_CACHE) == 0) {
        if (curproc != NULL)
            curproc->p_stats->p_ru.ru_inblock++;
        bp->b_flags |= B_READ;
        bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
        /* stash the credential used for the read, exactly once */
        if (bp->b_rcred == NOCRED) {
            if (cred != NOCRED)
                crhold(cred);
            bp->b_rcred = cred;
        }
        vfs_busy_pages(bp, 0);
        VOP_STRATEGY(bp);
        return (biowait(bp));
    }
    return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O
 * on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
    struct buf *bp, *rabp;
    int i;
    int rv = 0, readwait = 0;

    *bpp = bp = getblk(vp, blkno, size, 0, 0);

    /* if not found in cache, do some I/O */
    if ((bp->b_flags & B_CACHE) == 0) {
        if (curproc != NULL)
            curproc->p_stats->p_ru.ru_inblock++;
        bp->b_flags |= B_READ;
        bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
        if (bp->b_rcred == NOCRED) {
            if (cred != NOCRED)
                crhold(cred);
            bp->b_rcred = cred;
        }
        vfs_busy_pages(bp, 0);
        VOP_STRATEGY(bp);
        ++readwait;
    }
    /* fire off async reads for each read-ahead block not yet resident */
    for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
        if (inmem(vp, *rablkno))
            continue;
        rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

        if ((rabp->b_flags & B_CACHE) == 0) {
            if (curproc != NULL)
                curproc->p_stats->p_ru.ru_inblock++;
            rabp->b_flags |= B_READ | B_ASYNC;
            rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
            if (rabp->b_rcred == NOCRED) {
                if (cred != NOCRED)
                    crhold(cred);
                rabp->b_rcred = cred;
            }
            vfs_busy_pages(rabp, 0);
            VOP_STRATEGY(rabp);
        } else {
            brelse(rabp);
        }
    }

    /* only the primary block is waited for; read-aheads complete on their own */
    if (readwait) {
        rv = biowait(bp);
    }
    return (rv);
}

/*
 * Write, release buffer on completion.
 * (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
    int oldflags = bp->b_flags;	/* snapshot: flags are cleared below */

    /* invalid buffers have nothing worth writing */
    if (bp->b_flags & B_INVAL) {
        brelse(bp);
        return (0);
    }
    if (!(bp->b_flags & B_BUSY))
        panic("bwrite: buffer is not busy???");

    bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
    bp->b_flags |= B_WRITEINPROG;

    /* async write of a delayed buffer: requeue it on the vnode lists now */
    if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
        reassignbuf(bp, bp->b_vp);
    }

    bp->b_vp->v_numoutput++;
    vfs_busy_pages(bp, 1);
    if (curproc != NULL)
        curproc->p_stats->p_ru.ru_oublock++;
    VOP_STRATEGY(bp);

    /* synchronous writes wait here; async ones are released by iodone */
    if ((oldflags & B_ASYNC) == 0) {
        int rtval = biowait(bp);

        if (oldflags & B_DELWRI) {
            reassignbuf(bp, bp->b_vp);
        }
        brelse(bp);
        return (rtval);
    }
    return (0);
}

/*
 * VOP_BWRITE entry point: simply forwards to bwrite().
 */
int
vn_bwrite(ap)
    struct vop_bwrite_args *ap;
{
    return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf * bp)
{

    if ((bp->b_flags & B_BUSY) == 0) {
        panic("bdwrite: buffer is not busy");
    }
    if (bp->b_flags & B_INVAL) {
        brelse(bp);
        return;
    }
    /* tape devices are sequential; write through immediately */
    if (bp->b_flags & B_TAPE) {
        bawrite(bp);
        return;
    }
    bp->b_flags &= ~(B_READ|B_RELBUF);
    if ((bp->b_flags & B_DELWRI) == 0) {
        bp->b_flags |= B_DONE | B_DELWRI;
        reassignbuf(bp, bp->b_vp);
    }

    /*
     * This bmap keeps the system from needing to do the bmap later,
     * perhaps when the system is attempting to do a sync.  Since it
     * is likely that the indirect block -- or whatever other datastructure
     * that the filesystem needs is still in memory now, it is a good
     * thing to do this.  Note also, that if the pageout daemon is
     * requesting a sync -- there might not be enough memory to do
     * the bmap then...  So, this is important to do.
     */
    if( bp->b_lblkno == bp->b_blkno) {
        VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
    }

    /*
     * Set the *dirty* buffer range based upon the VM system dirty pages.
     */
    vfs_setdirty(bp);

    /*
     * We need to do this here to satisfy the vnode_pager and the
     * pageout daemon, so that it thinks that the pages have been
     * "cleaned".  Note that since the pages are in a delayed write
     * buffer -- the VFS layer "will" see that the pages get written
     * out on the next sync, or perhaps the cluster will be completed.
     */
    vfs_clean_pages(bp);
    brelse(bp);
    return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
    bp->b_flags |= B_ASYNC;
    (void) VOP_BWRITE(bp);
}

/*
 * Release a buffer.
 * Wakes any waiters, runs down VMIO pages when the buffer is being
 * invalidated, and requeues the header on the appropriate free list
 * (EMPTY / AGE / LOCKED / LRU).
 */
void
brelse(struct buf * bp)
{
    int s;

    /* cluster buffers belong to the pager; hand them straight back */
    if (bp->b_flags & B_CLUSTER) {
        relpbuf(bp);
        return;
    }
    /* anyone need a "free" block? */
    s = splbio();

    if (needsbuffer) {
        needsbuffer = 0;
        wakeup((caddr_t) &needsbuffer);
    }
    /* anyone need this block? */
    if (bp->b_flags & B_WANTED) {
        bp->b_flags &= ~(B_WANTED | B_AGE);
        wakeup((caddr_t) bp);
    } else if (bp->b_flags & B_VMIO) {
        bp->b_flags &= ~B_WANTED;
        wakeup((caddr_t) bp);
    }
    /* locked buffers never carry an error indication across a release */
    if (bp->b_flags & B_LOCKED)
        bp->b_flags &= ~B_ERROR;

    if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
        (bp->b_bufsize <= 0)) {
        bp->b_flags |= B_INVAL;
        bp->b_flags &= ~(B_DELWRI | B_CACHE);
        if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
            brelvp(bp);
    }
    /*
     * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
     * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
     * but the VM object is kept around.  The B_NOCACHE flag is used to
     * invalidate the pages in the VM object.
     */
    if (bp->b_flags & B_VMIO) {
        vm_offset_t foff;
        vm_object_t obj;
        int i, resid;
        vm_page_t m;
        int iototal = bp->b_bufsize;

        foff = 0;
        obj = 0;
        if (bp->b_npages) {
            if (bp->b_vp && bp->b_vp->v_mount) {
                foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
            } else {
                /*
                 * vnode pointer has been ripped away --
                 * probably file gone...
                 */
                foff = bp->b_pages[0]->offset;
            }
        }
        for (i = 0; i < bp->b_npages; i++) {
            m = bp->b_pages[i];
            if (m == bogus_page) {
                /*
                 * NOTE(review): obj is still 0 (null) at this point --
                 * nothing in this function ever assigns it before it is
                 * handed to vm_page_lookup().  Looks like a latent bug;
                 * confirm against the vm_page_lookup() contract.
                 */
                m = vm_page_lookup(obj, foff);
                if (!m) {
                    panic("brelse: page missing\n");
                }
                /* replace the bogus placeholder with the real page */
                bp->b_pages[i] = m;
                pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
            }
            /* bytes of this page covered by the buffer, clamped to what's left */
            resid = (m->offset + PAGE_SIZE) - foff;
            if (resid > iototal)
                resid = iototal;
            if (resid > 0) {
                /*
                 * Don't invalidate the page if the local machine has already
                 * modified it.  This is the lesser of two evils, and should
                 * be fixed.
                 */
                if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
                    vm_page_test_dirty(m);
                    if (m->dirty == 0) {
                        vm_page_set_invalid(m, foff, resid);
                        if (m->valid == 0)
                            vm_page_protect(m, VM_PROT_NONE);
                    }
                }
            }
            foff += resid;
            iototal -= resid;
        }

        if (bp->b_flags & (B_INVAL | B_RELBUF)) {
            /* drop our buffer-mapping reference on each constituent page */
            for(i=0;i<bp->b_npages;i++) {
                m = bp->b_pages[i];
                --m->bmapped;
                if (m->bmapped == 0) {
                    if (m->flags & PG_WANTED) {
                        wakeup((caddr_t) m);
                        m->flags &= ~PG_WANTED;
                    }
                    vm_page_test_dirty(m);
                    /* clean, unreferenced pages go to the cache queue */
                    if ((m->dirty & m->valid) == 0 &&
                        (m->flags & PG_REFERENCED) == 0 &&
                            !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
                        vm_page_cache(m);
                    } else if ((m->flags & PG_ACTIVE) == 0) {
                        vm_page_activate(m);
                        m->act_count = 0;
                    }
                }
            }
            bufspace -= bp->b_bufsize;
            pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
            bp->b_npages = 0;
            bp->b_bufsize = 0;
            bp->b_flags &= ~B_VMIO;
            if (bp->b_vp)
                brelvp(bp);
        }
    }
    if (bp->b_qindex != QUEUE_NONE)
        panic("brelse: free buffer onto another queue???");

    /* enqueue */
    /* buffers with no memory */
    if (bp->b_bufsize == 0) {
        bp->b_qindex = QUEUE_EMPTY;
        TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
        LIST_REMOVE(bp, b_hash);
        LIST_INSERT_HEAD(&invalhash, bp, b_hash);
        bp->b_dev = NODEV;
        /* buffers with junk contents */
    } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
        bp->b_qindex = QUEUE_AGE;
        /* inserted at head: junk buffers should be reused first */
        TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
        LIST_REMOVE(bp, b_hash);
        LIST_INSERT_HEAD(&invalhash, bp, b_hash);
        bp->b_dev = NODEV;
        /* buffers that are locked */
    } else if (bp->b_flags & B_LOCKED) {
        bp->b_qindex = QUEUE_LOCKED;
        TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
        /* buffers with stale but valid contents */
    } else if (bp->b_flags & B_AGE) {
        bp->b_qindex = QUEUE_AGE;
        TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
        /* buffers with valid and quite potentially reuseable contents */
    } else {
        bp->b_qindex = QUEUE_LRU;
        TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
    }

    /* unlock */
    bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
    splx(s);
}

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
void
vfs_bio_awrite(struct buf * bp)
{
    int i;
    daddr_t lblkno = bp->b_lblkno;
    struct vnode *vp = bp->b_vp;
    int s;
    int ncl;
    struct buf *bpa;

    s = splbio();
    /* clustering applies only to VMIO mounts and cluster-capable buffers */
    if( vp->v_mount && (vp->v_flag & VVMIO) &&
        (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
        int size = vp->v_mount->mnt_stat.f_iosize;

        /*
         * count how many logically and physically contiguous
         * delayed-write, cluster-OK buffers follow this one
         * (bounded by MAXPHYS worth of blocks)
         */
        for (i = 1; i < MAXPHYS / size; i++) {
            if ((bpa = incore(vp, lblkno + i)) &&
                ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
                (B_DELWRI | B_CLUSTEROK)) &&
                (bpa->b_bufsize == size)) {
                /* stop on unmapped or physically discontiguous blocks */
                if ((bpa->b_blkno == bpa->b_lblkno) ||
                    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
                    break;
            } else {
                break;
            }
        }
        ncl = i;
        /*
         * this is a possible cluster write
         */
        if (ncl != 1) {
            bremfree(bp);
            cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
            splx(s);
            return;
        }
    }
    /*
     * default (old) behavior, writing out only one block
     */
    bremfree(bp);
    bp->b_flags |= B_BUSY | B_ASYNC;
    (void) VOP_BWRITE(bp);
    splx(s);
}


/*
 * Find a buffer header which is available for use.
 * May recycle an AGE/LRU buffer, flushing a delayed write first.
 * Returns NULL when it slept or flushed so the caller must retry.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int doingvmio)
{
    struct buf *bp;
    int s;
    int firstbp = 1;	/* NOTE(review): assigned but never read here -- dead local? */

    s = splbio();
start:
    if (bufspace >= maxbufspace)
        goto trytofreespace;

    /* can we constitute a new buffer? */
    if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
        if (bp->b_qindex != QUEUE_EMPTY)
            panic("getnewbuf: inconsistent EMPTY queue");
        bremfree(bp);
        goto fillbuf;
    }
trytofreespace:
    /*
     * We keep the file I/O from hogging metadata I/O
     * This is desirable because file data is cached in the
     * VM/Buffer cache even if a buffer is freed.
     */
    if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
        if (bp->b_qindex != QUEUE_AGE)
            panic("getnewbuf: inconsistent AGE queue");
    } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
        if (bp->b_qindex != QUEUE_LRU)
            panic("getnewbuf: inconsistent LRU queue");
    }
    if (!bp) {
        /* wait for a free buffer of any kind */
        needsbuffer = 1;
        tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
        splx(s);
        return (0);
    }

    /* if we are a delayed write, convert to an async write */
    if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
        vfs_bio_awrite(bp);
        if (!slpflag && !slptimeo) {
            splx(s);
            return (0);
        }
        goto start;
    }

    if (bp->b_flags & B_WANTED) {
        bp->b_flags &= ~B_WANTED;
        wakeup((caddr_t) bp);
    }
    bremfree(bp);

    /* VMIO buffers must run down their pages before the header is reused */
    if (bp->b_flags & B_VMIO) {
        bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
        brelse(bp);
        bremfree(bp);
    }

    if (bp->b_vp)
        brelvp(bp);

    /* we are not free, nor do we contain interesting data */
    if (bp->b_rcred != NOCRED)
        crfree(bp->b_rcred);
    if (bp->b_wcred != NOCRED)
        crfree(bp->b_wcred);
fillbuf:
    bp->b_flags |= B_BUSY;
    LIST_REMOVE(bp, b_hash);
    LIST_INSERT_HEAD(&invalhash, bp, b_hash);
    splx(s);
    if (bp->b_bufsize) {
        allocbuf(bp, 0);
    }
    /* reset the header to a pristine, disassociated state */
    bp->b_flags = B_BUSY;
    bp->b_dev = NODEV;
    bp->b_vp = NULL;
    bp->b_blkno = bp->b_lblkno = 0;
    bp->b_iodone = 0;
    bp->b_error = 0;
    bp->b_resid = 0;
    bp->b_bcount = 0;
    bp->b_npages = 0;
    bp->b_wcred = bp->b_rcred = NOCRED;
    bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
    bp->b_dirtyoff = bp->b_dirtyend = 0;
    bp->b_validoff = bp->b_validend = 0;
    /* still over budget: invalidate this header and free more space */
    if (bufspace >= maxbufspace) {
        s = splbio();
        bp->b_flags |= B_INVAL;
        brelse(bp);
        goto trytofreespace;
    }
    return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
    struct buf *bp;
    struct bufhashhdr *bh;

    int s = splbio();

    bh = BUFHASH(vp, blkno);
    bp = bh->lh_first;

    /* Search hash chain */
    while (bp) {
        /* hit -- invalid buffers don't count */
        if (bp->b_lblkno == blkno && bp->b_vp == vp &&
            (bp->b_flags & B_INVAL) == 0) {
            splx(s);
            return (bp);
        }
        bp = bp->b_hash.le_next;
    }
    splx(s);

    return (0);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
    vm_object_t obj;
    vm_offset_t off, toff, tinc;
    vm_page_t m;

    if (incore(vp, blkno))
        return 1;
    if (vp->v_mount == 0)
        return 0;
    /* only VMIO vnodes keep file data in a VM object */
    if ((vp->v_object == 0) || (vp->v_flag & VVMIO) == 0)
        return 0;

    obj = vp->v_object;
    tinc = PAGE_SIZE;
    if (tinc > vp->v_mount->mnt_stat.f_iosize)
        tinc = vp->v_mount->mnt_stat.f_iosize;
    off = blkno * vp->v_mount->mnt_stat.f_iosize;

    /* every tinc-sized piece of the block must be valid in the object */
    for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
        int mask;	/* NOTE(review): unused local */

        m = vm_page_lookup(obj, trunc_page(toff + off));
        if (!m)
            return 0;
        if (vm_page_is_valid(m, toff + off, tinc) == 0)
            return 0;
    }
    return 1;
}

/*
 * now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.  We want the
 * entire range of the buffer to be marked dirty if
 * any of the pages have been written to for consistancy
 * with the b_validoff, b_validend set in the nfs write
 * code, and used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp) {
    int i;
    vm_object_t object;
    vm_offset_t boffset, offset;
    /*
     * We qualify the scan for modified pages on whether the
     * object has been flushed yet.  The OBJ_WRITEABLE flag
     * is not cleared simply by protecting pages off.
     */
    if ((bp->b_flags & B_VMIO) &&
        ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
        /*
         * test the pages to see if they have been modified directly
         * by users through the VM system.
         */
        for (i = 0; i < bp->b_npages; i++)
            vm_page_test_dirty(bp->b_pages[i]);

        /*
         * scan forwards for the first page modified
         */
        for (i = 0; i < bp->b_npages; i++) {
            if (bp->b_pages[i]->dirty) {
                break;
            }
        }
        /* only ever widen the dirty range, never shrink it */
        boffset = i * PAGE_SIZE;
        if (boffset < bp->b_dirtyoff) {
            bp->b_dirtyoff = boffset;
        }

        /*
         * scan backwards for the last page modified
         */
        for (i = bp->b_npages - 1; i >= 0; --i) {
            if (bp->b_pages[i]->dirty) {
                break;
            }
        }
        boffset = (i + 1) * PAGE_SIZE;
        offset = boffset + bp->b_pages[0]->offset;
        /* clamp the dirty end so it does not extend past the object */
        if (offset >= object->size) {
            boffset = object->size - bp->b_pages[0]->offset;
        }
        if (bp->b_dirtyend < boffset) {
            bp->b_dirtyend = boffset;
        }
    }
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
    struct buf *bp;
    int s;
    struct bufhashhdr *bh;
    vm_offset_t off;	/* NOTE(review): unused local */
    int nleft;		/* NOTE(review): unused local */

    s = splbio();
loop:
    if (bp = incore(vp, blkno)) {
        /* cache hit: sleep while busy, then claim the buffer */
        if (bp->b_flags & B_BUSY) {
            bp->b_flags |= B_WANTED;
            if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
                goto loop;

            /* tsleep returned an error (signal/timeout): give up */
            splx(s);
            return (struct buf *) NULL;
        }
        bp->b_flags |= B_BUSY | B_CACHE;
        bremfree(bp);
        /*
         * check for size inconsistancies
         */
        if (bp->b_bcount != size) {
            if (bp->b_flags & B_VMIO) {
                allocbuf(bp, size);
            } else {
                /* non-VMIO resize: push old contents out and retry */
                bp->b_flags |= B_NOCACHE;
                VOP_BWRITE(bp);
                goto loop;
            }
        }
        splx(s);
        return (bp);
    } else {
        vm_object_t obj;
        int doingvmio;

        if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
            doingvmio = 1;
        } else {
            doingvmio = 0;
        }
        if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
            if (slpflag || slptimeo)
                return NULL;
            goto loop;
        }

        /*
         * This code is used to make sure that a buffer is not
         * created while the getnewbuf routine is blocked.
         * Normally the vnode is locked so this isn't a problem.
         * VBLK type I/O requests, however, don't lock the vnode.
         */
        if (!VOP_ISLOCKED(vp) && incore(vp, blkno)) {
            bp->b_flags |= B_INVAL;
            brelse(bp);
            goto loop;
        }

        /*
         * Insert the buffer into the hash, so that it can
         * be found by incore.
         */
        bp->b_blkno = bp->b_lblkno = blkno;
        bgetvp(vp, bp);
        LIST_REMOVE(bp, b_hash);
        bh = BUFHASH(vp, blkno);
        LIST_INSERT_HEAD(bh, bp, b_hash);

        if (doingvmio) {
            bp->b_flags |= (B_VMIO | B_CACHE);
#if defined(VFS_BIO_DEBUG)
            if (vp->v_type != VREG)
                printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
        } else {
            bp->b_flags &= ~B_VMIO;
        }
        splx(s);

        allocbuf(bp, size);
        return (bp);
    }
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
    struct buf *bp;

    /* spin until a free buffer header turns up */
    while ((bp = getnewbuf(0, 0, 0)) == 0);
    allocbuf(bp, size);
    bp->b_flags |= B_INVAL;
    return (bp);
}

/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistant data situations.  Tread lightly!!!
 *
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course the buffer is shrinking).
 */
int
allocbuf(struct buf * bp, int size)
{

	int s;
	int newbsize, mbsize;	/* NOTE(review): mbsize is computed below but never used */
	int i;

	/* Only the owner of a busy buffer may resize it. */
	if (!(bp->b_flags & B_BUSY))
		panic("allocbuf: buffer not busy");

	if ((bp->b_flags & B_VMIO) == 0) {
		/*
		 * Just get anonymous memory from the kernel
		 */
		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
		newbsize = round_page(size);

		/* Grow or shrink the anonymous backing pages in place. */
		if (newbsize < bp->b_bufsize) {
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
		}
	} else {
		/* VMIO case: buffer pages come from the vnode's VM object. */
		vm_page_t m;
		int desiredpages;

		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
		desiredpages = round_page(newbsize) / PAGE_SIZE;

		if (newbsize < bp->b_bufsize) {
			if (desiredpages < bp->b_npages) {
				/* Unmap the tail pages we are giving back. */
				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
				for (i = desiredpages; i < bp->b_npages; i++) {
					m = bp->b_pages[i];
					/*
					 * Wait for any I/O against the page to
					 * finish before touching its refcounts.
					 */
					s = splhigh();
					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "biodep", 0);
					}
					splx(s);

					if (m->bmapped == 0) {
						printf("allocbuf: bmapped is zero for page %d\n", i);
						panic("allocbuf: error");
					}
					/* Drop the buffer-mapping ref; free on last. */
					--m->bmapped;
					if (m->bmapped == 0) {
						vm_page_protect(m, VM_PROT_NONE);
						vm_page_free(m);
					}
					bp->b_pages[i] = NULL;
				}
				bp->b_npages = desiredpages;
			}
		} else if (newbsize > bp->b_bufsize) {
			vm_object_t obj;
			vm_offset_t tinc, off, toff, objoff;
			int pageindex, curbpnpages;
			struct vnode *vp;
			int bsize;

			vp = bp->b_vp;
			bsize = vp->v_mount->mnt_stat.f_iosize;

			if (bp->b_npages < desiredpages) {
				obj = vp->v_object;
				/* Step through the buffer in min(PAGE_SIZE, fs block) units. */
				tinc = PAGE_SIZE;
				if (tinc > bsize)
					tinc = bsize;
				off = bp->b_lblkno * bsize;
		doretry:
				/*
				 * Optimistically assume the buffer will be fully
				 * cached; any invalid range below clears B_CACHE.
				 */
				curbpnpages = bp->b_npages;
				bp->b_flags |= B_CACHE;
				for (toff = 0; toff < newbsize; toff += tinc) {
					int mask;	/* NOTE(review): unused */
					int bytesinpage;

					pageindex = toff / PAGE_SIZE;
					objoff = trunc_page(toff + off);
					if (pageindex < curbpnpages) {
						int pb;	/* NOTE(review): unused */

						/* Page already held by the buffer: just validate it. */
						m = bp->b_pages[pageindex];
						if (m->offset != objoff)
							panic("allocbuf: page changed offset??!!!?");
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						continue;
					}
					m = vm_page_lookup(obj, objoff);
					if (!m) {
						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
						if (!m) {
							/*
							 * Out of pages: un-busy what we grabbed
							 * this pass, wait for memory, retry.
							 */
							int j;

							for (j = bp->b_npages; j < pageindex; j++) {
								PAGE_WAKEUP(bp->b_pages[j]);
							}
							VM_WAIT;
							goto doretry;
						}
						vm_page_activate(m);
						m->act_count = 0;
						/* Freshly allocated page holds no valid data. */
						m->valid = 0;
						bp->b_flags &= ~B_CACHE;
					} else if (m->flags & PG_BUSY) {
						/*
						 * Page busy elsewhere: release this pass's
						 * pages, sleep on it, and start over.
						 */
						int j;

						for (j = bp->b_npages; j < pageindex; j++) {
							PAGE_WAKEUP(bp->b_pages[j]);
						}

						s = splbio();
						m->flags |= PG_WANTED;
						tsleep(m, PRIBIO, "pgtblk", 0);
						splx(s);

						goto doretry;
					} else {
						int pb;	/* NOTE(review): unused */
						/*
						 * Reclaiming a cache page under memory
						 * pressure; nudge the pagedaemon.
						 */
						if ((curproc != pageproc) &&
						    (m->flags & PG_CACHE) &&
						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
							pagedaemon_wakeup();
						}
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						m->flags |= PG_BUSY;
					}
					bp->b_pages[pageindex] = m;
					curbpnpages = pageindex + 1;
				}
				/* Account the new pages and drop their PG_BUSY marks. */
				for (i = bp->b_npages; i < curbpnpages; i++) {
					m = bp->b_pages[i];
					m->bmapped++;
					PAGE_WAKEUP(m);
				}
				bp->b_npages = curbpnpages;
				/* Remap the buffer's fixed kva slot over the page set. */
				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
				bp->b_data += off % PAGE_SIZE;
			}
		}
	}
	bufspace += (newbsize - bp->b_bufsize);
	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
	return 1;
}

/*
 * Wait for buffer I/O completion, returning error status.
 * Sleeps until biodone() sets B_DONE, then reports EINTR, the buffer's
 * own error code, EIO, or 0.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
	splx(s);
	if (bp->b_flags & B_EINTR) {
		/* Interrupted I/O: clear the sticky flag and report it. */
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
1128249423Sdim */ 1129249423Sdimvoid 1130243789Sdimbiodone(register struct buf * bp) 1131243789Sdim{ 1132243789Sdim int s; 1133249423Sdim 1134243789Sdim s = splbio(); 1135243789Sdim if (!(bp->b_flags & B_BUSY)) 1136243789Sdim panic("biodone: buffer not busy"); 1137243789Sdim 1138243789Sdim if (bp->b_flags & B_DONE) { 1139249423Sdim splx(s); 1140249423Sdim printf("biodone: buffer already done\n"); 1141249423Sdim return; 1142249423Sdim } 1143249423Sdim bp->b_flags |= B_DONE; 1144249423Sdim 1145249423Sdim if ((bp->b_flags & B_READ) == 0) { 1146249423Sdim struct vnode *vp = bp->b_vp; 1147249423Sdim vwakeup(bp); 1148249423Sdim } 1149249423Sdim#ifdef BOUNCE_BUFFERS 1150249423Sdim if (bp->b_flags & B_BOUNCE) 1151249423Sdim vm_bounce_free(bp); 1152249423Sdim#endif 1153249423Sdim 1154249423Sdim /* call optional completion function if requested */ 1155249423Sdim if (bp->b_flags & B_CALL) { 1156249423Sdim bp->b_flags &= ~B_CALL; 1157249423Sdim (*bp->b_iodone) (bp); 1158249423Sdim splx(s); 1159249423Sdim return; 1160243789Sdim } 1161243789Sdim if (bp->b_flags & B_VMIO) { 1162243789Sdim int i, resid; 1163249423Sdim vm_offset_t foff; 1164249423Sdim vm_page_t m; 1165249423Sdim vm_object_t obj; 1166249423Sdim int iosize; 1167249423Sdim struct vnode *vp = bp->b_vp; 1168249423Sdim 1169249423Sdim foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; 1170249423Sdim obj = vp->v_object; 1171249423Sdim if (!obj) { 1172249423Sdim panic("biodone: no object"); 1173249423Sdim } 1174249423Sdim#if defined(VFS_BIO_DEBUG) 1175249423Sdim if (obj->paging_in_progress < bp->b_npages) { 1176249423Sdim printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", 1177243789Sdim obj->paging_in_progress, bp->b_npages); 1178249423Sdim } 1179249423Sdim#endif 1180243789Sdim iosize = bp->b_bufsize; 1181243789Sdim for (i = 0; i < bp->b_npages; i++) { 1182249423Sdim int bogusflag = 0; 1183243789Sdim m = bp->b_pages[i]; 1184243789Sdim if (m == bogus_page) { 1185243789Sdim bogusflag = 1; 1186243789Sdim m = 
vm_page_lookup(obj, foff); 1187243789Sdim if (!m) { 1188243789Sdim#if defined(VFS_BIO_DEBUG) 1189249423Sdim printf("biodone: page disappeared\n"); 1190249423Sdim#endif 1191249423Sdim --obj->paging_in_progress; 1192243789Sdim continue; 1193249423Sdim } 1194249423Sdim bp->b_pages[i] = m; 1195249423Sdim pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); 1196243789Sdim } 1197249423Sdim#if defined(VFS_BIO_DEBUG) 1198249423Sdim if (trunc_page(foff) != m->offset) { 1199249423Sdim printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset); 1200249423Sdim } 1201249423Sdim#endif 1202249423Sdim resid = (m->offset + PAGE_SIZE) - foff; 1203249423Sdim if (resid > iosize) 1204243789Sdim resid = iosize; 1205249423Sdim /* 1206243789Sdim * In the write case, the valid and clean bits are 1207249423Sdim * already changed correctly, so we only need to do this 1208243789Sdim * here in the read case. 1209249423Sdim */ 1210249423Sdim if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { 1211249423Sdim vm_page_set_valid(m, foff & (PAGE_SIZE-1), resid); 1212249423Sdim vm_page_set_clean(m, foff & (PAGE_SIZE-1), resid); 1213249423Sdim } 1214243789Sdim 1215249423Sdim /* 1216243789Sdim * when debugging new filesystems or buffer I/O methods, this 1217243789Sdim * is the most common error that pops up. if you see this, you 1218243789Sdim * have not set the page busy flag correctly!!! 
1219243789Sdim */ 1220243789Sdim if (m->busy == 0) { 1221243789Sdim printf("biodone: page busy < 0, " 1222243789Sdim "off: %ld, foff: %ld, " 1223249423Sdim "resid: %d, index: %d\n", 1224249423Sdim m->offset, foff, resid, i); 1225249423Sdim printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n", 1226243789Sdim bp->b_vp->v_mount->mnt_stat.f_iosize, 1227249423Sdim bp->b_lblkno, bp->b_flags, bp->b_npages); 1228249423Sdim printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n", 1229249423Sdim m->valid, m->dirty, m->bmapped); 1230243789Sdim panic("biodone: page busy < 0\n"); 1231249423Sdim } 1232249423Sdim --m->busy; 1233249423Sdim if ((m->busy == 0) && (m->flags & PG_WANTED)) { 1234249423Sdim m->flags &= ~PG_WANTED; 1235249423Sdim wakeup((caddr_t) m); 1236249423Sdim } 1237249423Sdim --obj->paging_in_progress; 1238249423Sdim foff += resid; 1239249423Sdim iosize -= resid; 1240249423Sdim } 1241243789Sdim if (obj && obj->paging_in_progress == 0 && 1242243789Sdim (obj->flags & OBJ_PIPWNT)) { 1243249423Sdim obj->flags &= ~OBJ_PIPWNT; 1244243789Sdim wakeup((caddr_t) obj); 1245243789Sdim } 1246243789Sdim } 1247249423Sdim /* 1248249423Sdim * For asynchronous completions, release the buffer now. The brelse 1249249423Sdim * checks for B_WANTED and will do the wakeup there if necessary - so 1250249423Sdim * no need to do a wakeup here in the async case. 
1251243789Sdim */ 1252249423Sdim 1253249423Sdim if (bp->b_flags & B_ASYNC) { 1254249423Sdim brelse(bp); 1255249423Sdim } else { 1256243789Sdim bp->b_flags &= ~B_WANTED; 1257249423Sdim wakeup((caddr_t) bp); 1258243789Sdim } 1259249423Sdim splx(s); 1260249423Sdim} 1261249423Sdim 1262249423Sdimint 1263249423Sdimcount_lock_queue() 1264249423Sdim{ 1265249423Sdim int count; 1266249423Sdim struct buf *bp; 1267249423Sdim 1268249423Sdim count = 0; 1269243789Sdim for (bp = bufqueues[QUEUE_LOCKED].tqh_first; 1270249423Sdim bp != NULL; 1271249423Sdim bp = bp->b_freelist.tqe_next) 1272249423Sdim count++; 1273249423Sdim return (count); 1274249423Sdim} 1275243789Sdim 1276249423Sdimint vfs_update_interval = 30; 1277249423Sdim 1278249423Sdimvoid 1279243789Sdimvfs_update() 1280249423Sdim{ 1281249423Sdim (void) spl0(); 1282249423Sdim while (1) { 1283249423Sdim tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update", 1284249423Sdim hz * vfs_update_interval); 1285249423Sdim vfs_update_wakeup = 0; 1286249423Sdim sync(curproc, NULL, NULL); 1287249423Sdim } 1288249423Sdim} 1289249423Sdim 1290249423Sdim/* 1291249423Sdim * This routine is called in lieu of iodone in the case of 1292249423Sdim * incomplete I/O. This keeps the busy status for pages 1293249423Sdim * consistant. 
1294249423Sdim */ 1295249423Sdimvoid 1296249423Sdimvfs_unbusy_pages(struct buf * bp) 1297249423Sdim{ 1298249423Sdim int i; 1299249423Sdim 1300249423Sdim if (bp->b_flags & B_VMIO) { 1301249423Sdim struct vnode *vp = bp->b_vp; 1302249423Sdim vm_object_t obj = vp->v_object; 1303249423Sdim vm_offset_t foff; 1304249423Sdim 1305249423Sdim foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno); 1306249423Sdim 1307249423Sdim for (i = 0; i < bp->b_npages; i++) { 1308249423Sdim vm_page_t m = bp->b_pages[i]; 1309249423Sdim 1310249423Sdim if (m == bogus_page) { 1311249423Sdim m = vm_page_lookup(obj, foff + i * PAGE_SIZE); 1312249423Sdim if (!m) { 1313249423Sdim panic("vfs_unbusy_pages: page missing\n"); 1314243789Sdim } 1315249423Sdim bp->b_pages[i] = m; 1316249423Sdim pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); 1317249423Sdim } 1318249423Sdim --obj->paging_in_progress; 1319249423Sdim --m->busy; 1320249423Sdim if ((m->busy == 0) && (m->flags & PG_WANTED)) { 1321249423Sdim m->flags &= ~PG_WANTED; 1322249423Sdim wakeup((caddr_t) m); 1323249423Sdim } 1324249423Sdim } 1325249423Sdim if (obj->paging_in_progress == 0 && 1326249423Sdim (obj->flags & OBJ_PIPWNT)) { 1327249423Sdim obj->flags &= ~OBJ_PIPWNT; 1328249423Sdim wakeup((caddr_t) obj); 1329249423Sdim } 1330249423Sdim } 1331249423Sdim} 1332249423Sdim 1333249423Sdim/* 1334249423Sdim * This routine is called before a device strategy routine. 1335249423Sdim * It is used to tell the VM system that paging I/O is in 1336249423Sdim * progress, and treat the pages associated with the buffer 1337249423Sdim * almost as being PG_BUSY. Also the object paging_in_progress 1338249423Sdim * flag is handled to make sure that the object doesn't become 1339249423Sdim * inconsistant. 
1340249423Sdim */ 1341249423Sdimvoid 1342249423Sdimvfs_busy_pages(struct buf * bp, int clear_modify) 1343249423Sdim{ 1344249423Sdim int i; 1345249423Sdim 1346249423Sdim if (bp->b_flags & B_VMIO) { 1347249423Sdim vm_object_t obj = bp->b_vp->v_object; 1348249423Sdim vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; 1349249423Sdim int iocount = bp->b_bufsize; 1350249423Sdim 1351249423Sdim vfs_setdirty(bp); 1352249423Sdim for (i = 0; i < bp->b_npages; i++) { 1353249423Sdim vm_page_t m = bp->b_pages[i]; 1354249423Sdim int resid = (m->offset + PAGE_SIZE) - foff; 1355249423Sdim 1356243789Sdim if (resid > iocount) 1357249423Sdim resid = iocount; 1358249423Sdim obj->paging_in_progress++; 1359249423Sdim m->busy++; 1360249423Sdim if (clear_modify) { 1361249423Sdim vm_page_protect(m, VM_PROT_READ); 1362249423Sdim vm_page_set_valid(m, 1363249423Sdim foff & (PAGE_SIZE-1), resid); 1364249423Sdim vm_page_set_clean(m, 1365249423Sdim foff & (PAGE_SIZE-1), resid); 1366243789Sdim } else if (bp->b_bcount >= PAGE_SIZE) { 1367243789Sdim if (m->valid && (bp->b_flags & B_CACHE) == 0) { 1368249423Sdim bp->b_pages[i] = bogus_page; 1369249423Sdim pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); 1370249423Sdim } 1371249423Sdim } 1372249423Sdim foff += resid; 1373249423Sdim iocount -= resid; 1374249423Sdim } 1375249423Sdim } 1376249423Sdim} 1377249423Sdim 1378249423Sdim/* 1379249423Sdim * Tell the VM system that the pages associated with this buffer 1380243789Sdim * are clean. This is used for delayed writes where the data is 1381249423Sdim * going to go to disk eventually without additional VM intevention. 
 */
void
vfs_clean_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_offset_t foff =
		    bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		int iocount = bp->b_bufsize;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			/* Bytes of the buffer that fall within this page. */
			int resid = (m->offset + PAGE_SIZE) - foff;

			if (resid > iocount)
				resid = iocount;
			if (resid > 0) {
				/* Mark the covered range valid and clean. */
				vm_page_set_valid(m,
				    foff & (PAGE_SIZE-1), resid);
				vm_page_set_clean(m,
				    foff & (PAGE_SIZE-1), resid);
			}
			foff += resid;
			iocount -= resid;
		}
	}
}

/*
 * Zero the invalid portions of a buffer.  For VMIO buffers only the
 * DEV_BSIZE chunks whose page valid bits are clear are zeroed (and the
 * pages are then marked fully valid); non-VMIO buffers are simply
 * cleared with clrbuf().
 */
void
vfs_bio_clrbuf(struct buf *bp) {
	int i;
	if( bp->b_flags & B_VMIO) {
		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
			/*
			 * Sub-page buffer: if the single page is not fully
			 * valid, zero the whole buffer a DEV_BSIZE chunk at
			 * a time.  Note the page's valid bits are left
			 * untouched on this path.
			 */
			int j;
			if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
				for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) {
					bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_resid = 0;
			return;
		}
		for(i=0;i<bp->b_npages;i++) {
			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
				continue;
			if( bp->b_pages[i]->valid == 0) {
				/* Entirely invalid page: zero it wholesale. */
				bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
			} else {
				/* Partially valid: zero only the invalid DEV_BSIZE chunks. */
				int j;
				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
					if( (bp->b_pages[i]->valid & (1<<j)) == 0)
						bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}

/*
 * vm_hold_load_pages and vm_hold_unload pages get pages into
 * a buffers address space.  The pages are anonymous and are
 * not associated with a file object.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:

		/* Allocate a wired anonymous page from the kernel object. */
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
		    VM_ALLOC_NORMAL);
		if (!p) {
			/* No memory: wait for the pagedaemon and retry. */
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
		PAGE_WAKEUP(p);
		bp->b_npages++;
	}
}

/*
 * Release the anonymous pages backing the [froma, toa) range of the
 * buffer's address space, undoing vm_hold_load_pages().
 */
void
vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
		pmap_kremove(pg);
		vm_page_free(p);
		--bp->b_npages;
	}
}