vfs_bio.c (12799) vs. vfs_bio.c (12819): deleted lines are shown first, added lines second, within each changed hunk; unchanged context appears once.
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 * John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD. Other use
17 * is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $Id: vfs_bio.c,v 1.76 1995/12/11 04:56:05 dyson Exp $
21 * $Id: vfs_bio.c,v 1.78 1995/12/13 03:47:01 dyson Exp $
22 */
23
24/*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme. Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author: John S. Dyson
31 * Significant help during the development and debugging phases
 32 * has been provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/sysproto.h>
39#include <sys/kernel.h>
40#include <sys/sysctl.h>
41#include <sys/proc.h>
42#include <sys/vnode.h>
43#include <sys/vmmeter.h>
44#include <vm/vm.h>
45#include <vm/vm_param.h>
46#include <vm/vm_prot.h>
47#include <vm/vm_kern.h>
48#include <vm/vm_pageout.h>
49#include <vm/vm_page.h>
50#include <vm/vm_object.h>
51#include <vm/vm_extern.h>
52#include <sys/buf.h>
53#include <sys/mount.h>
54#include <sys/malloc.h>
55#include <sys/resourcevar.h>
56#include <sys/proc.h>
57
58#include <miscfs/specfs/specdev.h>
59
60static void vfs_update __P((void));
61struct proc *updateproc;
61static struct proc *updateproc;
62static struct kproc_desc up_kp = {
63 "update",
64 vfs_update,
65 &updateproc
66};
67SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68
69struct buf *buf; /* buffer header pool */
70struct swqueue bswlist;
71
72int count_lock_queue __P((void));
73void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
74void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
75void vfs_clean_pages(struct buf * bp);
73static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
74 vm_offset_t to);
75static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
76 vm_offset_t to);
77static void vfs_clean_pages(struct buf * bp);
76static void vfs_setdirty(struct buf *bp);
77
78int needsbuffer;
79
80/*
81 * Internal update daemon, process 3
82 * The variable vfs_update_wakeup allows for internal syncs.
83 */
84int vfs_update_wakeup;
85
86
87/*
88 * buffers base kva
89 */
90caddr_t buffers_kva;
91
92/*
93 * bogus page -- for I/O to/from partially complete buffers
94 * this is a temporary solution to the problem, but it is not
95 * really that bad. it would be better to split the buffer
96 * for input in the case of buffers partially already in memory,
97 * but the code is intricate enough already.
98 */
99vm_page_t bogus_page;
100vm_offset_t bogus_offset;
102static vm_offset_t bogus_offset;
101
102int bufspace, maxbufspace;
104static int bufspace, maxbufspace;
103
104struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
105struct bqueues bufqueues[BUFFER_QUEUES];
106static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
107static struct bqueues bufqueues[BUFFER_QUEUES];
106
107#define BUF_MAXUSE 8
108
109/*
110 * Initialize buffer headers and related structures.
111 */
112void
113bufinit()
114{
115 struct buf *bp;
116 int i;
117
118 TAILQ_INIT(&bswlist);
119 LIST_INIT(&invalhash);
120
121 /* first, make a null hash table */
122 for (i = 0; i < BUFHSZ; i++)
123 LIST_INIT(&bufhashtbl[i]);
124
125 /* next, make a null set of free lists */
126 for (i = 0; i < BUFFER_QUEUES; i++)
127 TAILQ_INIT(&bufqueues[i]);
128
129 buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
130 /* finally, initialize each buffer header and stick on empty q */
131 for (i = 0; i < nbuf; i++) {
132 bp = &buf[i];
133 bzero(bp, sizeof *bp);
134 bp->b_flags = B_INVAL; /* we're just an empty header */
135 bp->b_dev = NODEV;
136 bp->b_rcred = NOCRED;
137 bp->b_wcred = NOCRED;
138 bp->b_qindex = QUEUE_EMPTY;
139 bp->b_vnbufs.le_next = NOLIST;
140 bp->b_data = buffers_kva + i * MAXBSIZE;
141 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
142 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
143 }
144/*
145 * maxbufspace is currently calculated to support all filesystem blocks
146 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
147 * cache is still the same as it would be for 8K filesystems. This
148 * keeps the size of the buffer cache "in check" for big block filesystems.
149 */
150 maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
151
152 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
153 bogus_page = vm_page_alloc(kernel_object,
154 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
155 VM_ALLOC_NORMAL);
156
157}
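/*
 * A quick worked example of the maxbufspace formula above (assuming, purely
 * for illustration, nbuf == 1024 and a 4K PAGE_SIZE):
 *
 *	maxbufspace = 2 * (1024 + 8) * 4096 = 8454144 bytes
 *
 * i.e. a little over 8MB, enough to back roughly 1024 buffers of 8K
 * filesystem blocks, which matches the intent described in the comment.
 */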
158
159/*
160 * remove the buffer from the appropriate free list
161 */
162void
163bremfree(struct buf * bp)
164{
165 int s = splbio();
166
167 if (bp->b_qindex != QUEUE_NONE) {
168 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
169 bp->b_qindex = QUEUE_NONE;
170 } else {
171 panic("bremfree: removing a buffer when not on a queue");
172 }
173 splx(s);
174}
175
176/*
177 * Get a buffer with the specified data. Look in the cache first.
178 */
179int
180bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
181 struct buf ** bpp)
182{
183 struct buf *bp;
184
185 bp = getblk(vp, blkno, size, 0, 0);
186 *bpp = bp;
187
188 /* if not found in cache, do some I/O */
189 if ((bp->b_flags & B_CACHE) == 0) {
190 if (curproc != NULL)
191 curproc->p_stats->p_ru.ru_inblock++;
192 bp->b_flags |= B_READ;
193 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
194 if (bp->b_rcred == NOCRED) {
195 if (cred != NOCRED)
196 crhold(cred);
197 bp->b_rcred = cred;
198 }
199 vfs_busy_pages(bp, 0);
200 VOP_STRATEGY(bp);
201 return (biowait(bp));
202 }
203 return (0);
204}
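/*
 * A sketch of the usual caller-side pattern for bread(), with hypothetical
 * identifiers (vp, lbn, bsize and the data handling belong to the caller):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(use bp->b_data for bsize bytes)
 *	brelse(bp);		release, or bdwrite(bp) if the block was modified
 */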
205
206/*
207 * Operates like bread, but also starts asynchronous I/O on
208 * read-ahead blocks.
209 */
210int
211breadn(struct vnode * vp, daddr_t blkno, int size,
212 daddr_t * rablkno, int *rabsize,
213 int cnt, struct ucred * cred, struct buf ** bpp)
214{
215 struct buf *bp, *rabp;
216 int i;
217 int rv = 0, readwait = 0;
218
219 *bpp = bp = getblk(vp, blkno, size, 0, 0);
220
221 /* if not found in cache, do some I/O */
222 if ((bp->b_flags & B_CACHE) == 0) {
223 if (curproc != NULL)
224 curproc->p_stats->p_ru.ru_inblock++;
225 bp->b_flags |= B_READ;
226 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
227 if (bp->b_rcred == NOCRED) {
228 if (cred != NOCRED)
229 crhold(cred);
230 bp->b_rcred = cred;
231 }
232 vfs_busy_pages(bp, 0);
233 VOP_STRATEGY(bp);
234 ++readwait;
235 }
236 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
237 if (inmem(vp, *rablkno))
238 continue;
239 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
240
241 if ((rabp->b_flags & B_CACHE) == 0) {
242 if (curproc != NULL)
243 curproc->p_stats->p_ru.ru_inblock++;
244 rabp->b_flags |= B_READ | B_ASYNC;
245 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
246 if (rabp->b_rcred == NOCRED) {
247 if (cred != NOCRED)
248 crhold(cred);
249 rabp->b_rcred = cred;
250 }
251 vfs_busy_pages(rabp, 0);
252 VOP_STRATEGY(rabp);
253 } else {
254 brelse(rabp);
255 }
256 }
257
258 if (readwait) {
259 rv = biowait(bp);
260 }
261 return (rv);
262}
263
264/*
265 * Write, release buffer on completion. (Done by iodone
266 * if async.)
267 */
268int
269bwrite(struct buf * bp)
270{
271 int oldflags = bp->b_flags;
272
273 if (bp->b_flags & B_INVAL) {
274 brelse(bp);
275 return (0);
276 }
277 if (!(bp->b_flags & B_BUSY))
278 panic("bwrite: buffer is not busy???");
279
280 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
281 bp->b_flags |= B_WRITEINPROG;
282
283 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
284 reassignbuf(bp, bp->b_vp);
285 }
286
287 bp->b_vp->v_numoutput++;
288 vfs_busy_pages(bp, 1);
289 if (curproc != NULL)
290 curproc->p_stats->p_ru.ru_oublock++;
291 VOP_STRATEGY(bp);
292
293 if ((oldflags & B_ASYNC) == 0) {
294 int rtval = biowait(bp);
295
296 if (oldflags & B_DELWRI) {
297 reassignbuf(bp, bp->b_vp);
298 }
299 brelse(bp);
300 return (rtval);
301 }
302 return (0);
303}
304
305int
306vn_bwrite(ap)
307 struct vop_bwrite_args *ap;
308{
309 return (bwrite(ap->a_bp));
310}
311
312/*
313 * Delayed write. (Buffer is marked dirty).
314 */
315void
316bdwrite(struct buf * bp)
317{
318
319 if ((bp->b_flags & B_BUSY) == 0) {
320 panic("bdwrite: buffer is not busy");
321 }
322 if (bp->b_flags & B_INVAL) {
323 brelse(bp);
324 return;
325 }
326 if (bp->b_flags & B_TAPE) {
327 bawrite(bp);
328 return;
329 }
330 bp->b_flags &= ~(B_READ|B_RELBUF);
331 if ((bp->b_flags & B_DELWRI) == 0) {
332 bp->b_flags |= B_DONE | B_DELWRI;
333 reassignbuf(bp, bp->b_vp);
334 }
335
336 /*
337 * This bmap keeps the system from needing to do the bmap later,
338 * perhaps when the system is attempting to do a sync. Since it
 339 * is likely that the indirect block -- or whatever other data structure
 340 * the filesystem needs -- is still in memory now, it is a good
341 * thing to do this. Note also, that if the pageout daemon is
342 * requesting a sync -- there might not be enough memory to do
343 * the bmap then... So, this is important to do.
344 */
345 if( bp->b_lblkno == bp->b_blkno) {
346 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
347 }
348
349 /*
350 * Set the *dirty* buffer range based upon the VM system dirty pages.
351 */
352 vfs_setdirty(bp);
353
354 /*
355 * We need to do this here to satisfy the vnode_pager and the
356 * pageout daemon, so that it thinks that the pages have been
357 * "cleaned". Note that since the pages are in a delayed write
358 * buffer -- the VFS layer "will" see that the pages get written
359 * out on the next sync, or perhaps the cluster will be completed.
360 */
361 vfs_clean_pages(bp);
362 brelse(bp);
363 return;
364}
365
366/*
367 * Asynchronous write.
368 * Start output on a buffer, but do not wait for it to complete.
369 * The buffer is released when the output completes.
370 */
371void
372bawrite(struct buf * bp)
373{
374 bp->b_flags |= B_ASYNC;
375 (void) VOP_BWRITE(bp);
376}
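/*
 * Taken together, the three write paths above differ only in when the
 * caller gives up the buffer (sketch; "bp" is a busy buffer obtained from
 * bread() or getblk()):
 *
 *	error = bwrite(bp);	synchronous -- waits for the I/O to finish
 *	bdwrite(bp);		delayed -- marks B_DELWRI, written out later
 *	bawrite(bp);		asynchronous -- starts the I/O, released by biodone
 */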
377
378/*
379 * Release a buffer.
380 */
381void
382brelse(struct buf * bp)
383{
384 int s;
385
386 if (bp->b_flags & B_CLUSTER) {
387 relpbuf(bp);
388 return;
389 }
390 /* anyone need a "free" block? */
391 s = splbio();
392
393 if (needsbuffer) {
394 needsbuffer = 0;
395 wakeup(&needsbuffer);
396 }
397
398 /* anyone need this block? */
399 if (bp->b_flags & B_WANTED) {
400 bp->b_flags &= ~(B_WANTED | B_AGE);
401 wakeup(bp);
402 }
403
404 if (bp->b_flags & B_LOCKED)
405 bp->b_flags &= ~B_ERROR;
406
407 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
408 (bp->b_bufsize <= 0)) {
409 bp->b_flags |= B_INVAL;
410 bp->b_flags &= ~(B_DELWRI | B_CACHE);
411 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
412 brelvp(bp);
413 }
414
415 /*
416 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
417 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
418 * but the VM object is kept around. The B_NOCACHE flag is used to
419 * invalidate the pages in the VM object.
420 */
421 if (bp->b_flags & B_VMIO) {
422 vm_ooffset_t foff;
423 vm_object_t obj;
424 int i, resid;
425 vm_page_t m;
426 struct vnode *vp;
427 int iototal = bp->b_bufsize;
428
429 vp = bp->b_vp;
430 if (!vp)
431 panic("brelse: missing vp");
432
433 if (bp->b_npages) {
434 vm_pindex_t poff;
435 obj = (vm_object_t) vp->v_object;
436 if (vp->v_type == VBLK)
437 foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
438 else
439 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
440 poff = OFF_TO_IDX(foff);
441 for (i = 0; i < bp->b_npages; i++) {
442 m = bp->b_pages[i];
443 if (m == bogus_page) {
444 m = vm_page_lookup(obj, poff + i);
445 if (!m) {
446 panic("brelse: page missing\n");
447 }
448 bp->b_pages[i] = m;
449 pmap_qenter(trunc_page(bp->b_data),
450 bp->b_pages, bp->b_npages);
451 }
452 resid = IDX_TO_OFF(m->pindex+1) - foff;
453 if (resid > iototal)
454 resid = iototal;
455 if (resid > 0) {
456 /*
457 * Don't invalidate the page if the local machine has already
458 * modified it. This is the lesser of two evils, and should
459 * be fixed.
460 */
461 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
462 vm_page_test_dirty(m);
463 if (m->dirty == 0) {
464 vm_page_set_invalid(m, (vm_offset_t) foff, resid);
465 if (m->valid == 0)
466 vm_page_protect(m, VM_PROT_NONE);
467 }
468 }
469 }
470 foff += resid;
471 iototal -= resid;
472 }
473 }
474
475 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
476 for(i = 0; i < bp->b_npages; i++) {
477 m = bp->b_pages[i];
478 --m->bmapped;
479 if (m->bmapped == 0) {
480 if (m->flags & PG_WANTED) {
481 m->flags &= ~PG_WANTED;
482 wakeup(m);
483 }
484 if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) {
485 if (m->object->flags & OBJ_MIGHTBEDIRTY) {
486 vm_page_test_dirty(m);
487 }
488 /*
489 * if page isn't valid, no sense in keeping it around
490 */
491 if (m->valid == 0) {
492 vm_page_protect(m, VM_PROT_NONE);
493 vm_page_free(m);
494 /*
495 * if page isn't dirty and hasn't been referenced by
496 * a process, then cache it
497 */
498 } else if ((m->dirty & m->valid) == 0 &&
499 (m->flags & PG_REFERENCED) == 0 &&
500 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
501 vm_page_cache(m);
502 /*
503 * otherwise activate it
504 */
505 } else if ((m->flags & PG_ACTIVE) == 0) {
506 vm_page_activate(m);
507 m->act_count = 0;
508 }
509 }
510 }
511 }
512 bufspace -= bp->b_bufsize;
513 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
514 bp->b_npages = 0;
515 bp->b_bufsize = 0;
516 bp->b_flags &= ~B_VMIO;
517 if (bp->b_vp)
518 brelvp(bp);
519 }
520 }
521 if (bp->b_qindex != QUEUE_NONE)
522 panic("brelse: free buffer onto another queue???");
523
524 /* enqueue */
525 /* buffers with no memory */
526 if (bp->b_bufsize == 0) {
527 bp->b_qindex = QUEUE_EMPTY;
528 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
529 LIST_REMOVE(bp, b_hash);
530 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
531 bp->b_dev = NODEV;
532 /* buffers with junk contents */
533 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
534 bp->b_qindex = QUEUE_AGE;
535 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
536 LIST_REMOVE(bp, b_hash);
537 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
538 bp->b_dev = NODEV;
539 /* buffers that are locked */
540 } else if (bp->b_flags & B_LOCKED) {
541 bp->b_qindex = QUEUE_LOCKED;
542 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
543 /* buffers with stale but valid contents */
544 } else if (bp->b_flags & B_AGE) {
545 bp->b_qindex = QUEUE_AGE;
546 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
 547	/* buffers with valid and quite potentially reusable contents */
548 } else {
549 bp->b_qindex = QUEUE_LRU;
550 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
551 }
552
553 /* unlock */
554 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
555 splx(s);
556}
557
558/*
559 * Check to see if a block is currently memory resident.
560 */
561__inline struct buf *
562gbincore(struct vnode * vp, daddr_t blkno)
563{
564 struct buf *bp;
565 struct bufhashhdr *bh;
566
567 bh = BUFHASH(vp, blkno);
568 bp = bh->lh_first;
569
570 /* Search hash chain */
571 while (bp != NULL) {
572 /* hit */
573 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
574 (bp->b_flags & B_INVAL) == 0) {
575 break;
576 }
577 bp = bp->b_hash.le_next;
578 }
579 return (bp);
580}
581
582/*
583 * this routine implements clustered async writes for
584 * clearing out B_DELWRI buffers... This is much better
585 * than the old way of writing only one buffer at a time.
586 */
587int
588vfs_bio_awrite(struct buf * bp)
589{
590 int i;
591 daddr_t lblkno = bp->b_lblkno;
592 struct vnode *vp = bp->b_vp;
593 int s;
594 int ncl;
595 struct buf *bpa;
596 int nwritten;
597
598 s = splbio();
599 /*
600 * right now we support clustered writing only to regular files
601 */
602 if ((vp->v_type == VREG) &&
603 (vp->v_mount != 0) && /* Only on nodes that have the size info */
604 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
605 int size;
606 int maxcl;
607
608 size = vp->v_mount->mnt_stat.f_iosize;
609 maxcl = MAXPHYS / size;
610
611 for (i = 1; i < maxcl; i++) {
612 if ((bpa = gbincore(vp, lblkno + i)) &&
613 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
614 (B_DELWRI | B_CLUSTEROK)) &&
615 (bpa->b_bufsize == size)) {
616 if ((bpa->b_blkno == bpa->b_lblkno) ||
617 (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
618 break;
619 } else {
620 break;
621 }
622 }
623 ncl = i;
624 /*
625 * this is a possible cluster write
626 */
627 if (ncl != 1) {
628 nwritten = cluster_wbuild(vp, size, lblkno, ncl);
629 splx(s);
630 return nwritten;
631 }
632 }
633 bremfree(bp);
634 splx(s);
635 /*
636 * default (old) behavior, writing out only one block
637 */
638 bp->b_flags |= B_BUSY | B_ASYNC;
639 nwritten = bp->b_bufsize;
640 (void) VOP_BWRITE(bp);
641 return nwritten;
642}
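/*
 * Example of the cluster sizing above: with an 8K f_iosize and a 64K
 * MAXPHYS (both assumed here purely for illustration), maxcl == 8, so at
 * most eight contiguous, delayed-write, cluster-ok buffers are handed to
 * cluster_wbuild() in one shot.
 */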
643
644
645/*
646 * Find a buffer header which is available for use.
647 */
648static struct buf *
649getnewbuf(int slpflag, int slptimeo, int doingvmio)
650{
651 struct buf *bp;
652 int s;
653 int nbyteswritten = 0;
654
655 s = splbio();
656start:
657 if (bufspace >= maxbufspace)
658 goto trytofreespace;
659
660 /* can we constitute a new buffer? */
661 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
662 if (bp->b_qindex != QUEUE_EMPTY)
663 panic("getnewbuf: inconsistent EMPTY queue");
664 bremfree(bp);
665 goto fillbuf;
666 }
667trytofreespace:
668 /*
669 * We keep the file I/O from hogging metadata I/O
670 * This is desirable because file data is cached in the
671 * VM/Buffer cache even if a buffer is freed.
672 */
673 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
674 if (bp->b_qindex != QUEUE_AGE)
675 panic("getnewbuf: inconsistent AGE queue");
676 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
677 if (bp->b_qindex != QUEUE_LRU)
678 panic("getnewbuf: inconsistent LRU queue");
679 }
680 if (!bp) {
681 /* wait for a free buffer of any kind */
682 needsbuffer = 1;
683 tsleep(&needsbuffer,
684 (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
685 splx(s);
686 return (0);
687 }
688
689 if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
690 --bp->b_usecount;
691 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
692 if (bufqueues[QUEUE_LRU].tqh_first != NULL) {
693 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
694 goto start;
695 }
696 }
697
698 /* if we are a delayed write, convert to an async write */
699 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
700 nbyteswritten += vfs_bio_awrite(bp);
701 if (!slpflag && !slptimeo) {
702 splx(s);
703 return (0);
704 }
705 goto start;
706 }
707
708 if (bp->b_flags & B_WANTED) {
709 bp->b_flags &= ~B_WANTED;
710 wakeup(bp);
711 }
712 bremfree(bp);
713
714 if (bp->b_flags & B_VMIO) {
715 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
716 brelse(bp);
717 bremfree(bp);
718 }
719
720 if (bp->b_vp)
721 brelvp(bp);
722
723 /* we are not free, nor do we contain interesting data */
724 if (bp->b_rcred != NOCRED)
725 crfree(bp->b_rcred);
726 if (bp->b_wcred != NOCRED)
727 crfree(bp->b_wcred);
728fillbuf:
729 bp->b_flags |= B_BUSY;
730 LIST_REMOVE(bp, b_hash);
731 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
732 splx(s);
733 if (bp->b_bufsize) {
734 allocbuf(bp, 0);
735 }
736 bp->b_flags = B_BUSY;
737 bp->b_dev = NODEV;
738 bp->b_vp = NULL;
739 bp->b_blkno = bp->b_lblkno = 0;
740 bp->b_iodone = 0;
741 bp->b_error = 0;
742 bp->b_resid = 0;
743 bp->b_bcount = 0;
744 bp->b_npages = 0;
745 bp->b_wcred = bp->b_rcred = NOCRED;
746 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
747 bp->b_dirtyoff = bp->b_dirtyend = 0;
748 bp->b_validoff = bp->b_validend = 0;
749 bp->b_usecount = 2;
750 if (bufspace >= maxbufspace + nbyteswritten) {
751 s = splbio();
752 bp->b_flags |= B_INVAL;
753 brelse(bp);
754 goto trytofreespace;
755 }
756 return (bp);
757}
758
759/*
760 * Check to see if a block is currently memory resident.
761 */
762struct buf *
763incore(struct vnode * vp, daddr_t blkno)
764{
765 struct buf *bp;
766 struct bufhashhdr *bh;
767
768 int s = splbio();
769
770 bh = BUFHASH(vp, blkno);
771 bp = bh->lh_first;
772
773 /* Search hash chain */
774 while (bp != NULL) {
775 /* hit */
776 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
777 (bp->b_flags & B_INVAL) == 0) {
778 break;
779 }
780 bp = bp->b_hash.le_next;
781 }
782 splx(s);
783 return (bp);
784}
785
786/*
787 * Returns true if no I/O is needed to access the
788 * associated VM object. This is like incore except
789 * it also hunts around in the VM system for the data.
790 */
791
792int
793inmem(struct vnode * vp, daddr_t blkno)
794{
795 vm_object_t obj;
796 vm_offset_t toff, tinc;
797 vm_page_t m;
798 vm_ooffset_t off;
799
800 if (incore(vp, blkno))
801 return 1;
802 if (vp->v_mount == NULL)
803 return 0;
804 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
805 return 0;
806
807 obj = vp->v_object;
808 tinc = PAGE_SIZE;
809 if (tinc > vp->v_mount->mnt_stat.f_iosize)
810 tinc = vp->v_mount->mnt_stat.f_iosize;
811 off = blkno * vp->v_mount->mnt_stat.f_iosize;
812
813 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
814
815 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
816 if (!m)
817 return 0;
818 if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
819 return 0;
820 }
821 return 1;
822}
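/*
 * inmem() is what lets breadn() above skip read-ahead blocks whose data is
 * already resident: e.g. for an 8K f_iosize on 4K pages (assumed here), it
 * checks the two pages covering the block and reports true only if every
 * byte of both is valid.
 */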
823
824/*
825 * now we set the dirty range for the buffer --
826 * for NFS -- if the file is mapped and pages have
827 * been written to, let it know. We want the
828 * entire range of the buffer to be marked dirty if
 829 * any of the pages have been written to for consistency
830 * with the b_validoff, b_validend set in the nfs write
831 * code, and used by the nfs read code.
832 */
833static void
834vfs_setdirty(struct buf *bp) {
835 int i;
836 vm_object_t object;
837 vm_offset_t boffset, offset;
838 /*
839 * We qualify the scan for modified pages on whether the
840 * object has been flushed yet. The OBJ_WRITEABLE flag
841 * is not cleared simply by protecting pages off.
842 */
843 if ((bp->b_flags & B_VMIO) &&
844 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
845 /*
846 * test the pages to see if they have been modified directly
847 * by users through the VM system.
848 */
849 for (i = 0; i < bp->b_npages; i++)
850 vm_page_test_dirty(bp->b_pages[i]);
851
852 /*
853 * scan forwards for the first page modified
854 */
855 for (i = 0; i < bp->b_npages; i++) {
856 if (bp->b_pages[i]->dirty) {
857 break;
858 }
859 }
860 boffset = (i << PAGE_SHIFT);
861 if (boffset < bp->b_dirtyoff) {
862 bp->b_dirtyoff = boffset;
863 }
864
865 /*
866 * scan backwards for the last page modified
867 */
868 for (i = bp->b_npages - 1; i >= 0; --i) {
869 if (bp->b_pages[i]->dirty) {
870 break;
871 }
872 }
873 boffset = (i + 1);
874 offset = boffset + bp->b_pages[0]->pindex;
875 if (offset >= object->size)
876 boffset = object->size - bp->b_pages[0]->pindex;
877 if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
878 bp->b_dirtyend = (boffset << PAGE_SHIFT);
879 }
880}
881
882/*
883 * Get a block given a specified block and offset into a file/device.
884 */
885struct buf *
886getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
887{
888 struct buf *bp;
889 int s;
890 struct bufhashhdr *bh;
891
892 s = splbio();
893loop:
894 if ((bp = gbincore(vp, blkno))) {
895 if (bp->b_flags & B_BUSY) {
896 bp->b_flags |= B_WANTED;
897 if (bp->b_usecount < BUF_MAXUSE)
898 ++bp->b_usecount;
899 if (!tsleep(bp,
900 (PRIBIO + 1) | slpflag, "getblk", slptimeo))
901 goto loop;
902
903 splx(s);
904 return (struct buf *) NULL;
905 }
906 bp->b_flags |= B_BUSY | B_CACHE;
907 bremfree(bp);
908
909 /*
 910		 * check for size inconsistencies (note that they shouldn't happen
911 * but do when filesystems don't handle the size changes correctly.)
912 * We are conservative on metadata and don't just extend the buffer
913 * but write and re-constitute it.
914 */
915
916 if (bp->b_bcount != size) {
917 if (bp->b_flags & B_VMIO) {
918 allocbuf(bp, size);
919 } else {
920 bp->b_flags |= B_NOCACHE;
921 VOP_BWRITE(bp);
922 goto loop;
923 }
924 }
925
926 /*
927 * make sure that all pages in the buffer are valid, if they
928 * aren't, clear the cache flag.
929 * ASSUMPTION:
930 * if the buffer is greater than 1 page in size, it is assumed
931 * that the buffer address starts on a page boundary...
932 */
933 if (bp->b_flags & B_VMIO) {
934 int szleft, i;
935 szleft = size;
936 for (i=0;i<bp->b_npages;i++) {
937 if (szleft > PAGE_SIZE) {
938 if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) !=
939 VM_PAGE_BITS_ALL) {
940 bp->b_flags &= ~(B_CACHE|B_DONE);
941 break;
942 }
943 szleft -= PAGE_SIZE;
944 } else {
945 if (!vm_page_is_valid(bp->b_pages[i],
946 (((vm_offset_t) bp->b_data) & PAGE_MASK),
947 szleft)) {
948 bp->b_flags &= ~(B_CACHE|B_DONE);
949 break;
950 }
951 szleft = 0;
952 }
953 }
954 }
955 if (bp->b_usecount < BUF_MAXUSE)
956 ++bp->b_usecount;
957 splx(s);
958 return (bp);
959 } else {
960 vm_object_t obj;
961 int doingvmio;
962
963 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
964 doingvmio = 1;
965 } else {
966 doingvmio = 0;
967 }
968 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
969 if (slpflag || slptimeo) {
970 splx(s);
971 return NULL;
972 }
973 goto loop;
974 }
975
976 /*
977 * This code is used to make sure that a buffer is not
978 * created while the getnewbuf routine is blocked.
979 * Normally the vnode is locked so this isn't a problem.
980 * VBLK type I/O requests, however, don't lock the vnode.
981 */
982 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
983 bp->b_flags |= B_INVAL;
984 brelse(bp);
985 goto loop;
986 }
987
988 /*
989 * Insert the buffer into the hash, so that it can
990 * be found by incore.
991 */
992 bp->b_blkno = bp->b_lblkno = blkno;
993 bgetvp(vp, bp);
994 LIST_REMOVE(bp, b_hash);
995 bh = BUFHASH(vp, blkno);
996 LIST_INSERT_HEAD(bh, bp, b_hash);
997
998 if (doingvmio) {
999 bp->b_flags |= (B_VMIO | B_CACHE);
1000#if defined(VFS_BIO_DEBUG)
1001 if (vp->v_type != VREG)
1002 printf("getblk: vmioing file type %d???\n", vp->v_type);
1003#endif
1004 } else {
1005 bp->b_flags &= ~B_VMIO;
1006 }
1007 splx(s);
1008
1009 allocbuf(bp, size);
1010 return (bp);
1011 }
1012}
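/*
 * A sketch of getblk() used for a freshly allocated block that the caller
 * fills in itself, so no read is issued (identifiers are placeholders):
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	vfs_bio_clrbuf(bp);		zero any part that isn't already valid
 *	bcopy(src, bp->b_data, len);	len <= bsize, caller-supplied data
 *	bdwrite(bp);			mark it dirty; written on a later sync
 */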
1013
1014/*
1015 * Get an empty, disassociated buffer of given size.
1016 */
1017struct buf *
1018geteblk(int size)
1019{
1020 struct buf *bp;
1021
1022 while ((bp = getnewbuf(0, 0, 0)) == 0);
1023 allocbuf(bp, size);
1024 bp->b_flags |= B_INVAL;
1025 return (bp);
1026}
1027
1028/*
1029 * This code constitutes the buffer memory from either anonymous system
1030 * memory (in the case of non-VMIO operations) or from an associated
1031 * VM object (in the case of VMIO operations).
1032 *
1033 * Note that this code is tricky, and has many complications to resolve
 1034 * deadlock or inconsistent data situations. Tread lightly!!!
1035 *
1036 * Modify the length of a buffer's underlying buffer storage without
1037 * destroying information (unless, of course the buffer is shrinking).
1038 */
1039int
1040allocbuf(struct buf * bp, int size)
1041{
1042
1043 int s;
1044 int newbsize, mbsize;
1045 int i;
1046
1047 if (!(bp->b_flags & B_BUSY))
1048 panic("allocbuf: buffer not busy");
1049
1050 if ((bp->b_flags & B_VMIO) == 0) {
1051 /*
1052 * Just get anonymous memory from the kernel
1053 */
1054 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1055 newbsize = round_page(size);
1056
1057 if (newbsize < bp->b_bufsize) {
1058 vm_hold_free_pages(
1059 bp,
1060 (vm_offset_t) bp->b_data + newbsize,
1061 (vm_offset_t) bp->b_data + bp->b_bufsize);
1062 } else if (newbsize > bp->b_bufsize) {
1063 vm_hold_load_pages(
1064 bp,
1065 (vm_offset_t) bp->b_data + bp->b_bufsize,
1066 (vm_offset_t) bp->b_data + newbsize);
1067 }
1068 } else {
1069 vm_page_t m;
1070 int desiredpages;
1071
1072 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1073 desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1074
1075 if (newbsize < bp->b_bufsize) {
1076 if (desiredpages < bp->b_npages) {
1077 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1078 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1079 for (i = desiredpages; i < bp->b_npages; i++) {
1080 m = bp->b_pages[i];
1081 s = splhigh();
1082 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1083 m->flags |= PG_WANTED;
1084 tsleep(m, PVM, "biodep", 0);
1085 }
1086 splx(s);
1087
1088 if (m->bmapped == 0) {
1089 printf("allocbuf: bmapped is zero for page %d\n", i);
1090 panic("allocbuf: error");
1091 }
1092 --m->bmapped;
1093 if (m->bmapped == 0) {
1094 vm_page_protect(m, VM_PROT_NONE);
1095 vm_page_free(m);
1096 }
1097 bp->b_pages[i] = NULL;
1098 }
1099 bp->b_npages = desiredpages;
1100 }
1101 } else if (newbsize > bp->b_bufsize) {
1102 vm_object_t obj;
1103 vm_offset_t tinc, toff;
1104 vm_ooffset_t off;
1105 vm_pindex_t objoff;
1106 int pageindex, curbpnpages;
1107 struct vnode *vp;
1108 int bsize;
1109
1110 vp = bp->b_vp;
1111
1112 if (vp->v_type == VBLK)
1113 bsize = DEV_BSIZE;
1114 else
1115 bsize = vp->v_mount->mnt_stat.f_iosize;
1116
1117 if (bp->b_npages < desiredpages) {
1118 obj = vp->v_object;
1119 tinc = PAGE_SIZE;
1120 if (tinc > bsize)
1121 tinc = bsize;
1122 off = (vm_ooffset_t) bp->b_lblkno * bsize;
1123 doretry:
1124 curbpnpages = bp->b_npages;
1125 bp->b_flags |= B_CACHE;
1126 for (toff = 0; toff < newbsize; toff += tinc) {
1127 int bytesinpage;
1128
1129 pageindex = toff >> PAGE_SHIFT;
1130 objoff = OFF_TO_IDX(off + toff);
1131 if (pageindex < curbpnpages) {
1132
1133 m = bp->b_pages[pageindex];
1134 if (m->pindex != objoff)
1135 panic("allocbuf: page changed offset??!!!?");
1136 bytesinpage = tinc;
1137 if (tinc > (newbsize - toff))
1138 bytesinpage = newbsize - toff;
1139 if (!vm_page_is_valid(m,
1140 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1141 bytesinpage)) {
1142 bp->b_flags &= ~B_CACHE;
1143 }
1144 if ((m->flags & PG_ACTIVE) == 0) {
1145 vm_page_activate(m);
1146 m->act_count = 0;
1147 }
1148 continue;
1149 }
1150 m = vm_page_lookup(obj, objoff);
1151 if (!m) {
1152 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1153 if (!m) {
1154 int j;
1155
1156 for (j = bp->b_npages; j < pageindex; j++) {
1157 PAGE_WAKEUP(bp->b_pages[j]);
1158 }
1159 VM_WAIT;
1160 goto doretry;
1161 }
1162 vm_page_activate(m);
1163 m->act_count = 0;
1164 m->valid = 0;
1165 bp->b_flags &= ~B_CACHE;
1166 } else if (m->flags & PG_BUSY) {
1167 int j;
1168
1169 for (j = bp->b_npages; j < pageindex; j++) {
1170 PAGE_WAKEUP(bp->b_pages[j]);
1171 }
1172
1173 s = splbio();
1174 m->flags |= PG_WANTED;
1175 tsleep(m, PVM, "pgtblk", 0);
1176 splx(s);
1177
1178 goto doretry;
1179 } else {
1180 if ((curproc != pageproc) &&
1181 (m->flags & PG_CACHE) &&
1182 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1183 pagedaemon_wakeup();
1184 }
1185 bytesinpage = tinc;
1186 if (tinc > (newbsize - toff))
1187 bytesinpage = newbsize - toff;
1188 if (!vm_page_is_valid(m,
1189 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1190 bytesinpage)) {
1191 bp->b_flags &= ~B_CACHE;
1192 }
1193 if ((m->flags & PG_ACTIVE) == 0) {
1194 vm_page_activate(m);
1195 m->act_count = 0;
1196 }
1197 m->flags |= PG_BUSY;
1198 }
1199 bp->b_pages[pageindex] = m;
1200 curbpnpages = pageindex + 1;
1201 }
1202 for (i = bp->b_npages; i < curbpnpages; i++) {
1203 m = bp->b_pages[i];
1204 m->bmapped++;
1205 PAGE_WAKEUP(m);
1206 }
1207 bp->b_npages = curbpnpages;
1208 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1209 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1210 bp->b_data += off & (PAGE_SIZE - 1);
1211 }
1212 }
1213 }
1214 bufspace += (newbsize - bp->b_bufsize);
1215 bp->b_bufsize = newbsize;
1216 bp->b_bcount = size;
1217 return 1;
1218}
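/*
 * Sizing example for the non-VMIO case above (assuming a 4K PAGE_SIZE):
 * allocbuf(bp, 6144) rounds the backing store up to a page boundary, so
 * b_bufsize becomes 8192 while b_bcount stays at the requested 6144.
 */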
1219
1220/*
1221 * Wait for buffer I/O completion, returning error status.
1222 */
1223int
1224biowait(register struct buf * bp)
1225{
1226 int s;
1227
1228 s = splbio();
1229 while ((bp->b_flags & B_DONE) == 0)
1230 tsleep(bp, PRIBIO, "biowait", 0);
1231 splx(s);
1232 if (bp->b_flags & B_EINTR) {
1233 bp->b_flags &= ~B_EINTR;
1234 return (EINTR);
1235 }
1236 if (bp->b_flags & B_ERROR) {
1237 return (bp->b_error ? bp->b_error : EIO);
1238 } else {
1239 return (0);
1240 }
1241}
1242
1243/*
1244 * Finish I/O on a buffer, calling an optional function.
1245 * This is usually called from interrupt level, so process blocking
1246 * is not *a good idea*.
1247 */
1248void
1249biodone(register struct buf * bp)
1250{
1251 int s;
1252
1253 s = splbio();
1254 if (!(bp->b_flags & B_BUSY))
1255 panic("biodone: buffer not busy");
1256
1257 if (bp->b_flags & B_DONE) {
1258 splx(s);
1259 printf("biodone: buffer already done\n");
1260 return;
1261 }
1262 bp->b_flags |= B_DONE;
1263
1264 if ((bp->b_flags & B_READ) == 0) {
1265 vwakeup(bp);
1266 }
1267#ifdef BOUNCE_BUFFERS
1268 if (bp->b_flags & B_BOUNCE)
1269 vm_bounce_free(bp);
1270#endif
1271
1272 /* call optional completion function if requested */
1273 if (bp->b_flags & B_CALL) {
1274 bp->b_flags &= ~B_CALL;
1275 (*bp->b_iodone) (bp);
1276 splx(s);
1277 return;
1278 }
1279 if (bp->b_flags & B_VMIO) {
1280 int i, resid;
1281 vm_ooffset_t foff;
1282 vm_page_t m;
1283 vm_object_t obj;
1284 int iosize;
1285 struct vnode *vp = bp->b_vp;
1286
1287 if (vp->v_type == VBLK)
1288 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1289 else
1290 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1291 obj = vp->v_object;
1292 if (!obj) {
1293 panic("biodone: no object");
1294 }
1295#if defined(VFS_BIO_DEBUG)
1296 if (obj->paging_in_progress < bp->b_npages) {
1297 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1298 obj->paging_in_progress, bp->b_npages);
1299 }
1300#endif
1301 iosize = bp->b_bufsize;
1302 for (i = 0; i < bp->b_npages; i++) {
1303 int bogusflag = 0;
1304 m = bp->b_pages[i];
1305 if (m == bogus_page) {
1306 bogusflag = 1;
1307 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1308 if (!m) {
1309#if defined(VFS_BIO_DEBUG)
1310 printf("biodone: page disappeared\n");
1311#endif
1312 --obj->paging_in_progress;
1313 continue;
1314 }
1315 bp->b_pages[i] = m;
1316 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1317 }
1318#if defined(VFS_BIO_DEBUG)
1319 if (OFF_TO_IDX(foff) != m->pindex) {
1320 printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1321 }
1322#endif
1323 resid = IDX_TO_OFF(m->pindex + 1) - foff;
1324 if (resid > iosize)
1325 resid = iosize;
1326 /*
1327 * In the write case, the valid and clean bits are
1328 * already changed correctly, so we only need to do this
1329 * here in the read case.
1330 */
1331 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1332 vm_page_set_validclean(m,
1333 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1334 }
1335
1336 /*
1337 * when debugging new filesystems or buffer I/O methods, this
1338 * is the most common error that pops up. if you see this, you
1339 * have not set the page busy flag correctly!!!
1340 */
1341 if (m->busy == 0) {
1342 printf("biodone: page busy < 0, "
1343 "pindex: %d, foff: 0x(%x,%x), "
1344 "resid: %d, index: %d\n",
1345 (int) m->pindex, (int)(foff >> 32),
1346 (int) foff & 0xffffffff, resid, i);
1347 if (vp->v_type != VBLK)
1348 printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n",
1349 bp->b_vp->v_mount->mnt_stat.f_iosize,
1350 (int) bp->b_lblkno,
1351 bp->b_flags, bp->b_npages);
1352 else
1353 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1354 (int) bp->b_lblkno,
1355 bp->b_flags, bp->b_npages);
1356 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1357 m->valid, m->dirty, m->bmapped);
1358 panic("biodone: page busy < 0\n");
1359 }
1360 --m->busy;
1361 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1362 m->flags &= ~PG_WANTED;
1363 wakeup(m);
1364 }
1365 --obj->paging_in_progress;
1366 foff += resid;
1367 iosize -= resid;
1368 }
1369 if (obj && obj->paging_in_progress == 0 &&
1370 (obj->flags & OBJ_PIPWNT)) {
1371 obj->flags &= ~OBJ_PIPWNT;
1372 wakeup(obj);
1373 }
1374 }
1375 /*
1376 * For asynchronous completions, release the buffer now. The brelse
1377 * checks for B_WANTED and will do the wakeup there if necessary - so
1378 * no need to do a wakeup here in the async case.
1379 */
1380
1381 if (bp->b_flags & B_ASYNC) {
1382 brelse(bp);
1383 } else {
1384 wakeup(bp);
1385 }
1386 splx(s);
1387}
1388
1389int
1390count_lock_queue()
1391{
1392 int count;
1393 struct buf *bp;
1394
1395 count = 0;
1396 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1397 bp != NULL;
1398 bp = bp->b_freelist.tqe_next)
1399 count++;
1400 return (count);
1401}
1402
1403int vfs_update_interval = 30;
1404
1405static void
1406vfs_update()
1407{
1408 (void) spl0(); /* XXX redundant? wrong place? */
1409 while (1) {
1410 tsleep(&vfs_update_wakeup, PUSER, "update",
1411 hz * vfs_update_interval);
1412 vfs_update_wakeup = 0;
1413 sync(curproc, NULL, NULL);
1414 }
1415}
1416
1417static int
1418sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1419{
1420 int error = sysctl_handle_int(oidp,
1421 oidp->oid_arg1, oidp->oid_arg2, req);
1422 if (!error)
1423 wakeup(&vfs_update_wakeup);
1424 return error;
1425}
1426
1427SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1428 &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
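/*
 * The interval is tunable while the system is running; with the sysctl(8)
 * syntax of the period this would look something like:
 *
 *	sysctl -w kern.update=10
 *
 * (the handler above also wakes the update daemon, which syncs and then
 * re-sleeps using the new interval).
 */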
1429
1430
1431/*
1432 * This routine is called in lieu of iodone in the case of
1433 * incomplete I/O. This keeps the busy status for pages
 1434 * consistent.
1435 */
1436void
1437vfs_unbusy_pages(struct buf * bp)
1438{
1439 int i;
1440
1441 if (bp->b_flags & B_VMIO) {
1442 struct vnode *vp = bp->b_vp;
1443 vm_object_t obj = vp->v_object;
1444 vm_ooffset_t foff;
1445
1446 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1447
1448 for (i = 0; i < bp->b_npages; i++) {
1449 vm_page_t m = bp->b_pages[i];
1450
1451 if (m == bogus_page) {
1452 m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1453 if (!m) {
1454 panic("vfs_unbusy_pages: page missing\n");
1455 }
1456 bp->b_pages[i] = m;
1457 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1458 }
1459 --obj->paging_in_progress;
1460 --m->busy;
1461 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1462 m->flags &= ~PG_WANTED;
1463 wakeup(m);
1464 }
1465 }
1466 if (obj->paging_in_progress == 0 &&
1467 (obj->flags & OBJ_PIPWNT)) {
1468 obj->flags &= ~OBJ_PIPWNT;
1469 wakeup(obj);
1470 }
1471 }
1472}
1473
1474/*
1475 * This routine is called before a device strategy routine.
1476 * It is used to tell the VM system that paging I/O is in
1477 * progress, and treat the pages associated with the buffer
1478 * almost as being PG_BUSY. Also the object paging_in_progress
1479 * flag is handled to make sure that the object doesn't become
 1480 * inconsistent.
1481 */
1482void
1483vfs_busy_pages(struct buf * bp, int clear_modify)
1484{
1485 int i;
1486
1487 if (bp->b_flags & B_VMIO) {
1488 vm_object_t obj = bp->b_vp->v_object;
1489 vm_ooffset_t foff;
1490 int iocount = bp->b_bufsize;
1491
1492 if (bp->b_vp->v_type == VBLK)
1493 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1494 else
1495 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1496 vfs_setdirty(bp);
1497 for (i = 0; i < bp->b_npages; i++) {
1498 vm_page_t m = bp->b_pages[i];
1499 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1500
1501 if (resid > iocount)
1502 resid = iocount;
1503 if ((bp->b_flags & B_CLUSTER) == 0) {
1504 obj->paging_in_progress++;
1505 m->busy++;
1506 }
1507 if (clear_modify) {
1508 vm_page_protect(m, VM_PROT_READ);
1509 vm_page_set_validclean(m,
1510 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1511 } else if (bp->b_bcount >= PAGE_SIZE) {
1512 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1513 bp->b_pages[i] = bogus_page;
1514 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1515 }
1516 }
1517 foff += resid;
1518 iocount -= resid;
1519 }
1520 }
1521}
1522
1523/*
1524 * Tell the VM system that the pages associated with this buffer
1525 * are clean. This is used for delayed writes where the data is
 1526 * going to go to disk eventually without additional VM intervention.
1527 */
1528void
1529vfs_clean_pages(struct buf * bp)
1530{
1531 int i;
1532
1533 if (bp->b_flags & B_VMIO) {
1534 vm_ooffset_t foff;
1535 int iocount = bp->b_bufsize;
1536
1537 if (bp->b_vp->v_type == VBLK)
1538 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1539 else
1540 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1541
1542 for (i = 0; i < bp->b_npages; i++) {
1543 vm_page_t m = bp->b_pages[i];
1544 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1545
1546 if (resid > iocount)
1547 resid = iocount;
1548 if (resid > 0) {
1549 vm_page_set_validclean(m,
1550 ((vm_offset_t) foff & (PAGE_SIZE-1)), resid);
1551 }
1552 foff += resid;
1553 iocount -= resid;
1554 }
1555 }
1556}
1557
1558void
1559vfs_bio_clrbuf(struct buf *bp) {
1560 int i;
1561 if( bp->b_flags & B_VMIO) {
1562 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1563 int mask;
1564 mask = 0;
1565 for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1566 mask |= (1 << (i/DEV_BSIZE));
1567 if( bp->b_pages[0]->valid != mask) {
1568 bzero(bp->b_data, bp->b_bufsize);
1569 }
1570 bp->b_pages[0]->valid = mask;
1571 bp->b_resid = 0;
1572 return;
1573 }
1574 for(i=0;i<bp->b_npages;i++) {
1575 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1576 continue;
1577 if( bp->b_pages[i]->valid == 0) {
1578 if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
1579 bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1580 } else {
1581 int j;
1582 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1583 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1584 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1585 }
1586 }
1587 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1588 }
1589 bp->b_resid = 0;
1590 } else {
1591 clrbuf(bp);
1592 }
1593}
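/*
 * Example of the partial-page valid mask computed above: for a buffer with
 * b_bufsize == 3 * DEV_BSIZE (1536 bytes with the usual 512-byte DEV_BSIZE),
 * the loop sets bits 0..2, giving mask == 0x7, i.e. the first three
 * 512-byte chunks of the page are treated as valid.
 */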
1594
1595/*
 1596 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
 1597 * a buffer's address space. The pages are anonymous and are
1598 * not associated with a file object.
1599 */
1600void
1601vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1602{
1603 vm_offset_t pg;
1604 vm_page_t p;
1605 vm_offset_t from = round_page(froma);
1606 vm_offset_t to = round_page(toa);
1607
1608 for (pg = from; pg < to; pg += PAGE_SIZE) {
1609
1610tryagain:
1611
1612 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1613 VM_ALLOC_NORMAL);
1614 if (!p) {
1615 VM_WAIT;
1616 goto tryagain;
1617 }
1618 vm_page_wire(p);
1619 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1620 bp->b_pages[((caddr_t) pg - bp->b_data) >> PAGE_SHIFT] = p;
1621 PAGE_WAKEUP(p);
1622 bp->b_npages++;
1623 }
1624}
1625
1626void
1627vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1628{
1629 vm_offset_t pg;
1630 vm_page_t p;
1631 vm_offset_t from = round_page(froma);
1632 vm_offset_t to = round_page(toa);
1633
1634 for (pg = from; pg < to; pg += PAGE_SIZE) {
1635 int index = ((caddr_t) pg - bp->b_data) >> PAGE_SHIFT;
1636 p = bp->b_pages[index];
1637 bp->b_pages[index] = 0;
1638 pmap_kremove(pg);
1639 vm_page_free(p);
1640 --bp->b_npages;
1641 }
1642}
108
109#define BUF_MAXUSE 8
110
111/*
112 * Initialize buffer headers and related structures.
113 */
114void
115bufinit()
116{
117 struct buf *bp;
118 int i;
119
120 TAILQ_INIT(&bswlist);
121 LIST_INIT(&invalhash);
122
123 /* first, make a null hash table */
124 for (i = 0; i < BUFHSZ; i++)
125 LIST_INIT(&bufhashtbl[i]);
126
127 /* next, make a null set of free lists */
128 for (i = 0; i < BUFFER_QUEUES; i++)
129 TAILQ_INIT(&bufqueues[i]);
130
131 buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
132 /* finally, initialize each buffer header and stick on empty q */
133 for (i = 0; i < nbuf; i++) {
134 bp = &buf[i];
135 bzero(bp, sizeof *bp);
136 bp->b_flags = B_INVAL; /* we're just an empty header */
137 bp->b_dev = NODEV;
138 bp->b_rcred = NOCRED;
139 bp->b_wcred = NOCRED;
140 bp->b_qindex = QUEUE_EMPTY;
141 bp->b_vnbufs.le_next = NOLIST;
142 bp->b_data = buffers_kva + i * MAXBSIZE;
143 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
144 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
145 }
146/*
147 * maxbufspace is currently calculated to support all filesystem blocks
148 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
149 * cache is still the same as it would be for 8K filesystems. This
150 * keeps the size of the buffer cache "in check" for big block filesystems.
151 */
152 maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
153
154 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
155 bogus_page = vm_page_alloc(kernel_object,
156 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
157 VM_ALLOC_NORMAL);
158
159}
160
161/*
162 * remove the buffer from the appropriate free list
163 */
164void
165bremfree(struct buf * bp)
166{
167 int s = splbio();
168
169 if (bp->b_qindex != QUEUE_NONE) {
170 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
171 bp->b_qindex = QUEUE_NONE;
172 } else {
173 panic("bremfree: removing a buffer when not on a queue");
174 }
175 splx(s);
176}
177
178/*
179 * Get a buffer with the specified data. Look in the cache first.
180 */
181int
182bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
183 struct buf ** bpp)
184{
185 struct buf *bp;
186
187 bp = getblk(vp, blkno, size, 0, 0);
188 *bpp = bp;
189
190 /* if not found in cache, do some I/O */
191 if ((bp->b_flags & B_CACHE) == 0) {
192 if (curproc != NULL)
193 curproc->p_stats->p_ru.ru_inblock++;
194 bp->b_flags |= B_READ;
195 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
196 if (bp->b_rcred == NOCRED) {
197 if (cred != NOCRED)
198 crhold(cred);
199 bp->b_rcred = cred;
200 }
201 vfs_busy_pages(bp, 0);
202 VOP_STRATEGY(bp);
203 return (biowait(bp));
204 }
205 return (0);
206}
207
208/*
209 * Operates like bread, but also starts asynchronous I/O on
210 * read-ahead blocks.
211 */
212int
213breadn(struct vnode * vp, daddr_t blkno, int size,
214 daddr_t * rablkno, int *rabsize,
215 int cnt, struct ucred * cred, struct buf ** bpp)
216{
217 struct buf *bp, *rabp;
218 int i;
219 int rv = 0, readwait = 0;
220
221 *bpp = bp = getblk(vp, blkno, size, 0, 0);
222
223 /* if not found in cache, do some I/O */
224 if ((bp->b_flags & B_CACHE) == 0) {
225 if (curproc != NULL)
226 curproc->p_stats->p_ru.ru_inblock++;
227 bp->b_flags |= B_READ;
228 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
229 if (bp->b_rcred == NOCRED) {
230 if (cred != NOCRED)
231 crhold(cred);
232 bp->b_rcred = cred;
233 }
234 vfs_busy_pages(bp, 0);
235 VOP_STRATEGY(bp);
236 ++readwait;
237 }
238 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
239 if (inmem(vp, *rablkno))
240 continue;
241 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
242
243 if ((rabp->b_flags & B_CACHE) == 0) {
244 if (curproc != NULL)
245 curproc->p_stats->p_ru.ru_inblock++;
246 rabp->b_flags |= B_READ | B_ASYNC;
247 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
248 if (rabp->b_rcred == NOCRED) {
249 if (cred != NOCRED)
250 crhold(cred);
251 rabp->b_rcred = cred;
252 }
253 vfs_busy_pages(rabp, 0);
254 VOP_STRATEGY(rabp);
255 } else {
256 brelse(rabp);
257 }
258 }
259
260 if (readwait) {
261 rv = biowait(bp);
262 }
263 return (rv);
264}
265
266/*
267 * Write, release buffer on completion. (Done by iodone
268 * if async.)
269 */
270int
271bwrite(struct buf * bp)
272{
273 int oldflags = bp->b_flags;
274
275 if (bp->b_flags & B_INVAL) {
276 brelse(bp);
277 return (0);
278 }
279 if (!(bp->b_flags & B_BUSY))
280 panic("bwrite: buffer is not busy???");
281
282 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
283 bp->b_flags |= B_WRITEINPROG;
284
285 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
286 reassignbuf(bp, bp->b_vp);
287 }
288
289 bp->b_vp->v_numoutput++;
290 vfs_busy_pages(bp, 1);
291 if (curproc != NULL)
292 curproc->p_stats->p_ru.ru_oublock++;
293 VOP_STRATEGY(bp);
294
295 if ((oldflags & B_ASYNC) == 0) {
296 int rtval = biowait(bp);
297
298 if (oldflags & B_DELWRI) {
299 reassignbuf(bp, bp->b_vp);
300 }
301 brelse(bp);
302 return (rtval);
303 }
304 return (0);
305}
306
307int
308vn_bwrite(ap)
309 struct vop_bwrite_args *ap;
310{
311 return (bwrite(ap->a_bp));
312}
313
314/*
315 * Delayed write. (Buffer is marked dirty).
316 */
317void
318bdwrite(struct buf * bp)
319{
320
321 if ((bp->b_flags & B_BUSY) == 0) {
322 panic("bdwrite: buffer is not busy");
323 }
324 if (bp->b_flags & B_INVAL) {
325 brelse(bp);
326 return;
327 }
328 if (bp->b_flags & B_TAPE) {
329 bawrite(bp);
330 return;
331 }
332 bp->b_flags &= ~(B_READ|B_RELBUF);
333 if ((bp->b_flags & B_DELWRI) == 0) {
334 bp->b_flags |= B_DONE | B_DELWRI;
335 reassignbuf(bp, bp->b_vp);
336 }
337
338 /*
339 * This bmap keeps the system from needing to do the bmap later,
340 * perhaps when the system is attempting to do a sync. Since it
341 * is likely that the indirect block -- or whatever other datastructure
342 * that the filesystem needs is still in memory now, it is a good
343 * thing to do this. Note also, that if the pageout daemon is
344 * requesting a sync -- there might not be enough memory to do
345 * the bmap then... So, this is important to do.
346 */
347 if( bp->b_lblkno == bp->b_blkno) {
348 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
349 }
350
351 /*
352 * Set the *dirty* buffer range based upon the VM system dirty pages.
353 */
354 vfs_setdirty(bp);
355
356 /*
357 * We need to do this here to satisfy the vnode_pager and the
358 * pageout daemon, so that it thinks that the pages have been
359 * "cleaned". Note that since the pages are in a delayed write
360 * buffer -- the VFS layer "will" see that the pages get written
361 * out on the next sync, or perhaps the cluster will be completed.
362 */
363 vfs_clean_pages(bp);
364 brelse(bp);
365 return;
366}
367
368/*
369 * Asynchronous write.
370 * Start output on a buffer, but do not wait for it to complete.
371 * The buffer is released when the output completes.
372 */
373void
374bawrite(struct buf * bp)
375{
376 bp->b_flags |= B_ASYNC;
377 (void) VOP_BWRITE(bp);
378}
379
380/*
381 * Release a buffer.
382 */
383void
384brelse(struct buf * bp)
385{
386 int s;
387
388 if (bp->b_flags & B_CLUSTER) {
389 relpbuf(bp);
390 return;
391 }
392 /* anyone need a "free" block? */
393 s = splbio();
394
395 if (needsbuffer) {
396 needsbuffer = 0;
397 wakeup(&needsbuffer);
398 }
399
400 /* anyone need this block? */
401 if (bp->b_flags & B_WANTED) {
402 bp->b_flags &= ~(B_WANTED | B_AGE);
403 wakeup(bp);
404 }
405
406 if (bp->b_flags & B_LOCKED)
407 bp->b_flags &= ~B_ERROR;
408
409 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
410 (bp->b_bufsize <= 0)) {
411 bp->b_flags |= B_INVAL;
412 bp->b_flags &= ~(B_DELWRI | B_CACHE);
413 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
414 brelvp(bp);
415 }
416
417 /*
418 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
419 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
420 * but the VM object is kept around. The B_NOCACHE flag is used to
421 * invalidate the pages in the VM object.
422 */
423 if (bp->b_flags & B_VMIO) {
424 vm_ooffset_t foff;
425 vm_object_t obj;
426 int i, resid;
427 vm_page_t m;
428 struct vnode *vp;
429 int iototal = bp->b_bufsize;
430
431 vp = bp->b_vp;
432 if (!vp)
433 panic("brelse: missing vp");
434
435 if (bp->b_npages) {
436 vm_pindex_t poff;
437 obj = (vm_object_t) vp->v_object;
438 if (vp->v_type == VBLK)
439 foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
440 else
441 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
442 poff = OFF_TO_IDX(foff);
443 for (i = 0; i < bp->b_npages; i++) {
444 m = bp->b_pages[i];
445 if (m == bogus_page) {
446 m = vm_page_lookup(obj, poff + i);
447 if (!m) {
448 panic("brelse: page missing\n");
449 }
450 bp->b_pages[i] = m;
451 pmap_qenter(trunc_page(bp->b_data),
452 bp->b_pages, bp->b_npages);
453 }
454 resid = IDX_TO_OFF(m->pindex+1) - foff;
455 if (resid > iototal)
456 resid = iototal;
457 if (resid > 0) {
458 /*
459 * Don't invalidate the page if the local machine has already
460 * modified it. This is the lesser of two evils, and should
461 * be fixed.
462 */
463 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
464 vm_page_test_dirty(m);
465 if (m->dirty == 0) {
466 vm_page_set_invalid(m, (vm_offset_t) foff, resid);
467 if (m->valid == 0)
468 vm_page_protect(m, VM_PROT_NONE);
469 }
470 }
471 }
472 foff += resid;
473 iototal -= resid;
474 }
475 }
476
477 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
478 for(i = 0; i < bp->b_npages; i++) {
479 m = bp->b_pages[i];
480 --m->bmapped;
481 if (m->bmapped == 0) {
482 if (m->flags & PG_WANTED) {
483 m->flags &= ~PG_WANTED;
484 wakeup(m);
485 }
486 if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) {
487 if (m->object->flags & OBJ_MIGHTBEDIRTY) {
488 vm_page_test_dirty(m);
489 }
490 /*
491 * if page isn't valid, no sense in keeping it around
492 */
493 if (m->valid == 0) {
494 vm_page_protect(m, VM_PROT_NONE);
495 vm_page_free(m);
496 /*
497 * if page isn't dirty and hasn't been referenced by
498 * a process, then cache it
499 */
500 } else if ((m->dirty & m->valid) == 0 &&
501 (m->flags & PG_REFERENCED) == 0 &&
502 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
503 vm_page_cache(m);
504 /*
505 * otherwise activate it
506 */
507 } else if ((m->flags & PG_ACTIVE) == 0) {
508 vm_page_activate(m);
509 m->act_count = 0;
510 }
511 }
512 }
513 }
514 bufspace -= bp->b_bufsize;
515 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
516 bp->b_npages = 0;
517 bp->b_bufsize = 0;
518 bp->b_flags &= ~B_VMIO;
519 if (bp->b_vp)
520 brelvp(bp);
521 }
522 }
523 if (bp->b_qindex != QUEUE_NONE)
524 panic("brelse: free buffer onto another queue???");
525
526 /* enqueue */
527 /* buffers with no memory */
528 if (bp->b_bufsize == 0) {
529 bp->b_qindex = QUEUE_EMPTY;
530 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
531 LIST_REMOVE(bp, b_hash);
532 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
533 bp->b_dev = NODEV;
534 /* buffers with junk contents */
535 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
536 bp->b_qindex = QUEUE_AGE;
537 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
538 LIST_REMOVE(bp, b_hash);
539 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
540 bp->b_dev = NODEV;
541 /* buffers that are locked */
542 } else if (bp->b_flags & B_LOCKED) {
543 bp->b_qindex = QUEUE_LOCKED;
544 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
545 /* buffers with stale but valid contents */
546 } else if (bp->b_flags & B_AGE) {
547 bp->b_qindex = QUEUE_AGE;
548 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
549	/* buffers with valid and potentially reusable contents */
550 } else {
551 bp->b_qindex = QUEUE_LRU;
552 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
553 }
554
555 /* unlock */
556 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
557 splx(s);
558}
559
560/*
561 * Check to see if a block is currently memory resident.
562 */
563__inline struct buf *
564gbincore(struct vnode * vp, daddr_t blkno)
565{
566 struct buf *bp;
567 struct bufhashhdr *bh;
568
569 bh = BUFHASH(vp, blkno);
570 bp = bh->lh_first;
571
572 /* Search hash chain */
573 while (bp != NULL) {
574 /* hit */
575 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
576 (bp->b_flags & B_INVAL) == 0) {
577 break;
578 }
579 bp = bp->b_hash.le_next;
580 }
581 return (bp);
582}
583
584/*
585 * this routine implements clustered async writes for
586 * clearing out B_DELWRI buffers... This is much better
587 * than the old way of writing only one buffer at a time.
588 */
589int
590vfs_bio_awrite(struct buf * bp)
591{
592 int i;
593 daddr_t lblkno = bp->b_lblkno;
594 struct vnode *vp = bp->b_vp;
595 int s;
596 int ncl;
597 struct buf *bpa;
598 int nwritten;
599
600 s = splbio();
601 /*
602 * right now we support clustered writing only to regular files
603 */
604 if ((vp->v_type == VREG) &&
605 (vp->v_mount != 0) && /* Only on nodes that have the size info */
606 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
607 int size;
608 int maxcl;
609
610 size = vp->v_mount->mnt_stat.f_iosize;
611 maxcl = MAXPHYS / size;
612
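		/*
		 * Look for logically sequential delayed-write buffers of the
		 * same block size that are also physically contiguous on disk;
		 * the scan stops at the first buffer that does not qualify.
		 */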
613 for (i = 1; i < maxcl; i++) {
614 if ((bpa = gbincore(vp, lblkno + i)) &&
615 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
616 (B_DELWRI | B_CLUSTEROK)) &&
617 (bpa->b_bufsize == size)) {
618 if ((bpa->b_blkno == bpa->b_lblkno) ||
619 (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
620 break;
621 } else {
622 break;
623 }
624 }
625 ncl = i;
626 /*
627 * this is a possible cluster write
628 */
629 if (ncl != 1) {
630 nwritten = cluster_wbuild(vp, size, lblkno, ncl);
631 splx(s);
632 return nwritten;
633 }
634 }
635 bremfree(bp);
636 splx(s);
637 /*
638 * default (old) behavior, writing out only one block
639 */
640 bp->b_flags |= B_BUSY | B_ASYNC;
641 nwritten = bp->b_bufsize;
642 (void) VOP_BWRITE(bp);
643 return nwritten;
644}
645
646
647/*
648 * Find a buffer header which is available for use.
649 */
650static struct buf *
651getnewbuf(int slpflag, int slptimeo, int doingvmio)
652{
653 struct buf *bp;
654 int s;
655 int nbyteswritten = 0;
656
657 s = splbio();
658start:
659 if (bufspace >= maxbufspace)
660 goto trytofreespace;
661
662 /* can we constitute a new buffer? */
663 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
664 if (bp->b_qindex != QUEUE_EMPTY)
665 panic("getnewbuf: inconsistent EMPTY queue");
666 bremfree(bp);
667 goto fillbuf;
668 }
669trytofreespace:
670 /*
671	 * We keep the file I/O from hogging metadata I/O.
672 * This is desirable because file data is cached in the
673 * VM/Buffer cache even if a buffer is freed.
674 */
675 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
676 if (bp->b_qindex != QUEUE_AGE)
677 panic("getnewbuf: inconsistent AGE queue");
678 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
679 if (bp->b_qindex != QUEUE_LRU)
680 panic("getnewbuf: inconsistent LRU queue");
681 }
682 if (!bp) {
683 /* wait for a free buffer of any kind */
684 needsbuffer = 1;
685 tsleep(&needsbuffer,
686 (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
687 splx(s);
688 return (0);
689 }
690
691 if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
692 --bp->b_usecount;
693 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
694 if (bufqueues[QUEUE_LRU].tqh_first != NULL) {
695 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
696 goto start;
697 }
698 }
699
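	/*
	 * A recently used LRU buffer gets another trip around the queue:
	 * decrement its use count and requeue it at the tail rather than
	 * reclaiming it, unless it is the only buffer on the queue.
	 */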
700 /* if we are a delayed write, convert to an async write */
701 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
702 nbyteswritten += vfs_bio_awrite(bp);
703 if (!slpflag && !slptimeo) {
704 splx(s);
705 return (0);
706 }
707 goto start;
708 }
709
710 if (bp->b_flags & B_WANTED) {
711 bp->b_flags &= ~B_WANTED;
712 wakeup(bp);
713 }
714 bremfree(bp);
715
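	/*
	 * A VMIO buffer must give its pages back to the VM system before
	 * the header can be reused; B_RELBUF makes brelse() do the page
	 * rundown, after which the buffer is pulled off the free queue again.
	 */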
716 if (bp->b_flags & B_VMIO) {
717 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
718 brelse(bp);
719 bremfree(bp);
720 }
721
722 if (bp->b_vp)
723 brelvp(bp);
724
725 /* we are not free, nor do we contain interesting data */
726 if (bp->b_rcred != NOCRED)
727 crfree(bp->b_rcred);
728 if (bp->b_wcred != NOCRED)
729 crfree(bp->b_wcred);
730fillbuf:
731 bp->b_flags |= B_BUSY;
732 LIST_REMOVE(bp, b_hash);
733 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
734 splx(s);
735 if (bp->b_bufsize) {
736 allocbuf(bp, 0);
737 }
738 bp->b_flags = B_BUSY;
739 bp->b_dev = NODEV;
740 bp->b_vp = NULL;
741 bp->b_blkno = bp->b_lblkno = 0;
742 bp->b_iodone = 0;
743 bp->b_error = 0;
744 bp->b_resid = 0;
745 bp->b_bcount = 0;
746 bp->b_npages = 0;
747 bp->b_wcred = bp->b_rcred = NOCRED;
748 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
749 bp->b_dirtyoff = bp->b_dirtyend = 0;
750 bp->b_validoff = bp->b_validend = 0;
751 bp->b_usecount = 2;
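	/*
	 * If buffer space is still over the limit, even allowing for the
	 * delayed writes just pushed out, give this buffer up and go back
	 * to freeing space.
	 */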
752 if (bufspace >= maxbufspace + nbyteswritten) {
753 s = splbio();
754 bp->b_flags |= B_INVAL;
755 brelse(bp);
756 goto trytofreespace;
757 }
758 return (bp);
759}
760
761/*
762 * Check to see if a block is currently memory resident.
763 */
764struct buf *
765incore(struct vnode * vp, daddr_t blkno)
766{
767 struct buf *bp;
768 struct bufhashhdr *bh;
769
770 int s = splbio();
771
772 bh = BUFHASH(vp, blkno);
773 bp = bh->lh_first;
774
775 /* Search hash chain */
776 while (bp != NULL) {
777 /* hit */
778 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
779 (bp->b_flags & B_INVAL) == 0) {
780 break;
781 }
782 bp = bp->b_hash.le_next;
783 }
784 splx(s);
785 return (bp);
786}
787
788/*
789 * Returns true if no I/O is needed to access the
790 * associated VM object. This is like incore except
791 * it also hunts around in the VM system for the data.
792 */
793
794int
795inmem(struct vnode * vp, daddr_t blkno)
796{
797 vm_object_t obj;
798 vm_offset_t toff, tinc;
799 vm_page_t m;
800 vm_ooffset_t off;
801
802 if (incore(vp, blkno))
803 return 1;
804 if (vp->v_mount == NULL)
805 return 0;
806 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
807 return 0;
808
809 obj = vp->v_object;
810 tinc = PAGE_SIZE;
811 if (tinc > vp->v_mount->mnt_stat.f_iosize)
812 tinc = vp->v_mount->mnt_stat.f_iosize;
813 off = blkno * vp->v_mount->mnt_stat.f_iosize;
814
815 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
816
817 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
818 if (!m)
819 return 0;
820 if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
821 return 0;
822 }
823 return 1;
824}
825
826/*
827 * Set the dirty range for the buffer --
828 * for NFS -- if the file is mapped and pages have
829 * been written to, let it know. We want the
830 * entire range of the buffer to be marked dirty if
831 * any of the pages have been written to, for consistency
832 * with the b_validoff, b_validend set in the NFS write
833 * code and used by the NFS read code.
834 */
835static void
836vfs_setdirty(struct buf *bp) {
837 int i;
838 vm_object_t object;
839 vm_offset_t boffset, offset;
840 /*
841 * We qualify the scan for modified pages on whether the
842 * object has been flushed yet. The OBJ_WRITEABLE flag
843 * is not cleared simply by protecting pages off.
844 */
845 if ((bp->b_flags & B_VMIO) &&
846 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
847 /*
848 * test the pages to see if they have been modified directly
849 * by users through the VM system.
850 */
851 for (i = 0; i < bp->b_npages; i++)
852 vm_page_test_dirty(bp->b_pages[i]);
853
854 /*
855 * scan forwards for the first page modified
856 */
857 for (i = 0; i < bp->b_npages; i++) {
858 if (bp->b_pages[i]->dirty) {
859 break;
860 }
861 }
862 boffset = (i << PAGE_SHIFT);
863 if (boffset < bp->b_dirtyoff) {
864 bp->b_dirtyoff = boffset;
865 }
866
867 /*
868 * scan backwards for the last page modified
869 */
870 for (i = bp->b_npages - 1; i >= 0; --i) {
871 if (bp->b_pages[i]->dirty) {
872 break;
873 }
874 }
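		/*
		 * Convert the index of the last dirty page into a page count,
		 * clamp it so the range does not run past the end of the
		 * object, and record the dirty end in bytes.
		 */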
875 boffset = (i + 1);
876 offset = boffset + bp->b_pages[0]->pindex;
877 if (offset >= object->size)
878 boffset = object->size - bp->b_pages[0]->pindex;
879 if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
880 bp->b_dirtyend = (boffset << PAGE_SHIFT);
881 }
882}
883
884/*
885 * Get a block given a specified block and offset into a file/device.
886 */
887struct buf *
888getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
889{
890 struct buf *bp;
891 int s;
892 struct bufhashhdr *bh;
893
894 s = splbio();
895loop:
896 if ((bp = gbincore(vp, blkno))) {
897 if (bp->b_flags & B_BUSY) {
898 bp->b_flags |= B_WANTED;
899 if (bp->b_usecount < BUF_MAXUSE)
900 ++bp->b_usecount;
901 if (!tsleep(bp,
902 (PRIBIO + 1) | slpflag, "getblk", slptimeo))
903 goto loop;
904
905 splx(s);
906 return (struct buf *) NULL;
907 }
908 bp->b_flags |= B_BUSY | B_CACHE;
909 bremfree(bp);
910
911 /*
912		 * check for size inconsistencies (note that they shouldn't happen
913		 * but do when filesystems don't handle the size changes correctly).
914 * We are conservative on metadata and don't just extend the buffer
915 * but write and re-constitute it.
916 */
917
918 if (bp->b_bcount != size) {
919 if (bp->b_flags & B_VMIO) {
920 allocbuf(bp, size);
921 } else {
922 bp->b_flags |= B_NOCACHE;
923 VOP_BWRITE(bp);
924 goto loop;
925 }
926 }
927
928 /*
929		 * Make sure that all pages in the buffer are valid; if they
930		 * aren't, clear the cache flag.
931		 * ASSUMPTION:
932		 * if the buffer is greater than one page in size, the buffer
933		 * address starts on a page boundary...
934 */
935 if (bp->b_flags & B_VMIO) {
936 int szleft, i;
937 szleft = size;
938 for (i=0;i<bp->b_npages;i++) {
939 if (szleft > PAGE_SIZE) {
940 if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) !=
941 VM_PAGE_BITS_ALL) {
942 bp->b_flags &= ~(B_CACHE|B_DONE);
943 break;
944 }
945 szleft -= PAGE_SIZE;
946 } else {
947 if (!vm_page_is_valid(bp->b_pages[i],
948 (((vm_offset_t) bp->b_data) & PAGE_MASK),
949 szleft)) {
950 bp->b_flags &= ~(B_CACHE|B_DONE);
951 break;
952 }
953 szleft = 0;
954 }
955 }
956 }
957 if (bp->b_usecount < BUF_MAXUSE)
958 ++bp->b_usecount;
959 splx(s);
960 return (bp);
961 } else {
962 vm_object_t obj;
963 int doingvmio;
964
965 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
966 doingvmio = 1;
967 } else {
968 doingvmio = 0;
969 }
970 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
971 if (slpflag || slptimeo) {
972 splx(s);
973 return NULL;
974 }
975 goto loop;
976 }
977
978 /*
979 * This code is used to make sure that a buffer is not
980 * created while the getnewbuf routine is blocked.
981 * Normally the vnode is locked so this isn't a problem.
982 * VBLK type I/O requests, however, don't lock the vnode.
983 */
984 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
985 bp->b_flags |= B_INVAL;
986 brelse(bp);
987 goto loop;
988 }
989
990 /*
991 * Insert the buffer into the hash, so that it can
992 * be found by incore.
993 */
994 bp->b_blkno = bp->b_lblkno = blkno;
995 bgetvp(vp, bp);
996 LIST_REMOVE(bp, b_hash);
997 bh = BUFHASH(vp, blkno);
998 LIST_INSERT_HEAD(bh, bp, b_hash);
999
1000 if (doingvmio) {
1001 bp->b_flags |= (B_VMIO | B_CACHE);
1002#if defined(VFS_BIO_DEBUG)
1003 if (vp->v_type != VREG)
1004 printf("getblk: vmioing file type %d???\n", vp->v_type);
1005#endif
1006 } else {
1007 bp->b_flags &= ~B_VMIO;
1008 }
1009 splx(s);
1010
1011 allocbuf(bp, size);
1012 return (bp);
1013 }
1014}
1015
1016/*
1017 * Get an empty, disassociated buffer of given size.
1018 */
1019struct buf *
1020geteblk(int size)
1021{
1022 struct buf *bp;
1023
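	/* Loop until a buffer header is available; geteblk() may not fail. */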
1024 while ((bp = getnewbuf(0, 0, 0)) == 0);
1025 allocbuf(bp, size);
1026 bp->b_flags |= B_INVAL;
1027 return (bp);
1028}
1029
1030/*
1031 * This code constitutes the buffer memory from either anonymous system
1032 * memory (in the case of non-VMIO operations) or from an associated
1033 * VM object (in the case of VMIO operations).
1034 *
1035 * Note that this code is tricky, and has many complications to resolve
1036 * deadlock or inconsistent data situations. Tread lightly!!!
1037 *
1038 * Modify the length of a buffer's underlying buffer storage without
1039 * destroying information (unless, of course the buffer is shrinking).
1040 */
1041int
1042allocbuf(struct buf * bp, int size)
1043{
1044
1045 int s;
1046 int newbsize, mbsize;
1047 int i;
1048
1049 if (!(bp->b_flags & B_BUSY))
1050 panic("allocbuf: buffer not busy");
1051
1052 if ((bp->b_flags & B_VMIO) == 0) {
1053 /*
1054 * Just get anonymous memory from the kernel
1055 */
1056 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1057 newbsize = round_page(size);
1058
1059 if (newbsize < bp->b_bufsize) {
1060 vm_hold_free_pages(
1061 bp,
1062 (vm_offset_t) bp->b_data + newbsize,
1063 (vm_offset_t) bp->b_data + bp->b_bufsize);
1064 } else if (newbsize > bp->b_bufsize) {
1065 vm_hold_load_pages(
1066 bp,
1067 (vm_offset_t) bp->b_data + bp->b_bufsize,
1068 (vm_offset_t) bp->b_data + newbsize);
1069 }
1070 } else {
1071 vm_page_t m;
1072 int desiredpages;
1073
1074 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1075 desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1076
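		/*
		 * Shrinking a VMIO buffer: unmap the tail pages, wait for any
		 * I/O still in progress on them, and free the pages whose
		 * bmapped count drops to zero.
		 */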
1077 if (newbsize < bp->b_bufsize) {
1078 if (desiredpages < bp->b_npages) {
1079 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1080 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1081 for (i = desiredpages; i < bp->b_npages; i++) {
1082 m = bp->b_pages[i];
1083 s = splhigh();
1084 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1085 m->flags |= PG_WANTED;
1086 tsleep(m, PVM, "biodep", 0);
1087 }
1088 splx(s);
1089
1090 if (m->bmapped == 0) {
1091 printf("allocbuf: bmapped is zero for page %d\n", i);
1092 panic("allocbuf: error");
1093 }
1094 --m->bmapped;
1095 if (m->bmapped == 0) {
1096 vm_page_protect(m, VM_PROT_NONE);
1097 vm_page_free(m);
1098 }
1099 bp->b_pages[i] = NULL;
1100 }
1101 bp->b_npages = desiredpages;
1102 }
1103 } else if (newbsize > bp->b_bufsize) {
1104 vm_object_t obj;
1105 vm_offset_t tinc, toff;
1106 vm_ooffset_t off;
1107 vm_pindex_t objoff;
1108 int pageindex, curbpnpages;
1109 struct vnode *vp;
1110 int bsize;
1111
1112 vp = bp->b_vp;
1113
1114 if (vp->v_type == VBLK)
1115 bsize = DEV_BSIZE;
1116 else
1117 bsize = vp->v_mount->mnt_stat.f_iosize;
1118
1119 if (bp->b_npages < desiredpages) {
1120 obj = vp->v_object;
1121 tinc = PAGE_SIZE;
1122 if (tinc > bsize)
1123 tinc = bsize;
1124 off = (vm_ooffset_t) bp->b_lblkno * bsize;
1125 doretry:
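			/*
			 * Gather the pages backing the new buffer size. If a needed
			 * page cannot be obtained without sleeping, the pages grabbed
			 * so far are woken up and the scan restarts from here.
			 */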
1126 curbpnpages = bp->b_npages;
1127 bp->b_flags |= B_CACHE;
1128 for (toff = 0; toff < newbsize; toff += tinc) {
1129 int bytesinpage;
1130
1131 pageindex = toff >> PAGE_SHIFT;
1132 objoff = OFF_TO_IDX(off + toff);
1133 if (pageindex < curbpnpages) {
1134
1135 m = bp->b_pages[pageindex];
1136 if (m->pindex != objoff)
1137 panic("allocbuf: page changed offset??!!!?");
1138 bytesinpage = tinc;
1139 if (tinc > (newbsize - toff))
1140 bytesinpage = newbsize - toff;
1141 if (!vm_page_is_valid(m,
1142 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1143 bytesinpage)) {
1144 bp->b_flags &= ~B_CACHE;
1145 }
1146 if ((m->flags & PG_ACTIVE) == 0) {
1147 vm_page_activate(m);
1148 m->act_count = 0;
1149 }
1150 continue;
1151 }
1152 m = vm_page_lookup(obj, objoff);
1153 if (!m) {
1154 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1155 if (!m) {
1156 int j;
1157
1158 for (j = bp->b_npages; j < pageindex; j++) {
1159 PAGE_WAKEUP(bp->b_pages[j]);
1160 }
1161 VM_WAIT;
1162 goto doretry;
1163 }
1164 vm_page_activate(m);
1165 m->act_count = 0;
1166 m->valid = 0;
1167 bp->b_flags &= ~B_CACHE;
1168 } else if (m->flags & PG_BUSY) {
1169 int j;
1170
1171 for (j = bp->b_npages; j < pageindex; j++) {
1172 PAGE_WAKEUP(bp->b_pages[j]);
1173 }
1174
1175 s = splbio();
1176 m->flags |= PG_WANTED;
1177 tsleep(m, PVM, "pgtblk", 0);
1178 splx(s);
1179
1180 goto doretry;
1181 } else {
1182 if ((curproc != pageproc) &&
1183 (m->flags & PG_CACHE) &&
1184 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1185 pagedaemon_wakeup();
1186 }
1187 bytesinpage = tinc;
1188 if (tinc > (newbsize - toff))
1189 bytesinpage = newbsize - toff;
1190 if (!vm_page_is_valid(m,
1191 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1192 bytesinpage)) {
1193 bp->b_flags &= ~B_CACHE;
1194 }
1195 if ((m->flags & PG_ACTIVE) == 0) {
1196 vm_page_activate(m);
1197 m->act_count = 0;
1198 }
1199 m->flags |= PG_BUSY;
1200 }
1201 bp->b_pages[pageindex] = m;
1202 curbpnpages = pageindex + 1;
1203 }
1204 for (i = bp->b_npages; i < curbpnpages; i++) {
1205 m = bp->b_pages[i];
1206 m->bmapped++;
1207 PAGE_WAKEUP(m);
1208 }
1209 bp->b_npages = curbpnpages;
1210 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1211 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1212 bp->b_data += off & (PAGE_SIZE - 1);
1213 }
1214 }
1215 }
1216 bufspace += (newbsize - bp->b_bufsize);
1217 bp->b_bufsize = newbsize;
1218 bp->b_bcount = size;
1219 return 1;
1220}
1221
1222/*
1223 * Wait for buffer I/O completion, returning error status.
1224 */
1225int
1226biowait(register struct buf * bp)
1227{
1228 int s;
1229
1230 s = splbio();
1231 while ((bp->b_flags & B_DONE) == 0)
1232 tsleep(bp, PRIBIO, "biowait", 0);
1233 splx(s);
1234 if (bp->b_flags & B_EINTR) {
1235 bp->b_flags &= ~B_EINTR;
1236 return (EINTR);
1237 }
1238 if (bp->b_flags & B_ERROR) {
1239 return (bp->b_error ? bp->b_error : EIO);
1240 } else {
1241 return (0);
1242 }
1243}
1244
1245/*
1246 * Finish I/O on a buffer, calling an optional function.
1247 * This is usually called from interrupt level, so process blocking
1248 * is not *a good idea*.
1249 */
1250void
1251biodone(register struct buf * bp)
1252{
1253 int s;
1254
1255 s = splbio();
1256 if (!(bp->b_flags & B_BUSY))
1257 panic("biodone: buffer not busy");
1258
1259 if (bp->b_flags & B_DONE) {
1260 splx(s);
1261 printf("biodone: buffer already done\n");
1262 return;
1263 }
1264 bp->b_flags |= B_DONE;
1265
1266 if ((bp->b_flags & B_READ) == 0) {
1267 vwakeup(bp);
1268 }
1269#ifdef BOUNCE_BUFFERS
1270 if (bp->b_flags & B_BOUNCE)
1271 vm_bounce_free(bp);
1272#endif
1273
1274 /* call optional completion function if requested */
1275 if (bp->b_flags & B_CALL) {
1276 bp->b_flags &= ~B_CALL;
1277 (*bp->b_iodone) (bp);
1278 splx(s);
1279 return;
1280 }
1281 if (bp->b_flags & B_VMIO) {
1282 int i, resid;
1283 vm_ooffset_t foff;
1284 vm_page_t m;
1285 vm_object_t obj;
1286 int iosize;
1287 struct vnode *vp = bp->b_vp;
1288
1289 if (vp->v_type == VBLK)
1290 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1291 else
1292 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1293 obj = vp->v_object;
1294 if (!obj) {
1295 panic("biodone: no object");
1296 }
1297#if defined(VFS_BIO_DEBUG)
1298 if (obj->paging_in_progress < bp->b_npages) {
1299 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1300 obj->paging_in_progress, bp->b_npages);
1301 }
1302#endif
1303 iosize = bp->b_bufsize;
1304 for (i = 0; i < bp->b_npages; i++) {
1305 int bogusflag = 0;
1306 m = bp->b_pages[i];
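			/*
			 * Pages replaced with bogus_page by vfs_busy_pages() took no
			 * part in the transfer; look the real page back up and restore
			 * the buffer's mappings before updating page state.
			 */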
1307 if (m == bogus_page) {
1308 bogusflag = 1;
1309 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1310 if (!m) {
1311#if defined(VFS_BIO_DEBUG)
1312 printf("biodone: page disappeared\n");
1313#endif
1314 --obj->paging_in_progress;
1315 continue;
1316 }
1317 bp->b_pages[i] = m;
1318 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1319 }
1320#if defined(VFS_BIO_DEBUG)
1321 if (OFF_TO_IDX(foff) != m->pindex) {
1322 printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1323 }
1324#endif
1325 resid = IDX_TO_OFF(m->pindex + 1) - foff;
1326 if (resid > iosize)
1327 resid = iosize;
1328 /*
1329 * In the write case, the valid and clean bits are
1330 * already changed correctly, so we only need to do this
1331 * here in the read case.
1332 */
1333 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1334 vm_page_set_validclean(m,
1335 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1336 }
1337
1338 /*
1339			 * When debugging new filesystems or buffer I/O methods, this
1340			 * is the most common error that pops up. If you see this, you
1341 * have not set the page busy flag correctly!!!
1342 */
1343 if (m->busy == 0) {
1344 printf("biodone: page busy < 0, "
1345 "pindex: %d, foff: 0x(%x,%x), "
1346 "resid: %d, index: %d\n",
1347 (int) m->pindex, (int)(foff >> 32),
1348 (int) foff & 0xffffffff, resid, i);
1349 if (vp->v_type != VBLK)
1350 printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n",
1351 bp->b_vp->v_mount->mnt_stat.f_iosize,
1352 (int) bp->b_lblkno,
1353 bp->b_flags, bp->b_npages);
1354 else
1355 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1356 (int) bp->b_lblkno,
1357 bp->b_flags, bp->b_npages);
1358 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1359 m->valid, m->dirty, m->bmapped);
1360 panic("biodone: page busy < 0\n");
1361 }
1362 --m->busy;
1363 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1364 m->flags &= ~PG_WANTED;
1365 wakeup(m);
1366 }
1367 --obj->paging_in_progress;
1368 foff += resid;
1369 iosize -= resid;
1370 }
1371 if (obj && obj->paging_in_progress == 0 &&
1372 (obj->flags & OBJ_PIPWNT)) {
1373 obj->flags &= ~OBJ_PIPWNT;
1374 wakeup(obj);
1375 }
1376 }
1377 /*
1378	 * For asynchronous completions, release the buffer now. brelse()
1379	 * checks for B_WANTED and will do the wakeup there if necessary, so
1380	 * there is no need to do a wakeup here in the async case.
1381 */
1382
1383 if (bp->b_flags & B_ASYNC) {
1384 brelse(bp);
1385 } else {
1386 wakeup(bp);
1387 }
1388 splx(s);
1389}
1390
1391int
1392count_lock_queue()
1393{
1394 int count;
1395 struct buf *bp;
1396
1397 count = 0;
1398 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1399 bp != NULL;
1400 bp = bp->b_freelist.tqe_next)
1401 count++;
1402 return (count);
1403}
1404
1405int vfs_update_interval = 30;
1406
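/*
 * The update daemon: periodically sync the filesystems, waking up early
 * when something kicks vfs_update_wakeup.
 */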
1407static void
1408vfs_update()
1409{
1410 (void) spl0(); /* XXX redundant? wrong place? */
1411 while (1) {
1412 tsleep(&vfs_update_wakeup, PUSER, "update",
1413 hz * vfs_update_interval);
1414 vfs_update_wakeup = 0;
1415 sync(curproc, NULL, NULL);
1416 }
1417}
1418
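/*
 * Sysctl handler for the update interval; a successful set also wakes
 * the update daemon so that the new value takes effect immediately.
 */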
1419static int
1420sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1421{
1422 int error = sysctl_handle_int(oidp,
1423 oidp->oid_arg1, oidp->oid_arg2, req);
1424 if (!error)
1425 wakeup(&vfs_update_wakeup);
1426 return error;
1427}
1428
1429SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1430 &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
1431
1432
1433/*
1434 * This routine is called in lieu of iodone in the case of
1435 * incomplete I/O. This keeps the busy status for pages
1436 * consistent.
1437 */
1438void
1439vfs_unbusy_pages(struct buf * bp)
1440{
1441 int i;
1442
1443 if (bp->b_flags & B_VMIO) {
1444 struct vnode *vp = bp->b_vp;
1445 vm_object_t obj = vp->v_object;
1446 vm_ooffset_t foff;
1447
1448 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1449
1450 for (i = 0; i < bp->b_npages; i++) {
1451 vm_page_t m = bp->b_pages[i];
1452
1453 if (m == bogus_page) {
1454 m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1455 if (!m) {
1456 panic("vfs_unbusy_pages: page missing\n");
1457 }
1458 bp->b_pages[i] = m;
1459 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1460 }
1461 --obj->paging_in_progress;
1462 --m->busy;
1463 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1464 m->flags &= ~PG_WANTED;
1465 wakeup(m);
1466 }
1467 }
1468 if (obj->paging_in_progress == 0 &&
1469 (obj->flags & OBJ_PIPWNT)) {
1470 obj->flags &= ~OBJ_PIPWNT;
1471 wakeup(obj);
1472 }
1473 }
1474}
1475
1476/*
1477 * This routine is called before a device strategy routine.
1478 * It is used to tell the VM system that paging I/O is in
1479 * progress, and treat the pages associated with the buffer
1480 * almost as being PG_BUSY. Also the object's paging_in_progress
1481 * count is updated to make sure that the object doesn't become
1482 * inconsistent.
1483 */
1484void
1485vfs_busy_pages(struct buf * bp, int clear_modify)
1486{
1487 int i;
1488
1489 if (bp->b_flags & B_VMIO) {
1490 vm_object_t obj = bp->b_vp->v_object;
1491 vm_ooffset_t foff;
1492 int iocount = bp->b_bufsize;
1493
1494 if (bp->b_vp->v_type == VBLK)
1495 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1496 else
1497 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1498 vfs_setdirty(bp);
1499 for (i = 0; i < bp->b_npages; i++) {
1500 vm_page_t m = bp->b_pages[i];
1501 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1502
1503 if (resid > iocount)
1504 resid = iocount;
1505 if ((bp->b_flags & B_CLUSTER) == 0) {
1506 obj->paging_in_progress++;
1507 m->busy++;
1508 }
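			/*
			 * When clearing modify bits (the write case), write-protect
			 * the page and mark the I/O range valid and clean. Otherwise
			 * (typically a read), substitute bogus_page for pages that
			 * already hold valid data so the transfer cannot overwrite them.
			 */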
1509 if (clear_modify) {
1510 vm_page_protect(m, VM_PROT_READ);
1511 vm_page_set_validclean(m,
1512 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1513 } else if (bp->b_bcount >= PAGE_SIZE) {
1514 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1515 bp->b_pages[i] = bogus_page;
1516 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1517 }
1518 }
1519 foff += resid;
1520 iocount -= resid;
1521 }
1522 }
1523}
1524
1525/*
1526 * Tell the VM system that the pages associated with this buffer
1527 * are clean. This is used for delayed writes where the data is
1528 * going to go to disk eventually without additional VM intervention.
1529 */
1530void
1531vfs_clean_pages(struct buf * bp)
1532{
1533 int i;
1534
1535 if (bp->b_flags & B_VMIO) {
1536 vm_ooffset_t foff;
1537 int iocount = bp->b_bufsize;
1538
1539 if (bp->b_vp->v_type == VBLK)
1540 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1541 else
1542 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1543
1544 for (i = 0; i < bp->b_npages; i++) {
1545 vm_page_t m = bp->b_pages[i];
1546 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1547
1548 if (resid > iocount)
1549 resid = iocount;
1550 if (resid > 0) {
1551 vm_page_set_validclean(m,
1552 ((vm_offset_t) foff & (PAGE_SIZE-1)), resid);
1553 }
1554 foff += resid;
1555 iocount -= resid;
1556 }
1557 }
1558}
1559
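/*
 * Zero the invalid portions of a VMIO buffer and update the page valid
 * bits accordingly; non-VMIO buffers are simply cleared with clrbuf().
 */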
1560void
1561vfs_bio_clrbuf(struct buf *bp) {
1562 int i;
1563 if( bp->b_flags & B_VMIO) {
1564 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1565 int mask;
1566 mask = 0;
1567 for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1568 mask |= (1 << (i/DEV_BSIZE));
1569 if( bp->b_pages[0]->valid != mask) {
1570 bzero(bp->b_data, bp->b_bufsize);
1571 }
1572 bp->b_pages[0]->valid = mask;
1573 bp->b_resid = 0;
1574 return;
1575 }
1576 for(i=0;i<bp->b_npages;i++) {
1577 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1578 continue;
1579 if( bp->b_pages[i]->valid == 0) {
1580 if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
1581 bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1582 } else {
1583 int j;
1584 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1585 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1586 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1587 }
1588 }
1589 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1590 }
1591 bp->b_resid = 0;
1592 } else {
1593 clrbuf(bp);
1594 }
1595}
1596
1597/*
1598 * vm_hold_load_pages and vm_hold_free_pages get pages into and out
1599 * of a buffer's address space. The pages are anonymous and are
1600 * not associated with a file object.
1601 */
1602void
1603vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1604{
1605 vm_offset_t pg;
1606 vm_page_t p;
1607 vm_offset_t from = round_page(froma);
1608 vm_offset_t to = round_page(toa);
1609
1610 for (pg = from; pg < to; pg += PAGE_SIZE) {
1611
1612tryagain:
1613
1614 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1615 VM_ALLOC_NORMAL);
1616 if (!p) {
1617 VM_WAIT;
1618 goto tryagain;
1619 }
1620 vm_page_wire(p);
1621 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1622 bp->b_pages[((caddr_t) pg - bp->b_data) >> PAGE_SHIFT] = p;
1623 PAGE_WAKEUP(p);
1624 bp->b_npages++;
1625 }
1626}
1627
1628void
1629vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1630{
1631 vm_offset_t pg;
1632 vm_page_t p;
1633 vm_offset_t from = round_page(froma);
1634 vm_offset_t to = round_page(toa);
1635
1636 for (pg = from; pg < to; pg += PAGE_SIZE) {
1637 int index = ((caddr_t) pg - bp->b_data) >> PAGE_SHIFT;
1638 p = bp->b_pages[index];
1639 bp->b_pages[index] = 0;
1640 pmap_kremove(pg);
1641 vm_page_free(p);
1642 --bp->b_npages;
1643 }
1644}