vfs_bio.c (10541) → vfs_bio.c (10551)
Diff between revisions 1.60 and 1.61 (see the $Id lines below). Where a line changed, the deleted line is shown immediately above its added replacement. The only substantive change is the VOP_BMAP() call in bdwrite(), which gains an extra trailing NULL argument.
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 * John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD. Other use
17 * is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $Id: vfs_bio.c,v 1.60 1995/08/28 09:18:53 julian Exp $
21 * $Id: vfs_bio.c,v 1.61 1995/09/03 19:56:14 dyson Exp $
22 */
23
24/*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme. Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author: John S. Dyson
31 * Significant help during the development and debugging phases
32 * had been provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/proc.h>
40#include <sys/vnode.h>
41#include <vm/vm.h>
42#include <vm/vm_kern.h>
43#include <vm/vm_pageout.h>
44#include <vm/vm_page.h>
45#include <vm/vm_object.h>
46#include <sys/buf.h>
47#include <sys/mount.h>
48#include <sys/malloc.h>
49#include <sys/resourcevar.h>
50#include <sys/proc.h>
51
52#include <miscfs/specfs/specdev.h>
53
54/*
55 * System initialization
56 */
57
58static void vfs_update __P((void));
59struct proc *updateproc;
60
61static struct kproc_desc up_kp = {
62 "update",
63 vfs_update,
64 &updateproc
65};
66SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, (caddr_t)&up_kp)
67
68
69struct buf *buf; /* buffer header pool */
70struct swqueue bswlist;
71
72void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
73void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
74void vfs_clean_pages(struct buf * bp);
75static void vfs_setdirty(struct buf *bp);
76static __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno);
77
78int needsbuffer;
79
80/*
81 * Internal update daemon, process 3
82 * The variable vfs_update_wakeup allows for internal syncs.
83 */
84int vfs_update_wakeup;
85
86
87/*
88 * buffers base kva
89 */
90caddr_t buffers_kva;
91
92/*
93 * bogus page -- for I/O to/from partially complete buffers
94 * this is a temporary solution to the problem, but it is not
95 * really that bad. it would be better to split the buffer
96 * for input in the case of buffers partially already in memory,
97 * but the code is intricate enough already.
98 */
99vm_page_t bogus_page;
100vm_offset_t bogus_offset;
101
102int bufspace, maxbufspace;
103
104/*
105 * advisory minimum for size of LRU queue or VMIO queue
106 */
107int minbuf;
108
109struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
110struct bqueues bufqueues[BUFFER_QUEUES];
111
112/*
113 * Initialize buffer headers and related structures.
114 */
115void
116bufinit()
117{
118 struct buf *bp;
119 int i;
120
121 TAILQ_INIT(&bswlist);
122 LIST_INIT(&invalhash);
123
124 /* first, make a null hash table */
125 for (i = 0; i < BUFHSZ; i++)
126 LIST_INIT(&bufhashtbl[i]);
127
128 /* next, make a null set of free lists */
129 for (i = 0; i < BUFFER_QUEUES; i++)
130 TAILQ_INIT(&bufqueues[i]);
131
132 buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
133 /* finally, initialize each buffer header and stick on empty q */
134 for (i = 0; i < nbuf; i++) {
135 bp = &buf[i];
136 bzero(bp, sizeof *bp);
137 bp->b_flags = B_INVAL; /* we're just an empty header */
138 bp->b_dev = NODEV;
139 bp->b_rcred = NOCRED;
140 bp->b_wcred = NOCRED;
141 bp->b_qindex = QUEUE_EMPTY;
142 bp->b_vnbufs.le_next = NOLIST;
143 bp->b_data = buffers_kva + i * MAXBSIZE;
144 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
145 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
146 }
147/*
148 * maxbufspace is currently calculated to support all filesystem blocks
149 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
150 * cache is still the same as it would be for 8K filesystems. This
151 * keeps the size of the buffer cache "in check" for big block filesystems.
152 */
153 minbuf = nbuf / 3;
154 maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
155
156 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
157 bogus_page = vm_page_alloc(kernel_object,
158 bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);
159
160}
161
162/*
163 * remove the buffer from the appropriate free list
164 */
165void
166bremfree(struct buf * bp)
167{
168 int s = splbio();
169
170 if (bp->b_qindex != QUEUE_NONE) {
171 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
172 bp->b_qindex = QUEUE_NONE;
173 } else {
174 panic("bremfree: removing a buffer when not on a queue");
175 }
176 splx(s);
177}
178
179/*
180 * Get a buffer with the specified data. Look in the cache first.
181 */
182int
183bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
184 struct buf ** bpp)
185{
186 struct buf *bp;
187
188 bp = getblk(vp, blkno, size, 0, 0);
189 *bpp = bp;
190
191 /* if not found in cache, do some I/O */
192 if ((bp->b_flags & B_CACHE) == 0) {
193 if (curproc != NULL)
194 curproc->p_stats->p_ru.ru_inblock++;
195 bp->b_flags |= B_READ;
196 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
197 if (bp->b_rcred == NOCRED) {
198 if (cred != NOCRED)
199 crhold(cred);
200 bp->b_rcred = cred;
201 }
202 vfs_busy_pages(bp, 0);
203 VOP_STRATEGY(bp);
204 return (biowait(bp));
205 }
206 return (0);
207}
208
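/*
 * Illustrative sketch (not from the original file): a typical filesystem
 * caller of bread().  The buffer must be brelse()d by the caller even
 * when the read fails; "lbn" and "size" here are placeholders for the
 * caller's logical block number and block size.
 *
 *	if ((error = bread(vp, lbn, size, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(examine the data at bp->b_data)
 *	brelse(bp);
 */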
209/*
210 * Operates like bread, but also starts asynchronous I/O on
211 * read-ahead blocks.
212 */
213int
214breadn(struct vnode * vp, daddr_t blkno, int size,
215 daddr_t * rablkno, int *rabsize,
216 int cnt, struct ucred * cred, struct buf ** bpp)
217{
218 struct buf *bp, *rabp;
219 int i;
220 int rv = 0, readwait = 0;
221
222 *bpp = bp = getblk(vp, blkno, size, 0, 0);
223
224 /* if not found in cache, do some I/O */
225 if ((bp->b_flags & B_CACHE) == 0) {
226 if (curproc != NULL)
227 curproc->p_stats->p_ru.ru_inblock++;
228 bp->b_flags |= B_READ;
229 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
230 if (bp->b_rcred == NOCRED) {
231 if (cred != NOCRED)
232 crhold(cred);
233 bp->b_rcred = cred;
234 }
235 vfs_busy_pages(bp, 0);
236 VOP_STRATEGY(bp);
237 ++readwait;
238 }
239 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
240 if (inmem(vp, *rablkno))
241 continue;
242 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
243
244 if ((rabp->b_flags & B_CACHE) == 0) {
245 if (curproc != NULL)
246 curproc->p_stats->p_ru.ru_inblock++;
247 rabp->b_flags |= B_READ | B_ASYNC;
248 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
249 if (rabp->b_rcred == NOCRED) {
250 if (cred != NOCRED)
251 crhold(cred);
252 rabp->b_rcred = cred;
253 }
254 vfs_busy_pages(rabp, 0);
255 VOP_STRATEGY(rabp);
256 } else {
257 brelse(rabp);
258 }
259 }
260
261 if (readwait) {
262 rv = biowait(bp);
263 }
264 return (rv);
265}
266
267/*
268 * Write, release buffer on completion. (Done by iodone
269 * if async.)
270 */
271int
272bwrite(struct buf * bp)
273{
274 int oldflags = bp->b_flags;
275
276 if (bp->b_flags & B_INVAL) {
277 brelse(bp);
278 return (0);
279 }
280 if (!(bp->b_flags & B_BUSY))
281 panic("bwrite: buffer is not busy???");
282
283 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
284 bp->b_flags |= B_WRITEINPROG;
285
286 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
287 reassignbuf(bp, bp->b_vp);
288 }
289
290 bp->b_vp->v_numoutput++;
291 vfs_busy_pages(bp, 1);
292 if (curproc != NULL)
293 curproc->p_stats->p_ru.ru_oublock++;
294 VOP_STRATEGY(bp);
295
296 if ((oldflags & B_ASYNC) == 0) {
297 int rtval = biowait(bp);
298
299 if (oldflags & B_DELWRI) {
300 reassignbuf(bp, bp->b_vp);
301 }
302 brelse(bp);
303 return (rtval);
304 }
305 return (0);
306}
307
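/*
 * VOP_BWRITE entry point: hand the buffer straight to bwrite() above.
 */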
308int
309vn_bwrite(ap)
310 struct vop_bwrite_args *ap;
311{
312 return (bwrite(ap->a_bp));
313}
314
315/*
316 * Delayed write. (Buffer is marked dirty).
317 */
318void
319bdwrite(struct buf * bp)
320{
321
322 if ((bp->b_flags & B_BUSY) == 0) {
323 panic("bdwrite: buffer is not busy");
324 }
325 if (bp->b_flags & B_INVAL) {
326 brelse(bp);
327 return;
328 }
329 if (bp->b_flags & B_TAPE) {
330 bawrite(bp);
331 return;
332 }
333 bp->b_flags &= ~(B_READ|B_RELBUF);
334 if ((bp->b_flags & B_DELWRI) == 0) {
335 bp->b_flags |= B_DONE | B_DELWRI;
336 reassignbuf(bp, bp->b_vp);
337 }
338
339 /*
340 * This bmap keeps the system from needing to do the bmap later,
341 * perhaps when the system is attempting to do a sync. Since it
 342 * is likely that the indirect block -- or whatever other data structure
343 * that the filesystem needs is still in memory now, it is a good
344 * thing to do this. Note also, that if the pageout daemon is
345 * requesting a sync -- there might not be enough memory to do
346 * the bmap then... So, this is important to do.
347 */
348 if( bp->b_lblkno == bp->b_blkno) {
349 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
349 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
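	/*
	 * (The added line above passes one more trailing NULL than the
	 * deleted one, matching a VOP_BMAP() prototype that now takes an
	 * additional output pointer this caller does not need.)
	 */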
350 }
351
352 /*
353 * Set the *dirty* buffer range based upon the VM system dirty pages.
354 */
355 vfs_setdirty(bp);
356
357 /*
358 * We need to do this here to satisfy the vnode_pager and the
359 * pageout daemon, so that it thinks that the pages have been
360 * "cleaned". Note that since the pages are in a delayed write
361 * buffer -- the VFS layer "will" see that the pages get written
362 * out on the next sync, or perhaps the cluster will be completed.
363 */
364 vfs_clean_pages(bp);
365 brelse(bp);
366 return;
367}
368
369/*
370 * Asynchronous write.
371 * Start output on a buffer, but do not wait for it to complete.
372 * The buffer is released when the output completes.
373 */
374void
375bawrite(struct buf * bp)
376{
377 bp->b_flags |= B_ASYNC;
378 (void) VOP_BWRITE(bp);
379}
380
381/*
382 * Release a buffer.
383 */
384void
385brelse(struct buf * bp)
386{
387 int s;
388
389 if (bp->b_flags & B_CLUSTER) {
390 relpbuf(bp);
391 return;
392 }
393 /* anyone need a "free" block? */
394 s = splbio();
395
396 if (needsbuffer) {
397 needsbuffer = 0;
398 wakeup(&needsbuffer);
399 }
400
401 /* anyone need this block? */
402 if (bp->b_flags & B_WANTED) {
403 bp->b_flags &= ~(B_WANTED | B_AGE);
404 wakeup(bp);
405 } else if (bp->b_flags & B_VMIO) {
406 bp->b_flags &= ~B_WANTED;
407 wakeup(bp);
408 }
409 if (bp->b_flags & B_LOCKED)
410 bp->b_flags &= ~B_ERROR;
411
412 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
413 (bp->b_bufsize <= 0)) {
414 bp->b_flags |= B_INVAL;
415 bp->b_flags &= ~(B_DELWRI | B_CACHE);
416 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
417 brelvp(bp);
418 }
419
420 /*
421 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
422 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
423 * but the VM object is kept around. The B_NOCACHE flag is used to
424 * invalidate the pages in the VM object.
425 */
426 if (bp->b_flags & B_VMIO) {
427 vm_offset_t foff;
428 vm_object_t obj;
429 int i, resid;
430 vm_page_t m;
431 int iototal = bp->b_bufsize;
432
433 foff = 0;
434 obj = 0;
435 if (bp->b_npages) {
436 if (bp->b_vp && bp->b_vp->v_mount) {
437 foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
438 } else {
439 /*
440 * vnode pointer has been ripped away --
441 * probably file gone...
442 */
443 foff = bp->b_pages[0]->offset;
444 }
445 }
446 for (i = 0; i < bp->b_npages; i++) {
447 m = bp->b_pages[i];
448 if (m == bogus_page) {
449 m = vm_page_lookup(obj, foff);
450 if (!m) {
451 panic("brelse: page missing\n");
452 }
453 bp->b_pages[i] = m;
454 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
455 }
456 resid = (m->offset + PAGE_SIZE) - foff;
457 if (resid > iototal)
458 resid = iototal;
459 if (resid > 0) {
460 /*
461 * Don't invalidate the page if the local machine has already
462 * modified it. This is the lesser of two evils, and should
463 * be fixed.
464 */
465 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
466 vm_page_test_dirty(m);
467 if (m->dirty == 0) {
468 vm_page_set_invalid(m, foff, resid);
469 if (m->valid == 0)
470 vm_page_protect(m, VM_PROT_NONE);
471 }
472 }
473 }
474 foff += resid;
475 iototal -= resid;
476 }
477
478 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
479 for(i=0;i<bp->b_npages;i++) {
480 m = bp->b_pages[i];
481 --m->bmapped;
482 if (m->bmapped == 0) {
483 if (m->flags & PG_WANTED) {
484 wakeup(m);
485 m->flags &= ~PG_WANTED;
486 }
487 vm_page_test_dirty(m);
488 if ((m->dirty & m->valid) == 0 &&
489 (m->flags & PG_REFERENCED) == 0 &&
490 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
491 vm_page_cache(m);
492 } else if ((m->flags & PG_ACTIVE) == 0) {
493 vm_page_activate(m);
494 m->act_count = 0;
495 }
496 }
497 }
498 bufspace -= bp->b_bufsize;
499 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
500 bp->b_npages = 0;
501 bp->b_bufsize = 0;
502 bp->b_flags &= ~B_VMIO;
503 if (bp->b_vp)
504 brelvp(bp);
505 }
506 }
507 if (bp->b_qindex != QUEUE_NONE)
508 panic("brelse: free buffer onto another queue???");
509
510 /* enqueue */
511 /* buffers with no memory */
512 if (bp->b_bufsize == 0) {
513 bp->b_qindex = QUEUE_EMPTY;
514 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
515 LIST_REMOVE(bp, b_hash);
516 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
517 bp->b_dev = NODEV;
518 /* buffers with junk contents */
519 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
520 bp->b_qindex = QUEUE_AGE;
521 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
522 LIST_REMOVE(bp, b_hash);
523 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
524 bp->b_dev = NODEV;
525 /* buffers that are locked */
526 } else if (bp->b_flags & B_LOCKED) {
527 bp->b_qindex = QUEUE_LOCKED;
528 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
529 /* buffers with stale but valid contents */
530 } else if (bp->b_flags & B_AGE) {
531 bp->b_qindex = QUEUE_AGE;
532 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
 533 /* buffers with valid and quite potentially reusable contents */
534 } else {
535 bp->b_qindex = QUEUE_LRU;
536 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
537 }
538
539 /* unlock */
540 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
541 splx(s);
542}
543
544/*
545 * Check to see if a block is currently memory resident.
546 */
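/*
 * (Unlike incore(), buffers marked B_INVAL are not filtered out here,
 * and the caller is expected to already be at splbio.)
 */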
547static __inline struct buf *
548gbincore(struct vnode * vp, daddr_t blkno)
549{
550 struct buf *bp;
551 struct bufhashhdr *bh;
552
553 bh = BUFHASH(vp, blkno);
554 bp = bh->lh_first;
555
556 /* Search hash chain */
557 while (bp != NULL) {
558 /* hit */
559 if (bp->b_vp == vp && bp->b_lblkno == blkno) {
560 break;
561 }
562 bp = bp->b_hash.le_next;
563 }
564 return (bp);
565}
566
567/*
568 * this routine implements clustered async writes for
569 * clearing out B_DELWRI buffers... This is much better
570 * than the old way of writing only one buffer at a time.
571 */
572void
573vfs_bio_awrite(struct buf * bp)
574{
575 int i;
576 daddr_t lblkno = bp->b_lblkno;
577 struct vnode *vp = bp->b_vp;
578 int s;
579 int ncl;
580 struct buf *bpa;
581
582 s = splbio();
583 if (vp->v_mount && (vp->v_flag & VVMIO) &&
584 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
585 int size = vp->v_mount->mnt_stat.f_iosize;
586 int maxcl = MAXPHYS / size;
587
588 for (i = 1; i < maxcl; i++) {
589 if ((bpa = gbincore(vp, lblkno + i)) &&
590 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
591 (B_DELWRI | B_CLUSTEROK)) &&
592 (bpa->b_bufsize == size)) {
593 if ((bpa->b_blkno == bpa->b_lblkno) ||
594 (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
595 break;
596 } else {
597 break;
598 }
599 }
600 ncl = i;
601 /*
602 * this is a possible cluster write
603 */
604 if (ncl != 1) {
605 bremfree(bp);
606 cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
607 splx(s);
608 return;
609 }
610 }
611 /*
612 * default (old) behavior, writing out only one block
613 */
614 bremfree(bp);
615 bp->b_flags |= B_BUSY | B_ASYNC;
616 (void) VOP_BWRITE(bp);
617 splx(s);
618}
619
620
621/*
622 * Find a buffer header which is available for use.
623 */
624static struct buf *
625getnewbuf(int slpflag, int slptimeo, int doingvmio)
626{
627 struct buf *bp;
628 int s;
629 int firstbp = 1;
630
631 s = splbio();
632start:
633 if (bufspace >= maxbufspace)
634 goto trytofreespace;
635
636 /* can we constitute a new buffer? */
637 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
638 if (bp->b_qindex != QUEUE_EMPTY)
639 panic("getnewbuf: inconsistent EMPTY queue");
640 bremfree(bp);
641 goto fillbuf;
642 }
643trytofreespace:
644 /*
645 * We keep the file I/O from hogging metadata I/O
646 * This is desirable because file data is cached in the
647 * VM/Buffer cache even if a buffer is freed.
648 */
649 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
650 if (bp->b_qindex != QUEUE_AGE)
651 panic("getnewbuf: inconsistent AGE queue");
652 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
653 if (bp->b_qindex != QUEUE_LRU)
654 panic("getnewbuf: inconsistent LRU queue");
655 }
656 if (!bp) {
657 /* wait for a free buffer of any kind */
658 needsbuffer = 1;
659 tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
660 splx(s);
661 return (0);
662 }
663
664 /* if we are a delayed write, convert to an async write */
665 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
666 vfs_bio_awrite(bp);
667 if (!slpflag && !slptimeo) {
668 splx(s);
669 return (0);
670 }
671 goto start;
672 }
673
674 if (bp->b_flags & B_WANTED) {
675 bp->b_flags &= ~B_WANTED;
676 wakeup(bp);
677 }
678 bremfree(bp);
679
680 if (bp->b_flags & B_VMIO) {
681 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
682 brelse(bp);
683 bremfree(bp);
684 }
685
686 if (bp->b_vp)
687 brelvp(bp);
688
689 /* we are not free, nor do we contain interesting data */
690 if (bp->b_rcred != NOCRED)
691 crfree(bp->b_rcred);
692 if (bp->b_wcred != NOCRED)
693 crfree(bp->b_wcred);
694fillbuf:
695 bp->b_flags |= B_BUSY;
696 LIST_REMOVE(bp, b_hash);
697 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
698 splx(s);
699 if (bp->b_bufsize) {
700 allocbuf(bp, 0);
701 }
702 bp->b_flags = B_BUSY;
703 bp->b_dev = NODEV;
704 bp->b_vp = NULL;
705 bp->b_blkno = bp->b_lblkno = 0;
706 bp->b_iodone = 0;
707 bp->b_error = 0;
708 bp->b_resid = 0;
709 bp->b_bcount = 0;
710 bp->b_npages = 0;
711 bp->b_wcred = bp->b_rcred = NOCRED;
712 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
713 bp->b_dirtyoff = bp->b_dirtyend = 0;
714 bp->b_validoff = bp->b_validend = 0;
715 if (bufspace >= maxbufspace) {
716 s = splbio();
717 bp->b_flags |= B_INVAL;
718 brelse(bp);
719 goto trytofreespace;
720 }
721 return (bp);
722}
723
724/*
725 * Check to see if a block is currently memory resident.
726 */
727struct buf *
728incore(struct vnode * vp, daddr_t blkno)
729{
730 struct buf *bp;
731 struct bufhashhdr *bh;
732
733 int s = splbio();
734
735 bh = BUFHASH(vp, blkno);
736 bp = bh->lh_first;
737
738 /* Search hash chain */
739 while (bp != NULL) {
740 /* hit */
741 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
742 (bp->b_flags & B_INVAL) == 0) {
743 break;
744 }
745 bp = bp->b_hash.le_next;
746 }
747 splx(s);
748 return (bp);
749}
750
751/*
752 * Returns true if no I/O is needed to access the
753 * associated VM object. This is like incore except
754 * it also hunts around in the VM system for the data.
755 */
756
757int
758inmem(struct vnode * vp, daddr_t blkno)
759{
760 vm_object_t obj;
761 vm_offset_t off, toff, tinc;
762 vm_page_t m;
763
764 if (incore(vp, blkno))
765 return 1;
766 if (vp->v_mount == NULL)
767 return 0;
768 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
769 return 0;
770
771 obj = vp->v_object;
772 tinc = PAGE_SIZE;
773 if (tinc > vp->v_mount->mnt_stat.f_iosize)
774 tinc = vp->v_mount->mnt_stat.f_iosize;
775 off = blkno * vp->v_mount->mnt_stat.f_iosize;
776
777 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
778 int mask;
779
780 m = vm_page_lookup(obj, trunc_page(toff + off));
781 if (!m)
782 return 0;
783 if (vm_page_is_valid(m, toff + off, tinc) == 0)
784 return 0;
785 }
786 return 1;
787}
788
789/*
790 * now we set the dirty range for the buffer --
791 * for NFS -- if the file is mapped and pages have
792 * been written to, let it know. We want the
793 * entire range of the buffer to be marked dirty if
 794 * any of the pages have been written to for consistency
795 * with the b_validoff, b_validend set in the nfs write
796 * code, and used by the nfs read code.
797 */
798static void
799vfs_setdirty(struct buf *bp) {
800 int i;
801 vm_object_t object;
802 vm_offset_t boffset, offset;
803 /*
804 * We qualify the scan for modified pages on whether the
805 * object has been flushed yet. The OBJ_WRITEABLE flag
806 * is not cleared simply by protecting pages off.
807 */
808 if ((bp->b_flags & B_VMIO) &&
809 ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
810 /*
811 * test the pages to see if they have been modified directly
812 * by users through the VM system.
813 */
814 for (i = 0; i < bp->b_npages; i++)
815 vm_page_test_dirty(bp->b_pages[i]);
816
817 /*
818 * scan forwards for the first page modified
819 */
820 for (i = 0; i < bp->b_npages; i++) {
821 if (bp->b_pages[i]->dirty) {
822 break;
823 }
824 }
825 boffset = i * PAGE_SIZE;
826 if (boffset < bp->b_dirtyoff) {
827 bp->b_dirtyoff = boffset;
828 }
829
830 /*
831 * scan backwards for the last page modified
832 */
833 for (i = bp->b_npages - 1; i >= 0; --i) {
834 if (bp->b_pages[i]->dirty) {
835 break;
836 }
837 }
838 boffset = (i + 1) * PAGE_SIZE;
839 offset = boffset + bp->b_pages[0]->offset;
840 if (offset >= object->size) {
841 boffset = object->size - bp->b_pages[0]->offset;
842 }
843 if (bp->b_dirtyend < boffset) {
844 bp->b_dirtyend = boffset;
845 }
846 }
847}
848
849/*
850 * Get a block given a specified block and offset into a file/device.
851 */
852struct buf *
853getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
854{
855 struct buf *bp;
856 int s;
857 struct bufhashhdr *bh;
858 vm_offset_t off;
859 int nleft;
860
861 s = splbio();
862loop:
863 if (bp = gbincore(vp, blkno)) {
864 if (bp->b_flags & (B_BUSY|B_INVAL)) {
865 bp->b_flags |= B_WANTED;
866 if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
867 goto loop;
868
869 splx(s);
870 return (struct buf *) NULL;
871 }
872 bp->b_flags |= B_BUSY | B_CACHE;
873 bremfree(bp);
874 /*
 875 * check for size inconsistencies
876 */
877 if (bp->b_bcount != size) {
878 allocbuf(bp, size);
879 }
880 splx(s);
881 return (bp);
882 } else {
883 vm_object_t obj;
884 int doingvmio;
885
886 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
887 doingvmio = 1;
888 } else {
889 doingvmio = 0;
890 }
891 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
892 if (slpflag || slptimeo)
893 return NULL;
894 goto loop;
895 }
896
897 /*
898 * This code is used to make sure that a buffer is not
899 * created while the getnewbuf routine is blocked.
900 * Normally the vnode is locked so this isn't a problem.
901 * VBLK type I/O requests, however, don't lock the vnode.
902 */
903 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
904 bp->b_flags |= B_INVAL;
905 brelse(bp);
906 goto loop;
907 }
908
909 /*
910 * Insert the buffer into the hash, so that it can
911 * be found by incore.
912 */
913 bp->b_blkno = bp->b_lblkno = blkno;
914 bgetvp(vp, bp);
915 LIST_REMOVE(bp, b_hash);
916 bh = BUFHASH(vp, blkno);
917 LIST_INSERT_HEAD(bh, bp, b_hash);
918
919 if (doingvmio) {
920 bp->b_flags |= (B_VMIO | B_CACHE);
921#if defined(VFS_BIO_DEBUG)
922 if (vp->v_type != VREG)
923 printf("getblk: vmioing file type %d???\n", vp->v_type);
924#endif
925 } else {
926 bp->b_flags &= ~B_VMIO;
927 }
928 splx(s);
929
930 allocbuf(bp, size);
931 return (bp);
932 }
933}
934
935/*
936 * Get an empty, disassociated buffer of given size.
937 */
938struct buf *
939geteblk(int size)
940{
941 struct buf *bp;
942
943 while ((bp = getnewbuf(0, 0, 0)) == 0);
944 allocbuf(bp, size);
945 bp->b_flags |= B_INVAL;
946 return (bp);
947}
948
949/*
950 * This code constitutes the buffer memory from either anonymous system
951 * memory (in the case of non-VMIO operations) or from an associated
952 * VM object (in the case of VMIO operations).
953 *
954 * Note that this code is tricky, and has many complications to resolve
 955 * deadlock or inconsistent data situations. Tread lightly!!!
956 *
957 * Modify the length of a buffer's underlying buffer storage without
958 * destroying information (unless, of course the buffer is shrinking).
959 */
960int
961allocbuf(struct buf * bp, int size)
962{
963
964 int s;
965 int newbsize, mbsize;
966 int i;
967
968 if (!(bp->b_flags & B_BUSY))
969 panic("allocbuf: buffer not busy");
970
971 if ((bp->b_flags & B_VMIO) == 0) {
972 /*
973 * Just get anonymous memory from the kernel
974 */
975 mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
976 newbsize = round_page(size);
977
978 if (newbsize < bp->b_bufsize) {
979 vm_hold_free_pages(
980 bp,
981 (vm_offset_t) bp->b_data + newbsize,
982 (vm_offset_t) bp->b_data + bp->b_bufsize);
983 } else if (newbsize > bp->b_bufsize) {
984 vm_hold_load_pages(
985 bp,
986 (vm_offset_t) bp->b_data + bp->b_bufsize,
987 (vm_offset_t) bp->b_data + newbsize);
988 }
989 } else {
990 vm_page_t m;
991 int desiredpages;
992
993 newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
994 desiredpages = round_page(newbsize) / PAGE_SIZE;
995
996 if (newbsize < bp->b_bufsize) {
997 if (desiredpages < bp->b_npages) {
998 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
999 desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1000 for (i = desiredpages; i < bp->b_npages; i++) {
1001 m = bp->b_pages[i];
1002 s = splhigh();
1003 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1004 m->flags |= PG_WANTED;
1005 tsleep(m, PVM, "biodep", 0);
1006 }
1007 splx(s);
1008
1009 if (m->bmapped == 0) {
1010 printf("allocbuf: bmapped is zero for page %d\n", i);
1011 panic("allocbuf: error");
1012 }
1013 --m->bmapped;
1014 if (m->bmapped == 0) {
1015 vm_page_protect(m, VM_PROT_NONE);
1016 vm_page_free(m);
1017 }
1018 bp->b_pages[i] = NULL;
1019 }
1020 bp->b_npages = desiredpages;
1021 }
1022 } else if (newbsize > bp->b_bufsize) {
1023 vm_object_t obj;
1024 vm_offset_t tinc, off, toff, objoff;
1025 int pageindex, curbpnpages;
1026 struct vnode *vp;
1027 int bsize;
1028
1029 vp = bp->b_vp;
1030 bsize = vp->v_mount->mnt_stat.f_iosize;
1031
1032 if (bp->b_npages < desiredpages) {
1033 obj = vp->v_object;
1034 tinc = PAGE_SIZE;
1035 if (tinc > bsize)
1036 tinc = bsize;
1037 off = bp->b_lblkno * bsize;
1038 doretry:
1039 curbpnpages = bp->b_npages;
1040 bp->b_flags |= B_CACHE;
1041 for (toff = 0; toff < newbsize; toff += tinc) {
1042 int mask;
1043 int bytesinpage;
1044
1045 pageindex = toff / PAGE_SIZE;
1046 objoff = trunc_page(toff + off);
1047 if (pageindex < curbpnpages) {
1048 int pb;
1049
1050 m = bp->b_pages[pageindex];
1051 if (m->offset != objoff)
1052 panic("allocbuf: page changed offset??!!!?");
1053 bytesinpage = tinc;
1054 if (tinc > (newbsize - toff))
1055 bytesinpage = newbsize - toff;
1056 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1057 bp->b_flags &= ~B_CACHE;
1058 }
1059 if ((m->flags & PG_ACTIVE) == 0) {
1060 vm_page_activate(m);
1061 m->act_count = 0;
1062 }
1063 continue;
1064 }
1065 m = vm_page_lookup(obj, objoff);
1066 if (!m) {
1067 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1068 if (!m) {
1069 int j;
1070
1071 for (j = bp->b_npages; j < pageindex; j++) {
1072 PAGE_WAKEUP(bp->b_pages[j]);
1073 }
1074 VM_WAIT;
1075 goto doretry;
1076 }
1077 vm_page_activate(m);
1078 m->act_count = 0;
1079 m->valid = 0;
1080 bp->b_flags &= ~B_CACHE;
1081 } else if (m->flags & PG_BUSY) {
1082 int j;
1083
1084 for (j = bp->b_npages; j < pageindex; j++) {
1085 PAGE_WAKEUP(bp->b_pages[j]);
1086 }
1087
1088 s = splbio();
1089 m->flags |= PG_WANTED;
1090 tsleep(m, PRIBIO, "pgtblk", 0);
1091 splx(s);
1092
1093 goto doretry;
1094 } else {
1095 int pb;
1096 if ((curproc != pageproc) &&
1097 (m->flags & PG_CACHE) &&
1098 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1099 pagedaemon_wakeup();
1100 }
1101 bytesinpage = tinc;
1102 if (tinc > (newbsize - toff))
1103 bytesinpage = newbsize - toff;
1104 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1105 bp->b_flags &= ~B_CACHE;
1106 }
1107 if ((m->flags & PG_ACTIVE) == 0) {
1108 vm_page_activate(m);
1109 m->act_count = 0;
1110 }
1111 m->flags |= PG_BUSY;
1112 }
1113 bp->b_pages[pageindex] = m;
1114 curbpnpages = pageindex + 1;
1115 }
1116 for (i = bp->b_npages; i < curbpnpages; i++) {
1117 m = bp->b_pages[i];
1118 m->bmapped++;
1119 PAGE_WAKEUP(m);
1120 }
1121 bp->b_npages = curbpnpages;
1122 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1123 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1124 bp->b_data += off % PAGE_SIZE;
1125 }
1126 }
1127 }
1128 bufspace += (newbsize - bp->b_bufsize);
1129 bp->b_bufsize = newbsize;
1130 bp->b_bcount = size;
1131 return 1;
1132}
1133
1134/*
1135 * Wait for buffer I/O completion, returning error status.
1136 */
1137int
1138biowait(register struct buf * bp)
1139{
1140 int s;
1141
1142 s = splbio();
1143 while ((bp->b_flags & B_DONE) == 0)
1144 tsleep(bp, PRIBIO, "biowait", 0);
1145 splx(s);
1146 if (bp->b_flags & B_EINTR) {
1147 bp->b_flags &= ~B_EINTR;
1148 return (EINTR);
1149 }
1150 if (bp->b_flags & B_ERROR) {
1151 return (bp->b_error ? bp->b_error : EIO);
1152 } else {
1153 return (0);
1154 }
1155}
1156
1157/*
1158 * Finish I/O on a buffer, calling an optional function.
1159 * This is usually called from interrupt level, so process blocking
1160 * is not *a good idea*.
1161 */
1162void
1163biodone(register struct buf * bp)
1164{
1165 int s;
1166
1167 s = splbio();
1168 if (!(bp->b_flags & B_BUSY))
1169 panic("biodone: buffer not busy");
1170
1171 if (bp->b_flags & B_DONE) {
1172 splx(s);
1173 printf("biodone: buffer already done\n");
1174 return;
1175 }
1176 bp->b_flags |= B_DONE;
1177
1178 if ((bp->b_flags & B_READ) == 0) {
1179 struct vnode *vp = bp->b_vp;
1180 vwakeup(bp);
1181 }
1182#ifdef BOUNCE_BUFFERS
1183 if (bp->b_flags & B_BOUNCE)
1184 vm_bounce_free(bp);
1185#endif
1186
1187 /* call optional completion function if requested */
1188 if (bp->b_flags & B_CALL) {
1189 bp->b_flags &= ~B_CALL;
1190 (*bp->b_iodone) (bp);
1191 splx(s);
1192 return;
1193 }
1194 if (bp->b_flags & B_VMIO) {
1195 int i, resid;
1196 vm_offset_t foff;
1197 vm_page_t m;
1198 vm_object_t obj;
1199 int iosize;
1200 struct vnode *vp = bp->b_vp;
1201
1202 foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1203 obj = vp->v_object;
1204 if (!obj) {
1205 panic("biodone: no object");
1206 }
1207#if defined(VFS_BIO_DEBUG)
1208 if (obj->paging_in_progress < bp->b_npages) {
1209 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1210 obj->paging_in_progress, bp->b_npages);
1211 }
1212#endif
1213 iosize = bp->b_bufsize;
1214 for (i = 0; i < bp->b_npages; i++) {
1215 int bogusflag = 0;
1216 m = bp->b_pages[i];
1217 if (m == bogus_page) {
1218 bogusflag = 1;
1219 m = vm_page_lookup(obj, foff);
1220 if (!m) {
1221#if defined(VFS_BIO_DEBUG)
1222 printf("biodone: page disappeared\n");
1223#endif
1224 --obj->paging_in_progress;
1225 continue;
1226 }
1227 bp->b_pages[i] = m;
1228 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1229 }
1230#if defined(VFS_BIO_DEBUG)
1231 if (trunc_page(foff) != m->offset) {
1232 printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1233 }
1234#endif
1235 resid = (m->offset + PAGE_SIZE) - foff;
1236 if (resid > iosize)
1237 resid = iosize;
1238 /*
1239 * In the write case, the valid and clean bits are
1240 * already changed correctly, so we only need to do this
1241 * here in the read case.
1242 */
1243 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1244 vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
1245 }
1246
1247 /*
1248 * when debugging new filesystems or buffer I/O methods, this
1249 * is the most common error that pops up. if you see this, you
1250 * have not set the page busy flag correctly!!!
1251 */
1252 if (m->busy == 0) {
1253 printf("biodone: page busy < 0, "
1254 "off: %ld, foff: %ld, "
1255 "resid: %d, index: %d\n",
1256 m->offset, foff, resid, i);
1257 printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
1258 bp->b_vp->v_mount->mnt_stat.f_iosize,
1259 bp->b_lblkno, bp->b_flags, bp->b_npages);
1260 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1261 m->valid, m->dirty, m->bmapped);
1262 panic("biodone: page busy < 0\n");
1263 }
1264 --m->busy;
1265 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1266 m->flags &= ~PG_WANTED;
1267 wakeup(m);
1268 }
1269 --obj->paging_in_progress;
1270 foff += resid;
1271 iosize -= resid;
1272 }
1273 if (obj && obj->paging_in_progress == 0 &&
1274 (obj->flags & OBJ_PIPWNT)) {
1275 obj->flags &= ~OBJ_PIPWNT;
1276 wakeup(obj);
1277 }
1278 }
1279 /*
1280 * For asynchronous completions, release the buffer now. The brelse
1281 * checks for B_WANTED and will do the wakeup there if necessary - so
1282 * no need to do a wakeup here in the async case.
1283 */
1284
1285 if (bp->b_flags & B_ASYNC) {
1286 brelse(bp);
1287 } else {
1288 bp->b_flags &= ~B_WANTED;
1289 wakeup(bp);
1290 }
1291 splx(s);
1292}
1293
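/*
 * Return the number of buffers currently on the locked queue.
 */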
1294int
1295count_lock_queue()
1296{
1297 int count;
1298 struct buf *bp;
1299
1300 count = 0;
1301 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1302 bp != NULL;
1303 bp = bp->b_freelist.tqe_next)
1304 count++;
1305 return (count);
1306}
1307
1308int vfs_update_interval = 30;
1309
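/*
 * The update daemon loop: sleep for vfs_update_interval seconds (or
 * until someone kicks vfs_update_wakeup), then sync the filesystems.
 */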
1310void
1311vfs_update()
1312{
1313 (void) spl0();
1314 while (1) {
1315 tsleep(&vfs_update_wakeup, PRIBIO, "update",
1316 hz * vfs_update_interval);
1317 vfs_update_wakeup = 0;
1318 sync(curproc, NULL, NULL);
1319 }
1320}
1321
1322/*
1323 * This routine is called in lieu of iodone in the case of
1324 * incomplete I/O. This keeps the busy status for pages
 1325 * consistent.
1326 */
1327void
1328vfs_unbusy_pages(struct buf * bp)
1329{
1330 int i;
1331
1332 if (bp->b_flags & B_VMIO) {
1333 struct vnode *vp = bp->b_vp;
1334 vm_object_t obj = vp->v_object;
1335 vm_offset_t foff;
1336
1337 foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
1338
1339 for (i = 0; i < bp->b_npages; i++) {
1340 vm_page_t m = bp->b_pages[i];
1341
1342 if (m == bogus_page) {
1343 m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
1344 if (!m) {
1345 panic("vfs_unbusy_pages: page missing\n");
1346 }
1347 bp->b_pages[i] = m;
1348 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1349 }
1350 --obj->paging_in_progress;
1351 --m->busy;
1352 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1353 m->flags &= ~PG_WANTED;
1354 wakeup(m);
1355 }
1356 }
1357 if (obj->paging_in_progress == 0 &&
1358 (obj->flags & OBJ_PIPWNT)) {
1359 obj->flags &= ~OBJ_PIPWNT;
1360 wakeup(obj);
1361 }
1362 }
1363}
1364
1365/*
1366 * This routine is called before a device strategy routine.
1367 * It is used to tell the VM system that paging I/O is in
1368 * progress, and treat the pages associated with the buffer
1369 * almost as being PG_BUSY. Also the object paging_in_progress
1370 * flag is handled to make sure that the object doesn't become
 1371 * inconsistent.
1372 */
1373void
1374vfs_busy_pages(struct buf * bp, int clear_modify)
1375{
1376 int i;
1377
1378 if (bp->b_flags & B_VMIO) {
1379 vm_object_t obj = bp->b_vp->v_object;
1380 vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1381 int iocount = bp->b_bufsize;
1382
1383 vfs_setdirty(bp);
1384 for (i = 0; i < bp->b_npages; i++) {
1385 vm_page_t m = bp->b_pages[i];
1386 int resid = (m->offset + PAGE_SIZE) - foff;
1387
1388 if (resid > iocount)
1389 resid = iocount;
1390 if ((bp->b_flags & B_CLUSTER) == 0) {
1391 obj->paging_in_progress++;
1392 m->busy++;
1393 }
1394 if (clear_modify) {
1395 vm_page_protect(m, VM_PROT_READ);
1396 vm_page_set_validclean(m,
1397 foff & (PAGE_SIZE-1), resid);
1398 } else if (bp->b_bcount >= PAGE_SIZE) {
1399 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1400 bp->b_pages[i] = bogus_page;
1401 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1402 }
1403 }
1404 foff += resid;
1405 iocount -= resid;
1406 }
1407 }
1408}
1409
1410/*
1411 * Tell the VM system that the pages associated with this buffer
1412 * are clean. This is used for delayed writes where the data is
 1413 * going to go to disk eventually without additional VM intervention.
1414 */
1415void
1416vfs_clean_pages(struct buf * bp)
1417{
1418 int i;
1419
1420 if (bp->b_flags & B_VMIO) {
1421 vm_offset_t foff =
1422 bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1423 int iocount = bp->b_bufsize;
1424
1425 for (i = 0; i < bp->b_npages; i++) {
1426 vm_page_t m = bp->b_pages[i];
1427 int resid = (m->offset + PAGE_SIZE) - foff;
1428
1429 if (resid > iocount)
1430 resid = iocount;
1431 if (resid > 0) {
1432 vm_page_set_validclean(m,
1433 foff & (PAGE_SIZE-1), resid);
1434 }
1435 foff += resid;
1436 iocount -= resid;
1437 }
1438 }
1439}
1440
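/*
 * Zero the portions of a VMIO buffer that are not already valid in the
 * underlying VM pages, then mark those pages fully valid.  Non-VMIO
 * buffers are simply cleared with clrbuf().
 */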
1441void
1442vfs_bio_clrbuf(struct buf *bp) {
1443 int i;
1444 if( bp->b_flags & B_VMIO) {
1445 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1446 int j;
1447 if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
1448 for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) {
1449 bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
1450 }
1451 }
1452 bp->b_resid = 0;
1453 return;
1454 }
1455 for(i=0;i<bp->b_npages;i++) {
1456 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1457 continue;
1458 if( bp->b_pages[i]->valid == 0) {
1459 bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
1460 } else {
1461 int j;
1462 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1463 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1464 bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
1465 }
1466 }
1467 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1468 }
1469 bp->b_resid = 0;
1470 } else {
1471 clrbuf(bp);
1472 }
1473}
1474
1475/*
 1476 * vm_hold_load_pages and vm_hold_free_pages move pages into and out of
 1477 * a buffer's address space. The pages are anonymous and are
1478 * not associated with a file object.
1479 */
1480void
1481vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1482{
1483 vm_offset_t pg;
1484 vm_page_t p;
1485 vm_offset_t from = round_page(froma);
1486 vm_offset_t to = round_page(toa);
1487
1488 for (pg = from; pg < to; pg += PAGE_SIZE) {
1489
1490tryagain:
1491
1492 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1493 VM_ALLOC_NORMAL);
1494 if (!p) {
1495 VM_WAIT;
1496 goto tryagain;
1497 }
1498 vm_page_wire(p);
1499 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1500 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1501 PAGE_WAKEUP(p);
1502 bp->b_npages++;
1503 }
1504}
1505
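/*
 * Counterpart of vm_hold_load_pages(): unmap and free the anonymous
 * pages backing the buffer KVA range [froma, toa), decrementing
 * b_npages for each page released.
 */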
1506void
1507vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1508{
1509 vm_offset_t pg;
1510 vm_page_t p;
1511 vm_offset_t from = round_page(froma);
1512 vm_offset_t to = round_page(toa);
1513
1514 for (pg = from; pg < to; pg += PAGE_SIZE) {
1515 p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1516 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1517 pmap_kremove(pg);
1518 vm_page_free(p);
1519 --bp->b_npages;
1520 }
1521}
350 }
351
352 /*
353 * Set the *dirty* buffer range based upon the VM system dirty pages.
354 */
355 vfs_setdirty(bp);
356
357 /*
358 * We need to do this here to satisfy the vnode_pager and the
359 * pageout daemon, so that it thinks that the pages have been
360 * "cleaned". Note that since the pages are in a delayed write
361 * buffer -- the VFS layer "will" see that the pages get written
362 * out on the next sync, or perhaps the cluster will be completed.
363 */
364 vfs_clean_pages(bp);
365 brelse(bp);
366 return;
367}
368
369/*
370 * Asynchronous write.
371 * Start output on a buffer, but do not wait for it to complete.
372 * The buffer is released when the output completes.
373 */
374void
375bawrite(struct buf * bp)
376{
377 bp->b_flags |= B_ASYNC;
378 (void) VOP_BWRITE(bp);
379}
380
381/*
382 * Release a buffer.
383 */
384void
385brelse(struct buf * bp)
386{
387 int s;
388
389 if (bp->b_flags & B_CLUSTER) {
390 relpbuf(bp);
391 return;
392 }
393 /* anyone need a "free" block? */
394 s = splbio();
395
396 if (needsbuffer) {
397 needsbuffer = 0;
398 wakeup(&needsbuffer);
399 }
400
401 /* anyone need this block? */
402 if (bp->b_flags & B_WANTED) {
403 bp->b_flags &= ~(B_WANTED | B_AGE);
404 wakeup(bp);
405 } else if (bp->b_flags & B_VMIO) {
406 bp->b_flags &= ~B_WANTED;
407 wakeup(bp);
408 }
409 if (bp->b_flags & B_LOCKED)
410 bp->b_flags &= ~B_ERROR;
411
412 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
413 (bp->b_bufsize <= 0)) {
414 bp->b_flags |= B_INVAL;
415 bp->b_flags &= ~(B_DELWRI | B_CACHE);
416 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
417 brelvp(bp);
418 }
419
420 /*
421 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
422 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
423 * but the VM object is kept around. The B_NOCACHE flag is used to
424 * invalidate the pages in the VM object.
425 */
426 if (bp->b_flags & B_VMIO) {
427 vm_offset_t foff;
428 vm_object_t obj;
429 int i, resid;
430 vm_page_t m;
431 int iototal = bp->b_bufsize;
432
433 foff = 0;
434 obj = 0;
435 if (bp->b_npages) {
436 if (bp->b_vp && bp->b_vp->v_mount) {
437 foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
438 } else {
439 /*
440 * vnode pointer has been ripped away --
441 * probably file gone...
442 */
443 foff = bp->b_pages[0]->offset;
444 }
445 }
446 for (i = 0; i < bp->b_npages; i++) {
447 m = bp->b_pages[i];
448 if (m == bogus_page) {
449 m = vm_page_lookup(obj, foff);
450 if (!m) {
451 panic("brelse: page missing\n");
452 }
453 bp->b_pages[i] = m;
454 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
455 }
456 resid = (m->offset + PAGE_SIZE) - foff;
457 if (resid > iototal)
458 resid = iototal;
459 if (resid > 0) {
460 /*
461 * Don't invalidate the page if the local machine has already
462 * modified it. This is the lesser of two evils, and should
463 * be fixed.
464 */
465 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
466 vm_page_test_dirty(m);
467 if (m->dirty == 0) {
468 vm_page_set_invalid(m, foff, resid);
469 if (m->valid == 0)
470 vm_page_protect(m, VM_PROT_NONE);
471 }
472 }
473 }
474 foff += resid;
475 iototal -= resid;
476 }
477
478 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
479 for(i=0;i<bp->b_npages;i++) {
480 m = bp->b_pages[i];
481 --m->bmapped;
482 if (m->bmapped == 0) {
483 if (m->flags & PG_WANTED) {
484 wakeup(m);
485 m->flags &= ~PG_WANTED;
486 }
487 vm_page_test_dirty(m);
488 if ((m->dirty & m->valid) == 0 &&
489 (m->flags & PG_REFERENCED) == 0 &&
490 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
491 vm_page_cache(m);
492 } else if ((m->flags & PG_ACTIVE) == 0) {
493 vm_page_activate(m);
494 m->act_count = 0;
495 }
496 }
497 }
498 bufspace -= bp->b_bufsize;
499 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
500 bp->b_npages = 0;
501 bp->b_bufsize = 0;
502 bp->b_flags &= ~B_VMIO;
503 if (bp->b_vp)
504 brelvp(bp);
505 }
506 }
507 if (bp->b_qindex != QUEUE_NONE)
508 panic("brelse: free buffer onto another queue???");
509
510 /* enqueue */
511 /* buffers with no memory */
512 if (bp->b_bufsize == 0) {
513 bp->b_qindex = QUEUE_EMPTY;
514 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
515 LIST_REMOVE(bp, b_hash);
516 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
517 bp->b_dev = NODEV;
518 /* buffers with junk contents */
519 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
520 bp->b_qindex = QUEUE_AGE;
521 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
522 LIST_REMOVE(bp, b_hash);
523 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
524 bp->b_dev = NODEV;
525 /* buffers that are locked */
526 } else if (bp->b_flags & B_LOCKED) {
527 bp->b_qindex = QUEUE_LOCKED;
528 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
529 /* buffers with stale but valid contents */
530 } else if (bp->b_flags & B_AGE) {
531 bp->b_qindex = QUEUE_AGE;
532 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
533 /* buffers with valid and quite potentially reuseable contents */
534 } else {
535 bp->b_qindex = QUEUE_LRU;
536 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
537 }
538
539 /* unlock */
540 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
541 splx(s);
542}
543
544/*
545 * Check to see if a block is currently memory resident.
546 */
547static __inline struct buf *
548gbincore(struct vnode * vp, daddr_t blkno)
549{
550 struct buf *bp;
551 struct bufhashhdr *bh;
552
553 bh = BUFHASH(vp, blkno);
554 bp = bh->lh_first;
555
556 /* Search hash chain */
557 while (bp != NULL) {
558 /* hit */
559 if (bp->b_vp == vp && bp->b_lblkno == blkno) {
560 break;
561 }
562 bp = bp->b_hash.le_next;
563 }
564 return (bp);
565}
566
567/*
568 * this routine implements clustered async writes for
569 * clearing out B_DELWRI buffers... This is much better
570 * than the old way of writing only one buffer at a time.
571 */
572void
573vfs_bio_awrite(struct buf * bp)
574{
575 int i;
576 daddr_t lblkno = bp->b_lblkno;
577 struct vnode *vp = bp->b_vp;
578 int s;
579 int ncl;
580 struct buf *bpa;
581
582 s = splbio();
583 if (vp->v_mount && (vp->v_flag & VVMIO) &&
584 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
585 int size = vp->v_mount->mnt_stat.f_iosize;
586 int maxcl = MAXPHYS / size;
587
588 for (i = 1; i < maxcl; i++) {
589 if ((bpa = gbincore(vp, lblkno + i)) &&
590 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
591 (B_DELWRI | B_CLUSTEROK)) &&
592 (bpa->b_bufsize == size)) {
593 if ((bpa->b_blkno == bpa->b_lblkno) ||
594 (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
595 break;
596 } else {
597 break;
598 }
599 }
600 ncl = i;
601 /*
602 * this is a possible cluster write
603 */
604 if (ncl != 1) {
605 bremfree(bp);
606 cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
607 splx(s);
608 return;
609 }
610 }
611 /*
612 * default (old) behavior, writing out only one block
613 */
614 bremfree(bp);
615 bp->b_flags |= B_BUSY | B_ASYNC;
616 (void) VOP_BWRITE(bp);
617 splx(s);
618}
619
620
621/*
622 * Find a buffer header which is available for use.
623 */
624static struct buf *
625getnewbuf(int slpflag, int slptimeo, int doingvmio)
626{
627 struct buf *bp;
628 int s;
629 int firstbp = 1;
630
631 s = splbio();
632start:
633 if (bufspace >= maxbufspace)
634 goto trytofreespace;
635
636 /* can we constitute a new buffer? */
637 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
638 if (bp->b_qindex != QUEUE_EMPTY)
639 panic("getnewbuf: inconsistent EMPTY queue");
640 bremfree(bp);
641 goto fillbuf;
642 }
643trytofreespace:
644 /*
645 * We keep the file I/O from hogging metadata I/O
646 * This is desirable because file data is cached in the
647 * VM/Buffer cache even if a buffer is freed.
648 */
649 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
650 if (bp->b_qindex != QUEUE_AGE)
651 panic("getnewbuf: inconsistent AGE queue");
652 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
653 if (bp->b_qindex != QUEUE_LRU)
654 panic("getnewbuf: inconsistent LRU queue");
655 }
656 if (!bp) {
657 /* wait for a free buffer of any kind */
658 needsbuffer = 1;
659 tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
660 splx(s);
661 return (0);
662 }
663
664 /* if we are a delayed write, convert to an async write */
665 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
666 vfs_bio_awrite(bp);
667 if (!slpflag && !slptimeo) {
668 splx(s);
669 return (0);
670 }
671 goto start;
672 }
673
674 if (bp->b_flags & B_WANTED) {
675 bp->b_flags &= ~B_WANTED;
676 wakeup(bp);
677 }
678 bremfree(bp);
679
680 if (bp->b_flags & B_VMIO) {
681 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
682 brelse(bp);
683 bremfree(bp);
684 }
685
686 if (bp->b_vp)
687 brelvp(bp);
688
689 /* we are not free, nor do we contain interesting data */
690 if (bp->b_rcred != NOCRED)
691 crfree(bp->b_rcred);
692 if (bp->b_wcred != NOCRED)
693 crfree(bp->b_wcred);
694fillbuf:
695 bp->b_flags |= B_BUSY;
696 LIST_REMOVE(bp, b_hash);
697 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
698 splx(s);
699 if (bp->b_bufsize) {
700 allocbuf(bp, 0);
701 }
702 bp->b_flags = B_BUSY;
703 bp->b_dev = NODEV;
704 bp->b_vp = NULL;
705 bp->b_blkno = bp->b_lblkno = 0;
706 bp->b_iodone = 0;
707 bp->b_error = 0;
708 bp->b_resid = 0;
709 bp->b_bcount = 0;
710 bp->b_npages = 0;
711 bp->b_wcred = bp->b_rcred = NOCRED;
712 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
713 bp->b_dirtyoff = bp->b_dirtyend = 0;
714 bp->b_validoff = bp->b_validend = 0;
715 if (bufspace >= maxbufspace) {
716 s = splbio();
717 bp->b_flags |= B_INVAL;
718 brelse(bp);
719 goto trytofreespace;
720 }
721 return (bp);
722}
723
724/*
725 * Check to see if a block is currently memory resident.
726 */
727struct buf *
728incore(struct vnode * vp, daddr_t blkno)
729{
730 struct buf *bp;
731 struct bufhashhdr *bh;
732
733 int s = splbio();
734
735 bh = BUFHASH(vp, blkno);
736 bp = bh->lh_first;
737
738 /* Search hash chain */
739 while (bp != NULL) {
740 /* hit */
741 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
742 (bp->b_flags & B_INVAL) == 0) {
743 break;
744 }
745 bp = bp->b_hash.le_next;
746 }
747 splx(s);
748 return (bp);
749}
750
751/*
752 * Returns true if no I/O is needed to access the
753 * associated VM object. This is like incore except
754 * it also hunts around in the VM system for the data.
755 */
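/*
 * For example, with an 8K filesystem block size and 4K pages, tinc ends
 * up as PAGE_SIZE and the loop below checks two resident pages for full
 * validity of the block's range; any missing or partially valid page
 * makes inmem() return 0.
 */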
756
757int
758inmem(struct vnode * vp, daddr_t blkno)
759{
760 vm_object_t obj;
761 vm_offset_t off, toff, tinc;
762 vm_page_t m;
763
764 if (incore(vp, blkno))
765 return 1;
766 if (vp->v_mount == NULL)
767 return 0;
768 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
769 return 0;
770
771 obj = vp->v_object;
772 tinc = PAGE_SIZE;
773 if (tinc > vp->v_mount->mnt_stat.f_iosize)
774 tinc = vp->v_mount->mnt_stat.f_iosize;
775 off = blkno * vp->v_mount->mnt_stat.f_iosize;
776
777 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
778 int mask;
779
780 m = vm_page_lookup(obj, trunc_page(toff + off));
781 if (!m)
782 return 0;
783 if (vm_page_is_valid(m, toff + off, tinc) == 0)
784 return 0;
785 }
786 return 1;
787}
788
789/*
790 * Set the dirty range for the buffer -- this is for NFS.  If the
791 * file is mapped and pages have been written to through the mapping,
792 * let the buffer know about it here.  We want the entire range of
793 * the buffer to be marked dirty if any of the pages have been
794 * written to, for consistency with the b_validoff and b_validend
795 * that are set in the NFS write code and
796 * used by the NFS read code.
797 */
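/*
 * Worked example (4K pages, an 8K buffer): if only the second page was
 * modified through a mapping, both scans below stop at page index 1 and
 * the dirty range is widened so that it covers at least [4096, 8192)
 * (subject to the end-of-object clamp).  The range is only ever widened
 * here, never shrunk.
 */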
798static void
799vfs_setdirty(struct buf *bp) {
800 int i;
801 vm_object_t object;
802 vm_offset_t boffset, offset;
803 /*
804	 * We only bother scanning for pages modified via the VM system if
805	 * the object has ever been mapped for writing: the OBJ_WRITEABLE
806	 * flag is not cleared simply by write-protecting the pages.
807 */
808 if ((bp->b_flags & B_VMIO) &&
809 ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
810 /*
811 * test the pages to see if they have been modified directly
812 * by users through the VM system.
813 */
814 for (i = 0; i < bp->b_npages; i++)
815 vm_page_test_dirty(bp->b_pages[i]);
816
817 /*
818 * scan forwards for the first page modified
819 */
820 for (i = 0; i < bp->b_npages; i++) {
821 if (bp->b_pages[i]->dirty) {
822 break;
823 }
824 }
825 boffset = i * PAGE_SIZE;
826 if (boffset < bp->b_dirtyoff) {
827 bp->b_dirtyoff = boffset;
828 }
829
830 /*
831 * scan backwards for the last page modified
832 */
833 for (i = bp->b_npages - 1; i >= 0; --i) {
834 if (bp->b_pages[i]->dirty) {
835 break;
836 }
837 }
838 boffset = (i + 1) * PAGE_SIZE;
839 offset = boffset + bp->b_pages[0]->offset;
840 if (offset >= object->size) {
841 boffset = object->size - bp->b_pages[0]->offset;
842 }
843 if (bp->b_dirtyend < boffset) {
844 bp->b_dirtyend = boffset;
845 }
846 }
847}
848
849/*
850 * Get a block given a specified block and offset into a file/device.
851 */
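/*
 * Illustrative only -- a rough sketch of the classic bread()-style read
 * path built on top of getblk(), where "vp", "blkno" and "size" describe
 * the block wanted:
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags |= B_READ;
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(bp);
 *		error = biowait(bp);
 *	}
 */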
852struct buf *
853getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
854{
855 struct buf *bp;
856 int s;
857 struct bufhashhdr *bh;
858 vm_offset_t off;
859 int nleft;
860
861 s = splbio();
862loop:
863	if ((bp = gbincore(vp, blkno)) != NULL) {
864 if (bp->b_flags & (B_BUSY|B_INVAL)) {
865 bp->b_flags |= B_WANTED;
866 if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
867 goto loop;
868
869 splx(s);
870 return (struct buf *) NULL;
871 }
872 bp->b_flags |= B_BUSY | B_CACHE;
873 bremfree(bp);
874 /*
875		 * check for size inconsistencies
876 */
877 if (bp->b_bcount != size) {
878 allocbuf(bp, size);
879 }
880 splx(s);
881 return (bp);
882 } else {
883 vm_object_t obj;
884 int doingvmio;
885
886 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
887 doingvmio = 1;
888 } else {
889 doingvmio = 0;
890 }
891 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
892 if (slpflag || slptimeo)
893 return NULL;
894 goto loop;
895 }
896
897 /*
898 * This code is used to make sure that a buffer is not
899 * created while the getnewbuf routine is blocked.
900 * Normally the vnode is locked so this isn't a problem.
901 * VBLK type I/O requests, however, don't lock the vnode.
902 */
903 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
904 bp->b_flags |= B_INVAL;
905 brelse(bp);
906 goto loop;
907 }
908
909 /*
910 * Insert the buffer into the hash, so that it can
911 * be found by incore.
912 */
913 bp->b_blkno = bp->b_lblkno = blkno;
914 bgetvp(vp, bp);
915 LIST_REMOVE(bp, b_hash);
916 bh = BUFHASH(vp, blkno);
917 LIST_INSERT_HEAD(bh, bp, b_hash);
918
919 if (doingvmio) {
920 bp->b_flags |= (B_VMIO | B_CACHE);
921#if defined(VFS_BIO_DEBUG)
922 if (vp->v_type != VREG)
923 printf("getblk: vmioing file type %d???\n", vp->v_type);
924#endif
925 } else {
926 bp->b_flags &= ~B_VMIO;
927 }
928 splx(s);
929
930 allocbuf(bp, size);
931 return (bp);
932 }
933}
934
935/*
936 * Get an empty, disassociated buffer of given size.
937 */
938struct buf *
939geteblk(int size)
940{
941 struct buf *bp;
942
943 while ((bp = getnewbuf(0, 0, 0)) == 0);
944 allocbuf(bp, size);
945 bp->b_flags |= B_INVAL;
946 return (bp);
947}
948
949/*
950 * This code constitutes the buffer memory from either anonymous system
951 * memory (in the case of non-VMIO operations) or from an associated
952 * VM object (in the case of VMIO operations).
953 *
954 * Note that this code is tricky, and has many complications to resolve
955 * deadlock or inconsistent data situations.  Tread lightly!!!
956 *
957 * Modify the length of a buffer's underlying buffer storage without
958 * destroying information (unless, of course, the buffer is shrinking).
959 */
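/*
 * Sizing example for the VMIO case (assuming 4K pages and a DEV_BSIZE of
 * 512): growing a buffer to size 6144 gives newbsize = 6144 and
 * desiredpages = round_page(6144) / PAGE_SIZE = 2, so two object pages
 * end up mapped into the buffer's kernel address window via pmap_qenter().
 */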
960int
961allocbuf(struct buf * bp, int size)
962{
963
964 int s;
965 int newbsize, mbsize;
966 int i;
967
968 if (!(bp->b_flags & B_BUSY))
969 panic("allocbuf: buffer not busy");
970
971 if ((bp->b_flags & B_VMIO) == 0) {
972 /*
973 * Just get anonymous memory from the kernel
974 */
975 mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
976 newbsize = round_page(size);
977
978 if (newbsize < bp->b_bufsize) {
979 vm_hold_free_pages(
980 bp,
981 (vm_offset_t) bp->b_data + newbsize,
982 (vm_offset_t) bp->b_data + bp->b_bufsize);
983 } else if (newbsize > bp->b_bufsize) {
984 vm_hold_load_pages(
985 bp,
986 (vm_offset_t) bp->b_data + bp->b_bufsize,
987 (vm_offset_t) bp->b_data + newbsize);
988 }
989 } else {
990 vm_page_t m;
991 int desiredpages;
992
993 newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
994 desiredpages = round_page(newbsize) / PAGE_SIZE;
995
996 if (newbsize < bp->b_bufsize) {
997 if (desiredpages < bp->b_npages) {
998 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
999 desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1000 for (i = desiredpages; i < bp->b_npages; i++) {
1001 m = bp->b_pages[i];
1002 s = splhigh();
1003 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1004 m->flags |= PG_WANTED;
1005 tsleep(m, PVM, "biodep", 0);
1006 }
1007 splx(s);
1008
1009 if (m->bmapped == 0) {
1010 printf("allocbuf: bmapped is zero for page %d\n", i);
1011 panic("allocbuf: error");
1012 }
1013 --m->bmapped;
1014 if (m->bmapped == 0) {
1015 vm_page_protect(m, VM_PROT_NONE);
1016 vm_page_free(m);
1017 }
1018 bp->b_pages[i] = NULL;
1019 }
1020 bp->b_npages = desiredpages;
1021 }
1022 } else if (newbsize > bp->b_bufsize) {
1023 vm_object_t obj;
1024 vm_offset_t tinc, off, toff, objoff;
1025 int pageindex, curbpnpages;
1026 struct vnode *vp;
1027 int bsize;
1028
1029 vp = bp->b_vp;
1030 bsize = vp->v_mount->mnt_stat.f_iosize;
1031
1032 if (bp->b_npages < desiredpages) {
1033 obj = vp->v_object;
1034 tinc = PAGE_SIZE;
1035 if (tinc > bsize)
1036 tinc = bsize;
1037 off = bp->b_lblkno * bsize;
1038 doretry:
1039 curbpnpages = bp->b_npages;
1040 bp->b_flags |= B_CACHE;
1041 for (toff = 0; toff < newbsize; toff += tinc) {
1042 int mask;
1043 int bytesinpage;
1044
1045 pageindex = toff / PAGE_SIZE;
1046 objoff = trunc_page(toff + off);
1047 if (pageindex < curbpnpages) {
1048 int pb;
1049
1050 m = bp->b_pages[pageindex];
1051 if (m->offset != objoff)
1052 panic("allocbuf: page changed offset??!!!?");
1053 bytesinpage = tinc;
1054 if (tinc > (newbsize - toff))
1055 bytesinpage = newbsize - toff;
1056 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1057 bp->b_flags &= ~B_CACHE;
1058 }
1059 if ((m->flags & PG_ACTIVE) == 0) {
1060 vm_page_activate(m);
1061 m->act_count = 0;
1062 }
1063 continue;
1064 }
1065 m = vm_page_lookup(obj, objoff);
1066 if (!m) {
1067 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1068 if (!m) {
1069 int j;
1070
1071 for (j = bp->b_npages; j < pageindex; j++) {
1072 PAGE_WAKEUP(bp->b_pages[j]);
1073 }
1074 VM_WAIT;
1075 goto doretry;
1076 }
1077 vm_page_activate(m);
1078 m->act_count = 0;
1079 m->valid = 0;
1080 bp->b_flags &= ~B_CACHE;
1081 } else if (m->flags & PG_BUSY) {
1082 int j;
1083
1084 for (j = bp->b_npages; j < pageindex; j++) {
1085 PAGE_WAKEUP(bp->b_pages[j]);
1086 }
1087
1088 s = splbio();
1089 m->flags |= PG_WANTED;
1090 tsleep(m, PRIBIO, "pgtblk", 0);
1091 splx(s);
1092
1093 goto doretry;
1094 } else {
1095 int pb;
1096 if ((curproc != pageproc) &&
1097 (m->flags & PG_CACHE) &&
1098 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1099 pagedaemon_wakeup();
1100 }
1101 bytesinpage = tinc;
1102 if (tinc > (newbsize - toff))
1103 bytesinpage = newbsize - toff;
1104 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1105 bp->b_flags &= ~B_CACHE;
1106 }
1107 if ((m->flags & PG_ACTIVE) == 0) {
1108 vm_page_activate(m);
1109 m->act_count = 0;
1110 }
1111 m->flags |= PG_BUSY;
1112 }
1113 bp->b_pages[pageindex] = m;
1114 curbpnpages = pageindex + 1;
1115 }
1116 for (i = bp->b_npages; i < curbpnpages; i++) {
1117 m = bp->b_pages[i];
1118 m->bmapped++;
1119 PAGE_WAKEUP(m);
1120 }
1121 bp->b_npages = curbpnpages;
1122 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1123 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1124 bp->b_data += off % PAGE_SIZE;
1125 }
1126 }
1127 }
1128 bufspace += (newbsize - bp->b_bufsize);
1129 bp->b_bufsize = newbsize;
1130 bp->b_bcount = size;
1131 return 1;
1132}
1133
1134/*
1135 * Wait for buffer I/O completion, returning error status.
1136 */
1137int
1138biowait(register struct buf * bp)
1139{
1140 int s;
1141
1142 s = splbio();
1143 while ((bp->b_flags & B_DONE) == 0)
1144 tsleep(bp, PRIBIO, "biowait", 0);
1145 splx(s);
1146 if (bp->b_flags & B_EINTR) {
1147 bp->b_flags &= ~B_EINTR;
1148 return (EINTR);
1149 }
1150 if (bp->b_flags & B_ERROR) {
1151 return (bp->b_error ? bp->b_error : EIO);
1152 } else {
1153 return (0);
1154 }
1155}
1156
1157/*
1158 * Finish I/O on a buffer, calling an optional function.
1159 * This is usually called from interrupt level, so process blocking
1160 * is not *a good idea*.
1161 */
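/*
 * Illustrative only: instead of sleeping in biowait(), an async consumer
 * may ask for a completion callback here ("mydone" is a hypothetical
 * function taking a struct buf pointer):
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = mydone;
 *	VOP_STRATEGY(bp);
 */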
1162void
1163biodone(register struct buf * bp)
1164{
1165 int s;
1166
1167 s = splbio();
1168 if (!(bp->b_flags & B_BUSY))
1169 panic("biodone: buffer not busy");
1170
1171 if (bp->b_flags & B_DONE) {
1172 splx(s);
1173 printf("biodone: buffer already done\n");
1174 return;
1175 }
1176 bp->b_flags |= B_DONE;
1177
1178 if ((bp->b_flags & B_READ) == 0) {
1179 struct vnode *vp = bp->b_vp;
1180 vwakeup(bp);
1181 }
1182#ifdef BOUNCE_BUFFERS
1183 if (bp->b_flags & B_BOUNCE)
1184 vm_bounce_free(bp);
1185#endif
1186
1187 /* call optional completion function if requested */
1188 if (bp->b_flags & B_CALL) {
1189 bp->b_flags &= ~B_CALL;
1190 (*bp->b_iodone) (bp);
1191 splx(s);
1192 return;
1193 }
1194 if (bp->b_flags & B_VMIO) {
1195 int i, resid;
1196 vm_offset_t foff;
1197 vm_page_t m;
1198 vm_object_t obj;
1199 int iosize;
1200 struct vnode *vp = bp->b_vp;
1201
1202 foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1203 obj = vp->v_object;
1204 if (!obj) {
1205 panic("biodone: no object");
1206 }
1207#if defined(VFS_BIO_DEBUG)
1208 if (obj->paging_in_progress < bp->b_npages) {
1209 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1210 obj->paging_in_progress, bp->b_npages);
1211 }
1212#endif
1213 iosize = bp->b_bufsize;
1214 for (i = 0; i < bp->b_npages; i++) {
1215 int bogusflag = 0;
1216 m = bp->b_pages[i];
1217 if (m == bogus_page) {
1218 bogusflag = 1;
1219 m = vm_page_lookup(obj, foff);
1220 if (!m) {
1221#if defined(VFS_BIO_DEBUG)
1222 printf("biodone: page disappeared\n");
1223#endif
1224 --obj->paging_in_progress;
1225 continue;
1226 }
1227 bp->b_pages[i] = m;
1228 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1229 }
1230#if defined(VFS_BIO_DEBUG)
1231 if (trunc_page(foff) != m->offset) {
1232 printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1233 }
1234#endif
1235 resid = (m->offset + PAGE_SIZE) - foff;
1236 if (resid > iosize)
1237 resid = iosize;
1238 /*
1239 * In the write case, the valid and clean bits are
1240 * already changed correctly, so we only need to do this
1241 * here in the read case.
1242 */
1243 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1244 vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
1245 }
1246
1247 /*
1248			 * When debugging new filesystems or buffer I/O methods, this
1249			 * is the most common error that pops up.  If you see this, you
1250			 * have not set the page busy flag correctly!!!
1251 */
1252 if (m->busy == 0) {
1253 printf("biodone: page busy < 0, "
1254 "off: %ld, foff: %ld, "
1255 "resid: %d, index: %d\n",
1256 m->offset, foff, resid, i);
1257 printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
1258 bp->b_vp->v_mount->mnt_stat.f_iosize,
1259 bp->b_lblkno, bp->b_flags, bp->b_npages);
1260 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1261 m->valid, m->dirty, m->bmapped);
1262 panic("biodone: page busy < 0\n");
1263 }
1264 --m->busy;
1265 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1266 m->flags &= ~PG_WANTED;
1267 wakeup(m);
1268 }
1269 --obj->paging_in_progress;
1270 foff += resid;
1271 iosize -= resid;
1272 }
1273 if (obj && obj->paging_in_progress == 0 &&
1274 (obj->flags & OBJ_PIPWNT)) {
1275 obj->flags &= ~OBJ_PIPWNT;
1276 wakeup(obj);
1277 }
1278 }
1279 /*
1280	 * For asynchronous completions, release the buffer now.  brelse()
1281	 * checks for B_WANTED and does the wakeup there if necessary, so
1282	 * there is no need to do a wakeup here in the async case.
1283 */
1284
1285 if (bp->b_flags & B_ASYNC) {
1286 brelse(bp);
1287 } else {
1288 bp->b_flags &= ~B_WANTED;
1289 wakeup(bp);
1290 }
1291 splx(s);
1292}
1293
1294int
1295count_lock_queue()
1296{
1297 int count;
1298 struct buf *bp;
1299
1300 count = 0;
1301 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1302 bp != NULL;
1303 bp = bp->b_freelist.tqe_next)
1304 count++;
1305 return (count);
1306}
1307
1308int vfs_update_interval = 30;
1309
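/*
 * vfs_update() is the body of the filesystem update daemon: it sleeps for
 * vfs_update_interval seconds (or until awakened through
 * vfs_update_wakeup) and then calls sync() to push out delayed writes,
 * forever.
 */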
1310void
1311vfs_update()
1312{
1313 (void) spl0();
1314 while (1) {
1315 tsleep(&vfs_update_wakeup, PRIBIO, "update",
1316 hz * vfs_update_interval);
1317 vfs_update_wakeup = 0;
1318 sync(curproc, NULL, NULL);
1319 }
1320}
1321
1322/*
1323 * This routine is called in lieu of biodone() in the case of
1324 * incomplete I/O.  This keeps the busy status of the pages
1325 * consistent.
1326 */
1327void
1328vfs_unbusy_pages(struct buf * bp)
1329{
1330 int i;
1331
1332 if (bp->b_flags & B_VMIO) {
1333 struct vnode *vp = bp->b_vp;
1334 vm_object_t obj = vp->v_object;
1335 vm_offset_t foff;
1336
1337 foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
1338
1339 for (i = 0; i < bp->b_npages; i++) {
1340 vm_page_t m = bp->b_pages[i];
1341
1342 if (m == bogus_page) {
1343 m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
1344 if (!m) {
1345 panic("vfs_unbusy_pages: page missing\n");
1346 }
1347 bp->b_pages[i] = m;
1348 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1349 }
1350 --obj->paging_in_progress;
1351 --m->busy;
1352 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1353 m->flags &= ~PG_WANTED;
1354 wakeup(m);
1355 }
1356 }
1357 if (obj->paging_in_progress == 0 &&
1358 (obj->flags & OBJ_PIPWNT)) {
1359 obj->flags &= ~OBJ_PIPWNT;
1360 wakeup(obj);
1361 }
1362 }
1363}
1364
1365/*
1366 * This routine is called before a device strategy routine.
1367 * It is used to tell the VM system that paging I/O is in
1368 * progress, and to treat the pages associated with the buffer
1369 * almost as if they were PG_BUSY.  The object's paging_in_progress
1370 * count is also maintained so that the object does not become
1371 * inconsistent.
1372 */
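/*
 * Illustrative ordering (a sketch, not a contract): vfs_busy_pages() is
 * called just before the buffer is handed to the driver via
 * VOP_STRATEGY(); clear_modify is nonzero when the transfer will clean
 * the pages (a write-out) and zero for reads.  The accounting is undone
 * by biodone() when the I/O finishes, or by vfs_unbusy_pages() if it
 * never starts.
 */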
1373void
1374vfs_busy_pages(struct buf * bp, int clear_modify)
1375{
1376 int i;
1377
1378 if (bp->b_flags & B_VMIO) {
1379 vm_object_t obj = bp->b_vp->v_object;
1380 vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1381 int iocount = bp->b_bufsize;
1382
1383 vfs_setdirty(bp);
1384 for (i = 0; i < bp->b_npages; i++) {
1385 vm_page_t m = bp->b_pages[i];
1386 int resid = (m->offset + PAGE_SIZE) - foff;
1387
1388 if (resid > iocount)
1389 resid = iocount;
1390 if ((bp->b_flags & B_CLUSTER) == 0) {
1391 obj->paging_in_progress++;
1392 m->busy++;
1393 }
1394 if (clear_modify) {
1395 vm_page_protect(m, VM_PROT_READ);
1396 vm_page_set_validclean(m,
1397 foff & (PAGE_SIZE-1), resid);
1398 } else if (bp->b_bcount >= PAGE_SIZE) {
1399 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1400 bp->b_pages[i] = bogus_page;
1401 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1402 }
1403 }
1404 foff += resid;
1405 iocount -= resid;
1406 }
1407 }
1408}
1409
1410/*
1411 * Tell the VM system that the pages associated with this buffer
1412 * are clean. This is used for delayed writes where the data is
1413 * going to go to disk eventually without additional VM intervention.
1414 */
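/*
 * Illustrative only -- a delayed-write path would typically mark the
 * buffer B_DELWRI, call vfs_clean_pages(bp) so the pageout daemon does
 * not consider the pages dirty, and then release the buffer with
 * brelse().
 */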
1415void
1416vfs_clean_pages(struct buf * bp)
1417{
1418 int i;
1419
1420 if (bp->b_flags & B_VMIO) {
1421 vm_offset_t foff =
1422 bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1423 int iocount = bp->b_bufsize;
1424
1425 for (i = 0; i < bp->b_npages; i++) {
1426 vm_page_t m = bp->b_pages[i];
1427 int resid = (m->offset + PAGE_SIZE) - foff;
1428
1429 if (resid > iocount)
1430 resid = iocount;
1431 if (resid > 0) {
1432 vm_page_set_validclean(m,
1433 foff & (PAGE_SIZE-1), resid);
1434 }
1435 foff += resid;
1436 iocount -= resid;
1437 }
1438 }
1439}
1440
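/*
 * Zero the invalid portions of a buffer.  For a VMIO buffer, only the
 * pieces not already marked valid in the underlying pages are cleared;
 * a non-VMIO buffer is simply cleared with clrbuf().
 */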
1441void
1442vfs_bio_clrbuf(struct buf *bp) {
1443 int i;
1444 if( bp->b_flags & B_VMIO) {
1445 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1446 int j;
1447 if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
1448 for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) {
1449 bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
1450 }
1451 }
1452 bp->b_resid = 0;
1453 return;
1454 }
1455 for(i=0;i<bp->b_npages;i++) {
1456 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1457 continue;
1458 if( bp->b_pages[i]->valid == 0) {
1459 bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
1460 } else {
1461 int j;
1462 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1463 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1464 bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
1465 }
1466 }
1467 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1468 }
1469 bp->b_resid = 0;
1470 } else {
1471 clrbuf(bp);
1472 }
1473}
1474
1475/*
1476 * vm_hold_load_pages and vm_hold_free_pages get and release pages for
1477 * a buffer's address space.  The pages are anonymous and are
1478 * not associated with a file object.
1479 */
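/*
 * The b_pages[] index computed below follows from the fixed KVA layout:
 * each buffer header owns a MAXBSIZE-sized window of kernel address space
 * (b_data = buffers_kva + (bp - buf) * MAXBSIZE), so the page slot for a
 * kernel address pg within that window is (pg - b_data) / PAGE_SIZE.
 */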
1480void
1481vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1482{
1483 vm_offset_t pg;
1484 vm_page_t p;
1485 vm_offset_t from = round_page(froma);
1486 vm_offset_t to = round_page(toa);
1487
1488 for (pg = from; pg < to; pg += PAGE_SIZE) {
1489
1490tryagain:
1491
1492 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1493 VM_ALLOC_NORMAL);
1494 if (!p) {
1495 VM_WAIT;
1496 goto tryagain;
1497 }
1498 vm_page_wire(p);
1499 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1500 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1501 PAGE_WAKEUP(p);
1502 bp->b_npages++;
1503 }
1504}
1505
1506void
1507vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1508{
1509 vm_offset_t pg;
1510 vm_page_t p;
1511 vm_offset_t from = round_page(froma);
1512 vm_offset_t to = round_page(toa);
1513
1514 for (pg = from; pg < to; pg += PAGE_SIZE) {
1515 p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1516 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1517 pmap_kremove(pg);
1518 vm_page_free(p);
1519 --bp->b_npages;
1520 }
1521}