vfs_bio.c (10541) → vfs_bio.c (10551)
Diff between revisions 1.60 and 1.61 (see the $Id lines below). Where a line changed, the deleted line is shown immediately above its added replacement. The only substantive change is the VOP_BMAP() call in bdwrite(), which gains an extra trailing NULL argument.
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 * John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD. Other use
17 * is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $Id: vfs_bio.c,v 1.60 1995/08/28 09:18:53 julian Exp $
21 * $Id: vfs_bio.c,v 1.61 1995/09/03 19:56:14 dyson Exp $
22 */
23
24/*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme. Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author: John S. Dyson
31 * Significant help during the development and debugging phases
32 * had been provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/proc.h>
40#include <sys/vnode.h>
41#include <vm/vm.h>
42#include <vm/vm_kern.h>
43#include <vm/vm_pageout.h>
44#include <vm/vm_page.h>
45#include <vm/vm_object.h>
46#include <sys/buf.h>
47#include <sys/mount.h>
48#include <sys/malloc.h>
49#include <sys/resourcevar.h>
50#include <sys/proc.h>
51
52#include <miscfs/specfs/specdev.h>
53
54/*
55 * System initialization
56 */
57
58static void vfs_update __P((void));
59struct proc *updateproc;
60
61static struct kproc_desc up_kp = {
62 "update",
63 vfs_update,
64 &updateproc
65};
66SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, (caddr_t)&up_kp)
67
68
69struct buf *buf; /* buffer header pool */
70struct swqueue bswlist;
71
72void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
73void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
74void vfs_clean_pages(struct buf * bp);
75static void vfs_setdirty(struct buf *bp);
76static __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno);
77
78int needsbuffer;
79
80/*
81 * Internal update daemon, process 3
82 * The variable vfs_update_wakeup allows for internal syncs.
83 */
84int vfs_update_wakeup;
85
86
87/*
88 * buffers base kva
89 */
90caddr_t buffers_kva;
91
92/*
93 * bogus page -- for I/O to/from partially complete buffers
94 * this is a temporary solution to the problem, but it is not
95 * really that bad. it would be better to split the buffer
96 * for input in the case of buffers partially already in memory,
97 * but the code is intricate enough already.
98 */
99vm_page_t bogus_page;
100vm_offset_t bogus_offset;
101
102int bufspace, maxbufspace;
103
104/*
105 * advisory minimum for size of LRU queue or VMIO queue
106 */
107int minbuf;
108
109struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
110struct bqueues bufqueues[BUFFER_QUEUES];
111
112/*
113 * Initialize buffer headers and related structures.
114 */
115void
116bufinit()
117{
118 struct buf *bp;
119 int i;
120
121 TAILQ_INIT(&bswlist);
122 LIST_INIT(&invalhash);
123
124 /* first, make a null hash table */
125 for (i = 0; i < BUFHSZ; i++)
126 LIST_INIT(&bufhashtbl[i]);
127
128 /* next, make a null set of free lists */
129 for (i = 0; i < BUFFER_QUEUES; i++)
130 TAILQ_INIT(&bufqueues[i]);
131
132 buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
133 /* finally, initialize each buffer header and stick on empty q */
134 for (i = 0; i < nbuf; i++) {
135 bp = &buf[i];
136 bzero(bp, sizeof *bp);
137 bp->b_flags = B_INVAL; /* we're just an empty header */
138 bp->b_dev = NODEV;
139 bp->b_rcred = NOCRED;
140 bp->b_wcred = NOCRED;
141 bp->b_qindex = QUEUE_EMPTY;
142 bp->b_vnbufs.le_next = NOLIST;
143 bp->b_data = buffers_kva + i * MAXBSIZE;
144 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
145 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
146 }
147/*
148 * maxbufspace is currently calculated to support all filesystem blocks
149 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
150 * cache is still the same as it would be for 8K filesystems. This
151 * keeps the size of the buffer cache "in check" for big block filesystems.
152 */
153 minbuf = nbuf / 3;
154 maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
155
156 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
157 bogus_page = vm_page_alloc(kernel_object,
158 bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);
159
160}
161
162/*
163 * remove the buffer from the appropriate free list
164 */
165void
166bremfree(struct buf * bp)
167{
168 int s = splbio();
169
170 if (bp->b_qindex != QUEUE_NONE) {
171 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
172 bp->b_qindex = QUEUE_NONE;
173 } else {
174 panic("bremfree: removing a buffer when not on a queue");
175 }
176 splx(s);
177}
178
179/*
180 * Get a buffer with the specified data. Look in the cache first.
181 */
182int
183bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
184 struct buf ** bpp)
185{
186 struct buf *bp;
187
188 bp = getblk(vp, blkno, size, 0, 0);
189 *bpp = bp;
190
191 /* if not found in cache, do some I/O */
192 if ((bp->b_flags & B_CACHE) == 0) {
193 if (curproc != NULL)
194 curproc->p_stats->p_ru.ru_inblock++;
195 bp->b_flags |= B_READ;
196 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
197 if (bp->b_rcred == NOCRED) {
198 if (cred != NOCRED)
199 crhold(cred);
200 bp->b_rcred = cred;
201 }
202 vfs_busy_pages(bp, 0);
203 VOP_STRATEGY(bp);
204 return (biowait(bp));
205 }
206 return (0);
207}
208
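/*
 * Illustrative sketch (not from the original file): a typical filesystem
 * caller of bread().  The buffer must be brelse()d by the caller even
 * when the read fails; "lbn" and "size" here are placeholders for the
 * caller's logical block number and block size.
 *
 *	if ((error = bread(vp, lbn, size, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(examine the data at bp->b_data)
 *	brelse(bp);
 */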
209/*
210 * Operates like bread, but also starts asynchronous I/O on
211 * read-ahead blocks.
212 */
213int
214breadn(struct vnode * vp, daddr_t blkno, int size,
215 daddr_t * rablkno, int *rabsize,
216 int cnt, struct ucred * cred, struct buf ** bpp)
217{
218 struct buf *bp, *rabp;
219 int i;
220 int rv = 0, readwait = 0;
221
222 *bpp = bp = getblk(vp, blkno, size, 0, 0);
223
224 /* if not found in cache, do some I/O */
225 if ((bp->b_flags & B_CACHE) == 0) {
226 if (curproc != NULL)
227 curproc->p_stats->p_ru.ru_inblock++;
228 bp->b_flags |= B_READ;
229 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
230 if (bp->b_rcred == NOCRED) {
231 if (cred != NOCRED)
232 crhold(cred);
233 bp->b_rcred = cred;
234 }
235 vfs_busy_pages(bp, 0);
236 VOP_STRATEGY(bp);
237 ++readwait;
238 }
239 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
240 if (inmem(vp, *rablkno))
241 continue;
242 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
243
244 if ((rabp->b_flags & B_CACHE) == 0) {
245 if (curproc != NULL)
246 curproc->p_stats->p_ru.ru_inblock++;
247 rabp->b_flags |= B_READ | B_ASYNC;
248 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
249 if (rabp->b_rcred == NOCRED) {
250 if (cred != NOCRED)
251 crhold(cred);
252 rabp->b_rcred = cred;
253 }
254 vfs_busy_pages(rabp, 0);
255 VOP_STRATEGY(rabp);
256 } else {
257 brelse(rabp);
258 }
259 }
260
261 if (readwait) {
262 rv = biowait(bp);
263 }
264 return (rv);
265}
266
267/*
268 * Write, release buffer on completion. (Done by iodone
269 * if async.)
270 */
271int
272bwrite(struct buf * bp)
273{
274 int oldflags = bp->b_flags;
275
276 if (bp->b_flags & B_INVAL) {
277 brelse(bp);
278 return (0);
279 }
280 if (!(bp->b_flags & B_BUSY))
281 panic("bwrite: buffer is not busy???");
282
283 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
284 bp->b_flags |= B_WRITEINPROG;
285
286 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
287 reassignbuf(bp, bp->b_vp);
288 }
289
290 bp->b_vp->v_numoutput++;
291 vfs_busy_pages(bp, 1);
292 if (curproc != NULL)
293 curproc->p_stats->p_ru.ru_oublock++;
294 VOP_STRATEGY(bp);
295
296 if ((oldflags & B_ASYNC) == 0) {
297 int rtval = biowait(bp);
298
299 if (oldflags & B_DELWRI) {
300 reassignbuf(bp, bp->b_vp);
301 }
302 brelse(bp);
303 return (rtval);
304 }
305 return (0);
306}
307
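/*
 * VOP_BWRITE entry point: hand the buffer straight to bwrite() above.
 */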
308int
309vn_bwrite(ap)
310 struct vop_bwrite_args *ap;
311{
312 return (bwrite(ap->a_bp));
313}
314
315/*
316 * Delayed write. (Buffer is marked dirty).
317 */
318void
319bdwrite(struct buf * bp)
320{
321
322 if ((bp->b_flags & B_BUSY) == 0) {
323 panic("bdwrite: buffer is not busy");
324 }
325 if (bp->b_flags & B_INVAL) {
326 brelse(bp);
327 return;
328 }
329 if (bp->b_flags & B_TAPE) {
330 bawrite(bp);
331 return;
332 }
333 bp->b_flags &= ~(B_READ|B_RELBUF);
334 if ((bp->b_flags & B_DELWRI) == 0) {
335 bp->b_flags |= B_DONE | B_DELWRI;
336 reassignbuf(bp, bp->b_vp);
337 }
338
339 /*
340 * This bmap keeps the system from needing to do the bmap later,
341 * perhaps when the system is attempting to do a sync. Since it
 342 * is likely that the indirect block -- or whatever other data structure
343 * that the filesystem needs is still in memory now, it is a good
344 * thing to do this. Note also, that if the pageout daemon is
345 * requesting a sync -- there might not be enough memory to do
346 * the bmap then... So, this is important to do.
347 */
348 if( bp->b_lblkno == bp->b_blkno) {
349 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
349 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
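	/*
	 * (The added line above passes one more trailing NULL than the
	 * deleted one, matching a VOP_BMAP() prototype that now takes an
	 * additional output pointer this caller does not need.)
	 */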
350 }
351
352 /*
353 * Set the *dirty* buffer range based upon the VM system dirty pages.
354 */
355 vfs_setdirty(bp);
356
357 /*
358 * We need to do this here to satisfy the vnode_pager and the
359 * pageout daemon, so that it thinks that the pages have been
360 * "cleaned". Note that since the pages are in a delayed write
361 * buffer -- the VFS layer "will" see that the pages get written
362 * out on the next sync, or perhaps the cluster will be completed.
363 */
364 vfs_clean_pages(bp);
365 brelse(bp);
366 return;
367}
368
369/*
370 * Asynchronous write.
371 * Start output on a buffer, but do not wait for it to complete.
372 * The buffer is released when the output completes.
373 */
374void
375bawrite(struct buf * bp)
376{
377 bp->b_flags |= B_ASYNC;
378 (void) VOP_BWRITE(bp);
379}
380
381/*
382 * Release a buffer.
383 */
384void
385brelse(struct buf * bp)
386{
387 int s;
388
389 if (bp->b_flags & B_CLUSTER) {
390 relpbuf(bp);
391 return;
392 }
393 /* anyone need a "free" block? */
394 s = splbio();
395
396 if (needsbuffer) {
397 needsbuffer = 0;
398 wakeup(&needsbuffer);
399 }
400
401 /* anyone need this block? */
402 if (bp->b_flags & B_WANTED) {
403 bp->b_flags &= ~(B_WANTED | B_AGE);
404 wakeup(bp);
405 } else if (bp->b_flags & B_VMIO) {
406 bp->b_flags &= ~B_WANTED;
407 wakeup(bp);
408 }
409 if (bp->b_flags & B_LOCKED)
410 bp->b_flags &= ~B_ERROR;
411
412 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
413 (bp->b_bufsize <= 0)) {
414 bp->b_flags |= B_INVAL;
415 bp->b_flags &= ~(B_DELWRI | B_CACHE);
416 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
417 brelvp(bp);
418 }
419
420 /*
421 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
422 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
423 * but the VM object is kept around. The B_NOCACHE flag is used to
424 * invalidate the pages in the VM object.
425 */
426 if (bp->b_flags & B_VMIO) {
427 vm_offset_t foff;
428 vm_object_t obj;
429 int i, resid;
430 vm_page_t m;
431 int iototal = bp->b_bufsize;
432
433 foff = 0;
434 obj = 0;
435 if (bp->b_npages) {
436 if (bp->b_vp && bp->b_vp->v_mount) {
437 foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
438 } else {
439 /*
440 * vnode pointer has been ripped away --
441 * probably file gone...
442 */
443 foff = bp->b_pages[0]->offset;
444 }
445 }
446 for (i = 0; i < bp->b_npages; i++) {
447 m = bp->b_pages[i];
448 if (m == bogus_page) {
449 m = vm_page_lookup(obj, foff);
450 if (!m) {
451 panic("brelse: page missing\n");
452 }
453 bp->b_pages[i] = m;
454 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
455 }
456 resid = (m->offset + PAGE_SIZE) - foff;
457 if (resid > iototal)
458 resid = iototal;
459 if (resid > 0) {
460 /*
461 * Don't invalidate the page if the local machine has already
462 * modified it. This is the lesser of two evils, and should
463 * be fixed.
464 */
465 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
466 vm_page_test_dirty(m);
467 if (m->dirty == 0) {
468 vm_page_set_invalid(m, foff, resid);
469 if (m->valid == 0)
470 vm_page_protect(m, VM_PROT_NONE);
471 }
472 }
473 }
474 foff += resid;
475 iototal -= resid;
476 }
477
478 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
479 for(i=0;i<bp->b_npages;i++) {
480 m = bp->b_pages[i];
481 --m->bmapped;
482 if (m->bmapped == 0) {
483 if (m->flags & PG_WANTED) {
484 wakeup(m);
485 m->flags &= ~PG_WANTED;
486 }
487 vm_page_test_dirty(m);
488 if ((m->dirty & m->valid) == 0 &&
489 (m->flags & PG_REFERENCED) == 0 &&
490 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
491 vm_page_cache(m);
492 } else if ((m->flags & PG_ACTIVE) == 0) {
493 vm_page_activate(m);
494 m->act_count = 0;
495 }
496 }
497 }
498 bufspace -= bp->b_bufsize;
499 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
500 bp->b_npages = 0;
501 bp->b_bufsize = 0;
502 bp->b_flags &= ~B_VMIO;
503 if (bp->b_vp)
504 brelvp(bp);
505 }
506 }
507 if (bp->b_qindex != QUEUE_NONE)
508 panic("brelse: free buffer onto another queue???");
509
510 /* enqueue */
511 /* buffers with no memory */
512 if (bp->b_bufsize == 0) {
513 bp->b_qindex = QUEUE_EMPTY;
514 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
515 LIST_REMOVE(bp, b_hash);
516 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
517 bp->b_dev = NODEV;
518 /* buffers with junk contents */
519 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
520 bp->b_qindex = QUEUE_AGE;
521 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
522 LIST_REMOVE(bp, b_hash);
523 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
524 bp->b_dev = NODEV;
525 /* buffers that are locked */
526 } else if (bp->b_flags & B_LOCKED) {
527 bp->b_qindex = QUEUE_LOCKED;
528 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
529 /* buffers with stale but valid contents */
530 } else if (bp->b_flags & B_AGE) {
531 bp->b_qindex = QUEUE_AGE;
532 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
 533 /* buffers with valid and quite potentially reusable contents */
534 } else {
535 bp->b_qindex = QUEUE_LRU;
536 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
537 }
538
539 /* unlock */
540 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
541 splx(s);
542}
543
544/*
545 * Check to see if a block is currently memory resident.
546 */
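/*
 * (Unlike incore(), buffers marked B_INVAL are not filtered out here,
 * and the caller is expected to already be at splbio.)
 */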
547static __inline struct buf *
548gbincore(struct vnode * vp, daddr_t blkno)
549{
550 struct buf *bp;
551 struct bufhashhdr *bh;
552
553 bh = BUFHASH(vp, blkno);
554 bp = bh->lh_first;
555
556 /* Search hash chain */
557 while (bp != NULL) {
558 /* hit */
559 if (bp->b_vp == vp && bp->b_lblkno == blkno) {
560 break;
561 }
562 bp = bp->b_hash.le_next;
563 }
564 return (bp);
565}
566
567/*
568 * this routine implements clustered async writes for
569 * clearing out B_DELWRI buffers... This is much better
570 * than the old way of writing only one buffer at a time.
571 */
572void
573vfs_bio_awrite(struct buf * bp)
574{
575 int i;
576 daddr_t lblkno = bp->b_lblkno;
577 struct vnode *vp = bp->b_vp;
578 int s;
579 int ncl;
580 struct buf *bpa;
581
582 s = splbio();
583 if (vp->v_mount && (vp->v_flag & VVMIO) &&
584 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
585 int size = vp->v_mount->mnt_stat.f_iosize;
586 int maxcl = MAXPHYS / size;
587
588 for (i = 1; i < maxcl; i++) {
589 if ((bpa = gbincore(vp, lblkno + i)) &&
590 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
591 (B_DELWRI | B_CLUSTEROK)) &&
592 (bpa->b_bufsize == size)) {
593 if ((bpa->b_blkno == bpa->b_lblkno) ||
594 (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
595 break;
596 } else {
597 break;
598 }
599 }
600 ncl = i;
601 /*
602 * this is a possible cluster write
603 */
604 if (ncl != 1) {
605 bremfree(bp);
606 cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
607 splx(s);
608 return;
609 }
610 }
611 /*
612 * default (old) behavior, writing out only one block
613 */
614 bremfree(bp);
615 bp->b_flags |= B_BUSY | B_ASYNC;
616 (void) VOP_BWRITE(bp);
617 splx(s);
618}
619
620
621/*
622 * Find a buffer header which is available for use.
623 */
624static struct buf *
625getnewbuf(int slpflag, int slptimeo, int doingvmio)
626{
627 struct buf *bp;
628 int s;
629 int firstbp = 1;
630
631 s = splbio();
632start:
633 if (bufspace >= maxbufspace)
634 goto trytofreespace;
635
636 /* can we constitute a new buffer? */
637 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
638 if (bp->b_qindex != QUEUE_EMPTY)
639 panic("getnewbuf: inconsistent EMPTY queue");
640 bremfree(bp);
641 goto fillbuf;
642 }
643trytofreespace:
644 /*
645 * We keep the file I/O from hogging metadata I/O
646 * This is desirable because file data is cached in the
647 * VM/Buffer cache even if a buffer is freed.
648 */
649 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
650 if (bp->b_qindex != QUEUE_AGE)
651 panic("getnewbuf: inconsistent AGE queue");
652 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
653 if (bp->b_qindex != QUEUE_LRU)
654 panic("getnewbuf: inconsistent LRU queue");
655 }
656 if (!bp) {
657 /* wait for a free buffer of any kind */
658 needsbuffer = 1;
659 tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
660 splx(s);
661 return (0);
662 }
663
664 /* if we are a delayed write, convert to an async write */
665 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
666 vfs_bio_awrite(bp);
667 if (!slpflag && !slptimeo) {
668 splx(s);
669 return (0);
670 }
671 goto start;
672 }
673
674 if (bp->b_flags & B_WANTED) {
675 bp->b_flags &= ~B_WANTED;
676 wakeup(bp);
677 }
678 bremfree(bp);
679
680 if (bp->b_flags & B_VMIO) {
681 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
682 brelse(bp);
683 bremfree(bp);
684 }
685
686 if (bp->b_vp)
687 brelvp(bp);
688
689 /* we are not free, nor do we contain interesting data */
690 if (bp->b_rcred != NOCRED)
691 crfree(bp->b_rcred);
692 if (bp->b_wcred != NOCRED)
693 crfree(bp->b_wcred);
694fillbuf:
695 bp->b_flags |= B_BUSY;
696 LIST_REMOVE(bp, b_hash);
697 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
698 splx(s);
699 if (bp->b_bufsize) {
700 allocbuf(bp, 0);
701 }
702 bp->b_flags = B_BUSY;
703 bp->b_dev = NODEV;
704 bp->b_vp = NULL;
705 bp->b_blkno = bp->b_lblkno = 0;
706 bp->b_iodone = 0;
707 bp->b_error = 0;
708 bp->b_resid = 0;
709 bp->b_bcount = 0;
710 bp->b_npages = 0;
711 bp->b_wcred = bp->b_rcred = NOCRED;
712 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
713 bp->b_dirtyoff = bp->b_dirtyend = 0;
714 bp->b_validoff = bp->b_validend = 0;
715 if (bufspace >= maxbufspace) {
716 s = splbio();
717 bp->b_flags |= B_INVAL;
718 brelse(bp);
719 goto trytofreespace;
720 }
721 return (bp);
722}
723
724/*
725 * Check to see if a block is currently memory resident.
726 */
727struct buf *
728incore(struct vnode * vp, daddr_t blkno)
729{
730 struct buf *bp;
731 struct bufhashhdr *bh;
732
733 int s = splbio();
734
735 bh = BUFHASH(vp, blkno);
736 bp = bh->lh_first;
737
738 /* Search hash chain */
739 while (bp != NULL) {
740 /* hit */
741 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
742 (bp->b_flags & B_INVAL) == 0) {
743 break;
744 }
745 bp = bp->b_hash.le_next;
746 }
747 splx(s);
748 return (bp);
749}
750
751/*
752 * Returns true if no I/O is needed to access the
753 * associated VM object. This is like incore except
754 * it also hunts around in the VM system for the data.
755 */
756
757int
758inmem(struct vnode * vp, daddr_t blkno)
759{
760 vm_object_t obj;
761 vm_offset_t off, toff, tinc;
762 vm_page_t m;
763
764 if (incore(vp, blkno))
765 return 1;
766 if (vp->v_mount == NULL)
767 return 0;
768 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
769 return 0;
770
771 obj = vp->v_object;
772 tinc = PAGE_SIZE;
773 if (tinc > vp->v_mount->mnt_stat.f_iosize)
774 tinc = vp->v_mount->mnt_stat.f_iosize;
775 off = blkno * vp->v_mount->mnt_stat.f_iosize;
776
777 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
778 int mask;
779
780 m = vm_page_lookup(obj, trunc_page(toff + off));
781 if (!m)
782 return 0;
783 if (vm_page_is_valid(m, toff + off, tinc) == 0)
784 return 0;
785 }
786 return 1;
787}
788
789/*
790 * now we set the dirty range for the buffer --
791 * for NFS -- if the file is mapped and pages have
792 * been written to, let it know. We want the
793 * entire range of the buffer to be marked dirty if
 794 * any of the pages have been written to for consistency
795 * with the b_validoff, b_validend set in the nfs write
796 * code, and used by the nfs read code.
797 */
798static void
799vfs_setdirty(struct buf *bp) {
800 int i;
801 vm_object_t object;
802 vm_offset_t boffset, offset;
803 /*
804 * We qualify the scan for modified pages on whether the
805 * object has been flushed yet. The OBJ_WRITEABLE flag
806 * is not cleared simply by protecting pages off.
807 */
808 if ((bp->b_flags & B_VMIO) &&
809 ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
810 /*
811 * test the pages to see if they have been modified directly
812 * by users through the VM system.
813 */
814 for (i = 0; i < bp->b_npages; i++)
815 vm_page_test_dirty(bp->b_pages[i]);
816
817 /*
818 * scan forwards for the first page modified
819 */
820 for (i = 0; i < bp->b_npages; i++) {
821 if (bp->b_pages[i]->dirty) {
822 break;
823 }
824 }
825 boffset = i * PAGE_SIZE;
826 if (boffset < bp->b_dirtyoff) {
827 bp->b_dirtyoff = boffset;
828 }
829
830 /*
831 * scan backwards for the last page modified
832 */
833 for (i = bp->b_npages - 1; i >= 0; --i) {
834 if (bp->b_pages[i]->dirty) {
835 break;
836 }
837 }
838 boffset = (i + 1) * PAGE_SIZE;
839 offset = boffset + bp->b_pages[0]->offset;
840 if (offset >= object->size) {
841 boffset = object->size - bp->b_pages[0]->offset;
842 }
843 if (bp->b_dirtyend < boffset) {
844 bp->b_dirtyend = boffset;
845 }
846 }
847}
848
849/*
850 * Get a block given a specified block and offset into a file/device.
851 */
852struct buf *
853getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
854{
855 struct buf *bp;
856 int s;
857 struct bufhashhdr *bh;
858 vm_offset_t off;
859 int nleft;
860
861 s = splbio();
862loop:
863 if (bp = gbincore(vp, blkno)) {
864 if (bp->b_flags & (B_BUSY|B_INVAL)) {
865 bp->b_flags |= B_WANTED;
866 if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
867 goto loop;
868
869 splx(s);
870 return (struct buf *) NULL;
871 }
872 bp->b_flags |= B_BUSY | B_CACHE;
873 bremfree(bp);
874 /*
 875 * check for size inconsistencies
876 */
877 if (bp->b_bcount != size) {
878 allocbuf(bp, size);
879 }
880 splx(s);
881 return (bp);
882 } else {
883 vm_object_t obj;
884 int doingvmio;
885
886 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
887 doingvmio = 1;
888 } else {
889 doingvmio = 0;
890 }
891 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
892 if (slpflag || slptimeo)
893 return NULL;
894 goto loop;
895 }
896
897 /*
898 * This code is used to make sure that a buffer is not
899 * created while the getnewbuf routine is blocked.
900 * Normally the vnode is locked so this isn't a problem.
901 * VBLK type I/O requests, however, don't lock the vnode.
902 */
903 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
904 bp->b_flags |= B_INVAL;
905 brelse(bp);
906 goto loop;
907 }
908
909 /*
910 * Insert the buffer into the hash, so that it can
911 * be found by incore.
912 */
913 bp->b_blkno = bp->b_lblkno = blkno;
914 bgetvp(vp, bp);
915 LIST_REMOVE(bp, b_hash);
916 bh = BUFHASH(vp, blkno);
917 LIST_INSERT_HEAD(bh, bp, b_hash);
918
919 if (doingvmio) {
920 bp->b_flags |= (B_VMIO | B_CACHE);
921#if defined(VFS_BIO_DEBUG)
922 if (vp->v_type != VREG)
923 printf("getblk: vmioing file type %d???\n", vp->v_type);
924#endif
925 } else {
926 bp->b_flags &= ~B_VMIO;
927 }
928 splx(s);
929
930 allocbuf(bp, size);
931 return (bp);
932 }
933}
934
935/*
936 * Get an empty, disassociated buffer of given size.
937 */
938struct buf *
939geteblk(int size)
940{
941 struct buf *bp;
942
943 while ((bp = getnewbuf(0, 0, 0)) == 0);
944 allocbuf(bp, size);
945 bp->b_flags |= B_INVAL;
946 return (bp);
947}
948
949/*
950 * This code constitutes the buffer memory from either anonymous system
951 * memory (in the case of non-VMIO operations) or from an associated
952 * VM object (in the case of VMIO operations).
953 *
954 * Note that this code is tricky, and has many complications to resolve
 955 * deadlock or inconsistent data situations. Tread lightly!!!
956 *
957 * Modify the length of a buffer's underlying buffer storage without
958 * destroying information (unless, of course the buffer is shrinking).
959 */
960int
961allocbuf(struct buf * bp, int size)
962{
963
964 int s;
965 int newbsize, mbsize;
966 int i;
967
968 if (!(bp->b_flags & B_BUSY))
969 panic("allocbuf: buffer not busy");
970
971 if ((bp->b_flags & B_VMIO) == 0) {
972 /*
973 * Just get anonymous memory from the kernel
974 */
975 mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
976 newbsize = round_page(size);
977
978 if (newbsize < bp->b_bufsize) {
979 vm_hold_free_pages(
980 bp,
981 (vm_offset_t) bp->b_data + newbsize,
982 (vm_offset_t) bp->b_data + bp->b_bufsize);
983 } else if (newbsize > bp->b_bufsize) {
984 vm_hold_load_pages(
985 bp,
986 (vm_offset_t) bp->b_data + bp->b_bufsize,
987 (vm_offset_t) bp->b_data + newbsize);
988 }
989 } else {
990 vm_page_t m;
991 int desiredpages;
992
993 newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
994 desiredpages = round_page(newbsize) / PAGE_SIZE;
995
996 if (newbsize < bp->b_bufsize) {
997 if (desiredpages < bp->b_npages) {
998 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
999 desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1000 for (i = desiredpages; i < bp->b_npages; i++) {
1001 m = bp->b_pages[i];
1002 s = splhigh();
1003 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1004 m->flags |= PG_WANTED;
1005 tsleep(m, PVM, "biodep", 0);
1006 }
1007 splx(s);
1008
1009 if (m->bmapped == 0) {
1010 printf("allocbuf: bmapped is zero for page %d\n", i);
1011 panic("allocbuf: error");
1012 }
1013 --m->bmapped;
1014 if (m->bmapped == 0) {
1015 vm_page_protect(m, VM_PROT_NONE);
1016 vm_page_free(m);
1017 }
1018 bp->b_pages[i] = NULL;
1019 }
1020 bp->b_npages = desiredpages;
1021 }
1022 } else if (newbsize > bp->b_bufsize) {
1023 vm_object_t obj;
1024 vm_offset_t tinc, off, toff, objoff;
1025 int pageindex, curbpnpages;
1026 struct vnode *vp;
1027 int bsize;
1028
1029 vp = bp->b_vp;
1030 bsize = vp->v_mount->mnt_stat.f_iosize;
1031
1032 if (bp->b_npages < desiredpages) {
1033 obj = vp->v_object;
1034 tinc = PAGE_SIZE;
1035 if (tinc > bsize)
1036 tinc = bsize;
1037 off = bp->b_lblkno * bsize;
1038 doretry:
1039 curbpnpages = bp->b_npages;
1040 bp->b_flags |= B_CACHE;
1041 for (toff = 0; toff < newbsize; toff += tinc) {
1042 int mask;
1043 int bytesinpage;
1044
1045 pageindex = toff / PAGE_SIZE;
1046 objoff = trunc_page(toff + off);
1047 if (pageindex < curbpnpages) {
1048 int pb;
1049
1050 m = bp->b_pages[pageindex];
1051 if (m->offset != objoff)
1052 panic("allocbuf: page changed offset??!!!?");
1053 bytesinpage = tinc;
1054 if (tinc > (newbsize - toff))
1055 bytesinpage = newbsize - toff;
1056 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1057 bp->b_flags &= ~B_CACHE;
1058 }
1059 if ((m->flags & PG_ACTIVE) == 0) {
1060 vm_page_activate(m);
1061 m->act_count = 0;
1062 }
1063 continue;
1064 }
1065 m = vm_page_lookup(obj, objoff);
1066 if (!m) {
1067 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1068 if (!m) {
1069 int j;
1070
1071 for (j = bp->b_npages; j < pageindex; j++) {
1072 PAGE_WAKEUP(bp->b_pages[j]);
1073 }
1074 VM_WAIT;
1075 goto doretry;
1076 }
1077 vm_page_activate(m);
1078 m->act_count = 0;
1079 m->valid = 0;
1080 bp->b_flags &= ~B_CACHE;
1081 } else if (m->flags & PG_BUSY) {
1082 int j;
1083
1084 for (j = bp->b_npages; j < pageindex; j++) {
1085 PAGE_WAKEUP(bp->b_pages[j]);
1086 }
1087
1088 s = splbio();
1089 m->flags |= PG_WANTED;
1090 tsleep(m, PRIBIO, "pgtblk", 0);
1091 splx(s);
1092
1093 goto doretry;
1094 } else {
1095 int pb;
1096 if ((curproc != pageproc) &&
1097 (m->flags & PG_CACHE) &&
1098 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1099 pagedaemon_wakeup();
1100 }
1101 bytesinpage = tinc;
1102 if (tinc > (newbsize - toff))
1103 bytesinpage = newbsize - toff;
1104 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1105 bp->b_flags &= ~B_CACHE;
1106 }
1107 if ((m->flags & PG_ACTIVE) == 0) {
1108 vm_page_activate(m);
1109 m->act_count = 0;
1110 }
1111 m->flags |= PG_BUSY;
1112 }
1113 bp->b_pages[pageindex] = m;
1114 curbpnpages = pageindex + 1;
1115 }
1116 for (i = bp->b_npages; i < curbpnpages; i++) {
1117 m = bp->b_pages[i];
1118 m->bmapped++;
1119 PAGE_WAKEUP(m);
1120 }
1121 bp->b_npages = curbpnpages;
1122 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1123 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1124 bp->b_data += off % PAGE_SIZE;
1125 }
1126 }
1127 }
1128 bufspace += (newbsize - bp->b_bufsize);
1129 bp->b_bufsize = newbsize;
1130 bp->b_bcount = size;
1131 return 1;
1132}
1133
1134/*
1135 * Wait for buffer I/O completion, returning error status.
1136 */
1137int
1138biowait(register struct buf * bp)
1139{
1140 int s;
1141
1142 s = splbio();
1143 while ((bp->b_flags & B_DONE) == 0)
1144 tsleep(bp, PRIBIO, "biowait", 0);
1145 splx(s);
1146 if (bp->b_flags & B_EINTR) {
1147 bp->b_flags &= ~B_EINTR;
1148 return (EINTR);
1149 }
1150 if (bp->b_flags & B_ERROR) {
1151 return (bp->b_error ? bp->b_error : EIO);
1152 } else {
1153 return (0);
1154 }
1155}
1156
1157/*
1158 * Finish I/O on a buffer, calling an optional function.
1159 * This is usually called from interrupt level, so process blocking
1160 * is not *a good idea*.
1161 */
1162void
1163biodone(register struct buf * bp)
1164{
1165 int s;
1166
1167 s = splbio();
1168 if (!(bp->b_flags & B_BUSY))
1169 panic("biodone: buffer not busy");
1170
1171 if (bp->b_flags & B_DONE) {
1172 splx(s);
1173 printf("biodone: buffer already done\n");
1174 return;
1175 }
1176 bp->b_flags |= B_DONE;
1177
1178 if ((bp->b_flags & B_READ) == 0) {
1179 struct vnode *vp = bp->b_vp;
1180 vwakeup(bp);
1181 }
1182#ifdef BOUNCE_BUFFERS
1183 if (bp->b_flags & B_BOUNCE)
1184 vm_bounce_free(bp);
1185#endif
1186
1187 /* call optional completion function if requested */
1188 if (bp->b_flags & B_CALL) {
1189 bp->b_flags &= ~B_CALL;
1190 (*bp->b_iodone) (bp);
1191 splx(s);
1192 return;
1193 }
1194 if (bp->b_flags & B_VMIO) {
1195 int i, resid;
1196 vm_offset_t foff;
1197 vm_page_t m;
1198 vm_object_t obj;
1199 int iosize;
1200 struct vnode *vp = bp->b_vp;
1201
1202 foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1203 obj = vp->v_object;
1204 if (!obj) {
1205 panic("biodone: no object");
1206 }
1207#if defined(VFS_BIO_DEBUG)
1208 if (obj->paging_in_progress < bp->b_npages) {
1209 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1210 obj->paging_in_progress, bp->b_npages);
1211 }
1212#endif
1213 iosize = bp->b_bufsize;
1214 for (i = 0; i < bp->b_npages; i++) {
1215 int bogusflag = 0;
1216 m = bp->b_pages[i];
1217 if (m == bogus_page) {
1218 bogusflag = 1;
1219 m = vm_page_lookup(obj, foff);
1220 if (!m) {
1221#if defined(VFS_BIO_DEBUG)
1222 printf("biodone: page disappeared\n");
1223#endif
1224 --obj->paging_in_progress;
1225 continue;
1226 }
1227 bp->b_pages[i] = m;
1228 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1229 }
1230#if defined(VFS_BIO_DEBUG)
1231 if (trunc_page(foff) != m->offset) {
1232 printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1233 }
1234#endif
1235 resid = (m->offset + PAGE_SIZE) - foff;
1236 if (resid > iosize)
1237 resid = iosize;
1238 /*
1239 * In the write case, the valid and clean bits are
1240 * already changed correctly, so we only need to do this
1241 * here in the read case.
1242 */
1243 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1244 vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
1245 }
1246
1247 /*
1248 * when debugging new filesystems or buffer I/O methods, this
1249 * is the most common error that pops up. if you see this, you
1250 * have not set the page busy flag correctly!!!
1251 */
1252 if (m->busy == 0) {
1253 printf("biodone: page busy < 0, "
1254 "off: %ld, foff: %ld, "
1255 "resid: %d, index: %d\n",
1256 m->offset, foff, resid, i);
1257 printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
1258 bp->b_vp->v_mount->mnt_stat.f_iosize,
1259 bp->b_lblkno, bp->b_flags, bp->b_npages);
1260 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1261 m->valid, m->dirty, m->bmapped);
1262 panic("biodone: page busy < 0\n");
1263 }
1264 --m->busy;
1265 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1266 m->flags &= ~PG_WANTED;
1267 wakeup(m);
1268 }
1269 --obj->paging_in_progress;
1270 foff += resid;
1271 iosize -= resid;
1272 }
1273 if (obj && obj->paging_in_progress == 0 &&
1274 (obj->flags & OBJ_PIPWNT)) {
1275 obj->flags &= ~OBJ_PIPWNT;
1276 wakeup(obj);
1277 }
1278 }
1279 /*
1280 * For asynchronous completions, release the buffer now. The brelse
1281 * checks for B_WANTED and will do the wakeup there if necessary - so
1282 * no need to do a wakeup here in the async case.
1283 */
1284
1285 if (bp->b_flags & B_ASYNC) {
1286 brelse(bp);
1287 } else {
1288 bp->b_flags &= ~B_WANTED;
1289 wakeup(bp);
1290 }
1291 splx(s);
1292}
1293
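/*
 * Return the number of buffers currently on the locked queue.
 */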
1294int
1295count_lock_queue()
1296{
1297 int count;
1298 struct buf *bp;
1299
1300 count = 0;
1301 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1302 bp != NULL;
1303 bp = bp->b_freelist.tqe_next)
1304 count++;
1305 return (count);
1306}
1307
1308int vfs_update_interval = 30;
1309
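/*
 * The update daemon loop: sleep for vfs_update_interval seconds (or
 * until someone kicks vfs_update_wakeup), then sync the filesystems.
 */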
1310void
1311vfs_update()
1312{
1313 (void) spl0();
1314 while (1) {
1315 tsleep(&vfs_update_wakeup, PRIBIO, "update",
1316 hz * vfs_update_interval);
1317 vfs_update_wakeup = 0;
1318 sync(curproc, NULL, NULL);
1319 }
1320}
1321
1322/*
1323 * This routine is called in lieu of iodone in the case of
1324 * incomplete I/O. This keeps the busy status for pages
 1325 * consistent.
1326 */
1327void
1328vfs_unbusy_pages(struct buf * bp)
1329{
1330 int i;
1331
1332 if (bp->b_flags & B_VMIO) {
1333 struct vnode *vp = bp->b_vp;
1334 vm_object_t obj = vp->v_object;
1335 vm_offset_t foff;
1336
1337 foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
1338
1339 for (i = 0; i < bp->b_npages; i++) {
1340 vm_page_t m = bp->b_pages[i];
1341
1342 if (m == bogus_page) {
1343 m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
1344 if (!m) {
1345 panic("vfs_unbusy_pages: page missing\n");
1346 }
1347 bp->b_pages[i] = m;
1348 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1349 }
1350 --obj->paging_in_progress;
1351 --m->busy;
1352 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1353 m->flags &= ~PG_WANTED;
1354 wakeup(m);
1355 }
1356 }
1357 if (obj->paging_in_progress == 0 &&
1358 (obj->flags & OBJ_PIPWNT)) {
1359 obj->flags &= ~OBJ_PIPWNT;
1360 wakeup(obj);
1361 }
1362 }
1363}
1364
1365/*
1366 * This routine is called before a device strategy routine.
1367 * It is used to tell the VM system that paging I/O is in
1368 * progress, and treat the pages associated with the buffer
1369 * almost as being PG_BUSY. Also the object paging_in_progress
1370 * flag is handled to make sure that the object doesn't become
 1371 * inconsistent.
1372 */
1373void
1374vfs_busy_pages(struct buf * bp, int clear_modify)
1375{
1376 int i;
1377
1378 if (bp->b_flags & B_VMIO) {
1379 vm_object_t obj = bp->b_vp->v_object;
1380 vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1381 int iocount = bp->b_bufsize;
1382
1383 vfs_setdirty(bp);
1384 for (i = 0; i < bp->b_npages; i++) {
1385 vm_page_t m = bp->b_pages[i];
1386 int resid = (m->offset + PAGE_SIZE) - foff;
1387
1388 if (resid > iocount)
1389 resid = iocount;
1390 if ((bp->b_flags & B_CLUSTER) == 0) {
1391 obj->paging_in_progress++;
1392 m->busy++;
1393 }
1394 if (clear_modify) {
1395 vm_page_protect(m, VM_PROT_READ);
1396 vm_page_set_validclean(m,
1397 foff & (PAGE_SIZE-1), resid);
1398 } else if (bp->b_bcount >= PAGE_SIZE) {
1399 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1400 bp->b_pages[i] = bogus_page;
1401 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1402 }
1403 }
1404 foff += resid;
1405 iocount -= resid;
1406 }
1407 }
1408}
1409
1410/*
1411 * Tell the VM system that the pages associated with this buffer
1412 * are clean. This is used for delayed writes where the data is
 1413 * going to go to disk eventually without additional VM intervention.
1414 */
1415void
1416vfs_clean_pages(struct buf * bp)
1417{
1418 int i;
1419
1420 if (bp->b_flags & B_VMIO) {
1421 vm_offset_t foff =
1422 bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1423 int iocount = bp->b_bufsize;
1424
1425 for (i = 0; i < bp->b_npages; i++) {
1426 vm_page_t m = bp->b_pages[i];
1427 int resid = (m->offset + PAGE_SIZE) - foff;
1428
1429 if (resid > iocount)
1430 resid = iocount;
1431 if (resid > 0) {
1432 vm_page_set_validclean(m,
1433 foff & (PAGE_SIZE-1), resid);
1434 }
1435 foff += resid;
1436 iocount -= resid;
1437 }
1438 }
1439}
1440
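/*
 * Zero the portions of a VMIO buffer that are not already valid in the
 * underlying VM pages, then mark those pages fully valid.  Non-VMIO
 * buffers are simply cleared with clrbuf().
 */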
1441void
1442vfs_bio_clrbuf(struct buf *bp) {
1443 int i;
1444 if( bp->b_flags & B_VMIO) {
1445 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1446 int j;
1447 if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
1448 for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) {
1449 bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
1450 }
1451 }
1452 bp->b_resid = 0;
1453 return;
1454 }
1455 for(i=0;i<bp->b_npages;i++) {
1456 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1457 continue;
1458 if( bp->b_pages[i]->valid == 0) {
1459 bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
1460 } else {
1461 int j;
1462 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1463 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1464 bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
1465 }
1466 }
1467 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1468 }
1469 bp->b_resid = 0;
1470 } else {
1471 clrbuf(bp);
1472 }
1473}
1474
1475/*
 1476 * vm_hold_load_pages and vm_hold_free_pages move pages into and out of
 1477 * a buffer's address space. The pages are anonymous and are
1478 * not associated with a file object.
1479 */
1480void
1481vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1482{
1483 vm_offset_t pg;
1484 vm_page_t p;
1485 vm_offset_t from = round_page(froma);
1486 vm_offset_t to = round_page(toa);
1487
1488 for (pg = from; pg < to; pg += PAGE_SIZE) {
1489
1490tryagain:
1491
1492 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1493 VM_ALLOC_NORMAL);
1494 if (!p) {
1495 VM_WAIT;
1496 goto tryagain;
1497 }
1498 vm_page_wire(p);
1499 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1500 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1501 PAGE_WAKEUP(p);
1502 bp->b_npages++;
1503 }
1504}
1505
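/*
 * Counterpart of vm_hold_load_pages(): unmap and free the anonymous
 * pages backing the buffer KVA range [froma, toa), decrementing
 * b_npages for each page released.
 */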
1506void
1507vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1508{
1509 vm_offset_t pg;
1510 vm_page_t p;
1511 vm_offset_t from = round_page(froma);
1512 vm_offset_t to = round_page(toa);
1513
1514 for (pg = from; pg < to; pg += PAGE_SIZE) {
1515 p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1516 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1517 pmap_kremove(pg);
1518 vm_page_free(p);
1519 --bp->b_npages;
1520 }
1521}
350 }
351
352 /*
353 * Set the *dirty* buffer range based upon the VM system dirty pages.
354 */
355 vfs_setdirty(bp);
356
357 /*
358 * We need to do this here to satisfy the vnode_pager and the
359 * pageout daemon, so that it thinks that the pages have been
360 * "cleaned". Note that since the pages are in a delayed write
361 * buffer -- the VFS layer "will" see that the pages get written
362 * out on the next sync, or perhaps the cluster will be completed.
363 */
364 vfs_clean_pages(bp);
365 brelse(bp);
366 return;
367}
368
369/*
370 * Asynchronous write.
371 * Start output on a buffer, but do not wait for it to complete.
372 * The buffer is released when the output completes.
373 */
374void
375bawrite(struct buf * bp)
376{
377 bp->b_flags |= B_ASYNC;
378 (void) VOP_BWRITE(bp);
379}
380
381/*
382 * Release a buffer.
383 */
384void
385brelse(struct buf * bp)
386{
387 int s;
388
389 if (bp->b_flags & B_CLUSTER) {
390 relpbuf(bp);
391 return;
392 }
393 /* anyone need a "free" block? */
394 s = splbio();
395
396 if (needsbuffer) {
397 needsbuffer = 0;
398 wakeup(&needsbuffer);
399 }
400
401 /* anyone need this block? */
402 if (bp->b_flags & B_WANTED) {
403 bp->b_flags &= ~(B_WANTED | B_AGE);
404 wakeup(bp);
405 } else if (bp->b_flags & B_VMIO) {
406 bp->b_flags &= ~B_WANTED;
407 wakeup(bp);
408 }
409 if (bp->b_flags & B_LOCKED)
410 bp->b_flags &= ~B_ERROR;
411
412 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
413 (bp->b_bufsize <= 0)) {
414 bp->b_flags |= B_INVAL;
415 bp->b_flags &= ~(B_DELWRI | B_CACHE);
416 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
417 brelvp(bp);
418 }
419
420 /*
421 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
422 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
423 * but the VM object is kept around. The B_NOCACHE flag is used to
424 * invalidate the pages in the VM object.
425 */
426 if (bp->b_flags & B_VMIO) {
427 vm_offset_t foff;
428 vm_object_t obj;
429 int i, resid;
430 vm_page_t m;
431 int iototal = bp->b_bufsize;
432
433 foff = 0;
434 obj = 0;
435 if (bp->b_npages) {
436 if (bp->b_vp && bp->b_vp->v_mount) {
437 foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
438 } else {
439 /*
440 * vnode pointer has been ripped away --
441 * probably file gone...
442 */
443 foff = bp->b_pages[0]->offset;
444 }
445 }
446 for (i = 0; i < bp->b_npages; i++) {
447 m = bp->b_pages[i];
448 if (m == bogus_page) {
449 m = vm_page_lookup(obj, foff);
450 if (!m) {
451 panic("brelse: page missing\n");
452 }
453 bp->b_pages[i] = m;
454 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
455 }
456 resid = (m->offset + PAGE_SIZE) - foff;
457 if (resid > iototal)
458 resid = iototal;
459 if (resid > 0) {
460 /*
461 * Don't invalidate the page if the local machine has already
462 * modified it. This is the lesser of two evils, and should
463 * be fixed.
464 */
465 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
466 vm_page_test_dirty(m);
467 if (m->dirty == 0) {
468 vm_page_set_invalid(m, foff, resid);
469 if (m->valid == 0)
470 vm_page_protect(m, VM_PROT_NONE);
471 }
472 }
473 }
474 foff += resid;
475 iototal -= resid;
476 }
477
478 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
479 for(i=0;i<bp->b_npages;i++) {
480 m = bp->b_pages[i];
481 --m->bmapped;
482 if (m->bmapped == 0) {
483 if (m->flags & PG_WANTED) {
484 wakeup(m);
485 m->flags &= ~PG_WANTED;
486 }
487 vm_page_test_dirty(m);
488 if ((m->dirty & m->valid) == 0 &&
489 (m->flags & PG_REFERENCED) == 0 &&
490 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
491 vm_page_cache(m);
492 } else if ((m->flags & PG_ACTIVE) == 0) {
493 vm_page_activate(m);
494 m->act_count = 0;
495 }
496 }
497 }
498 bufspace -= bp->b_bufsize;
499 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
500 bp->b_npages = 0;
501 bp->b_bufsize = 0;
502 bp->b_flags &= ~B_VMIO;
503 if (bp->b_vp)
504 brelvp(bp);
505 }
506 }
507 if (bp->b_qindex != QUEUE_NONE)
508 panic("brelse: free buffer onto another queue???");
509
510 /* enqueue */
511 /* buffers with no memory */
512 if (bp->b_bufsize == 0) {
513 bp->b_qindex = QUEUE_EMPTY;
514 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
515 LIST_REMOVE(bp, b_hash);
516 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
517 bp->b_dev = NODEV;
518 /* buffers with junk contents */
519 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
520 bp->b_qindex = QUEUE_AGE;
521 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
522 LIST_REMOVE(bp, b_hash);
523 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
524 bp->b_dev = NODEV;
525 /* buffers that are locked */
526 } else if (bp->b_flags & B_LOCKED) {
527 bp->b_qindex = QUEUE_LOCKED;
528 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
529 /* buffers with stale but valid contents */
530 } else if (bp->b_flags & B_AGE) {
531 bp->b_qindex = QUEUE_AGE;
532 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
533 /* buffers with valid and quite potentially reuseable contents */
534 } else {
535 bp->b_qindex = QUEUE_LRU;
536 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
537 }
538
539 /* unlock */
540 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
541 splx(s);
542}
543
544/*
545 * Check to see if a block is currently memory resident.
546 */
547static __inline struct buf *
548gbincore(struct vnode * vp, daddr_t blkno)
549{
550 struct buf *bp;
551 struct bufhashhdr *bh;
552
553 bh = BUFHASH(vp, blkno);
554 bp = bh->lh_first;
555
556 /* Search hash chain */
557 while (bp != NULL) {
558 /* hit */
559 if (bp->b_vp == vp && bp->b_lblkno == blkno) {
560 break;
561 }
562 bp = bp->b_hash.le_next;
563 }
564 return (bp);
565}
566
567/*
568 * this routine implements clustered async writes for
569 * clearing out B_DELWRI buffers... This is much better
570 * than the old way of writing only one buffer at a time.
571 */
572void
573vfs_bio_awrite(struct buf * bp)
574{
575 int i;
576 daddr_t lblkno = bp->b_lblkno;
577 struct vnode *vp = bp->b_vp;
578 int s;
579 int ncl;
580 struct buf *bpa;
581
582 s = splbio();
583 if (vp->v_mount && (vp->v_flag & VVMIO) &&
584 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
585 int size = vp->v_mount->mnt_stat.f_iosize;
586 int maxcl = MAXPHYS / size;
587
588 for (i = 1; i < maxcl; i++) {
589 if ((bpa = gbincore(vp, lblkno + i)) &&
590 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
591 (B_DELWRI | B_CLUSTEROK)) &&
592 (bpa->b_bufsize == size)) {
593 if ((bpa->b_blkno == bpa->b_lblkno) ||
594 (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
595 break;
596 } else {
597 break;
598 }
599 }
600 ncl = i;
601 /*
602 * this is a possible cluster write
603 */
604 if (ncl != 1) {
605 bremfree(bp);
606 cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
607 splx(s);
608 return;
609 }
610 }
611 /*
612 * default (old) behavior, writing out only one block
613 */
614 bremfree(bp);
615 bp->b_flags |= B_BUSY | B_ASYNC;
616 (void) VOP_BWRITE(bp);
617 splx(s);
618}
619
620
621/*
622 * Find a buffer header which is available for use.
623 */
624static struct buf *
625getnewbuf(int slpflag, int slptimeo, int doingvmio)
626{
627 struct buf *bp;
628 int s;
629 int firstbp = 1;
630
631 s = splbio();
632start:
633 if (bufspace >= maxbufspace)
634 goto trytofreespace;
635
636 /* can we constitute a new buffer? */
637 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
638 if (bp->b_qindex != QUEUE_EMPTY)
639 panic("getnewbuf: inconsistent EMPTY queue");
640 bremfree(bp);
641 goto fillbuf;
642 }
643trytofreespace:
644 /*
645 * We keep the file I/O from hogging metadata I/O
646 * This is desirable because file data is cached in the
647 * VM/Buffer cache even if a buffer is freed.
648 */
649 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
650 if (bp->b_qindex != QUEUE_AGE)
651 panic("getnewbuf: inconsistent AGE queue");
652 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
653 if (bp->b_qindex != QUEUE_LRU)
654 panic("getnewbuf: inconsistent LRU queue");
655 }
656 if (!bp) {
657 /* wait for a free buffer of any kind */
658 needsbuffer = 1;
659 tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
660 splx(s);
661 return (0);
662 }
663
664 /* if we are a delayed write, convert to an async write */
665 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
666 vfs_bio_awrite(bp);
667 if (!slpflag && !slptimeo) {
668 splx(s);
669 return (0);
670 }
671 goto start;
672 }
673
674 if (bp->b_flags & B_WANTED) {
675 bp->b_flags &= ~B_WANTED;
676 wakeup(bp);
677 }
678 bremfree(bp);
679
680 if (bp->b_flags & B_VMIO) {
681 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
682 brelse(bp);
683 bremfree(bp);
684 }
685
686 if (bp->b_vp)
687 brelvp(bp);
688
689 /* we are not free, nor do we contain interesting data */
690 if (bp->b_rcred != NOCRED)
691 crfree(bp->b_rcred);
692 if (bp->b_wcred != NOCRED)
693 crfree(bp->b_wcred);
694fillbuf:
695 bp->b_flags |= B_BUSY;
696 LIST_REMOVE(bp, b_hash);
697 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
698 splx(s);
699 if (bp->b_bufsize) {
700 allocbuf(bp, 0);
701 }
702 bp->b_flags = B_BUSY;
703 bp->b_dev = NODEV;
704 bp->b_vp = NULL;
705 bp->b_blkno = bp->b_lblkno = 0;
706 bp->b_iodone = 0;
707 bp->b_error = 0;
708 bp->b_resid = 0;
709 bp->b_bcount = 0;
710 bp->b_npages = 0;
711 bp->b_wcred = bp->b_rcred = NOCRED;
712 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
713 bp->b_dirtyoff = bp->b_dirtyend = 0;
714 bp->b_validoff = bp->b_validend = 0;
715 if (bufspace >= maxbufspace) {
716 s = splbio();
717 bp->b_flags |= B_INVAL;
718 brelse(bp);
719 goto trytofreespace;
720 }
721 return (bp);
722}
723
724/*
725 * Check to see if a block is currently memory resident.
726 */
727struct buf *
728incore(struct vnode * vp, daddr_t blkno)
729{
730 struct buf *bp;
731 struct bufhashhdr *bh;
732
733 int s = splbio();
734
735 bh = BUFHASH(vp, blkno);
736 bp = bh->lh_first;
737
738 /* Search hash chain */
739 while (bp != NULL) {
740 /* hit */
741 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
742 (bp->b_flags & B_INVAL) == 0) {
743 break;
744 }
745 bp = bp->b_hash.le_next;
746 }
747 splx(s);
748 return (bp);
749}
750
751/*
752 * Returns true if no I/O is needed to access the
753 * associated VM object. This is like incore except
754 * it also hunts around in the VM system for the data.
755 */
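/*
 * For example, with an 8K filesystem block size and 4K pages, tinc ends
 * up as PAGE_SIZE and the loop below checks two resident pages for full
 * validity of the block's range; any missing or partially valid page
 * makes inmem() return 0.
 */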
756
757int
758inmem(struct vnode * vp, daddr_t blkno)
759{
760 vm_object_t obj;
761 vm_offset_t off, toff, tinc;
762 vm_page_t m;
763
764 if (incore(vp, blkno))
765 return 1;
766 if (vp->v_mount == NULL)
767 return 0;
768 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
769 return 0;
770
771 obj = vp->v_object;
772 tinc = PAGE_SIZE;
773 if (tinc > vp->v_mount->mnt_stat.f_iosize)
774 tinc = vp->v_mount->mnt_stat.f_iosize;
775 off = blkno * vp->v_mount->mnt_stat.f_iosize;
776
777 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
778 int mask;
779
780 m = vm_page_lookup(obj, trunc_page(toff + off));
781 if (!m)
782 return 0;
783 if (vm_page_is_valid(m, toff + off, tinc) == 0)
784 return 0;
785 }
786 return 1;
787}
788
789/*
790 * Set the dirty range for the buffer -- this is for NFS.  If the
791 * file is mapped and pages have been written to through the mapping,
792 * let the buffer know about it here.  We want the entire range of
793 * the buffer to be marked dirty if any of the pages have been
794 * written to, for consistency with the b_validoff and b_validend
795 * that are set in the NFS write code and
796 * used by the NFS read code.
797 */
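/*
 * Worked example (4K pages, an 8K buffer): if only the second page was
 * modified through a mapping, both scans below stop at page index 1 and
 * the dirty range is widened so that it covers at least [4096, 8192)
 * (subject to the end-of-object clamp).  The range is only ever widened
 * here, never shrunk.
 */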
798static void
799vfs_setdirty(struct buf *bp) {
800 int i;
801 vm_object_t object;
802 vm_offset_t boffset, offset;
803 /*
804	 * We only bother scanning for pages modified via the VM system if
805	 * the object has ever been mapped for writing: the OBJ_WRITEABLE
806	 * flag is not cleared simply by write-protecting the pages.
807 */
808 if ((bp->b_flags & B_VMIO) &&
809 ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
810 /*
811 * test the pages to see if they have been modified directly
812 * by users through the VM system.
813 */
814 for (i = 0; i < bp->b_npages; i++)
815 vm_page_test_dirty(bp->b_pages[i]);
816
817 /*
818 * scan forwards for the first page modified
819 */
820 for (i = 0; i < bp->b_npages; i++) {
821 if (bp->b_pages[i]->dirty) {
822 break;
823 }
824 }
825 boffset = i * PAGE_SIZE;
826 if (boffset < bp->b_dirtyoff) {
827 bp->b_dirtyoff = boffset;
828 }
829
830 /*
831 * scan backwards for the last page modified
832 */
833 for (i = bp->b_npages - 1; i >= 0; --i) {
834 if (bp->b_pages[i]->dirty) {
835 break;
836 }
837 }
838 boffset = (i + 1) * PAGE_SIZE;
839 offset = boffset + bp->b_pages[0]->offset;
840 if (offset >= object->size) {
841 boffset = object->size - bp->b_pages[0]->offset;
842 }
843 if (bp->b_dirtyend < boffset) {
844 bp->b_dirtyend = boffset;
845 }
846 }
847}
848
849/*
850 * Get a block given a specified block and offset into a file/device.
851 */
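/*
 * Illustrative only -- a rough sketch of the classic bread()-style read
 * path built on top of getblk(), where "vp", "blkno" and "size" describe
 * the block wanted:
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags |= B_READ;
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(bp);
 *		error = biowait(bp);
 *	}
 */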
852struct buf *
853getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
854{
855 struct buf *bp;
856 int s;
857 struct bufhashhdr *bh;
858 vm_offset_t off;
859 int nleft;
860
861 s = splbio();
862loop:
863	if ((bp = gbincore(vp, blkno)) != NULL) {
864 if (bp->b_flags & (B_BUSY|B_INVAL)) {
865 bp->b_flags |= B_WANTED;
866 if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
867 goto loop;
868
869 splx(s);
870 return (struct buf *) NULL;
871 }
872 bp->b_flags |= B_BUSY | B_CACHE;
873 bremfree(bp);
874 /*
875		 * check for size inconsistencies
876 */
877 if (bp->b_bcount != size) {
878 allocbuf(bp, size);
879 }
880 splx(s);
881 return (bp);
882 } else {
883 vm_object_t obj;
884 int doingvmio;
885
886 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
887 doingvmio = 1;
888 } else {
889 doingvmio = 0;
890 }
891 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
892 if (slpflag || slptimeo)
893 return NULL;
894 goto loop;
895 }
896
897 /*
898 * This code is used to make sure that a buffer is not
899 * created while the getnewbuf routine is blocked.
900 * Normally the vnode is locked so this isn't a problem.
901 * VBLK type I/O requests, however, don't lock the vnode.
902 */
903 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
904 bp->b_flags |= B_INVAL;
905 brelse(bp);
906 goto loop;
907 }
908
909 /*
910 * Insert the buffer into the hash, so that it can
911 * be found by incore.
912 */
913 bp->b_blkno = bp->b_lblkno = blkno;
914 bgetvp(vp, bp);
915 LIST_REMOVE(bp, b_hash);
916 bh = BUFHASH(vp, blkno);
917 LIST_INSERT_HEAD(bh, bp, b_hash);
918
919 if (doingvmio) {
920 bp->b_flags |= (B_VMIO | B_CACHE);
921#if defined(VFS_BIO_DEBUG)
922 if (vp->v_type != VREG)
923 printf("getblk: vmioing file type %d???\n", vp->v_type);
924#endif
925 } else {
926 bp->b_flags &= ~B_VMIO;
927 }
928 splx(s);
929
930 allocbuf(bp, size);
931 return (bp);
932 }
933}
934
935/*
936 * Get an empty, disassociated buffer of given size.
937 */
938struct buf *
939geteblk(int size)
940{
941 struct buf *bp;
942
943 while ((bp = getnewbuf(0, 0, 0)) == 0);
944 allocbuf(bp, size);
945 bp->b_flags |= B_INVAL;
946 return (bp);
947}
948
949/*
950 * This code constitutes the buffer memory from either anonymous system
951 * memory (in the case of non-VMIO operations) or from an associated
952 * VM object (in the case of VMIO operations).
953 *
954 * Note that this code is tricky, and has many complications to resolve
955 * deadlock or inconsistent data situations.  Tread lightly!!!
956 *
957 * Modify the length of a buffer's underlying buffer storage without
958 * destroying information (unless, of course, the buffer is shrinking).
959 */
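/*
 * Sizing example for the VMIO case (assuming 4K pages and a DEV_BSIZE of
 * 512): growing a buffer to size 6144 gives newbsize = 6144 and
 * desiredpages = round_page(6144) / PAGE_SIZE = 2, so two object pages
 * end up mapped into the buffer's kernel address window via pmap_qenter().
 */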
960int
961allocbuf(struct buf * bp, int size)
962{
963
964 int s;
965 int newbsize, mbsize;
966 int i;
967
968 if (!(bp->b_flags & B_BUSY))
969 panic("allocbuf: buffer not busy");
970
971 if ((bp->b_flags & B_VMIO) == 0) {
972 /*
973 * Just get anonymous memory from the kernel
974 */
975 mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
976 newbsize = round_page(size);
977
978 if (newbsize < bp->b_bufsize) {
979 vm_hold_free_pages(
980 bp,
981 (vm_offset_t) bp->b_data + newbsize,
982 (vm_offset_t) bp->b_data + bp->b_bufsize);
983 } else if (newbsize > bp->b_bufsize) {
984 vm_hold_load_pages(
985 bp,
986 (vm_offset_t) bp->b_data + bp->b_bufsize,
987 (vm_offset_t) bp->b_data + newbsize);
988 }
989 } else {
990 vm_page_t m;
991 int desiredpages;
992
993 newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
994 desiredpages = round_page(newbsize) / PAGE_SIZE;
995
996 if (newbsize < bp->b_bufsize) {
997 if (desiredpages < bp->b_npages) {
998 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
999 desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1000 for (i = desiredpages; i < bp->b_npages; i++) {
1001 m = bp->b_pages[i];
1002 s = splhigh();
1003 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1004 m->flags |= PG_WANTED;
1005 tsleep(m, PVM, "biodep", 0);
1006 }
1007 splx(s);
1008
1009 if (m->bmapped == 0) {
1010 printf("allocbuf: bmapped is zero for page %d\n", i);
1011 panic("allocbuf: error");
1012 }
1013 --m->bmapped;
1014 if (m->bmapped == 0) {
1015 vm_page_protect(m, VM_PROT_NONE);
1016 vm_page_free(m);
1017 }
1018 bp->b_pages[i] = NULL;
1019 }
1020 bp->b_npages = desiredpages;
1021 }
1022 } else if (newbsize > bp->b_bufsize) {
1023 vm_object_t obj;
1024 vm_offset_t tinc, off, toff, objoff;
1025 int pageindex, curbpnpages;
1026 struct vnode *vp;
1027 int bsize;
1028
1029 vp = bp->b_vp;
1030 bsize = vp->v_mount->mnt_stat.f_iosize;
1031
1032 if (bp->b_npages < desiredpages) {
1033 obj = vp->v_object;
1034 tinc = PAGE_SIZE;
1035 if (tinc > bsize)
1036 tinc = bsize;
1037 off = bp->b_lblkno * bsize;
1038 doretry:
1039 curbpnpages = bp->b_npages;
1040 bp->b_flags |= B_CACHE;
1041 for (toff = 0; toff < newbsize; toff += tinc) {
1042 int mask;
1043 int bytesinpage;
1044
1045 pageindex = toff / PAGE_SIZE;
1046 objoff = trunc_page(toff + off);
1047 if (pageindex < curbpnpages) {
1048 int pb;
1049
1050 m = bp->b_pages[pageindex];
1051 if (m->offset != objoff)
1052 panic("allocbuf: page changed offset??!!!?");
1053 bytesinpage = tinc;
1054 if (tinc > (newbsize - toff))
1055 bytesinpage = newbsize - toff;
1056 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1057 bp->b_flags &= ~B_CACHE;
1058 }
1059 if ((m->flags & PG_ACTIVE) == 0) {
1060 vm_page_activate(m);
1061 m->act_count = 0;
1062 }
1063 continue;
1064 }
1065 m = vm_page_lookup(obj, objoff);
1066 if (!m) {
1067 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1068 if (!m) {
1069 int j;
1070
1071 for (j = bp->b_npages; j < pageindex; j++) {
1072 PAGE_WAKEUP(bp->b_pages[j]);
1073 }
1074 VM_WAIT;
1075 goto doretry;
1076 }
1077 vm_page_activate(m);
1078 m->act_count = 0;
1079 m->valid = 0;
1080 bp->b_flags &= ~B_CACHE;
1081 } else if (m->flags & PG_BUSY) {
1082 int j;
1083
1084 for (j = bp->b_npages; j < pageindex; j++) {
1085 PAGE_WAKEUP(bp->b_pages[j]);
1086 }
1087
1088 s = splbio();
1089 m->flags |= PG_WANTED;
1090 tsleep(m, PRIBIO, "pgtblk", 0);
1091 splx(s);
1092
1093 goto doretry;
1094 } else {
1095 int pb;
1096 if ((curproc != pageproc) &&
1097 (m->flags & PG_CACHE) &&
1098 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1099 pagedaemon_wakeup();
1100 }
1101 bytesinpage = tinc;
1102 if (tinc > (newbsize - toff))
1103 bytesinpage = newbsize - toff;
1104 if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1105 bp->b_flags &= ~B_CACHE;
1106 }
1107 if ((m->flags & PG_ACTIVE) == 0) {
1108 vm_page_activate(m);
1109 m->act_count = 0;
1110 }
1111 m->flags |= PG_BUSY;
1112 }
1113 bp->b_pages[pageindex] = m;
1114 curbpnpages = pageindex + 1;
1115 }
1116 for (i = bp->b_npages; i < curbpnpages; i++) {
1117 m = bp->b_pages[i];
1118 m->bmapped++;
1119 PAGE_WAKEUP(m);
1120 }
1121 bp->b_npages = curbpnpages;
1122 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1123 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1124 bp->b_data += off % PAGE_SIZE;
1125 }
1126 }
1127 }
1128 bufspace += (newbsize - bp->b_bufsize);
1129 bp->b_bufsize = newbsize;
1130 bp->b_bcount = size;
1131 return 1;
1132}
1133
1134/*
1135 * Wait for buffer I/O completion, returning error status.
1136 */
1137int
1138biowait(register struct buf * bp)
1139{
1140 int s;
1141
1142 s = splbio();
1143 while ((bp->b_flags & B_DONE) == 0)
1144 tsleep(bp, PRIBIO, "biowait", 0);
1145 splx(s);
1146 if (bp->b_flags & B_EINTR) {
1147 bp->b_flags &= ~B_EINTR;
1148 return (EINTR);
1149 }
1150 if (bp->b_flags & B_ERROR) {
1151 return (bp->b_error ? bp->b_error : EIO);
1152 } else {
1153 return (0);
1154 }
1155}
1156
1157/*
1158 * Finish I/O on a buffer, calling an optional function.
1159 * This is usually called from interrupt level, so process blocking
1160 * is not *a good idea*.
1161 */
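/*
 * Illustrative only: instead of sleeping in biowait(), an async consumer
 * may ask for a completion callback here ("mydone" is a hypothetical
 * function taking a struct buf pointer):
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = mydone;
 *	VOP_STRATEGY(bp);
 */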
1162void
1163biodone(register struct buf * bp)
1164{
1165 int s;
1166
1167 s = splbio();
1168 if (!(bp->b_flags & B_BUSY))
1169 panic("biodone: buffer not busy");
1170
1171 if (bp->b_flags & B_DONE) {
1172 splx(s);
1173 printf("biodone: buffer already done\n");
1174 return;
1175 }
1176 bp->b_flags |= B_DONE;
1177
1178 if ((bp->b_flags & B_READ) == 0) {
1179 struct vnode *vp = bp->b_vp;
1180 vwakeup(bp);
1181 }
1182#ifdef BOUNCE_BUFFERS
1183 if (bp->b_flags & B_BOUNCE)
1184 vm_bounce_free(bp);
1185#endif
1186
1187 /* call optional completion function if requested */
1188 if (bp->b_flags & B_CALL) {
1189 bp->b_flags &= ~B_CALL;
1190 (*bp->b_iodone) (bp);
1191 splx(s);
1192 return;
1193 }
1194 if (bp->b_flags & B_VMIO) {
1195 int i, resid;
1196 vm_offset_t foff;
1197 vm_page_t m;
1198 vm_object_t obj;
1199 int iosize;
1200 struct vnode *vp = bp->b_vp;
1201
1202 foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1203 obj = vp->v_object;
1204 if (!obj) {
1205 panic("biodone: no object");
1206 }
1207#if defined(VFS_BIO_DEBUG)
1208 if (obj->paging_in_progress < bp->b_npages) {
1209 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1210 obj->paging_in_progress, bp->b_npages);
1211 }
1212#endif
1213 iosize = bp->b_bufsize;
1214 for (i = 0; i < bp->b_npages; i++) {
1215 int bogusflag = 0;
1216 m = bp->b_pages[i];
1217 if (m == bogus_page) {
1218 bogusflag = 1;
1219 m = vm_page_lookup(obj, foff);
1220 if (!m) {
1221#if defined(VFS_BIO_DEBUG)
1222 printf("biodone: page disappeared\n");
1223#endif
1224 --obj->paging_in_progress;
1225 continue;
1226 }
1227 bp->b_pages[i] = m;
1228 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1229 }
1230#if defined(VFS_BIO_DEBUG)
1231 if (trunc_page(foff) != m->offset) {
1232 printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1233 }
1234#endif
1235 resid = (m->offset + PAGE_SIZE) - foff;
1236 if (resid > iosize)
1237 resid = iosize;
1238 /*
1239 * In the write case, the valid and clean bits are
1240 * already changed correctly, so we only need to do this
1241 * here in the read case.
1242 */
1243 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1244 vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
1245 }
1246
1247 /*
1248			 * When debugging new filesystems or buffer I/O methods, this
1249			 * is the most common error that pops up.  If you see this, you
1250			 * have not set the page busy flag correctly!!!
1251 */
1252 if (m->busy == 0) {
1253 printf("biodone: page busy < 0, "
1254 "off: %ld, foff: %ld, "
1255 "resid: %d, index: %d\n",
1256 m->offset, foff, resid, i);
1257 printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
1258 bp->b_vp->v_mount->mnt_stat.f_iosize,
1259 bp->b_lblkno, bp->b_flags, bp->b_npages);
1260 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1261 m->valid, m->dirty, m->bmapped);
1262 panic("biodone: page busy < 0\n");
1263 }
1264 --m->busy;
1265 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1266 m->flags &= ~PG_WANTED;
1267 wakeup(m);
1268 }
1269 --obj->paging_in_progress;
1270 foff += resid;
1271 iosize -= resid;
1272 }
1273 if (obj && obj->paging_in_progress == 0 &&
1274 (obj->flags & OBJ_PIPWNT)) {
1275 obj->flags &= ~OBJ_PIPWNT;
1276 wakeup(obj);
1277 }
1278 }
1279 /*
1280	 * For asynchronous completions, release the buffer now.  brelse()
1281	 * checks for B_WANTED and does the wakeup there if necessary, so
1282	 * there is no need to do a wakeup here in the async case.
1283 */
1284
1285 if (bp->b_flags & B_ASYNC) {
1286 brelse(bp);
1287 } else {
1288 bp->b_flags &= ~B_WANTED;
1289 wakeup(bp);
1290 }
1291 splx(s);
1292}
1293
1294int
1295count_lock_queue()
1296{
1297 int count;
1298 struct buf *bp;
1299
1300 count = 0;
1301 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1302 bp != NULL;
1303 bp = bp->b_freelist.tqe_next)
1304 count++;
1305 return (count);
1306}
1307
1308int vfs_update_interval = 30;
1309
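/*
 * vfs_update() is the body of the filesystem update daemon: it sleeps for
 * vfs_update_interval seconds (or until awakened through
 * vfs_update_wakeup) and then calls sync() to push out delayed writes,
 * forever.
 */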
1310void
1311vfs_update()
1312{
1313 (void) spl0();
1314 while (1) {
1315 tsleep(&vfs_update_wakeup, PRIBIO, "update",
1316 hz * vfs_update_interval);
1317 vfs_update_wakeup = 0;
1318 sync(curproc, NULL, NULL);
1319 }
1320}
1321
1322/*
1323 * This routine is called in lieu of biodone() in the case of
1324 * incomplete I/O.  This keeps the busy status of the pages
1325 * consistent.
1326 */
1327void
1328vfs_unbusy_pages(struct buf * bp)
1329{
1330 int i;
1331
1332 if (bp->b_flags & B_VMIO) {
1333 struct vnode *vp = bp->b_vp;
1334 vm_object_t obj = vp->v_object;
1335 vm_offset_t foff;
1336
1337 foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
1338
1339 for (i = 0; i < bp->b_npages; i++) {
1340 vm_page_t m = bp->b_pages[i];
1341
1342 if (m == bogus_page) {
1343 m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
1344 if (!m) {
1345 panic("vfs_unbusy_pages: page missing\n");
1346 }
1347 bp->b_pages[i] = m;
1348 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1349 }
1350 --obj->paging_in_progress;
1351 --m->busy;
1352 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1353 m->flags &= ~PG_WANTED;
1354 wakeup(m);
1355 }
1356 }
1357 if (obj->paging_in_progress == 0 &&
1358 (obj->flags & OBJ_PIPWNT)) {
1359 obj->flags &= ~OBJ_PIPWNT;
1360 wakeup(obj);
1361 }
1362 }
1363}
1364
1365/*
1366 * This routine is called before a device strategy routine.
1367 * It is used to tell the VM system that paging I/O is in
1368 * progress, and to treat the pages associated with the buffer
1369 * almost as if they were PG_BUSY.  The object's paging_in_progress
1370 * count is also maintained so that the object does not become
1371 * inconsistent.
1372 */
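/*
 * Illustrative ordering (a sketch, not a contract): vfs_busy_pages() is
 * called just before the buffer is handed to the driver via
 * VOP_STRATEGY(); clear_modify is nonzero when the transfer will clean
 * the pages (a write-out) and zero for reads.  The accounting is undone
 * by biodone() when the I/O finishes, or by vfs_unbusy_pages() if it
 * never starts.
 */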
1373void
1374vfs_busy_pages(struct buf * bp, int clear_modify)
1375{
1376 int i;
1377
1378 if (bp->b_flags & B_VMIO) {
1379 vm_object_t obj = bp->b_vp->v_object;
1380 vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1381 int iocount = bp->b_bufsize;
1382
1383 vfs_setdirty(bp);
1384 for (i = 0; i < bp->b_npages; i++) {
1385 vm_page_t m = bp->b_pages[i];
1386 int resid = (m->offset + PAGE_SIZE) - foff;
1387
1388 if (resid > iocount)
1389 resid = iocount;
1390 if ((bp->b_flags & B_CLUSTER) == 0) {
1391 obj->paging_in_progress++;
1392 m->busy++;
1393 }
1394 if (clear_modify) {
1395 vm_page_protect(m, VM_PROT_READ);
1396 vm_page_set_validclean(m,
1397 foff & (PAGE_SIZE-1), resid);
1398 } else if (bp->b_bcount >= PAGE_SIZE) {
1399 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1400 bp->b_pages[i] = bogus_page;
1401 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1402 }
1403 }
1404 foff += resid;
1405 iocount -= resid;
1406 }
1407 }
1408}
1409
1410/*
1411 * Tell the VM system that the pages associated with this buffer
1412 * are clean. This is used for delayed writes where the data is
1413 * going to go to disk eventually without additional VM intervention.
1414 */
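/*
 * Illustrative only -- a delayed-write path would typically mark the
 * buffer B_DELWRI, call vfs_clean_pages(bp) so the pageout daemon does
 * not consider the pages dirty, and then release the buffer with
 * brelse().
 */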
1415void
1416vfs_clean_pages(struct buf * bp)
1417{
1418 int i;
1419
1420 if (bp->b_flags & B_VMIO) {
1421 vm_offset_t foff =
1422 bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1423 int iocount = bp->b_bufsize;
1424
1425 for (i = 0; i < bp->b_npages; i++) {
1426 vm_page_t m = bp->b_pages[i];
1427 int resid = (m->offset + PAGE_SIZE) - foff;
1428
1429 if (resid > iocount)
1430 resid = iocount;
1431 if (resid > 0) {
1432 vm_page_set_validclean(m,
1433 foff & (PAGE_SIZE-1), resid);
1434 }
1435 foff += resid;
1436 iocount -= resid;
1437 }
1438 }
1439}
1440
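/*
 * Zero the invalid portions of a buffer.  For a VMIO buffer, only the
 * pieces not already marked valid in the underlying pages are cleared;
 * a non-VMIO buffer is simply cleared with clrbuf().
 */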
1441void
1442vfs_bio_clrbuf(struct buf *bp) {
1443 int i;
1444 if( bp->b_flags & B_VMIO) {
1445 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1446 int j;
1447 if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
1448 for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) {
1449 bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
1450 }
1451 }
1452 bp->b_resid = 0;
1453 return;
1454 }
1455 for(i=0;i<bp->b_npages;i++) {
1456 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1457 continue;
1458 if( bp->b_pages[i]->valid == 0) {
1459 bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
1460 } else {
1461 int j;
1462 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1463 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1464 bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
1465 }
1466 }
1467 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1468 }
1469 bp->b_resid = 0;
1470 } else {
1471 clrbuf(bp);
1472 }
1473}
1474
1475/*
1476 * vm_hold_load_pages and vm_hold_free_pages get and release pages for
1477 * a buffer's address space.  The pages are anonymous and are
1478 * not associated with a file object.
1479 */
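/*
 * The b_pages[] index computed below follows from the fixed KVA layout:
 * each buffer header owns a MAXBSIZE-sized window of kernel address space
 * (b_data = buffers_kva + (bp - buf) * MAXBSIZE), so the page slot for a
 * kernel address pg within that window is (pg - b_data) / PAGE_SIZE.
 */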
1480void
1481vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1482{
1483 vm_offset_t pg;
1484 vm_page_t p;
1485 vm_offset_t from = round_page(froma);
1486 vm_offset_t to = round_page(toa);
1487
1488 for (pg = from; pg < to; pg += PAGE_SIZE) {
1489
1490tryagain:
1491
1492 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1493 VM_ALLOC_NORMAL);
1494 if (!p) {
1495 VM_WAIT;
1496 goto tryagain;
1497 }
1498 vm_page_wire(p);
1499 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1500 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1501 PAGE_WAKEUP(p);
1502 bp->b_npages++;
1503 }
1504}
1505
1506void
1507vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1508{
1509 vm_offset_t pg;
1510 vm_page_t p;
1511 vm_offset_t from = round_page(froma);
1512 vm_offset_t to = round_page(toa);
1513
1514 for (pg = from; pg < to; pg += PAGE_SIZE) {
1515 p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1516 bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1517 pmap_kremove(pg);
1518 vm_page_free(p);
1519 --bp->b_npages;
1520 }
1521}