vfs_bio.c (12799) vs. vfs_bio.c (12819): deleted lines are shown first, added lines second, within each changed hunk; unchanged context appears once.
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 * John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD. Other use
17 * is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $Id: vfs_bio.c,v 1.76 1995/12/11 04:56:05 dyson Exp $
21 * $Id: vfs_bio.c,v 1.78 1995/12/13 03:47:01 dyson Exp $
22 */
23
24/*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme. Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author: John S. Dyson
31 * Significant help during the development and debugging phases
 32 * has been provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/sysproto.h>
39#include <sys/kernel.h>
40#include <sys/sysctl.h>
41#include <sys/proc.h>
42#include <sys/vnode.h>
43#include <sys/vmmeter.h>
44#include <vm/vm.h>
45#include <vm/vm_param.h>
46#include <vm/vm_prot.h>
47#include <vm/vm_kern.h>
48#include <vm/vm_pageout.h>
49#include <vm/vm_page.h>
50#include <vm/vm_object.h>
51#include <vm/vm_extern.h>
52#include <sys/buf.h>
53#include <sys/mount.h>
54#include <sys/malloc.h>
55#include <sys/resourcevar.h>
56#include <sys/proc.h>
57
58#include <miscfs/specfs/specdev.h>
59
60static void vfs_update __P((void));
61struct proc *updateproc;
61static struct proc *updateproc;
62static struct kproc_desc up_kp = {
63 "update",
64 vfs_update,
65 &updateproc
66};
67SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68
69struct buf *buf; /* buffer header pool */
70struct swqueue bswlist;
71
72int count_lock_queue __P((void));
73void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
74void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
75void vfs_clean_pages(struct buf * bp);
73static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
74 vm_offset_t to);
75static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
76 vm_offset_t to);
77static void vfs_clean_pages(struct buf * bp);
76static void vfs_setdirty(struct buf *bp);
77
78int needsbuffer;
79
80/*
81 * Internal update daemon, process 3
82 * The variable vfs_update_wakeup allows for internal syncs.
83 */
84int vfs_update_wakeup;
85
86
87/*
88 * buffers base kva
89 */
90caddr_t buffers_kva;
91
92/*
93 * bogus page -- for I/O to/from partially complete buffers
94 * this is a temporary solution to the problem, but it is not
95 * really that bad. it would be better to split the buffer
96 * for input in the case of buffers partially already in memory,
97 * but the code is intricate enough already.
98 */
99vm_page_t bogus_page;
100vm_offset_t bogus_offset;
102static vm_offset_t bogus_offset;
101
102int bufspace, maxbufspace;
104static int bufspace, maxbufspace;
103
104struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
105struct bqueues bufqueues[BUFFER_QUEUES];
106static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
107static struct bqueues bufqueues[BUFFER_QUEUES];
106
107#define BUF_MAXUSE 8
108
109/*
110 * Initialize buffer headers and related structures.
111 */
112void
113bufinit()
114{
115 struct buf *bp;
116 int i;
117
118 TAILQ_INIT(&bswlist);
119 LIST_INIT(&invalhash);
120
121 /* first, make a null hash table */
122 for (i = 0; i < BUFHSZ; i++)
123 LIST_INIT(&bufhashtbl[i]);
124
125 /* next, make a null set of free lists */
126 for (i = 0; i < BUFFER_QUEUES; i++)
127 TAILQ_INIT(&bufqueues[i]);
128
129 buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
130 /* finally, initialize each buffer header and stick on empty q */
131 for (i = 0; i < nbuf; i++) {
132 bp = &buf[i];
133 bzero(bp, sizeof *bp);
134 bp->b_flags = B_INVAL; /* we're just an empty header */
135 bp->b_dev = NODEV;
136 bp->b_rcred = NOCRED;
137 bp->b_wcred = NOCRED;
138 bp->b_qindex = QUEUE_EMPTY;
139 bp->b_vnbufs.le_next = NOLIST;
140 bp->b_data = buffers_kva + i * MAXBSIZE;
141 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
142 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
143 }
144/*
145 * maxbufspace is currently calculated to support all filesystem blocks
146 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
147 * cache is still the same as it would be for 8K filesystems. This
148 * keeps the size of the buffer cache "in check" for big block filesystems.
149 */
150 maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
151
152 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
153 bogus_page = vm_page_alloc(kernel_object,
154 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
155 VM_ALLOC_NORMAL);
156
157}
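/*
 * A quick worked example of the maxbufspace formula above (assuming, purely
 * for illustration, nbuf == 1024 and a 4K PAGE_SIZE):
 *
 *	maxbufspace = 2 * (1024 + 8) * 4096 = 8454144 bytes
 *
 * i.e. a little over 8MB, enough to back roughly 1024 buffers of 8K
 * filesystem blocks, which matches the intent described in the comment.
 */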
158
159/*
160 * remove the buffer from the appropriate free list
161 */
162void
163bremfree(struct buf * bp)
164{
165 int s = splbio();
166
167 if (bp->b_qindex != QUEUE_NONE) {
168 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
169 bp->b_qindex = QUEUE_NONE;
170 } else {
171 panic("bremfree: removing a buffer when not on a queue");
172 }
173 splx(s);
174}
175
176/*
177 * Get a buffer with the specified data. Look in the cache first.
178 */
179int
180bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
181 struct buf ** bpp)
182{
183 struct buf *bp;
184
185 bp = getblk(vp, blkno, size, 0, 0);
186 *bpp = bp;
187
188 /* if not found in cache, do some I/O */
189 if ((bp->b_flags & B_CACHE) == 0) {
190 if (curproc != NULL)
191 curproc->p_stats->p_ru.ru_inblock++;
192 bp->b_flags |= B_READ;
193 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
194 if (bp->b_rcred == NOCRED) {
195 if (cred != NOCRED)
196 crhold(cred);
197 bp->b_rcred = cred;
198 }
199 vfs_busy_pages(bp, 0);
200 VOP_STRATEGY(bp);
201 return (biowait(bp));
202 }
203 return (0);
204}
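/*
 * A sketch of the usual caller-side pattern for bread(), with hypothetical
 * identifiers (vp, lbn, bsize and the data handling belong to the caller):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(use bp->b_data for bsize bytes)
 *	brelse(bp);		release, or bdwrite(bp) if the block was modified
 */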
205
206/*
207 * Operates like bread, but also starts asynchronous I/O on
208 * read-ahead blocks.
209 */
210int
211breadn(struct vnode * vp, daddr_t blkno, int size,
212 daddr_t * rablkno, int *rabsize,
213 int cnt, struct ucred * cred, struct buf ** bpp)
214{
215 struct buf *bp, *rabp;
216 int i;
217 int rv = 0, readwait = 0;
218
219 *bpp = bp = getblk(vp, blkno, size, 0, 0);
220
221 /* if not found in cache, do some I/O */
222 if ((bp->b_flags & B_CACHE) == 0) {
223 if (curproc != NULL)
224 curproc->p_stats->p_ru.ru_inblock++;
225 bp->b_flags |= B_READ;
226 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
227 if (bp->b_rcred == NOCRED) {
228 if (cred != NOCRED)
229 crhold(cred);
230 bp->b_rcred = cred;
231 }
232 vfs_busy_pages(bp, 0);
233 VOP_STRATEGY(bp);
234 ++readwait;
235 }
236 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
237 if (inmem(vp, *rablkno))
238 continue;
239 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
240
241 if ((rabp->b_flags & B_CACHE) == 0) {
242 if (curproc != NULL)
243 curproc->p_stats->p_ru.ru_inblock++;
244 rabp->b_flags |= B_READ | B_ASYNC;
245 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
246 if (rabp->b_rcred == NOCRED) {
247 if (cred != NOCRED)
248 crhold(cred);
249 rabp->b_rcred = cred;
250 }
251 vfs_busy_pages(rabp, 0);
252 VOP_STRATEGY(rabp);
253 } else {
254 brelse(rabp);
255 }
256 }
257
258 if (readwait) {
259 rv = biowait(bp);
260 }
261 return (rv);
262}
263
264/*
265 * Write, release buffer on completion. (Done by iodone
266 * if async.)
267 */
268int
269bwrite(struct buf * bp)
270{
271 int oldflags = bp->b_flags;
272
273 if (bp->b_flags & B_INVAL) {
274 brelse(bp);
275 return (0);
276 }
277 if (!(bp->b_flags & B_BUSY))
278 panic("bwrite: buffer is not busy???");
279
280 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
281 bp->b_flags |= B_WRITEINPROG;
282
283 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
284 reassignbuf(bp, bp->b_vp);
285 }
286
287 bp->b_vp->v_numoutput++;
288 vfs_busy_pages(bp, 1);
289 if (curproc != NULL)
290 curproc->p_stats->p_ru.ru_oublock++;
291 VOP_STRATEGY(bp);
292
293 if ((oldflags & B_ASYNC) == 0) {
294 int rtval = biowait(bp);
295
296 if (oldflags & B_DELWRI) {
297 reassignbuf(bp, bp->b_vp);
298 }
299 brelse(bp);
300 return (rtval);
301 }
302 return (0);
303}
304
305int
306vn_bwrite(ap)
307 struct vop_bwrite_args *ap;
308{
309 return (bwrite(ap->a_bp));
310}
311
312/*
313 * Delayed write. (Buffer is marked dirty).
314 */
315void
316bdwrite(struct buf * bp)
317{
318
319 if ((bp->b_flags & B_BUSY) == 0) {
320 panic("bdwrite: buffer is not busy");
321 }
322 if (bp->b_flags & B_INVAL) {
323 brelse(bp);
324 return;
325 }
326 if (bp->b_flags & B_TAPE) {
327 bawrite(bp);
328 return;
329 }
330 bp->b_flags &= ~(B_READ|B_RELBUF);
331 if ((bp->b_flags & B_DELWRI) == 0) {
332 bp->b_flags |= B_DONE | B_DELWRI;
333 reassignbuf(bp, bp->b_vp);
334 }
335
336 /*
337 * This bmap keeps the system from needing to do the bmap later,
338 * perhaps when the system is attempting to do a sync. Since it
 339 * is likely that the indirect block -- or whatever other data structure
 340 * the filesystem needs -- is still in memory now, it is a good
341 * thing to do this. Note also, that if the pageout daemon is
342 * requesting a sync -- there might not be enough memory to do
343 * the bmap then... So, this is important to do.
344 */
345 if( bp->b_lblkno == bp->b_blkno) {
346 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
347 }
348
349 /*
350 * Set the *dirty* buffer range based upon the VM system dirty pages.
351 */
352 vfs_setdirty(bp);
353
354 /*
355 * We need to do this here to satisfy the vnode_pager and the
356 * pageout daemon, so that it thinks that the pages have been
357 * "cleaned". Note that since the pages are in a delayed write
358 * buffer -- the VFS layer "will" see that the pages get written
359 * out on the next sync, or perhaps the cluster will be completed.
360 */
361 vfs_clean_pages(bp);
362 brelse(bp);
363 return;
364}
365
366/*
367 * Asynchronous write.
368 * Start output on a buffer, but do not wait for it to complete.
369 * The buffer is released when the output completes.
370 */
371void
372bawrite(struct buf * bp)
373{
374 bp->b_flags |= B_ASYNC;
375 (void) VOP_BWRITE(bp);
376}
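/*
 * Taken together, the three write paths above differ only in when the
 * caller gives up the buffer (sketch; "bp" is a busy buffer obtained from
 * bread() or getblk()):
 *
 *	error = bwrite(bp);	synchronous -- waits for the I/O to finish
 *	bdwrite(bp);		delayed -- marks B_DELWRI, written out later
 *	bawrite(bp);		asynchronous -- starts the I/O, released by biodone
 */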
377
378/*
379 * Release a buffer.
380 */
381void
382brelse(struct buf * bp)
383{
384 int s;
385
386 if (bp->b_flags & B_CLUSTER) {
387 relpbuf(bp);
388 return;
389 }
390 /* anyone need a "free" block? */
391 s = splbio();
392
393 if (needsbuffer) {
394 needsbuffer = 0;
395 wakeup(&needsbuffer);
396 }
397
398 /* anyone need this block? */
399 if (bp->b_flags & B_WANTED) {
400 bp->b_flags &= ~(B_WANTED | B_AGE);
401 wakeup(bp);
402 }
403
404 if (bp->b_flags & B_LOCKED)
405 bp->b_flags &= ~B_ERROR;
406
407 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
408 (bp->b_bufsize <= 0)) {
409 bp->b_flags |= B_INVAL;
410 bp->b_flags &= ~(B_DELWRI | B_CACHE);
411 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
412 brelvp(bp);
413 }
414
415 /*
416 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
417 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
418 * but the VM object is kept around. The B_NOCACHE flag is used to
419 * invalidate the pages in the VM object.
420 */
421 if (bp->b_flags & B_VMIO) {
422 vm_ooffset_t foff;
423 vm_object_t obj;
424 int i, resid;
425 vm_page_t m;
426 struct vnode *vp;
427 int iototal = bp->b_bufsize;
428
429 vp = bp->b_vp;
430 if (!vp)
431 panic("brelse: missing vp");
432
433 if (bp->b_npages) {
434 vm_pindex_t poff;
435 obj = (vm_object_t) vp->v_object;
436 if (vp->v_type == VBLK)
437 foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
438 else
439 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
440 poff = OFF_TO_IDX(foff);
441 for (i = 0; i < bp->b_npages; i++) {
442 m = bp->b_pages[i];
443 if (m == bogus_page) {
444 m = vm_page_lookup(obj, poff + i);
445 if (!m) {
446 panic("brelse: page missing\n");
447 }
448 bp->b_pages[i] = m;
449 pmap_qenter(trunc_page(bp->b_data),
450 bp->b_pages, bp->b_npages);
451 }
452 resid = IDX_TO_OFF(m->pindex+1) - foff;
453 if (resid > iototal)
454 resid = iototal;
455 if (resid > 0) {
456 /*
457 * Don't invalidate the page if the local machine has already
458 * modified it. This is the lesser of two evils, and should
459 * be fixed.
460 */
461 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
462 vm_page_test_dirty(m);
463 if (m->dirty == 0) {
464 vm_page_set_invalid(m, (vm_offset_t) foff, resid);
465 if (m->valid == 0)
466 vm_page_protect(m, VM_PROT_NONE);
467 }
468 }
469 }
470 foff += resid;
471 iototal -= resid;
472 }
473 }
474
475 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
476 for(i = 0; i < bp->b_npages; i++) {
477 m = bp->b_pages[i];
478 --m->bmapped;
479 if (m->bmapped == 0) {
480 if (m->flags & PG_WANTED) {
481 m->flags &= ~PG_WANTED;
482 wakeup(m);
483 }
484 if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) {
485 if (m->object->flags & OBJ_MIGHTBEDIRTY) {
486 vm_page_test_dirty(m);
487 }
488 /*
489 * if page isn't valid, no sense in keeping it around
490 */
491 if (m->valid == 0) {
492 vm_page_protect(m, VM_PROT_NONE);
493 vm_page_free(m);
494 /*
495 * if page isn't dirty and hasn't been referenced by
496 * a process, then cache it
497 */
498 } else if ((m->dirty & m->valid) == 0 &&
499 (m->flags & PG_REFERENCED) == 0 &&
500 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
501 vm_page_cache(m);
502 /*
503 * otherwise activate it
504 */
505 } else if ((m->flags & PG_ACTIVE) == 0) {
506 vm_page_activate(m);
507 m->act_count = 0;
508 }
509 }
510 }
511 }
512 bufspace -= bp->b_bufsize;
513 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
514 bp->b_npages = 0;
515 bp->b_bufsize = 0;
516 bp->b_flags &= ~B_VMIO;
517 if (bp->b_vp)
518 brelvp(bp);
519 }
520 }
521 if (bp->b_qindex != QUEUE_NONE)
522 panic("brelse: free buffer onto another queue???");
523
524 /* enqueue */
525 /* buffers with no memory */
526 if (bp->b_bufsize == 0) {
527 bp->b_qindex = QUEUE_EMPTY;
528 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
529 LIST_REMOVE(bp, b_hash);
530 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
531 bp->b_dev = NODEV;
532 /* buffers with junk contents */
533 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
534 bp->b_qindex = QUEUE_AGE;
535 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
536 LIST_REMOVE(bp, b_hash);
537 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
538 bp->b_dev = NODEV;
539 /* buffers that are locked */
540 } else if (bp->b_flags & B_LOCKED) {
541 bp->b_qindex = QUEUE_LOCKED;
542 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
543 /* buffers with stale but valid contents */
544 } else if (bp->b_flags & B_AGE) {
545 bp->b_qindex = QUEUE_AGE;
546 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
 547	/* buffers with valid and quite potentially reusable contents */
548 } else {
549 bp->b_qindex = QUEUE_LRU;
550 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
551 }
552
553 /* unlock */
554 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
555 splx(s);
556}
557
558/*
559 * Check to see if a block is currently memory resident.
560 */
561__inline struct buf *
562gbincore(struct vnode * vp, daddr_t blkno)
563{
564 struct buf *bp;
565 struct bufhashhdr *bh;
566
567 bh = BUFHASH(vp, blkno);
568 bp = bh->lh_first;
569
570 /* Search hash chain */
571 while (bp != NULL) {
572 /* hit */
573 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
574 (bp->b_flags & B_INVAL) == 0) {
575 break;
576 }
577 bp = bp->b_hash.le_next;
578 }
579 return (bp);
580}
581
582/*
583 * this routine implements clustered async writes for
584 * clearing out B_DELWRI buffers... This is much better
585 * than the old way of writing only one buffer at a time.
586 */
587int
588vfs_bio_awrite(struct buf * bp)
589{
590 int i;
591 daddr_t lblkno = bp->b_lblkno;
592 struct vnode *vp = bp->b_vp;
593 int s;
594 int ncl;
595 struct buf *bpa;
596 int nwritten;
597
598 s = splbio();
599 /*
600 * right now we support clustered writing only to regular files
601 */
602 if ((vp->v_type == VREG) &&
603 (vp->v_mount != 0) && /* Only on nodes that have the size info */
604 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
605 int size;
606 int maxcl;
607
608 size = vp->v_mount->mnt_stat.f_iosize;
609 maxcl = MAXPHYS / size;
610
611 for (i = 1; i < maxcl; i++) {
612 if ((bpa = gbincore(vp, lblkno + i)) &&
613 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
614 (B_DELWRI | B_CLUSTEROK)) &&
615 (bpa->b_bufsize == size)) {
616 if ((bpa->b_blkno == bpa->b_lblkno) ||
617 (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
618 break;
619 } else {
620 break;
621 }
622 }
623 ncl = i;
624 /*
625 * this is a possible cluster write
626 */
627 if (ncl != 1) {
628 nwritten = cluster_wbuild(vp, size, lblkno, ncl);
629 splx(s);
630 return nwritten;
631 }
632 }
633 bremfree(bp);
634 splx(s);
635 /*
636 * default (old) behavior, writing out only one block
637 */
638 bp->b_flags |= B_BUSY | B_ASYNC;
639 nwritten = bp->b_bufsize;
640 (void) VOP_BWRITE(bp);
641 return nwritten;
642}
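/*
 * Example of the cluster sizing above: with an 8K f_iosize and a 64K
 * MAXPHYS (both assumed here purely for illustration), maxcl == 8, so at
 * most eight contiguous, delayed-write, cluster-ok buffers are handed to
 * cluster_wbuild() in one shot.
 */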
643
644
645/*
646 * Find a buffer header which is available for use.
647 */
648static struct buf *
649getnewbuf(int slpflag, int slptimeo, int doingvmio)
650{
651 struct buf *bp;
652 int s;
653 int nbyteswritten = 0;
654
655 s = splbio();
656start:
657 if (bufspace >= maxbufspace)
658 goto trytofreespace;
659
660 /* can we constitute a new buffer? */
661 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
662 if (bp->b_qindex != QUEUE_EMPTY)
663 panic("getnewbuf: inconsistent EMPTY queue");
664 bremfree(bp);
665 goto fillbuf;
666 }
667trytofreespace:
668 /*
669 * We keep the file I/O from hogging metadata I/O
670 * This is desirable because file data is cached in the
671 * VM/Buffer cache even if a buffer is freed.
672 */
673 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
674 if (bp->b_qindex != QUEUE_AGE)
675 panic("getnewbuf: inconsistent AGE queue");
676 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
677 if (bp->b_qindex != QUEUE_LRU)
678 panic("getnewbuf: inconsistent LRU queue");
679 }
680 if (!bp) {
681 /* wait for a free buffer of any kind */
682 needsbuffer = 1;
683 tsleep(&needsbuffer,
684 (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
685 splx(s);
686 return (0);
687 }
688
689 if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
690 --bp->b_usecount;
691 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
692 if (bufqueues[QUEUE_LRU].tqh_first != NULL) {
693 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
694 goto start;
695 }
696 }
697
698 /* if we are a delayed write, convert to an async write */
699 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
700 nbyteswritten += vfs_bio_awrite(bp);
701 if (!slpflag && !slptimeo) {
702 splx(s);
703 return (0);
704 }
705 goto start;
706 }
707
708 if (bp->b_flags & B_WANTED) {
709 bp->b_flags &= ~B_WANTED;
710 wakeup(bp);
711 }
712 bremfree(bp);
713
714 if (bp->b_flags & B_VMIO) {
715 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
716 brelse(bp);
717 bremfree(bp);
718 }
719
720 if (bp->b_vp)
721 brelvp(bp);
722
723 /* we are not free, nor do we contain interesting data */
724 if (bp->b_rcred != NOCRED)
725 crfree(bp->b_rcred);
726 if (bp->b_wcred != NOCRED)
727 crfree(bp->b_wcred);
728fillbuf:
729 bp->b_flags |= B_BUSY;
730 LIST_REMOVE(bp, b_hash);
731 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
732 splx(s);
733 if (bp->b_bufsize) {
734 allocbuf(bp, 0);
735 }
736 bp->b_flags = B_BUSY;
737 bp->b_dev = NODEV;
738 bp->b_vp = NULL;
739 bp->b_blkno = bp->b_lblkno = 0;
740 bp->b_iodone = 0;
741 bp->b_error = 0;
742 bp->b_resid = 0;
743 bp->b_bcount = 0;
744 bp->b_npages = 0;
745 bp->b_wcred = bp->b_rcred = NOCRED;
746 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
747 bp->b_dirtyoff = bp->b_dirtyend = 0;
748 bp->b_validoff = bp->b_validend = 0;
749 bp->b_usecount = 2;
750 if (bufspace >= maxbufspace + nbyteswritten) {
751 s = splbio();
752 bp->b_flags |= B_INVAL;
753 brelse(bp);
754 goto trytofreespace;
755 }
756 return (bp);
757}
758
759/*
760 * Check to see if a block is currently memory resident.
761 */
762struct buf *
763incore(struct vnode * vp, daddr_t blkno)
764{
765 struct buf *bp;
766 struct bufhashhdr *bh;
767
768 int s = splbio();
769
770 bh = BUFHASH(vp, blkno);
771 bp = bh->lh_first;
772
773 /* Search hash chain */
774 while (bp != NULL) {
775 /* hit */
776 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
777 (bp->b_flags & B_INVAL) == 0) {
778 break;
779 }
780 bp = bp->b_hash.le_next;
781 }
782 splx(s);
783 return (bp);
784}
785
786/*
787 * Returns true if no I/O is needed to access the
788 * associated VM object. This is like incore except
789 * it also hunts around in the VM system for the data.
790 */
791
792int
793inmem(struct vnode * vp, daddr_t blkno)
794{
795 vm_object_t obj;
796 vm_offset_t toff, tinc;
797 vm_page_t m;
798 vm_ooffset_t off;
799
800 if (incore(vp, blkno))
801 return 1;
802 if (vp->v_mount == NULL)
803 return 0;
804 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
805 return 0;
806
807 obj = vp->v_object;
808 tinc = PAGE_SIZE;
809 if (tinc > vp->v_mount->mnt_stat.f_iosize)
810 tinc = vp->v_mount->mnt_stat.f_iosize;
811 off = blkno * vp->v_mount->mnt_stat.f_iosize;
812
813 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
814
815 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
816 if (!m)
817 return 0;
818 if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
819 return 0;
820 }
821 return 1;
822}
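/*
 * inmem() is what lets breadn() above skip read-ahead blocks whose data is
 * already resident: e.g. for an 8K f_iosize on 4K pages (assumed here), it
 * checks the two pages covering the block and reports true only if every
 * byte of both is valid.
 */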
823
824/*
825 * now we set the dirty range for the buffer --
826 * for NFS -- if the file is mapped and pages have
827 * been written to, let it know. We want the
828 * entire range of the buffer to be marked dirty if
 829 * any of the pages have been written to for consistency
830 * with the b_validoff, b_validend set in the nfs write
831 * code, and used by the nfs read code.
832 */
833static void
834vfs_setdirty(struct buf *bp) {
835 int i;
836 vm_object_t object;
837 vm_offset_t boffset, offset;
838 /*
839 * We qualify the scan for modified pages on whether the
840 * object has been flushed yet. The OBJ_WRITEABLE flag
841 * is not cleared simply by protecting pages off.
842 */
843 if ((bp->b_flags & B_VMIO) &&
844 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
845 /*
846 * test the pages to see if they have been modified directly
847 * by users through the VM system.
848 */
849 for (i = 0; i < bp->b_npages; i++)
850 vm_page_test_dirty(bp->b_pages[i]);
851
852 /*
853 * scan forwards for the first page modified
854 */
855 for (i = 0; i < bp->b_npages; i++) {
856 if (bp->b_pages[i]->dirty) {
857 break;
858 }
859 }
860 boffset = (i << PAGE_SHIFT);
861 if (boffset < bp->b_dirtyoff) {
862 bp->b_dirtyoff = boffset;
863 }
864
865 /*
866 * scan backwards for the last page modified
867 */
868 for (i = bp->b_npages - 1; i >= 0; --i) {
869 if (bp->b_pages[i]->dirty) {
870 break;
871 }
872 }
873 boffset = (i + 1);
874 offset = boffset + bp->b_pages[0]->pindex;
875 if (offset >= object->size)
876 boffset = object->size - bp->b_pages[0]->pindex;
877 if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
878 bp->b_dirtyend = (boffset << PAGE_SHIFT);
879 }
880}
881
882/*
883 * Get a block given a specified block and offset into a file/device.
884 */
885struct buf *
886getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
887{
888 struct buf *bp;
889 int s;
890 struct bufhashhdr *bh;
891
892 s = splbio();
893loop:
894 if ((bp = gbincore(vp, blkno))) {
895 if (bp->b_flags & B_BUSY) {
896 bp->b_flags |= B_WANTED;
897 if (bp->b_usecount < BUF_MAXUSE)
898 ++bp->b_usecount;
899 if (!tsleep(bp,
900 (PRIBIO + 1) | slpflag, "getblk", slptimeo))
901 goto loop;
902
903 splx(s);
904 return (struct buf *) NULL;
905 }
906 bp->b_flags |= B_BUSY | B_CACHE;
907 bremfree(bp);
908
909 /*
 910		 * check for size inconsistencies (note that they shouldn't happen
911 * but do when filesystems don't handle the size changes correctly.)
912 * We are conservative on metadata and don't just extend the buffer
913 * but write and re-constitute it.
914 */
915
916 if (bp->b_bcount != size) {
917 if (bp->b_flags & B_VMIO) {
918 allocbuf(bp, size);
919 } else {
920 bp->b_flags |= B_NOCACHE;
921 VOP_BWRITE(bp);
922 goto loop;
923 }
924 }
925
926 /*
927 * make sure that all pages in the buffer are valid, if they
928 * aren't, clear the cache flag.
929 * ASSUMPTION:
930 * if the buffer is greater than 1 page in size, it is assumed
931 * that the buffer address starts on a page boundary...
932 */
933 if (bp->b_flags & B_VMIO) {
934 int szleft, i;
935 szleft = size;
936 for (i=0;i<bp->b_npages;i++) {
937 if (szleft > PAGE_SIZE) {
938 if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) !=
939 VM_PAGE_BITS_ALL) {
940 bp->b_flags &= ~(B_CACHE|B_DONE);
941 break;
942 }
943 szleft -= PAGE_SIZE;
944 } else {
945 if (!vm_page_is_valid(bp->b_pages[i],
946 (((vm_offset_t) bp->b_data) & PAGE_MASK),
947 szleft)) {
948 bp->b_flags &= ~(B_CACHE|B_DONE);
949 break;
950 }
951 szleft = 0;
952 }
953 }
954 }
955 if (bp->b_usecount < BUF_MAXUSE)
956 ++bp->b_usecount;
957 splx(s);
958 return (bp);
959 } else {
960 vm_object_t obj;
961 int doingvmio;
962
963 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
964 doingvmio = 1;
965 } else {
966 doingvmio = 0;
967 }
968 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
969 if (slpflag || slptimeo) {
970 splx(s);
971 return NULL;
972 }
973 goto loop;
974 }
975
976 /*
977 * This code is used to make sure that a buffer is not
978 * created while the getnewbuf routine is blocked.
979 * Normally the vnode is locked so this isn't a problem.
980 * VBLK type I/O requests, however, don't lock the vnode.
981 */
982 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
983 bp->b_flags |= B_INVAL;
984 brelse(bp);
985 goto loop;
986 }
987
988 /*
989 * Insert the buffer into the hash, so that it can
990 * be found by incore.
991 */
992 bp->b_blkno = bp->b_lblkno = blkno;
993 bgetvp(vp, bp);
994 LIST_REMOVE(bp, b_hash);
995 bh = BUFHASH(vp, blkno);
996 LIST_INSERT_HEAD(bh, bp, b_hash);
997
998 if (doingvmio) {
999 bp->b_flags |= (B_VMIO | B_CACHE);
1000#if defined(VFS_BIO_DEBUG)
1001 if (vp->v_type != VREG)
1002 printf("getblk: vmioing file type %d???\n", vp->v_type);
1003#endif
1004 } else {
1005 bp->b_flags &= ~B_VMIO;
1006 }
1007 splx(s);
1008
1009 allocbuf(bp, size);
1010 return (bp);
1011 }
1012}
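/*
 * A sketch of getblk() used for a freshly allocated block that the caller
 * fills in itself, so no read is issued (identifiers are placeholders):
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	vfs_bio_clrbuf(bp);		zero any part that isn't already valid
 *	bcopy(src, bp->b_data, len);	len <= bsize, caller-supplied data
 *	bdwrite(bp);			mark it dirty; written on a later sync
 */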
1013
1014/*
1015 * Get an empty, disassociated buffer of given size.
1016 */
1017struct buf *
1018geteblk(int size)
1019{
1020 struct buf *bp;
1021
1022 while ((bp = getnewbuf(0, 0, 0)) == 0);
1023 allocbuf(bp, size);
1024 bp->b_flags |= B_INVAL;
1025 return (bp);
1026}
1027
1028/*
1029 * This code constitutes the buffer memory from either anonymous system
1030 * memory (in the case of non-VMIO operations) or from an associated
1031 * VM object (in the case of VMIO operations).
1032 *
1033 * Note that this code is tricky, and has many complications to resolve
 1034 * deadlock or inconsistent data situations. Tread lightly!!!
1035 *
1036 * Modify the length of a buffer's underlying buffer storage without
1037 * destroying information (unless, of course the buffer is shrinking).
1038 */
1039int
1040allocbuf(struct buf * bp, int size)
1041{
1042
1043 int s;
1044 int newbsize, mbsize;
1045 int i;
1046
1047 if (!(bp->b_flags & B_BUSY))
1048 panic("allocbuf: buffer not busy");
1049
1050 if ((bp->b_flags & B_VMIO) == 0) {
1051 /*
1052 * Just get anonymous memory from the kernel
1053 */
1054 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1055 newbsize = round_page(size);
1056
1057 if (newbsize < bp->b_bufsize) {
1058 vm_hold_free_pages(
1059 bp,
1060 (vm_offset_t) bp->b_data + newbsize,
1061 (vm_offset_t) bp->b_data + bp->b_bufsize);
1062 } else if (newbsize > bp->b_bufsize) {
1063 vm_hold_load_pages(
1064 bp,
1065 (vm_offset_t) bp->b_data + bp->b_bufsize,
1066 (vm_offset_t) bp->b_data + newbsize);
1067 }
1068 } else {
1069 vm_page_t m;
1070 int desiredpages;
1071
1072 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1073 desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1074
1075 if (newbsize < bp->b_bufsize) {
1076 if (desiredpages < bp->b_npages) {
1077 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1078 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1079 for (i = desiredpages; i < bp->b_npages; i++) {
1080 m = bp->b_pages[i];
1081 s = splhigh();
1082 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1083 m->flags |= PG_WANTED;
1084 tsleep(m, PVM, "biodep", 0);
1085 }
1086 splx(s);
1087
1088 if (m->bmapped == 0) {
1089 printf("allocbuf: bmapped is zero for page %d\n", i);
1090 panic("allocbuf: error");
1091 }
1092 --m->bmapped;
1093 if (m->bmapped == 0) {
1094 vm_page_protect(m, VM_PROT_NONE);
1095 vm_page_free(m);
1096 }
1097 bp->b_pages[i] = NULL;
1098 }
1099 bp->b_npages = desiredpages;
1100 }
1101 } else if (newbsize > bp->b_bufsize) {
1102 vm_object_t obj;
1103 vm_offset_t tinc, toff;
1104 vm_ooffset_t off;
1105 vm_pindex_t objoff;
1106 int pageindex, curbpnpages;
1107 struct vnode *vp;
1108 int bsize;
1109
1110 vp = bp->b_vp;
1111
1112 if (vp->v_type == VBLK)
1113 bsize = DEV_BSIZE;
1114 else
1115 bsize = vp->v_mount->mnt_stat.f_iosize;
1116
1117 if (bp->b_npages < desiredpages) {
1118 obj = vp->v_object;
1119 tinc = PAGE_SIZE;
1120 if (tinc > bsize)
1121 tinc = bsize;
1122 off = (vm_ooffset_t) bp->b_lblkno * bsize;
1123 doretry:
1124 curbpnpages = bp->b_npages;
1125 bp->b_flags |= B_CACHE;
1126 for (toff = 0; toff < newbsize; toff += tinc) {
1127 int bytesinpage;
1128
1129 pageindex = toff >> PAGE_SHIFT;
1130 objoff = OFF_TO_IDX(off + toff);
1131 if (pageindex < curbpnpages) {
1132
1133 m = bp->b_pages[pageindex];
1134 if (m->pindex != objoff)
1135 panic("allocbuf: page changed offset??!!!?");
1136 bytesinpage = tinc;
1137 if (tinc > (newbsize - toff))
1138 bytesinpage = newbsize - toff;
1139 if (!vm_page_is_valid(m,
1140 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1141 bytesinpage)) {
1142 bp->b_flags &= ~B_CACHE;
1143 }
1144 if ((m->flags & PG_ACTIVE) == 0) {
1145 vm_page_activate(m);
1146 m->act_count = 0;
1147 }
1148 continue;
1149 }
1150 m = vm_page_lookup(obj, objoff);
1151 if (!m) {
1152 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1153 if (!m) {
1154 int j;
1155
1156 for (j = bp->b_npages; j < pageindex; j++) {
1157 PAGE_WAKEUP(bp->b_pages[j]);
1158 }
1159 VM_WAIT;
1160 goto doretry;
1161 }
1162 vm_page_activate(m);
1163 m->act_count = 0;
1164 m->valid = 0;
1165 bp->b_flags &= ~B_CACHE;
1166 } else if (m->flags & PG_BUSY) {
1167 int j;
1168
1169 for (j = bp->b_npages; j < pageindex; j++) {
1170 PAGE_WAKEUP(bp->b_pages[j]);
1171 }
1172
1173 s = splbio();
1174 m->flags |= PG_WANTED;
1175 tsleep(m, PVM, "pgtblk", 0);
1176 splx(s);
1177
1178 goto doretry;
1179 } else {
1180 if ((curproc != pageproc) &&
1181 (m->flags & PG_CACHE) &&
1182 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1183 pagedaemon_wakeup();
1184 }
1185 bytesinpage = tinc;
1186 if (tinc > (newbsize - toff))
1187 bytesinpage = newbsize - toff;
1188 if (!vm_page_is_valid(m,
1189 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1190 bytesinpage)) {
1191 bp->b_flags &= ~B_CACHE;
1192 }
1193 if ((m->flags & PG_ACTIVE) == 0) {
1194 vm_page_activate(m);
1195 m->act_count = 0;
1196 }
1197 m->flags |= PG_BUSY;
1198 }
1199 bp->b_pages[pageindex] = m;
1200 curbpnpages = pageindex + 1;
1201 }
1202 for (i = bp->b_npages; i < curbpnpages; i++) {
1203 m = bp->b_pages[i];
1204 m->bmapped++;
1205 PAGE_WAKEUP(m);
1206 }
1207 bp->b_npages = curbpnpages;
1208 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1209 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1210 bp->b_data += off & (PAGE_SIZE - 1);
1211 }
1212 }
1213 }
1214 bufspace += (newbsize - bp->b_bufsize);
1215 bp->b_bufsize = newbsize;
1216 bp->b_bcount = size;
1217 return 1;
1218}
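/*
 * Sizing example for the non-VMIO case above (assuming a 4K PAGE_SIZE):
 * allocbuf(bp, 6144) rounds the backing store up to a page boundary, so
 * b_bufsize becomes 8192 while b_bcount stays at the requested 6144.
 */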
1219
1220/*
1221 * Wait for buffer I/O completion, returning error status.
1222 */
1223int
1224biowait(register struct buf * bp)
1225{
1226 int s;
1227
1228 s = splbio();
1229 while ((bp->b_flags & B_DONE) == 0)
1230 tsleep(bp, PRIBIO, "biowait", 0);
1231 splx(s);
1232 if (bp->b_flags & B_EINTR) {
1233 bp->b_flags &= ~B_EINTR;
1234 return (EINTR);
1235 }
1236 if (bp->b_flags & B_ERROR) {
1237 return (bp->b_error ? bp->b_error : EIO);
1238 } else {
1239 return (0);
1240 }
1241}
1242
1243/*
1244 * Finish I/O on a buffer, calling an optional function.
1245 * This is usually called from interrupt level, so process blocking
1246 * is not *a good idea*.
1247 */
1248void
1249biodone(register struct buf * bp)
1250{
1251 int s;
1252
1253 s = splbio();
1254 if (!(bp->b_flags & B_BUSY))
1255 panic("biodone: buffer not busy");
1256
1257 if (bp->b_flags & B_DONE) {
1258 splx(s);
1259 printf("biodone: buffer already done\n");
1260 return;
1261 }
1262 bp->b_flags |= B_DONE;
1263
1264 if ((bp->b_flags & B_READ) == 0) {
1265 vwakeup(bp);
1266 }
1267#ifdef BOUNCE_BUFFERS
1268 if (bp->b_flags & B_BOUNCE)
1269 vm_bounce_free(bp);
1270#endif
1271
1272 /* call optional completion function if requested */
1273 if (bp->b_flags & B_CALL) {
1274 bp->b_flags &= ~B_CALL;
1275 (*bp->b_iodone) (bp);
1276 splx(s);
1277 return;
1278 }
1279 if (bp->b_flags & B_VMIO) {
1280 int i, resid;
1281 vm_ooffset_t foff;
1282 vm_page_t m;
1283 vm_object_t obj;
1284 int iosize;
1285 struct vnode *vp = bp->b_vp;
1286
1287 if (vp->v_type == VBLK)
1288 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1289 else
1290 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1291 obj = vp->v_object;
1292 if (!obj) {
1293 panic("biodone: no object");
1294 }
1295#if defined(VFS_BIO_DEBUG)
1296 if (obj->paging_in_progress < bp->b_npages) {
1297 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1298 obj->paging_in_progress, bp->b_npages);
1299 }
1300#endif
1301 iosize = bp->b_bufsize;
1302 for (i = 0; i < bp->b_npages; i++) {
1303 int bogusflag = 0;
1304 m = bp->b_pages[i];
1305 if (m == bogus_page) {
1306 bogusflag = 1;
1307 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1308 if (!m) {
1309#if defined(VFS_BIO_DEBUG)
1310 printf("biodone: page disappeared\n");
1311#endif
1312 --obj->paging_in_progress;
1313 continue;
1314 }
1315 bp->b_pages[i] = m;
1316 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1317 }
1318#if defined(VFS_BIO_DEBUG)
1319 if (OFF_TO_IDX(foff) != m->pindex) {
1320 printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1321 }
1322#endif
1323 resid = IDX_TO_OFF(m->pindex + 1) - foff;
1324 if (resid > iosize)
1325 resid = iosize;
1326 /*
1327 * In the write case, the valid and clean bits are
1328 * already changed correctly, so we only need to do this
1329 * here in the read case.
1330 */
1331 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1332 vm_page_set_validclean(m,
1333 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1334 }
1335
1336 /*
1337 * when debugging new filesystems or buffer I/O methods, this
1338 * is the most common error that pops up. if you see this, you
1339 * have not set the page busy flag correctly!!!
1340 */
1341 if (m->busy == 0) {
1342 printf("biodone: page busy < 0, "
1343 "pindex: %d, foff: 0x(%x,%x), "
1344 "resid: %d, index: %d\n",
1345 (int) m->pindex, (int)(foff >> 32),
1346 (int) foff & 0xffffffff, resid, i);
1347 if (vp->v_type != VBLK)
1348 printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n",
1349 bp->b_vp->v_mount->mnt_stat.f_iosize,
1350 (int) bp->b_lblkno,
1351 bp->b_flags, bp->b_npages);
1352 else
1353 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1354 (int) bp->b_lblkno,
1355 bp->b_flags, bp->b_npages);
1356 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1357 m->valid, m->dirty, m->bmapped);
1358 panic("biodone: page busy < 0\n");
1359 }
1360 --m->busy;
1361 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1362 m->flags &= ~PG_WANTED;
1363 wakeup(m);
1364 }
1365 --obj->paging_in_progress;
1366 foff += resid;
1367 iosize -= resid;
1368 }
1369 if (obj && obj->paging_in_progress == 0 &&
1370 (obj->flags & OBJ_PIPWNT)) {
1371 obj->flags &= ~OBJ_PIPWNT;
1372 wakeup(obj);
1373 }
1374 }
1375 /*
1376 * For asynchronous completions, release the buffer now. The brelse
1377 * checks for B_WANTED and will do the wakeup there if necessary - so
1378 * no need to do a wakeup here in the async case.
1379 */
1380
1381 if (bp->b_flags & B_ASYNC) {
1382 brelse(bp);
1383 } else {
1384 wakeup(bp);
1385 }
1386 splx(s);
1387}
1388
1389int
1390count_lock_queue()
1391{
1392 int count;
1393 struct buf *bp;
1394
1395 count = 0;
1396 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1397 bp != NULL;
1398 bp = bp->b_freelist.tqe_next)
1399 count++;
1400 return (count);
1401}
1402
1403int vfs_update_interval = 30;
1404
1405static void
1406vfs_update()
1407{
1408 (void) spl0(); /* XXX redundant? wrong place? */
1409 while (1) {
1410 tsleep(&vfs_update_wakeup, PUSER, "update",
1411 hz * vfs_update_interval);
1412 vfs_update_wakeup = 0;
1413 sync(curproc, NULL, NULL);
1414 }
1415}
1416
1417static int
1418sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1419{
1420 int error = sysctl_handle_int(oidp,
1421 oidp->oid_arg1, oidp->oid_arg2, req);
1422 if (!error)
1423 wakeup(&vfs_update_wakeup);
1424 return error;
1425}
1426
1427SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1428 &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
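/*
 * The interval is tunable while the system is running; with the sysctl(8)
 * syntax of the period this would look something like:
 *
 *	sysctl -w kern.update=10
 *
 * (the handler above also wakes the update daemon, which syncs and then
 * re-sleeps using the new interval).
 */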
1429
1430
1431/*
1432 * This routine is called in lieu of iodone in the case of
1433 * incomplete I/O. This keeps the busy status for pages
 1434 * consistent.
1435 */
1436void
1437vfs_unbusy_pages(struct buf * bp)
1438{
1439 int i;
1440
1441 if (bp->b_flags & B_VMIO) {
1442 struct vnode *vp = bp->b_vp;
1443 vm_object_t obj = vp->v_object;
1444 vm_ooffset_t foff;
1445
1446 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1447
1448 for (i = 0; i < bp->b_npages; i++) {
1449 vm_page_t m = bp->b_pages[i];
1450
1451 if (m == bogus_page) {
1452 m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1453 if (!m) {
1454 panic("vfs_unbusy_pages: page missing\n");
1455 }
1456 bp->b_pages[i] = m;
1457 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1458 }
1459 --obj->paging_in_progress;
1460 --m->busy;
1461 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1462 m->flags &= ~PG_WANTED;
1463 wakeup(m);
1464 }
1465 }
1466 if (obj->paging_in_progress == 0 &&
1467 (obj->flags & OBJ_PIPWNT)) {
1468 obj->flags &= ~OBJ_PIPWNT;
1469 wakeup(obj);
1470 }
1471 }
1472}
1473
1474/*
1475 * This routine is called before a device strategy routine.
1476 * It is used to tell the VM system that paging I/O is in
1477 * progress, and treat the pages associated with the buffer
1478 * almost as being PG_BUSY. Also the object paging_in_progress
1479 * flag is handled to make sure that the object doesn't become
 1480 * inconsistent.
1481 */
1482void
1483vfs_busy_pages(struct buf * bp, int clear_modify)
1484{
1485 int i;
1486
1487 if (bp->b_flags & B_VMIO) {
1488 vm_object_t obj = bp->b_vp->v_object;
1489 vm_ooffset_t foff;
1490 int iocount = bp->b_bufsize;
1491
1492 if (bp->b_vp->v_type == VBLK)
1493 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1494 else
1495 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1496 vfs_setdirty(bp);
1497 for (i = 0; i < bp->b_npages; i++) {
1498 vm_page_t m = bp->b_pages[i];
1499 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1500
1501 if (resid > iocount)
1502 resid = iocount;
1503 if ((bp->b_flags & B_CLUSTER) == 0) {
1504 obj->paging_in_progress++;
1505 m->busy++;
1506 }
1507 if (clear_modify) {
1508 vm_page_protect(m, VM_PROT_READ);
1509 vm_page_set_validclean(m,
1510 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1511 } else if (bp->b_bcount >= PAGE_SIZE) {
1512 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1513 bp->b_pages[i] = bogus_page;
1514 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1515 }
1516 }
1517 foff += resid;
1518 iocount -= resid;
1519 }
1520 }
1521}
1522
1523/*
1524 * Tell the VM system that the pages associated with this buffer
1525 * are clean. This is used for delayed writes where the data is
 1526 * going to go to disk eventually without additional VM intervention.
1527 */
1528void
1529vfs_clean_pages(struct buf * bp)
1530{
1531 int i;
1532
1533 if (bp->b_flags & B_VMIO) {
1534 vm_ooffset_t foff;
1535 int iocount = bp->b_bufsize;
1536
1537 if (bp->b_vp->v_type == VBLK)
1538 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1539 else
1540 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1541
1542 for (i = 0; i < bp->b_npages; i++) {
1543 vm_page_t m = bp->b_pages[i];
1544 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1545
1546 if (resid > iocount)
1547 resid = iocount;
1548 if (resid > 0) {
1549 vm_page_set_validclean(m,
1550 ((vm_offset_t) foff & (PAGE_SIZE-1)), resid);
1551 }
1552 foff += resid;
1553 iocount -= resid;
1554 }
1555 }
1556}
1557
1558void
1559vfs_bio_clrbuf(struct buf *bp) {
1560 int i;
1561 if( bp->b_flags & B_VMIO) {
1562 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1563 int mask;
1564 mask = 0;
1565 for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1566 mask |= (1 << (i/DEV_BSIZE));
1567 if( bp->b_pages[0]->valid != mask) {
1568 bzero(bp->b_data, bp->b_bufsize);
1569 }
1570 bp->b_pages[0]->valid = mask;
1571 bp->b_resid = 0;
1572 return;
1573 }
1574 for(i=0;i<bp->b_npages;i++) {
1575 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1576 continue;
1577 if( bp->b_pages[i]->valid == 0) {
1578 if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
1579 bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1580 } else {
1581 int j;
1582 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1583 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1584 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1585 }
1586 }
1587 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1588 }
1589 bp->b_resid = 0;
1590 } else {
1591 clrbuf(bp);
1592 }
1593}
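/*
 * Example of the partial-page valid mask computed above: for a buffer with
 * b_bufsize == 3 * DEV_BSIZE (1536 bytes with the usual 512-byte DEV_BSIZE),
 * the loop sets bits 0..2, giving mask == 0x7, i.e. the first three
 * 512-byte chunks of the page are treated as valid.
 */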
1594
1595/*
 1596 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
 1597 * a buffer's address space. The pages are anonymous and are
1598 * not associated with a file object.
1599 */
1600void
1601vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1602{
1603 vm_offset_t pg;
1604 vm_page_t p;
1605 vm_offset_t from = round_page(froma);
1606 vm_offset_t to = round_page(toa);
1607
1608 for (pg = from; pg < to; pg += PAGE_SIZE) {
1609
1610tryagain:
1611
1612 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1613 VM_ALLOC_NORMAL);
1614 if (!p) {
1615 VM_WAIT;
1616 goto tryagain;
1617 }
1618 vm_page_wire(p);
1619 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1620 bp->b_pages[((caddr_t) pg - bp->b_data) >> PAGE_SHIFT] = p;
1621 PAGE_WAKEUP(p);
1622 bp->b_npages++;
1623 }
1624}
1625
1626void
1627vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1628{
1629 vm_offset_t pg;
1630 vm_page_t p;
1631 vm_offset_t from = round_page(froma);
1632 vm_offset_t to = round_page(toa);
1633
1634 for (pg = from; pg < to; pg += PAGE_SIZE) {
1635 int index = ((caddr_t) pg - bp->b_data) >> PAGE_SHIFT;
1636 p = bp->b_pages[index];
1637 bp->b_pages[index] = 0;
1638 pmap_kremove(pg);
1639 vm_page_free(p);
1640 --bp->b_npages;
1641 }
1642}
108
109#define BUF_MAXUSE 8
110
111/*
112 * Initialize buffer headers and related structures.
113 */
114void
115bufinit()
116{
117 struct buf *bp;
118 int i;
119
120 TAILQ_INIT(&bswlist);
121 LIST_INIT(&invalhash);
122
123 /* first, make a null hash table */
124 for (i = 0; i < BUFHSZ; i++)
125 LIST_INIT(&bufhashtbl[i]);
126
127 /* next, make a null set of free lists */
128 for (i = 0; i < BUFFER_QUEUES; i++)
129 TAILQ_INIT(&bufqueues[i]);
130
131 buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
132 /* finally, initialize each buffer header and stick on empty q */
133 for (i = 0; i < nbuf; i++) {
134 bp = &buf[i];
135 bzero(bp, sizeof *bp);
136 bp->b_flags = B_INVAL; /* we're just an empty header */
137 bp->b_dev = NODEV;
138 bp->b_rcred = NOCRED;
139 bp->b_wcred = NOCRED;
140 bp->b_qindex = QUEUE_EMPTY;
141 bp->b_vnbufs.le_next = NOLIST;
142 bp->b_data = buffers_kva + i * MAXBSIZE;
143 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
144 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
145 }
146/*
147 * maxbufspace is currently calculated to support all filesystem blocks
148 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
149 * cache is still the same as it would be for 8K filesystems. This
150 * keeps the size of the buffer cache "in check" for big block filesystems.
151 */
152 maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
153
154 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
155 bogus_page = vm_page_alloc(kernel_object,
156 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
157 VM_ALLOC_NORMAL);
158
159}
160
161/*
162 * remove the buffer from the appropriate free list
163 */
164void
165bremfree(struct buf * bp)
166{
167 int s = splbio();
168
169 if (bp->b_qindex != QUEUE_NONE) {
170 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
171 bp->b_qindex = QUEUE_NONE;
172 } else {
173 panic("bremfree: removing a buffer when not on a queue");
174 }
175 splx(s);
176}
177
178/*
179 * Get a buffer with the specified data. Look in the cache first.
180 */
181int
182bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
183 struct buf ** bpp)
184{
185 struct buf *bp;
186
187 bp = getblk(vp, blkno, size, 0, 0);
188 *bpp = bp;
189
190 /* if not found in cache, do some I/O */
191 if ((bp->b_flags & B_CACHE) == 0) {
192 if (curproc != NULL)
193 curproc->p_stats->p_ru.ru_inblock++;
194 bp->b_flags |= B_READ;
195 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
196 if (bp->b_rcred == NOCRED) {
197 if (cred != NOCRED)
198 crhold(cred);
199 bp->b_rcred = cred;
200 }
201 vfs_busy_pages(bp, 0);
202 VOP_STRATEGY(bp);
203 return (biowait(bp));
204 }
205 return (0);
206}
207
208/*
209 * Operates like bread, but also starts asynchronous I/O on
210 * read-ahead blocks.
211 */
212int
213breadn(struct vnode * vp, daddr_t blkno, int size,
214 daddr_t * rablkno, int *rabsize,
215 int cnt, struct ucred * cred, struct buf ** bpp)
216{
217 struct buf *bp, *rabp;
218 int i;
219 int rv = 0, readwait = 0;
220
221 *bpp = bp = getblk(vp, blkno, size, 0, 0);
222
223 /* if not found in cache, do some I/O */
224 if ((bp->b_flags & B_CACHE) == 0) {
225 if (curproc != NULL)
226 curproc->p_stats->p_ru.ru_inblock++;
227 bp->b_flags |= B_READ;
228 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
229 if (bp->b_rcred == NOCRED) {
230 if (cred != NOCRED)
231 crhold(cred);
232 bp->b_rcred = cred;
233 }
234 vfs_busy_pages(bp, 0);
235 VOP_STRATEGY(bp);
236 ++readwait;
237 }
238 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
239 if (inmem(vp, *rablkno))
240 continue;
241 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
242
243 if ((rabp->b_flags & B_CACHE) == 0) {
244 if (curproc != NULL)
245 curproc->p_stats->p_ru.ru_inblock++;
246 rabp->b_flags |= B_READ | B_ASYNC;
247 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
248 if (rabp->b_rcred == NOCRED) {
249 if (cred != NOCRED)
250 crhold(cred);
251 rabp->b_rcred = cred;
252 }
253 vfs_busy_pages(rabp, 0);
254 VOP_STRATEGY(rabp);
255 } else {
256 brelse(rabp);
257 }
258 }
259
260 if (readwait) {
261 rv = biowait(bp);
262 }
263 return (rv);
264}
265
266/*
267 * Write, release buffer on completion. (Done by iodone
268 * if async.)
269 */
270int
271bwrite(struct buf * bp)
272{
273 int oldflags = bp->b_flags;
274
275 if (bp->b_flags & B_INVAL) {
276 brelse(bp);
277 return (0);
278 }
279 if (!(bp->b_flags & B_BUSY))
280 panic("bwrite: buffer is not busy???");
281
282 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
283 bp->b_flags |= B_WRITEINPROG;
284
285 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
286 reassignbuf(bp, bp->b_vp);
287 }
288
289 bp->b_vp->v_numoutput++;
290 vfs_busy_pages(bp, 1);
291 if (curproc != NULL)
292 curproc->p_stats->p_ru.ru_oublock++;
293 VOP_STRATEGY(bp);
294
295 if ((oldflags & B_ASYNC) == 0) {
296 int rtval = biowait(bp);
297
298 if (oldflags & B_DELWRI) {
299 reassignbuf(bp, bp->b_vp);
300 }
301 brelse(bp);
302 return (rtval);
303 }
304 return (0);
305}
306
307int
308vn_bwrite(ap)
309 struct vop_bwrite_args *ap;
310{
311 return (bwrite(ap->a_bp));
312}
313
314/*
315 * Delayed write. (Buffer is marked dirty).
316 */
317void
318bdwrite(struct buf * bp)
319{
320
321 if ((bp->b_flags & B_BUSY) == 0) {
322 panic("bdwrite: buffer is not busy");
323 }
324 if (bp->b_flags & B_INVAL) {
325 brelse(bp);
326 return;
327 }
328 if (bp->b_flags & B_TAPE) {
329 bawrite(bp);
330 return;
331 }
332 bp->b_flags &= ~(B_READ|B_RELBUF);
333 if ((bp->b_flags & B_DELWRI) == 0) {
334 bp->b_flags |= B_DONE | B_DELWRI;
335 reassignbuf(bp, bp->b_vp);
336 }
337
338 /*
339 * This bmap keeps the system from needing to do the bmap later,
340 * perhaps when the system is attempting to do a sync. Since it
341 * is likely that the indirect block -- or whatever other datastructure
342 * that the filesystem needs is still in memory now, it is a good
343 * thing to do this. Note also, that if the pageout daemon is
344 * requesting a sync -- there might not be enough memory to do
345 * the bmap then... So, this is important to do.
346 */
347 if( bp->b_lblkno == bp->b_blkno) {
348 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
349 }
350
351 /*
352 * Set the *dirty* buffer range based upon the VM system dirty pages.
353 */
354 vfs_setdirty(bp);
355
356 /*
357 * We need to do this here to satisfy the vnode_pager and the
358 * pageout daemon, so that it thinks that the pages have been
359 * "cleaned". Note that since the pages are in a delayed write
360 * buffer -- the VFS layer "will" see that the pages get written
361 * out on the next sync, or perhaps the cluster will be completed.
362 */
363 vfs_clean_pages(bp);
364 brelse(bp);
365 return;
366}
367
368/*
369 * Asynchronous write.
370 * Start output on a buffer, but do not wait for it to complete.
371 * The buffer is released when the output completes.
372 */
373void
374bawrite(struct buf * bp)
375{
376 bp->b_flags |= B_ASYNC;
377 (void) VOP_BWRITE(bp);
378}
379
380/*
381 * Release a buffer.
382 */
383void
384brelse(struct buf * bp)
385{
386 int s;
387
388 if (bp->b_flags & B_CLUSTER) {
389 relpbuf(bp);
390 return;
391 }
392 /* anyone need a "free" block? */
393 s = splbio();
394
395 if (needsbuffer) {
396 needsbuffer = 0;
397 wakeup(&needsbuffer);
398 }
399
400 /* anyone need this block? */
401 if (bp->b_flags & B_WANTED) {
402 bp->b_flags &= ~(B_WANTED | B_AGE);
403 wakeup(bp);
404 }
405
406 if (bp->b_flags & B_LOCKED)
407 bp->b_flags &= ~B_ERROR;
408
409 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
410 (bp->b_bufsize <= 0)) {
411 bp->b_flags |= B_INVAL;
412 bp->b_flags &= ~(B_DELWRI | B_CACHE);
413 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
414 brelvp(bp);
415 }
416
417 /*
418 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
419 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
420 * but the VM object is kept around. The B_NOCACHE flag is used to
421 * invalidate the pages in the VM object.
422 */
423 if (bp->b_flags & B_VMIO) {
424 vm_ooffset_t foff;
425 vm_object_t obj;
426 int i, resid;
427 vm_page_t m;
428 struct vnode *vp;
429 int iototal = bp->b_bufsize;
430
431 vp = bp->b_vp;
432 if (!vp)
433 panic("brelse: missing vp");
434
435 if (bp->b_npages) {
436 vm_pindex_t poff;
437 obj = (vm_object_t) vp->v_object;
438 if (vp->v_type == VBLK)
439 foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
440 else
441 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
442 poff = OFF_TO_IDX(foff);
443 for (i = 0; i < bp->b_npages; i++) {
444 m = bp->b_pages[i];
445 if (m == bogus_page) {
446 m = vm_page_lookup(obj, poff + i);
447 if (!m) {
448 panic("brelse: page missing\n");
449 }
450 bp->b_pages[i] = m;
451 pmap_qenter(trunc_page(bp->b_data),
452 bp->b_pages, bp->b_npages);
453 }
454 resid = IDX_TO_OFF(m->pindex+1) - foff;
455 if (resid > iototal)
456 resid = iototal;
457 if (resid > 0) {
458 /*
459 * Don't invalidate the page if the local machine has already
460 * modified it. This is the lesser of two evils, and should
461 * be fixed.
462 */
463 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
464 vm_page_test_dirty(m);
465 if (m->dirty == 0) {
466 vm_page_set_invalid(m, (vm_offset_t) foff, resid);
467 if (m->valid == 0)
468 vm_page_protect(m, VM_PROT_NONE);
469 }
470 }
471 }
472 foff += resid;
473 iototal -= resid;
474 }
475 }
476
477 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
478 for(i = 0; i < bp->b_npages; i++) {
479 m = bp->b_pages[i];
480 --m->bmapped;
481 if (m->bmapped == 0) {
482 if (m->flags & PG_WANTED) {
483 m->flags &= ~PG_WANTED;
484 wakeup(m);
485 }
486 if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) {
487 if (m->object->flags & OBJ_MIGHTBEDIRTY) {
488 vm_page_test_dirty(m);
489 }
490 /*
491 * if page isn't valid, no sense in keeping it around
492 */
493 if (m->valid == 0) {
494 vm_page_protect(m, VM_PROT_NONE);
495 vm_page_free(m);
496 /*
497 * if page isn't dirty and hasn't been referenced by
498 * a process, then cache it
499 */
500 } else if ((m->dirty & m->valid) == 0 &&
501 (m->flags & PG_REFERENCED) == 0 &&
502 !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
503 vm_page_cache(m);
504 /*
505 * otherwise activate it
506 */
507 } else if ((m->flags & PG_ACTIVE) == 0) {
508 vm_page_activate(m);
509 m->act_count = 0;
510 }
511 }
512 }
513 }
514 bufspace -= bp->b_bufsize;
515 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
516 bp->b_npages = 0;
517 bp->b_bufsize = 0;
518 bp->b_flags &= ~B_VMIO;
519 if (bp->b_vp)
520 brelvp(bp);
521 }
522 }
523 if (bp->b_qindex != QUEUE_NONE)
524 panic("brelse: free buffer onto another queue???");
525
526 /* enqueue */
527 /* buffers with no memory */
528 if (bp->b_bufsize == 0) {
529 bp->b_qindex = QUEUE_EMPTY;
530 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
531 LIST_REMOVE(bp, b_hash);
532 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
533 bp->b_dev = NODEV;
534 /* buffers with junk contents */
535 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
536 bp->b_qindex = QUEUE_AGE;
537 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
538 LIST_REMOVE(bp, b_hash);
539 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
540 bp->b_dev = NODEV;
541 /* buffers that are locked */
542 } else if (bp->b_flags & B_LOCKED) {
543 bp->b_qindex = QUEUE_LOCKED;
544 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
545 /* buffers with stale but valid contents */
546 } else if (bp->b_flags & B_AGE) {
547 bp->b_qindex = QUEUE_AGE;
548 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
549	/* buffers with valid and potentially reusable contents */
550 } else {
551 bp->b_qindex = QUEUE_LRU;
552 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
553 }
554
555 /* unlock */
556 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
557 splx(s);
558}
559
560/*
561 * Check to see if a block is currently memory resident.
562 */
563__inline struct buf *
564gbincore(struct vnode * vp, daddr_t blkno)
565{
566 struct buf *bp;
567 struct bufhashhdr *bh;
568
569 bh = BUFHASH(vp, blkno);
570 bp = bh->lh_first;
571
572 /* Search hash chain */
573 while (bp != NULL) {
574 /* hit */
575 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
576 (bp->b_flags & B_INVAL) == 0) {
577 break;
578 }
579 bp = bp->b_hash.le_next;
580 }
581 return (bp);
582}
583
584/*
585 * this routine implements clustered async writes for
586 * clearing out B_DELWRI buffers... This is much better
587 * than the old way of writing only one buffer at a time.
588 */
589int
590vfs_bio_awrite(struct buf * bp)
591{
592 int i;
593 daddr_t lblkno = bp->b_lblkno;
594 struct vnode *vp = bp->b_vp;
595 int s;
596 int ncl;
597 struct buf *bpa;
598 int nwritten;
599
600 s = splbio();
601 /*
602 * right now we support clustered writing only to regular files
603 */
604 if ((vp->v_type == VREG) &&
605 (vp->v_mount != 0) && /* Only on nodes that have the size info */
606 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
607 int size;
608 int maxcl;
609
610 size = vp->v_mount->mnt_stat.f_iosize;
611 maxcl = MAXPHYS / size;
612
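		/*
		 * Look for logically sequential delayed-write buffers of the
		 * same block size that are also physically contiguous on disk;
		 * the scan stops at the first buffer that does not qualify.
		 */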
613 for (i = 1; i < maxcl; i++) {
614 if ((bpa = gbincore(vp, lblkno + i)) &&
615 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
616 (B_DELWRI | B_CLUSTEROK)) &&
617 (bpa->b_bufsize == size)) {
618 if ((bpa->b_blkno == bpa->b_lblkno) ||
619 (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
620 break;
621 } else {
622 break;
623 }
624 }
625 ncl = i;
626 /*
627 * this is a possible cluster write
628 */
629 if (ncl != 1) {
630 nwritten = cluster_wbuild(vp, size, lblkno, ncl);
631 splx(s);
632 return nwritten;
633 }
634 }
635 bremfree(bp);
636 splx(s);
637 /*
638 * default (old) behavior, writing out only one block
639 */
640 bp->b_flags |= B_BUSY | B_ASYNC;
641 nwritten = bp->b_bufsize;
642 (void) VOP_BWRITE(bp);
643 return nwritten;
644}
645
646
647/*
648 * Find a buffer header which is available for use.
649 */
650static struct buf *
651getnewbuf(int slpflag, int slptimeo, int doingvmio)
652{
653 struct buf *bp;
654 int s;
655 int nbyteswritten = 0;
656
657 s = splbio();
658start:
659 if (bufspace >= maxbufspace)
660 goto trytofreespace;
661
662 /* can we constitute a new buffer? */
663 if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
664 if (bp->b_qindex != QUEUE_EMPTY)
665 panic("getnewbuf: inconsistent EMPTY queue");
666 bremfree(bp);
667 goto fillbuf;
668 }
669trytofreespace:
670 /*
671	 * We keep the file I/O from hogging metadata I/O.
672 * This is desirable because file data is cached in the
673 * VM/Buffer cache even if a buffer is freed.
674 */
675 if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
676 if (bp->b_qindex != QUEUE_AGE)
677 panic("getnewbuf: inconsistent AGE queue");
678 } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
679 if (bp->b_qindex != QUEUE_LRU)
680 panic("getnewbuf: inconsistent LRU queue");
681 }
682 if (!bp) {
683 /* wait for a free buffer of any kind */
684 needsbuffer = 1;
685 tsleep(&needsbuffer,
686 (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
687 splx(s);
688 return (0);
689 }
690
691 if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
692 --bp->b_usecount;
693 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
694 if (bufqueues[QUEUE_LRU].tqh_first != NULL) {
695 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
696 goto start;
697 }
698 }
699
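	/*
	 * A recently used LRU buffer gets another trip around the queue:
	 * decrement its use count and requeue it at the tail rather than
	 * reclaiming it, unless it is the only buffer on the queue.
	 */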
700 /* if we are a delayed write, convert to an async write */
701 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
702 nbyteswritten += vfs_bio_awrite(bp);
703 if (!slpflag && !slptimeo) {
704 splx(s);
705 return (0);
706 }
707 goto start;
708 }
709
710 if (bp->b_flags & B_WANTED) {
711 bp->b_flags &= ~B_WANTED;
712 wakeup(bp);
713 }
714 bremfree(bp);
715
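	/*
	 * A VMIO buffer must give its pages back to the VM system before
	 * the header can be reused; B_RELBUF makes brelse() do the page
	 * rundown, after which the buffer is pulled off the free queue again.
	 */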
716 if (bp->b_flags & B_VMIO) {
717 bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
718 brelse(bp);
719 bremfree(bp);
720 }
721
722 if (bp->b_vp)
723 brelvp(bp);
724
725 /* we are not free, nor do we contain interesting data */
726 if (bp->b_rcred != NOCRED)
727 crfree(bp->b_rcred);
728 if (bp->b_wcred != NOCRED)
729 crfree(bp->b_wcred);
730fillbuf:
731 bp->b_flags |= B_BUSY;
732 LIST_REMOVE(bp, b_hash);
733 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
734 splx(s);
735 if (bp->b_bufsize) {
736 allocbuf(bp, 0);
737 }
738 bp->b_flags = B_BUSY;
739 bp->b_dev = NODEV;
740 bp->b_vp = NULL;
741 bp->b_blkno = bp->b_lblkno = 0;
742 bp->b_iodone = 0;
743 bp->b_error = 0;
744 bp->b_resid = 0;
745 bp->b_bcount = 0;
746 bp->b_npages = 0;
747 bp->b_wcred = bp->b_rcred = NOCRED;
748 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
749 bp->b_dirtyoff = bp->b_dirtyend = 0;
750 bp->b_validoff = bp->b_validend = 0;
751 bp->b_usecount = 2;
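	/*
	 * If buffer space is still over the limit, even allowing for the
	 * delayed writes just pushed out, give this buffer up and go back
	 * to freeing space.
	 */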
752 if (bufspace >= maxbufspace + nbyteswritten) {
753 s = splbio();
754 bp->b_flags |= B_INVAL;
755 brelse(bp);
756 goto trytofreespace;
757 }
758 return (bp);
759}
760
761/*
762 * Check to see if a block is currently memory resident.
763 */
764struct buf *
765incore(struct vnode * vp, daddr_t blkno)
766{
767 struct buf *bp;
768 struct bufhashhdr *bh;
769
770 int s = splbio();
771
772 bh = BUFHASH(vp, blkno);
773 bp = bh->lh_first;
774
775 /* Search hash chain */
776 while (bp != NULL) {
777 /* hit */
778 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
779 (bp->b_flags & B_INVAL) == 0) {
780 break;
781 }
782 bp = bp->b_hash.le_next;
783 }
784 splx(s);
785 return (bp);
786}
787
788/*
789 * Returns true if no I/O is needed to access the
790 * associated VM object. This is like incore except
791 * it also hunts around in the VM system for the data.
792 */
793
794int
795inmem(struct vnode * vp, daddr_t blkno)
796{
797 vm_object_t obj;
798 vm_offset_t toff, tinc;
799 vm_page_t m;
800 vm_ooffset_t off;
801
802 if (incore(vp, blkno))
803 return 1;
804 if (vp->v_mount == NULL)
805 return 0;
806 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
807 return 0;
808
809 obj = vp->v_object;
810 tinc = PAGE_SIZE;
811 if (tinc > vp->v_mount->mnt_stat.f_iosize)
812 tinc = vp->v_mount->mnt_stat.f_iosize;
813 off = blkno * vp->v_mount->mnt_stat.f_iosize;
814
815 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
816
817 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
818 if (!m)
819 return 0;
820 if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
821 return 0;
822 }
823 return 1;
824}
825
826/*
827 * Set the dirty range for the buffer --
828 * for NFS -- if the file is mapped and pages have
829 * been written to, let it know. We want the
830 * entire range of the buffer to be marked dirty if
831 * any of the pages have been written to, for consistency
832 * with the b_validoff, b_validend set in the NFS write
833 * code and used by the NFS read code.
834 */
835static void
836vfs_setdirty(struct buf *bp) {
837 int i;
838 vm_object_t object;
839 vm_offset_t boffset, offset;
840 /*
841 * We qualify the scan for modified pages on whether the
842 * object has been flushed yet. The OBJ_WRITEABLE flag
843 * is not cleared simply by protecting pages off.
844 */
845 if ((bp->b_flags & B_VMIO) &&
846 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
847 /*
848 * test the pages to see if they have been modified directly
849 * by users through the VM system.
850 */
851 for (i = 0; i < bp->b_npages; i++)
852 vm_page_test_dirty(bp->b_pages[i]);
853
854 /*
855 * scan forwards for the first page modified
856 */
857 for (i = 0; i < bp->b_npages; i++) {
858 if (bp->b_pages[i]->dirty) {
859 break;
860 }
861 }
862 boffset = (i << PAGE_SHIFT);
863 if (boffset < bp->b_dirtyoff) {
864 bp->b_dirtyoff = boffset;
865 }
866
867 /*
868 * scan backwards for the last page modified
869 */
870 for (i = bp->b_npages - 1; i >= 0; --i) {
871 if (bp->b_pages[i]->dirty) {
872 break;
873 }
874 }
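		/*
		 * Convert the index of the last dirty page into a page count,
		 * clamp it so the range does not run past the end of the
		 * object, and record the dirty end in bytes.
		 */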
875 boffset = (i + 1);
876 offset = boffset + bp->b_pages[0]->pindex;
877 if (offset >= object->size)
878 boffset = object->size - bp->b_pages[0]->pindex;
879 if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
880 bp->b_dirtyend = (boffset << PAGE_SHIFT);
881 }
882}
883
884/*
885 * Get a block given a specified block and offset into a file/device.
886 */
887struct buf *
888getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
889{
890 struct buf *bp;
891 int s;
892 struct bufhashhdr *bh;
893
894 s = splbio();
895loop:
896 if ((bp = gbincore(vp, blkno))) {
897 if (bp->b_flags & B_BUSY) {
898 bp->b_flags |= B_WANTED;
899 if (bp->b_usecount < BUF_MAXUSE)
900 ++bp->b_usecount;
901 if (!tsleep(bp,
902 (PRIBIO + 1) | slpflag, "getblk", slptimeo))
903 goto loop;
904
905 splx(s);
906 return (struct buf *) NULL;
907 }
908 bp->b_flags |= B_BUSY | B_CACHE;
909 bremfree(bp);
910
911 /*
912		 * check for size inconsistencies (note that they shouldn't happen
913		 * but do when filesystems don't handle the size changes correctly).
914 * We are conservative on metadata and don't just extend the buffer
915 * but write and re-constitute it.
916 */
917
918 if (bp->b_bcount != size) {
919 if (bp->b_flags & B_VMIO) {
920 allocbuf(bp, size);
921 } else {
922 bp->b_flags |= B_NOCACHE;
923 VOP_BWRITE(bp);
924 goto loop;
925 }
926 }
927
928 /*
929		 * Make sure that all pages in the buffer are valid; if they
930		 * aren't, clear the cache flag.
931		 * ASSUMPTION:
932		 * if the buffer is greater than one page in size, the buffer
933		 * address starts on a page boundary...
934 */
935 if (bp->b_flags & B_VMIO) {
936 int szleft, i;
937 szleft = size;
938 for (i=0;i<bp->b_npages;i++) {
939 if (szleft > PAGE_SIZE) {
940 if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) !=
941 VM_PAGE_BITS_ALL) {
942 bp->b_flags &= ~(B_CACHE|B_DONE);
943 break;
944 }
945 szleft -= PAGE_SIZE;
946 } else {
947 if (!vm_page_is_valid(bp->b_pages[i],
948 (((vm_offset_t) bp->b_data) & PAGE_MASK),
949 szleft)) {
950 bp->b_flags &= ~(B_CACHE|B_DONE);
951 break;
952 }
953 szleft = 0;
954 }
955 }
956 }
957 if (bp->b_usecount < BUF_MAXUSE)
958 ++bp->b_usecount;
959 splx(s);
960 return (bp);
961 } else {
962 vm_object_t obj;
963 int doingvmio;
964
965 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
966 doingvmio = 1;
967 } else {
968 doingvmio = 0;
969 }
970 if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
971 if (slpflag || slptimeo) {
972 splx(s);
973 return NULL;
974 }
975 goto loop;
976 }
977
978 /*
979 * This code is used to make sure that a buffer is not
980 * created while the getnewbuf routine is blocked.
981 * Normally the vnode is locked so this isn't a problem.
982 * VBLK type I/O requests, however, don't lock the vnode.
983 */
984 if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
985 bp->b_flags |= B_INVAL;
986 brelse(bp);
987 goto loop;
988 }
989
990 /*
991 * Insert the buffer into the hash, so that it can
992 * be found by incore.
993 */
994 bp->b_blkno = bp->b_lblkno = blkno;
995 bgetvp(vp, bp);
996 LIST_REMOVE(bp, b_hash);
997 bh = BUFHASH(vp, blkno);
998 LIST_INSERT_HEAD(bh, bp, b_hash);
999
1000 if (doingvmio) {
1001 bp->b_flags |= (B_VMIO | B_CACHE);
1002#if defined(VFS_BIO_DEBUG)
1003 if (vp->v_type != VREG)
1004 printf("getblk: vmioing file type %d???\n", vp->v_type);
1005#endif
1006 } else {
1007 bp->b_flags &= ~B_VMIO;
1008 }
1009 splx(s);
1010
1011 allocbuf(bp, size);
1012 return (bp);
1013 }
1014}
1015
1016/*
1017 * Get an empty, disassociated buffer of given size.
1018 */
1019struct buf *
1020geteblk(int size)
1021{
1022 struct buf *bp;
1023
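	/* Loop until a buffer header is available; geteblk() may not fail. */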
1024 while ((bp = getnewbuf(0, 0, 0)) == 0);
1025 allocbuf(bp, size);
1026 bp->b_flags |= B_INVAL;
1027 return (bp);
1028}
1029
1030/*
1031 * This code constitutes the buffer memory from either anonymous system
1032 * memory (in the case of non-VMIO operations) or from an associated
1033 * VM object (in the case of VMIO operations).
1034 *
1035 * Note that this code is tricky, and has many complications to resolve
1036 * deadlock or inconsistent data situations. Tread lightly!!!
1037 *
1038 * Modify the length of a buffer's underlying buffer storage without
1039 * destroying information (unless, of course the buffer is shrinking).
1040 */
1041int
1042allocbuf(struct buf * bp, int size)
1043{
1044
1045 int s;
1046 int newbsize, mbsize;
1047 int i;
1048
1049 if (!(bp->b_flags & B_BUSY))
1050 panic("allocbuf: buffer not busy");
1051
1052 if ((bp->b_flags & B_VMIO) == 0) {
1053 /*
1054 * Just get anonymous memory from the kernel
1055 */
1056 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1057 newbsize = round_page(size);
1058
1059 if (newbsize < bp->b_bufsize) {
1060 vm_hold_free_pages(
1061 bp,
1062 (vm_offset_t) bp->b_data + newbsize,
1063 (vm_offset_t) bp->b_data + bp->b_bufsize);
1064 } else if (newbsize > bp->b_bufsize) {
1065 vm_hold_load_pages(
1066 bp,
1067 (vm_offset_t) bp->b_data + bp->b_bufsize,
1068 (vm_offset_t) bp->b_data + newbsize);
1069 }
1070 } else {
1071 vm_page_t m;
1072 int desiredpages;
1073
1074 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1075 desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1076
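		/*
		 * Shrinking a VMIO buffer: unmap the tail pages, wait for any
		 * I/O still in progress on them, and free the pages whose
		 * bmapped count drops to zero.
		 */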
1077 if (newbsize < bp->b_bufsize) {
1078 if (desiredpages < bp->b_npages) {
1079 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1080 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1081 for (i = desiredpages; i < bp->b_npages; i++) {
1082 m = bp->b_pages[i];
1083 s = splhigh();
1084 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1085 m->flags |= PG_WANTED;
1086 tsleep(m, PVM, "biodep", 0);
1087 }
1088 splx(s);
1089
1090 if (m->bmapped == 0) {
1091 printf("allocbuf: bmapped is zero for page %d\n", i);
1092 panic("allocbuf: error");
1093 }
1094 --m->bmapped;
1095 if (m->bmapped == 0) {
1096 vm_page_protect(m, VM_PROT_NONE);
1097 vm_page_free(m);
1098 }
1099 bp->b_pages[i] = NULL;
1100 }
1101 bp->b_npages = desiredpages;
1102 }
1103 } else if (newbsize > bp->b_bufsize) {
1104 vm_object_t obj;
1105 vm_offset_t tinc, toff;
1106 vm_ooffset_t off;
1107 vm_pindex_t objoff;
1108 int pageindex, curbpnpages;
1109 struct vnode *vp;
1110 int bsize;
1111
1112 vp = bp->b_vp;
1113
1114 if (vp->v_type == VBLK)
1115 bsize = DEV_BSIZE;
1116 else
1117 bsize = vp->v_mount->mnt_stat.f_iosize;
1118
1119 if (bp->b_npages < desiredpages) {
1120 obj = vp->v_object;
1121 tinc = PAGE_SIZE;
1122 if (tinc > bsize)
1123 tinc = bsize;
1124 off = (vm_ooffset_t) bp->b_lblkno * bsize;
1125 doretry:
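			/*
			 * Gather the pages backing the new buffer size. If a needed
			 * page cannot be obtained without sleeping, the pages grabbed
			 * so far are woken up and the scan restarts from here.
			 */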
1126 curbpnpages = bp->b_npages;
1127 bp->b_flags |= B_CACHE;
1128 for (toff = 0; toff < newbsize; toff += tinc) {
1129 int bytesinpage;
1130
1131 pageindex = toff >> PAGE_SHIFT;
1132 objoff = OFF_TO_IDX(off + toff);
1133 if (pageindex < curbpnpages) {
1134
1135 m = bp->b_pages[pageindex];
1136 if (m->pindex != objoff)
1137 panic("allocbuf: page changed offset??!!!?");
1138 bytesinpage = tinc;
1139 if (tinc > (newbsize - toff))
1140 bytesinpage = newbsize - toff;
1141 if (!vm_page_is_valid(m,
1142 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1143 bytesinpage)) {
1144 bp->b_flags &= ~B_CACHE;
1145 }
1146 if ((m->flags & PG_ACTIVE) == 0) {
1147 vm_page_activate(m);
1148 m->act_count = 0;
1149 }
1150 continue;
1151 }
1152 m = vm_page_lookup(obj, objoff);
1153 if (!m) {
1154 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1155 if (!m) {
1156 int j;
1157
1158 for (j = bp->b_npages; j < pageindex; j++) {
1159 PAGE_WAKEUP(bp->b_pages[j]);
1160 }
1161 VM_WAIT;
1162 goto doretry;
1163 }
1164 vm_page_activate(m);
1165 m->act_count = 0;
1166 m->valid = 0;
1167 bp->b_flags &= ~B_CACHE;
1168 } else if (m->flags & PG_BUSY) {
1169 int j;
1170
1171 for (j = bp->b_npages; j < pageindex; j++) {
1172 PAGE_WAKEUP(bp->b_pages[j]);
1173 }
1174
1175 s = splbio();
1176 m->flags |= PG_WANTED;
1177 tsleep(m, PVM, "pgtblk", 0);
1178 splx(s);
1179
1180 goto doretry;
1181 } else {
1182 if ((curproc != pageproc) &&
1183 (m->flags & PG_CACHE) &&
1184 (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1185 pagedaemon_wakeup();
1186 }
1187 bytesinpage = tinc;
1188 if (tinc > (newbsize - toff))
1189 bytesinpage = newbsize - toff;
1190 if (!vm_page_is_valid(m,
1191 (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
1192 bytesinpage)) {
1193 bp->b_flags &= ~B_CACHE;
1194 }
1195 if ((m->flags & PG_ACTIVE) == 0) {
1196 vm_page_activate(m);
1197 m->act_count = 0;
1198 }
1199 m->flags |= PG_BUSY;
1200 }
1201 bp->b_pages[pageindex] = m;
1202 curbpnpages = pageindex + 1;
1203 }
1204 for (i = bp->b_npages; i < curbpnpages; i++) {
1205 m = bp->b_pages[i];
1206 m->bmapped++;
1207 PAGE_WAKEUP(m);
1208 }
1209 bp->b_npages = curbpnpages;
1210 bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1211 pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1212 bp->b_data += off & (PAGE_SIZE - 1);
1213 }
1214 }
1215 }
1216 bufspace += (newbsize - bp->b_bufsize);
1217 bp->b_bufsize = newbsize;
1218 bp->b_bcount = size;
1219 return 1;
1220}
1221
1222/*
1223 * Wait for buffer I/O completion, returning error status.
1224 */
1225int
1226biowait(register struct buf * bp)
1227{
1228 int s;
1229
1230 s = splbio();
1231 while ((bp->b_flags & B_DONE) == 0)
1232 tsleep(bp, PRIBIO, "biowait", 0);
1233 splx(s);
1234 if (bp->b_flags & B_EINTR) {
1235 bp->b_flags &= ~B_EINTR;
1236 return (EINTR);
1237 }
1238 if (bp->b_flags & B_ERROR) {
1239 return (bp->b_error ? bp->b_error : EIO);
1240 } else {
1241 return (0);
1242 }
1243}
1244
1245/*
1246 * Finish I/O on a buffer, calling an optional function.
1247 * This is usually called from interrupt level, so process blocking
1248 * is not *a good idea*.
1249 */
1250void
1251biodone(register struct buf * bp)
1252{
1253 int s;
1254
1255 s = splbio();
1256 if (!(bp->b_flags & B_BUSY))
1257 panic("biodone: buffer not busy");
1258
1259 if (bp->b_flags & B_DONE) {
1260 splx(s);
1261 printf("biodone: buffer already done\n");
1262 return;
1263 }
1264 bp->b_flags |= B_DONE;
1265
1266 if ((bp->b_flags & B_READ) == 0) {
1267 vwakeup(bp);
1268 }
1269#ifdef BOUNCE_BUFFERS
1270 if (bp->b_flags & B_BOUNCE)
1271 vm_bounce_free(bp);
1272#endif
1273
1274 /* call optional completion function if requested */
1275 if (bp->b_flags & B_CALL) {
1276 bp->b_flags &= ~B_CALL;
1277 (*bp->b_iodone) (bp);
1278 splx(s);
1279 return;
1280 }
1281 if (bp->b_flags & B_VMIO) {
1282 int i, resid;
1283 vm_ooffset_t foff;
1284 vm_page_t m;
1285 vm_object_t obj;
1286 int iosize;
1287 struct vnode *vp = bp->b_vp;
1288
1289 if (vp->v_type == VBLK)
1290 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1291 else
1292 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1293 obj = vp->v_object;
1294 if (!obj) {
1295 panic("biodone: no object");
1296 }
1297#if defined(VFS_BIO_DEBUG)
1298 if (obj->paging_in_progress < bp->b_npages) {
1299 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1300 obj->paging_in_progress, bp->b_npages);
1301 }
1302#endif
1303 iosize = bp->b_bufsize;
1304 for (i = 0; i < bp->b_npages; i++) {
1305 int bogusflag = 0;
1306 m = bp->b_pages[i];
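			/*
			 * Pages replaced with bogus_page by vfs_busy_pages() took no
			 * part in the transfer; look the real page back up and restore
			 * the buffer's mappings before updating page state.
			 */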
1307 if (m == bogus_page) {
1308 bogusflag = 1;
1309 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1310 if (!m) {
1311#if defined(VFS_BIO_DEBUG)
1312 printf("biodone: page disappeared\n");
1313#endif
1314 --obj->paging_in_progress;
1315 continue;
1316 }
1317 bp->b_pages[i] = m;
1318 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1319 }
1320#if defined(VFS_BIO_DEBUG)
1321 if (OFF_TO_IDX(foff) != m->pindex) {
1322 printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1323 }
1324#endif
1325 resid = IDX_TO_OFF(m->pindex + 1) - foff;
1326 if (resid > iosize)
1327 resid = iosize;
1328 /*
1329 * In the write case, the valid and clean bits are
1330 * already changed correctly, so we only need to do this
1331 * here in the read case.
1332 */
1333 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1334 vm_page_set_validclean(m,
1335 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1336 }
1337
1338 /*
1339			 * When debugging new filesystems or buffer I/O methods, this
1340			 * is the most common error that pops up. If you see this, you
1341 * have not set the page busy flag correctly!!!
1342 */
1343 if (m->busy == 0) {
1344 printf("biodone: page busy < 0, "
1345 "pindex: %d, foff: 0x(%x,%x), "
1346 "resid: %d, index: %d\n",
1347 (int) m->pindex, (int)(foff >> 32),
1348 (int) foff & 0xffffffff, resid, i);
1349 if (vp->v_type != VBLK)
1350 printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n",
1351 bp->b_vp->v_mount->mnt_stat.f_iosize,
1352 (int) bp->b_lblkno,
1353 bp->b_flags, bp->b_npages);
1354 else
1355 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1356 (int) bp->b_lblkno,
1357 bp->b_flags, bp->b_npages);
1358 printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1359 m->valid, m->dirty, m->bmapped);
1360 panic("biodone: page busy < 0\n");
1361 }
1362 --m->busy;
1363 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1364 m->flags &= ~PG_WANTED;
1365 wakeup(m);
1366 }
1367 --obj->paging_in_progress;
1368 foff += resid;
1369 iosize -= resid;
1370 }
1371 if (obj && obj->paging_in_progress == 0 &&
1372 (obj->flags & OBJ_PIPWNT)) {
1373 obj->flags &= ~OBJ_PIPWNT;
1374 wakeup(obj);
1375 }
1376 }
1377 /*
1378	 * For asynchronous completions, release the buffer now. brelse()
1379	 * checks for B_WANTED and will do the wakeup there if necessary, so
1380	 * there is no need to do a wakeup here in the async case.
1381 */
1382
1383 if (bp->b_flags & B_ASYNC) {
1384 brelse(bp);
1385 } else {
1386 wakeup(bp);
1387 }
1388 splx(s);
1389}
1390
1391int
1392count_lock_queue()
1393{
1394 int count;
1395 struct buf *bp;
1396
1397 count = 0;
1398 for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1399 bp != NULL;
1400 bp = bp->b_freelist.tqe_next)
1401 count++;
1402 return (count);
1403}
1404
1405int vfs_update_interval = 30;
1406
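/*
 * The update daemon: periodically sync the filesystems, waking up early
 * when something kicks vfs_update_wakeup.
 */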
1407static void
1408vfs_update()
1409{
1410 (void) spl0(); /* XXX redundant? wrong place? */
1411 while (1) {
1412 tsleep(&vfs_update_wakeup, PUSER, "update",
1413 hz * vfs_update_interval);
1414 vfs_update_wakeup = 0;
1415 sync(curproc, NULL, NULL);
1416 }
1417}
1418
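/*
 * Sysctl handler for the update interval; a successful set also wakes
 * the update daemon so that the new value takes effect immediately.
 */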
1419static int
1420sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1421{
1422 int error = sysctl_handle_int(oidp,
1423 oidp->oid_arg1, oidp->oid_arg2, req);
1424 if (!error)
1425 wakeup(&vfs_update_wakeup);
1426 return error;
1427}
1428
1429SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1430 &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
1431
1432
1433/*
1434 * This routine is called in lieu of iodone in the case of
1435 * incomplete I/O. This keeps the busy status for pages
1436 * consistent.
1437 */
1438void
1439vfs_unbusy_pages(struct buf * bp)
1440{
1441 int i;
1442
1443 if (bp->b_flags & B_VMIO) {
1444 struct vnode *vp = bp->b_vp;
1445 vm_object_t obj = vp->v_object;
1446 vm_ooffset_t foff;
1447
1448 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1449
1450 for (i = 0; i < bp->b_npages; i++) {
1451 vm_page_t m = bp->b_pages[i];
1452
1453 if (m == bogus_page) {
1454 m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1455 if (!m) {
1456 panic("vfs_unbusy_pages: page missing\n");
1457 }
1458 bp->b_pages[i] = m;
1459 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1460 }
1461 --obj->paging_in_progress;
1462 --m->busy;
1463 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1464 m->flags &= ~PG_WANTED;
1465 wakeup(m);
1466 }
1467 }
1468 if (obj->paging_in_progress == 0 &&
1469 (obj->flags & OBJ_PIPWNT)) {
1470 obj->flags &= ~OBJ_PIPWNT;
1471 wakeup(obj);
1472 }
1473 }
1474}
1475
1476/*
1477 * This routine is called before a device strategy routine.
1478 * It is used to tell the VM system that paging I/O is in
1479 * progress, and treat the pages associated with the buffer
1480 * almost as being PG_BUSY. Also the object's paging_in_progress
1481 * count is updated to make sure that the object doesn't become
1482 * inconsistent.
1483 */
1484void
1485vfs_busy_pages(struct buf * bp, int clear_modify)
1486{
1487 int i;
1488
1489 if (bp->b_flags & B_VMIO) {
1490 vm_object_t obj = bp->b_vp->v_object;
1491 vm_ooffset_t foff;
1492 int iocount = bp->b_bufsize;
1493
1494 if (bp->b_vp->v_type == VBLK)
1495 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1496 else
1497 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1498 vfs_setdirty(bp);
1499 for (i = 0; i < bp->b_npages; i++) {
1500 vm_page_t m = bp->b_pages[i];
1501 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1502
1503 if (resid > iocount)
1504 resid = iocount;
1505 if ((bp->b_flags & B_CLUSTER) == 0) {
1506 obj->paging_in_progress++;
1507 m->busy++;
1508 }
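			/*
			 * When clearing modify bits (the write case), write-protect
			 * the page and mark the I/O range valid and clean. Otherwise
			 * (typically a read), substitute bogus_page for pages that
			 * already hold valid data so the transfer cannot overwrite them.
			 */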
1509 if (clear_modify) {
1510 vm_page_protect(m, VM_PROT_READ);
1511 vm_page_set_validclean(m,
1512 (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
1513 } else if (bp->b_bcount >= PAGE_SIZE) {
1514 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1515 bp->b_pages[i] = bogus_page;
1516 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1517 }
1518 }
1519 foff += resid;
1520 iocount -= resid;
1521 }
1522 }
1523}
1524
1525/*
1526 * Tell the VM system that the pages associated with this buffer
1527 * are clean. This is used for delayed writes where the data is
1528 * going to go to disk eventually without additional VM intervention.
1529 */
1530void
1531vfs_clean_pages(struct buf * bp)
1532{
1533 int i;
1534
1535 if (bp->b_flags & B_VMIO) {
1536 vm_ooffset_t foff;
1537 int iocount = bp->b_bufsize;
1538
1539 if (bp->b_vp->v_type == VBLK)
1540 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1541 else
1542 foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1543
1544 for (i = 0; i < bp->b_npages; i++) {
1545 vm_page_t m = bp->b_pages[i];
1546 int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1547
1548 if (resid > iocount)
1549 resid = iocount;
1550 if (resid > 0) {
1551 vm_page_set_validclean(m,
1552 ((vm_offset_t) foff & (PAGE_SIZE-1)), resid);
1553 }
1554 foff += resid;
1555 iocount -= resid;
1556 }
1557 }
1558}
1559
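/*
 * Zero the invalid portions of a VMIO buffer and update the page valid
 * bits accordingly; non-VMIO buffers are simply cleared with clrbuf().
 */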
1560void
1561vfs_bio_clrbuf(struct buf *bp) {
1562 int i;
1563 if( bp->b_flags & B_VMIO) {
1564 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1565 int mask;
1566 mask = 0;
1567 for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1568 mask |= (1 << (i/DEV_BSIZE));
1569 if( bp->b_pages[0]->valid != mask) {
1570 bzero(bp->b_data, bp->b_bufsize);
1571 }
1572 bp->b_pages[0]->valid = mask;
1573 bp->b_resid = 0;
1574 return;
1575 }
1576 for(i=0;i<bp->b_npages;i++) {
1577 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1578 continue;
1579 if( bp->b_pages[i]->valid == 0) {
1580 if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
1581 bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1582 } else {
1583 int j;
1584 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1585 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1586 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1587 }
1588 }
1589 bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1590 }
1591 bp->b_resid = 0;
1592 } else {
1593 clrbuf(bp);
1594 }
1595}
1596
1597/*
1598 * vm_hold_load_pages and vm_hold_free_pages get pages into and out
1599 * of a buffer's address space. The pages are anonymous and are
1600 * not associated with a file object.
1601 */
1602void
1603vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1604{
1605 vm_offset_t pg;
1606 vm_page_t p;
1607 vm_offset_t from = round_page(froma);
1608 vm_offset_t to = round_page(toa);
1609
1610 for (pg = from; pg < to; pg += PAGE_SIZE) {
1611
1612tryagain:
1613
1614 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1615 VM_ALLOC_NORMAL);
1616 if (!p) {
1617 VM_WAIT;
1618 goto tryagain;
1619 }
1620 vm_page_wire(p);
1621 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1622 bp->b_pages[((caddr_t) pg - bp->b_data) >> PAGE_SHIFT] = p;
1623 PAGE_WAKEUP(p);
1624 bp->b_npages++;
1625 }
1626}
1627
1628void
1629vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1630{
1631 vm_offset_t pg;
1632 vm_page_t p;
1633 vm_offset_t from = round_page(froma);
1634 vm_offset_t to = round_page(toa);
1635
1636 for (pg = from; pg < to; pg += PAGE_SIZE) {
1637 int index = ((caddr_t) pg - bp->b_data) >> PAGE_SHIFT;
1638 p = bp->b_pages[index];
1639 bp->b_pages[index] = 0;
1640 pmap_kremove(pg);
1641 vm_page_free(p);
1642 --bp->b_npages;
1643 }
1644}