1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 * John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.224 1999/07/26 06:25:16 alc Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme. Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author: John S. Dyson
24 * Significant help during the development and debugging phases
25 * had been provided by David Greenman, also of the FreeBSD core team.
26 *
27 * see man buf(9) for more info.
28 */
29
30#define VMIO
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/sysproto.h>
34#include <sys/kernel.h>
35#include <sys/sysctl.h>
36#include <sys/proc.h>
37#include <sys/kthread.h>
38#include <sys/vnode.h>
39#include <sys/vmmeter.h>
40#include <sys/lock.h>
41#include <vm/vm.h>
42#include <vm/vm_param.h>
43#include <vm/vm_prot.h>
44#include <vm/vm_kern.h>
45#include <vm/vm_pageout.h>
46#include <vm/vm_page.h>
47#include <vm/vm_object.h>
48#include <vm/vm_extern.h>
49#include <vm/vm_map.h>
50#include <sys/buf.h>
51#include <sys/mount.h>
52#include <sys/malloc.h>
53#include <sys/resourcevar.h>
54#include <sys/conf.h>
55
56static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
57
58struct bio_ops bioops; /* I/O operation notification */
59
60struct buf *buf; /* buffer header pool */
61struct swqueue bswlist;
62
63static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
64 vm_offset_t to);
65static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
66 vm_offset_t to);
67static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
68 int pageno, vm_page_t m);
69static void vfs_clean_pages(struct buf * bp);
70static void vfs_setdirty(struct buf *bp);
71static void vfs_vmio_release(struct buf *bp);
72static int flushbufqueues(void);
73
74static int bd_request;
75
76static void buf_daemon __P((void));
77/*
78 * bogus page -- for I/O to/from partially complete buffers
79 * this is a temporary solution to the problem, but it is not
80 * really that bad. it would be better to split the buffer
81 * for input in the case of buffers partially already in memory,
82 * but the code is intricate enough already.
83 */
84vm_page_t bogus_page;
85int runningbufspace;
86int vmiodirenable = FALSE;
87static vm_offset_t bogus_offset;
88
89static int bufspace, maxbufspace, vmiospace,
90 bufmallocspace, maxbufmallocspace, hibufspace;
91#if 0
92static int maxvmiobufspace;
93#endif
94static int maxbdrun;
95static int needsbuffer;
96static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
97static int numfreebuffers, lofreebuffers, hifreebuffers;
98static int getnewbufcalls;
99static int getnewbufrestarts;
100static int kvafreespace;
101
102SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
103 &numdirtybuffers, 0, "");
104SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
105 &lodirtybuffers, 0, "");
106SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
107 &hidirtybuffers, 0, "");
108SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
109 &numfreebuffers, 0, "");
110SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
111 &lofreebuffers, 0, "");
112SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
113 &hifreebuffers, 0, "");
114SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
115 &runningbufspace, 0, "");
116SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
117 &maxbufspace, 0, "");
118SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
119 &hibufspace, 0, "");
120SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
121 &bufspace, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
123 &maxbdrun, 0, "");
124#if 0
125SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
126 &maxvmiobufspace, 0, "");
127#endif
128SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
129 &vmiospace, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
131 &maxbufmallocspace, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
133 &bufmallocspace, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
135 &kvafreespace, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
137 &getnewbufcalls, 0, "");
138SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
139 &getnewbufrestarts, 0, "");
140SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
141 &vmiodirenable, 0, "");
142
143
144static int bufhashmask;
145static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
146struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
147char *buf_wmesg = BUF_WMESG;
148
149extern int vm_swap_size;
150
151#define BUF_MAXUSE 24
152
153#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
154#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
155#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
156#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
157#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */
158
159/*
160 * Buffer hash table code. Note that consecutive logical blocks hash to
161 * adjacent buckets, so a linear scan of a file gets some L1 cache locality.
162 */
163
164static __inline
165struct bufhashhdr *
166bufhash(struct vnode *vnp, daddr_t bn)
167{
168 return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
169}
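
/*
 * For illustration only: with bufhashmask = 255 (a hypothetical value, see
 * bufhashinit() below), consecutive logical blocks bn, bn + 1, bn + 2 of the
 * same vnode hash to adjacent buckets, which is the L1 cache locality
 * referred to above.
 */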
170
171/*
172 * kvaspacewakeup:
173 *
174 * Called when kva space is potentially available for recovery or when
175 * kva space is recovered in the buffer_map. This function wakes up
176 * anyone waiting for buffer_map kva space. Even though the buffer_map
177 * is larger than maxbufspace, this situation will typically occur
178 * when the buffer_map gets fragmented.
179 */
180
181static __inline void
182kvaspacewakeup(void)
183{
184 /*
185 * If someone is waiting for KVA space, wake them up. Even
186 * though we haven't freed the kva space yet, the waiting
187 * process will be able to now.
188 */
189 if (needsbuffer & VFS_BIO_NEED_KVASPACE) {
190 needsbuffer &= ~VFS_BIO_NEED_KVASPACE;
191 wakeup(&needsbuffer);
192 }
193}
194
195/*
196 * numdirtywakeup:
197 *
198 * If someone is blocked due to there being too many dirty buffers,
199 * and numdirtybuffers is now reasonable, wake them up.
200 */
201
202static __inline void
203numdirtywakeup(void)
204{
205 if (numdirtybuffers < hidirtybuffers) {
206 if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
207 needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
208 wakeup(&needsbuffer);
209 }
210 }
211}
212
213/*
214 * bufspacewakeup:
215 *
216 * Called when buffer space is potentially available for recovery or when
217 * buffer space is recovered. getnewbuf() will block on this flag when
218 * it is unable to free sufficient buffer space. Buffer space becomes
219 * recoverable when bp's get placed back in the queues.
220 */
221
222static __inline void
223bufspacewakeup(void)
224{
225 /*
226 * If someone is waiting for BUF space, wake them up. Even
227 * though we haven't freed the buffer space yet, the waiting
228 * process will be able to now.
229 */
230 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
231 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
232 wakeup(&needsbuffer);
233 }
234}
235
236/*
237 * bufcountwakeup:
238 *
239 * Called when a buffer has been added to one of the free queues to
240 * account for the buffer and to wakeup anyone waiting for free buffers.
241 * This typically occurs when large amounts of metadata are being handled
242 * by the buffer cache ( else buffer space runs out first, usually ).
243 */
244
245static __inline void
246bufcountwakeup(void)
247{
248 ++numfreebuffers;
249 if (needsbuffer) {
250 needsbuffer &= ~VFS_BIO_NEED_ANY;
251 if (numfreebuffers >= hifreebuffers)
252 needsbuffer &= ~VFS_BIO_NEED_FREE;
253 wakeup(&needsbuffer);
254 }
255}
256
257/*
258 * vfs_buf_test_cache:
259 *
260 * Called when a buffer is extended. This function clears the B_CACHE
261 * bit if the newly extended portion of the buffer does not contain
262 * valid data.
263 */
264static __inline__
265void
266vfs_buf_test_cache(struct buf *bp,
267 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
268 vm_page_t m)
269{
270 if (bp->b_flags & B_CACHE) {
271 int base = (foff + off) & PAGE_MASK;
272 if (vm_page_is_valid(m, base, size) == 0)
273 bp->b_flags &= ~B_CACHE;
274 }
275}
276
277static __inline__
278void
279bd_wakeup(int dirtybuflevel)
280{
281 if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
282 bd_request = 1;
283 wakeup(&bd_request);
284 }
285}
286
287
288/*
289 * Initialize buffer headers and related structures.
290 */
291
292caddr_t
293bufhashinit(caddr_t vaddr)
294{
295 /* first, make a null hash table */
296 for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
297 ;
298 bufhashtbl = (void *)vaddr;
299 vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
300 --bufhashmask;
301 return(vaddr);
302}
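
/*
 * Worked example (hypothetical sizing, not a fixed value): with nbuf = 1024,
 * nbuf / 4 = 256, so bufhashmask doubles 8 -> 16 -> 32 -> 64 -> 128 -> 256
 * and is then decremented to 255, giving 256 hash chains and a mask suitable
 * for the '&' in bufhash() above.
 */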
303
304void
305bufinit(void)
306{
307 struct buf *bp;
308 int i;
309
310 TAILQ_INIT(&bswlist);
311 LIST_INIT(&invalhash);
312 simple_lock_init(&buftimelock);
313
314 for (i = 0; i <= bufhashmask; i++)
315 LIST_INIT(&bufhashtbl[i]);
316
317 /* next, make a null set of free lists */
318 for (i = 0; i < BUFFER_QUEUES; i++)
319 TAILQ_INIT(&bufqueues[i]);
320
321 /* finally, initialize each buffer header and stick on empty q */
322 for (i = 0; i < nbuf; i++) {
323 bp = &buf[i];
324 bzero(bp, sizeof *bp);
325 bp->b_flags = B_INVAL; /* we're just an empty header */
326 bp->b_dev = NODEV;
327 bp->b_rcred = NOCRED;
328 bp->b_wcred = NOCRED;
329 bp->b_qindex = QUEUE_EMPTY;
330 bp->b_xflags = 0;
331 LIST_INIT(&bp->b_dep);
332 BUF_LOCKINIT(bp);
333 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
334 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
335 }
336
337 /*
338 * maxbufspace is currently calculated to support all filesystem
339 * blocks to be 8K. If you happen to use a 16K filesystem, the size
340 * of the buffer cache is still the same as it would be for 8K
341 * filesystems. This keeps the size of the buffer cache "in check"
342 * for big block filesystems.
343 *
344 * maxbufspace is calculated as around 50% of the KVA available in
345 * the buffer_map ( DFLTBSIZE vs BKVASIZE ), presumably to reduce the
346 * effect of fragmentation.
347 */
348 maxbufspace = (nbuf + 8) * DFLTBSIZE;
349 if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE)
350 hibufspace = 3 * maxbufspace / 4;
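
	/*
	 * Rough example (assuming an 8K DFLTBSIZE, as the comment above
	 * suggests, and a 64K MAXBSIZE -- the real values live in
	 * <sys/param.h>): with nbuf = 1024, maxbufspace comes to roughly
	 * 8MB and hibufspace to roughly 7.8MB, well above MAXBSIZE, so the
	 * 3/4 fallback is not taken.
	 */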
351#if 0
352/*
353 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
354 */
355 maxvmiobufspace = 2 * hibufspace / 3;
356#endif
357/*
358 * Limit the amount of malloc memory since it is wired permanently into
359 * the kernel space. Even though this is accounted for in the buffer
360 * allocation, we don't want the malloced region to grow uncontrolled.
361 * The malloc scheme improves memory utilization significantly on average
362 * (small) directories.
363 */
364 maxbufmallocspace = hibufspace / 20;
365
366/*
367 * Reduce the chance of a deadlock occurring by limiting the number
368 * of delayed-write dirty buffers we allow to stack up.
369 */
370 lodirtybuffers = nbuf / 7 + 10;
371 hidirtybuffers = nbuf / 4 + 20;
372 numdirtybuffers = 0;
373
374/*
375 * Try to keep the number of free buffers in the specified range,
376 * and give the syncer access to an emergency reserve.
377 */
378 lofreebuffers = nbuf / 18 + 5;
379 hifreebuffers = 2 * lofreebuffers;
380 numfreebuffers = nbuf;
381
382/*
383 * Maximum number of async ops initiated per buf_daemon loop. This is
384 * somewhat of a hack at the moment; we really need to limit ourselves
385 * based on the number of bytes of I/O in-transit that were initiated
386 * from buf_daemon.
387 */
388 if ((maxbdrun = nswbuf / 4) < 4)
389 maxbdrun = 4;
390
391 kvafreespace = 0;
392
393 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
394 bogus_page = vm_page_alloc(kernel_object,
395 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
396 VM_ALLOC_NORMAL);
397
398}
399
400/*
401 * Free the kva allocation for a buffer
402 * Must be called only at splbio or higher,
403 * as this is the only locking for buffer_map.
404 */
405static void
406bfreekva(struct buf * bp)
407{
408 if (bp->b_kvasize) {
409 vm_map_delete(buffer_map,
410 (vm_offset_t) bp->b_kvabase,
411 (vm_offset_t) bp->b_kvabase + bp->b_kvasize
412 );
413 bp->b_kvasize = 0;
414 kvaspacewakeup();
415 }
416}
417
418/*
419 * bremfree:
420 *
421 * Remove the buffer from the appropriate free list.
422 */
423void
424bremfree(struct buf * bp)
425{
426 int s = splbio();
427 int old_qindex = bp->b_qindex;
428
429 if (bp->b_qindex != QUEUE_NONE) {
430 if (bp->b_qindex == QUEUE_EMPTYKVA) {
431 kvafreespace -= bp->b_kvasize;
432 }
433 KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
434 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
435 bp->b_qindex = QUEUE_NONE;
436 runningbufspace += bp->b_bufsize;
437 } else {
438#if !defined(MAX_PERF)
439 if (BUF_REFCNT(bp) <= 1)
440 panic("bremfree: removing a buffer not on a queue");
441#endif
442 }
443
444 /*
445 * Fixup numfreebuffers count. If the buffer is invalid or not
446 * delayed-write, and it was on the DIRTY, CLEAN, EMPTY, or EMPTYKVA
447 * queues, the buffer was free and we must decrement numfreebuffers.
448 */
449 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
450 switch(old_qindex) {
451 case QUEUE_DIRTY:
452 case QUEUE_CLEAN:
453 case QUEUE_EMPTY:
454 case QUEUE_EMPTYKVA:
455 --numfreebuffers;
456 break;
457 default:
458 break;
459 }
460 }
461 splx(s);
462}
463
464
465/*
466 * Get a buffer with the specified data. Look in the cache first. We
467 * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
468 * is set, the buffer is valid and we do not have to do anything ( see
469 * getblk() ).
470 */
471int
472bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
473 struct buf ** bpp)
474{
475 struct buf *bp;
476
477 bp = getblk(vp, blkno, size, 0, 0);
478 *bpp = bp;
479
480 /* if not found in cache, do some I/O */
481 if ((bp->b_flags & B_CACHE) == 0) {
482 if (curproc != NULL)
483 curproc->p_stats->p_ru.ru_inblock++;
484 KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
485 bp->b_flags |= B_READ;
486 bp->b_flags &= ~(B_ERROR | B_INVAL);
487 if (bp->b_rcred == NOCRED) {
488 if (cred != NOCRED)
489 crhold(cred);
490 bp->b_rcred = cred;
491 }
492 vfs_busy_pages(bp, 0);
493 VOP_STRATEGY(vp, bp);
494 return (biowait(bp));
495 }
496 return (0);
497}
498
499/*
500 * Operates like bread, but also starts asynchronous I/O on
501 * read-ahead blocks. We must clear B_ERROR and B_INVAL prior
502 * to initiating I/O. If B_CACHE is set, the buffer is valid
503 * and we do not have to do anything.
504 */
505int
506breadn(struct vnode * vp, daddr_t blkno, int size,
507 daddr_t * rablkno, int *rabsize,
508 int cnt, struct ucred * cred, struct buf ** bpp)
509{
510 struct buf *bp, *rabp;
511 int i;
512 int rv = 0, readwait = 0;
513
514 *bpp = bp = getblk(vp, blkno, size, 0, 0);
515
516 /* if not found in cache, do some I/O */
517 if ((bp->b_flags & B_CACHE) == 0) {
518 if (curproc != NULL)
519 curproc->p_stats->p_ru.ru_inblock++;
520 bp->b_flags |= B_READ;
521 bp->b_flags &= ~(B_ERROR | B_INVAL);
522 if (bp->b_rcred == NOCRED) {
523 if (cred != NOCRED)
524 crhold(cred);
525 bp->b_rcred = cred;
526 }
527 vfs_busy_pages(bp, 0);
528 VOP_STRATEGY(vp, bp);
529 ++readwait;
530 }
531
532 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
533 if (inmem(vp, *rablkno))
534 continue;
535 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
536
537 if ((rabp->b_flags & B_CACHE) == 0) {
538 if (curproc != NULL)
539 curproc->p_stats->p_ru.ru_inblock++;
540 rabp->b_flags |= B_READ | B_ASYNC;
541 rabp->b_flags &= ~(B_ERROR | B_INVAL);
542 if (rabp->b_rcred == NOCRED) {
543 if (cred != NOCRED)
544 crhold(cred);
545 rabp->b_rcred = cred;
546 }
547 vfs_busy_pages(rabp, 0);
548 BUF_KERNPROC(rabp);
549 VOP_STRATEGY(vp, rabp);
550 } else {
551 brelse(rabp);
552 }
553 }
554
555 if (readwait) {
556 rv = biowait(bp);
557 }
558 return (rv);
559}
560
561/*
562 * Write, release buffer on completion. (Done by iodone
563 * if async). Do not bother writing anything if the buffer
564 * is invalid.
565 *
566 * Note that we set B_CACHE here, indicating that buffer is
567 * fully valid and thus cacheable. This is true even of NFS
568 * now so we set it generally. This could be set either here
569 * or in biodone() since the I/O is synchronous. We put it
570 * here.
571 */
572int
573bwrite(struct buf * bp)
574{
575 int oldflags, s;
576 struct vnode *vp;
577 struct mount *mp;
578
579 if (bp->b_flags & B_INVAL) {
580 brelse(bp);
581 return (0);
582 }
583
584 oldflags = bp->b_flags;
585
586#if !defined(MAX_PERF)
587 if (BUF_REFCNT(bp) == 0)
588 panic("bwrite: buffer is not busy???");
589#endif
590 s = splbio();
591 bundirty(bp);
592
593 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
594 bp->b_flags |= B_WRITEINPROG | B_CACHE;
595
596 bp->b_vp->v_numoutput++;
597 vfs_busy_pages(bp, 1);
598 if (curproc != NULL)
599 curproc->p_stats->p_ru.ru_oublock++;
600 splx(s);
601 if (oldflags & B_ASYNC)
602 BUF_KERNPROC(bp);
603 VOP_STRATEGY(bp->b_vp, bp);
604
605 /*
606 * Collect statistics on synchronous and asynchronous writes.
607 * Writes to block devices are charged to their associated
608 * filesystem (if any).
609 */
610 if ((vp = bp->b_vp) != NULL) {
611 if (vp->v_type == VBLK)
612 mp = vp->v_specmountpoint;
613 else
614 mp = vp->v_mount;
615 if (mp != NULL) {
616 if ((oldflags & B_ASYNC) == 0)
617 mp->mnt_stat.f_syncwrites++;
618 else
619 mp->mnt_stat.f_asyncwrites++;
620 }
621 }
622
623 if ((oldflags & B_ASYNC) == 0) {
624 int rtval = biowait(bp);
625 brelse(bp);
626 return (rtval);
627 }
628
629 return (0);
630}
631
632/*
633 * Delayed write. (Buffer is marked dirty). Do not bother writing
634 * anything if the buffer is marked invalid.
635 *
636 * Note that since the buffer must be completely valid, we can safely
637 * set B_CACHE. In fact, we have to set B_CACHE here rather than in
638 * biodone() in order to prevent getblk from writing the buffer
639 * out synchronously.
640 */
641void
642bdwrite(struct buf * bp)
643{
644#if 0
645 struct vnode *vp;
646#endif
647
648#if !defined(MAX_PERF)
649 if (BUF_REFCNT(bp) == 0)
650 panic("bdwrite: buffer is not busy");
651#endif
652
653 if (bp->b_flags & B_INVAL) {
654 brelse(bp);
655 return;
656 }
657 bdirty(bp);
658
659 /*
660 * Set B_CACHE, indicating that the buffer is fully valid. This is
661 * true even of NFS now.
662 */
663 bp->b_flags |= B_CACHE;
664
665 /*
666 * This bmap keeps the system from needing to do the bmap later,
667 * perhaps when the system is attempting to do a sync. Since it
668 * is likely that the indirect block -- or whatever other datastructure
669 * that the filesystem needs is still in memory now, it is a good
670 * thing to do this. Note also, that if the pageout daemon is
671 * requesting a sync -- there might not be enough memory to do
672 * the bmap then... So, this is important to do.
673 */
674 if (bp->b_lblkno == bp->b_blkno) {
675 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
676 }
677
678 /*
679 * Set the *dirty* buffer range based upon the VM system dirty pages.
680 */
681 vfs_setdirty(bp);
682
683 /*
684 * We need to do this here to satisfy the vnode_pager and the
685 * pageout daemon, so that it thinks that the pages have been
686 * "cleaned". Note that since the pages are in a delayed write
687 * buffer -- the VFS layer "will" see that the pages get written
688 * out on the next sync, or perhaps the cluster will be completed.
689 */
690 vfs_clean_pages(bp);
691 bqrelse(bp);
692
693 /*
694 * Wakeup the buffer flushing daemon if we have saturated the
695 * buffer cache.
696 */
697
698 bd_wakeup(hidirtybuffers);
699
700 /*
701 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
702 * due to the softdep code.
703 */
704#if 0
705 /*
706 * XXX The soft dependency code is not prepared to
707 * have I/O done when a bdwrite is requested. For
708 * now we just let the write be delayed if it is
709 * requested by the soft dependency code.
710 */
711 if ((vp = bp->b_vp) &&
712 ((vp->v_type == VBLK && vp->v_specmountpoint &&
713 (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
714 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
715 return;
716#endif
717}
718
719/*
720 * bdirty:
721 *
722 * Turn buffer into delayed write request. We must clear B_READ and
723 * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
724 * itself to properly update it in the dirty/clean lists. We mark it
725 * B_DONE to ensure that any asynchronization of the buffer properly
726 * clears B_DONE ( else a panic will occur later ).
727 *
728 * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
729 * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
730 * should only be called if the buffer is known-good.
731 *
732 * Since the buffer is not on a queue, we do not update the numfreebuffers
733 * count.
734 *
735 * Must be called at splbio().
736 * The buffer must be on QUEUE_NONE.
737 */
738void
739bdirty(bp)
740 struct buf *bp;
741{
742 KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
743 bp->b_flags &= ~(B_READ|B_RELBUF);
744
745 if ((bp->b_flags & B_DELWRI) == 0) {
746 bp->b_flags |= B_DONE | B_DELWRI;
747 reassignbuf(bp, bp->b_vp);
748 ++numdirtybuffers;
749 bd_wakeup(hidirtybuffers);
750 }
751}
752
753/*
754 * bundirty:
755 *
756 * Clear B_DELWRI for buffer.
757 *
758 * Since the buffer is not on a queue, we do not update the numfreebuffers
759 * count.
760 *
761 * Must be called at splbio().
762 * The buffer must be on QUEUE_NONE.
763 */
764
765void
766bundirty(bp)
767 struct buf *bp;
768{
769 KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
770
771 if (bp->b_flags & B_DELWRI) {
772 bp->b_flags &= ~B_DELWRI;
773 reassignbuf(bp, bp->b_vp);
774 --numdirtybuffers;
775 numdirtywakeup();
776 }
777}
778
779/*
780 * bawrite:
781 *
782 * Asynchronous write. Start output on a buffer, but do not wait for
783 * it to complete. The buffer is released when the output completes.
784 *
785 * bwrite() ( or the VOP routine anyway ) is responsible for handling
786 * B_INVAL buffers. Not us.
787 */
788void
789bawrite(struct buf * bp)
790{
791 bp->b_flags |= B_ASYNC;
792 (void) VOP_BWRITE(bp->b_vp, bp);
793}
794
795/*
796 * bowrite:
797 *
798 * Ordered write. Start output on a buffer, and flag it so that the
799 * device will write it in the order it was queued. The buffer is
800 * released when the output completes. bwrite() ( or the VOP routine
801 * anyway ) is responsible for handling B_INVAL buffers.
802 */
803int
804bowrite(struct buf * bp)
805{
806 bp->b_flags |= B_ORDERED | B_ASYNC;
807 return (VOP_BWRITE(bp->b_vp, bp));
808}
809
810/*
811 * bwillwrite:
812 *
813 * Called prior to the locking of any vnodes when we are expecting to
814 * write. We do not want to starve the buffer cache with too many
815 * dirty buffers so we block here. By blocking prior to the locking
816 * of any vnodes we attempt to avoid the situation where a locked vnode
817 * prevents the various system daemons from flushing related buffers.
818 */
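
/*
 * For example (hypothetical limits): with lodirtybuffers = 60 and
 * hidirtybuffers = 120, the slop computed below is 12 buffers, so a writer
 * only blocks once numdirtybuffers exceeds 132 and then sleeps until the
 * flushing daemon brings the count back under 120.
 */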
819
820void
821bwillwrite(void)
822{
823 int twenty = (hidirtybuffers - lodirtybuffers) / 5;
824
825 if (numdirtybuffers > hidirtybuffers + twenty) {
826 int s;
827
828 s = splbio();
829 while (numdirtybuffers > hidirtybuffers) {
830 bd_wakeup(hidirtybuffers);
831 needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
832 tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
833 }
834 splx(s);
835 }
836}
837
838/*
839 * brelse:
840 *
841 * Release a busy buffer and, if requested, free its resources. The
842 * buffer will be stashed in the appropriate bufqueue[] allowing it
843 * to be accessed later as a cache entity or reused for other purposes.
844 */
845void
846brelse(struct buf * bp)
847{
848 int s;
849
850 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
851
852#if 0
853 if (bp->b_flags & B_CLUSTER) {
854 relpbuf(bp, NULL);
855 return;
856 }
857#endif
858
859 s = splbio();
860
861 if (bp->b_flags & B_LOCKED)
862 bp->b_flags &= ~B_ERROR;
863
864 if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) {
865 /*
866 * Failed write, redirty. Must clear B_ERROR to prevent
867 * pages from being scrapped. Note: B_INVAL is ignored
868 * here but will presumably be dealt with later.
869 */
870 bp->b_flags &= ~B_ERROR;
871 bdirty(bp);
872 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
873 (bp->b_bufsize <= 0)) {
874 /*
875 * Either a failed I/O or we were asked to free or not
876 * cache the buffer.
877 */
878 bp->b_flags |= B_INVAL;
879 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
880 (*bioops.io_deallocate)(bp);
881 if (bp->b_flags & B_DELWRI) {
882 --numdirtybuffers;
883 numdirtywakeup();
884 }
885 bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
886 if ((bp->b_flags & B_VMIO) == 0) {
887 if (bp->b_bufsize)
888 allocbuf(bp, 0);
889 if (bp->b_vp)
890 brelvp(bp);
891 }
892 }
893
894 /*
895 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
896 * is called with B_DELWRI set, the underlying pages may wind up
897 * getting freed causing a previous write (bdwrite()) to get 'lost'
898 * because pages associated with a B_DELWRI bp are marked clean.
899 *
900 * We still allow the B_INVAL case to call vfs_vmio_release(), even
901 * if B_DELWRI is set.
902 */
903
904 if (bp->b_flags & B_DELWRI)
905 bp->b_flags &= ~B_RELBUF;
906
907 /*
908 * VMIO buffer rundown. It is not usually necessary to keep a VMIO buffer
909 * constituted, not even NFS buffers now. Two flags affect this. If
910 * B_INVAL, the struct buf is invalidated but the VM object is kept
911 * around ( i.e. so it is trivial to reconstitute the buffer later ).
912 *
913 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
914 * invalidated. B_ERROR cannot be set for a failed write unless the
915 * buffer is also B_INVAL because it hits the re-dirtying code above.
916 *
917 * Normally we can do this whether a buffer is B_DELWRI or not. If
918 * the buffer is an NFS buffer, it is tracking piecemeal writes or
919 * the commit state and we cannot afford to lose the buffer.
920 */
921 if ((bp->b_flags & B_VMIO)
922 && !(bp->b_vp->v_tag == VT_NFS &&
923 bp->b_vp->v_type != VBLK &&
924 (bp->b_flags & B_DELWRI))
925 ) {
926
927 int i, j, resid;
928 vm_page_t m;
929 off_t foff;
930 vm_pindex_t poff;
931 vm_object_t obj;
932 struct vnode *vp;
933
934 vp = bp->b_vp;
935
936 /*
937 * Get the base offset and length of the buffer. Note that
938 * for block sizes that are less than PAGE_SIZE, the b_data
939 * base of the buffer does not represent exactly b_offset and
940 * neither b_offset nor b_size is necessarily page aligned.
941 * Instead, the starting position of b_offset is:
942 *
943 * b_data + (b_offset & PAGE_MASK)
944 *
945 * block sizes less than DEV_BSIZE (usually 512) are not
946 * supported due to the page granularity bits (m->valid,
947 * m->dirty, etc...).
948 *
949 * See man buf(9) for more information
950 */
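
		/*
		 * Concrete example (assuming 4K pages): a 2K block with
		 * b_offset = 6144 has (b_offset & PAGE_MASK) = 2048, so its
		 * data starts 2K into its first page and the loop below
		 * accounts for that partial page via (foff & PAGE_MASK).
		 */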
951
952 resid = bp->b_bufsize;
953 foff = bp->b_offset;
954
955 for (i = 0; i < bp->b_npages; i++) {
956 m = bp->b_pages[i];
957 vm_page_flag_clear(m, PG_ZERO);
958 if (m == bogus_page) {
959
960 obj = (vm_object_t) vp->v_object;
961 poff = OFF_TO_IDX(bp->b_offset);
962
963 for (j = i; j < bp->b_npages; j++) {
964 m = bp->b_pages[j];
965 if (m == bogus_page) {
966 m = vm_page_lookup(obj, poff + j);
967#if !defined(MAX_PERF)
968 if (!m) {
969 panic("brelse: page missing\n");
970 }
971#endif
972 bp->b_pages[j] = m;
973 }
974 }
975
976 if ((bp->b_flags & B_INVAL) == 0) {
977 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
978 }
979 }
980 if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
981 int poffset = foff & PAGE_MASK;
982 int presid = resid > (PAGE_SIZE - poffset) ?
983 (PAGE_SIZE - poffset) : resid;
984
985 KASSERT(presid >= 0, ("brelse: extra page"));
986 vm_page_set_invalid(m, poffset, presid);
987 }
988 resid -= PAGE_SIZE - (foff & PAGE_MASK);
989 foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
990 }
991
992 if (bp->b_flags & (B_INVAL | B_RELBUF))
993 vfs_vmio_release(bp);
994
995 } else if (bp->b_flags & B_VMIO) {
996
997 if (bp->b_flags & (B_INVAL | B_RELBUF))
998 vfs_vmio_release(bp);
999
1000 }
1001
1002#if !defined(MAX_PERF)
1003 if (bp->b_qindex != QUEUE_NONE)
1004 panic("brelse: free buffer onto another queue???");
1005#endif
1006 if (BUF_REFCNT(bp) > 1) {
1007 /* Temporary panic to verify exclusive locking */
1008 /* This panic goes away when we allow shared refs */
1009 panic("brelse: multiple refs");
1010 /* do not release to free list */
1011 BUF_UNLOCK(bp);
1012 splx(s);
1013 return;
1014 }
1015
1016 /* enqueue */
1017
1018 /* buffers with no memory */
1019 if (bp->b_bufsize == 0) {
1020 bp->b_flags |= B_INVAL;
1021 if (bp->b_kvasize)
1022 bp->b_qindex = QUEUE_EMPTYKVA;
1023 else
1024 bp->b_qindex = QUEUE_EMPTY;
1025 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1026 LIST_REMOVE(bp, b_hash);
1027 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1028 bp->b_dev = NODEV;
1029 kvafreespace += bp->b_kvasize;
1030 if (bp->b_kvasize)
1031 kvaspacewakeup();
1032 /* buffers with junk contents */
1033 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
1034 bp->b_flags |= B_INVAL;
1035 bp->b_qindex = QUEUE_CLEAN;
1036 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1037 LIST_REMOVE(bp, b_hash);
1038 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1039 bp->b_dev = NODEV;
1040
1041 /* buffers that are locked */
1042 } else if (bp->b_flags & B_LOCKED) {
1043 bp->b_qindex = QUEUE_LOCKED;
1044 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1045
1046 /* remaining buffers */
1047 } else {
1048 switch(bp->b_flags & (B_DELWRI|B_AGE)) {
1049 case B_DELWRI | B_AGE:
1050 bp->b_qindex = QUEUE_DIRTY;
1051 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1052 break;
1053 case B_DELWRI:
1054 bp->b_qindex = QUEUE_DIRTY;
1055 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1056 break;
1057 case B_AGE:
1058 bp->b_qindex = QUEUE_CLEAN;
1059 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1060 break;
1061 default:
1062 bp->b_qindex = QUEUE_CLEAN;
1063 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1064 break;
1065 }
1066 }
1067
1068 /*
1069 * If B_INVAL, clear B_DELWRI. We've already placed the buffer
1070 * on the correct queue.
1071 */
1072 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
1073 bp->b_flags &= ~B_DELWRI;
1074 --numdirtybuffers;
1075 numdirtywakeup();
1076 }
1077
1078 runningbufspace -= bp->b_bufsize;
1079
1080 /*
1081 * Fixup numfreebuffers count. The bp is on an appropriate queue
1082 * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
1083 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1084 * if B_INVAL is set ).
1085 */
1086
1087 if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
1088 bufcountwakeup();
1089
1090 /*
1091 * Something we can maybe free.
1092 */
1093
1094 if (bp->b_bufsize)
1095 bufspacewakeup();
1096
1097 /* unlock */
1098 BUF_UNLOCK(bp);
1099 bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1100 splx(s);
1101}
1102
1103/*
1104 * Release a buffer back to the appropriate queue but do not try to free
1105 * it.
1106 *
1107 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1108 * biodone() to requeue an async I/O on completion. It is also used when
1109 * known good buffers need to be requeued but we think we may need the data
1110 * again soon.
1111 */
1112void
1113bqrelse(struct buf * bp)
1114{
1115 int s;
1116
1117 s = splbio();
1118
1119 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1120
1121#if !defined(MAX_PERF)
1122 if (bp->b_qindex != QUEUE_NONE)
1123 panic("bqrelse: free buffer onto another queue???");
1124#endif
1125 if (BUF_REFCNT(bp) > 1) {
1126 /* do not release to free list */
1127 panic("bqrelse: multiple refs");
1128 BUF_UNLOCK(bp);
1129 splx(s);
1130 return;
1131 }
1132 if (bp->b_flags & B_LOCKED) {
1133 bp->b_flags &= ~B_ERROR;
1134 bp->b_qindex = QUEUE_LOCKED;
1135 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1136 /* buffers with stale but valid contents */
1137 } else if (bp->b_flags & B_DELWRI) {
1138 bp->b_qindex = QUEUE_DIRTY;
1139 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1140 } else {
1141 bp->b_qindex = QUEUE_CLEAN;
1142 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1143 }
1144
1145 runningbufspace -= bp->b_bufsize;
1146
1147 if ((bp->b_flags & B_LOCKED) == 0 &&
1148 ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
1149 bufcountwakeup();
1150 }
1151
1152 /*
1153 * Something we can maybe wakeup
1154 */
1155 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1156 bufspacewakeup();
1157
1158 /* unlock */
1159 BUF_UNLOCK(bp);
1160 bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1161 splx(s);
1162}
1163
1164static void
1165vfs_vmio_release(bp)
1166 struct buf *bp;
1167{
1168 int i, s;
1169 vm_page_t m;
1170
1171 s = splvm();
1172 for (i = 0; i < bp->b_npages; i++) {
1173 m = bp->b_pages[i];
1174 bp->b_pages[i] = NULL;
1175 /*
1176 * In order to keep page LRU ordering consistent, put
1177 * everything on the inactive queue.
1178 */
1179 vm_page_unwire(m, 0);
1180 /*
1181 * We don't mess with busy pages; it is
1182 * the responsibility of the process that
1183 * busied the pages to deal with them.
1184 */
1185 if ((m->flags & PG_BUSY) || (m->busy != 0))
1186 continue;
1187
1188 if (m->wire_count == 0) {
1189 vm_page_flag_clear(m, PG_ZERO);
1190 /*
1191 * Might as well free the page if we can and it has
1192 * no valid data.
1193 */
1194 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
1195 vm_page_busy(m);
1196 vm_page_protect(m, VM_PROT_NONE);
1197 vm_page_free(m);
1198 }
1199 }
1200 }
1201 bufspace -= bp->b_bufsize;
1202 vmiospace -= bp->b_bufsize;
1203 runningbufspace -= bp->b_bufsize;
1204 splx(s);
1205 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
1206 if (bp->b_bufsize)
1207 bufspacewakeup();
1208 bp->b_npages = 0;
1209 bp->b_bufsize = 0;
1210 bp->b_flags &= ~B_VMIO;
1211 if (bp->b_vp)
1212 brelvp(bp);
1213}
1214
1215/*
1216 * Check to see if a block is currently memory resident.
1217 */
1218struct buf *
1219gbincore(struct vnode * vp, daddr_t blkno)
1220{
1221 struct buf *bp;
1222 struct bufhashhdr *bh;
1223
1224 bh = bufhash(vp, blkno);
1225 bp = bh->lh_first;
1226
1227 /* Search hash chain */
1228 while (bp != NULL) {
1229 /* hit */
1230 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
1231 (bp->b_flags & B_INVAL) == 0) {
1232 break;
1233 }
1234 bp = bp->b_hash.le_next;
1235 }
1236 return (bp);
1237}
1238
1239/*
1240 * vfs_bio_awrite:
1241 *
1242 * Implement clustered async writes for clearing out B_DELWRI buffers.
1243 * This is much better than the old way of writing only one buffer at
1244 * a time. Note that we may not be presented with the buffers in the
1245 * correct order, so we search for the cluster in both directions.
1246 */
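
/*
 * Illustrative scan (hypothetical numbers): for an 8K-block filesystem,
 * size >> DEV_BSHIFT is 16 sectors, so starting from lblkno the forward loop
 * accepts neighbors whose b_blkno values advance by 16 per block, the
 * backward loop does the same in reverse, and the ncl contiguous buffers
 * found are handed to cluster_wbuild() as a single write.
 */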
1247int
1248vfs_bio_awrite(struct buf * bp)
1249{
1250 int i;
1251 int j;
1252 daddr_t lblkno = bp->b_lblkno;
1253 struct vnode *vp = bp->b_vp;
1254 int s;
1255 int ncl;
1256 struct buf *bpa;
1257 int nwritten;
1258 int size;
1259 int maxcl;
1260
1261 s = splbio();
1262 /*
1263 * right now we support clustered writing only to regular files. If
1264 * we find a clusterable block we could be in the middle of a cluster
1265 * rather than at the beginning.
1266 */
1267 if ((vp->v_type == VREG) &&
1268 (vp->v_mount != 0) && /* Only on nodes that have the size info */
1269 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1270
1271 size = vp->v_mount->mnt_stat.f_iosize;
1272 maxcl = MAXPHYS / size;
1273
1274 for (i = 1; i < maxcl; i++) {
1275 if ((bpa = gbincore(vp, lblkno + i)) &&
1276 BUF_REFCNT(bpa) == 0 &&
1277 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1278 (B_DELWRI | B_CLUSTEROK)) &&
1279 (bpa->b_bufsize == size)) {
1280 if ((bpa->b_blkno == bpa->b_lblkno) ||
1281 (bpa->b_blkno !=
1282 bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
1283 break;
1284 } else {
1285 break;
1286 }
1287 }
1288 for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
1289 if ((bpa = gbincore(vp, lblkno - j)) &&
1290 BUF_REFCNT(bpa) == 0 &&
1291 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1292 (B_DELWRI | B_CLUSTEROK)) &&
1293 (bpa->b_bufsize == size)) {
1294 if ((bpa->b_blkno == bpa->b_lblkno) ||
1295 (bpa->b_blkno !=
1296 bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
1297 break;
1298 } else {
1299 break;
1300 }
1301 }
1302 --j;
1303 ncl = i + j;
1304 /*
1305 * this is a possible cluster write
1306 */
1307 if (ncl != 1) {
1308 nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
1309 splx(s);
1310 return nwritten;
1311 }
1312 }
1313
1314 BUF_LOCK(bp, LK_EXCLUSIVE);
1315 bremfree(bp);
1316 bp->b_flags |= B_ASYNC;
1317
1318 splx(s);
1319 /*
1320 * default (old) behavior, writing out only one block
1321 *
1322 * XXX returns b_bufsize instead of b_bcount for nwritten?
1323 */
1324 nwritten = bp->b_bufsize;
1325 (void) VOP_BWRITE(bp->b_vp, bp);
1326
1327 return nwritten;
1328}
1329
1330/*
1331 * getnewbuf:
1332 *
1333 * Find and initialize a new buffer header, freeing up existing buffers
1334 * in the bufqueues as necessary. The new buffer is returned locked.
1335 *
1336 * Important: B_INVAL is not set. If the caller wishes to throw the
1337 * buffer away, the caller must set B_INVAL prior to calling brelse().
1338 *
1339 * We block if:
1340 * We have insufficient buffer headers
1341 * We have insufficient buffer space
1342 * buffer_map is too fragmented ( space reservation fails )
1343 * If we have to flush dirty buffers ( but we try to avoid this )
1344 *
1345 * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1346 * Instead we ask the buf daemon to do it for us. We attempt to
1347 * avoid piecemeal wakeups of the pageout daemon.
1348 */
1349
1350static struct buf *
1351getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
1352{
1353 struct buf *bp;
1354 struct buf *nbp;
1355 struct buf *dbp;
1356 int outofspace;
1357 int nqindex;
1358 int defrag = 0;
1359
1360 ++getnewbufcalls;
1361 --getnewbufrestarts;
1362restart:
1363 ++getnewbufrestarts;
1364
1365 /*
1366 * Calculate whether we are out of buffer space. This state is
1367 * recalculated on every restart. If we are out of space, we
1368 * have to turn off defragmentation. Setting defrag to -1 when
1369 * outofspace is positive means "defrag while freeing buffers".
1370 * The looping conditional will be muffed up if defrag is left
1371 * positive when outofspace is positive.
1372 */
1373
1374 dbp = NULL;
1375 outofspace = 0;
1376 if (bufspace >= hibufspace) {
1377 if ((curproc->p_flag & P_BUFEXHAUST) == 0 ||
1378 bufspace >= maxbufspace) {
1379 outofspace = 1;
1380 if (defrag > 0)
1381 defrag = -1;
1382 }
1383 }
1384
1385 /*
1386 * defrag state is semi-persistent. 1 means we are flagged for
1387 * defragging. -1 means we actually defragged something.
1388 */
1389 /* nop */
1390
1391 /*
1392 * Setup for scan. If we do not have enough free buffers,
1393 * we setup a degenerate case that immediately fails. Note
1394 * that if we are a specially marked process, we are allowed to
1395 * dip into our reserves.
1396 *
1397 * Normally we want to find an EMPTYKVA buffer. That is, a
1398 * buffer with kva already allocated. If there are no EMPTYKVA
1399 * buffers we back up to the truly EMPTY buffers. When defragging
1400 * we do not bother backing up since we have to locate buffers with
1401 * kva to defrag. If we are out of space we skip both EMPTY and
1402 * EMPTYKVA and dig right into the CLEAN queue.
1403 *
1404 * In this manner we avoid scanning unnecessary buffers. It is very
1405 * important for us to do this because the buffer cache is almost
1406 * constantly out of space or in need of defragmentation.
1407 */
1408
1409 if ((curproc->p_flag & P_BUFEXHAUST) == 0 &&
1410 numfreebuffers < lofreebuffers) {
1411 nqindex = QUEUE_CLEAN;
1412 nbp = NULL;
1413 } else {
1414 nqindex = QUEUE_EMPTYKVA;
1415 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
1416 if (nbp == NULL) {
1417 if (defrag <= 0) {
1418 nqindex = QUEUE_EMPTY;
1419 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1420 }
1421 }
1422 if (outofspace || nbp == NULL) {
1423 nqindex = QUEUE_CLEAN;
1424 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
1425 }
1426 }
1427
1428 /*
1429 * Run scan, possibly freeing data and/or kva mappings on the fly
1430 * depending.
1431 */
1432
1433 while ((bp = nbp) != NULL) {
1434 int qindex = nqindex;
1435
1436 /*
1437 * Calculate next bp ( we can only use it if we do not block
1438 * or do other fancy things ).
1439 */
1440 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1441 switch(qindex) {
1442 case QUEUE_EMPTY:
1443 nqindex = QUEUE_EMPTYKVA;
1444 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
1445 break;
1446 /* fall through */
1447 case QUEUE_EMPTYKVA:
1448 nqindex = QUEUE_CLEAN;
1449 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
1450 break;
1451 /* fall through */
1452 case QUEUE_CLEAN:
1453 /*
1454 * nbp is NULL.
1455 */
1456 break;
1457 }
1458 }
1459
1460 /*
1461 * Sanity Checks
1462 */
1463 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
1464
1465 /*
1466 * Note: we no longer distinguish between VMIO and non-VMIO
1467 * buffers.
1468 */
1469
1470 KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1471
1472 /*
1473 * If we are defragging and the buffer isn't useful for fixing
1474 * that problem we continue. If we are out of space and the
1475 * buffer isn't useful for fixing that problem we continue.
1476 */
1477
1478 if (defrag > 0 && bp->b_kvasize == 0)
1479 continue;
1480 if (outofspace > 0 && bp->b_bufsize == 0)
1481 continue;
1482
1483 /*
1484 * Start freeing the bp. This is somewhat involved. nbp
1485 * remains valid only for QUEUE_EMPTY[KVA] bp's.
1486 */
1487
1488 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1489 panic("getnewbuf: locked buf");
1490 bremfree(bp);
1491
1492 if (qindex == QUEUE_CLEAN) {
1493 if (bp->b_flags & B_VMIO) {
1494 bp->b_flags &= ~B_ASYNC;
1495 vfs_vmio_release(bp);
1496 }
1497 if (bp->b_vp)
1498 brelvp(bp);
1499 }
1500
1501 /*
1502 * NOTE: nbp is now entirely invalid. We can only restart
1503 * the scan from this point on.
1504 *
1505 * Get the rest of the buffer freed up. b_kva* is still
1506 * valid after this operation.
1507 */
1508
1509 if (bp->b_rcred != NOCRED) {
1510 crfree(bp->b_rcred);
1511 bp->b_rcred = NOCRED;
1512 }
1513 if (bp->b_wcred != NOCRED) {
1514 crfree(bp->b_wcred);
1515 bp->b_wcred = NOCRED;
1516 }
1517 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
1518 (*bioops.io_deallocate)(bp);
1519 LIST_REMOVE(bp, b_hash);
1520 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1521
1522 if (bp->b_bufsize)
1523 allocbuf(bp, 0);
1524
1525 bp->b_flags = 0;
1526 bp->b_dev = NODEV;
1527 bp->b_vp = NULL;
1528 bp->b_blkno = bp->b_lblkno = 0;
1529 bp->b_offset = NOOFFSET;
1530 bp->b_iodone = 0;
1531 bp->b_error = 0;
1532 bp->b_resid = 0;
1533 bp->b_bcount = 0;
1534 bp->b_npages = 0;
1535 bp->b_dirtyoff = bp->b_dirtyend = 0;
1536
1537 LIST_INIT(&bp->b_dep);
1538
1539 /*
1540 * Ok, now that we have a free buffer, if we are defragging
1541 * we have to recover the kvaspace. If we are out of space
1542 * we have to free the buffer (which we just did), but we
1543 * do not have to recover kva space unless we hit a defrag
1544 * hiccup. Being able to avoid freeing the kva space leads
1545 * to a significant reduction in overhead.
1546 */
1547
1548 if (defrag > 0) {
1549 defrag = -1;
1550 bp->b_flags |= B_INVAL;
1551 bfreekva(bp);
1552 brelse(bp);
1553 goto restart;
1554 }
1555
1556 if (outofspace > 0) {
1557 outofspace = -1;
1558 bp->b_flags |= B_INVAL;
1559 if (defrag < 0)
1560 bfreekva(bp);
1561 brelse(bp);
1562 goto restart;
1563 }
1564
1565 /*
1566 * We are done
1567 */
1568 break;
1569 }
1570
1571 /*
1572 * If we exhausted our list, sleep as appropriate. We may have to
1573 * wakeup various daemons and write out some dirty buffers.
1574 *
1575 * Generally we are sleeping due to insufficient buffer space.
1576 */
1577
1578 if (bp == NULL) {
1579 int flags;
1580 char *waitmsg;
1581
1582dosleep:
1583 if (defrag > 0) {
1584 flags = VFS_BIO_NEED_KVASPACE;
1585 waitmsg = "nbufkv";
1586 } else if (outofspace > 0) {
1587 waitmsg = "nbufbs";
1588 flags = VFS_BIO_NEED_BUFSPACE;
1589 } else {
1590 waitmsg = "newbuf";
1591 flags = VFS_BIO_NEED_ANY;
1592 }
1593
1594 /* XXX */
1595
1596 (void) speedup_syncer();
1597 needsbuffer |= flags;
1598 while (needsbuffer & flags) {
1599 if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
1600 waitmsg, slptimeo))
1601 return (NULL);
1602 }
1603 } else {
1604 /*
1605 * We finally have a valid bp. We aren't quite out of the
1606 * woods; we still have to reserve kva space.
1607 */
1608 vm_offset_t addr = 0;
1609
1610 maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1611
1612 if (maxsize != bp->b_kvasize) {
1613 bfreekva(bp);
1614
1615 if (vm_map_findspace(buffer_map,
1616 vm_map_min(buffer_map), maxsize, &addr)) {
1617 /*
1618 * Uh oh. Buffer map is too fragmented. Try
1619 * to defragment.
1620 */
1621 if (defrag <= 0) {
1622 defrag = 1;
1623 bp->b_flags |= B_INVAL;
1624 brelse(bp);
1625 goto restart;
1626 }
1627 /*
1628 * Uh oh. We couldn't seem to defragment
1629 */
1630 bp = NULL;
1631 goto dosleep;
1632 }
1633 }
1634 if (addr) {
1635 vm_map_insert(buffer_map, NULL, 0,
1636 addr, addr + maxsize,
1637 VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1638
1639 bp->b_kvabase = (caddr_t) addr;
1640 bp->b_kvasize = maxsize;
1641 }
1642 bp->b_data = bp->b_kvabase;
1643 }
1644 return(bp);
1645}
1646
1647/*
1648 * waitfreebuffers:
1649 *
1650 * Wait for sufficient free buffers. Only called from normal processes.
1651 */
1652
1653static void
1654waitfreebuffers(int slpflag, int slptimeo)
1655{
1656 while (numfreebuffers < hifreebuffers) {
1657 if (numfreebuffers >= hifreebuffers)
1658 break;
1659 needsbuffer |= VFS_BIO_NEED_FREE;
1660 if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1661 break;
1662 }
1663}
1664
1665/*
1666 * buf_daemon:
1667 *
1668 * buffer flushing daemon. Buffers are normally flushed by the
1669 * update daemon but if it cannot keep up this process starts to
1670 * take the load in an attempt to prevent getnewbuf() from blocking.
1671 */
1672
1673static struct proc *bufdaemonproc;
1674static int bd_interval;
1675static int bd_flushto;
1676
1677static struct kproc_desc buf_kp = {
1678 "bufdaemon",
1679 buf_daemon,
1680 &bufdaemonproc
1681};
1682SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
1683
1684static void
1685buf_daemon()
1686{
1687 int s;
1688 /*
1689 * This process is allowed to take the buffer cache to the limit
1690 */
1691 curproc->p_flag |= P_BUFEXHAUST;
1692 s = splbio();
1693
1694 bd_interval = 5 * hz; /* dynamically adjusted */
1695 bd_flushto = hidirtybuffers; /* dynamically adjusted */
1696
1697 while (TRUE) {
1698 bd_request = 0;
1699
1700 /*
1701 * Do the flush. Limit the number of buffers we flush in one
1702 * go. The failure condition occurs when processes are writing
1703 * buffers faster than we can dispose of them. In this case
1704 * we may be flushing so often that the previous set of flushes
1705 * have not had time to complete, causing us to run out of
1706 * physical buffers and block.
1707 */
1708 {
1709 int runcount = maxbdrun;
1710
1711 while (numdirtybuffers > bd_flushto && runcount) {
1712 --runcount;
1713 if (flushbufqueues() == 0)
1714 break;
1715 }
1716 }
1717
1718 /*
1719 * If nobody is requesting anything we sleep
1720 */
1721 if (bd_request == 0)
1722 tsleep(&bd_request, PVM, "psleep", bd_interval);
1723
1724 /*
1725 * We calculate how much to add or subtract from bd_flushto
1726 * and bd_interval based on how far off we are from the
1727 * optimal number of dirty buffers, which is 20% below the
1728 * hidirtybuffers mark. We cannot use hidirtybuffers straight
1729 * because being right on the mark will cause getnewbuf()
1730 * to oscillate our wakeup.
1731 *
1732 * The larger the error in either direction, the more we adjust
1733 * bd_flushto and bd_interval. The time interval is adjusted
1734 * by 2 seconds per whole-buffer-range of error. This is an
1735 * exponential convergence algorithm, with large errors
1736 * producing large changes and small errors producing small
1737 * changes.
1738 */
1739
1740 {
1741 int brange = hidirtybuffers - lodirtybuffers;
1742 int middb = hidirtybuffers - brange / 5;
1743 int deltabuf = middb - numdirtybuffers;
1744
1745 bd_flushto += deltabuf / 20;
1746 bd_interval += deltabuf * (2 * hz) / (brange * 1);
1747 }
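
		/*
		 * Worked example (hypothetical limits): with lodirtybuffers =
		 * 60 and hidirtybuffers = 120, brange is 60 and middb is 108.
		 * If numdirtybuffers is 48 the error is a full buffer range
		 * (deltabuf = 60), so bd_flushto rises by 3 and bd_interval
		 * by 2 * hz -- the "2 seconds per whole-buffer-range" rate
		 * described above -- before being clamped below.
		 */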
1748 if (bd_flushto < lodirtybuffers)
1749 bd_flushto = lodirtybuffers;
1750 if (bd_flushto > hidirtybuffers)
1751 bd_flushto = hidirtybuffers;
1752 if (bd_interval < hz / 10)
1753 bd_interval = hz / 10;
1754 if (bd_interval > 5 * hz)
1755 bd_interval = 5 * hz;
1756 }
1757}
1758
1759/*
1760 * flushbufqueues:
1761 *
1762 * Try to flush a buffer in the dirty queue. We must be careful to
1763 * free up B_INVAL buffers instead of write them, which NFS is
1764 * particularly sensitive to.
1765 */
1766
1767static int
1768flushbufqueues(void)
1769{
1770 struct buf *bp;
1771 int r = 0;
1772
1773 bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
1774
1775 while (bp) {
1776 KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
1777 if ((bp->b_flags & B_DELWRI) != 0) {
1778 if (bp->b_flags & B_INVAL) {
1779 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1780 panic("flushbufqueues: locked buf");
1781 bremfree(bp);
1782 brelse(bp);
1783 ++r;
1784 break;
1785 }
1786 vfs_bio_awrite(bp);
1787 ++r;
1788 break;
1789 }
1790 bp = TAILQ_NEXT(bp, b_freelist);
1791 }
1792 return(r);
1793}
1794
1795/*
1796 * Check to see if a block is currently memory resident.
1797 */
1798struct buf *
1799incore(struct vnode * vp, daddr_t blkno)
1800{
1801 struct buf *bp;
1802
1803 int s = splbio();
1804 bp = gbincore(vp, blkno);
1805 splx(s);
1806 return (bp);
1807}
1808
1809/*
1810 * Returns true if no I/O is needed to access the
1811 * associated VM object. This is like incore except
1812 * it also hunts around in the VM system for the data.
1813 */
1814
1815int
1816inmem(struct vnode * vp, daddr_t blkno)
1817{
1818 vm_object_t obj;
1819 vm_offset_t toff, tinc, size;
1820 vm_page_t m;
1821 vm_ooffset_t off;
1822
1823 if (incore(vp, blkno))
1824 return 1;
1825 if (vp->v_mount == NULL)
1826 return 0;
1827 if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1828 return 0;
1829
1830 obj = vp->v_object;
1831 size = PAGE_SIZE;
1832 if (size > vp->v_mount->mnt_stat.f_iosize)
1833 size = vp->v_mount->mnt_stat.f_iosize;
1834 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1835
1836 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1837 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1838 if (!m)
1839 return 0;
1840 tinc = size;
1841 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1842 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1843 if (vm_page_is_valid(m,
1844 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1845 return 0;
1846 }
1847 return 1;
1848}
1849
1850/*
1851 * vfs_setdirty:
1852 *
1853 * Sets the dirty range for a buffer based on the status of the dirty
1854 * bits in the pages comprising the buffer.
1855 *
1856 * The range is limited to the size of the buffer.
1857 *
1858 * This routine is primarily used by NFS, but is generalized for the
1859 * B_VMIO case.
1860 */
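
/*
 * For example (assuming 4K pages): in a page-aligned 16K buffer where only
 * the third page was dirtied through a user mapping, the scan below computes
 * boffset = 8192 and eoffset = 12288, and only that range is merged into
 * b_dirtyoff/b_dirtyend.
 */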
1861static void
1862vfs_setdirty(struct buf *bp)
1863{
1864 int i;
1865 vm_object_t object;
1866
1867 /*
1868 * Degenerate case - empty buffer
1869 */
1870
1871 if (bp->b_bufsize == 0)
1872 return;
1873
1874 /*
1875 * We qualify the scan for modified pages on whether the
1876 * object has been flushed yet. The OBJ_WRITEABLE flag
1877 * is not cleared simply by protecting pages off.
1878 */
1879
1880 if ((bp->b_flags & B_VMIO) == 0)
1881 return;
1882
1883 object = bp->b_pages[0]->object;
1884
1885 if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
1886 printf("Warning: object %p writeable but not mightbedirty\n", object);
1887 if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
1888 printf("Warning: object %p mightbedirty but not writeable\n", object);
1889
1890 if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
1891 vm_offset_t boffset;
1892 vm_offset_t eoffset;
1893
1894 /*
1895 * test the pages to see if they have been modified directly
1896 * by users through the VM system.
1897 */
1898 for (i = 0; i < bp->b_npages; i++) {
1899 vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1900 vm_page_test_dirty(bp->b_pages[i]);
1901 }
1902
1903 /*
1904 * Calculate the encompassing dirty range, boffset and eoffset,
1905 * (eoffset - boffset) bytes.
1906 */
1907
1908 for (i = 0; i < bp->b_npages; i++) {
1909 if (bp->b_pages[i]->dirty)
1910 break;
1911 }
1912 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1913
1914 for (i = bp->b_npages - 1; i >= 0; --i) {
1915 if (bp->b_pages[i]->dirty) {
1916 break;
1917 }
1918 }
1919 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1920
1921 /*
1922 * Fit it to the buffer.
1923 */
1924
1925 if (eoffset > bp->b_bcount)
1926 eoffset = bp->b_bcount;
1927
1928 /*
1929 * If we have a good dirty range, merge with the existing
1930 * dirty range.
1931 */
1932
1933 if (boffset < eoffset) {
1934 if (bp->b_dirtyoff > boffset)
1935 bp->b_dirtyoff = boffset;
1936 if (bp->b_dirtyend < eoffset)
1937 bp->b_dirtyend = eoffset;
1938 }
1939 }
1940}
1941
1942/*
1943 * getblk:
1944 *
1945 * Get a block given a specified block and offset into a file/device.
1946 * The buffers B_DONE bit will be cleared on return, making it almost
1947 * ready for an I/O initiation. B_INVAL may or may not be set on
1948 * return. The caller should clear B_INVAL prior to initiating a
1949 * READ.
1950 *
1951 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
1952 * an existing buffer.
1953 *
1954 * For a VMIO buffer, B_CACHE is modified according to the backing VM.
1955 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
1956 * and then cleared based on the backing VM. If the previous buffer is
1957 * non-0-sized but invalid, B_CACHE will be cleared.
1958 *
1959 * If getblk() must create a new buffer, the new buffer is returned with
1960 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
1961 * case it is returned with B_INVAL clear and B_CACHE set based on the
1962 * backing VM.
1963 *
1964 * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
1965 * B_CACHE bit is clear.
1966 *
1967 * What this means, basically, is that the caller should use B_CACHE to
1968 * determine whether the buffer is fully valid or not and should clear
1969 * B_INVAL prior to issuing a read. If the caller intends to validate
1970 * the buffer by loading its data area with something, the caller needs
1971 * to clear B_INVAL. If the caller does this without issuing an I/O,
1972 * the caller should set B_CACHE ( as an optimization ), else the caller
1973 * should issue the I/O and biodone() will set B_CACHE if the I/O was
1974 * a write attempt or if it was a successful read. If the caller
1975 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR
1976 * prior to issuing the READ. biodone() will *not* clear B_INVAL.
1977 */
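
/*
 * A minimal sketch of the read pattern described above ( bread() earlier in
 * this file is the canonical implementation ):
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags |= B_READ;
 *		bp->b_flags &= ~(B_ERROR | B_INVAL);
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(vp, bp);
 *		error = biowait(bp);
 *	}
 *
 * biodone() sets B_CACHE when such a read completes successfully.
 */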
1978struct buf *
1979getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1980{
1981 struct buf *bp;
1982 int s;
1983 struct bufhashhdr *bh;
1984
1985#if !defined(MAX_PERF)
1986 if (size > MAXBSIZE)
1987 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1988#endif
1989
1990 s = splbio();
1991loop:
1992 /*
1993 * Block if we are low on buffers. Certain processes are allowed
1994 * to completely exhaust the buffer cache.
1995 */
1996 if (curproc->p_flag & P_BUFEXHAUST) {
1997 if (numfreebuffers == 0) {
1998 needsbuffer |= VFS_BIO_NEED_ANY;
1999 tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
2000 slptimeo);
2001 }
2002 } else if (numfreebuffers < lofreebuffers) {
2003 waitfreebuffers(slpflag, slptimeo);
2004 }
2005
2006 if ((bp = gbincore(vp, blkno))) {
2007 /*
2008 * Buffer is in-core. If the buffer is not busy, it must
2009 * be on a queue.
2010 */
2011
2012 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
2013 if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
2014 "getblk", slpflag, slptimeo) == ENOLCK)
2015 goto loop;
2016 splx(s);
2017 return (struct buf *) NULL;
2018 }
2019
2020 /*
2021 * The buffer is locked. B_CACHE is cleared if the buffer is
2022		 * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
2023 * and for a VMIO buffer B_CACHE is adjusted according to the
2024 * backing VM cache.
2025 */
2026 if (bp->b_flags & B_INVAL)
2027 bp->b_flags &= ~B_CACHE;
2028 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
2029 bp->b_flags |= B_CACHE;
2030 bremfree(bp);
2031
2032 /*
2033		 * Check for size inconsistencies in the non-VMIO case.
2034 */
2035
2036 if (bp->b_bcount != size) {
2037 if ((bp->b_flags & B_VMIO) == 0 ||
2038 (size > bp->b_kvasize)) {
2039 if (bp->b_flags & B_DELWRI) {
2040 bp->b_flags |= B_NOCACHE;
2041 VOP_BWRITE(bp->b_vp, bp);
2042 } else {
2043 if ((bp->b_flags & B_VMIO) &&
2044 (LIST_FIRST(&bp->b_dep) == NULL)) {
2045 bp->b_flags |= B_RELBUF;
2046 brelse(bp);
2047 } else {
2048 bp->b_flags |= B_NOCACHE;
2049 VOP_BWRITE(bp->b_vp, bp);
2050 }
2051 }
2052 goto loop;
2053 }
2054 }
2055
2056 /*
2057		 * If the size is inconsistent in the VMIO case, we can resize
2058 * the buffer. This might lead to B_CACHE getting set or
2059 * cleared. If the size has not changed, B_CACHE remains
2060 * unchanged from its previous state.
2061 */
2062
2063 if (bp->b_bcount != size)
2064 allocbuf(bp, size);
2065
2066 KASSERT(bp->b_offset != NOOFFSET,
2067 ("getblk: no buffer offset"));
2068
2069 /*
2070 * A buffer with B_DELWRI set and B_CACHE clear must
2071 * be committed before we can return the buffer in
2072 * order to prevent the caller from issuing a read
2073 * ( due to B_CACHE not being set ) and overwriting
2074 * it.
2075 *
2076 * Most callers, including NFS and FFS, need this to
2077 * operate properly either because they assume they
2078 * can issue a read if B_CACHE is not set, or because
2079 * ( for example ) an uncached B_DELWRI might loop due
2080 * to softupdates re-dirtying the buffer. In the latter
2081 * case, B_CACHE is set after the first write completes,
2082 * preventing further loops.
2083 */
2084
2085 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2086 VOP_BWRITE(bp->b_vp, bp);
2087 goto loop;
2088 }
2089
2090 splx(s);
2091 bp->b_flags &= ~B_DONE;
2092 } else {
2093 /*
2094 * Buffer is not in-core, create new buffer. The buffer
2095 * returned by getnewbuf() is locked. Note that the returned
2096 * buffer is also considered valid (not marked B_INVAL).
2097 */
2098 int bsize, maxsize, vmio;
2099 off_t offset;
2100
2101 if (vp->v_type == VBLK)
2102 bsize = DEV_BSIZE;
2103 else if (vp->v_mountedhere)
2104 bsize = vp->v_mountedhere->mnt_stat.f_iosize;
2105 else if (vp->v_mount)
2106 bsize = vp->v_mount->mnt_stat.f_iosize;
2107 else
2108 bsize = size;
2109
2110 offset = (off_t)blkno * bsize;
2111 vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
2112 maxsize = vmio ? size + (offset & PAGE_MASK) : size;
2113 maxsize = imax(maxsize, bsize);
2114
2115 if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
2116 if (slpflag || slptimeo) {
2117 splx(s);
2118 return NULL;
2119 }
2120 goto loop;
2121 }
2122
2123 /*
2124 * This code is used to make sure that a buffer is not
2125 * created while the getnewbuf routine is blocked.
2126 * This can be a problem whether the vnode is locked or not.
2127 * If the buffer is created out from under us, we have to
2128		 * throw away the one we just created. There is no window for a
2129		 * race because we are safely running at splbio() from the
2130 * point of the duplicate buffer creation through to here,
2131 * and we've locked the buffer.
2132 */
2133 if (gbincore(vp, blkno)) {
2134 bp->b_flags |= B_INVAL;
2135 brelse(bp);
2136 goto loop;
2137 }
2138
2139 /*
2140 * Insert the buffer into the hash, so that it can
2141 * be found by incore.
2142 */
2143 bp->b_blkno = bp->b_lblkno = blkno;
2144 bp->b_offset = offset;
2145
2146 bgetvp(vp, bp);
2147 LIST_REMOVE(bp, b_hash);
2148 bh = bufhash(vp, blkno);
2149 LIST_INSERT_HEAD(bh, bp, b_hash);
2150
2151 /*
2152		 * Set the B_VMIO bit and allocbuf() the buffer bigger. Since the
2153 * buffer size starts out as 0, B_CACHE will be set by
2154 * allocbuf() for the VMIO case prior to it testing the
2155 * backing store for validity.
2156 */
2157
2158 if (vmio) {
2159 bp->b_flags |= B_VMIO;
2160#if defined(VFS_BIO_DEBUG)
2161 if (vp->v_type != VREG && vp->v_type != VBLK)
2162 printf("getblk: vmioing file type %d???\n", vp->v_type);
2163#endif
2164 } else {
2165 bp->b_flags &= ~B_VMIO;
2166 }
2167
2168 allocbuf(bp, size);
2169
2170 splx(s);
2171 bp->b_flags &= ~B_DONE;
2172 }
2173 return (bp);
2174}
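/*
 * Illustrative sketch (hypothetical, not compiled): how a caller might
 * apply the B_CACHE protocol documented above -- essentially what
 * bread() does. my_read_block() is an assumption for illustration only;
 * the caller brelse()s the returned buffer in both the success and the
 * error case.
 */
#if 0
static int
my_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Not fully valid: clear B_INVAL/B_ERROR and issue a READ */
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_INVAL | B_ERROR);
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		return (biowait(bp));
	}
	return (0);
}
#endif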
2175
2176/*
2177 * Get an empty, disassociated buffer of given size. The buffer is initially
2178 * set to B_INVAL.
2179 */
2180struct buf *
2181geteblk(int size)
2182{
2183 struct buf *bp;
2184 int s;
2185
2186 s = splbio();
2187 while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
2188 splx(s);
2189 allocbuf(bp, size);
2190 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
2191 return (bp);
2192}
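/*
 * Illustrative sketch (hypothetical fragment, assumes 'bp' is in scope):
 * a typical geteblk() consumer. Because the buffer comes back B_INVAL and
 * is not associated with any vnode, brelse() throws it away rather than
 * caching it.
 */
#if 0
	bp = geteblk(8192);			/* anonymous scratch buffer */
	bzero(bp->b_data, bp->b_bcount);
	/* ... fill bp->b_data and hand it to a driver, etc. ... */
	brelse(bp);
#endif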
2193
2194
2195/*
2196 * This code constructs the buffer memory from either anonymous system
2197 * memory (in the case of non-VMIO operations) or from an associated
2198 * VM object (in the case of VMIO operations). This code is able to
2199 * resize a buffer up or down.
2200 *
2201 * Note that this code is tricky, and has many complications to resolve
2202 * deadlock or inconsistent data situations. Tread lightly!!!
2203 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
2204 * the caller. Calling this code willy nilly can result in the loss of data.
2205 *
2206 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
2207 * B_CACHE for the non-VMIO case.
2208 */
2209
2210int
2211allocbuf(struct buf *bp, int size)
2212{
2213 int newbsize, mbsize;
2214 int i;
2215
2216#if !defined(MAX_PERF)
2217 if (BUF_REFCNT(bp) == 0)
2218 panic("allocbuf: buffer not busy");
2219
2220 if (bp->b_kvasize < size)
2221 panic("allocbuf: buffer too small");
2222#endif
2223
2224 if ((bp->b_flags & B_VMIO) == 0) {
2225 caddr_t origbuf;
2226 int origbufsize;
2227 /*
2228 * Just get anonymous memory from the kernel. Don't
2229 * mess with B_CACHE.
2230 */
2231 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2232#if !defined(NO_B_MALLOC)
2233 if (bp->b_flags & B_MALLOC)
2234 newbsize = mbsize;
2235 else
2236#endif
2237 newbsize = round_page(size);
2238
2239 if (newbsize < bp->b_bufsize) {
2240#if !defined(NO_B_MALLOC)
2241 /*
2242 * malloced buffers are not shrunk
2243 */
2244 if (bp->b_flags & B_MALLOC) {
2245 if (newbsize) {
2246 bp->b_bcount = size;
2247 } else {
2248 free(bp->b_data, M_BIOBUF);
2249 bufspace -= bp->b_bufsize;
2250 bufmallocspace -= bp->b_bufsize;
2251 runningbufspace -= bp->b_bufsize;
2252 if (bp->b_bufsize)
2253 bufspacewakeup();
2254 bp->b_data = bp->b_kvabase;
2255 bp->b_bufsize = 0;
2256 bp->b_bcount = 0;
2257 bp->b_flags &= ~B_MALLOC;
2258 }
2259 return 1;
2260 }
2261#endif
2262 vm_hold_free_pages(
2263 bp,
2264 (vm_offset_t) bp->b_data + newbsize,
2265 (vm_offset_t) bp->b_data + bp->b_bufsize);
2266 } else if (newbsize > bp->b_bufsize) {
2267#if !defined(NO_B_MALLOC)
2268 /*
2269			 * We only use malloced memory on the first allocation,
2270			 * and revert to page-allocated memory when the buffer
2271 * grows.
2272 */
2273 if ( (bufmallocspace < maxbufmallocspace) &&
2274 (bp->b_bufsize == 0) &&
2275 (mbsize <= PAGE_SIZE/2)) {
2276
2277 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
2278 bp->b_bufsize = mbsize;
2279 bp->b_bcount = size;
2280 bp->b_flags |= B_MALLOC;
2281 bufspace += mbsize;
2282 bufmallocspace += mbsize;
2283 runningbufspace += bp->b_bufsize;
2284 return 1;
2285 }
2286#endif
2287 origbuf = NULL;
2288 origbufsize = 0;
2289#if !defined(NO_B_MALLOC)
2290 /*
2291 * If the buffer is growing on its other-than-first allocation,
2292 * then we revert to the page-allocation scheme.
2293 */
2294 if (bp->b_flags & B_MALLOC) {
2295 origbuf = bp->b_data;
2296 origbufsize = bp->b_bufsize;
2297 bp->b_data = bp->b_kvabase;
2298 bufspace -= bp->b_bufsize;
2299 bufmallocspace -= bp->b_bufsize;
2300 runningbufspace -= bp->b_bufsize;
2301 if (bp->b_bufsize)
2302 bufspacewakeup();
2303 bp->b_bufsize = 0;
2304 bp->b_flags &= ~B_MALLOC;
2305 newbsize = round_page(newbsize);
2306 }
2307#endif
2308 vm_hold_load_pages(
2309 bp,
2310 (vm_offset_t) bp->b_data + bp->b_bufsize,
2311 (vm_offset_t) bp->b_data + newbsize);
2312#if !defined(NO_B_MALLOC)
2313 if (origbuf) {
2314 bcopy(origbuf, bp->b_data, origbufsize);
2315 free(origbuf, M_BIOBUF);
2316 }
2317#endif
2318 }
2319 } else {
2320 vm_page_t m;
2321 int desiredpages;
2322
2323 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2324 desiredpages = (size == 0) ? 0 :
2325 num_pages((bp->b_offset & PAGE_MASK) + newbsize);
2326
2327#if !defined(NO_B_MALLOC)
2328 if (bp->b_flags & B_MALLOC)
2329 panic("allocbuf: VMIO buffer can't be malloced");
2330#endif
2331 /*
2332 * Set B_CACHE initially if buffer is 0 length or will become
2333 * 0-length.
2334 */
2335 if (size == 0 || bp->b_bufsize == 0)
2336 bp->b_flags |= B_CACHE;
2337
2338 if (newbsize < bp->b_bufsize) {
2339 /*
2340			 * DEV_BSIZE aligned new buffer size is less than the
2341 * DEV_BSIZE aligned existing buffer size. Figure out
2342 * if we have to remove any pages.
2343 */
2344 if (desiredpages < bp->b_npages) {
2345 for (i = desiredpages; i < bp->b_npages; i++) {
2346 /*
2347 * the page is not freed here -- it
2348 * is the responsibility of
2349 * vnode_pager_setsize
2350 */
2351 m = bp->b_pages[i];
2352 KASSERT(m != bogus_page,
2353 ("allocbuf: bogus page found"));
2354 while (vm_page_sleep_busy(m, TRUE, "biodep"))
2355 ;
2356
2357 bp->b_pages[i] = NULL;
2358 vm_page_unwire(m, 0);
2359 }
2360 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2361 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
2362 bp->b_npages = desiredpages;
2363 }
2364 } else if (size > bp->b_bcount) {
2365 /*
2366 * We are growing the buffer, possibly in a
2367 * byte-granular fashion.
2368 */
2369 struct vnode *vp;
2370 vm_object_t obj;
2371 vm_offset_t toff;
2372 vm_offset_t tinc;
2373
2374 /*
2375 * Step 1, bring in the VM pages from the object,
2376 * allocating them if necessary. We must clear
2377 * B_CACHE if these pages are not valid for the
2378 * range covered by the buffer.
2379 */
2380
2381 vp = bp->b_vp;
2382 obj = vp->v_object;
2383
2384 while (bp->b_npages < desiredpages) {
2385 vm_page_t m;
2386 vm_pindex_t pi;
2387
2388 pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
2389 if ((m = vm_page_lookup(obj, pi)) == NULL) {
2390 m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
2391 if (m == NULL) {
2392 VM_WAIT;
2393 vm_pageout_deficit += desiredpages - bp->b_npages;
2394 } else {
2395 vm_page_wire(m);
2396 vm_page_wakeup(m);
2397 bp->b_flags &= ~B_CACHE;
2398 bp->b_pages[bp->b_npages] = m;
2399 ++bp->b_npages;
2400 }
2401 continue;
2402 }
2403
2404 /*
2405 * We found a page. If we have to sleep on it,
2406 * retry because it might have gotten freed out
2407 * from under us.
2408 *
2409 * We can only test PG_BUSY here. Blocking on
2410 * m->busy might lead to a deadlock:
2411 *
2412 * vm_fault->getpages->cluster_read->allocbuf
2413 *
2414 */
2415
2416 if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
2417 continue;
2418
2419 /*
2420 * We have a good page. Should we wakeup the
2421 * page daemon?
2422 */
2423 if ((curproc != pageproc) &&
2424 ((m->queue - m->pc) == PQ_CACHE) &&
2425 ((cnt.v_free_count + cnt.v_cache_count) <
2426 (cnt.v_free_min + cnt.v_cache_min))) {
2427 pagedaemon_wakeup();
2428 }
2429 vm_page_flag_clear(m, PG_ZERO);
2430 vm_page_wire(m);
2431 bp->b_pages[bp->b_npages] = m;
2432 ++bp->b_npages;
2433 }
2434
2435 /*
2436 * Step 2. We've loaded the pages into the buffer,
2437 * we have to figure out if we can still have B_CACHE
2438 * set. Note that B_CACHE is set according to the
2439			 * byte-granular range ( bcount and size ), not the
2440 * aligned range ( newbsize ).
2441 *
2442 * The VM test is against m->valid, which is DEV_BSIZE
2443 * aligned. Needless to say, the validity of the data
2444 * needs to also be DEV_BSIZE aligned. Note that this
2445 * fails with NFS if the server or some other client
2446 * extends the file's EOF. If our buffer is resized,
2447 * B_CACHE may remain set! XXX
2448 */
2449
2450 toff = bp->b_bcount;
2451 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2452
2453 while ((bp->b_flags & B_CACHE) && toff < size) {
2454 vm_pindex_t pi;
2455
2456 if (tinc > (size - toff))
2457 tinc = size - toff;
2458
2459 pi = ((bp->b_offset & PAGE_MASK) + toff) >>
2460 PAGE_SHIFT;
2461
2462 vfs_buf_test_cache(
2463 bp,
2464 bp->b_offset,
2465 toff,
2466 tinc,
2467 bp->b_pages[pi]
2468 );
2469 toff += tinc;
2470 tinc = PAGE_SIZE;
2471 }
2472
2473 /*
2474 * Step 3, fixup the KVM pmap. Remember that
2475 * bp->b_data is relative to bp->b_offset, but
2476 * bp->b_offset may be offset into the first page.
2477 */
2478
2479 bp->b_data = (caddr_t)
2480 trunc_page((vm_offset_t)bp->b_data);
2481 pmap_qenter(
2482 (vm_offset_t)bp->b_data,
2483 bp->b_pages,
2484 bp->b_npages
2485 );
2486 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
2487 (vm_offset_t)(bp->b_offset & PAGE_MASK));
2488 }
2489 }
2490 if (bp->b_flags & B_VMIO)
2491 vmiospace += (newbsize - bp->b_bufsize);
2492 bufspace += (newbsize - bp->b_bufsize);
2493 runningbufspace += (newbsize - bp->b_bufsize);
2494 if (newbsize < bp->b_bufsize)
2495 bufspacewakeup();
2496 bp->b_bufsize = newbsize; /* actual buffer allocation */
2497 bp->b_bcount = size; /* requested buffer size */
2498 return 1;
2499}
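/*
 * Illustrative sketch (hypothetical fragment, assumes a locked 'bp' and a
 * 'newsize' chosen by the caller): growing a buffer, e.g. when a
 * filesystem extends the last block of a file. For a VMIO buffer,
 * allocbuf() may clear B_CACHE if the newly covered pages are not fully
 * valid, in which case the caller must validate the new range itself.
 */
#if 0
	allocbuf(bp, newsize);
	if ((bp->b_flags & B_CACHE) == 0) {
		/*
		 * The newly covered range is not known to be valid; the
		 * caller must zero it or read it in before using it.
		 */
	}
#endif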
2500
2501/*
2502 * biowait:
2503 *
2504 * Wait for buffer I/O completion, returning error status. The buffer
2505 *	is left locked and B_DONE on return. B_EINTR is converted into an EINTR
2506 * error and cleared.
2507 */
2508int
2509biowait(register struct buf * bp)
2510{
2511 int s;
2512
2513 s = splbio();
2514 while ((bp->b_flags & B_DONE) == 0) {
2515#if defined(NO_SCHEDULE_MODS)
2516 tsleep(bp, PRIBIO, "biowait", 0);
2517#else
2518 if (bp->b_flags & B_READ)
2519 tsleep(bp, PRIBIO, "biord", 0);
2520 else
2521 tsleep(bp, PRIBIO, "biowr", 0);
2522#endif
2523 }
2524 splx(s);
2525 if (bp->b_flags & B_EINTR) {
2526 bp->b_flags &= ~B_EINTR;
2527 return (EINTR);
2528 }
2529 if (bp->b_flags & B_ERROR) {
2530 return (bp->b_error ? bp->b_error : EIO);
2531 } else {
2532 return (0);
2533 }
2534}
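/*
 * Illustrative sketch (hypothetical fragment, assumes 'vp', 'blkno',
 * 'size', 'bp' and 'error' are in scope): the usual synchronous-read
 * pattern built on biowait(). bread() hands back the buffer even when it
 * returns an error, so the caller brelse()s it in both cases.
 */
#if 0
	error = bread(vp, blkno, size, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... use bp->b_data ... */
	brelse(bp);
#endif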
2535
2536/*
2537 * biodone:
2538 *
2539 * Finish I/O on a buffer, optionally calling a completion function.
2540 * This is usually called from an interrupt so process blocking is
2541 * not allowed.
2542 *
2543 * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
2544 * In a non-VMIO bp, B_CACHE will be set on the next getblk()
2545 * assuming B_INVAL is clear.
2546 *
2547 * For the VMIO case, we set B_CACHE if the op was a read and no
2548 *	read error occurred, or if the op was a write. B_CACHE is never
2549 * set if the buffer is invalid or otherwise uncacheable.
2550 *
2551 * biodone does not mess with B_INVAL, allowing the I/O routine or the
2552 *	initiator to leave B_INVAL set to brelse the buffer out of existence
2553 * in the biodone routine.
2554 */
2555void
2556biodone(register struct buf * bp)
2557{
2558 int s;
2559
2560 s = splbio();
2561
2562 KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
2563 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
2564
2565 bp->b_flags |= B_DONE;
2566
2567 if (bp->b_flags & B_FREEBUF) {
2568 brelse(bp);
2569 splx(s);
2570 return;
2571 }
2572
2573 if ((bp->b_flags & B_READ) == 0) {
2574 vwakeup(bp);
2575 }
2576
2577 /* call optional completion function if requested */
2578 if (bp->b_flags & B_CALL) {
2579 bp->b_flags &= ~B_CALL;
2580 (*bp->b_iodone) (bp);
2581 splx(s);
2582 return;
2583 }
2584 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
2585 (*bioops.io_complete)(bp);
2586
2587 if (bp->b_flags & B_VMIO) {
2588 int i, resid;
2589 vm_ooffset_t foff;
2590 vm_page_t m;
2591 vm_object_t obj;
2592 int iosize;
2593 struct vnode *vp = bp->b_vp;
2594
2595 obj = vp->v_object;
2596
2597#if defined(VFS_BIO_DEBUG)
2598 if (vp->v_usecount == 0) {
2599 panic("biodone: zero vnode ref count");
2600 }
2601
2602 if (vp->v_object == NULL) {
2603 panic("biodone: missing VM object");
2604 }
2605
2606 if ((vp->v_flag & VOBJBUF) == 0) {
2607 panic("biodone: vnode is not setup for merged cache");
2608 }
2609#endif
2610
2611 foff = bp->b_offset;
2612 KASSERT(bp->b_offset != NOOFFSET,
2613 ("biodone: no buffer offset"));
2614
2615#if !defined(MAX_PERF)
2616 if (!obj) {
2617 panic("biodone: no object");
2618 }
2619#endif
2620#if defined(VFS_BIO_DEBUG)
2621 if (obj->paging_in_progress < bp->b_npages) {
2622 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
2623 obj->paging_in_progress, bp->b_npages);
2624 }
2625#endif
2626
2627 /*
2628 * Set B_CACHE if the op was a normal read and no error
2629		 * occurred. B_CACHE is set for writes in the b*write()
2630 * routines.
2631 */
2632 iosize = bp->b_bcount;
2633 if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
2634 bp->b_flags |= B_CACHE;
2635 }
2636
2637 for (i = 0; i < bp->b_npages; i++) {
2638 int bogusflag = 0;
2639 m = bp->b_pages[i];
2640 if (m == bogus_page) {
2641 bogusflag = 1;
2642 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2643 if (!m) {
2644#if defined(VFS_BIO_DEBUG)
2645 printf("biodone: page disappeared\n");
2646#endif
2647 vm_object_pip_subtract(obj, 1);
2648 bp->b_flags &= ~B_CACHE;
2649 continue;
2650 }
2651 bp->b_pages[i] = m;
2652 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2653 }
2654#if defined(VFS_BIO_DEBUG)
2655 if (OFF_TO_IDX(foff) != m->pindex) {
2656 printf(
2657"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
2658 (unsigned long)foff, m->pindex);
2659 }
2660#endif
2661 resid = IDX_TO_OFF(m->pindex + 1) - foff;
2662 if (resid > iosize)
2663 resid = iosize;
2664
2665 /*
2666 * In the write case, the valid and clean bits are
2667 * already changed correctly ( see bdwrite() ), so we
2668 * only need to do this here in the read case.
2669 */
2670 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
2671 vfs_page_set_valid(bp, foff, i, m);
2672 }
2673 vm_page_flag_clear(m, PG_ZERO);
2674
2675 /*
2676 * when debugging new filesystems or buffer I/O methods, this
2677 * is the most common error that pops up. if you see this, you
2678 * have not set the page busy flag correctly!!!
2679 */
2680 if (m->busy == 0) {
2681#if !defined(MAX_PERF)
2682 printf("biodone: page busy < 0, "
2683 "pindex: %d, foff: 0x(%x,%x), "
2684 "resid: %d, index: %d\n",
2685 (int) m->pindex, (int)(foff >> 32),
2686 (int) foff & 0xffffffff, resid, i);
2687#endif
2688 if (vp->v_type != VBLK)
2689#if !defined(MAX_PERF)
2690 printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2691 bp->b_vp->v_mount->mnt_stat.f_iosize,
2692 (int) bp->b_lblkno,
2693 bp->b_flags, bp->b_npages);
2694 else
2695 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2696 (int) bp->b_lblkno,
2697 bp->b_flags, bp->b_npages);
2698 printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2699 m->valid, m->dirty, m->wire_count);
2700#endif
2701 panic("biodone: page busy < 0\n");
2702 }
2703 vm_page_io_finish(m);
2704 vm_object_pip_subtract(obj, 1);
2705 foff += resid;
2706 iosize -= resid;
2707 }
2708 if (obj)
2709 vm_object_pip_wakeupn(obj, 0);
2710 }
2711 /*
2712 * For asynchronous completions, release the buffer now. The brelse
2713 * will do a wakeup there if necessary - so no need to do a wakeup
2714 * here in the async case. The sync case always needs to do a wakeup.
2715 */
2716
2717 if (bp->b_flags & B_ASYNC) {
2718 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2719 brelse(bp);
2720 else
2721 bqrelse(bp);
2722 } else {
2723 wakeup(bp);
2724 }
2725 splx(s);
2726}
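/*
 * Illustrative sketch (hypothetical fragment): how a driver's interrupt
 * handler typically finishes a transfer. 'transfer_failed' is an
 * assumption standing in for a driver-specific error check.
 */
#if 0
	if (transfer_failed) {		/* driver-specific condition */
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	bp->b_resid = 0;		/* all requested bytes were moved */
	biodone(bp);	/* wakes biowait()ers, or calls b_iodone if B_CALL */
#endif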
2727
2728/*
2729 * This routine is called in lieu of iodone in the case of
2730 * incomplete I/O. This keeps the busy status for pages
2731 * consistent.
2732 */
2733void
2734vfs_unbusy_pages(struct buf * bp)
2735{
2736 int i;
2737
2738 if (bp->b_flags & B_VMIO) {
2739 struct vnode *vp = bp->b_vp;
2740 vm_object_t obj = vp->v_object;
2741
2742 for (i = 0; i < bp->b_npages; i++) {
2743 vm_page_t m = bp->b_pages[i];
2744
2745 if (m == bogus_page) {
2746 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2747#if !defined(MAX_PERF)
2748 if (!m) {
2749 panic("vfs_unbusy_pages: page missing\n");
2750 }
2751#endif
2752 bp->b_pages[i] = m;
2753 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2754 }
2755 vm_object_pip_subtract(obj, 1);
2756 vm_page_flag_clear(m, PG_ZERO);
2757 vm_page_io_finish(m);
2758 }
2759 vm_object_pip_wakeupn(obj, 0);
2760 }
2761}
2762
2763/*
2764 * vfs_page_set_valid:
2765 *
2766 * Set the valid bits in a page based on the supplied offset. The
2767 * range is restricted to the buffer's size.
2768 *
2769 * This routine is typically called after a read completes.
2770 */
2771static void
2772vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2773{
2774 vm_ooffset_t soff, eoff;
2775
2776 /*
2777 * Start and end offsets in buffer. eoff - soff may not cross a
2778	 * page boundary or cross the end of the buffer. The end of the
2779 * buffer, in this case, is our file EOF, not the allocation size
2780 * of the buffer.
2781 */
2782 soff = off;
2783 eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2784 if (eoff > bp->b_offset + bp->b_bcount)
2785 eoff = bp->b_offset + bp->b_bcount;
2786
2787 /*
2788 * Set valid range. This is typically the entire buffer and thus the
2789 * entire page.
2790 */
2791 if (eoff > soff) {
2792 vm_page_set_validclean(
2793 m,
2794 (vm_offset_t) (soff & PAGE_MASK),
2795 (vm_offset_t) (eoff - soff)
2796 );
2797 }
2798}
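/*
 * Worked example of the offset arithmetic above (illustrative only,
 * assuming PAGE_SIZE == 4096): for off == 0x2200 and a buffer ending at
 * 0x3800,
 *
 *	soff = 0x2200
 *	eoff = (0x2200 + 0x1000) & ~PAGE_MASK = 0x3000
 *
 * so vm_page_set_validclean() is called with base 0x200 within the page
 * and length 0xe00, i.e. only the part of the page the buffer covers.
 */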
2799
2800/*
2801 * This routine is called before a device strategy routine.
2802 * It is used to tell the VM system that paging I/O is in
2803 * progress, and treat the pages associated with the buffer
2804 * almost as being PG_BUSY. Also the object paging_in_progress
2805 * flag is handled to make sure that the object doesn't become
2806 * inconsistant.
2807 * inconsistent.
2808 * Since I/O has not been initiated yet, certain buffer flags
2809 * such as B_ERROR or B_INVAL may be in an inconsistant state
2810 * such as B_ERROR or B_INVAL may be in an inconsistent state
2811 */
2812void
2813vfs_busy_pages(struct buf * bp, int clear_modify)
2814{
2815 int i, bogus;
2816
2817 if (bp->b_flags & B_VMIO) {
2818 struct vnode *vp = bp->b_vp;
2819 vm_object_t obj = vp->v_object;
2820 vm_ooffset_t foff;
2821
2822 foff = bp->b_offset;
2823 KASSERT(bp->b_offset != NOOFFSET,
2824 ("vfs_busy_pages: no buffer offset"));
2825 vfs_setdirty(bp);
2826
2827retry:
2828 for (i = 0; i < bp->b_npages; i++) {
2829 vm_page_t m = bp->b_pages[i];
2830 if (vm_page_sleep_busy(m, FALSE, "vbpage"))
2831 goto retry;
2832 }
2833
2834 bogus = 0;
2835 for (i = 0; i < bp->b_npages; i++) {
2836 vm_page_t m = bp->b_pages[i];
2837
2838 vm_page_flag_clear(m, PG_ZERO);
2839 if ((bp->b_flags & B_CLUSTER) == 0) {
2840 vm_object_pip_add(obj, 1);
2841 vm_page_io_start(m);
2842 }
2843
2844 /*
2845			 * When readying a buffer for a read ( i.e.
2846 * clear_modify == 0 ), it is important to do
2847 * bogus_page replacement for valid pages in
2848 * partially instantiated buffers. Partially
2849 * instantiated buffers can, in turn, occur when
2850 * reconstituting a buffer from its VM backing store
2851 * base. We only have to do this if B_CACHE is
2852 * clear ( which causes the I/O to occur in the
2853 * first place ). The replacement prevents the read
2854 * I/O from overwriting potentially dirty VM-backed
2855 * pages. XXX bogus page replacement is, uh, bogus.
2856 * It may not work properly with small-block devices.
2857 * We need to find a better way.
2858 */
2859
2860 vm_page_protect(m, VM_PROT_NONE);
2861 if (clear_modify)
2862 vfs_page_set_valid(bp, foff, i, m);
2863 else if (m->valid == VM_PAGE_BITS_ALL &&
2864 (bp->b_flags & B_CACHE) == 0) {
2865 bp->b_pages[i] = bogus_page;
2866 bogus++;
2867 }
2868 foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2869 }
2870 if (bogus)
2871 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2872 }
2873}
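/*
 * Illustrative sketch (hypothetical fragment, assumes 'bp' and 'error'
 * are in scope): the bracketing expected around a strategy call.
 * vfs_busy_pages() is called with clear_modify == 0 when readying a read
 * and clear_modify != 0 when readying a write; biodone() (or
 * vfs_unbusy_pages() for an aborted attempt) undoes the accounting.
 */
#if 0
	bp->b_flags |= B_READ;
	bp->b_flags &= ~(B_ERROR | B_INVAL);
	vfs_busy_pages(bp, 0);		/* 0: readying for a read */
	VOP_STRATEGY(bp->b_vp, bp);
	error = biowait(bp);
#endif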
2874
2875/*
2876 * Tell the VM system that the pages associated with this buffer
2877 * are clean. This is used for delayed writes where the data is
2878 * going to go to disk eventually without additional VM intervention.
2879 *
2880 * Note that while we only really need to clean through to b_bcount, we
2881 * just go ahead and clean through to b_bufsize.
2882 */
2883static void
2884vfs_clean_pages(struct buf * bp)
2885{
2886 int i;
2887
2888 if (bp->b_flags & B_VMIO) {
2889 vm_ooffset_t foff;
2890
2891 foff = bp->b_offset;
2892 KASSERT(bp->b_offset != NOOFFSET,
2893 ("vfs_clean_pages: no buffer offset"));
2894 for (i = 0; i < bp->b_npages; i++) {
2895 vm_page_t m = bp->b_pages[i];
2896 vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2897 vm_ooffset_t eoff = noff;
2898
2899 if (eoff > bp->b_offset + bp->b_bufsize)
2900 eoff = bp->b_offset + bp->b_bufsize;
2901 vfs_page_set_valid(bp, foff, i, m);
2902 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2903 foff = noff;
2904 }
2905 }
2906}
2907
2908/*
2909 * vfs_bio_set_validclean:
2910 *
2911 * Set the range within the buffer to valid and clean. The range is
2912 * relative to the beginning of the buffer, b_offset. Note that b_offset
2913 * itself may be offset from the beginning of the first page.
2914 */
2915
2916void
2917vfs_bio_set_validclean(struct buf *bp, int base, int size)
2918{
2919 if (bp->b_flags & B_VMIO) {
2920 int i;
2921 int n;
2922
2923 /*
2924 * Fixup base to be relative to beginning of first page.
2925 * Set initial n to be the maximum number of bytes in the
2926 * first page that can be validated.
2927 */
2928
2929 base += (bp->b_offset & PAGE_MASK);
2930 n = PAGE_SIZE - (base & PAGE_MASK);
2931
2932 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
2933 vm_page_t m = bp->b_pages[i];
2934
2935 if (n > size)
2936 n = size;
2937
2938 vm_page_set_validclean(m, base & PAGE_MASK, n);
2939 base += n;
2940 size -= n;
2941 n = PAGE_SIZE;
2942 }
2943 }
2944}
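/*
 * Worked example of the page walk above (illustrative only, assuming
 * PAGE_SIZE == 4096): with (b_offset & PAGE_MASK) == 0x600, base == 0x200
 * and size == 0x1000, base becomes 0x800, so the first pass marks 0x800
 * bytes in page 0 (offsets 0x800..0xfff) and the second pass marks the
 * remaining 0x800 bytes at the start of page 1.
 */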
2945
2946/*
2947 * vfs_bio_clrbuf:
2948 *
2949 * clear a buffer. This routine essentially fakes an I/O, so we need
2950 * to clear B_ERROR and B_INVAL.
2951 *
2952 * Note that while we only theoretically need to clear through b_bcount,
2953 * we go ahead and clear through b_bufsize.
2954 */
2955
2956void
2957vfs_bio_clrbuf(struct buf *bp) {
2958 int i, mask = 0;
2959 caddr_t sa, ea;
2960 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2961 bp->b_flags &= ~(B_INVAL|B_ERROR);
2962 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
2963 (bp->b_offset & PAGE_MASK) == 0) {
2964 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
2965 if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2966 ((bp->b_pages[0]->valid & mask) != mask)) {
2967 bzero(bp->b_data, bp->b_bufsize);
2968 }
2969 bp->b_pages[0]->valid |= mask;
2970 bp->b_resid = 0;
2971 return;
2972 }
2973 ea = sa = bp->b_data;
2974 for(i=0;i<bp->b_npages;i++,sa=ea) {
2975 int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE;
2976 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
2977 ea = (caddr_t)ulmin((u_long)ea,
2978 (u_long)bp->b_data + bp->b_bufsize);
2979 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
2980 if ((bp->b_pages[i]->valid & mask) == mask)
2981 continue;
2982 if ((bp->b_pages[i]->valid & mask) == 0) {
2983 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2984 bzero(sa, ea - sa);
2985 }
2986 } else {
2987 for (; sa < ea; sa += DEV_BSIZE, j++) {
2988 if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2989 (bp->b_pages[i]->valid & (1<<j)) == 0)
2990 bzero(sa, DEV_BSIZE);
2991 }
2992 }
2993 bp->b_pages[i]->valid |= mask;
2994 vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2995 }
2996 bp->b_resid = 0;
2997 } else {
2998 clrbuf(bp);
2999 }
3000}
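/*
 * Worked example of the valid-bit mask above (illustrative only, assuming
 * PAGE_SIZE == 4096 and DEV_BSIZE == 512): a page-aligned 2048 byte
 * buffer takes the single-page fast path with
 * mask = (1 << (2048 / 512)) - 1 = 0x0f, i.e. the first four DEV_BSIZE
 * chunks of page 0 are zeroed (unless already PG_ZERO or valid) and then
 * marked valid.
 */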
3001
3002/*
3003 * vm_hold_load_pages and vm_hold_free_pages get pages into
3004 * a buffer's address space. The pages are anonymous and are
3005 * not associated with a file object.
3006 */
3007void
3008vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3009{
3010 vm_offset_t pg;
3011 vm_page_t p;
3012 int index;
3013
3014 to = round_page(to);
3015 from = round_page(from);
3016 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3017
3018 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3019
3020tryagain:
3021
3022 p = vm_page_alloc(kernel_object,
3023 ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
3024 VM_ALLOC_NORMAL);
3025 if (!p) {
3026 vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
3027 VM_WAIT;
3028 goto tryagain;
3029 }
3030 vm_page_wire(p);
3031 p->valid = VM_PAGE_BITS_ALL;
3032 vm_page_flag_clear(p, PG_ZERO);
3033 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
3034 bp->b_pages[index] = p;
3035 vm_page_wakeup(p);
3036 }
3037 bp->b_npages = index;
3038}
3039
3040void
3041vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3042{
3043 vm_offset_t pg;
3044 vm_page_t p;
3045 int index, newnpages;
3046
3047 from = round_page(from);
3048 to = round_page(to);
3049 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3050
3051 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3052 p = bp->b_pages[index];
3053 if (p && (index < bp->b_npages)) {
3054#if !defined(MAX_PERF)
3055 if (p->busy) {
3056 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
3057 bp->b_blkno, bp->b_lblkno);
3058 }
3059#endif
3060 bp->b_pages[index] = NULL;
3061 pmap_kremove(pg);
3062 vm_page_busy(p);
3063 vm_page_unwire(p, 0);
3064 vm_page_free(p);
3065 }
3066 }
3067 bp->b_npages = newnpages;
3068}
3069
3070
3071#include "opt_ddb.h"
3072#ifdef DDB
3073#include <ddb/ddb.h>
3074
3075DB_SHOW_COMMAND(buffer, db_show_buffer)
3076{
3077 /* get args */
3078 struct buf *bp = (struct buf *)addr;
3079
3080 if (!have_addr) {
3081 db_printf("usage: show buffer <addr>\n");
3082 return;
3083 }
3084
3085 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
3086 db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
3087 "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
3088 "b_blkno = %d, b_pblkno = %d\n",
3089 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
3090 major(bp->b_dev), minor(bp->b_dev),
3091 bp->b_data, bp->b_blkno, bp->b_pblkno);
3092 if (bp->b_npages) {
3093 int i;
3094 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
3095 for (i = 0; i < bp->b_npages; i++) {
3096 vm_page_t m;
3097 m = bp->b_pages[i];
3098 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
3099 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
3100 if ((i + 1) < bp->b_npages)
3101 db_printf(",");
3102 }
3103 db_printf("\n");
3104 }
3105}
3106#endif /* DDB */