vfs_bio.c revision 3688
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.14 1994/10/05 09:48:21 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers, calculated elsewhere */
struct swqueue bswlist;

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;
	caddr_t baddr;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	baddr = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = baddr + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}
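
/*
 * The kmem_alloc_pageable() call above reserves nbuf * MAXBSIZE bytes of
 * kernel virtual address space without attaching any physical pages; each
 * header's b_data points at its own MAXBSIZE slice of that reservation,
 * and allocbuf() later wires pages into the window on demand.
 */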

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}
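
/*
 * Typical calling sequence (illustrative sketch only; the variable names
 * are placeholders and error handling is abbreviated):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine or copy bp->b_data ...
 *	brelse(bp);	(or bdwrite(bp)/bwrite(bp) if the data was modified)
 *
 * Note that *bpp is filled in even when the read fails, so the buffer
 * must be released in either case.
 */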

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}
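
/*
 * Callers typically pass arrays of read-ahead block numbers and sizes
 * computed by the filesystem (for example, the next few logical blocks of
 * the file).  The read-ahead buffers are issued B_ASYNC and released from
 * biodone(), so only the originally requested buffer is handed back to
 * the caller.
 */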

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
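
/*
 * Summary of the write interfaces:
 *	bwrite()  - start the write; unless B_ASYNC is set, wait for it to
 *		    complete and release the buffer.
 *	bdwrite() - mark the buffer dirty (B_DELWRI) and release it; the
 *		    actual write happens later, via the update daemon, an
 *		    explicit sync, or reuse of the buffer in getnewbuf().
 *	bawrite() - start the write asynchronously; biodone() releases the
 *		    buffer when the I/O finishes.
 */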

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}
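
/*
 * Queue selection above, in getnewbuf()'s order of preference for reuse:
 *	QUEUE_EMPTY  - headers with no backing storage at all;
 *	QUEUE_AGE    - buffers whose contents are invalid or unlikely to be
 *		       referenced again soon, reclaimed ahead of the LRU;
 *	QUEUE_LRU    - buffers with valid, potentially reusable contents;
 *	QUEUE_LOCKED - buffers that may not be reclaimed at all.
 */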

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we construct a new buffer from an empty header? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}
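
/*
 * Reclamation order: an empty header is preferred, then the front of the
 * AGE queue, then the front of the LRU queue.  A reclaimed delayed-write
 * buffer is first pushed out with bawrite() and the scan restarts.  When
 * nothing is available the routine sleeps on needsbuffer and returns NULL,
 * so callers must be prepared to retry (see getblk()).
 */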

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
#ifdef DEBUG
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %p, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
#endif
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
		allocbuf(bp, size);
	}
	splx(s);
	return (bp);
}
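
/*
 * On return the buffer is busy (B_BUSY).  B_CACHE is set only when the
 * block was found in core with valid contents; bread() and breadn() use
 * that flag to decide whether device I/O is still needed.
 */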

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}
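
/*
 * The buffer's kernel virtual window (b_data through b_data + MAXBSIZE)
 * was reserved in bufinit(); allocbuf() only wires or frees physical
 * pages within that window via vm_hold_load_pages()/vm_hold_free_pages(),
 * keeping the freebufspace/allocbufspace accounting in step.
 */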

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  brelse
	 * checks for B_WANTED and will do the wakeup there if necessary,
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}
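
/*
 * Completion protocol: the driver calls biodone() when the transfer
 * finishes.  B_DONE is set, writers get vwakeup(), any B_CALL handler
 * is invoked, and the buffer is either released directly (B_ASYNC) or
 * the process sleeping in biowait() is woken via wakeup(bp).
 */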

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

int vfs_update_interval = 30;

void
vfs_update() {
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}
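
/*
 * Other parts of the kernel can request an early sync with
 * wakeup((caddr_t)&vfs_update_wakeup) (the vfs_update_wakeup flag exists
 * for that purpose); otherwise the daemon runs once every
 * vfs_update_interval seconds.
 */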

#if 0
#define MAXFREEBP 128
#define LDFREE_BUSY 1
#define LDFREE_WANT 2
int loadfreeing;
struct buf *freebp[MAXFREEBP];
#endif
/*
 * These routines are not in the correct place (yet).
 * They also work *ONLY* for the kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {

	tryagain:
#if 0
/*
 * don't allow buffer cache to cause VM paging
 */
		if ( cnt.v_free_count < cnt.v_free_min) {
			if( !loadfreeing ) {
				int n=0;
				struct buf *bp;
				loadfreeing = LDFREE_BUSY;
				while( (cnt.v_free_count <= cnt.v_free_min) &&
					(n < MAXFREEBP)) {
					bp = geteblk(0);
					if( bp)
						freebp[n++] = bp;
					else
						break;
				}
				while(--n >= 0) {
					brelse(freebp[n]);
				}
				if( loadfreeing & LDFREE_WANT)
					wakeup((caddr_t) &loadfreeing);
				loadfreeing = 0;
			} else {
				loadfreeing |= LDFREE_WANT;
				tsleep(&loadfreeing, PRIBIO, "biofree", 0);
			}
		}
#endif
		if (cnt.v_free_count <=
			cnt.v_free_reserved + (toa-froma) / PAGE_SIZE) {
			VM_WAIT;
			goto tryagain;
		}

		p =  vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
}
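
/*
 * These two helpers back a buffer's reserved virtual window with wired
 * pages (vm_hold_load_pages) or strip and free those pages again
 * (vm_hold_free_pages).  The mappings are entered and removed directly
 * with pmap_kenter()/pmap_kremove(), which is why the routines are only
 * valid for the kernel pmap.
 */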

void
bufstats()
{
}
