vfs_bio.c revision 1817
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
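/*
 * Note: each of the nbuf headers is given MAXBSIZE worth of pageable
 * kernel VA from buffer_map below; physical pages are attached later,
 * on demand, by allocbuf() through vm_hold_load_pages().
 */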
void bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
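/*
 * Illustrative usage sketch (hypothetical names, error handling trimmed):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_data ...
 *	brelse(bp);
 *
 * *bpp is filled in even on error, so the caller still releases the buffer.
 */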
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
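/*
 * Note: rablkno and rabsize are parallel arrays with cnt entries.  Each
 * read-ahead block not already incore is started B_READ|B_ASYNC and gets
 * released by biodone(); only the primary block is waited on.
 */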
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
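/*
 * Note: a synchronous caller gets the biowait() status and the buffer is
 * released here; with B_ASYNC the release happens later in biodone().  A
 * delayed-write buffer being pushed out is reassigned rather than counted
 * as a fresh write in the process's resource usage.
 */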
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
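/*
 * Note: the buffer is marked B_DONE|B_DELWRI and released without starting
 * I/O; the data reaches disk later, when the buffer is reclaimed by
 * getnewbuf(), written explicitly, or pushed by the update daemon.  B_TAPE
 * buffers are written immediately via bawrite() instead.
 */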
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
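/*
 * Queue placement: buffers with no storage go to QUEUE_EMPTY, invalid or
 * errored ones to the head of QUEUE_AGE, B_LOCKED buffers to QUEUE_LOCKED,
 * B_AGE buffers to the tail of QUEUE_AGE, and everything else to QUEUE_LRU.
 * Sleepers on needsbuffer or on the buffer itself (B_WANTED) are awakened
 * first.
 */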
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}
	/* anyone need this very block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with junk contents */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
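/*
 * Note: QUEUE_EMPTY is tried first, then QUEUE_AGE, then QUEUE_LRU.  A
 * reclaimed delayed-write buffer is pushed with bawrite() and the scan is
 * restarted.  If nothing is free, we sleep on needsbuffer and return NULL,
 * so callers must be prepared to retry.
 */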
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	x = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[QUEUE_EMPTY].tqh_first) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(x);
		return (0);
	}


	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
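/*
 * Note: this is a plain walk of the BUFHASH(vp, blkno) chain; buffers
 * marked B_INVAL are skipped, and a chain entry outside the buf[] pool
 * causes a panic.  The buffer returned is not locked.
 */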
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
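/*
 * Note: if the block is incore but B_BUSY, we sleep on it with B_WANTED
 * set and retry.  When a new buffer has to be allocated, incore() is
 * re-checked afterwards because getnewbuf() may have slept and lost a
 * race; the loser is invalidated and the loop restarts.
 */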
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	struct bufhashhdr *bh;

	x = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(x);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
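/*
 * Note: the requested size is rounded up to a page boundary.  Growing a
 * buffer wires and maps fresh pages via vm_hold_load_pages(); shrinking
 * unmaps and frees them via vm_hold_free_pages().  freebufspace and
 * allocbufspace track the net change.
 */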
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
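/*
 * Note: on error the buffer is marked B_INVAL and moved to invalhash so
 * the stale data cannot be found by a later incore()/getblk(); a missing
 * error code defaults to EIO.
 */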
int
biowait(register struct buf *bp)
{
	int x;

	x = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(x);
		return (bp->b_error);
	} else {
		splx(x);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
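/*
 * Note: a B_CALL completion hands the buffer to b_iodone and returns, so
 * the callee is responsible for releasing it.  Async buffers are released
 * here (brelse handles any B_WANTED wakeup); synchronous waiters are
 * simply awakened.
 */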
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

/*
 * For asynchronous completions, release the buffer now. The brelse
 *	checks for B_WANTED and will do the wakeup there if necessary -
 *	so no need to do a wakeup here in the async case.
 */

	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

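/*
 * Update daemon loop: sleep for vfs_update_interval seconds, or until
 * someone sets vfs_update_wakeup and wakes the channel, then flush dirty
 * buffers with sync().
 */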
void
vfs_update() {
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
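/*
 * vm_hold_load_pages() backs a range of buffer KVA with wired pages
 * allocated from kernel_object, waiting in VM_WAIT when none are free.
 * vm_hold_free_pages() undoes this: it finds each physical page with
 * pmap_kextract(), removes the mapping and frees the page.
 */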
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;

	tryagain:
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p),
			VM_PROT_READ|VM_PROT_WRITE, 1);
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;
		pa = pmap_kextract(pg);
		if( !pa) {
			printf("No pa for va: %x\n", pg);
		} else {
			p = PHYS_TO_VM_PAGE( pa);
			pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE);
			vm_page_free(p);
		}
	}
}

void
bufstats()
{
}