/*	$NetBSD: lfs_subr.c,v 1.44 2003/09/07 11:44:22 yamt Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_subr.c	8.4 (Berkeley) 5/8/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.44 2003/09/07 11:44:22 yamt Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/proc.h>

#include <ufs/ufs/inode.h>
#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

#include <uvm/uvm.h>

/*
 * Return a buffer with the contents of block "offset" from the beginning of
 * directory "vp".  If "res" is non-NULL, fill it in with a pointer to the
 * remaining space in the directory.
 */
int
lfs_blkatoff(void *v)
{
	struct vop_blkatoff_args /* {
		struct vnode *a_vp;
		off_t a_offset;
		char **a_res;
		struct buf **a_bpp;
	} */ *ap = v;
	struct lfs *fs;
	struct inode *ip;
	struct buf *bp;
	daddr_t lbn;
	int bsize, error;

	ip = VTOI(ap->a_vp);
	fs = ip->i_lfs;
	lbn = lblkno(fs, ap->a_offset);
	bsize = blksize(fs, ip, lbn);

	*ap->a_bpp = NULL;
	if ((error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) != 0) {
		brelse(bp);
		return (error);
	}
	if (ap->a_res)
		*ap->a_res = (char *)bp->b_data + blkoff(fs, ap->a_offset);
	*ap->a_bpp = bp;
	return (0);
}

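/*
 * Names (for debugging) and quantities of the reserve blocks kept for
 * each of the LFS_NB_COUNT block types.
 */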
#ifdef LFS_DEBUG_MALLOC
char *lfs_res_names[LFS_NB_COUNT] = {
	"summary",
	"superblock",
	"ifile block",
	"cluster",
	"clean",
};
#endif

int lfs_res_qty[LFS_NB_COUNT] = {
	LFS_N_SUMMARIES,
	LFS_N_SBLOCKS,
	LFS_N_IBLOCKS,
	LFS_N_CLUSTERS,
	LFS_N_CLEAN,
};

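/*
 * Allocate the reserve blocks that back lfs_malloc() when the system
 * runs short of memory, and initialize the pools that supply the
 * small fixed-size structures used by the segment writer (clusters,
 * segments, and block-pointer arrays).
 */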
void
lfs_setup_resblks(struct lfs *fs)
{
	int i, j;
	int maxbpp;

	fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT,
					 M_WAITOK);
	for (i = 0; i < LFS_N_TOTAL; i++) {
		fs->lfs_resblk[i].inuse = 0;
		fs->lfs_resblk[i].p = NULL;
	}
	for (i = 0; i < LFS_RESHASH_WIDTH; i++)
		LIST_INIT(fs->lfs_reshash + i);

	/*
	 * These types of allocations can be larger than a page,
	 * so we can't use the pool subsystem for them.
	 */
	for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++)
		fs->lfs_resblk[i].size = fs->lfs_sumsize;
	for (j = 0; j < LFS_N_SBLOCKS; j++, i++)
		fs->lfs_resblk[i].size = LFS_SBPAD;
	for (j = 0; j < LFS_N_IBLOCKS; j++, i++)
		fs->lfs_resblk[i].size = fs->lfs_bsize;
	for (j = 0; j < LFS_N_CLUSTERS; j++, i++)
		fs->lfs_resblk[i].size = MAXPHYS;
	for (j = 0; j < LFS_N_CLEAN; j++, i++)
		fs->lfs_resblk[i].size = MAXPHYS;

	for (i = 0; i < LFS_N_TOTAL; i++) {
		fs->lfs_resblk[i].p = malloc(fs->lfs_resblk[i].size,
					     M_SEGMENT, M_WAITOK);
	}

	/*
	 * Initialize pools for small types (XXX is BPP small?)
	 */
	pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, 0,
		"lfsclpl", &pool_allocator_nointr);
	pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, 0,
		"lfssegpool", &pool_allocator_nointr);
	maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2);
	maxbpp = MIN(maxbpp, fs->lfs_ssize / fs->lfs_fsize + 2);
	pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, 0,
		"lfsbpppl", &pool_allocator_nointr);
}

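/*
 * Destroy the pools and release the reserve blocks, waiting for any
 * reserve block still in use to be returned via lfs_free() first.
 */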
void
lfs_free_resblks(struct lfs *fs)
{
	int i;

	pool_destroy(&fs->lfs_bpppool);
	pool_destroy(&fs->lfs_segpool);
	pool_destroy(&fs->lfs_clpool);

	for (i = 0; i < LFS_N_TOTAL; i++) {
		while (fs->lfs_resblk[i].inuse)
			tsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0);
		if (fs->lfs_resblk[i].p != NULL)
			free(fs->lfs_resblk[i].p, M_SEGMENT);
	}
	free(fs->lfs_resblk, M_SEGMENT);
}

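/*
 * Hash a memory address into the reserve-block hash table.
 */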
static unsigned int
lfs_mhash(void *vp)
{
	return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH;
}

/*
 * Return memory of the given size for the given purpose, falling back
 * to one of a fixed number of spare last-resort reserve buffers if
 * malloc returns NULL.
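 *
 * Memory obtained here must be released with lfs_free(), which puts
 * reserve blocks back on the free list (waking any waiters) and hands
 * ordinary allocations to free().
 */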
void *
lfs_malloc(struct lfs *fs, size_t size, int type)
{
	struct lfs_res_blk *re;
	void *r;
	int i, s, start;
	unsigned int h;

	r = NULL;

	/* If no mem allocated for this type, it just waits */
	if (lfs_res_qty[type] == 0) {
		r = malloc(size, M_SEGMENT, M_WAITOK);
		return r;
	}

	/* Otherwise try a quick malloc, and if it works, great */
	if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) {
		return r;
	}

	/*
	 * If malloc returned NULL, we are forced to use one of our
	 * reserve blocks.  We have on hand at least one summary block,
	 * at least one cluster block, at least one superblock,
	 * and several indirect blocks.
	 */
	/* skip over blocks of other types */
	for (i = 0, start = 0; i < type; i++)
		start += lfs_res_qty[i];
	while (r == NULL) {
		for (i = 0; i < lfs_res_qty[type]; i++) {
			if (fs->lfs_resblk[start + i].inuse == 0) {
				re = fs->lfs_resblk + start + i;
				re->inuse = 1;
				r = re->p;
				KASSERT(re->size >= size);
				h = lfs_mhash(r);
				s = splbio();
				LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res);
				splx(s);
				return r;
			}
		}
#ifdef LFS_DEBUG_MALLOC
		printf("sleeping on %s (%d)\n", lfs_res_names[type],
		       lfs_res_qty[type]);
#endif
		tsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0);
#ifdef LFS_DEBUG_MALLOC
		printf("done sleeping on %s\n", lfs_res_names[type]);
#endif
	}
	/* NOTREACHED */
	return r;
}

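/*
 * Release memory obtained from lfs_malloc: if it is one of the reserve
 * blocks, mark it free again and wake any waiters; otherwise pass it
 * to free().
 */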
void
lfs_free(struct lfs *fs, void *p, int type)
{
	int s;
	unsigned int h;
	res_t *re;
#ifdef DEBUG
	int i;
#endif

	h = lfs_mhash(p);
	s = splbio();
	LIST_FOREACH(re, &fs->lfs_reshash[h], res) {
		if (re->p == p) {
			KASSERT(re->inuse == 1);
			LIST_REMOVE(re, res);
			re->inuse = 0;
			wakeup(&fs->lfs_resblk);
			splx(s);
			return;
		}
	}
#ifdef DEBUG
	for (i = 0; i < LFS_N_TOTAL; i++) {
		if (fs->lfs_resblk[i].p == p)
			panic("lfs_free: inconsistent reserved block");
	}
#endif
	splx(s);

	/*
	 * If we didn't find it, free it.
	 */
	free(p, M_SEGMENT);
}

/*
 * lfs_seglock --
 *	Single thread the segment writer.
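 *
 *	The lock is recursive for the process that holds it.  If flags
 *	includes SEGM_PAGEDAEMON and another process holds the lock,
 *	return EWOULDBLOCK rather than sleeping; otherwise return 0.
 */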
int
lfs_seglock(struct lfs *fs, unsigned long flags)
{
	struct segment *sp;

	simple_lock(&fs->lfs_interlock);
	if (fs->lfs_seglock) {
		if (fs->lfs_lockpid == curproc->p_pid) {
			simple_unlock(&fs->lfs_interlock);
			++fs->lfs_seglock;
			fs->lfs_sp->seg_flags |= flags;
			return 0;
		} else if (flags & SEGM_PAGEDAEMON) {
			simple_unlock(&fs->lfs_interlock);
			return EWOULDBLOCK;
		} else while (fs->lfs_seglock)
			(void)ltsleep(&fs->lfs_seglock, PRIBIO + 1,
				      "lfs seglock", 0, &fs->lfs_interlock);
	}

	fs->lfs_seglock = 1;
	fs->lfs_lockpid = curproc->p_pid;
	simple_unlock(&fs->lfs_interlock);
	fs->lfs_cleanind = 0;

	/* Drain fragment size changes out */
	lockmgr(&fs->lfs_fraglock, LK_EXCLUSIVE, 0);

	sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK);
	sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK);
	sp->seg_flags = flags;
	sp->vp = NULL;
	sp->seg_iocount = 0;
	(void) lfs_initseg(fs);

	/*
	 * Keep a cumulative count of the outstanding I/O operations.  If the
	 * disk drive catches up with us it could go to zero before we finish,
	 * so we artificially increment it by one until we've scheduled all of
	 * the writes we intend to do.
	 */
	++fs->lfs_iocount;
	return 0;
}

static void lfs_unmark_dirop(struct lfs *);

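/*
 * Walk the chain of vnodes marked VDIROP, clearing the flag and
 * releasing the associated reference on vnodes whose directory
 * operations have completed.  The LFS_UNDIROP flag ensures only one
 * process does this at a time.
 */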
static void
lfs_unmark_dirop(struct lfs *fs)
{
	struct inode *ip, *nip;
	struct vnode *vp;
	int doit;

	simple_lock(&fs->lfs_interlock);
	doit = !(fs->lfs_flags & LFS_UNDIROP);
	if (doit)
		fs->lfs_flags |= LFS_UNDIROP;
	simple_unlock(&fs->lfs_interlock);
	if (!doit)
		return;

	for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
		nip = TAILQ_NEXT(ip, i_lfs_dchain);
		vp = ITOV(ip);

		if (VOP_ISLOCKED(vp) &&
		    vp->v_lock.lk_lockholder != curproc->p_pid) {
			continue;
		}
		if ((VTOI(vp)->i_flag & IN_ADIROP) == 0) {
			--lfs_dirvcount;
			vp->v_flag &= ~VDIROP;
			TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
			wakeup(&lfs_dirvcount);
			fs->lfs_unlockvp = vp;
			vrele(vp);
			fs->lfs_unlockvp = NULL;
		}
	}

	simple_lock(&fs->lfs_interlock);
	fs->lfs_flags &= ~LFS_UNDIROP;
	simple_unlock(&fs->lfs_interlock);
}

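/*
 * Mark clean any segment that is both dirty and empty in the segment
 * usage views of both superblocks, then copy the active view over the
 * inactive one.
 */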
static void
lfs_auto_segclean(struct lfs *fs)
{
	int i, error;

	/*
	 * Now that we've swapped lfs_activesb, but while we still
	 * hold the segment lock, run through the segment list marking
	 * the empty ones clean.
	 * XXX - do we really need to do them all at once?
	 */
	for (i = 0; i < fs->lfs_nseg; i++) {
		if ((fs->lfs_suflags[0][i] &
		     (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
		    (SEGUSE_DIRTY | SEGUSE_EMPTY) &&
		    (fs->lfs_suflags[1][i] &
		     (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
		    (SEGUSE_DIRTY | SEGUSE_EMPTY)) {

			if ((error = lfs_do_segclean(fs, i)) != 0) {
#ifdef DEBUG
				printf("lfs_auto_segclean: lfs_do_segclean"
				       " returned %d for seg %d\n", error, i);
#endif /* DEBUG */
			}
		}
		fs->lfs_suflags[1 - fs->lfs_activesb][i] =
			fs->lfs_suflags[fs->lfs_activesb][i];
	}
}

/*
 * lfs_segunlock --
 *	Release the segment lock, finishing checkpoint processing
 *	(superblock writes and automatic segment cleaning) if this
 *	drops the lock count to zero.
 */
void
lfs_segunlock(struct lfs *fs)
{
	struct segment *sp;
	unsigned long sync, ckp;
	struct buf *bp;
	int do_unmark_dirop = 0;

	sp = fs->lfs_sp;

	simple_lock(&fs->lfs_interlock);
	if (fs->lfs_seglock == 1) {
		if ((sp->seg_flags & SEGM_PROT) == 0)
			do_unmark_dirop = 1;
		simple_unlock(&fs->lfs_interlock);
		sync = sp->seg_flags & SEGM_SYNC;
		ckp = sp->seg_flags & SEGM_CKP;
		if (sp->bpp != sp->cbpp) {
			/* Free allocated segment summary */
			fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
			bp = *sp->bpp;
			lfs_freebuf(fs, bp);
		} else
			printf("unlock to 0 with no summary\n");

		pool_put(&fs->lfs_bpppool, sp->bpp);
		sp->bpp = NULL;

		/*
		 * If we're not sync, we're done with sp, get rid of it.
		 * Otherwise, we keep a local copy around but free
		 * fs->lfs_sp so another process can use it (we have to
		 * wait but they don't have to wait for us).
		 */
		if (!sync)
			pool_put(&fs->lfs_segpool, sp);
		fs->lfs_sp = NULL;

		/*
		 * If the I/O count is non-zero, sleep until it reaches zero.
		 * At the moment, the user's process hangs around so we can
		 * sleep.
		 */
		if (--fs->lfs_iocount == 0)
			LFS_DEBUG_COUNTLOCKED("lfs_segunlock");
		if (fs->lfs_iocount <= 1)
			wakeup(&fs->lfs_iocount);
		/*
		 * If we're not checkpointing, we don't have to block
		 * other processes to wait for a synchronous write
		 * to complete.
		 */
		if (!ckp) {
			simple_lock(&fs->lfs_interlock);
			--fs->lfs_seglock;
			fs->lfs_lockpid = 0;
			simple_unlock(&fs->lfs_interlock);
			wakeup(&fs->lfs_seglock);
		}
		/*
		 * We let checkpoints happen asynchronously.  That means
		 * that during recovery, we have to roll forward between
		 * the two segments described by the first and second
		 * superblocks to make sure that the checkpoint described
		 * by a superblock completed.
		 */
		while (ckp && sync && fs->lfs_iocount)
			(void)tsleep(&fs->lfs_iocount, PRIBIO + 1,
				     "lfs_iocount", 0);
		while (sync && sp->seg_iocount) {
			(void)tsleep(&sp->seg_iocount, PRIBIO + 1,
				     "seg_iocount", 0);
			/* printf("sleeping on iocount %x == %d\n", sp, sp->seg_iocount); */
		}
		if (sync)
			pool_put(&fs->lfs_segpool, sp);

		if (ckp) {
			fs->lfs_nactive = 0;
			/* If we *know* everything's on disk, write both sbs */
			/* XXX should wait for this one */
			if (sync)
				lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]);
			lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]);
			if (!(fs->lfs_ivnode->v_mount->mnt_iflag & IMNT_UNMOUNT))
				lfs_auto_segclean(fs);
			fs->lfs_activesb = 1 - fs->lfs_activesb;
			simple_lock(&fs->lfs_interlock);
			--fs->lfs_seglock;
			fs->lfs_lockpid = 0;
			simple_unlock(&fs->lfs_interlock);
			wakeup(&fs->lfs_seglock);
		}
		/* Reenable fragment size changes */
		lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
		if (do_unmark_dirop)
			lfs_unmark_dirop(fs);
	} else if (fs->lfs_seglock == 0) {
		simple_unlock(&fs->lfs_interlock);
		panic("Seglock not held");
	} else {
		--fs->lfs_seglock;
		simple_unlock(&fs->lfs_interlock);
	}
}


/*
 * drain dirops and start writer.
 */
int
lfs_writer_enter(struct lfs *fs, const char *wmesg)
{
	int error = 0;

	simple_lock(&fs->lfs_interlock);

	/* disallow dirops during flush */
	fs->lfs_writer++;

	while (fs->lfs_dirops > 0) {
		++fs->lfs_diropwait;
		error = ltsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0,
		    &fs->lfs_interlock);
		--fs->lfs_diropwait;
	}

	if (error)
		fs->lfs_writer--;

	simple_unlock(&fs->lfs_interlock);

	return error;
}

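/*
 * Drop the writer reference taken in lfs_writer_enter; when the last
 * writer leaves, wake processes waiting to begin directory operations.
 */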
void
lfs_writer_leave(struct lfs *fs)
{
	boolean_t dowakeup;

	simple_lock(&fs->lfs_interlock);
	dowakeup = !(--fs->lfs_writer);
	simple_unlock(&fs->lfs_interlock);
	if (dowakeup)
		wakeup(&fs->lfs_dirops);
}
578