lfs.c revision 1.26
1/* $NetBSD: lfs.c,v 1.26 2006/11/09 19:36:36 christos Exp $ */
2/*-
3 * Copyright (c) 2003 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Konrad E. Schroder <perseant@hhhh.org>.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the NetBSD
20 *	Foundation, Inc. and its contributors.
21 * 4. Neither the name of The NetBSD Foundation nor the names of its
22 *    contributors may be used to endorse or promote products derived
23 *    from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37/*
38 * Copyright (c) 1989, 1991, 1993
39 *	The Regents of the University of California.  All rights reserved.
40 * (c) UNIX System Laboratories, Inc.
41 * All or some portions of this file are derived from material licensed
42 * to the University of California by American Telephone and Telegraph
43 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
44 * the permission of UNIX System Laboratories, Inc.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
48 * are met:
49 * 1. Redistributions of source code must retain the above copyright
50 *    notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 *    notice, this list of conditions and the following disclaimer in the
53 *    documentation and/or other materials provided with the distribution.
54 * 3. Neither the name of the University nor the names of its contributors
55 *    may be used to endorse or promote products derived from this software
56 *    without specific prior written permission.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 *
70 *	@(#)ufs_bmap.c	8.8 (Berkeley) 8/11/95
71 */
72
73
74#include <sys/types.h>
75#include <sys/param.h>
76#include <sys/time.h>
77#include <sys/buf.h>
78#include <sys/mount.h>
79
80#include <ufs/ufs/inode.h>
81#include <ufs/ufs/ufsmount.h>
82#define vnode uvnode
83#include <ufs/lfs/lfs.h>
84#undef vnode
85
86#include <assert.h>
87#include <err.h>
88#include <errno.h>
89#include <stdarg.h>
90#include <stdio.h>
91#include <stdlib.h>
92#include <string.h>
93#include <unistd.h>
94#include <util.h>
95
96#include "bufcache.h"
97#include "vnode.h"
98#include "lfs_user.h"
99#include "segwrite.h"
100
/* Route panic() calls made in this file to call_panic(), defined below. */
#define panic call_panic

/* Checksum routines and the warning printer; defined outside this file. */
extern u_int32_t cksum(void *, size_t);
extern u_int32_t lfs_sb_cksum(struct dlfs *);
extern void pwarn(const char *, ...);

/*
 * Vnode lists and vnode counter defined outside this file;
 * lfs_raw_vget() links every vnode it creates onto both lists and
 * increments the counter.
 */
extern struct uvnodelst vnodelist;
extern struct uvnodelst getvnodelist[VNODE_HASH_MAX];
extern int nvnodes;

/* Forward declaration; used by lfs_balloc() to grow a fragment. */
static int
lfs_fragextend(struct uvnode *, int, int, daddr_t, struct ubuf **);

/* NOTE(review): presumably nonzero once the filesystem has been modified -- confirm against users. */
int fsdirty = 0;
/* Handler invoked by call_panic(); defaults to my_vpanic(). */
void (*panic_func)(int, const char *, va_list) = my_vpanic;
116
117/*
118 * LFS buffer and uvnode operations
119 */
120
121int
122lfs_vop_strategy(struct ubuf * bp)
123{
124	int count;
125
126	if (bp->b_flags & B_READ) {
127		count = pread(bp->b_vp->v_fd, bp->b_data, bp->b_bcount,
128		    dbtob(bp->b_blkno));
129		if (count == bp->b_bcount)
130			bp->b_flags |= B_DONE;
131	} else {
132		count = pwrite(bp->b_vp->v_fd, bp->b_data, bp->b_bcount,
133		    dbtob(bp->b_blkno));
134		if (count == 0) {
135			perror("pwrite");
136			return -1;
137		}
138		bp->b_flags &= ~B_DELWRI;
139		reassignbuf(bp, bp->b_vp);
140	}
141	return 0;
142}
143
144int
145lfs_vop_bwrite(struct ubuf * bp)
146{
147	struct lfs *fs;
148
149	fs = bp->b_vp->v_fs;
150	if (!(bp->b_flags & B_DELWRI)) {
151		fs->lfs_avail -= btofsb(fs, bp->b_bcount);
152	}
153	bp->b_flags |= B_DELWRI | B_LOCKED;
154	reassignbuf(bp, bp->b_vp);
155	brelse(bp);
156	return 0;
157}
158
159/*
160 * ufs_bmaparray does the bmap conversion, and if requested returns the
161 * array of logical blocks which must be traversed to get to a block.
162 * Each entry contains the offset into that block that gets you to the
163 * next block and the disk address of the block (if it is assigned).
164 */
165int
166ufs_bmaparray(struct lfs * fs, struct uvnode * vp, daddr_t bn, daddr_t * bnp, struct indir * ap, int *nump)
167{
168	struct inode *ip;
169	struct ubuf *bp;
170	struct indir a[NIADDR + 1], *xap;
171	daddr_t daddr;
172	daddr_t metalbn;
173	int error, num;
174
175	ip = VTOI(vp);
176
177	if (bn >= 0 && bn < NDADDR) {
178		if (nump != NULL)
179			*nump = 0;
180		*bnp = fsbtodb(fs, ip->i_ffs1_db[bn]);
181		if (*bnp == 0)
182			*bnp = -1;
183		return (0);
184	}
185	xap = ap == NULL ? a : ap;
186	if (!nump)
187		nump = &num;
188	if ((error = ufs_getlbns(fs, vp, bn, xap, nump)) != 0)
189		return (error);
190
191	num = *nump;
192
193	/* Get disk address out of indirect block array */
194	daddr = ip->i_ffs1_ib[xap->in_off];
195
196	for (bp = NULL, ++xap; --num; ++xap) {
197		/* Exit the loop if there is no disk address assigned yet and
198		 * the indirect block isn't in the cache, or if we were
199		 * looking for an indirect block and we've found it. */
200
201		metalbn = xap->in_lbn;
202		if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn)
203			break;
204		/*
205		 * If we get here, we've either got the block in the cache
206		 * or we have a disk address for it, go fetch it.
207		 */
208		if (bp)
209			brelse(bp);
210
211		xap->in_exists = 1;
212		bp = getblk(vp, metalbn, fs->lfs_bsize);
213
214		if (!(bp->b_flags & (B_DONE | B_DELWRI))) {
215			bp->b_blkno = fsbtodb(fs, daddr);
216			bp->b_flags |= B_READ;
217			VOP_STRATEGY(bp);
218		}
219		daddr = ((ufs_daddr_t *) bp->b_data)[xap->in_off];
220	}
221	if (bp)
222		brelse(bp);
223
224	daddr = fsbtodb(fs, (ufs_daddr_t) daddr);
225	*bnp = daddr == 0 ? -1 : daddr;
226	return (0);
227}
228
229/*
230 * Create an array of logical block number/offset pairs which represent the
231 * path of indirect blocks required to access a data block.  The first "pair"
232 * contains the logical block number of the appropriate single, double or
233 * triple indirect block and the offset into the inode indirect block array.
234 * Note, the logical block number of the inode single/double/triple indirect
235 * block appears twice in the array, once with the offset into the i_ffs1_ib and
236 * once with the offset into the page itself.
237 */
238int
239ufs_getlbns(struct lfs * fs, struct uvnode * vp, daddr_t bn, struct indir * ap, int *nump)
240{
241	daddr_t metalbn, realbn;
242	int64_t blockcnt;
243	int lbc;
244	int i, numlevels, off;
245	int lognindir, indir;
246
247	metalbn = 0;    /* XXXGCC -Wuninitialized [sh3] */
248
249	if (nump)
250		*nump = 0;
251	numlevels = 0;
252	realbn = bn;
253	if (bn < 0)
254		bn = -bn;
255
256	lognindir = -1;
257	for (indir = fs->lfs_nindir; indir; indir >>= 1)
258		++lognindir;
259
260	/* Determine the number of levels of indirection.  After this loop is
261	 * done, blockcnt indicates the number of data blocks possible at the
262	 * given level of indirection, and NIADDR - i is the number of levels
263	 * of indirection needed to locate the requested block. */
264
265	bn -= NDADDR;
266	for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) {
267		if (i == 0)
268			return (EFBIG);
269
270		lbc += lognindir;
271		blockcnt = (int64_t) 1 << lbc;
272
273		if (bn < blockcnt)
274			break;
275	}
276
277	/* Calculate the address of the first meta-block. */
278	metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + NIADDR - i);
279
280	/* At each iteration, off is the offset into the bap array which is an
281	 * array of disk addresses at the current level of indirection. The
282	 * logical block number and the offset in that block are stored into
283	 * the argument array. */
284	ap->in_lbn = metalbn;
285	ap->in_off = off = NIADDR - i;
286	ap->in_exists = 0;
287	ap++;
288	for (++numlevels; i <= NIADDR; i++) {
289		/* If searching for a meta-data block, quit when found. */
290		if (metalbn == realbn)
291			break;
292
293		lbc -= lognindir;
294		blockcnt = (int64_t) 1 << lbc;
295		off = (bn >> lbc) & (fs->lfs_nindir - 1);
296
297		++numlevels;
298		ap->in_lbn = metalbn;
299		ap->in_off = off;
300		ap->in_exists = 0;
301		++ap;
302
303		metalbn -= -1 + (off << lbc);
304	}
305	if (nump)
306		*nump = numlevels;
307	return (0);
308}
309
310int
311lfs_vop_bmap(struct uvnode * vp, daddr_t lbn, daddr_t * daddrp)
312{
313	return ufs_bmaparray(vp->v_fs, vp, lbn, daddrp, NULL, NULL);
314}
315
316/* Search a block for a specific dinode. */
317struct ufs1_dinode *
318lfs_ifind(struct lfs * fs, ino_t ino, struct ubuf * bp)
319{
320	struct ufs1_dinode *dip = (struct ufs1_dinode *) bp->b_data;
321	struct ufs1_dinode *ldip, *fin;
322
323	fin = dip + INOPB(fs);
324
325	/*
326	 * Read the inode block backwards, since later versions of the
327	 * inode will supercede earlier ones.  Though it is unlikely, it is
328	 * possible that the same inode will appear in the same inode block.
329	 */
330	for (ldip = fin - 1; ldip >= dip; --ldip)
331		if (ldip->di_inumber == ino)
332			return (ldip);
333	return NULL;
334}
335
336/*
337 * lfs_raw_vget makes us a new vnode from the inode at the given disk address.
338 * XXX it currently loses atime information.
339 */
340struct uvnode *
341lfs_raw_vget(struct lfs * fs, ino_t ino, int fd, ufs_daddr_t daddr)
342{
343	struct uvnode *vp;
344	struct inode *ip;
345	struct ufs1_dinode *dip;
346	struct ubuf *bp;
347	int i, hash;
348
349	vp = ecalloc(1, sizeof(*vp));
350	vp->v_fd = fd;
351	vp->v_fs = fs;
352	vp->v_usecount = 0;
353	vp->v_strategy_op = lfs_vop_strategy;
354	vp->v_bwrite_op = lfs_vop_bwrite;
355	vp->v_bmap_op = lfs_vop_bmap;
356	LIST_INIT(&vp->v_cleanblkhd);
357	LIST_INIT(&vp->v_dirtyblkhd);
358
359	ip = ecalloc(1, sizeof(*ip));
360
361	ip->i_din.ffs1_din = ecalloc(1, sizeof(*ip->i_din.ffs1_din));
362
363	/* Initialize the inode -- from lfs_vcreate. */
364	ip->inode_ext.lfs = ecalloc(1, sizeof(*ip->inode_ext.lfs));
365	vp->v_data = ip;
366	/* ip->i_vnode = vp; */
367	ip->i_number = ino;
368	ip->i_lockf = 0;
369	ip->i_diroff = 0;
370	ip->i_lfs_effnblks = 0;
371	ip->i_flag = 0;
372
373	/* Load inode block and find inode */
374	if (daddr > 0) {
375		bread(fs->lfs_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize, NULL, &bp);
376		bp->b_flags |= B_AGE;
377		dip = lfs_ifind(fs, ino, bp);
378		if (dip == NULL) {
379			brelse(bp);
380			free(ip);
381			free(vp);
382			return NULL;
383		}
384		memcpy(ip->i_din.ffs1_din, dip, sizeof(*dip));
385		brelse(bp);
386	}
387	ip->i_number = ino;
388	/* ip->i_devvp = fs->lfs_devvp; */
389	ip->i_lfs = fs;
390
391	ip->i_ffs_effnlink = ip->i_ffs1_nlink;
392	ip->i_lfs_effnblks = ip->i_ffs1_blocks;
393	ip->i_lfs_osize = ip->i_ffs1_size;
394#if 0
395	if (fs->lfs_version > 1) {
396		ip->i_ffs1_atime = ts.tv_sec;
397		ip->i_ffs1_atimensec = ts.tv_nsec;
398	}
399#endif
400
401	memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize));
402	for (i = 0; i < NDADDR; i++)
403		if (ip->i_ffs1_db[i] != 0)
404			ip->i_lfs_fragsize[i] = blksize(fs, ip, i);
405
406	++nvnodes;
407	hash = ((int)(intptr_t)fs + ino) & (VNODE_HASH_MAX - 1);
408	LIST_INSERT_HEAD(&getvnodelist[hash], vp, v_getvnodes);
409	LIST_INSERT_HEAD(&vnodelist, vp, v_mntvnodes);
410
411	return vp;
412}
413
414static struct uvnode *
415lfs_vget(void *vfs, ino_t ino)
416{
417	struct lfs *fs = (struct lfs *)vfs;
418	ufs_daddr_t daddr;
419	struct ubuf *bp;
420	IFILE *ifp;
421
422	LFS_IENTRY(ifp, fs, ino, bp);
423	daddr = ifp->if_daddr;
424	brelse(bp);
425	if (daddr <= 0 || dtosn(fs, daddr) >= fs->lfs_nseg)
426		return NULL;
427	return lfs_raw_vget(fs, ino, fs->lfs_ivnode->v_fd, daddr);
428}
429
430/* Check superblock magic number and checksum */
431static int
432check_sb(struct lfs *fs)
433{
434	u_int32_t checksum;
435
436	if (fs->lfs_magic != LFS_MAGIC) {
437		printf("Superblock magic number (0x%lx) does not match "
438		       "expected 0x%lx\n", (unsigned long) fs->lfs_magic,
439		       (unsigned long) LFS_MAGIC);
440		return 1;
441	}
442	/* checksum */
443	checksum = lfs_sb_cksum(&(fs->lfs_dlfs));
444	if (fs->lfs_cksum != checksum) {
445		printf("Superblock checksum (%lx) does not match computed checksum (%lx)\n",
446		    (unsigned long) fs->lfs_cksum, (unsigned long) checksum);
447		return 1;
448	}
449	return 0;
450}
451
452/* Initialize LFS library; load superblocks and choose which to use. */
453struct lfs *
454lfs_init(int devfd, daddr_t sblkno, daddr_t idaddr, int dummy_read, int debug)
455{
456	struct uvnode *devvp;
457	struct ubuf *bp;
458	int tryalt;
459	struct lfs *fs, *altfs;
460	int error;
461
462	vfs_init();
463
464	devvp = ecalloc(1, sizeof(*devvp));
465	devvp->v_fs = NULL;
466	devvp->v_fd = devfd;
467	devvp->v_strategy_op = raw_vop_strategy;
468	devvp->v_bwrite_op = raw_vop_bwrite;
469	devvp->v_bmap_op = raw_vop_bmap;
470	LIST_INIT(&devvp->v_cleanblkhd);
471	LIST_INIT(&devvp->v_dirtyblkhd);
472
473	tryalt = 0;
474	if (dummy_read) {
475		if (sblkno == 0)
476			sblkno = btodb(LFS_LABELPAD);
477		fs = ecalloc(1, sizeof(*fs));
478		fs->lfs_devvp = devvp;
479	} else {
480		if (sblkno == 0) {
481			sblkno = btodb(LFS_LABELPAD);
482			tryalt = 1;
483		} else if (debug) {
484			printf("No -b flag given, not attempting to verify checkpoint\n");
485		}
486		error = bread(devvp, sblkno, LFS_SBPAD, NOCRED, &bp);
487		fs = ecalloc(1, sizeof(*fs));
488		fs->lfs_dlfs = *((struct dlfs *) bp->b_data);
489		fs->lfs_devvp = devvp;
490		bp->b_flags |= B_INVAL;
491		brelse(bp);
492
493		if (tryalt) {
494			error = bread(devvp, fsbtodb(fs, fs->lfs_sboffs[1]),
495		    	LFS_SBPAD, NOCRED, &bp);
496			altfs = ecalloc(1, sizeof(*altfs));
497			altfs->lfs_dlfs = *((struct dlfs *) bp->b_data);
498			altfs->lfs_devvp = devvp;
499			bp->b_flags |= B_INVAL;
500			brelse(bp);
501
502			if (check_sb(fs) || fs->lfs_idaddr <= 0) {
503				if (debug)
504					printf("Primary superblock is no good, using first alternate\n");
505				free(fs);
506				fs = altfs;
507			} else {
508				/* If both superblocks check out, try verification */
509				if (check_sb(altfs)) {
510					if (debug)
511						printf("First alternate superblock is no good, using primary\n");
512					free(altfs);
513				} else {
514					if (lfs_verify(fs, altfs, devvp, debug) == fs) {
515						free(altfs);
516					} else {
517						free(fs);
518						fs = altfs;
519					}
520				}
521			}
522		}
523		if (check_sb(fs)) {
524			free(fs);
525			return NULL;
526		}
527	}
528
529	/* Compatibility */
530	if (fs->lfs_version < 2) {
531		fs->lfs_sumsize = LFS_V1_SUMMARY_SIZE;
532		fs->lfs_ibsize = fs->lfs_bsize;
533		fs->lfs_start = fs->lfs_sboffs[0];
534		fs->lfs_tstamp = fs->lfs_otstamp;
535		fs->lfs_fsbtodb = 0;
536	}
537
538	if (!dummy_read) {
539		fs->lfs_suflags = emalloc(2 * sizeof(u_int32_t *));
540		fs->lfs_suflags[0] = emalloc(fs->lfs_nseg * sizeof(u_int32_t));
541		fs->lfs_suflags[1] = emalloc(fs->lfs_nseg * sizeof(u_int32_t));
542	}
543
544	if (idaddr == 0)
545		idaddr = fs->lfs_idaddr;
546	else
547		fs->lfs_idaddr = idaddr;
548	/* NB: If dummy_read!=0, idaddr==0 here so we get a fake inode. */
549	fs->lfs_ivnode = lfs_raw_vget(fs,
550		(dummy_read ? LFS_IFILE_INUM : fs->lfs_ifile), devvp->v_fd,
551		idaddr);
552	if (fs->lfs_ivnode == NULL)
553		return NULL;
554
555	register_vget((void *)fs, lfs_vget);
556
557	return fs;
558}
559
560/*
561 * Check partial segment validity between fs->lfs_offset and the given goal.
562 *
563 * If goal == 0, just keep on going until the segments stop making sense,
564 * and return the address of the last valid partial segment.
565 *
566 * If goal != 0, return the address of the first partial segment that failed,
567 * or "goal" if we reached it without failure (the partial segment *at* goal
568 * need not be valid).
569 */
570ufs_daddr_t
571try_verify(struct lfs *osb, struct uvnode *devvp, ufs_daddr_t goal, int debug)
572{
573	ufs_daddr_t daddr, odaddr;
574	SEGSUM *sp;
575	int i, bc, hitclean;
576	struct ubuf *bp;
577	ufs_daddr_t nodirop_daddr;
578	u_int64_t serial;
579
580	bc = 0;
581	hitclean = 0;
582	odaddr = -1;
583	daddr = osb->lfs_offset;
584	nodirop_daddr = daddr;
585	serial = osb->lfs_serial;
586	while (daddr != goal) {
587		/*
588		 * Don't mistakenly read a superblock, if there is one here.
589		 */
590		if (sntod(osb, dtosn(osb, daddr)) == daddr) {
591			if (daddr == osb->lfs_start)
592				daddr += btofsb(osb, LFS_LABELPAD);
593			for (i = 0; i < LFS_MAXNUMSB; i++) {
594				if (osb->lfs_sboffs[i] < daddr)
595					break;
596				if (osb->lfs_sboffs[i] == daddr)
597					daddr += btofsb(osb, LFS_SBPAD);
598			}
599		}
600
601		/* Read in summary block */
602		bread(devvp, fsbtodb(osb, daddr), osb->lfs_sumsize, NULL, &bp);
603		sp = (SEGSUM *)bp->b_data;
604
605		/*
606		 * Check for a valid segment summary belonging to our fs.
607		 */
608		if (sp->ss_magic != SS_MAGIC ||
609		    sp->ss_ident != osb->lfs_ident ||
610		    sp->ss_serial < serial ||	/* XXX strengthen this */
611		    sp->ss_sumsum != cksum(&sp->ss_datasum, osb->lfs_sumsize -
612			sizeof(sp->ss_sumsum))) {
613			brelse(bp);
614			if (debug) {
615				if (sp->ss_magic != SS_MAGIC)
616					pwarn("pseg at 0x%x: "
617					      "wrong magic number\n",
618					      (int)daddr);
619				else if (sp->ss_ident != osb->lfs_ident)
620					pwarn("pseg at 0x%x: "
621					      "expected ident %llx, got %llx\n",
622					      (int)daddr,
623					      (long long)sp->ss_ident,
624					      (long long)osb->lfs_ident);
625				else if (sp->ss_serial >= serial)
626					pwarn("pseg at 0x%x: "
627					      "serial %d < %d\n", (int)daddr,
628					      (int)sp->ss_serial, (int)serial);
629				else
630					pwarn("pseg at 0x%x: "
631					      "summary checksum wrong\n",
632					      (int)daddr);
633			}
634			break;
635		}
636		if (debug && sp->ss_serial != serial)
637			pwarn("warning, serial=%d ss_serial=%d\n",
638				(int)serial, (int)sp->ss_serial);
639		++serial;
640		bc = check_summary(osb, sp, daddr, debug, devvp, NULL);
641		if (bc == 0) {
642			brelse(bp);
643			break;
644		}
645		if (debug)
646			pwarn("summary good: 0x%x/%d\n", (int)daddr,
647			      (int)sp->ss_serial);
648		assert (bc > 0);
649		odaddr = daddr;
650		daddr += btofsb(osb, osb->lfs_sumsize + bc);
651		if (dtosn(osb, odaddr) != dtosn(osb, daddr) ||
652		    dtosn(osb, daddr) != dtosn(osb, daddr +
653			btofsb(osb, osb->lfs_sumsize + osb->lfs_bsize) - 1)) {
654			daddr = sp->ss_next;
655		}
656
657		/*
658		 * Check for the beginning and ending of a sequence of
659		 * dirops.  Writes from the cleaner never involve new
660		 * information, and are always checkpoints; so don't try
661		 * to roll forward through them.  Likewise, psegs written
662		 * by a previous roll-forward attempt are not interesting.
663		 */
664		if (sp->ss_flags & (SS_CLEAN | SS_RFW))
665			hitclean = 1;
666		if (hitclean == 0 && (sp->ss_flags & SS_CONT) == 0)
667			nodirop_daddr = daddr;
668
669		brelse(bp);
670	}
671
672	if (goal == 0)
673		return nodirop_daddr;
674	else
675		return daddr;
676}
677
678/* Use try_verify to check whether the newer superblock is valid. */
679struct lfs *
680lfs_verify(struct lfs *sb0, struct lfs *sb1, struct uvnode *devvp, int debug)
681{
682	ufs_daddr_t daddr;
683	struct lfs *osb, *nsb;
684
685	/*
686	 * Verify the checkpoint of the newer superblock,
687	 * if the timestamp/serial number of the two superblocks is
688	 * different.
689	 */
690
691	osb = NULL;
692	if (debug)
693		pwarn("sb0 %lld, sb1 %lld",
694		      (long long) sb0->lfs_serial,
695		      (long long) sb1->lfs_serial);
696
697	if ((sb0->lfs_version == 1 &&
698		sb0->lfs_otstamp != sb1->lfs_otstamp) ||
699	    (sb0->lfs_version > 1 &&
700		sb0->lfs_serial != sb1->lfs_serial)) {
701		if (sb0->lfs_version == 1) {
702			if (sb0->lfs_otstamp > sb1->lfs_otstamp) {
703				osb = sb1;
704				nsb = sb0;
705			} else {
706				osb = sb0;
707				nsb = sb1;
708			}
709		} else {
710			if (sb0->lfs_serial > sb1->lfs_serial) {
711				osb = sb1;
712				nsb = sb0;
713			} else {
714				osb = sb0;
715				nsb = sb1;
716			}
717		}
718		if (debug) {
719			printf("Attempting to verify newer checkpoint...");
720			fflush(stdout);
721		}
722		daddr = try_verify(osb, devvp, nsb->lfs_offset, debug);
723
724		if (debug)
725			printf("done.\n");
726		if (daddr == nsb->lfs_offset) {
727			pwarn("** Newer checkpoint verified, recovered %lld seconds of data\n",
728			    (long long) nsb->lfs_tstamp - (long long) osb->lfs_tstamp);
729			sbdirty();
730		} else {
731			pwarn("** Newer checkpoint invalid, lost %lld seconds of data\n", (long long) nsb->lfs_tstamp - (long long) osb->lfs_tstamp);
732		}
733		return (daddr == nsb->lfs_offset ? nsb : osb);
734	}
735	/* Nothing to check */
736	return osb;
737}
738
739/* Verify a partial-segment summary; return the number of bytes on disk. */
740int
741check_summary(struct lfs *fs, SEGSUM *sp, ufs_daddr_t pseg_addr, int debug,
742	      struct uvnode *devvp, void (func(ufs_daddr_t, FINFO *)))
743{
744	FINFO *fp;
745	int bc;			/* Bytes in partial segment */
746	int nblocks;
747	ufs_daddr_t seg_addr, daddr;
748	ufs_daddr_t *dp, *idp;
749	struct ubuf *bp;
750	int i, j, k, datac, len;
751	long sn;
752	u_int32_t *datap;
753	u_int32_t ccksum;
754
755	sn = dtosn(fs, pseg_addr);
756	seg_addr = sntod(fs, sn);
757
758	/* We've already checked the sumsum, just do the data bounds and sum */
759
760	/* Count the blocks. */
761	nblocks = howmany(sp->ss_ninos, INOPB(fs));
762	bc = nblocks << (fs->lfs_version > 1 ? fs->lfs_ffshift : fs->lfs_bshift);
763	assert(bc >= 0);
764
765	fp = (FINFO *) (sp + 1);
766	for (i = 0; i < sp->ss_nfinfo; i++) {
767		nblocks += fp->fi_nblocks;
768		bc += fp->fi_lastlength + ((fp->fi_nblocks - 1)
769					   << fs->lfs_bshift);
770		assert(bc >= 0);
771		fp = (FINFO *) (fp->fi_blocks + fp->fi_nblocks);
772		if (((char *)fp) - (char *)sp > fs->lfs_sumsize)
773			return 0;
774	}
775	datap = emalloc(nblocks * sizeof(*datap));
776	datac = 0;
777
778	dp = (ufs_daddr_t *) sp;
779	dp += fs->lfs_sumsize / sizeof(ufs_daddr_t);
780	dp--;
781
782	idp = dp;
783	daddr = pseg_addr + btofsb(fs, fs->lfs_sumsize);
784	fp = (FINFO *) (sp + 1);
785	for (i = 0, j = 0;
786	     i < sp->ss_nfinfo || j < howmany(sp->ss_ninos, INOPB(fs)); i++) {
787		if (i >= sp->ss_nfinfo && *idp != daddr) {
788			pwarn("Not enough inode blocks in pseg at 0x%" PRIx32
789			      ": found %d, wanted %d\n",
790			      pseg_addr, j, howmany(sp->ss_ninos, INOPB(fs)));
791			if (debug)
792				pwarn("*idp=%x, daddr=%" PRIx32 "\n", *idp,
793				      daddr);
794			break;
795		}
796		while (j < howmany(sp->ss_ninos, INOPB(fs)) && *idp == daddr) {
797			bread(devvp, fsbtodb(fs, daddr), fs->lfs_ibsize, NOCRED, &bp);
798			datap[datac++] = ((u_int32_t *) (bp->b_data))[0];
799			brelse(bp);
800
801			++j;
802			daddr += btofsb(fs, fs->lfs_ibsize);
803			--idp;
804		}
805		if (i < sp->ss_nfinfo) {
806			if (func)
807				func(daddr, fp);
808			for (k = 0; k < fp->fi_nblocks; k++) {
809				len = (k == fp->fi_nblocks - 1 ?
810				       fp->fi_lastlength
811				       : fs->lfs_bsize);
812				bread(devvp, fsbtodb(fs, daddr), len, NOCRED, &bp);
813				datap[datac++] = ((u_int32_t *) (bp->b_data))[0];
814				brelse(bp);
815				daddr += btofsb(fs, len);
816			}
817			fp = (FINFO *) (fp->fi_blocks + fp->fi_nblocks);
818		}
819	}
820
821	if (datac != nblocks) {
822		pwarn("Partial segment at 0x%llx expected %d blocks counted %d\n",
823		    (long long) pseg_addr, nblocks, datac);
824	}
825	ccksum = cksum(datap, nblocks * sizeof(u_int32_t));
826	/* Check the data checksum */
827	if (ccksum != sp->ss_datasum) {
828		pwarn("Partial segment at 0x%" PRIx32 " data checksum"
829		      " mismatch: given 0x%x, computed 0x%x\n",
830		      pseg_addr, sp->ss_datasum, ccksum);
831		free(datap);
832		return 0;
833	}
834	free(datap);
835	assert(bc >= 0);
836	return bc;
837}
838
/*
 * Default panic handler: print the formatted message on standard output
 * and terminate the process with exit status 8.  "fatal" is not used.
 */
void
my_vpanic(int fatal, const char *fmt, va_list ap)
{
	(void) vprintf(fmt, ap);

	exit(8);
}
846
847void
848call_panic(const char *fmt, ...)
849{
850	va_list ap;
851
852	va_start(ap, fmt);
853        panic_func(1, fmt, ap);
854	va_end(ap);
855}
856
857/* Allocate a new inode. */
858struct uvnode *
859lfs_valloc(struct lfs *fs, ino_t ino)
860{
861	struct ubuf *bp, *cbp;
862	struct ifile *ifp;
863	ino_t new_ino;
864	int error;
865	int new_gen;
866	CLEANERINFO *cip;
867
868	/* Get the head of the freelist. */
869	LFS_GET_HEADFREE(fs, cip, cbp, &new_ino);
870
871	/*
872	 * Remove the inode from the free list and write the new start
873	 * of the free list into the superblock.
874	 */
875	LFS_IENTRY(ifp, fs, new_ino, bp);
876	if (ifp->if_daddr != LFS_UNUSED_DADDR)
877		panic("lfs_valloc: inuse inode %d on the free list", new_ino);
878	LFS_PUT_HEADFREE(fs, cip, cbp, ifp->if_nextfree);
879
880	new_gen = ifp->if_version; /* version was updated by vfree */
881	brelse(bp);
882
883	/* Extend IFILE so that the next lfs_valloc will succeed. */
884	if (fs->lfs_freehd == LFS_UNUSED_INUM) {
885		if ((error = extend_ifile(fs)) != 0) {
886			LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
887			return NULL;
888		}
889	}
890
891	/* Set superblock modified bit and increment file count. */
892        sbdirty();
893	++fs->lfs_nfiles;
894
895        return lfs_raw_vget(fs, ino, fs->lfs_devvp->v_fd, 0x0);
896}
897
898#ifdef IN_FSCK_LFS
899void reset_maxino(ino_t);
900#endif
901
902/*
903 * Add a new block to the Ifile, to accommodate future file creations.
904 */
905int
906extend_ifile(struct lfs *fs)
907{
908	struct uvnode *vp;
909	struct inode *ip;
910	IFILE *ifp;
911	IFILE_V1 *ifp_v1;
912	struct ubuf *bp, *cbp;
913	daddr_t i, blkno, max;
914	ino_t oldlast;
915	CLEANERINFO *cip;
916
917	vp = fs->lfs_ivnode;
918	ip = VTOI(vp);
919	blkno = lblkno(fs, ip->i_ffs1_size);
920
921	lfs_balloc(vp, ip->i_ffs1_size, fs->lfs_bsize, &bp);
922	ip->i_ffs1_size += fs->lfs_bsize;
923	ip->i_flag |= IN_MODIFIED;
924
925	i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) *
926		fs->lfs_ifpb;
927	LFS_GET_HEADFREE(fs, cip, cbp, &oldlast);
928	LFS_PUT_HEADFREE(fs, cip, cbp, i);
929	max = i + fs->lfs_ifpb;
930	fs->lfs_bfree -= btofsb(fs, fs->lfs_bsize);
931
932	if (fs->lfs_version == 1) {
933		for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < max; ++ifp_v1) {
934			ifp_v1->if_version = 1;
935			ifp_v1->if_daddr = LFS_UNUSED_DADDR;
936			ifp_v1->if_nextfree = ++i;
937		}
938		ifp_v1--;
939		ifp_v1->if_nextfree = oldlast;
940	} else {
941		for (ifp = (IFILE *)bp->b_data; i < max; ++ifp) {
942			ifp->if_version = 1;
943			ifp->if_daddr = LFS_UNUSED_DADDR;
944			ifp->if_nextfree = ++i;
945		}
946		ifp--;
947		ifp->if_nextfree = oldlast;
948	}
949	LFS_PUT_TAILFREE(fs, cip, cbp, max - 1);
950
951	LFS_BWRITE_LOG(bp);
952
953#ifdef IN_FSCK_LFS
954	reset_maxino(((ip->i_ffs1_size >> fs->lfs_bshift) - fs->lfs_segtabsz -
955		     fs->lfs_cleansz) * fs->lfs_ifpb);
956#endif
957	return 0;
958}
959
960/*
 * Allocate a block, and do inode and filesystem block accounting for it
 * and for any indirect blocks that may need to be created in order for
 * this block to be created.
964 *
965 * Blocks which have never been accounted for (i.e., which "do not exist")
966 * have disk address 0, which is translated by ufs_bmap to the special value
967 * UNASSIGNED == -1, as in the historical UFS.
968 *
969 * Blocks which have been accounted for but which have not yet been written
970 * to disk are given the new special disk address UNWRITTEN == -2, so that
971 * they can be differentiated from completely new blocks.
972 */
973int
974lfs_balloc(struct uvnode *vp, off_t startoffset, int iosize, struct ubuf **bpp)
975{
976	int offset;
977	daddr_t daddr, idaddr;
978	struct ubuf *ibp, *bp;
979	struct inode *ip;
980	struct lfs *fs;
981	struct indir indirs[NIADDR+2], *idp;
982	daddr_t	lbn, lastblock;
983	int bb, bcount;
984	int error, frags, i, nsize, osize, num;
985
986	ip = VTOI(vp);
987	fs = ip->i_lfs;
988	offset = blkoff(fs, startoffset);
989	lbn = lblkno(fs, startoffset);
990
991	/*
992	 * Three cases: it's a block beyond the end of file, it's a block in
993	 * the file that may or may not have been assigned a disk address or
994	 * we're writing an entire block.
995	 *
996	 * Note, if the daddr is UNWRITTEN, the block already exists in
997	 * the cache (it was read or written earlier).	If so, make sure
998	 * we don't count it as a new block or zero out its contents. If
999	 * it did not, make sure we allocate any necessary indirect
1000	 * blocks.
1001	 *
1002	 * If we are writing a block beyond the end of the file, we need to
1003	 * check if the old last block was a fragment.	If it was, we need
1004	 * to rewrite it.
1005	 */
1006
1007	if (bpp)
1008		*bpp = NULL;
1009
1010	/* Check for block beyond end of file and fragment extension needed. */
1011	lastblock = lblkno(fs, ip->i_ffs1_size);
1012	if (lastblock < NDADDR && lastblock < lbn) {
1013		osize = blksize(fs, ip, lastblock);
1014		if (osize < fs->lfs_bsize && osize > 0) {
1015			if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
1016						    lastblock,
1017						    (bpp ? &bp : NULL))))
1018				return (error);
1019			ip->i_ffs1_size = ip->i_ffs1_size =
1020			    (lastblock + 1) * fs->lfs_bsize;
1021			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1022			if (bpp)
1023				(void) VOP_BWRITE(bp);
1024		}
1025	}
1026
1027	/*
1028	 * If the block we are writing is a direct block, it's the last
1029	 * block in the file, and offset + iosize is less than a full
1030	 * block, we can write one or more fragments.  There are two cases:
1031	 * the block is brand new and we should allocate it the correct
1032	 * size or it already exists and contains some fragments and
1033	 * may need to extend it.
1034	 */
1035	if (lbn < NDADDR && lblkno(fs, ip->i_ffs1_size) <= lbn) {
1036		osize = blksize(fs, ip, lbn);
1037		nsize = fragroundup(fs, offset + iosize);
1038		if (lblktosize(fs, lbn) >= ip->i_ffs1_size) {
1039			/* Brand new block or fragment */
1040			frags = numfrags(fs, nsize);
1041			bb = fragstofsb(fs, frags);
1042			if (bpp) {
1043				*bpp = bp = getblk(vp, lbn, nsize);
1044				bp->b_blkno = UNWRITTEN;
1045			}
1046			ip->i_lfs_effnblks += bb;
1047			fs->lfs_bfree -= bb;
1048			ip->i_ffs1_db[lbn] = UNWRITTEN;
1049		} else {
1050			if (nsize <= osize) {
1051				/* No need to extend */
1052				if (bpp && (error = bread(vp, lbn, osize, NOCRED, &bp)))
1053					return error;
1054			} else {
1055				/* Extend existing block */
1056				if ((error =
1057				     lfs_fragextend(vp, osize, nsize, lbn,
1058						    (bpp ? &bp : NULL))))
1059					return error;
1060			}
1061			if (bpp)
1062				*bpp = bp;
1063		}
1064		return 0;
1065	}
1066
1067	error = ufs_bmaparray(fs, vp, lbn, &daddr, &indirs[0], &num);
1068	if (error)
1069		return (error);
1070
1071	daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
1072
1073	/*
1074	 * Do byte accounting all at once, so we can gracefully fail *before*
1075	 * we start assigning blocks.
1076	 */
1077        bb = fsbtodb(fs, 1); /* bb = VFSTOUFS(vp->v_mount)->um_seqinc; */
1078	bcount = 0;
1079	if (daddr == UNASSIGNED) {
1080		bcount = bb;
1081	}
1082	for (i = 1; i < num; ++i) {
1083		if (!indirs[i].in_exists) {
1084			bcount += bb;
1085		}
1086	}
1087	fs->lfs_bfree -= bcount;
1088	ip->i_lfs_effnblks += bcount;
1089
1090	if (daddr == UNASSIGNED) {
1091		if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) {
1092			ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
1093		}
1094
1095		/*
1096		 * Create new indirect blocks if necessary
1097		 */
1098		if (num > 1) {
1099			idaddr = ip->i_ffs1_ib[indirs[0].in_off];
1100			for (i = 1; i < num; ++i) {
1101				ibp = getblk(vp, indirs[i].in_lbn,
1102				    fs->lfs_bsize);
1103				if (!indirs[i].in_exists) {
1104					memset(ibp->b_data, 0, ibp->b_bufsize);
1105					ibp->b_blkno = UNWRITTEN;
1106				} else if (!(ibp->b_flags & (B_DELWRI | B_DONE))) {
1107					ibp->b_blkno = fsbtodb(fs, idaddr);
1108					ibp->b_flags |= B_READ;
1109					VOP_STRATEGY(ibp);
1110				}
1111				/*
1112				 * This block exists, but the next one may not.
1113				 * If that is the case mark it UNWRITTEN to
1114                                 * keep the accounting straight.
1115				 */
1116				/* XXX ondisk32 */
1117				if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0)
1118					((int32_t *)ibp->b_data)[indirs[i].in_off] =
1119						UNWRITTEN;
1120				/* XXX ondisk32 */
1121				idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off];
1122				if ((error = VOP_BWRITE(ibp)))
1123					return error;
1124			}
1125		}
1126	}
1127
1128
1129	/*
1130	 * Get the existing block from the cache, if requested.
1131	 */
1132	frags = fsbtofrags(fs, bb);
1133	if (bpp)
1134		*bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn));
1135
1136	/*
1137	 * The block we are writing may be a brand new block
1138	 * in which case we need to do accounting.
1139	 *
1140	 * We can tell a truly new block because ufs_bmaparray will say
1141	 * it is UNASSIGNED.  Once we allocate it we will assign it the
1142	 * disk address UNWRITTEN.
1143	 */
1144	if (daddr == UNASSIGNED) {
1145		if (bpp) {
1146			/* Note the new address */
1147			bp->b_blkno = UNWRITTEN;
1148		}
1149
1150		switch (num) {
1151		    case 0:
1152			ip->i_ffs1_db[lbn] = UNWRITTEN;
1153			break;
1154		    case 1:
1155			ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
1156			break;
1157		    default:
1158			idp = &indirs[num - 1];
1159			if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED,
1160				  &ibp))
1161				panic("lfs_balloc: bread bno %lld",
1162				    (long long)idp->in_lbn);
1163			/* XXX ondisk32 */
1164			((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN;
1165			VOP_BWRITE(ibp);
1166		}
1167	} else if (bpp && !(bp->b_flags & (B_DONE|B_DELWRI))) {
1168		/*
1169		 * Not a brand new block, also not in the cache;
1170		 * read it in from disk.
1171		 */
1172		if (iosize == fs->lfs_bsize)
1173			/* Optimization: I/O is unnecessary. */
1174			bp->b_blkno = daddr;
1175		else {
1176			/*
1177			 * We need to read the block to preserve the
1178			 * existing bytes.
1179			 */
1180			bp->b_blkno = daddr;
1181			bp->b_flags |= B_READ;
1182			VOP_STRATEGY(bp);
1183			return 0;
1184		}
1185	}
1186
1187	return (0);
1188}
1189
1190int
1191lfs_fragextend(struct uvnode *vp, int osize, int nsize, daddr_t lbn,
1192               struct ubuf **bpp)
1193{
1194	struct inode *ip;
1195	struct lfs *fs;
1196	long bb;
1197	int error;
1198	size_t obufsize;
1199
1200	ip = VTOI(vp);
1201	fs = ip->i_lfs;
1202	bb = (long)fragstofsb(fs, numfrags(fs, nsize - osize));
1203	error = 0;
1204
1205	/*
1206	 * If we are not asked to actually return the block, all we need
1207	 * to do is allocate space for it.  UBC will handle dirtying the
1208	 * appropriate things and making sure it all goes to disk.
1209	 * Don't bother to read in that case.
1210	 */
1211	if (bpp && (error = bread(vp, lbn, osize, NOCRED, bpp))) {
1212		brelse(*bpp);
1213		goto out;
1214	}
1215
1216	fs->lfs_bfree -= bb;
1217	ip->i_lfs_effnblks += bb;
1218	ip->i_flag |= IN_CHANGE | IN_UPDATE;
1219
1220	if (bpp) {
1221		obufsize = (*bpp)->b_bufsize;
1222		(*bpp)->b_data = erealloc((*bpp)->b_data, nsize);
1223		(void)memset((*bpp)->b_data + osize, 0, nsize - osize);
1224	}
1225
1226    out:
1227	return (error);
1228}
1229