suj.c revision 222958
136270Swpaul/*-
236270Swpaul * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
336270Swpaul * All rights reserved.
436270Swpaul *
536270Swpaul * Redistribution and use in source and binary forms, with or without
636270Swpaul * modification, are permitted provided that the following conditions
736270Swpaul * are met:
836270Swpaul * 1. Redistributions of source code must retain the above copyright
936270Swpaul *    notice, this list of conditions and the following disclaimer.
1036270Swpaul * 2. Redistributions in binary form must reproduce the above copyright
1136270Swpaul *    notice, this list of conditions and the following disclaimer in the
1236270Swpaul *    documentation and/or other materials provided with the distribution.
1336270Swpaul *
1436270Swpaul * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
1536270Swpaul * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1636270Swpaul * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1736270Swpaul * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
1836270Swpaul * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1936270Swpaul * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2036270Swpaul * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2136270Swpaul * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2236270Swpaul * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2336270Swpaul * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2436270Swpaul * SUCH DAMAGE.
2536270Swpaul */
2636270Swpaul
2736270Swpaul#include <sys/cdefs.h>
2836270Swpaul__FBSDID("$FreeBSD: head/sbin/fsck_ffs/suj.c 222958 2011-06-10 22:48:35Z jeff $");
2936270Swpaul
3036270Swpaul#include <sys/param.h>
3136270Swpaul#include <sys/disk.h>
3250477Speter#include <sys/disklabel.h>
3336270Swpaul#include <sys/mount.h>
3436270Swpaul#include <sys/stat.h>
3536270Swpaul
3636270Swpaul#include <ufs/ufs/ufsmount.h>
3736270Swpaul#include <ufs/ufs/dinode.h>
3836270Swpaul#include <ufs/ufs/dir.h>
3936270Swpaul#include <ufs/ffs/fs.h>
4036270Swpaul
4139583Swpaul#include <assert.h>
4236270Swpaul#include <err.h>
4336270Swpaul#include <setjmp.h>
4436270Swpaul#include <stdarg.h>
4536270Swpaul#include <stdio.h>
4639583Swpaul#include <stdlib.h>
4736270Swpaul#include <stdint.h>
4836270Swpaul#include <libufs.h>
4936270Swpaul#include <string.h>
5036270Swpaul#include <strings.h>
5136270Swpaul#include <sysexits.h>
5236270Swpaul#include <time.h>
5336270Swpaul
5436270Swpaul#include "fsck.h"
5536270Swpaul
/*
 * DOTDOT_OFFSET is the size of a directory entry with a one-character
 * name (presumably the offset at which ".." follows "." -- confirm
 * against its users).
 *
 * SUJ_HASH maps an integer key to a bucket index in [0, SUJ_HASHSIZE)
 * using a multiplicative hash.  The argument is parenthesized so that
 * expressions such as SUJ_HASH(a + b) hash the whole expression, and the
 * multiply is done in unsigned 64-bit arithmetic so that large block
 * numbers cannot trigger signed overflow.  The low bits, and therefore
 * the bucket chosen, are the same as with the signed multiply.
 */
#define	DOTDOT_OFFSET	DIRECTSIZ(1)
#define	SUJ_HASHSIZE	2048
#define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
#define	SUJ_HASH(x)	\
	(((uint64_t)(x) * UINT64_C(2654435761)) & SUJ_HASHMASK)
6036270Swpaul
6136270Swpaulstruct suj_seg {
6236270Swpaul	TAILQ_ENTRY(suj_seg) ss_next;
6336270Swpaul	struct jsegrec	ss_rec;
6436270Swpaul	uint8_t		*ss_blk;
6536270Swpaul};
6639583Swpaul
6739583Swpaulstruct suj_rec {
6839583Swpaul	TAILQ_ENTRY(suj_rec) sr_next;
6939583Swpaul	union jrec	*sr_rec;
7039583Swpaul};
7139583SwpaulTAILQ_HEAD(srechd, suj_rec);
7239583Swpaul
7339583Swpaulstruct suj_ino {
7439583Swpaul	LIST_ENTRY(suj_ino)	si_next;
7536270Swpaul	struct srechd		si_recs;
7636270Swpaul	struct srechd		si_newrecs;
7739583Swpaul	struct srechd		si_movs;
7836270Swpaul	struct jtrncrec		*si_trunc;
7936270Swpaul	ino_t			si_ino;
8036270Swpaul	char			si_skipparent;
8136270Swpaul	char			si_hasrecs;
8236270Swpaul	char			si_blkadj;
8336270Swpaul	char			si_linkadj;
8436270Swpaul	int			si_mode;
8536270Swpaul	nlink_t			si_nlinkadj;
8636270Swpaul	nlink_t			si_nlink;
8736270Swpaul	nlink_t			si_dotlinks;
8836270Swpaul};
8936270SwpaulLIST_HEAD(inohd, suj_ino);
9036270Swpaul
9136270Swpaulstruct suj_blk {
9236270Swpaul	LIST_ENTRY(suj_blk)	sb_next;
9336270Swpaul	struct srechd		sb_recs;
9436270Swpaul	ufs2_daddr_t		sb_blk;
9536270Swpaul};
9636270SwpaulLIST_HEAD(blkhd, suj_blk);
9736270Swpaul
9836270Swpaulstruct data_blk {
9936270Swpaul	LIST_ENTRY(data_blk)	db_next;
10036270Swpaul	uint8_t			*db_buf;
10136270Swpaul	ufs2_daddr_t		db_blk;
10236270Swpaul	int			db_size;
10336270Swpaul	int			db_dirty;
10436270Swpaul};
10536270Swpaul
10636270Swpaulstruct ino_blk {
10736270Swpaul	LIST_ENTRY(ino_blk)	ib_next;
10836270Swpaul	uint8_t			*ib_buf;
10936270Swpaul	int			ib_dirty;
11036270Swpaul	ufs2_daddr_t		ib_blk;
11136270Swpaul};
11236270SwpaulLIST_HEAD(iblkhd, ino_blk);
11336270Swpaul
11436270Swpaulstruct suj_cg {
11536270Swpaul	LIST_ENTRY(suj_cg)	sc_next;
11636270Swpaul	struct blkhd		sc_blkhash[SUJ_HASHSIZE];
11736270Swpaul	struct inohd		sc_inohash[SUJ_HASHSIZE];
11836270Swpaul	struct iblkhd		sc_iblkhash[SUJ_HASHSIZE];
11936270Swpaul	struct ino_blk		*sc_lastiblk;
12036270Swpaul	struct suj_ino		*sc_lastino;
12136270Swpaul	struct suj_blk		*sc_lastblk;
12236270Swpaul	uint8_t			*sc_cgbuf;
12336270Swpaul	struct cg		*sc_cgp;
12436270Swpaul	int			sc_dirty;
12536270Swpaul	int			sc_cgx;
12636270Swpaul};
12736270Swpaul
12836270SwpaulLIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE];
12936270SwpaulLIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE];
13036270Swpaulstruct suj_cg *lastcg;
13136270Swpaulstruct data_blk *lastblk;
13236270Swpaul
13336270SwpaulTAILQ_HEAD(seghd, suj_seg) allsegs;
13436270Swpauluint64_t oldseq;
13536270Swpaulstatic struct uufsd *disk = NULL;
13636270Swpaulstatic struct fs *fs = NULL;
13736270Swpaulino_t sujino;
13836270Swpaul
13936270Swpaul/*
14036270Swpaul * Summary statistics.
14136270Swpaul */
14236270Swpauluint64_t freefrags;
14336270Swpauluint64_t freeblocks;
14436270Swpauluint64_t freeinos;
14536270Swpauluint64_t freedir;
14636270Swpauluint64_t jbytes;
14736270Swpauluint64_t jrecs;
14836270Swpaul
14936270Swpaulstatic jmp_buf	jmpbuf;
15036270Swpaul
15136270Swpaultypedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int);
15236270Swpaulstatic void err_suj(const char *, ...) __dead2;
15336270Swpaulstatic void ino_trunc(ino_t, off_t);
15436270Swpaulstatic void ino_decr(ino_t);
15536270Swpaulstatic void ino_adjust(struct suj_ino *);
15636270Swpaulstatic void ino_build(struct suj_ino *);
15736270Swpaulstatic int blk_isfree(ufs2_daddr_t);
15836270Swpaul
/*
 * malloc() wrapper that never returns NULL: on allocation failure the
 * program exits with EX_OSERR via err().
 */
static void *
errmalloc(size_t n)
{
	void *mem = malloc(n);

	if (mem == NULL)
		err(EX_OSERR, "malloc(%zu)", n);
	return (mem);
}
16936270Swpaul
17036270Swpaul/*
17136270Swpaul * When hit a fatal error in journalling check, print out
17236270Swpaul * the error and then offer to fallback to normal fsck.
17336270Swpaul */
17436270Swpaulstatic void
17536270Swpaulerr_suj(const char * restrict fmt, ...)
17636270Swpaul{
17736270Swpaul	va_list ap;
17836270Swpaul
17936270Swpaul	if (preen)
18036270Swpaul		(void)fprintf(stdout, "%s: ", cdevname);
18136270Swpaul
18236270Swpaul	va_start(ap, fmt);
18336270Swpaul	(void)vfprintf(stdout, fmt, ap);
18436270Swpaul	va_end(ap);
18536270Swpaul
18636270Swpaul	longjmp(jmpbuf, -1);
18736270Swpaul}
18836270Swpaul
18936270Swpaul/*
19036270Swpaul * Open the given provider, load superblock.
19136270Swpaul */
19236270Swpaulstatic void
19336270Swpaulopendisk(const char *devnam)
19436270Swpaul{
19536270Swpaul	if (disk != NULL)
19636270Swpaul		return;
19736270Swpaul	disk = malloc(sizeof(*disk));
19836270Swpaul	if (disk == NULL)
19936270Swpaul		err(EX_OSERR, "malloc(%zu)", sizeof(*disk));
20045155Swpaul	if (ufs_disk_fillout(disk, devnam) == -1) {
20145155Swpaul		err(EX_OSERR, "ufs_disk_fillout(%s) failed: %s", devnam,
20245155Swpaul		    disk->d_error);
20348992Swpaul	}
20448992Swpaul	fs = &disk->d_fs;
20548992Swpaul	if (real_dev_bsize == 0 && ioctl(disk->d_fd, DIOCGSECTORSIZE,
20636270Swpaul	    &real_dev_bsize) == -1)
20750462Swpaul		real_dev_bsize = secsize;
20850462Swpaul	if (debug)
20950462Swpaul		printf("dev_bsize %ld\n", real_dev_bsize);
21036270Swpaul}
21136270Swpaul
21236270Swpaul/*
21339957Swpaul * Mark file system as clean, write the super-block back, close the disk.
21439957Swpaul */
21539957Swpaulstatic void
21639957Swpaulclosedisk(const char *devnam)
21739957Swpaul{
21839957Swpaul	struct csum *cgsum;
21939957Swpaul	int i;
22036270Swpaul
22136270Swpaul	/*
22251089Speter	 * Recompute the fs summary info from correct cs summaries.
22350462Swpaul	 */
22450462Swpaul	bzero(&fs->fs_cstotal, sizeof(struct csum_total));
22541591Sarchie	for (i = 0; i < fs->fs_ncg; i++) {
22641591Sarchie		cgsum = &fs->fs_cs(fs, i);
22750477Speter		fs->fs_cstotal.cs_nffree += cgsum->cs_nffree;
22836270Swpaul		fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree;
22936270Swpaul		fs->fs_cstotal.cs_nifree += cgsum->cs_nifree;
23036270Swpaul		fs->fs_cstotal.cs_ndir += cgsum->cs_ndir;
23136270Swpaul	}
23236270Swpaul	fs->fs_pendinginodes = 0;
23336270Swpaul	fs->fs_pendingblocks = 0;
23436270Swpaul	fs->fs_clean = 1;
23536270Swpaul	fs->fs_time = time(NULL);
23636270Swpaul	fs->fs_mtime = time(NULL);
23736270Swpaul	if (sbwrite(disk, 0) == -1)
23836270Swpaul		err(EX_OSERR, "sbwrite(%s)", devnam);
23936270Swpaul	if (ufs_disk_close(disk) == -1)
24036270Swpaul		err(EX_OSERR, "ufs_disk_close(%s)", devnam);
24136270Swpaul	free(disk);
24236270Swpaul	disk = NULL;
24336270Swpaul	fs = NULL;
24436270Swpaul}
24536270Swpaul
24636270Swpaul/*
24736270Swpaul * Lookup a cg by number in the hash so we can keep track of which cgs
24836270Swpaul * need stats rebuilt.
24936270Swpaul */
25036270Swpaulstatic struct suj_cg *
25137626Swpaulcg_lookup(int cgx)
25237626Swpaul{
25337626Swpaul	struct cghd *hd;
25437626Swpaul	struct suj_cg *sc;
25537626Swpaul
25637626Swpaul	if (cgx < 0 || cgx >= fs->fs_ncg)
25737626Swpaul		err_suj("Bad cg number %d\n", cgx);
25837626Swpaul	if (lastcg && lastcg->sc_cgx == cgx)
25937626Swpaul		return (lastcg);
26037626Swpaul	hd = &cghash[SUJ_HASH(cgx)];
26137626Swpaul	LIST_FOREACH(sc, hd, sc_next)
26237626Swpaul		if (sc->sc_cgx == cgx) {
26336270Swpaul			lastcg = sc;
26436270Swpaul			return (sc);
26536270Swpaul		}
26648992Swpaul	sc = errmalloc(sizeof(*sc));
26748992Swpaul	bzero(sc, sizeof(*sc));
26848992Swpaul	sc->sc_cgbuf = errmalloc(fs->fs_bsize);
26936270Swpaul	sc->sc_cgp = (struct cg *)sc->sc_cgbuf;
27036270Swpaul	sc->sc_cgx = cgx;
27136270Swpaul	LIST_INSERT_HEAD(hd, sc, sc_next);
27236270Swpaul	if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf,
27336270Swpaul	    fs->fs_bsize) == -1)
27436270Swpaul		err_suj("Unable to read cylinder group %d\n", sc->sc_cgx);
27536270Swpaul
27637626Swpaul	return (sc);
27737626Swpaul}
27836270Swpaul
27936270Swpaul/*
28036270Swpaul * Lookup an inode number in the hash and allocate a suj_ino if it does
28136270Swpaul * not exist.
28236270Swpaul */
28336270Swpaulstatic struct suj_ino *
28436735Sdfrino_lookup(ino_t ino, int creat)
28536270Swpaul{
28636270Swpaul	struct suj_ino *sino;
28736270Swpaul	struct inohd *hd;
28848992Swpaul	struct suj_cg *sc;
28936270Swpaul
29036270Swpaul	sc = cg_lookup(ino_to_cg(fs, ino));
29136270Swpaul	if (sc->sc_lastino && sc->sc_lastino->si_ino == ino)
29241656Swpaul		return (sc->sc_lastino);
29339583Swpaul	hd = &sc->sc_inohash[SUJ_HASH(ino)];
29441656Swpaul	LIST_FOREACH(sino, hd, si_next)
29539583Swpaul		if (sino->si_ino == ino)
29636270Swpaul			return (sino);
29739583Swpaul	if (creat == 0)
29839583Swpaul		return (NULL);
29939583Swpaul	sino = errmalloc(sizeof(*sino));
30039583Swpaul	bzero(sino, sizeof(*sino));
30150462Swpaul	sino->si_ino = ino;
30250462Swpaul	TAILQ_INIT(&sino->si_recs);
30350462Swpaul	TAILQ_INIT(&sino->si_newrecs);
30436270Swpaul	TAILQ_INIT(&sino->si_movs);
30536270Swpaul	LIST_INSERT_HEAD(hd, sino, si_next);
30641656Swpaul
30736270Swpaul	return (sino);
30841656Swpaul}
30939583Swpaul
31050468Swpaul/*
31136270Swpaul * Lookup a block number in the hash and allocate a suj_blk if it does
31236270Swpaul * not exist.
31336270Swpaul */
31441656Swpaulstatic struct suj_blk *
31541656Swpaulblk_lookup(ufs2_daddr_t blk, int creat)
31641656Swpaul{
31741656Swpaul	struct suj_blk *sblk;
31841656Swpaul	struct suj_cg *sc;
31941656Swpaul	struct blkhd *hd;
32041656Swpaul
32141656Swpaul	sc = cg_lookup(dtog(fs, blk));
32241656Swpaul	if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk)
32341656Swpaul		return (sc->sc_lastblk);
32439583Swpaul	hd = &sc->sc_blkhash[SUJ_HASH(fragstoblks(fs, blk))];
32549010Swpaul	LIST_FOREACH(sblk, hd, sb_next)
32649010Swpaul		if (sblk->sb_blk == blk)
32749010Swpaul			return (sblk);
32849010Swpaul	if (creat == 0)
32949010Swpaul		return (NULL);
33049010Swpaul	sblk = errmalloc(sizeof(*sblk));
33149010Swpaul	bzero(sblk, sizeof(*sblk));
33249010Swpaul	sblk->sb_blk = blk;
33348992Swpaul	TAILQ_INIT(&sblk->sb_recs);
33448992Swpaul	LIST_INSERT_HEAD(hd, sblk, sb_next);
33548992Swpaul
33648992Swpaul	return (sblk);
33748992Swpaul}
33848992Swpaul
33950462Swpaulstatic struct data_blk *
34050462Swpauldblk_lookup(ufs2_daddr_t blk)
34150462Swpaul{
34250462Swpaul	struct data_blk *dblk;
34350462Swpaul	struct dblkhd *hd;
34450462Swpaul
34550462Swpaul	hd = &dbhash[SUJ_HASH(fragstoblks(fs, blk))];
34650462Swpaul	if (lastblk && lastblk->db_blk == blk)
34750462Swpaul		return (lastblk);
34850462Swpaul	LIST_FOREACH(dblk, hd, db_next)
34948992Swpaul		if (dblk->db_blk == blk)
35048992Swpaul			return (dblk);
35148992Swpaul	/*
35248992Swpaul	 * The inode block wasn't located, allocate a new one.
35351455Swpaul	 */
35448992Swpaul	dblk = errmalloc(sizeof(*dblk));
35548992Swpaul	bzero(dblk, sizeof(*dblk));
35648992Swpaul	LIST_INSERT_HEAD(hd, dblk, db_next);
35748992Swpaul	dblk->db_blk = blk;
35848992Swpaul	return (dblk);
35948992Swpaul}
36051533Swpaul
36151473Swpaulstatic uint8_t *
36248992Swpauldblk_read(ufs2_daddr_t blk, int size)
36339583Swpaul{
36441656Swpaul	struct data_blk *dblk;
36541656Swpaul
36639583Swpaul	dblk = dblk_lookup(blk);
36739583Swpaul	/*
36839583Swpaul	 * I doubt size mismatches can happen in practice but it is trivial
36939583Swpaul	 * to handle.
37039583Swpaul	 */
37139583Swpaul	if (size != dblk->db_size) {
37241656Swpaul		if (dblk->db_buf)
37341656Swpaul			free(dblk->db_buf);
37439583Swpaul		dblk->db_buf = errmalloc(size);
37539583Swpaul		dblk->db_size = size;
37639583Swpaul		if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1)
37739583Swpaul			err_suj("Failed to read data block %jd\n", blk);
37839583Swpaul	}
37939583Swpaul	return (dblk->db_buf);
38041656Swpaul}
38141656Swpaul
38239583Swpaulstatic void
38339583Swpauldblk_dirty(ufs2_daddr_t blk)
38439583Swpaul{
38539583Swpaul	struct data_blk *dblk;
38639583Swpaul
38739583Swpaul	dblk = dblk_lookup(blk);
38841656Swpaul	dblk->db_dirty = 1;
38941656Swpaul}
39041656Swpaul
39139583Swpaulstatic void
39239583Swpauldblk_write(void)
39339583Swpaul{
39439583Swpaul	struct data_blk *dblk;
39539583Swpaul	int i;
39639583Swpaul
39739583Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++) {
39841656Swpaul		LIST_FOREACH(dblk, &dbhash[i], db_next) {
39941656Swpaul			if (dblk->db_dirty == 0 || dblk->db_size == 0)
40041656Swpaul				continue;
40139583Swpaul			if (bwrite(disk, fsbtodb(fs, dblk->db_blk),
40239583Swpaul			    dblk->db_buf, dblk->db_size) == -1)
40339583Swpaul				err_suj("Unable to write block %jd\n",
40439583Swpaul				    dblk->db_blk);
40539583Swpaul		}
40639583Swpaul	}
40739583Swpaul}
40841656Swpaul
40941656Swpaulstatic union dinode *
41041656Swpaulino_read(ino_t ino)
41139583Swpaul{
41239583Swpaul	struct ino_blk *iblk;
41339583Swpaul	struct iblkhd *hd;
41439583Swpaul	struct suj_cg *sc;
41539583Swpaul	ufs2_daddr_t blk;
41639583Swpaul	int off;
41739583Swpaul
41841656Swpaul	blk = ino_to_fsba(fs, ino);
41941656Swpaul	sc = cg_lookup(ino_to_cg(fs, ino));
42041656Swpaul	iblk = sc->sc_lastiblk;
42139583Swpaul	if (iblk && iblk->ib_blk == blk)
42239583Swpaul		goto found;
42339583Swpaul	hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))];
42439583Swpaul	LIST_FOREACH(iblk, hd, ib_next)
42539583Swpaul		if (iblk->ib_blk == blk)
42639583Swpaul			goto found;
42739583Swpaul	/*
42839583Swpaul	 * The inode block wasn't located, allocate a new one.
42939583Swpaul	 */
43039583Swpaul	iblk = errmalloc(sizeof(*iblk));
43139583Swpaul	bzero(iblk, sizeof(*iblk));
43239583Swpaul	iblk->ib_buf = errmalloc(fs->fs_bsize);
43341656Swpaul	iblk->ib_blk = blk;
43441656Swpaul	LIST_INSERT_HEAD(hd, iblk, ib_next);
43541656Swpaul	if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1)
43639583Swpaul		err_suj("Failed to read inode block %jd\n", blk);
43739583Swpaulfound:
43839583Swpaul	sc->sc_lastiblk = iblk;
43939583Swpaul	off = ino_to_fsbo(fs, ino);
44039583Swpaul	if (fs->fs_magic == FS_UFS1_MAGIC)
44139583Swpaul		return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off];
44239583Swpaul	else
44339583Swpaul		return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off];
44439583Swpaul}
44539583Swpaul
44639583Swpaulstatic void
44739583Swpaulino_dirty(ino_t ino)
44841656Swpaul{
44941656Swpaul	struct ino_blk *iblk;
45041656Swpaul	struct iblkhd *hd;
45139583Swpaul	struct suj_cg *sc;
45239583Swpaul	ufs2_daddr_t blk;
45339583Swpaul
45439583Swpaul	blk = ino_to_fsba(fs, ino);
45539583Swpaul	sc = cg_lookup(ino_to_cg(fs, ino));
45639583Swpaul	iblk = sc->sc_lastiblk;
45739583Swpaul	if (iblk && iblk->ib_blk == blk) {
45839583Swpaul		iblk->ib_dirty = 1;
45939583Swpaul		return;
46039583Swpaul	}
46139583Swpaul	hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))];
46239583Swpaul	LIST_FOREACH(iblk, hd, ib_next) {
46341656Swpaul		if (iblk->ib_blk == blk) {
46441656Swpaul			iblk->ib_dirty = 1;
46541656Swpaul			return;
46639583Swpaul		}
46739583Swpaul	}
46839583Swpaul	ino_read(ino);
46939583Swpaul	ino_dirty(ino);
47039583Swpaul}
47139583Swpaul
47239583Swpaulstatic void
47339583Swpauliblk_write(struct ino_blk *iblk)
47439583Swpaul{
47539583Swpaul
47639583Swpaul	if (iblk->ib_dirty == 0)
47736270Swpaul		return;
47836270Swpaul	if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf,
47936270Swpaul	    fs->fs_bsize) == -1)
48039583Swpaul		err_suj("Failed to write inode block %jd\n", iblk->ib_blk);
48139583Swpaul}
48241656Swpaul
48336270Swpaulstatic int
48436270Swpaulblk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags)
48536270Swpaul{
48636270Swpaul	ufs2_daddr_t bstart;
48736270Swpaul	ufs2_daddr_t bend;
48836270Swpaul	ufs2_daddr_t end;
48939583Swpaul
49036270Swpaul	end = start + frags;
49136270Swpaul	bstart = brec->jb_blkno + brec->jb_oldfrags;
49236270Swpaul	bend = bstart + brec->jb_frags;
49336270Swpaul	if (start < bend && end > bstart)
49436270Swpaul		return (1);
49536270Swpaul	return (0);
49639583Swpaul}
49736270Swpaul
49839583Swpaulstatic int
49936270Swpaulblk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start,
50039583Swpaul    int frags)
50139583Swpaul{
50239583Swpaul
50339583Swpaul	if (brec->jb_ino != ino || brec->jb_lbn != lbn)
50436270Swpaul		return (0);
50536270Swpaul	if (brec->jb_blkno + brec->jb_oldfrags != start)
50636270Swpaul		return (0);
50736270Swpaul	if (brec->jb_frags != frags)
50836270Swpaul		return (0);
50939583Swpaul	return (1);
51036270Swpaul}
51136270Swpaul
51236270Swpaulstatic void
51336270Swpaulblk_setmask(struct jblkrec *brec, int *mask)
51439583Swpaul{
51539583Swpaul	int i;
51639583Swpaul
51736270Swpaul	for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++)
51836270Swpaul		*mask |= 1 << i;
51936270Swpaul}
52036270Swpaul
52136270Swpaul/*
52236270Swpaul * Determine whether a given block has been reallocated to a new location.
52336270Swpaul * Returns a mask of overlapping bits if any frags have been reused or
52439583Swpaul * zero if the block has not been re-used and the contents can be trusted.
52539583Swpaul *
52641656Swpaul * This is used to ensure that an orphaned pointer due to truncate is safe
52736270Swpaul * to be freed.  The mask value can be used to free partial blocks.
52836270Swpaul */
52936270Swpaulstatic int
53036270Swpaulblk_freemask(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags)
53136270Swpaul{
53239583Swpaul	struct suj_blk *sblk;
53339583Swpaul	struct suj_rec *srec;
53436270Swpaul	struct jblkrec *brec;
53539583Swpaul	int mask;
53636270Swpaul	int off;
53736270Swpaul
53836270Swpaul	/*
53939583Swpaul	 * To be certain we're not freeing a reallocated block we lookup
54039583Swpaul	 * this block in the blk hash and see if there is an allocation
54139583Swpaul	 * journal record that overlaps with any fragments in the block
54236270Swpaul	 * we're concerned with.  If any fragments have ben reallocated
54339583Swpaul	 * the block has already been freed and re-used for another purpose.
54436270Swpaul	 */
54536270Swpaul	mask = 0;
54636270Swpaul	sblk = blk_lookup(blknum(fs, blk), 0);
54736270Swpaul	if (sblk == NULL)
54839583Swpaul		return (0);
54939583Swpaul	off = blk - sblk->sb_blk;
55039583Swpaul	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
55136270Swpaul		brec = (struct jblkrec *)srec->sr_rec;
55239583Swpaul		/*
55336270Swpaul		 * If the block overlaps but does not match
55436270Swpaul		 * exactly it's a new allocation.  If it matches
55536270Swpaul		 * exactly this record refers to the current
55636270Swpaul		 * location.
55736270Swpaul		 */
55836270Swpaul		if (blk_overlaps(brec, blk, frags) == 0)
55939583Swpaul			continue;
56039583Swpaul		if (blk_equals(brec, ino, lbn, blk, frags) == 1)
56139583Swpaul			mask = 0;
56236270Swpaul		else
56339583Swpaul			blk_setmask(brec, &mask);
56436270Swpaul	}
56536270Swpaul	if (debug)
56636270Swpaul		printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n",
56736270Swpaul		    blk, sblk->sb_blk, off, mask);
56839583Swpaul	return (mask >> off);
56936270Swpaul}
57039583Swpaul
57139583Swpaul/*
57239583Swpaul * Determine whether it is safe to follow an indirect.  It is not safe
57336270Swpaul * if any part of the indirect has been reallocated or the last journal
57439583Swpaul * entry was an allocation.  Just allocated indirects may not have valid
57536501Swpaul * pointers yet and all of their children will have their own records.
57636270Swpaul * It is also not safe to follow an indirect if the cg bitmap has been
57736270Swpaul * cleared as a new allocation may write to the block prior to the journal
57836270Swpaul * being written.
57936270Swpaul *
58036270Swpaul * Returns 1 if it's safe to follow the indirect and 0 otherwise.
58136270Swpaul */
58236270Swpaulstatic int
58336270Swpaulblk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn)
58436270Swpaul{
58536270Swpaul	struct suj_blk *sblk;
58636270Swpaul	struct jblkrec *brec;
58736270Swpaul
58836270Swpaul	sblk = blk_lookup(blk, 0);
58939583Swpaul	if (sblk == NULL)
59039583Swpaul		return (1);
59139583Swpaul	if (TAILQ_EMPTY(&sblk->sb_recs))
59239583Swpaul		return (1);
59339583Swpaul	brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec;
59439583Swpaul	if (blk_equals(brec, ino, lbn, blk, fs->fs_frag))
59539583Swpaul		if (brec->jb_op == JOP_FREEBLK)
59639583Swpaul			return (!blk_isfree(blk));
59736270Swpaul	return (0);
59839583Swpaul}
59939583Swpaul
60039583Swpaul/*
60139583Swpaul * Clear an inode from the cg bitmap.  If the inode was already clear return
60239583Swpaul * 0 so the caller knows it does not have to check the inode contents.
60339583Swpaul */
60439583Swpaulstatic int
60539583Swpaulino_free(ino_t ino, int mode)
60639583Swpaul{
60739583Swpaul	struct suj_cg *sc;
60839583Swpaul	uint8_t *inosused;
60939583Swpaul	struct cg *cgp;
61039583Swpaul	int cg;
61139583Swpaul
61239583Swpaul	cg = ino_to_cg(fs, ino);
61339583Swpaul	ino = ino % fs->fs_ipg;
61436270Swpaul	sc = cg_lookup(cg);
61536270Swpaul	cgp = sc->sc_cgp;
61639583Swpaul	inosused = cg_inosused(cgp);
61736270Swpaul	/*
61836270Swpaul	 * The bitmap may never have made it to the disk so we have to
61939583Swpaul	 * conditionally clear.  We can avoid writing the cg in this case.
62039583Swpaul	 */
62136270Swpaul	if (isclr(inosused, ino))
62236270Swpaul		return (0);
62336270Swpaul	freeinos++;
62436270Swpaul	clrbit(inosused, ino);
62536270Swpaul	if (ino < cgp->cg_irotor)
62639583Swpaul		cgp->cg_irotor = ino;
62739583Swpaul	cgp->cg_cs.cs_nifree++;
62836270Swpaul	if ((mode & IFMT) == IFDIR) {
62936270Swpaul		freedir++;
63036270Swpaul		cgp->cg_cs.cs_ndir--;
63136270Swpaul	}
63236270Swpaul	sc->sc_dirty = 1;
63336270Swpaul
63439583Swpaul	return (1);
63536270Swpaul}
63639583Swpaul
63736270Swpaul/*
63839583Swpaul * Free 'frags' frags starting at filesystem block 'bno' skipping any frags
63936270Swpaul * set in the mask.
64039583Swpaul */
64136270Swpaulstatic void
64236270Swpaulblk_free(ufs2_daddr_t bno, int mask, int frags)
64336270Swpaul{
64439583Swpaul	ufs1_daddr_t fragno, cgbno;
64539583Swpaul	struct suj_cg *sc;
64636270Swpaul	struct cg *cgp;
64736270Swpaul	int i, cg;
64836270Swpaul	uint8_t *blksfree;
64936270Swpaul
65036270Swpaul	if (debug)
65136270Swpaul		printf("Freeing %d frags at blk %jd\n", frags, bno);
65236270Swpaul	cg = dtog(fs, bno);
65336270Swpaul	sc = cg_lookup(cg);
65439583Swpaul	cgp = sc->sc_cgp;
65536270Swpaul	cgbno = dtogd(fs, bno);
65636270Swpaul	blksfree = cg_blksfree(cgp);
65736270Swpaul
65836270Swpaul	/*
65936270Swpaul	 * If it's not allocated we only wrote the journal entry
66036270Swpaul	 * and never the bitmaps.  Here we unconditionally clear and
66136270Swpaul	 * resolve the cg summary later.
66236270Swpaul	 */
66336270Swpaul	if (frags == fs->fs_frag && mask == 0) {
66436270Swpaul		fragno = fragstoblks(fs, cgbno);
66536270Swpaul		ffs_setblock(fs, blksfree, fragno);
66636270Swpaul		freeblocks++;
66739583Swpaul	} else {
66836270Swpaul		/*
66939583Swpaul		 * deallocate the fragment
67036270Swpaul		 */
67136270Swpaul		for (i = 0; i < frags; i++)
67236270Swpaul			if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) {
67336270Swpaul				freefrags++;
67436270Swpaul				setbit(blksfree, cgbno + i);
67539583Swpaul			}
67636270Swpaul	}
67736270Swpaul	sc->sc_dirty = 1;
67836270Swpaul}
67936270Swpaul
68039583Swpaul/*
68139583Swpaul * Returns 1 if the whole block starting at 'bno' is marked free and 0
68239583Swpaul * otherwise.
68339583Swpaul */
68436270Swpaulstatic int
68536270Swpaulblk_isfree(ufs2_daddr_t bno)
68636270Swpaul{
68736270Swpaul	struct suj_cg *sc;
68839583Swpaul
68936270Swpaul	sc = cg_lookup(dtog(fs, bno));
69036270Swpaul	return ffs_isblock(fs, cg_blksfree(sc->sc_cgp), dtogd(fs, bno));
69139583Swpaul}
69239583Swpaul
69336270Swpaul/*
69436270Swpaul * Fetch an indirect block to find the block at a given lbn.  The lbn
69539583Swpaul * may be negative to fetch a specific indirect block pointer or positive
69639583Swpaul * to fetch a specific block.
69736270Swpaul */
69836270Swpaulstatic ufs2_daddr_t
69939583Swpaulindir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn)
70036270Swpaul{
70136270Swpaul	ufs2_daddr_t *bap2;
70236270Swpaul	ufs2_daddr_t *bap1;
70336270Swpaul	ufs_lbn_t lbnadd;
70436270Swpaul	ufs_lbn_t base;
70536270Swpaul	int level;
70636270Swpaul	int i;
70739583Swpaul
70839583Swpaul	if (blk == 0)
70936270Swpaul		return (0);
71036270Swpaul	level = lbn_level(cur);
71136270Swpaul	if (level == -1)
71236270Swpaul		err_suj("Invalid indir lbn %jd\n", lbn);
71336270Swpaul	if (level == 0 && lbn < 0)
71439583Swpaul		err_suj("Invalid lbn %jd\n", lbn);
71536270Swpaul	bap2 = (void *)dblk_read(blk, fs->fs_bsize);
71639583Swpaul	bap1 = (void *)bap2;
71736270Swpaul	lbnadd = 1;
71836270Swpaul	base = -(cur + level);
71939583Swpaul	for (i = level; i > 0; i--)
72036270Swpaul		lbnadd *= NINDIR(fs);
72136270Swpaul	if (lbn > 0)
72236270Swpaul		i = (lbn - base) / lbnadd;
72336270Swpaul	else
72439583Swpaul		i = (-lbn - base) / lbnadd;
72539583Swpaul	if (i < 0 || i >= NINDIR(fs))
72636270Swpaul		err_suj("Invalid indirect index %d produced by lbn %jd\n",
72736270Swpaul		    i, lbn);
72836270Swpaul	if (level == 0)
72939583Swpaul		cur = base + (i * lbnadd);
73036270Swpaul	else
73136270Swpaul		cur = -(base + (i * lbnadd)) - (level - 1);
73236270Swpaul	if (fs->fs_magic == FS_UFS1_MAGIC)
73336270Swpaul		blk = bap1[i];
73436270Swpaul	else
73536270Swpaul		blk = bap2[i];
73636270Swpaul	if (cur == lbn)
73736270Swpaul		return (blk);
73836270Swpaul	if (level == 0)
73939583Swpaul		err_suj("Invalid lbn %jd at level 0\n", lbn);
74039583Swpaul	return indir_blkatoff(blk, ino, cur, lbn);
74136270Swpaul}
74236270Swpaul
74336270Swpaul/*
74436270Swpaul * Finds the disk block address at the specified lbn within the inode
74536270Swpaul * specified by ip.  This follows the whole tree and honors di_size and
74636270Swpaul * di_extsize so it is a true test of reachability.  The lbn may be
74739583Swpaul * negative if an extattr or indirect block is requested.
74836270Swpaul */
74936270Swpaulstatic ufs2_daddr_t
75036270Swpaulino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags)
75136270Swpaul{
75236270Swpaul	ufs_lbn_t tmpval;
75336270Swpaul	ufs_lbn_t cur;
75436270Swpaul	ufs_lbn_t next;
75536270Swpaul	int i;
75636270Swpaul
75736270Swpaul	/*
75836270Swpaul	 * Handle extattr blocks first.
75936270Swpaul	 */
76036270Swpaul	if (lbn < 0 && lbn >= -NXADDR) {
76139583Swpaul		lbn = -1 - lbn;
76236270Swpaul		if (lbn > lblkno(fs, ip->dp2.di_extsize - 1))
76339583Swpaul			return (0);
76436270Swpaul		*frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn));
76536270Swpaul		return (ip->dp2.di_extb[lbn]);
76636270Swpaul	}
76736270Swpaul	/*
76836270Swpaul	 * Now direct and indirect.
76939583Swpaul	 */
77036270Swpaul	if (DIP(ip, di_mode) == IFLNK &&
77139583Swpaul	    DIP(ip, di_size) < fs->fs_maxsymlinklen)
77239583Swpaul		return (0);
77339583Swpaul	if (lbn >= 0 && lbn < NDADDR) {
77439583Swpaul		*frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn));
77539583Swpaul		return (DIP(ip, di_db[lbn]));
77639583Swpaul	}
77736270Swpaul	*frags = fs->fs_frag;
77839583Swpaul
77939583Swpaul	for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++,
78036270Swpaul	    tmpval *= NINDIR(fs), cur = next) {
78136270Swpaul		next = cur + tmpval;
78236270Swpaul		if (lbn == -cur - i)
78336270Swpaul			return (DIP(ip, di_ib[i]));
78439583Swpaul		/*
78536270Swpaul		 * Determine whether the lbn in question is within this tree.
78636270Swpaul		 */
78736270Swpaul		if (lbn < 0 && -lbn >= next)
78839583Swpaul			continue;
78936270Swpaul		if (lbn > 0 && lbn >= next)
79036270Swpaul			continue;
79136270Swpaul		return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn);
79236270Swpaul	}
79336270Swpaul	err_suj("lbn %jd not in ino\n", lbn);
79436270Swpaul	/* NOTREACHED */
79550462Swpaul}
79650462Swpaul
79750462Swpaul/*
79850462Swpaul * Determine whether a block exists at a particular lbn in an inode.
79936270Swpaul * Returns 1 if found, 0 if not.  lbn may be negative for indirects
80036270Swpaul * or ext blocks.
80136270Swpaul */
80250462Swpaulstatic int
80336270Swpaulblk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags)
80436270Swpaul{
80550462Swpaul	union dinode *ip;
80636270Swpaul	ufs2_daddr_t nblk;
80739583Swpaul
80836270Swpaul	ip = ino_read(ino);
80936270Swpaul
81036270Swpaul	if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0)
81136270Swpaul		return (0);
81250462Swpaul	nblk = ino_blkatoff(ip, ino, lbn, frags);
81350462Swpaul
81450462Swpaul	return (nblk == blk);
81550462Swpaul}
81636270Swpaul
81736270Swpaul/*
81836270Swpaul * Clear the directory entry at diroff that should point to child.  Minimal
81950462Swpaul * checking is done and it is assumed that this path was verified with isat.
82036270Swpaul */
82136270Swpaulstatic void
82250462Swpaulino_clrat(ino_t parent, off_t diroff, ino_t child)
82336270Swpaul{
82436270Swpaul	union dinode *dip;
82536270Swpaul	struct direct *dp;
82639583Swpaul	ufs2_daddr_t blk;
82736270Swpaul	uint8_t *block;
82850462Swpaul	ufs_lbn_t lbn;
82936270Swpaul	int blksize;
83036270Swpaul	int frags;
83150462Swpaul	int doff;
83250462Swpaul
83350462Swpaul	if (debug)
83436270Swpaul		printf("Clearing inode %d from parent %d at offset %jd\n",
83550462Swpaul		    child, parent, diroff);
83636270Swpaul
83750462Swpaul	lbn = lblkno(fs, diroff);
83850462Swpaul	doff = blkoff(fs, diroff);
83936270Swpaul	dip = ino_read(parent);
84050462Swpaul	blk = ino_blkatoff(dip, parent, lbn, &frags);
84150462Swpaul	blksize = sblksize(fs, DIP(dip, di_size), lbn);
84236270Swpaul	block = dblk_read(blk, blksize);
84350462Swpaul	dp = (struct direct *)&block[doff];
84436270Swpaul	if (dp->d_ino != child)
84536270Swpaul		errx(1, "Inode %d does not exist in %d at %jd",
84636270Swpaul		    child, parent, diroff);
84736270Swpaul	dp->d_ino = 0;
84836270Swpaul	dblk_dirty(blk);
84936270Swpaul	/*
85050462Swpaul	 * The actual .. reference count will already have been removed
85136270Swpaul	 * from the parent by the .. remref record.
85236270Swpaul	 */
85336270Swpaul}
85436270Swpaul
85536270Swpaul/*
85650462Swpaul * Determines whether a pointer to an inode exists within a directory
85750462Swpaul * at a specified offset.  Returns the mode of the found entry.
85836270Swpaul */
85950462Swpaulstatic int
86036270Swpaulino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot)
86150462Swpaul{
86239583Swpaul	union dinode *dip;
86336270Swpaul	struct direct *dp;
86450462Swpaul	ufs2_daddr_t blk;
86539583Swpaul	uint8_t *block;
86636270Swpaul	ufs_lbn_t lbn;
86736270Swpaul	int blksize;
86836270Swpaul	int frags;
86936270Swpaul	int dpoff;
87036270Swpaul	int doff;
87136270Swpaul
87236464Swpaul	*isdot = 0;
87336464Swpaul	dip = ino_read(parent);
87436464Swpaul	*mode = DIP(dip, di_mode);
87536464Swpaul	if ((*mode & IFMT) != IFDIR) {
87636464Swpaul		if (debug) {
87736464Swpaul			/*
87836464Swpaul			 * This can happen if the parent inode
87936464Swpaul			 * was reallocated.
88036464Swpaul			 */
88136270Swpaul			if (*mode != 0)
88241656Swpaul				printf("Directory %d has bad mode %o\n",
88336270Swpaul				    parent, *mode);
88437626Swpaul			else
88536270Swpaul				printf("Directory %d zero inode\n", parent);
88636464Swpaul		}
88736464Swpaul		return (0);
88836464Swpaul	}
88936270Swpaul	lbn = lblkno(fs, diroff);
89036270Swpaul	doff = blkoff(fs, diroff);
89139583Swpaul	blksize = sblksize(fs, DIP(dip, di_size), lbn);
89239583Swpaul	if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) {
89339583Swpaul		if (debug)
89439583Swpaul			printf("ino %d absent from %d due to offset %jd"
89539583Swpaul			    " exceeding size %jd\n",
89639583Swpaul			    child, parent, diroff, DIP(dip, di_size));
89739583Swpaul		return (0);
89839583Swpaul	}
89939583Swpaul	blk = ino_blkatoff(dip, parent, lbn, &frags);
90041656Swpaul	if (blk <= 0) {
90139583Swpaul		if (debug)
90239583Swpaul			printf("Sparse directory %d", parent);
90339583Swpaul		return (0);
90439583Swpaul	}
90539583Swpaul	block = dblk_read(blk, blksize);
90639583Swpaul	/*
90739583Swpaul	 * Walk through the records from the start of the block to be
90839583Swpaul	 * certain we hit a valid record and not some junk in the middle
90939583Swpaul	 * of a file name.  Stop when we reach or pass the expected offset.
91039583Swpaul	 */
91139583Swpaul	dpoff = (doff / DIRBLKSIZ) * DIRBLKSIZ;
91239583Swpaul	do {
91339583Swpaul		dp = (struct direct *)&block[dpoff];
91439583Swpaul		if (dpoff == doff)
91539583Swpaul			break;
91639583Swpaul		if (dp->d_reclen == 0)
91739583Swpaul			break;
91839583Swpaul		dpoff += dp->d_reclen;
91939583Swpaul	} while (dpoff <= doff);
92039583Swpaul	if (dpoff > fs->fs_bsize)
92139583Swpaul		err_suj("Corrupt directory block in dir ino %d\n", parent);
92239583Swpaul	/* Not found. */
92339583Swpaul	if (dpoff != doff) {
92439583Swpaul		if (debug)
92539583Swpaul			printf("ino %d not found in %d, lbn %jd, dpoff %d\n",
92639583Swpaul			    child, parent, lbn, dpoff);
92739583Swpaul		return (0);
92839583Swpaul	}
92939583Swpaul	/*
93036270Swpaul	 * We found the item in question.  Record the mode and whether it's
93136270Swpaul	 * a . or .. link for the caller.
93236270Swpaul	 */
93336270Swpaul	if (dp->d_ino == child) {
93436270Swpaul		if (child == parent)
93539583Swpaul			*isdot = 1;
93636270Swpaul		else if (dp->d_namlen == 2 &&
93739583Swpaul		    dp->d_name[0] == '.' && dp->d_name[1] == '.')
93836270Swpaul			*isdot = 1;
93936270Swpaul		*mode = DTTOIF(dp->d_type);
94039583Swpaul		return (1);
94139583Swpaul	}
94241656Swpaul	if (debug)
94339583Swpaul		printf("ino %d doesn't match dirent ino %d in parent %d\n",
94439583Swpaul		    child, dp->d_ino, parent);
94539583Swpaul	return (0);
94639583Swpaul}
94739583Swpaul
/* Flags accepted by ino_visit() and indir_visit(). */
#define	VISIT_INDIR	0x0001	/* Report indirect blocks to the visitor too. */
#define	VISIT_EXT	0x0002	/* Visit the UFS2 ext blocks (di_extb). */
#define	VISIT_ROOT	0x0004	/* Operation came via root & valid pointers. */
95139583Swpaul
95239583Swpaul/*
95336270Swpaul * Read an indirect level which may or may not be linked into an inode.
95436270Swpaul */
95539583Swpaulstatic void
95639583Swpaulindir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags,
95739583Swpaul    ino_visitor visitor, int flags)
95839583Swpaul{
95939583Swpaul	ufs2_daddr_t *bap2;
96039583Swpaul	ufs1_daddr_t *bap1;
96136270Swpaul	ufs_lbn_t lbnadd;
96236270Swpaul	ufs2_daddr_t nblk;
96339583Swpaul	ufs_lbn_t nlbn;
96439583Swpaul	int level;
96539583Swpaul	int i;
96639583Swpaul
96739583Swpaul	/*
96839583Swpaul	 * Don't visit indirect blocks with contents we can't trust.  This
96939583Swpaul	 * should only happen when indir_visit() is called to complete a
97039583Swpaul	 * truncate that never finished and not when a pointer is found via
97139583Swpaul	 * an inode.
97239583Swpaul	 */
97339583Swpaul	if (blk == 0)
97439583Swpaul		return;
97536270Swpaul	level = lbn_level(lbn);
97636270Swpaul	if (level == -1)
97736270Swpaul		err_suj("Invalid level for lbn %jd\n", lbn);
97836270Swpaul	if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) {
97936270Swpaul		if (debug)
98036317Swpaul			printf("blk %jd ino %d lbn %jd(%d) is not indir.\n",
98136270Swpaul			    blk, ino, lbn, level);
98236270Swpaul		goto out;
98336270Swpaul	}
98439583Swpaul	lbnadd = 1;
98539583Swpaul	for (i = level; i > 0; i--)
98636270Swpaul		lbnadd *= NINDIR(fs);
98736270Swpaul	bap1 = (void *)dblk_read(blk, fs->fs_bsize);
98836270Swpaul	bap2 = (void *)bap1;
98936270Swpaul	for (i = 0; i < NINDIR(fs); i++) {
99039583Swpaul		if (fs->fs_magic == FS_UFS1_MAGIC)
99139583Swpaul			nblk = *bap1++;
99239583Swpaul		else
99339583Swpaul			nblk = *bap2++;
99439583Swpaul		if (nblk == 0)
99539583Swpaul			continue;
99650468Swpaul		if (level == 0) {
99750468Swpaul			nlbn = -lbn + i * lbnadd;
99850468Swpaul			(*frags) += fs->fs_frag;
99939583Swpaul			visitor(ino, nlbn, nblk, fs->fs_frag);
100039583Swpaul		} else {
100150468Swpaul			nlbn = (lbn + 1) - (i * lbnadd);
100239583Swpaul			indir_visit(ino, nlbn, nblk, frags, visitor, flags);
100350468Swpaul		}
100439583Swpaul	}
100550468Swpaulout:
100639583Swpaul	if (flags & VISIT_INDIR) {
100750468Swpaul		(*frags) += fs->fs_frag;
100839583Swpaul		visitor(ino, lbn, blk, fs->fs_frag);
100950468Swpaul	}
101050468Swpaul}
101139583Swpaul
101250468Swpaul/*
101339583Swpaul * Visit each block in an inode as specified by 'flags' and call a
101450468Swpaul * callback function.  The callback may inspect or free blocks.  The
101539583Swpaul * count of frags found according to the size in the file is returned.
101650468Swpaul * This is not valid for sparse files but may be used to determine
101739583Swpaul * the correct di_blocks for a file.
101850468Swpaul */
101939583Swpaulstatic uint64_t
102039583Swpaulino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags)
102139583Swpaul{
102239583Swpaul	ufs_lbn_t nextlbn;
102339583Swpaul	ufs_lbn_t tmpval;
102436270Swpaul	ufs_lbn_t lbn;
102536270Swpaul	uint64_t size;
102639583Swpaul	uint64_t fragcnt;
102736270Swpaul	int mode;
102836270Swpaul	int frags;
102939583Swpaul	int i;
103050468Swpaul
103136270Swpaul	size = DIP(ip, di_size);
103239583Swpaul	mode = DIP(ip, di_mode) & IFMT;
103336270Swpaul	fragcnt = 0;
103436270Swpaul	if ((flags & VISIT_EXT) &&
103539583Swpaul	    fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) {
103639583Swpaul		for (i = 0; i < NXADDR; i++) {
103736270Swpaul			if (ip->dp2.di_extb[i] == 0)
103836270Swpaul				continue;
103939583Swpaul			frags = sblksize(fs, ip->dp2.di_extsize, i);
104039583Swpaul			frags = numfrags(fs, frags);
104136270Swpaul			fragcnt += frags;
104236270Swpaul			visitor(ino, -1 - i, ip->dp2.di_extb[i], frags);
104336270Swpaul		}
104436270Swpaul	}
104536270Swpaul	/* Skip datablocks for short links and devices. */
104639583Swpaul	if (mode == IFBLK || mode == IFCHR ||
104745155Swpaul	    (mode == IFLNK && size < fs->fs_maxsymlinklen))
104839583Swpaul		return (fragcnt);
104936270Swpaul	for (i = 0; i < NDADDR; i++) {
105039583Swpaul		if (DIP(ip, di_db[i]) == 0)
105136270Swpaul			continue;
105236270Swpaul		frags = sblksize(fs, size, i);
105345155Swpaul		frags = numfrags(fs, frags);
105445155Swpaul		fragcnt += frags;
105545155Swpaul		visitor(ino, i, DIP(ip, di_db[i]), frags);
105645155Swpaul	}
105736270Swpaul	/*
105836270Swpaul	 * We know the following indirects are real as we're following
105936270Swpaul	 * real pointers to them.
106036270Swpaul	 */
106136270Swpaul	flags |= VISIT_ROOT;
106239583Swpaul	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
106336270Swpaul	    lbn = nextlbn) {
106436270Swpaul		nextlbn = lbn + tmpval;
106539583Swpaul		tmpval *= NINDIR(fs);
106639583Swpaul		if (DIP(ip, di_ib[i]) == 0)
106736270Swpaul			continue;
106836270Swpaul		indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor,
106939583Swpaul		    flags);
107036270Swpaul	}
107136270Swpaul	return (fragcnt);
107239583Swpaul}
107336270Swpaul
107436270Swpaul/*
107536270Swpaul * Null visitor function used when we just want to count blocks and
107636270Swpaul * record the lbn.
107736270Swpaul */
107836270Swpaulufs_lbn_t visitlbn;
107936270Swpaulstatic void
108036270Swpaulnull_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
108136270Swpaul{
108239583Swpaul	if (lbn > 0)
108336270Swpaul		visitlbn = lbn;
108448992Swpaul}
108548992Swpaul
108636270Swpaul/*
108736270Swpaul * Recalculate di_blocks when we discover that a block allocation or
108836270Swpaul * free was not successfully completed.  The kernel does not roll this back
108936270Swpaul * because it would be too expensive to compute which indirects were
109036270Swpaul * reachable at the time the inode was written.
109136270Swpaul */
109248992Swpaulstatic void
109348992Swpaulino_adjblks(struct suj_ino *sino)
109448992Swpaul{
109548992Swpaul	union dinode *ip;
109648992Swpaul	uint64_t blocks;
109736270Swpaul	uint64_t frags;
109836270Swpaul	off_t isize;
109936270Swpaul	off_t size;
110048992Swpaul	ino_t ino;
110136270Swpaul
110236270Swpaul	ino = sino->si_ino;
110348992Swpaul	ip = ino_read(ino);
110448992Swpaul	/* No need to adjust zero'd inodes. */
110536270Swpaul	if (DIP(ip, di_mode) == 0)
110650462Swpaul		return;
110736270Swpaul	/*
110839583Swpaul	 * Visit all blocks and count them as well as recording the last
110939583Swpaul	 * valid lbn in the file.  If the file size doesn't agree with the
111039583Swpaul	 * last lbn we need to truncate to fix it.  Otherwise just adjust
111139583Swpaul	 * the blocks count.
111248992Swpaul	 */
111336270Swpaul	visitlbn = 0;
111436270Swpaul	frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
111536270Swpaul	blocks = fsbtodb(fs, frags);
111648992Swpaul	/*
111748992Swpaul	 * We assume the size and direct block list is kept coherent by
111848992Swpaul	 * softdep.  For files that have extended into indirects we truncate
111948992Swpaul	 * to the size in the inode or the maximum size permitted by
112048992Swpaul	 * populated indirects.
112139583Swpaul	 */
112239583Swpaul	if (visitlbn >= NDADDR) {
112339583Swpaul		isize = DIP(ip, di_size);
112439583Swpaul		size = lblktosize(fs, visitlbn + 1);
112536270Swpaul		if (isize > size)
112639583Swpaul			isize = size;
112739583Swpaul		/* Always truncate to free any unpopulated indirects. */
112836270Swpaul		ino_trunc(sino->si_ino, isize);
112939583Swpaul		return;
113039583Swpaul	}
113136270Swpaul	if (blocks == DIP(ip, di_blocks))
113236270Swpaul		return;
113336270Swpaul	if (debug)
113436270Swpaul		printf("ino %d adjusting block count from %jd to %jd\n",
113536270Swpaul		    ino, DIP(ip, di_blocks), blocks);
113636270Swpaul	DIP_SET(ip, di_blocks, blocks);
113748992Swpaul	ino_dirty(ino);
113839583Swpaul}
113948992Swpaul
114048992Swpaulstatic void
114136270Swpaulblk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
114239583Swpaul{
114339583Swpaul	int mask;
114439583Swpaul
114548992Swpaul	mask = blk_freemask(blk, ino, lbn, frags);
114639583Swpaul	if (debug)
114739583Swpaul		printf("blk %jd freemask 0x%X\n", blk, mask);
114839583Swpaul	blk_free(blk, mask, frags);
114948992Swpaul}
115048992Swpaul
115148992Swpaul/*
115248992Swpaul * Free a block or tree of blocks that was previously rooted in ino at
115348992Swpaul * the given lbn.  If the lbn is an indirect all children are freed
115448992Swpaul * recursively.
115548992Swpaul */
115648992Swpaulstatic void
115748992Swpaulblk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow)
115848992Swpaul{
115948992Swpaul	uint64_t resid;
116048992Swpaul	int mask;
116145155Swpaul
116239583Swpaul	mask = blk_freemask(blk, ino, lbn, frags);
116336270Swpaul	if (debug)
116439583Swpaul		printf("blk %jd freemask 0x%X\n", blk, mask);
116548992Swpaul	resid = 0;
116636270Swpaul	if (lbn <= -NDADDR && follow && mask == 0)
116736270Swpaul		indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR);
116836270Swpaul	else
116948992Swpaul		blk_free(blk, mask, frags);
117048992Swpaul}
117148992Swpaul
117248992Swpaulstatic void
117348992Swpaulino_setskip(struct suj_ino *sino, ino_t parent)
117448992Swpaul{
117548992Swpaul	int isdot;
117636270Swpaul	int mode;
117739583Swpaul
117836270Swpaul	if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot))
117948992Swpaul		sino->si_skipparent = 1;
118048992Swpaul}
118148992Swpaul
118248992Swpaulstatic void
118348992Swpaulino_remref(ino_t parent, ino_t child, uint64_t diroff, int isdotdot)
118448992Swpaul{
118548992Swpaul	struct suj_ino *sino;
118648992Swpaul	struct suj_rec *srec;
118748992Swpaul	struct jrefrec *rrec;
118839583Swpaul
118939583Swpaul	/*
119039583Swpaul	 * Lookup this inode to see if we have a record for it.
119139583Swpaul	 */
119239583Swpaul	sino = ino_lookup(child, 0);
119339583Swpaul	/*
119439583Swpaul	 * Tell any child directories we've already removed their
119548992Swpaul	 * parent link cnt.  Don't try to adjust our link down again.
119639583Swpaul	 */
119748992Swpaul	if (sino != NULL && isdotdot == 0)
119839583Swpaul		ino_setskip(sino, parent);
119936270Swpaul	/*
120036270Swpaul	 * No valid record for this inode.  Just drop the on-disk
120148992Swpaul	 * link by one.
120248992Swpaul	 */
120348992Swpaul	if (sino == NULL || sino->si_hasrecs == 0) {
120448992Swpaul		ino_decr(child);
120548992Swpaul		return;
120649010Swpaul	}
120739583Swpaul	/*
120848992Swpaul	 * Use ino_adjust() if ino_check() has already processed this
120936270Swpaul	 * child.  If we lose the last non-dot reference to a
121036270Swpaul	 * directory it will be discarded.
121136270Swpaul	 */
121248992Swpaul	if (sino->si_linkadj) {
121348992Swpaul		sino->si_nlink--;
121448992Swpaul		if (isdotdot)
121548992Swpaul			sino->si_dotlinks--;
121649010Swpaul		ino_adjust(sino);
121749010Swpaul		return;
121848992Swpaul	}
121948992Swpaul	/*
122048992Swpaul	 * If we haven't yet processed this inode we need to make
122148992Swpaul	 * sure we will successfully discover the lost path.  If not
122236270Swpaul	 * use nlinkadj to remember.
122351439Swpaul	 */
122436270Swpaul	TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
122551439Swpaul		rrec = (struct jrefrec *)srec->sr_rec;
122651657Swpaul		if (rrec->jr_parent == parent &&
122739583Swpaul		    rrec->jr_diroff == diroff)
122851439Swpaul			return;
122949010Swpaul	}
123048992Swpaul	sino->si_nlinkadj++;
123149010Swpaul}
123239583Swpaul
123348992Swpaul/*
123436270Swpaul * Free the children of a directory when the directory is discarded.
123536270Swpaul */
123636270Swpaulstatic void
123739583Swpaulino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
123839583Swpaul{
123939583Swpaul	struct suj_ino *sino;
124039583Swpaul	struct direct *dp;
124143235Swpaul	off_t diroff;
124239583Swpaul	uint8_t *block;
124339583Swpaul	int skipparent;
124439583Swpaul	int isdotdot;
124539583Swpaul	int dpoff;
124639583Swpaul	int size;
124739583Swpaul
124850468Swpaul	sino = ino_lookup(ino, 0);
124939583Swpaul	if (sino)
125039583Swpaul		skipparent = sino->si_skipparent;
125138030Swpaul	else
125239583Swpaul		skipparent = 0;
125339583Swpaul	size = lfragtosize(fs, frags);
125439583Swpaul	block = dblk_read(blk, size);
125539583Swpaul	dp = (struct direct *)&block[0];
125649010Swpaul	for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) {
125748992Swpaul		dp = (struct direct *)&block[dpoff];
125849010Swpaul		if (dp->d_ino == 0 || dp->d_ino == WINO)
125951439Swpaul			continue;
126051439Swpaul		if (dp->d_namlen == 1 && dp->d_name[0] == '.')
126139583Swpaul			continue;
126248992Swpaul		isdotdot = dp->d_namlen == 2 && dp->d_name[0] == '.' &&
126339583Swpaul		    dp->d_name[1] == '.';
126439583Swpaul		if (isdotdot && skipparent == 1)
126539583Swpaul			continue;
126639583Swpaul		if (debug)
126739583Swpaul			printf("Directory %d removing ino %d name %s\n",
126839583Swpaul			    ino, dp->d_ino, dp->d_name);
126939583Swpaul		diroff = lblktosize(fs, lbn) + dpoff;
127039583Swpaul		ino_remref(ino, dp->d_ino, diroff, isdotdot);
127139583Swpaul	}
127239583Swpaul}
127339583Swpaul
127439583Swpaul/*
127539583Swpaul * Reclaim an inode, freeing all blocks and decrementing all children's
127639583Swpaul * link counts.  Free the inode back to the cg.
127739583Swpaul */
127839583Swpaulstatic void
127939583Swpaulino_reclaim(union dinode *ip, ino_t ino, int mode)
128039583Swpaul{
128139583Swpaul	uint32_t gen;
128239583Swpaul
128339583Swpaul	if (ino == ROOTINO)
128439583Swpaul		err_suj("Attempting to free ROOTINO\n");
128539583Swpaul	if (debug)
128639583Swpaul		printf("Truncating and freeing ino %d, nlink %d, mode %o\n",
128739583Swpaul		    ino, DIP(ip, di_nlink), DIP(ip, di_mode));
128839583Swpaul
128936270Swpaul	/* We are freeing an inode or directory. */
129036270Swpaul	if ((DIP(ip, di_mode) & IFMT) == IFDIR)
129139583Swpaul		ino_visit(ip, ino, ino_free_children, 0);
129239583Swpaul	DIP_SET(ip, di_nlink, 0);
129336270Swpaul	ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR);
129439583Swpaul	/* Here we have to clear the inode and release any blocks it holds. */
129539583Swpaul	gen = DIP(ip, di_gen);
129639583Swpaul	if (fs->fs_magic == FS_UFS1_MAGIC)
129739583Swpaul		bzero(ip, sizeof(struct ufs1_dinode));
129839583Swpaul	else
129939583Swpaul		bzero(ip, sizeof(struct ufs2_dinode));
130039583Swpaul	DIP_SET(ip, di_gen, gen);
130139583Swpaul	ino_dirty(ino);
130239583Swpaul	ino_free(ino, mode);
130339583Swpaul	return;
130439583Swpaul}
130551439Swpaul
130639583Swpaul/*
130739583Swpaul * Adjust an inode's link count down by one when a directory goes away.
130839583Swpaul */
130939583Swpaulstatic void
131050468Swpaulino_decr(ino_t ino)
131139583Swpaul{
131239583Swpaul	union dinode *ip;
131336270Swpaul	int reqlink;
131450462Swpaul	int nlink;
131550462Swpaul	int mode;
131650462Swpaul
131736270Swpaul	ip = ino_read(ino);
131850462Swpaul	nlink = DIP(ip, di_nlink);
131950462Swpaul	mode = DIP(ip, di_mode);
132045155Swpaul	if (nlink < 1)
132145155Swpaul		err_suj("Inode %d link count %d invalid\n", ino, nlink);
132245155Swpaul	if (mode == 0)
132345155Swpaul		err_suj("Inode %d has a link of %d with 0 mode\n", ino, nlink);
132445155Swpaul	nlink--;
132545155Swpaul	if ((mode & IFMT) == IFDIR)
132645155Swpaul		reqlink = 2;
132745166Swpaul	else
132845155Swpaul		reqlink = 1;
132945155Swpaul	if (nlink < reqlink) {
133045155Swpaul		if (debug)
133145155Swpaul			printf("ino %d not enough links to live %d < %d\n",
133245155Swpaul			    ino, nlink, reqlink);
133336270Swpaul		ino_reclaim(ip, ino, mode);
133436270Swpaul		return;
133539583Swpaul	}
133639583Swpaul	DIP_SET(ip, di_nlink, nlink);
133739583Swpaul	ino_dirty(ino);
133839583Swpaul}
133939583Swpaul
134038030Swpaul/*
134139583Swpaul * Adjust the inode link count to 'nlink'.  If the count reaches zero
134239583Swpaul * free it.
134336270Swpaul */
134436270Swpaulstatic void
134548992Swpaulino_adjust(struct suj_ino *sino)
134636270Swpaul{
134736270Swpaul	struct jrefrec *rrec;
134848992Swpaul	struct suj_rec *srec;
134948992Swpaul	struct suj_ino *stmp;
135048992Swpaul	union dinode *ip;
135148992Swpaul	nlink_t nlink;
135248992Swpaul	int recmode;
135348992Swpaul	int reqlink;
135448992Swpaul	int isdot;
135548992Swpaul	int mode;
135648992Swpaul	ino_t ino;
135748992Swpaul
135848992Swpaul	nlink = sino->si_nlink;
135948992Swpaul	ino = sino->si_ino;
136048992Swpaul	mode = sino->si_mode & IFMT;
136148992Swpaul	/*
136248992Swpaul	 * If it's a directory with no dot links, it was truncated before
136350462Swpaul	 * the name was cleared.  We need to clear the dirent that
136450462Swpaul	 * points at it.
136550462Swpaul	 */
136651439Swpaul	if (mode == IFDIR && nlink == 1 && sino->si_dotlinks == 0) {
136750462Swpaul		sino->si_nlink = nlink = 0;
136850462Swpaul		TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
136948992Swpaul			rrec = (struct jrefrec *)srec->sr_rec;
137048992Swpaul			if (ino_isat(rrec->jr_parent, rrec->jr_diroff, ino,
137148992Swpaul			    &recmode, &isdot) == 0)
137249010Swpaul				continue;
137348992Swpaul			ino_clrat(rrec->jr_parent, rrec->jr_diroff, ino);
137448992Swpaul			break;
137548992Swpaul		}
137648992Swpaul		if (srec == NULL)
137748992Swpaul			errx(1, "Directory %d name not found", ino);
137848992Swpaul	}
137936270Swpaul	/*
138036270Swpaul	 * If it's a directory with no real names pointing to it go ahead
138136270Swpaul	 * and truncate it.  This will free any children.
138236270Swpaul	 */
138336270Swpaul	if (mode == IFDIR && nlink - sino->si_dotlinks == 0) {
138436270Swpaul		sino->si_nlink = nlink = 0;
138536270Swpaul		/*
138636270Swpaul		 * Mark any .. links so they know not to free this inode
138736270Swpaul		 * when they are removed.
138836270Swpaul		 */
138936270Swpaul		TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
139036270Swpaul			rrec = (struct jrefrec *)srec->sr_rec;
139136270Swpaul			if (rrec->jr_diroff == DOTDOT_OFFSET) {
139236270Swpaul				stmp = ino_lookup(rrec->jr_parent, 0);
139336270Swpaul				if (stmp)
139436270Swpaul					ino_setskip(stmp, ino);
139536270Swpaul			}
139636270Swpaul		}
139736270Swpaul	}
139836270Swpaul	ip = ino_read(ino);
139936270Swpaul	mode = DIP(ip, di_mode) & IFMT;
140036270Swpaul	if (nlink > LINK_MAX)
140136270Swpaul		err_suj(
140236270Swpaul		    "ino %d nlink manipulation error, new link %d, old link %d\n",
140336270Swpaul		    ino, nlink, DIP(ip, di_nlink));
140436270Swpaul	if (debug)
140536270Swpaul		printf("Adjusting ino %d, nlink %d, old link %d lastmode %o\n",
140636270Swpaul		    ino, nlink, DIP(ip, di_nlink), sino->si_mode);
140736270Swpaul	if (mode == 0) {
140836270Swpaul		if (debug)
140936270Swpaul			printf("ino %d, zero inode freeing bitmap\n", ino);
141036270Swpaul		ino_free(ino, sino->si_mode);
141136270Swpaul		return;
141236270Swpaul	}
141336270Swpaul	/* XXX Should be an assert? */
141436270Swpaul	if (mode != sino->si_mode && debug)
141536270Swpaul		printf("ino %d, mode %o != %o\n", ino, mode, sino->si_mode);
141636270Swpaul	if ((mode & IFMT) == IFDIR)
141736270Swpaul		reqlink = 2;
141836270Swpaul	else
141940795Swpaul		reqlink = 1;
142036270Swpaul	/* If the inode doesn't have enough links to live, free it. */
142137626Swpaul	if (nlink < reqlink) {
142239583Swpaul		if (debug)
142339583Swpaul			printf("ino %d not enough links to live %d < %d\n",
142440795Swpaul			    ino, nlink, reqlink);
142536270Swpaul		ino_reclaim(ip, ino, mode);
142636270Swpaul		return;
142736270Swpaul	}
142836270Swpaul	/* If required write the updated link count. */
142936270Swpaul	if (DIP(ip, di_nlink) == nlink) {
143036270Swpaul		if (debug)
143136270Swpaul			printf("ino %d, link matches, skipping.\n", ino);
143236270Swpaul		return;
143336270Swpaul	}
143436270Swpaul	DIP_SET(ip, di_nlink, nlink);
143536270Swpaul	ino_dirty(ino);
143636270Swpaul}
143736270Swpaul
143836270Swpaul/*
143936270Swpaul * Truncate some or all blocks in an indirect, freeing any that are required
144036270Swpaul * and zeroing the indirect.
144136270Swpaul */
144237626Swpaulstatic void
144336270Swpaulindir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn)
144436270Swpaul{
144536270Swpaul	ufs2_daddr_t *bap2;
144636270Swpaul	ufs1_daddr_t *bap1;
144736270Swpaul	ufs_lbn_t lbnadd;
144856060Swpaul	ufs2_daddr_t nblk;
144936270Swpaul	ufs_lbn_t next;
145036270Swpaul	ufs_lbn_t nlbn;
145136270Swpaul	int dirty;
145236270Swpaul	int level;
145336270Swpaul	int i;
145436270Swpaul
145556060Swpaul	if (blk == 0)
145639583Swpaul		return;
145736270Swpaul	dirty = 0;
145836270Swpaul	level = lbn_level(lbn);
145936270Swpaul	if (level == -1)
146036270Swpaul		err_suj("Invalid level for lbn %jd\n", lbn);
146145155Swpaul	lbnadd = 1;
146245155Swpaul	for (i = level; i > 0; i--)
146345155Swpaul		lbnadd *= NINDIR(fs);
146445155Swpaul	bap1 = (void *)dblk_read(blk, fs->fs_bsize);
146536270Swpaul	bap2 = (void *)bap1;
146636270Swpaul	for (i = 0; i < NINDIR(fs); i++) {
146736270Swpaul		if (fs->fs_magic == FS_UFS1_MAGIC)
146836270Swpaul			nblk = *bap1++;
146937626Swpaul		else
147037626Swpaul			nblk = *bap2++;
147156060Swpaul		if (nblk == 0)
147236270Swpaul			continue;
147336270Swpaul		if (level != 0) {
147436270Swpaul			nlbn = (lbn + 1) - (i * lbnadd);
147536270Swpaul			/*
147636270Swpaul			 * Calculate the lbn of the next indirect to
147736270Swpaul			 * determine if any of this indirect must be
147836270Swpaul			 * reclaimed.
147936270Swpaul			 */
148036270Swpaul			next = -(lbn + level) + ((i+1) * lbnadd);
148136270Swpaul			if (next <= lastlbn)
148236270Swpaul				continue;
148336270Swpaul			indir_trunc(ino, nlbn, nblk, lastlbn);
148436270Swpaul			/* If all of this indirect was reclaimed, free it. */
148536270Swpaul			nlbn = next - lbnadd;
148636270Swpaul			if (nlbn < lastlbn)
148736270Swpaul				continue;
148836270Swpaul		} else {
148936270Swpaul			nlbn = -lbn + i * lbnadd;
149036270Swpaul			if (nlbn < lastlbn)
149136270Swpaul				continue;
149236270Swpaul		}
149336270Swpaul		dirty = 1;
149436270Swpaul		blk_free(nblk, 0, fs->fs_frag);
149536270Swpaul		if (fs->fs_magic == FS_UFS1_MAGIC)
149636270Swpaul			*(bap1 - 1) = 0;
149736270Swpaul		else
149836270Swpaul			*(bap2 - 1) = 0;
149936270Swpaul	}
150036270Swpaul	if (dirty)
150136270Swpaul		dblk_dirty(blk);
150236270Swpaul}
150336270Swpaul
150436270Swpaul/*
150536270Swpaul * Truncate an inode to the minimum of the given size or the last populated
150636270Swpaul * block after any over size have been discarded.  The kernel would allocate
150737626Swpaul * the last block in the file but fsck does not and neither do we.  This
150836270Swpaul * code never extends files, only shrinks them.
150936270Swpaul */
151036270Swpaulstatic void
151136270Swpaulino_trunc(ino_t ino, off_t size)
151256060Swpaul{
151356060Swpaul	union dinode *ip;
151456060Swpaul	ufs2_daddr_t bn;
151556060Swpaul	uint64_t totalfrags;
151636270Swpaul	ufs_lbn_t nextlbn;
151736270Swpaul	ufs_lbn_t lastlbn;
151836270Swpaul	ufs_lbn_t tmpval;
151936270Swpaul	ufs_lbn_t lbn;
152036270Swpaul	ufs_lbn_t i;
152139583Swpaul	int frags;
152239583Swpaul	off_t cursize;
152339583Swpaul	off_t off;
152439583Swpaul	int mode;
152539583Swpaul
152639583Swpaul	ip = ino_read(ino);
152739583Swpaul	mode = DIP(ip, di_mode) & IFMT;
152836270Swpaul	cursize = DIP(ip, di_size);
152936270Swpaul	if (debug)
153036270Swpaul		printf("Truncating ino %d, mode %o to size %jd from size %jd\n",
153136270Swpaul		    ino, mode, size, cursize);
153236270Swpaul
153336270Swpaul	/* Skip datablocks for short links and devices. */
153436270Swpaul	if (mode == 0 || mode == IFBLK || mode == IFCHR ||
153536270Swpaul	    (mode == IFLNK && cursize < fs->fs_maxsymlinklen))
153636270Swpaul		return;
153737626Swpaul	/* Don't extend. */
153837626Swpaul	if (size > cursize)
153937626Swpaul		size = cursize;
154037626Swpaul	lastlbn = lblkno(fs, blkroundup(fs, size));
154137626Swpaul	for (i = lastlbn; i < NDADDR; i++) {
154237626Swpaul		if (DIP(ip, di_db[i]) == 0)
154339583Swpaul			continue;
154439583Swpaul		frags = sblksize(fs, cursize, i);
154537626Swpaul		frags = numfrags(fs, frags);
154637626Swpaul		blk_free(DIP(ip, di_db[i]), 0, frags);
154737626Swpaul		DIP_SET(ip, di_db[i], 0);
154837626Swpaul	}
154937626Swpaul	/*
155036270Swpaul	 * Follow indirect blocks, freeing anything required.
155136270Swpaul	 */
155236270Swpaul	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
155336270Swpaul	    lbn = nextlbn) {
155436270Swpaul		nextlbn = lbn + tmpval;
155536270Swpaul		tmpval *= NINDIR(fs);
155636270Swpaul		/* If we're not freeing any in this indirect range skip it. */
155736270Swpaul		if (lastlbn >= nextlbn)
155836270Swpaul			continue;
155936270Swpaul		if (DIP(ip, di_ib[i]) == 0)
156036270Swpaul			continue;
156136270Swpaul		indir_trunc(ino, -lbn - i, DIP(ip, di_ib[i]), lastlbn);
156236270Swpaul		/* If we freed everything in this indirect free the indir. */
156336270Swpaul		if (lastlbn > lbn)
156436270Swpaul			continue;
156536270Swpaul		blk_free(DIP(ip, di_ib[i]), 0, frags);
156636270Swpaul		DIP_SET(ip, di_ib[i], 0);
156736270Swpaul	}
156836270Swpaul	ino_dirty(ino);
156951583Swpaul	/*
157036270Swpaul	 * Now that we've freed any whole blocks that exceed the desired
157136270Swpaul	 * truncation size, figure out how many blocks remain and what the
157236270Swpaul	 * last populated lbn is.  We will set the size to this last lbn
157336270Swpaul	 * rather than worrying about allocating the final lbn as the kernel
157436270Swpaul	 * would've done.  This is consistent with normal fsck behavior.
157536270Swpaul	 */
157636270Swpaul	visitlbn = 0;
157736270Swpaul	totalfrags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
157836270Swpaul	if (size > lblktosize(fs, visitlbn + 1))
157936270Swpaul		size = lblktosize(fs, visitlbn + 1);
158036270Swpaul	/*
158136270Swpaul	 * If we're truncating direct blocks we have to adjust frags
158236270Swpaul	 * accordingly.
158336270Swpaul	 */
158436270Swpaul	if (visitlbn < NDADDR && totalfrags) {
158536270Swpaul		long oldspace, newspace;
158636270Swpaul
158736270Swpaul		bn = DIP(ip, di_db[visitlbn]);
158836270Swpaul		if (bn == 0)
158936270Swpaul			err_suj("Bad blk at ino %d lbn %jd\n", ino, visitlbn);
159036270Swpaul		oldspace = sblksize(fs, cursize, visitlbn);
159136270Swpaul		newspace = sblksize(fs, size, visitlbn);
159236270Swpaul		if (oldspace != newspace) {
159356060Swpaul			bn += numfrags(fs, newspace);
159436270Swpaul			frags = numfrags(fs, oldspace - newspace);
159556060Swpaul			blk_free(bn, 0, frags);
159636270Swpaul			totalfrags -= frags;
159756060Swpaul		}
159836270Swpaul	}
159936270Swpaul	DIP_SET(ip, di_blocks, fsbtodb(fs, totalfrags));
160036270Swpaul	DIP_SET(ip, di_size, size);
160139583Swpaul	/*
160236270Swpaul	 * If we've truncated into the middle of a block or frag we have
160356060Swpaul	 * to zero it here.  Otherwise the file could extend into
160456060Swpaul	 * uninitialized space later.
160539583Swpaul	 */
160636270Swpaul	off = blkoff(fs, size);
160736270Swpaul	if (off && DIP(ip, di_mode) != IFDIR) {
160836270Swpaul		uint8_t *buf;
160936270Swpaul		long clrsize;
161036270Swpaul
161136270Swpaul		bn = ino_blkatoff(ip, ino, visitlbn, &frags);
161236270Swpaul		if (bn == 0)
161336270Swpaul			err_suj("Block missing from ino %d at lbn %jd\n",
161436270Swpaul			    ino, visitlbn);
161536270Swpaul		clrsize = frags * fs->fs_fsize;
161636270Swpaul		buf = dblk_read(bn, clrsize);
161736270Swpaul		clrsize -= off;
161836270Swpaul		buf += off;
161936270Swpaul		bzero(buf, clrsize);
162036270Swpaul		dblk_dirty(bn);
162136270Swpaul	}
162236270Swpaul	return;
162336270Swpaul}
162436270Swpaul
162536270Swpaul/*
162636270Swpaul * Process records available for one inode and determine whether the
162736270Swpaul * link count is correct or needs adjusting.
162836270Swpaul */
162936270Swpaulstatic void
163036270Swpaulino_check(struct suj_ino *sino)
163136270Swpaul{
163236270Swpaul	struct suj_rec *srec;
163336270Swpaul	struct jrefrec *rrec;
163436270Swpaul	nlink_t dotlinks;
163536270Swpaul	int newlinks;
163637626Swpaul	int removes;
163737626Swpaul	int nlink;
163836270Swpaul	ino_t ino;
163936270Swpaul	int isdot;
164036270Swpaul	int isat;
164136270Swpaul	int mode;
164236270Swpaul
164336270Swpaul	if (sino->si_hasrecs == 0)
164436270Swpaul		return;
164536270Swpaul	ino = sino->si_ino;
164636270Swpaul	rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec;
164736270Swpaul	nlink = rrec->jr_nlink;
164836270Swpaul	newlinks = 0;
164936270Swpaul	dotlinks = 0;
165036270Swpaul	removes = sino->si_nlinkadj;
165136270Swpaul	TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
165236270Swpaul		rrec = (struct jrefrec *)srec->sr_rec;
165336270Swpaul		isat = ino_isat(rrec->jr_parent, rrec->jr_diroff,
165436270Swpaul		    rrec->jr_ino, &mode, &isdot);
165536270Swpaul		if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT))
165636270Swpaul			err_suj("Inode mode/directory type mismatch %o != %o\n",
165736270Swpaul			    mode, rrec->jr_mode);
165836270Swpaul		if (debug)
165936270Swpaul			printf("jrefrec: op %d ino %d, nlink %d, parent %d, "
166036270Swpaul			    "diroff %jd, mode %o, isat %d, isdot %d\n",
166136270Swpaul			    rrec->jr_op, rrec->jr_ino, rrec->jr_nlink,
166236270Swpaul			    rrec->jr_parent, rrec->jr_diroff, rrec->jr_mode,
166336270Swpaul			    isat, isdot);
166436270Swpaul		mode = rrec->jr_mode & IFMT;
166536270Swpaul		if (rrec->jr_op == JOP_REMREF)
166636270Swpaul			removes++;
166736270Swpaul		newlinks += isat;
166836270Swpaul		if (isdot)
166936270Swpaul			dotlinks += isat;
167036270Swpaul	}
167136270Swpaul	/*
167236270Swpaul	 * The number of links that remain are the starting link count
167336270Swpaul	 * subtracted by the total number of removes with the total
167436270Swpaul	 * links discovered back in.  An incomplete remove thus
167536270Swpaul	 * makes no change to the link count but an add increases
167636270Swpaul	 * by one.
167736270Swpaul	 */
167836270Swpaul	if (debug)
167936270Swpaul		printf("ino %d nlink %d newlinks %d removes %d dotlinks %d\n",
168036270Swpaul		    ino, nlink, newlinks, removes, dotlinks);
168136270Swpaul	nlink += newlinks;
168236270Swpaul	nlink -= removes;
168339583Swpaul	sino->si_linkadj = 1;
168436270Swpaul	sino->si_nlink = nlink;
168539583Swpaul	sino->si_dotlinks = dotlinks;
168651439Swpaul	sino->si_mode = mode;
168736270Swpaul	ino_adjust(sino);
168839583Swpaul}
168936270Swpaul
169036270Swpaul/*
169139583Swpaul * Process records available for one block and determine whether it is
169236270Swpaul * still allocated and whether the owning inode needs to be updated or
169336270Swpaul * a free completed.
169436270Swpaul */
169536270Swpaulstatic void
169636270Swpaulblk_check(struct suj_blk *sblk)
169736270Swpaul{
169836270Swpaul	struct suj_rec *srec;
169936270Swpaul	struct jblkrec *brec;
170036270Swpaul	struct suj_ino *sino;
170136270Swpaul	ufs2_daddr_t blk;
170236270Swpaul	int mask;
170336270Swpaul	int frags;
170436270Swpaul	int isat;
170536270Swpaul
170639627Swpaul	/*
170739627Swpaul	 * Each suj_blk actually contains records for any fragments in that
170841656Swpaul	 * block.  As a result we must evaluate each record individually.
170936270Swpaul	 */
171039583Swpaul	sino = NULL;
171137626Swpaul	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
171236270Swpaul		brec = (struct jblkrec *)srec->sr_rec;
171339583Swpaul		frags = brec->jb_frags;
171436270Swpaul		blk = brec->jb_blkno + brec->jb_oldfrags;
171536270Swpaul		isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags);
171636270Swpaul		if (sino == NULL || sino->si_ino != brec->jb_ino) {
171736270Swpaul			sino = ino_lookup(brec->jb_ino, 1);
171836270Swpaul			sino->si_blkadj = 1;
171936270Swpaul		}
172036270Swpaul		if (debug)
172136270Swpaul			printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n",
172236270Swpaul			    brec->jb_op, blk, brec->jb_ino, brec->jb_lbn,
172336270Swpaul			    brec->jb_frags, isat, frags);
172436270Swpaul		/*
172536270Swpaul		 * If we found the block at this address we still have to
172636270Swpaul		 * determine if we need to free the tail end that was
172739583Swpaul		 * added by adding contiguous fragments from the same block.
172839583Swpaul		 */
172936270Swpaul		if (isat == 1) {
173036270Swpaul			if (frags == brec->jb_frags)
173136270Swpaul				continue;
173236270Swpaul			mask = blk_freemask(blk, brec->jb_ino, brec->jb_lbn,
173336270Swpaul			    brec->jb_frags);
173436270Swpaul			mask >>= frags;
173539583Swpaul			blk += frags;
173639583Swpaul			frags = brec->jb_frags - frags;
173736270Swpaul			blk_free(blk, mask, frags);
173836270Swpaul			continue;
173936270Swpaul		}
174036270Swpaul		/*
174136270Swpaul	 	 * The block wasn't found, attempt to free it.  It won't be
174236270Swpaul		 * freed if it was actually reallocated.  If this was an
174336270Swpaul		 * allocation we don't want to follow indirects as they
174436270Swpaul		 * may not be written yet.  Any children of the indirect will
174539583Swpaul		 * have their own records.  If it's a free we need to
174636270Swpaul		 * recursively free children.
174736270Swpaul		 */
174839583Swpaul		blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags,
174939583Swpaul		    brec->jb_op == JOP_FREEBLK);
175036270Swpaul	}
175136270Swpaul}
175236270Swpaul
175336270Swpaul/*
175436270Swpaul * Walk the list of inode records for this cg and resolve moved and duplicate
175536270Swpaul * inode references now that we have a complete picture.
175636270Swpaul */
175736270Swpaulstatic void
175839583Swpaulcg_build(struct suj_cg *sc)
175939583Swpaul{
176039583Swpaul	struct suj_ino *sino;
176139583Swpaul	int i;
176239583Swpaul
176339583Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++)
176436270Swpaul		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
176536270Swpaul			ino_build(sino);
176636270Swpaul}
176736270Swpaul
176836270Swpaul/*
176936270Swpaul * Handle inodes requiring truncation.  This must be done prior to
177036270Swpaul * looking up any inodes in directories.
177136270Swpaul */
177239583Swpaulstatic void
177339583Swpaulcg_trunc(struct suj_cg *sc)
177436270Swpaul{
177536270Swpaul	struct suj_ino *sino;
177636270Swpaul	int i;
177736270Swpaul
177836270Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++) {
177939583Swpaul		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) {
178039583Swpaul			if (sino->si_trunc) {
178136270Swpaul				ino_trunc(sino->si_ino,
178236270Swpaul				    sino->si_trunc->jt_size);
178336270Swpaul				sino->si_blkadj = 0;
178436270Swpaul				sino->si_trunc = NULL;
178536270Swpaul			}
178636270Swpaul			if (sino->si_blkadj)
178736270Swpaul				ino_adjblks(sino);
178836270Swpaul		}
178936270Swpaul	}
179036270Swpaul}
179136270Swpaul
179236270Swpaul/*
179336270Swpaul * Free any partially allocated blocks and then resolve inode block
179436270Swpaul * counts.
179536270Swpaul */
179636270Swpaulstatic void
179737626Swpaulcg_check_blk(struct suj_cg *sc)
179839583Swpaul{
179937626Swpaul	struct suj_blk *sblk;
180036270Swpaul	int i;
180137626Swpaul
180237626Swpaul
180337626Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++)
180436270Swpaul		LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next)
180536270Swpaul			blk_check(sblk);
180636270Swpaul}
180736270Swpaul
180836270Swpaul/*
180936270Swpaul * Walk the list of inode records for this cg, recovering any
181036270Swpaul * changes which were not complete at the time of crash.
181136270Swpaul */
181236270Swpaulstatic void
181350462Swpaulcg_check_ino(struct suj_cg *sc)
181436270Swpaul{
181548992Swpaul	struct suj_ino *sino;
181636270Swpaul	int i;
181748992Swpaul
181848992Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++)
181936270Swpaul		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
182036270Swpaul			ino_check(sino);
182136270Swpaul}
182236270Swpaul
182336270Swpaul/*
182436270Swpaul * Write a potentially dirty cg.  Recalculate the summary information and
182536270Swpaul * update the superblock summary.
182639583Swpaul */
182739583Swpaulstatic void
182839583Swpaulcg_write(struct suj_cg *sc)
182939583Swpaul{
183039583Swpaul	ufs1_daddr_t fragno, cgbno, maxbno;
183139583Swpaul	u_int8_t *blksfree;
183236270Swpaul	struct cg *cgp;
183336270Swpaul	int blk;
183436270Swpaul	int i;
183536270Swpaul
183636270Swpaul	if (sc->sc_dirty == 0)
183736270Swpaul		return;
183836270Swpaul	/*
183936270Swpaul	 * Fix the frag and cluster summary.
184036270Swpaul	 */
184151439Swpaul	cgp = sc->sc_cgp;
184251439Swpaul	cgp->cg_cs.cs_nbfree = 0;
184351439Swpaul	cgp->cg_cs.cs_nffree = 0;
184451439Swpaul	bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum));
184551439Swpaul	maxbno = fragstoblks(fs, fs->fs_fpg);
184651439Swpaul	if (fs->fs_contigsumsize > 0) {
184751439Swpaul		for (i = 1; i <= fs->fs_contigsumsize; i++)
184851439Swpaul			cg_clustersum(cgp)[i] = 0;
184951439Swpaul		bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT));
185051439Swpaul	}
185151439Swpaul	blksfree = cg_blksfree(cgp);
185251439Swpaul	for (cgbno = 0; cgbno < maxbno; cgbno++) {
185351439Swpaul		if (ffs_isfreeblock(fs, blksfree, cgbno))
185451439Swpaul			continue;
185536270Swpaul		if (ffs_isblock(fs, blksfree, cgbno)) {
185636302Swpaul			ffs_clusteracct(fs, cgp, cgbno, 1);
185750462Swpaul			cgp->cg_cs.cs_nbfree++;
185850462Swpaul			continue;
185950462Swpaul		}
186050462Swpaul		fragno = blkstofrags(fs, cgbno);
186150462Swpaul		blk = blkmap(fs, blksfree, fragno);
186248992Swpaul		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
186348992Swpaul		for (i = 0; i < fs->fs_frag; i++)
186436302Swpaul			if (isset(blksfree, fragno + i))
186536270Swpaul				cgp->cg_cs.cs_nffree++;
186636270Swpaul	}
186736270Swpaul	/*
186836270Swpaul	 * Update the superblock cg summary from our now correct values
186936270Swpaul	 * before writing the block.
187036270Swpaul	 */
187136270Swpaul	fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs;
187236270Swpaul	if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf,
187336270Swpaul	    fs->fs_bsize) == -1)
187436270Swpaul		err_suj("Unable to write cylinder group %d\n", sc->sc_cgx);
187536270Swpaul}
187636270Swpaul
187736270Swpaul/*
187836270Swpaul * Write out any modified inodes.
187936270Swpaul */
188036270Swpaulstatic void
188136270Swpaulcg_write_inos(struct suj_cg *sc)
188236270Swpaul{
188336270Swpaul	struct ino_blk *iblk;
188436270Swpaul	int i;
188536270Swpaul
188636270Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++)
188736270Swpaul		LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next)
188836270Swpaul			if (iblk->ib_dirty)
188936270Swpaul				iblk_write(iblk);
189036270Swpaul}
189136270Swpaul
189236270Swpaulstatic void
189336270Swpaulcg_apply(void (*apply)(struct suj_cg *))
189436270Swpaul{
189536270Swpaul	struct suj_cg *scg;
189636270Swpaul	int i;
189736270Swpaul
189836270Swpaul	for (i = 0; i < SUJ_HASHSIZE; i++)
189936270Swpaul		LIST_FOREACH(scg, &cghash[i], sc_next)
190036270Swpaul			apply(scg);
190136270Swpaul}
190236270Swpaul
190336270Swpaul/*
190436270Swpaul * Process the unlinked but referenced file list.  Freeing all inodes.
190536270Swpaul */
190636270Swpaulstatic void
190736270Swpaulino_unlinked(void)
190836270Swpaul{
190936270Swpaul	union dinode *ip;
191036270Swpaul	uint16_t mode;
191136270Swpaul	ino_t inon;
191236270Swpaul	ino_t ino;
191336270Swpaul
191436270Swpaul	ino = fs->fs_sujfree;
191556060Swpaul	fs->fs_sujfree = 0;
191636270Swpaul	while (ino != 0) {
191736270Swpaul		ip = ino_read(ino);
191836270Swpaul		mode = DIP(ip, di_mode) & IFMT;
191936270Swpaul		inon = DIP(ip, di_freelink);
192036270Swpaul		DIP_SET(ip, di_freelink, 0);
192136270Swpaul		/*
192256060Swpaul		 * XXX Should this be an errx?
192336270Swpaul		 */
192436270Swpaul		if (DIP(ip, di_nlink) == 0) {
192536270Swpaul			if (debug)
192636270Swpaul				printf("Freeing unlinked ino %d mode %o\n",
192736270Swpaul				    ino, mode);
192836270Swpaul			ino_reclaim(ip, ino, mode);
192936270Swpaul		} else if (debug)
193036270Swpaul			printf("Skipping ino %d mode %o with link %d\n",
193136270Swpaul			    ino, mode, DIP(ip, di_nlink));
193236270Swpaul		ino = inon;
193336270Swpaul	}
193436270Swpaul}
193536270Swpaul
193636270Swpaul/*
193736270Swpaul * Append a new record to the list of records requiring processing.
193836270Swpaul */
193936270Swpaulstatic void
194036270Swpaulino_append(union jrec *rec)
194136270Swpaul{
194236270Swpaul	struct jrefrec *refrec;
194336270Swpaul	struct jmvrec *mvrec;
194439583Swpaul	struct suj_ino *sino;
194539583Swpaul	struct suj_rec *srec;
194636270Swpaul
194736270Swpaul	mvrec = &rec->rec_jmvrec;
194836270Swpaul	refrec = &rec->rec_jrefrec;
194936270Swpaul	if (debug && mvrec->jm_op == JOP_MVREF)
195036270Swpaul		printf("ino move: ino %d, parent %d, diroff %jd, oldoff %jd\n",
195136270Swpaul		    mvrec->jm_ino, mvrec->jm_parent, mvrec->jm_newoff,
195236270Swpaul		    mvrec->jm_oldoff);
195336270Swpaul	else if (debug &&
195436270Swpaul	    (refrec->jr_op == JOP_ADDREF || refrec->jr_op == JOP_REMREF))
195536270Swpaul		printf("ino ref: op %d, ino %d, nlink %d, "
195636270Swpaul		    "parent %d, diroff %jd\n",
195736270Swpaul		    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
195836270Swpaul		    refrec->jr_parent, refrec->jr_diroff);
195936270Swpaul	sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1);
196036270Swpaul	sino->si_hasrecs = 1;
196136270Swpaul	srec = errmalloc(sizeof(*srec));
196236270Swpaul	srec->sr_rec = rec;
196336270Swpaul	TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next);
196436270Swpaul}
196536270Swpaul
196636270Swpaul/*
196736270Swpaul * Add a reference adjustment to the sino list and eliminate dups.  The
196836270Swpaul * primary loop in ino_build_ref() checks for dups but new ones may be
196936270Swpaul * created as a result of offset adjustments.
197036270Swpaul */
197136270Swpaulstatic void
197236270Swpaulino_add_ref(struct suj_ino *sino, struct suj_rec *srec)
197336270Swpaul{
197436270Swpaul	struct jrefrec *refrec;
197536270Swpaul	struct suj_rec *srn;
197636270Swpaul	struct jrefrec *rrn;
197736270Swpaul
197836270Swpaul	refrec = (struct jrefrec *)srec->sr_rec;
197936270Swpaul	/*
198036270Swpaul	 * We walk backwards so that the oldest link count is preserved.  If
198136270Swpaul	 * an add record conflicts with a remove keep the remove.  Redundant
198236270Swpaul	 * removes are eliminated in ino_build_ref.  Otherwise we keep the
198336270Swpaul	 * oldest record at a given location.
198436270Swpaul	 */
198536270Swpaul	for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
198636270Swpaul	    srn = TAILQ_PREV(srn, srechd, sr_next)) {
198736270Swpaul		rrn = (struct jrefrec *)srn->sr_rec;
198836270Swpaul		if (rrn->jr_parent != refrec->jr_parent ||
198936270Swpaul		    rrn->jr_diroff != refrec->jr_diroff)
199036270Swpaul			continue;
199136270Swpaul		if (rrn->jr_op == JOP_REMREF || refrec->jr_op == JOP_ADDREF) {
199236270Swpaul			rrn->jr_mode = refrec->jr_mode;
199336270Swpaul			return;
199436270Swpaul		}
199536270Swpaul		/*
199636270Swpaul		 * Adding a remove.
199736270Swpaul		 *
199836270Swpaul		 * Replace the record in place with the old nlink in case
199936270Swpaul		 * we replace the head of the list.  Abandon srec as a dup.
200036270Swpaul		 */
200136270Swpaul		refrec->jr_nlink = rrn->jr_nlink;
200236270Swpaul		srn->sr_rec = srec->sr_rec;
200336270Swpaul		return;
200436270Swpaul	}
200536270Swpaul	TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next);
200636270Swpaul}
200736270Swpaul
200836270Swpaul/*
200936270Swpaul * Create a duplicate of a reference at a previous location.
201036270Swpaul */
201136270Swpaulstatic void
201236270Swpaulino_dup_ref(struct suj_ino *sino, struct jrefrec *refrec, off_t diroff)
201336270Swpaul{
201436270Swpaul	struct jrefrec *rrn;
201536270Swpaul	struct suj_rec *srn;
201636270Swpaul
201736270Swpaul	rrn = errmalloc(sizeof(*refrec));
201836270Swpaul	*rrn = *refrec;
201941526Swpaul	rrn->jr_op = JOP_ADDREF;
202041526Swpaul	rrn->jr_diroff = diroff;
202141526Swpaul	srn = errmalloc(sizeof(*srn));
202241526Swpaul	srn->sr_rec = (union jrec *)rrn;
202341526Swpaul	ino_add_ref(sino, srn);
202441526Swpaul}
202536270Swpaul
202636270Swpaul/*
202736270Swpaul * Add a reference to the list at all known locations.  We follow the offset
202836270Swpaul * changes for a single instance and create duplicate add refs at each so
202936270Swpaul * that we can tolerate any version of the directory block.  Eliminate
203036270Swpaul * removes which collide with adds that are seen in the journal.  They should
203136270Swpaul * not adjust the link count down.
203236270Swpaul */
203336270Swpaulstatic void
203439583Swpaulino_build_ref(struct suj_ino *sino, struct suj_rec *srec)
203536270Swpaul{
203636270Swpaul	struct jrefrec *refrec;
203739583Swpaul	struct jmvrec *mvrec;
203839583Swpaul	struct suj_rec *srp;
203936270Swpaul	struct suj_rec *srn;
204036270Swpaul	struct jrefrec *rrn;
204139583Swpaul	off_t diroff;
204236270Swpaul
204336270Swpaul	refrec = (struct jrefrec *)srec->sr_rec;
204436270Swpaul	/*
204542146Swpaul	 * Search for a mvrec that matches this offset.  Whether it's an add
204636270Swpaul	 * or a remove we can delete the mvref after creating a dup record in
204736270Swpaul	 * the old location.
204836270Swpaul	 */
204936270Swpaul	if (!TAILQ_EMPTY(&sino->si_movs)) {
205036270Swpaul		diroff = refrec->jr_diroff;
205136270Swpaul		for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; srn = srp) {
205236270Swpaul			srp = TAILQ_PREV(srn, srechd, sr_next);
205336270Swpaul			mvrec = (struct jmvrec *)srn->sr_rec;
205436270Swpaul			if (mvrec->jm_parent != refrec->jr_parent ||
205536270Swpaul			    mvrec->jm_newoff != diroff)
205636270Swpaul				continue;
205736270Swpaul			diroff = mvrec->jm_oldoff;
205836270Swpaul			TAILQ_REMOVE(&sino->si_movs, srn, sr_next);
205936270Swpaul			free(srn);
206036270Swpaul			ino_dup_ref(sino, refrec, diroff);
206136270Swpaul		}
206250462Swpaul	}
206336270Swpaul	/*
206436270Swpaul	 * If a remove wasn't eliminated by an earlier add just append it to
206536270Swpaul	 * the list.
206636270Swpaul	 */
206736270Swpaul	if (refrec->jr_op == JOP_REMREF) {
206836270Swpaul		ino_add_ref(sino, srec);
206936270Swpaul		return;
207036270Swpaul	}
207136270Swpaul	/*
207236270Swpaul	 * Walk the list of records waiting to be added to the list.  We
207351439Swpaul	 * must check for moves that apply to our current offset and remove
207451439Swpaul	 * them from the list.  Remove any duplicates to eliminate removes
207551439Swpaul	 * with corresponding adds.
207651439Swpaul	 */
207751439Swpaul	TAILQ_FOREACH_SAFE(srn, &sino->si_newrecs, sr_next, srp) {
207851439Swpaul		switch (srn->sr_rec->rec_jrefrec.jr_op) {
207951439Swpaul		case JOP_ADDREF:
208036270Swpaul			/*
208136270Swpaul			 * This should actually be an error we should
208236270Swpaul			 * have a remove for every add journaled.
208339583Swpaul			 */
208439583Swpaul			rrn = (struct jrefrec *)srn->sr_rec;
208539583Swpaul			if (rrn->jr_parent != refrec->jr_parent ||
208639583Swpaul			    rrn->jr_diroff != refrec->jr_diroff)
208736270Swpaul				break;
208836270Swpaul			TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next);
208936270Swpaul			break;
209036270Swpaul		case JOP_REMREF:
209139583Swpaul			/*
209239583Swpaul			 * Once we remove the current iteration of the
209339583Swpaul			 * record at this address we're done.
209439583Swpaul			 */
209536270Swpaul			rrn = (struct jrefrec *)srn->sr_rec;
209650468Swpaul			if (rrn->jr_parent != refrec->jr_parent ||
209750468Swpaul			    rrn->jr_diroff != refrec->jr_diroff)
209836270Swpaul				break;
209941656Swpaul			TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next);
210036270Swpaul			ino_add_ref(sino, srec);
210139583Swpaul			return;
210239583Swpaul		case JOP_MVREF:
210339583Swpaul			/*
210436270Swpaul			 * Update our diroff based on any moves that match
210539583Swpaul			 * and remove the move.
210639583Swpaul			 */
210739583Swpaul			mvrec = (struct jmvrec *)srn->sr_rec;
210839583Swpaul			if (mvrec->jm_parent != refrec->jr_parent ||
210936270Swpaul			    mvrec->jm_oldoff != refrec->jr_diroff)
211036270Swpaul				break;
211136270Swpaul			ino_dup_ref(sino, refrec, mvrec->jm_oldoff);
211236270Swpaul			refrec->jr_diroff = mvrec->jm_newoff;
211336270Swpaul			TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next);
211436270Swpaul			break;
211539583Swpaul		default:
211639583Swpaul			err_suj("ino_build_ref: Unknown op %d\n",
211736270Swpaul			    srn->sr_rec->rec_jrefrec.jr_op);
211836270Swpaul		}
211939583Swpaul	}
212039583Swpaul	ino_add_ref(sino, srec);
212136270Swpaul}
212250462Swpaul
212350462Swpaul/*
212450462Swpaul * Walk the list of new records and add them in-order resolving any
212550462Swpaul * dups and adjusted offsets.
212650462Swpaul */
212750462Swpaulstatic void
212838030Swpaulino_build(struct suj_ino *sino)
212936270Swpaul{
213050468Swpaul	struct suj_rec *srec;
213136270Swpaul
213236270Swpaul	while ((srec = TAILQ_FIRST(&sino->si_newrecs)) != NULL) {
213336270Swpaul		TAILQ_REMOVE(&sino->si_newrecs, srec, sr_next);
213436270Swpaul		switch (srec->sr_rec->rec_jrefrec.jr_op) {
213536270Swpaul		case JOP_ADDREF:
213636270Swpaul		case JOP_REMREF:
213736270Swpaul			ino_build_ref(sino, srec);
213836270Swpaul			break;
213936270Swpaul		case JOP_MVREF:
214036270Swpaul			/*
214136270Swpaul			 * Add this mvrec to the queue of pending mvs.
214236270Swpaul			 */
214336270Swpaul			TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next);
214436270Swpaul			break;
214536270Swpaul		default:
214636270Swpaul			err_suj("ino_build: Unknown op %d\n",
214736270Swpaul			    srec->sr_rec->rec_jrefrec.jr_op);
214836270Swpaul		}
214936270Swpaul	}
215050462Swpaul	if (TAILQ_EMPTY(&sino->si_recs))
215136270Swpaul		sino->si_hasrecs = 0;
215236270Swpaul}
215336270Swpaul
215450462Swpaul/*
215550462Swpaul * Modify journal records so they refer to the base block number
215650462Swpaul * and a start and end frag range.  This is to facilitate the discovery
215750462Swpaul * of overlapping fragment allocations.
215850462Swpaul */
215950462Swpaulstatic void
216036270Swpaulblk_build(struct jblkrec *blkrec)
216136270Swpaul{
216236270Swpaul	struct suj_rec *srec;
216336270Swpaul	struct suj_blk *sblk;
216436270Swpaul	struct jblkrec *blkrn;
216536270Swpaul	ufs2_daddr_t blk;
216636270Swpaul	int frag;
216736270Swpaul
216836270Swpaul	if (debug)
216936270Swpaul		printf("blk_build: op %d blkno %jd frags %d oldfrags %d "
217036270Swpaul		    "ino %d lbn %jd\n",
217136270Swpaul		    blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags,
217250462Swpaul		    blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn);
217336270Swpaul
217436270Swpaul	blk = blknum(fs, blkrec->jb_blkno);
217536270Swpaul	frag = fragnum(fs, blkrec->jb_blkno);
217636270Swpaul	sblk = blk_lookup(blk, 1);
217736270Swpaul	/*
217845155Swpaul	 * Rewrite the record using oldfrags to indicate the offset into
217945155Swpaul	 * the block.  Leave jb_frags as the actual allocated count.
218045155Swpaul	 */
218145155Swpaul	blkrec->jb_blkno -= frag;
218245155Swpaul	blkrec->jb_oldfrags = frag;
218345155Swpaul	if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag)
218445155Swpaul		err_suj("Invalid fragment count %d oldfrags %d\n",
218545155Swpaul		    blkrec->jb_frags, frag);
218645155Swpaul	/*
218745155Swpaul	 * Detect dups.  If we detect a dup we always discard the oldest
218836270Swpaul	 * record as it is superseded by the new record.  This speeds up
218950462Swpaul	 * later stages but also eliminates free records which are used
219050462Swpaul	 * to indicate that the contents of indirects can be trusted.
219150462Swpaul	 */
219250462Swpaul	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
219336270Swpaul		blkrn = (struct jblkrec *)srec->sr_rec;
219436270Swpaul		if (blkrn->jb_ino != blkrec->jb_ino ||
219536270Swpaul		    blkrn->jb_lbn != blkrec->jb_lbn ||
219636270Swpaul		    blkrn->jb_blkno != blkrec->jb_blkno ||
219736270Swpaul		    blkrn->jb_frags != blkrec->jb_frags ||
219836270Swpaul		    blkrn->jb_oldfrags != blkrec->jb_oldfrags)
219936270Swpaul			continue;
220036735Sdfr		if (debug)
220136270Swpaul			printf("Removed dup.\n");
220236270Swpaul		/* Discard the free which is a dup with an alloc. */
220336270Swpaul		if (blkrec->jb_op == JOP_FREEBLK)
220436270Swpaul			return;
220536270Swpaul		TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next);
220636270Swpaul		free(srec);
220736270Swpaul		break;
220836270Swpaul	}
220936270Swpaul	srec = errmalloc(sizeof(*srec));
221036270Swpaul	srec->sr_rec = (union jrec *)blkrec;
221136270Swpaul	TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next);
221236270Swpaul}
221336270Swpaul
221436270Swpaulstatic void
221536270Swpaulino_build_trunc(struct jtrncrec *rec)
221636270Swpaul{
221750462Swpaul	struct suj_ino *sino;
221850462Swpaul
221950462Swpaul	if (debug)
222050462Swpaul		printf("ino_build_trunc: op %d ino %d, size %jd\n",
222150462Swpaul		    rec->jt_op, rec->jt_ino, rec->jt_size);
222250462Swpaul	sino = ino_lookup(rec->jt_ino, 1);
222350462Swpaul	if (rec->jt_op == JOP_SYNC) {
222450462Swpaul		sino->si_trunc = NULL;
222550462Swpaul		return;
222650462Swpaul	}
222750462Swpaul	if (sino->si_trunc == NULL || sino->si_trunc->jt_size > rec->jt_size)
222850462Swpaul		sino->si_trunc = rec;
222936270Swpaul}
223036270Swpaul
223136270Swpaul/*
223236270Swpaul * Build up tables of the operations we need to recover.
223336270Swpaul */
223450462Swpaulstatic void
223536270Swpaulsuj_build(void)
223636270Swpaul{
223736270Swpaul	struct suj_seg *seg;
223836270Swpaul	union jrec *rec;
223936270Swpaul	int off;
224036270Swpaul	int i;
224136270Swpaul
224236270Swpaul	TAILQ_FOREACH(seg, &allsegs, ss_next) {
224336270Swpaul		if (debug)
224450462Swpaul			printf("seg %jd has %d records, oldseq %jd.\n",
224550462Swpaul			    seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt,
224650462Swpaul			    seg->ss_rec.jsr_oldest);
224750462Swpaul		off = 0;
224850462Swpaul		rec = (union jrec *)seg->ss_blk;
224950462Swpaul		for (i = 0; i < seg->ss_rec.jsr_cnt; off += JREC_SIZE, rec++) {
225050462Swpaul			/* skip the segrec. */
225150462Swpaul			if ((off % real_dev_bsize) == 0)
225236270Swpaul				continue;
225336270Swpaul			switch (rec->rec_jrefrec.jr_op) {
225436270Swpaul			case JOP_ADDREF:
225536270Swpaul			case JOP_REMREF:
225636270Swpaul			case JOP_MVREF:
225736270Swpaul				ino_append(rec);
225836270Swpaul				break;
225936270Swpaul			case JOP_NEWBLK:
226036270Swpaul			case JOP_FREEBLK:
226136270Swpaul				blk_build((struct jblkrec *)rec);
226236270Swpaul				break;
226336270Swpaul			case JOP_TRUNC:
226436270Swpaul				ino_build_trunc((struct jtrncrec *)rec);
226536270Swpaul				break;
226636270Swpaul			default:
226736270Swpaul				err_suj("Unknown journal operation %d (%d)\n",
226836270Swpaul				    rec->rec_jrefrec.jr_op, off);
226936270Swpaul			}
227050462Swpaul			i++;
227136270Swpaul		}
227236270Swpaul	}
227336270Swpaul}
227450468Swpaul
227536270Swpaul/*
227636270Swpaul * Prune the journal segments to those we care about based on the
227736270Swpaul * oldest sequence in the newest segment.  Order the segment list
227836270Swpaul * based on sequence number.
227936270Swpaul */
228036270Swpaulstatic void
228136270Swpaulsuj_prune(void)
228236270Swpaul{
228336270Swpaul	struct suj_seg *seg;
228436270Swpaul	struct suj_seg *segn;
228536270Swpaul	uint64_t newseq;
228636270Swpaul	int discard;
228736270Swpaul
228836270Swpaul	if (debug)
228936270Swpaul		printf("Pruning up to %jd\n", oldseq);
229036270Swpaul	/* First free the expired segments. */
229136270Swpaul	TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
229236270Swpaul		if (seg->ss_rec.jsr_seq >= oldseq)
229336270Swpaul			continue;
229436270Swpaul		TAILQ_REMOVE(&allsegs, seg, ss_next);
229536270Swpaul		free(seg->ss_blk);
229639583Swpaul		free(seg);
229739583Swpaul	}
229839583Swpaul	/* Next ensure that segments are ordered properly. */
229936270Swpaul	seg = TAILQ_FIRST(&allsegs);
230036270Swpaul	if (seg == NULL) {
230139583Swpaul		if (debug)
230239583Swpaul			printf("Empty journal\n");
230339583Swpaul		return;
230436270Swpaul	}
230536270Swpaul	newseq = seg->ss_rec.jsr_seq;
230636270Swpaul	for (;;) {
230736270Swpaul		seg = TAILQ_LAST(&allsegs, seghd);
230839583Swpaul		if (seg->ss_rec.jsr_seq >= newseq)
230936270Swpaul			break;
231036270Swpaul		TAILQ_REMOVE(&allsegs, seg, ss_next);
231136270Swpaul		TAILQ_INSERT_HEAD(&allsegs, seg, ss_next);
231236270Swpaul		newseq = seg->ss_rec.jsr_seq;
231339583Swpaul
231436270Swpaul	}
231536270Swpaul	if (newseq != oldseq) {
231636270Swpaul		err_suj("Journal file sequence mismatch %jd != %jd\n",
231736270Swpaul		    newseq, oldseq);
231836270Swpaul	}
231936270Swpaul	/*
232036270Swpaul	 * The kernel may asynchronously write segments which can create
232136270Swpaul	 * gaps in the sequence space.  Throw away any segments after the
232236270Swpaul	 * gap as the kernel guarantees only those that are contiguously
232336270Swpaul	 * reachable are marked as completed.
232436270Swpaul	 */
232536270Swpaul	discard = 0;
232636270Swpaul	TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
232736270Swpaul		if (!discard && newseq++ == seg->ss_rec.jsr_seq) {
232836270Swpaul			jrecs += seg->ss_rec.jsr_cnt;
232936270Swpaul			jbytes += seg->ss_rec.jsr_blocks * real_dev_bsize;
233036270Swpaul			continue;
233136270Swpaul		}
233236270Swpaul		discard = 1;
233336270Swpaul		if (debug)
233436270Swpaul			printf("Journal order mismatch %jd != %jd pruning\n",
233536270Swpaul			    newseq-1, seg->ss_rec.jsr_seq);
233636270Swpaul		TAILQ_REMOVE(&allsegs, seg, ss_next);
233736270Swpaul		free(seg->ss_blk);
233836270Swpaul		free(seg);
233936270Swpaul	}
234036270Swpaul	if (debug)
234136270Swpaul		printf("Processing journal segments from %jd to %jd\n",
234236270Swpaul		    oldseq, newseq-1);
234336270Swpaul}
234436270Swpaul
234536270Swpaul/*
234636270Swpaul * Verify the journal inode before attempting to read records.
234736270Swpaul */
234848992Swpaulstatic int
234948992Swpaulsuj_verifyino(union dinode *ip)
235036270Swpaul{
235139583Swpaul
235236270Swpaul	if (DIP(ip, di_nlink) != 1) {
235348992Swpaul		printf("Invalid link count %d for journal inode %d\n",
235436270Swpaul		    DIP(ip, di_nlink), sujino);
235539583Swpaul		return (-1);
235636270Swpaul	}
235736270Swpaul
235836270Swpaul	if ((DIP(ip, di_flags) & (SF_IMMUTABLE | SF_NOUNLINK)) !=
2359	    (SF_IMMUTABLE | SF_NOUNLINK)) {
2360		printf("Invalid flags 0x%X for journal inode %d\n",
2361		    DIP(ip, di_flags), sujino);
2362		return (-1);
2363	}
2364
2365	if (DIP(ip, di_mode) != (IFREG | IREAD)) {
2366		printf("Invalid mode %o for journal inode %d\n",
2367		    DIP(ip, di_mode), sujino);
2368		return (-1);
2369	}
2370
2371	if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) {
2372		printf("Invalid size %jd for journal inode %d\n",
2373		    DIP(ip, di_size), sujino);
2374		return (-1);
2375	}
2376
2377	if (DIP(ip, di_modrev) != fs->fs_mtime) {
2378		printf("Journal timestamp does not match fs mount time\n");
2379		return (-1);
2380	}
2381
2382	return (0);
2383}
2384
2385struct jblocks {
2386	struct jextent *jb_extent;	/* Extent array. */
2387	int		jb_avail;	/* Available extents. */
2388	int		jb_used;	/* Last used extent. */
2389	int		jb_head;	/* Allocator head. */
2390	int		jb_off;		/* Allocator extent offset. */
2391};
2392struct jextent {
2393	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2394	int		je_blocks;	/* Disk block count. */
2395};
2396
2397struct jblocks *suj_jblocks;
2398
2399static struct jblocks *
2400jblocks_create(void)
2401{
2402	struct jblocks *jblocks;
2403	int size;
2404
2405	jblocks = errmalloc(sizeof(*jblocks));
2406	jblocks->jb_avail = 10;
2407	jblocks->jb_used = 0;
2408	jblocks->jb_head = 0;
2409	jblocks->jb_off = 0;
2410	size = sizeof(struct jextent) * jblocks->jb_avail;
2411	jblocks->jb_extent = errmalloc(size);
2412	bzero(jblocks->jb_extent, size);
2413
2414	return (jblocks);
2415}
2416
2417/*
2418 * Return the next available disk block and the amount of contiguous
2419 * free space it contains.
2420 */
2421static ufs2_daddr_t
2422jblocks_next(struct jblocks *jblocks, int bytes, int *actual)
2423{
2424	struct jextent *jext;
2425	ufs2_daddr_t daddr;
2426	int freecnt;
2427	int blocks;
2428
2429	blocks = bytes / disk->d_bsize;
2430	jext = &jblocks->jb_extent[jblocks->jb_head];
2431	freecnt = jext->je_blocks - jblocks->jb_off;
2432	if (freecnt == 0) {
2433		jblocks->jb_off = 0;
2434		if (++jblocks->jb_head > jblocks->jb_used)
2435			return (0);
2436		jext = &jblocks->jb_extent[jblocks->jb_head];
2437		freecnt = jext->je_blocks;
2438	}
2439	if (freecnt > blocks)
2440		freecnt = blocks;
2441	*actual = freecnt * disk->d_bsize;
2442	daddr = jext->je_daddr + jblocks->jb_off;
2443
2444	return (daddr);
2445}
2446
2447/*
2448 * Advance the allocation head by a specified number of bytes, consuming
2449 * one journal segment.
2450 */
2451static void
2452jblocks_advance(struct jblocks *jblocks, int bytes)
2453{
2454
2455	jblocks->jb_off += bytes / disk->d_bsize;
2456}
2457
2458static void
2459jblocks_destroy(struct jblocks *jblocks)
2460{
2461
2462	free(jblocks->jb_extent);
2463	free(jblocks);
2464}
2465
2466static void
2467jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks)
2468{
2469	struct jextent *jext;
2470	int size;
2471
2472	jext = &jblocks->jb_extent[jblocks->jb_used];
2473	/* Adding the first block. */
2474	if (jext->je_daddr == 0) {
2475		jext->je_daddr = daddr;
2476		jext->je_blocks = blocks;
2477		return;
2478	}
2479	/* Extending the last extent. */
2480	if (jext->je_daddr + jext->je_blocks == daddr) {
2481		jext->je_blocks += blocks;
2482		return;
2483	}
2484	/* Adding a new extent. */
2485	if (++jblocks->jb_used == jblocks->jb_avail) {
2486		jblocks->jb_avail *= 2;
2487		size = sizeof(struct jextent) * jblocks->jb_avail;
2488		jext = errmalloc(size);
2489		bzero(jext, size);
2490		bcopy(jblocks->jb_extent, jext,
2491		    sizeof(struct jextent) * jblocks->jb_used);
2492		free(jblocks->jb_extent);
2493		jblocks->jb_extent = jext;
2494	}
2495	jext = &jblocks->jb_extent[jblocks->jb_used];
2496	jext->je_daddr = daddr;
2497	jext->je_blocks = blocks;
2498
2499	return;
2500}
2501
2502/*
2503 * Add a file block from the journal to the extent map.  We can't read
2504 * each file block individually because the kernel treats it as a circular
2505 * buffer and segments may span mutliple contiguous blocks.
2506 */
2507static void
2508suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
2509{
2510
2511	jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags));
2512}
2513
2514static void
2515suj_read(void)
2516{
2517	uint8_t block[1 * 1024 * 1024];
2518	struct suj_seg *seg;
2519	struct jsegrec *recn;
2520	struct jsegrec *rec;
2521	ufs2_daddr_t blk;
2522	int readsize;
2523	int blocks;
2524	int recsize;
2525	int size;
2526	int i;
2527
2528	/*
2529	 * Read records until we exhaust the journal space.  If we find
2530	 * an invalid record we start searching for a valid segment header
2531	 * at the next block.  This is because we don't have a head/tail
2532	 * pointer and must recover the information indirectly.  At the gap
2533	 * between the head and tail we won't necessarily have a valid
2534	 * segment.
2535	 */
2536restart:
2537	for (;;) {
2538		size = sizeof(block);
2539		blk = jblocks_next(suj_jblocks, size, &readsize);
2540		if (blk == 0)
2541			return;
2542		size = readsize;
2543		/*
2544		 * Read 1MB at a time and scan for records within this block.
2545		 */
2546		if (bread(disk, blk, &block, size) == -1) {
2547			err_suj("Error reading journal block %jd\n",
2548			    (intmax_t)blk);
2549		}
2550		for (rec = (void *)block; size; size -= recsize,
2551		    rec = (struct jsegrec *)((uintptr_t)rec + recsize)) {
2552			recsize = real_dev_bsize;
2553			if (rec->jsr_time != fs->fs_mtime) {
2554				if (debug)
2555					printf("Rec time %jd != fs mtime %jd\n",
2556					    rec->jsr_time, fs->fs_mtime);
2557				jblocks_advance(suj_jblocks, recsize);
2558				continue;
2559			}
2560			if (rec->jsr_cnt == 0) {
2561				if (debug)
2562					printf("Found illegal count %d\n",
2563					    rec->jsr_cnt);
2564				jblocks_advance(suj_jblocks, recsize);
2565				continue;
2566			}
2567			blocks = rec->jsr_blocks;
2568			recsize = blocks * real_dev_bsize;
2569			if (recsize > size) {
2570				/*
2571				 * We may just have run out of buffer, restart
2572				 * the loop to re-read from this spot.
2573				 */
2574				if (size < fs->fs_bsize &&
2575				    size != readsize &&
2576				    recsize <= fs->fs_bsize)
2577					goto restart;
2578				if (debug)
2579					printf("Found invalid segsize %d > %d\n",
2580					    recsize, size);
2581				recsize = real_dev_bsize;
2582				jblocks_advance(suj_jblocks, recsize);
2583				continue;
2584			}
2585			/*
2586			 * Verify that all blocks in the segment are present.
2587			 */
2588			for (i = 1; i < blocks; i++) {
2589				recn = (void *)((uintptr_t)rec) + i *
2590				    real_dev_bsize;
2591				if (recn->jsr_seq == rec->jsr_seq &&
2592				    recn->jsr_time == rec->jsr_time)
2593					continue;
2594				if (debug)
2595					printf("Incomplete record %jd (%d)\n",
2596					    rec->jsr_seq, i);
2597				recsize = i * real_dev_bsize;
2598				jblocks_advance(suj_jblocks, recsize);
2599				goto restart;
2600			}
2601			seg = errmalloc(sizeof(*seg));
2602			seg->ss_blk = errmalloc(recsize);
2603			seg->ss_rec = *rec;
2604			bcopy((void *)rec, seg->ss_blk, recsize);
2605			if (rec->jsr_oldest > oldseq)
2606				oldseq = rec->jsr_oldest;
2607			TAILQ_INSERT_TAIL(&allsegs, seg, ss_next);
2608			jblocks_advance(suj_jblocks, recsize);
2609		}
2610	}
2611}
2612
2613/*
2614 * Search a directory block for the SUJ_FILE.
2615 */
2616static void
2617suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
2618{
2619	char block[MAXBSIZE];
2620	struct direct *dp;
2621	int bytes;
2622	int off;
2623
2624	if (sujino)
2625		return;
2626	bytes = lfragtosize(fs, frags);
2627	if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0)
2628		err_suj("Failed to read ROOTINO directory block %jd\n", blk);
2629	for (off = 0; off < bytes; off += dp->d_reclen) {
2630		dp = (struct direct *)&block[off];
2631		if (dp->d_reclen == 0)
2632			break;
2633		if (dp->d_ino == 0)
2634			continue;
2635		if (dp->d_namlen != strlen(SUJ_FILE))
2636			continue;
2637		if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
2638			continue;
2639		sujino = dp->d_ino;
2640		return;
2641	}
2642}
2643
2644/*
2645 * Orchestrate the verification of a filesystem via the softupdates journal.
2646 */
2647int
2648suj_check(const char *filesys)
2649{
2650	union dinode *jip;
2651	union dinode *ip;
2652	uint64_t blocks;
2653	int retval;
2654	struct suj_seg *seg;
2655	struct suj_seg *segn;
2656
2657	opendisk(filesys);
2658	TAILQ_INIT(&allsegs);
2659
2660	/*
2661	 * Set an exit point when SUJ check failed
2662	 */
2663	retval = setjmp(jmpbuf);
2664	if (retval != 0) {
2665		pwarn("UNEXPECTED SU+J INCONSISTENCY\n");
2666		TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
2667			TAILQ_REMOVE(&allsegs, seg, ss_next);
2668				free(seg->ss_blk);
2669				free(seg);
2670		}
2671		if (reply("FALLBACK TO FULL FSCK") == 0) {
2672			ckfini(0);
2673			exit(EEXIT);
2674		} else
2675			return (-1);
2676	}
2677
2678	/*
2679	 * Find the journal inode.
2680	 */
2681	ip = ino_read(ROOTINO);
2682	sujino = 0;
2683	ino_visit(ip, ROOTINO, suj_find, 0);
2684	if (sujino == 0) {
2685		printf("Journal inode removed.  Use tunefs to re-create.\n");
2686		sblock.fs_flags &= ~FS_SUJ;
2687		sblock.fs_sujfree = 0;
2688		return (-1);
2689	}
2690	/*
2691	 * Fetch the journal inode and verify it.
2692	 */
2693	jip = ino_read(sujino);
2694	printf("** SU+J Recovering %s\n", filesys);
2695	if (suj_verifyino(jip) != 0)
2696		return (-1);
2697	/*
2698	 * Build a list of journal blocks in jblocks before parsing the
2699	 * available journal blocks in with suj_read().
2700	 */
2701	printf("** Reading %jd byte journal from inode %d.\n",
2702	    DIP(jip, di_size), sujino);
2703	suj_jblocks = jblocks_create();
2704	blocks = ino_visit(jip, sujino, suj_add_block, 0);
2705	if (blocks != numfrags(fs, DIP(jip, di_size))) {
2706		printf("Sparse journal inode %d.\n", sujino);
2707		return (-1);
2708	}
2709	suj_read();
2710	jblocks_destroy(suj_jblocks);
2711	suj_jblocks = NULL;
2712	if (preen || reply("RECOVER")) {
2713		printf("** Building recovery table.\n");
2714		suj_prune();
2715		suj_build();
2716		cg_apply(cg_build);
2717		printf("** Resolving unreferenced inode list.\n");
2718		ino_unlinked();
2719		printf("** Processing journal entries.\n");
2720		cg_apply(cg_trunc);
2721		cg_apply(cg_check_blk);
2722		cg_apply(cg_check_ino);
2723	}
2724	if (preen == 0 && (jrecs > 0 || jbytes > 0) && reply("WRITE CHANGES") == 0)
2725		return (0);
2726	/*
2727	 * To remain idempotent with partial truncations the free bitmaps
2728	 * must be written followed by indirect blocks and lastly inode
2729	 * blocks.  This preserves access to the modified pointers until
2730	 * they are freed.
2731	 */
2732	cg_apply(cg_write);
2733	dblk_write();
2734	cg_apply(cg_write_inos);
2735	/* Write back superblock. */
2736	closedisk(filesys);
2737	if (jrecs > 0 || jbytes > 0) {
2738		printf("** %jd journal records in %jd bytes for %.2f%% utilization\n",
2739		    jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100);
2740		printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n",
2741		    freeinos, freedir, freeblocks, freefrags);
2742	}
2743
2744	return (0);
2745}
2746