1/*      $NetBSD: coalesce.c,v 1.18 2009/08/06 00:51:55 pooka Exp $  */
2
3/*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant@hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/param.h>
33#include <sys/mount.h>
34#include <sys/time.h>
35#include <sys/resource.h>
36#include <sys/types.h>
37#include <sys/wait.h>
38#include <sys/mman.h>
39
40#include <ufs/ufs/dinode.h>
41#include <ufs/lfs/lfs.h>
42
43#include <fcntl.h>
44#include <signal.h>
45#include <stdio.h>
46#include <stdlib.h>
47#include <string.h>
48#include <time.h>
49#include <unistd.h>
50#include <util.h>
51#include <errno.h>
52#include <err.h>
53
54#include <syslog.h>
55
56#include "bufcache.h"
57#include "vnode.h"
58#include "cleaner.h"
59#include "kernelops.h"
60
61extern int debug, do_mmap;
62
63int log2int(int n)
64{
65	int log;
66
67	log = 0;
68	while (n > 0) {
69		++log;
70		n >>= 1;
71	}
72	return log - 1;
73}
74
75enum coalesce_returncodes {
76	COALESCE_OK = 0,
77	COALESCE_NOINODE,
78	COALESCE_TOOSMALL,
79	COALESCE_BADSIZE,
80	COALESCE_BADBLOCKSIZE,
81	COALESCE_NOMEM,
82	COALESCE_BADBMAPV,
83	COALESCE_BADMARKV,
84	COALESCE_NOTWORTHIT,
85	COALESCE_NOTHINGLEFT,
86	COALESCE_EIO,
87
88	COALESCE_MAXERROR
89};
90
91const char *coalesce_return[] = {
92	"Successfully coalesced",
93	"File not in use or inode not found",
94	"Not large enough to coalesce",
95	"Negative size",
96	"Not enough blocks to account for size",
97	"Malloc failed",
98	"LFCNBMAPV failed",
99	"Not broken enough to fix",
100	"Too many blocks not found",
101	"Too many blocks found in active segments",
102	"I/O error",
103
104	"No such error"
105};
106
107static struct ufs1_dinode *
108get_dinode(struct clfs *fs, ino_t ino)
109{
110	IFILE *ifp;
111	daddr_t daddr;
112	struct ubuf *bp;
113	struct ufs1_dinode *dip, *r;
114
115	lfs_ientry(&ifp, fs, ino, &bp);
116	daddr = ifp->if_daddr;
117	brelse(bp, 0);
118
119	if (daddr == 0x0)
120		return NULL;
121
122	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
123	for (dip = (struct ufs1_dinode *)bp->b_data;
124	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
125		if (dip->di_inumber == ino) {
126			r = (struct ufs1_dinode *)malloc(sizeof(*r));
127			if (r == NULL)
128				break;
129			memcpy(r, dip, sizeof(*r));
130			brelse(bp, 0);
131			return r;
132		}
133	brelse(bp, 0);
134	return NULL;
135}
136
137/*
138 * Find out if this inode's data blocks are discontinuous; if they are,
139 * rewrite them using markv.  Return the number of inodes rewritten.
140 */
141static int
142clean_inode(struct clfs *fs, ino_t ino)
143{
144	BLOCK_INFO *bip = NULL, *tbip;
145	CLEANERINFO cip;
146	struct ubuf *bp;
147	struct ufs1_dinode *dip;
148	struct clfs_seguse *sup;
149	struct lfs_fcntl_markv /* {
150		BLOCK_INFO *blkiov;
151		int blkcnt;
152	} */ lim;
153	daddr_t toff;
154	int i;
155	int nb, onb, noff;
156	int retval;
157	int bps;
158
159	dip = get_dinode(fs, ino);
160	if (dip == NULL)
161		return COALESCE_NOINODE;
162
163	/* Compute file block size, set up for bmapv */
164	onb = nb = lblkno(fs, dip->di_size);
165
166	/* XXX for now, don't do any file small enough to have fragments */
167	if (nb < NDADDR) {
168		free(dip);
169		return COALESCE_TOOSMALL;
170	}
171
172	/* Sanity checks */
173#if 0	/* di_size is uint64_t -- this is a noop */
174	if (dip->di_size < 0) {
175		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
176		free(dip);
177		return COALESCE_BADSIZE;
178	}
179#endif
180	if (nb > dip->di_blocks) {
181		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
182		     dip->di_blocks);
183		free(dip);
184		return COALESCE_BADBLOCKSIZE;
185	}
186
187	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
188	if (bip == NULL) {
189		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
190		    (unsigned long long)ino, nb);
191		free(dip);
192		return COALESCE_NOMEM;
193	}
194	for (i = 0; i < nb; i++) {
195		memset(bip + i, 0, sizeof(BLOCK_INFO));
196		bip[i].bi_inode = ino;
197		bip[i].bi_lbn = i;
198		bip[i].bi_version = dip->di_gen;
199		/* Don't set the size, but let lfs_bmap fill it in */
200	}
201	lim.blkiov = bip;
202	lim.blkcnt = nb;
203	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
204		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
205		       fs->lfs_fsmnt);
206		retval = COALESCE_BADBMAPV;
207		goto out;
208	}
209#if 0
210	for (i = 0; i < nb; i++) {
211		printf("bi_size = %d, bi_ino = %d, "
212		    "bi_lbn = %d, bi_daddr = %d\n",
213		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
214		    bip[i].bi_daddr);
215	}
216#endif
217	noff = toff = 0;
218	for (i = 1; i < nb; i++) {
219		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
220			++noff;
221		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
222		    - fs->lfs_frag) >> fs->lfs_fbshift;
223	}
224
225	/*
226	 * If this file is not discontinuous, there's no point in rewriting it.
227	 *
228	 * Explicitly allow a certain amount of discontinuity, since large
229	 * files will be broken among segments and medium-sized files
230	 * can have a break or two and it's okay.
231	 */
232	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
233	    segtod(fs, noff) * 2 < nb) {
234		retval = COALESCE_NOTWORTHIT;
235		goto out;
236	} else if (debug)
237		syslog(LOG_DEBUG, "ino %llu total discontinuity "
238		    "%d (%lld) for %d blocks", (unsigned long long)ino,
239		    noff, (long long)toff, nb);
240
241	/* Search for blocks in active segments; don't move them. */
242	for (i = 0; i < nb; i++) {
243		if (bip[i].bi_daddr <= 0)
244			continue;
245		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
246		if (sup->flags & SEGUSE_ACTIVE)
247			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
248	}
249
250	/*
251	 * Get rid of any blocks we've marked dead.  If this is an older
252	 * kernel that doesn't have bmapv fill in the block sizes, we'll
253	 * toss everything here.
254	 */
255	onb = nb;
256	toss_old_blocks(fs, &bip, &nb, NULL);
257	nb = i;
258
259	/*
260	 * We may have tossed enough blocks that it is no longer worthwhile
261	 * to rewrite this inode.
262	 */
263	if (nb == 0 || onb - nb > log2int(onb)) {
264		if (debug)
265			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
266		retval = COALESCE_NOTHINGLEFT;
267		goto out;
268	}
269
270	/*
271	 * We are going to rewrite this inode.
272	 * For any remaining blocks, read in their contents.
273	 */
274	for (i = 0; i < nb; i++) {
275		bip[i].bi_bp = malloc(bip[i].bi_size);
276		if (bip[i].bi_bp == NULL) {
277			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
278			    bip[i].bi_size);
279			retval = COALESCE_NOMEM;
280			goto out;
281		}
282
283		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
284			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
285			retval = COALESCE_EIO;
286			goto out;
287		}
288	}
289	if (debug)
290		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
291		    (unsigned long long)ino, nb);
292
293	/*
294	 * Write in segment-sized chunks.  If at any point we'd write more
295	 * than half of the available segments, sleep until that's not
296	 * true any more.
297	 */
298	bps = segtod(fs, 1);
299	for (tbip = bip; tbip < bip + nb; tbip += bps) {
300		do {
301			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
302			cip = *(CLEANERINFO *)bp->b_data;
303			brelse(bp, B_INVAL);
304
305			if (cip.clean < 4) /* XXX magic number 4 */
306				kops.ko_fcntl(fs->clfs_ifilefd,
307				    LFCNSEGWAIT, NULL);
308		} while(cip.clean < 4);
309
310		lim.blkiov = tbip;
311		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
312		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
313			retval = COALESCE_BADMARKV;
314			goto out;
315		}
316	}
317
318	retval = COALESCE_OK;
319out:
320	free(dip);
321	if (bip) {
322		for (i = 0; i < onb; i++)
323			if (bip[i].bi_bp)
324				free(bip[i].bi_bp);
325		free(bip);
326	}
327	return retval;
328}
329
330/*
331 * Try coalescing every inode in the filesystem.
332 * Return the number of inodes actually altered.
333 */
334int clean_all_inodes(struct clfs *fs)
335{
336	int i, r, maxino;
337	int totals[COALESCE_MAXERROR];
338	struct stat st;
339
340	memset(totals, 0, sizeof(totals));
341
342	fstat(fs->clfs_ifilefd, &st);
343	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
344		fs->lfs_segtabsz - fs->lfs_cleansz;
345
346	for (i = 0; i < maxino; i++) {
347		r = clean_inode(fs, i);
348		++totals[r];
349	}
350
351	for (i = 0; i < COALESCE_MAXERROR; i++)
352		if (totals[i])
353			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
354			       totals[i]);
355
356	return totals[COALESCE_OK];
357}
358
359/*
360 * Fork a child process to coalesce this fs.
361 */
362int
363fork_coalesce(struct clfs *fs)
364{
365	static pid_t childpid;
366	int num;
367
368	/*
369	 * If already running a coalescing child, don't start a new one.
370	 */
371	if (childpid) {
372		if (waitpid(childpid, NULL, WNOHANG) == childpid)
373			childpid = 0;
374	}
375	if (childpid && kill(childpid, 0) >= 0) {
376		/* already running a coalesce process */
377		if (debug)
378			syslog(LOG_DEBUG, "coalescing already in progress");
379		return 0;
380	}
381
382	/*
383	 * Fork a child and let the child coalease
384	 */
385	childpid = fork();
386	if (childpid < 0) {
387		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
388		return 0;
389	} else if (childpid == 0) {
390		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
391		       fs->lfs_fsmnt, getpid());
392		num = clean_all_inodes(fs);
393		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
394		       fs->lfs_fsmnt, num);
395		exit(0);
396	}
397
398	return 0;
399}
400