bplist.c revision 1.1.1.2
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/bplist.h>
27#include <sys/zfs_context.h>
28
29void
30bplist_init(bplist_t *bpl)
31{
32	bzero(bpl, sizeof (*bpl));
33	mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
34}
35
36void
37bplist_fini(bplist_t *bpl)
38{
39	ASSERT(bpl->bpl_queue == NULL);
40	mutex_destroy(&bpl->bpl_lock);
41}
42
43static int
44bplist_hold(bplist_t *bpl)
45{
46	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
47	if (bpl->bpl_dbuf == NULL) {
48		int err = dmu_bonus_hold(bpl->bpl_mos,
49		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
50		if (err)
51			return (err);
52		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
53	}
54	return (0);
55}
56
57uint64_t
58bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
59{
60	int size;
61
62	size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
63	    BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
64
65	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
66	    DMU_OT_BPLIST_HDR, size, tx));
67}
68
69void
70bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
71{
72	VERIFY(dmu_object_free(mos, object, tx) == 0);
73}
74
75int
76bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
77{
78	dmu_object_info_t doi;
79	int err;
80
81	err = dmu_object_info(mos, object, &doi);
82	if (err)
83		return (err);
84
85	mutex_enter(&bpl->bpl_lock);
86
87	ASSERT(bpl->bpl_dbuf == NULL);
88	ASSERT(bpl->bpl_phys == NULL);
89	ASSERT(bpl->bpl_cached_dbuf == NULL);
90	ASSERT(bpl->bpl_queue == NULL);
91	ASSERT(object != 0);
92	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
93	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
94
95	bpl->bpl_mos = mos;
96	bpl->bpl_object = object;
97	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
98	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
99	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
100
101	mutex_exit(&bpl->bpl_lock);
102	return (0);
103}
104
105void
106bplist_close(bplist_t *bpl)
107{
108	mutex_enter(&bpl->bpl_lock);
109
110	ASSERT(bpl->bpl_queue == NULL);
111
112	if (bpl->bpl_cached_dbuf) {
113		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
114		bpl->bpl_cached_dbuf = NULL;
115	}
116	if (bpl->bpl_dbuf) {
117		dmu_buf_rele(bpl->bpl_dbuf, bpl);
118		bpl->bpl_dbuf = NULL;
119		bpl->bpl_phys = NULL;
120	}
121
122	mutex_exit(&bpl->bpl_lock);
123}
124
125boolean_t
126bplist_empty(bplist_t *bpl)
127{
128	boolean_t rv;
129
130	if (bpl->bpl_object == 0)
131		return (B_TRUE);
132
133	mutex_enter(&bpl->bpl_lock);
134	VERIFY(0 == bplist_hold(bpl)); /* XXX */
135	rv = (bpl->bpl_phys->bpl_entries == 0);
136	mutex_exit(&bpl->bpl_lock);
137
138	return (rv);
139}
140
141static int
142bplist_cache(bplist_t *bpl, uint64_t blkid)
143{
144	int err = 0;
145
146	if (bpl->bpl_cached_dbuf == NULL ||
147	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
148		if (bpl->bpl_cached_dbuf != NULL)
149			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
150		err = dmu_buf_hold(bpl->bpl_mos,
151		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
152		    bpl, &bpl->bpl_cached_dbuf);
153		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
154		    1ULL << bpl->bpl_blockshift);
155	}
156	return (err);
157}
158
159int
160bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
161{
162	uint64_t blk, off;
163	blkptr_t *bparray;
164	int err;
165
166	mutex_enter(&bpl->bpl_lock);
167
168	err = bplist_hold(bpl);
169	if (err) {
170		mutex_exit(&bpl->bpl_lock);
171		return (err);
172	}
173
174	if (*itorp >= bpl->bpl_phys->bpl_entries) {
175		mutex_exit(&bpl->bpl_lock);
176		return (ENOENT);
177	}
178
179	blk = *itorp >> bpl->bpl_bpshift;
180	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
181
182	err = bplist_cache(bpl, blk);
183	if (err) {
184		mutex_exit(&bpl->bpl_lock);
185		return (err);
186	}
187
188	bparray = bpl->bpl_cached_dbuf->db_data;
189	*bp = bparray[off];
190	(*itorp)++;
191	mutex_exit(&bpl->bpl_lock);
192	return (0);
193}
194
195int
196bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
197{
198	uint64_t blk, off;
199	blkptr_t *bparray;
200	int err;
201
202	ASSERT(!BP_IS_HOLE(bp));
203	mutex_enter(&bpl->bpl_lock);
204	err = bplist_hold(bpl);
205	if (err)
206		return (err);
207
208	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
209	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
210
211	err = bplist_cache(bpl, blk);
212	if (err) {
213		mutex_exit(&bpl->bpl_lock);
214		return (err);
215	}
216
217	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
218	bparray = bpl->bpl_cached_dbuf->db_data;
219	bparray[off] = *bp;
220
221	/* We never need the fill count. */
222	bparray[off].blk_fill = 0;
223
224	/* The bplist will compress better if we can leave off the checksum */
225	if (!BP_GET_DEDUP(&bparray[off]))
226		bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
227
228	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
229	bpl->bpl_phys->bpl_entries++;
230	bpl->bpl_phys->bpl_bytes +=
231	    bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
232	if (bpl->bpl_havecomp) {
233		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
234		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
235	}
236	mutex_exit(&bpl->bpl_lock);
237
238	return (0);
239}
240
241void
242bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx)
243{
244	VERIFY(bplist_enqueue(bpl, bp, tx) == 0);
245}
246
247/*
248 * Deferred entry; will be processed later by bplist_sync().
249 */
250void
251bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
252{
253	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
254
255	ASSERT(!BP_IS_HOLE(bp));
256	mutex_enter(&bpl->bpl_lock);
257	bpq->bpq_blk = *bp;
258	bpq->bpq_next = bpl->bpl_queue;
259	bpl->bpl_queue = bpq;
260	mutex_exit(&bpl->bpl_lock);
261}
262
263void
264bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx)
265{
266	bplist_q_t *bpq;
267
268	mutex_enter(&bpl->bpl_lock);
269	while ((bpq = bpl->bpl_queue) != NULL) {
270		bpl->bpl_queue = bpq->bpq_next;
271		mutex_exit(&bpl->bpl_lock);
272		func(arg, &bpq->bpq_blk, tx);
273		kmem_free(bpq, sizeof (*bpq));
274		mutex_enter(&bpl->bpl_lock);
275	}
276	mutex_exit(&bpl->bpl_lock);
277}
278
279void
280bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
281{
282	mutex_enter(&bpl->bpl_lock);
283	ASSERT3P(bpl->bpl_queue, ==, NULL);
284	VERIFY(0 == bplist_hold(bpl));
285	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
286	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
287	    bpl->bpl_object, 0, -1ULL, tx));
288	bpl->bpl_phys->bpl_entries = 0;
289	bpl->bpl_phys->bpl_bytes = 0;
290	if (bpl->bpl_havecomp) {
291		bpl->bpl_phys->bpl_comp = 0;
292		bpl->bpl_phys->bpl_uncomp = 0;
293	}
294	mutex_exit(&bpl->bpl_lock);
295}
296
297int
298bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
299{
300	int err;
301
302	mutex_enter(&bpl->bpl_lock);
303
304	err = bplist_hold(bpl);
305	if (err) {
306		mutex_exit(&bpl->bpl_lock);
307		return (err);
308	}
309
310	*usedp = bpl->bpl_phys->bpl_bytes;
311	if (bpl->bpl_havecomp) {
312		*compp = bpl->bpl_phys->bpl_comp;
313		*uncompp = bpl->bpl_phys->bpl_uncomp;
314	}
315	mutex_exit(&bpl->bpl_lock);
316
317	if (!bpl->bpl_havecomp) {
318		uint64_t itor = 0, comp = 0, uncomp = 0;
319		blkptr_t bp;
320
321		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
322			comp += BP_GET_PSIZE(&bp);
323			uncomp += BP_GET_UCSIZE(&bp);
324		}
325		if (err == ENOENT)
326			err = 0;
327		*compp = comp;
328		*uncompp = uncomp;
329	}
330
331	return (err);
332}
333
334/*
335 * Return (in *dsizep) the amount of space on the deadlist which is:
336 * mintxg < blk_birth <= maxtxg
337 */
338int
339bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
340    uint64_t *dsizep)
341{
342	uint64_t size = 0;
343	uint64_t itor = 0;
344	blkptr_t bp;
345	int err;
346
347	/*
348	 * As an optimization, if they want the whole txg range, just
349	 * get bpl_bytes rather than iterating over the bps.
350	 */
351	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
352		mutex_enter(&bpl->bpl_lock);
353		err = bplist_hold(bpl);
354		if (err == 0)
355			*dsizep = bpl->bpl_phys->bpl_bytes;
356		mutex_exit(&bpl->bpl_lock);
357		return (err);
358	}
359
360	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
361		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
362			size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp);
363		}
364	}
365	if (err == ENOENT)
366		err = 0;
367	*dsizep = size;
368	return (err);
369}
370