bplist.c revision 1.1.1.1
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/bplist.h>
27#include <sys/zfs_context.h>
28
29static int
30bplist_hold(bplist_t *bpl)
31{
32	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
33	if (bpl->bpl_dbuf == NULL) {
34		int err = dmu_bonus_hold(bpl->bpl_mos,
35		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
36		if (err)
37			return (err);
38		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
39	}
40	return (0);
41}
42
43uint64_t
44bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
45{
46	int size;
47
48	size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
49	    BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
50
51	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
52	    DMU_OT_BPLIST_HDR, size, tx));
53}
54
55void
56bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
57{
58	VERIFY(dmu_object_free(mos, object, tx) == 0);
59}
60
61int
62bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
63{
64	dmu_object_info_t doi;
65	int err;
66
67	err = dmu_object_info(mos, object, &doi);
68	if (err)
69		return (err);
70
71	mutex_enter(&bpl->bpl_lock);
72
73	ASSERT(bpl->bpl_dbuf == NULL);
74	ASSERT(bpl->bpl_phys == NULL);
75	ASSERT(bpl->bpl_cached_dbuf == NULL);
76	ASSERT(bpl->bpl_queue == NULL);
77	ASSERT(object != 0);
78	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
79	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
80
81	bpl->bpl_mos = mos;
82	bpl->bpl_object = object;
83	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
84	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
85	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
86
87	mutex_exit(&bpl->bpl_lock);
88	return (0);
89}
90
91void
92bplist_close(bplist_t *bpl)
93{
94	mutex_enter(&bpl->bpl_lock);
95
96	ASSERT(bpl->bpl_queue == NULL);
97
98	if (bpl->bpl_cached_dbuf) {
99		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
100		bpl->bpl_cached_dbuf = NULL;
101	}
102	if (bpl->bpl_dbuf) {
103		dmu_buf_rele(bpl->bpl_dbuf, bpl);
104		bpl->bpl_dbuf = NULL;
105		bpl->bpl_phys = NULL;
106	}
107
108	mutex_exit(&bpl->bpl_lock);
109}
110
111boolean_t
112bplist_empty(bplist_t *bpl)
113{
114	boolean_t rv;
115
116	if (bpl->bpl_object == 0)
117		return (B_TRUE);
118
119	mutex_enter(&bpl->bpl_lock);
120	VERIFY(0 == bplist_hold(bpl)); /* XXX */
121	rv = (bpl->bpl_phys->bpl_entries == 0);
122	mutex_exit(&bpl->bpl_lock);
123
124	return (rv);
125}
126
127static int
128bplist_cache(bplist_t *bpl, uint64_t blkid)
129{
130	int err = 0;
131
132	if (bpl->bpl_cached_dbuf == NULL ||
133	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
134		if (bpl->bpl_cached_dbuf != NULL)
135			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
136		err = dmu_buf_hold(bpl->bpl_mos,
137		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
138		    bpl, &bpl->bpl_cached_dbuf);
139		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
140		    1ULL << bpl->bpl_blockshift);
141	}
142	return (err);
143}
144
145int
146bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
147{
148	uint64_t blk, off;
149	blkptr_t *bparray;
150	int err;
151
152	mutex_enter(&bpl->bpl_lock);
153
154	err = bplist_hold(bpl);
155	if (err) {
156		mutex_exit(&bpl->bpl_lock);
157		return (err);
158	}
159
160	if (*itorp >= bpl->bpl_phys->bpl_entries) {
161		mutex_exit(&bpl->bpl_lock);
162		return (ENOENT);
163	}
164
165	blk = *itorp >> bpl->bpl_bpshift;
166	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
167
168	err = bplist_cache(bpl, blk);
169	if (err) {
170		mutex_exit(&bpl->bpl_lock);
171		return (err);
172	}
173
174	bparray = bpl->bpl_cached_dbuf->db_data;
175	*bp = bparray[off];
176	(*itorp)++;
177	mutex_exit(&bpl->bpl_lock);
178	return (0);
179}
180
181int
182bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
183{
184	uint64_t blk, off;
185	blkptr_t *bparray;
186	int err;
187
188	ASSERT(!BP_IS_HOLE(bp));
189	mutex_enter(&bpl->bpl_lock);
190	err = bplist_hold(bpl);
191	if (err)
192		return (err);
193
194	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
195	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
196
197	err = bplist_cache(bpl, blk);
198	if (err) {
199		mutex_exit(&bpl->bpl_lock);
200		return (err);
201	}
202
203	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
204	bparray = bpl->bpl_cached_dbuf->db_data;
205	bparray[off] = *bp;
206
207	/* We never need the fill count. */
208	bparray[off].blk_fill = 0;
209
210	/* The bplist will compress better if we can leave off the checksum */
211	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
212
213	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
214	bpl->bpl_phys->bpl_entries++;
215	bpl->bpl_phys->bpl_bytes +=
216	    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
217	if (bpl->bpl_havecomp) {
218		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
219		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
220	}
221	mutex_exit(&bpl->bpl_lock);
222
223	return (0);
224}
225
226/*
227 * Deferred entry; will be written later by bplist_sync().
228 */
229void
230bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
231{
232	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
233
234	ASSERT(!BP_IS_HOLE(bp));
235	mutex_enter(&bpl->bpl_lock);
236	bpq->bpq_blk = *bp;
237	bpq->bpq_next = bpl->bpl_queue;
238	bpl->bpl_queue = bpq;
239	mutex_exit(&bpl->bpl_lock);
240}
241
242void
243bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
244{
245	bplist_q_t *bpq;
246
247	mutex_enter(&bpl->bpl_lock);
248	while ((bpq = bpl->bpl_queue) != NULL) {
249		bpl->bpl_queue = bpq->bpq_next;
250		mutex_exit(&bpl->bpl_lock);
251		VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
252		kmem_free(bpq, sizeof (*bpq));
253		mutex_enter(&bpl->bpl_lock);
254	}
255	mutex_exit(&bpl->bpl_lock);
256}
257
258void
259bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
260{
261	mutex_enter(&bpl->bpl_lock);
262	ASSERT3P(bpl->bpl_queue, ==, NULL);
263	VERIFY(0 == bplist_hold(bpl));
264	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
265	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
266	    bpl->bpl_object, 0, -1ULL, tx));
267	bpl->bpl_phys->bpl_entries = 0;
268	bpl->bpl_phys->bpl_bytes = 0;
269	if (bpl->bpl_havecomp) {
270		bpl->bpl_phys->bpl_comp = 0;
271		bpl->bpl_phys->bpl_uncomp = 0;
272	}
273	mutex_exit(&bpl->bpl_lock);
274}
275
276int
277bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
278{
279	int err;
280
281	mutex_enter(&bpl->bpl_lock);
282
283	err = bplist_hold(bpl);
284	if (err) {
285		mutex_exit(&bpl->bpl_lock);
286		return (err);
287	}
288
289	*usedp = bpl->bpl_phys->bpl_bytes;
290	if (bpl->bpl_havecomp) {
291		*compp = bpl->bpl_phys->bpl_comp;
292		*uncompp = bpl->bpl_phys->bpl_uncomp;
293	}
294	mutex_exit(&bpl->bpl_lock);
295
296	if (!bpl->bpl_havecomp) {
297		uint64_t itor = 0, comp = 0, uncomp = 0;
298		blkptr_t bp;
299
300		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
301			comp += BP_GET_PSIZE(&bp);
302			uncomp += BP_GET_UCSIZE(&bp);
303		}
304		if (err == ENOENT)
305			err = 0;
306		*compp = comp;
307		*uncompp = uncomp;
308	}
309
310	return (err);
311}
312
313/*
314 * Return (in *dasizep) the amount of space on the deadlist which is:
315 * mintxg < blk_birth <= maxtxg
316 */
317int
318bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
319    uint64_t *dasizep)
320{
321	uint64_t size = 0;
322	uint64_t itor = 0;
323	blkptr_t bp;
324	int err;
325
326	/*
327	 * As an optimization, if they want the whole txg range, just
328	 * get bpl_bytes rather than iterating over the bps.
329	 */
330	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
331		mutex_enter(&bpl->bpl_lock);
332		err = bplist_hold(bpl);
333		if (err == 0)
334			*dasizep = bpl->bpl_phys->bpl_bytes;
335		mutex_exit(&bpl->bpl_lock);
336		return (err);
337	}
338
339	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
340		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
341			size +=
342			    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
343		}
344	}
345	if (err == ENOENT)
346		err = 0;
347	*dasizep = size;
348	return (err);
349}
350