// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code.  The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs.  Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code.  Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type.  When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */
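
/*
 * As a rough sketch of the expected call order (the xfs_foobt_* names below
 * are hypothetical stand-ins for a specific btree type's staging helpers, and
 * error handling is omitted):
 *
 *	struct xbtree_afakeroot	afake = { 0 };
 *	struct xfs_btree_cur	*cur;
 *
 *	cur = xfs_foobt_stage_cursor(mp, &afake, ...);
 *	... bulk load records through the staging cursor ...
 *	xfs_foobt_commit_staged_btree(cur, tp, ...);
 *	xfs_btree_del_cursor(cur, 0);
 */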

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor.  Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
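
/*
 * A minimal sketch of the AG-side hooks, assuming a hypothetical "foo" btree
 * type; the xfs_foobt_* helpers and the logging step are placeholders for
 * whatever the specific btree type actually provides:
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(mp, afake, pag)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_cursor(mp, NULL, pag);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp)
 *	{
 *		update the AG header from afake->af_root, af_levels, and
 *		af_blocks, log the changes, and then:
 *
 *		xfs_btree_commit_afakeroot(cur, tp, agbp);
 *	}
 */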

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_afakeroot		*afake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	cur->bc_ag.afake = NULL;
	cur->bc_ag.agbp = agbp;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor.  This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
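
/*
 * A rough sketch of the fake root setup described above; the sizing and
 * allocation details and the xfs_foobt_stage_cursor() call are illustrative
 * placeholders only:
 *
 *	struct xbtree_ifakeroot	ifake = { 0 };
 *
 *	ifake.if_fork_size = bytes available to the fork in the inode;
 *	ifake.if_fork = freshly allocated struct xfs_ifork;
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 *	cur = xfs_foobt_stage_cursor(mp, &ifake, ip, whichfork);
 */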

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ino.forksize = ifake->if_fork_size;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block).  When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor.  Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree.  See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block().  Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor.  The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure.  For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree.  For each
 * level, the desired number of items per block is defined as:
 *
 * desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 * blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs.  The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 * npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division.  If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */
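
/*
 * As a worked example with made-up numbers: suppose a leaf level must hold
 * nr_items = 1000 records with maxrecs = 100, minrecs = 50, and a slack
 * factor of 25.  Then desired = max(50, 100 - 25) = 75 and
 * blocks = floor(1000 / 75) = 13.  Dividing again, npb = 1000 / 13 = 76 with
 * a remainder of 12, so the twelve leftmost blocks get 77 records apiece and
 * the last block gets 76.  Because 77 does not exceed maxrecs, no further
 * adjustment of the block count is needed.
 */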

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	struct xfs_buf			**bpp)
{
	struct xfs_buf			*bp = *bpp;
	int				error;

	if (!bp)
		return 0;

	/*
	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
	 * xfs_buf_read will not pointlessly reread the contents from the disk.
	 */
	bp->b_flags |= XBF_DONE;

	xfs_buf_delwri_queue_here(bp, buffers_list);
	xfs_buf_relse(bp);
	*bpp = NULL;
	bbl->nr_dirty++;

	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
		return 0;

	error = xfs_buf_delwri_submit(buffers_list);
	if (error)
		return error;

	bbl->nr_dirty = 0;
	return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
		ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
				level, nr_this_block, cur->bc_ino.ip->i_ino);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
	if (ret)
		return ret;

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_records_fn	get_records,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j = 1;
	int				ret;

	/* Fill the leaf block with records. */
	while (j <= recs_this_block) {
		ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
		if (ret < 0)
			return ret;
		j += ret;
	}

	return 0;
}

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree.  A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr.  On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		/*
		 * Read the lower-level block in case the buffer for it has
		 * been reclaimed.  LRU refs will be set on the block, which is
		 * desirable if the new btree commits.
		 */
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree.  Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb.  The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level.  For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	/*
	 * Compute the absolute maximum number of records that we can store in
	 * the ondisk block or inode root.
	 */
	if (cur->bc_ops->get_dmaxrecs)
		maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
	else
		maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block.  Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks.  Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
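
/*
 * For example, in the common case where minrecs is half of maxrecs (say
 * maxrecs = 100 and minrecs = 50), a negative slack value becomes
 * 100 - ((100 + 50) >> 1) = 25, so blocks are filled with about 75 records
 * each, i.e. roughly 75% loading.
 */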

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl.  Caller must ensure that the btree cursor is a staging
 * cursor.  This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks.  Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if (level != 0 && nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1.  Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root.  Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
	bbl->nr_dirty = 0;

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases.  blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;

	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
	if (ret)
		goto out;

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;

		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
		if (ret)
			goto out;

		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk.  If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail.  This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}
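
/*
 * Putting the pieces together, a hypothetical caller might drive a bulk load
 * roughly like this (the xfoo_* callback names are placeholders and error
 * handling is omitted):
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= xfoo_get_records,
 *		.claim_block	= xfoo_claim_block,
 *		.leaf_slack	= -1,	(negative means "pick a default")
 *		.node_slack	= -1,
 *	};
 *
 *	cur = ...stage a btree cursor against a fake root...;
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	...allocate bbl.nr_blocks blocks for ->claim_block() to hand out...;
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	...log the new root and commit the staging cursor...;
 *	xfs_btree_del_cursor(cur, error);
 */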