1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/blkdev.h>
8#include <linux/radix-tree.h>
9#include <linux/writeback.h>
10#include <linux/workqueue.h>
11#include <linux/kthread.h>
12#include <linux/slab.h>
13#include <linux/migrate.h>
14#include <linux/ratelimit.h>
15#include <linux/uuid.h>
16#include <linux/semaphore.h>
17#include <linux/error-injection.h>
18#include <linux/crc32c.h>
19#include <linux/sched/mm.h>
20#include <asm/unaligned.h>
21#include <crypto/hash.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "btrfs_inode.h"
26#include "volumes.h"
27#include "print-tree.h"
28#include "locking.h"
29#include "tree-log.h"
30#include "free-space-cache.h"
31#include "free-space-tree.h"
32#include "check-integrity.h"
33#include "rcu-string.h"
34#include "dev-replace.h"
35#include "raid56.h"
36#include "sysfs.h"
37#include "qgroup.h"
38#include "compression.h"
39#include "tree-checker.h"
40#include "ref-verify.h"
41#include "block-group.h"
42#include "discard.h"
43#include "space-info.h"
44#include "zoned.h"
45#include "subpage.h"
46
47#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
48				 BTRFS_HEADER_FLAG_RELOC |\
49				 BTRFS_SUPER_FLAG_ERROR |\
50				 BTRFS_SUPER_FLAG_SEEDING |\
51				 BTRFS_SUPER_FLAG_METADUMP |\
52				 BTRFS_SUPER_FLAG_METADUMP_V2)
53
54static void end_workqueue_fn(struct btrfs_work *work);
55static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
56static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
57				      struct btrfs_fs_info *fs_info);
58static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
59static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
60					struct extent_io_tree *dirty_pages,
61					int mark);
62static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
63				       struct extent_io_tree *pinned_extents);
64static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
65static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
66
67/*
68 * btrfs_end_io_wq structs are used to do processing in task context when an IO
69 * is complete.  This is used during reads to verify checksums, and it is used
70 * by writes to insert metadata for new file extents after IO is complete.
71 */
72struct btrfs_end_io_wq {
73	struct bio *bio;
74	bio_end_io_t *end_io;
75	void *private;
76	struct btrfs_fs_info *info;
77	blk_status_t status;
78	enum btrfs_wq_endio_type metadata;
79	struct btrfs_work work;
80};
81
82static struct kmem_cache *btrfs_end_io_wq_cache;
83
84int __init btrfs_end_io_wq_init(void)
85{
86	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
87					sizeof(struct btrfs_end_io_wq),
88					0,
89					SLAB_MEM_SPREAD,
90					NULL);
91	if (!btrfs_end_io_wq_cache)
92		return -ENOMEM;
93	return 0;
94}
95
96void __cold btrfs_end_io_wq_exit(void)
97{
98	kmem_cache_destroy(btrfs_end_io_wq_cache);
99}
100
101static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
102{
103	if (fs_info->csum_shash)
104		crypto_free_shash(fs_info->csum_shash);
105}
106
107/*
108 * async submit bios are used to offload expensive checksumming
109 * onto the worker threads.  They checksum file and metadata bios
110 * just before they are sent down the IO stack.
111 */
112struct async_submit_bio {
113	struct inode *inode;
114	struct bio *bio;
115	extent_submit_bio_start_t *submit_bio_start;
116	int mirror_num;
117
118	/* Optional parameter for submit_bio_start used by direct io */
119	u64 dio_file_offset;
120	struct btrfs_work work;
121	blk_status_t status;
122};
123
124/*
125 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
126 * eb, the lockdep key is determined by the btrfs_root it belongs to and
127 * the level the eb occupies in the tree.
128 *
129 * Different roots are used for different purposes and may nest inside each
130 * other and they require separate keysets.  As lockdep keys should be
131 * static, assign keysets according to the purpose of the root as indicated
132 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
133 * roots have separate keysets.
134 *
135 * Lock-nesting across peer nodes is always done with the immediate parent
136 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
137 * subclass to avoid triggering lockdep warning in such cases.
138 *
139 * The key is set by the readpage_end_io_hook after the buffer has passed
140 * csum validation but before the pages are unlocked.  It is also set by
141 * btrfs_init_new_buffer on freshly allocated blocks.
142 *
143 * We also add a check to make sure the highest level of the tree is the
144 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
145 * needs update as well.
146 */
147#ifdef CONFIG_DEBUG_LOCK_ALLOC
148# if BTRFS_MAX_LEVEL != 8
149#  error
150# endif
151
152#define DEFINE_LEVEL(stem, level)					\
153	.names[level] = "btrfs-" stem "-0" #level,
154
155#define DEFINE_NAME(stem)						\
156	DEFINE_LEVEL(stem, 0)						\
157	DEFINE_LEVEL(stem, 1)						\
158	DEFINE_LEVEL(stem, 2)						\
159	DEFINE_LEVEL(stem, 3)						\
160	DEFINE_LEVEL(stem, 4)						\
161	DEFINE_LEVEL(stem, 5)						\
162	DEFINE_LEVEL(stem, 6)						\
163	DEFINE_LEVEL(stem, 7)
164
165static struct btrfs_lockdep_keyset {
166	u64			id;		/* root objectid */
167	/* Longest entry: btrfs-free-space-00 */
168	char			names[BTRFS_MAX_LEVEL][20];
169	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
170} btrfs_lockdep_keysets[] = {
171	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
172	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
173	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
174	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
175	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
176	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
177	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
178	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
179	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
180	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
181	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
182	{ .id = 0,				DEFINE_NAME("tree")	},
183};
184
185#undef DEFINE_LEVEL
186#undef DEFINE_NAME
187
188void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
189				    int level)
190{
191	struct btrfs_lockdep_keyset *ks;
192
193	BUG_ON(level >= ARRAY_SIZE(ks->keys));
194
195	/* find the matching keyset, id 0 is the default entry */
196	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
197		if (ks->id == objectid)
198			break;
199
200	lockdep_set_class_and_name(&eb->lock,
201				   &ks->keys[level], ks->names[level]);
202}
203
204#endif
205
206/*
207 * Compute the csum of a btree block and store the result to provided buffer.
208 */
209static void csum_tree_block(struct extent_buffer *buf, u8 *result)
210{
211	struct btrfs_fs_info *fs_info = buf->fs_info;
212	const int num_pages = num_extent_pages(buf);
213	const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
214	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
215	char *kaddr;
216	int i;
217
218	shash->tfm = fs_info->csum_shash;
219	crypto_shash_init(shash);
220	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
221	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
222			    first_page_part - BTRFS_CSUM_SIZE);
223
224	for (i = 1; i < num_pages; i++) {
225		kaddr = page_address(buf->pages[i]);
226		crypto_shash_update(shash, kaddr, PAGE_SIZE);
227	}
228	memset(result, 0, BTRFS_CSUM_SIZE);
229	crypto_shash_final(shash, result);
230}
231
232/*
233 * we can't consider a given block up to date unless the transid of the
234 * block matches the transid in the parent node's pointer.  This is how we
235 * detect blocks that either didn't get written at all or got written
236 * in the wrong place.
237 */
238static int verify_parent_transid(struct extent_io_tree *io_tree,
239				 struct extent_buffer *eb, u64 parent_transid,
240				 int atomic)
241{
242	struct extent_state *cached_state = NULL;
243	int ret;
244
245	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
246		return 0;
247
248	if (atomic)
249		return -EAGAIN;
250
251	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
252			 &cached_state);
253	if (extent_buffer_uptodate(eb) &&
254	    btrfs_header_generation(eb) == parent_transid) {
255		ret = 0;
256		goto out;
257	}
258	btrfs_err_rl(eb->fs_info,
259		"parent transid verify failed on %llu wanted %llu found %llu",
260			eb->start,
261			parent_transid, btrfs_header_generation(eb));
262	ret = 1;
263	clear_extent_buffer_uptodate(eb);
264out:
265	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
266			     &cached_state);
267	return ret;
268}
269
270static bool btrfs_supported_super_csum(u16 csum_type)
271{
272	switch (csum_type) {
273	case BTRFS_CSUM_TYPE_CRC32:
274	case BTRFS_CSUM_TYPE_XXHASH:
275	case BTRFS_CSUM_TYPE_SHA256:
276	case BTRFS_CSUM_TYPE_BLAKE2:
277		return true;
278	default:
279		return false;
280	}
281}
282
283/*
284 * Return 0 if the superblock checksum type matches the checksum value of that
285 * algorithm. Pass the raw disk superblock data.
286 */
287static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
288				  char *raw_disk_sb)
289{
290	struct btrfs_super_block *disk_sb =
291		(struct btrfs_super_block *)raw_disk_sb;
292	char result[BTRFS_CSUM_SIZE];
293	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
294
295	shash->tfm = fs_info->csum_shash;
296
297	/*
298	 * The super_block structure does not span the whole
299	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
300	 * filled with zeros and is included in the checksum.
301	 */
302	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
303			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
304
305	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
306		return 1;
307
308	return 0;
309}
310
311int btrfs_verify_level_key(struct extent_buffer *eb, int level,
312			   struct btrfs_key *first_key, u64 parent_transid)
313{
314	struct btrfs_fs_info *fs_info = eb->fs_info;
315	int found_level;
316	struct btrfs_key found_key;
317	int ret;
318
319	found_level = btrfs_header_level(eb);
320	if (found_level != level) {
321		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
322		     KERN_ERR "BTRFS: tree level check failed\n");
323		btrfs_err(fs_info,
324"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
325			  eb->start, level, found_level);
326		return -EIO;
327	}
328
329	if (!first_key)
330		return 0;
331
332	/*
333	 * For live tree block (new tree blocks in current transaction),
334	 * we need proper lock context to avoid race, which is impossible here.
335	 * So we only checks tree blocks which is read from disk, whose
336	 * generation <= fs_info->last_trans_committed.
337	 */
338	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
339		return 0;
340
341	/* We have @first_key, so this @eb must have at least one item */
342	if (btrfs_header_nritems(eb) == 0) {
343		btrfs_err(fs_info,
344		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
345			  eb->start);
346		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
347		return -EUCLEAN;
348	}
349
350	if (found_level)
351		btrfs_node_key_to_cpu(eb, &found_key, 0);
352	else
353		btrfs_item_key_to_cpu(eb, &found_key, 0);
354	ret = btrfs_comp_cpu_keys(first_key, &found_key);
355
356	if (ret) {
357		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
358		     KERN_ERR "BTRFS: tree first key check failed\n");
359		btrfs_err(fs_info,
360"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
361			  eb->start, parent_transid, first_key->objectid,
362			  first_key->type, first_key->offset,
363			  found_key.objectid, found_key.type,
364			  found_key.offset);
365	}
366	return ret;
367}
368
369/*
370 * helper to read a given tree block, doing retries as required when
371 * the checksums don't match and we have alternate mirrors to try.
372 *
373 * @parent_transid:	expected transid, skip check if 0
374 * @level:		expected level, mandatory check
375 * @first_key:		expected key of first slot, skip check if NULL
376 */
377static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
378					  u64 parent_transid, int level,
379					  struct btrfs_key *first_key)
380{
381	struct btrfs_fs_info *fs_info = eb->fs_info;
382	struct extent_io_tree *io_tree;
383	int failed = 0;
384	int ret;
385	int num_copies = 0;
386	int mirror_num = 0;
387	int failed_mirror = 0;
388
389	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
390	while (1) {
391		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
392		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
393		if (!ret) {
394			if (verify_parent_transid(io_tree, eb,
395						   parent_transid, 0))
396				ret = -EIO;
397			else if (btrfs_verify_level_key(eb, level,
398						first_key, parent_transid))
399				ret = -EUCLEAN;
400			else
401				break;
402		}
403
404		num_copies = btrfs_num_copies(fs_info,
405					      eb->start, eb->len);
406		if (num_copies == 1)
407			break;
408
409		if (!failed_mirror) {
410			failed = 1;
411			failed_mirror = eb->read_mirror;
412		}
413
414		mirror_num++;
415		if (mirror_num == failed_mirror)
416			mirror_num++;
417
418		if (mirror_num > num_copies)
419			break;
420	}
421
422	if (failed && !ret && failed_mirror)
423		btrfs_repair_eb_io_failure(eb, failed_mirror);
424
425	return ret;
426}
427
428static int csum_one_extent_buffer(struct extent_buffer *eb)
429{
430	struct btrfs_fs_info *fs_info = eb->fs_info;
431	u8 result[BTRFS_CSUM_SIZE];
432	int ret;
433
434	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
435				    offsetof(struct btrfs_header, fsid),
436				    BTRFS_FSID_SIZE) == 0);
437	csum_tree_block(eb, result);
438
439	if (btrfs_header_level(eb))
440		ret = btrfs_check_node(eb);
441	else
442		ret = btrfs_check_leaf_full(eb);
443
444	if (ret < 0) {
445		btrfs_print_tree(eb, 0);
446		btrfs_err(fs_info,
447			"block=%llu write time tree block corruption detected",
448			eb->start);
449		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
450		return ret;
451	}
452	write_extent_buffer(eb, result, 0, fs_info->csum_size);
453
454	return 0;
455}
456
457/* Checksum all dirty extent buffers in one bio_vec */
458static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
459				      struct bio_vec *bvec)
460{
461	struct page *page = bvec->bv_page;
462	u64 bvec_start = page_offset(page) + bvec->bv_offset;
463	u64 cur;
464	int ret = 0;
465
466	for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
467	     cur += fs_info->nodesize) {
468		struct extent_buffer *eb;
469		bool uptodate;
470
471		eb = find_extent_buffer(fs_info, cur);
472		uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
473						       fs_info->nodesize);
474
475		/* A dirty eb shouldn't disappear from buffer_radix */
476		if (WARN_ON(!eb))
477			return -EUCLEAN;
478
479		if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
480			free_extent_buffer(eb);
481			return -EUCLEAN;
482		}
483		if (WARN_ON(!uptodate)) {
484			free_extent_buffer(eb);
485			return -EUCLEAN;
486		}
487
488		ret = csum_one_extent_buffer(eb);
489		free_extent_buffer(eb);
490		if (ret < 0)
491			return ret;
492	}
493	return ret;
494}
495
496/*
497 * Checksum a dirty tree block before IO.  This has extra checks to make sure
498 * we only fill in the checksum field in the first page of a multi-page block.
499 * For subpage extent buffers we need bvec to also read the offset in the page.
500 */
501static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
502{
503	struct page *page = bvec->bv_page;
504	u64 start = page_offset(page);
505	u64 found_start;
506	struct extent_buffer *eb;
507
508	if (fs_info->sectorsize < PAGE_SIZE)
509		return csum_dirty_subpage_buffers(fs_info, bvec);
510
511	eb = (struct extent_buffer *)page->private;
512	if (page != eb->pages[0])
513		return 0;
514
515	found_start = btrfs_header_bytenr(eb);
516
517	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
518		WARN_ON(found_start != 0);
519		return 0;
520	}
521
522	/*
523	 * Please do not consolidate these warnings into a single if.
524	 * It is useful to know what went wrong.
525	 */
526	if (WARN_ON(found_start != start))
527		return -EUCLEAN;
528	if (WARN_ON(!PageUptodate(page)))
529		return -EUCLEAN;
530
531	return csum_one_extent_buffer(eb);
532}
533
534static int check_tree_block_fsid(struct extent_buffer *eb)
535{
536	struct btrfs_fs_info *fs_info = eb->fs_info;
537	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
538	u8 fsid[BTRFS_FSID_SIZE];
539	u8 *metadata_uuid;
540
541	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
542			   BTRFS_FSID_SIZE);
543	/*
544	 * Checking the incompat flag is only valid for the current fs. For
545	 * seed devices it's forbidden to have their uuid changed so reading
546	 * ->fsid in this case is fine
547	 */
548	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
549		metadata_uuid = fs_devices->metadata_uuid;
550	else
551		metadata_uuid = fs_devices->fsid;
552
553	if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
554		return 0;
555
556	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
557		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
558			return 0;
559
560	return 1;
561}
562
563/* Do basic extent buffer checks at read time */
564static int validate_extent_buffer(struct extent_buffer *eb)
565{
566	struct btrfs_fs_info *fs_info = eb->fs_info;
567	u64 found_start;
568	const u32 csum_size = fs_info->csum_size;
569	u8 found_level;
570	u8 result[BTRFS_CSUM_SIZE];
571	const u8 *header_csum;
572	int ret = 0;
573
574	found_start = btrfs_header_bytenr(eb);
575	if (found_start != eb->start) {
576		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
577			     eb->start, found_start);
578		ret = -EIO;
579		goto out;
580	}
581	if (check_tree_block_fsid(eb)) {
582		btrfs_err_rl(fs_info, "bad fsid on block %llu",
583			     eb->start);
584		ret = -EIO;
585		goto out;
586	}
587	found_level = btrfs_header_level(eb);
588	if (found_level >= BTRFS_MAX_LEVEL) {
589		btrfs_err(fs_info, "bad tree block level %d on %llu",
590			  (int)btrfs_header_level(eb), eb->start);
591		ret = -EIO;
592		goto out;
593	}
594
595	csum_tree_block(eb, result);
596	header_csum = page_address(eb->pages[0]) +
597		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
598
599	if (memcmp(result, header_csum, csum_size) != 0) {
600		btrfs_warn_rl(fs_info,
601	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
602			      eb->start,
603			      CSUM_FMT_VALUE(csum_size, header_csum),
604			      CSUM_FMT_VALUE(csum_size, result),
605			      btrfs_header_level(eb));
606		ret = -EUCLEAN;
607		goto out;
608	}
609
610	/*
611	 * If this is a leaf block and it is corrupt, set the corrupt bit so
612	 * that we don't try and read the other copies of this block, just
613	 * return -EIO.
614	 */
615	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
616		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
617		ret = -EIO;
618	}
619
620	if (found_level > 0 && btrfs_check_node(eb))
621		ret = -EIO;
622
623	if (!ret)
624		set_extent_buffer_uptodate(eb);
625	else
626		btrfs_err(fs_info,
627			  "block=%llu read time tree block corruption detected",
628			  eb->start);
629out:
630	return ret;
631}
632
633static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
634				   int mirror)
635{
636	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
637	struct extent_buffer *eb;
638	bool reads_done;
639	int ret = 0;
640
641	/*
642	 * We don't allow bio merge for subpage metadata read, so we should
643	 * only get one eb for each endio hook.
644	 */
645	ASSERT(end == start + fs_info->nodesize - 1);
646	ASSERT(PagePrivate(page));
647
648	eb = find_extent_buffer(fs_info, start);
649	/*
650	 * When we are reading one tree block, eb must have been inserted into
651	 * the radix tree. If not, something is wrong.
652	 */
653	ASSERT(eb);
654
655	reads_done = atomic_dec_and_test(&eb->io_pages);
656	/* Subpage read must finish in page read */
657	ASSERT(reads_done);
658
659	eb->read_mirror = mirror;
660	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
661		ret = -EIO;
662		goto err;
663	}
664	ret = validate_extent_buffer(eb);
665	if (ret < 0)
666		goto err;
667
668	set_extent_buffer_uptodate(eb);
669
670	free_extent_buffer(eb);
671	return ret;
672err:
673	/*
674	 * end_bio_extent_readpage decrements io_pages in case of error,
675	 * make sure it has something to decrement.
676	 */
677	atomic_inc(&eb->io_pages);
678	clear_extent_buffer_uptodate(eb);
679	free_extent_buffer(eb);
680	return ret;
681}
682
683int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
684				   struct page *page, u64 start, u64 end,
685				   int mirror)
686{
687	struct extent_buffer *eb;
688	int ret = 0;
689	int reads_done;
690
691	ASSERT(page->private);
692
693	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
694		return validate_subpage_buffer(page, start, end, mirror);
695
696	eb = (struct extent_buffer *)page->private;
697
698	/*
699	 * The pending IO might have been the only thing that kept this buffer
700	 * in memory.  Make sure we have a ref for all this other checks
701	 */
702	atomic_inc(&eb->refs);
703
704	reads_done = atomic_dec_and_test(&eb->io_pages);
705	if (!reads_done)
706		goto err;
707
708	eb->read_mirror = mirror;
709	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
710		ret = -EIO;
711		goto err;
712	}
713	ret = validate_extent_buffer(eb);
714err:
715	if (ret) {
716		/*
717		 * our io error hook is going to dec the io pages
718		 * again, we have to make sure it has something
719		 * to decrement
720		 */
721		atomic_inc(&eb->io_pages);
722		clear_extent_buffer_uptodate(eb);
723	}
724	free_extent_buffer(eb);
725
726	return ret;
727}
728
729static void end_workqueue_bio(struct bio *bio)
730{
731	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
732	struct btrfs_fs_info *fs_info;
733	struct btrfs_workqueue *wq;
734
735	fs_info = end_io_wq->info;
736	end_io_wq->status = bio->bi_status;
737
738	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
739		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
740			wq = fs_info->endio_meta_write_workers;
741		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
742			wq = fs_info->endio_freespace_worker;
743		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
744			wq = fs_info->endio_raid56_workers;
745		else
746			wq = fs_info->endio_write_workers;
747	} else {
748		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
749			wq = fs_info->endio_raid56_workers;
750		else if (end_io_wq->metadata)
751			wq = fs_info->endio_meta_workers;
752		else
753			wq = fs_info->endio_workers;
754	}
755
756	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
757	btrfs_queue_work(wq, &end_io_wq->work);
758}
759
760blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
761			enum btrfs_wq_endio_type metadata)
762{
763	struct btrfs_end_io_wq *end_io_wq;
764
765	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
766	if (!end_io_wq)
767		return BLK_STS_RESOURCE;
768
769	end_io_wq->private = bio->bi_private;
770	end_io_wq->end_io = bio->bi_end_io;
771	end_io_wq->info = info;
772	end_io_wq->status = 0;
773	end_io_wq->bio = bio;
774	end_io_wq->metadata = metadata;
775
776	bio->bi_private = end_io_wq;
777	bio->bi_end_io = end_workqueue_bio;
778	return 0;
779}
780
781static void run_one_async_start(struct btrfs_work *work)
782{
783	struct async_submit_bio *async;
784	blk_status_t ret;
785
786	async = container_of(work, struct  async_submit_bio, work);
787	ret = async->submit_bio_start(async->inode, async->bio,
788				      async->dio_file_offset);
789	if (ret)
790		async->status = ret;
791}
792
793/*
794 * In order to insert checksums into the metadata in large chunks, we wait
795 * until bio submission time.   All the pages in the bio are checksummed and
796 * sums are attached onto the ordered extent record.
797 *
798 * At IO completion time the csums attached on the ordered extent record are
799 * inserted into the tree.
800 */
801static void run_one_async_done(struct btrfs_work *work)
802{
803	struct async_submit_bio *async;
804	struct inode *inode;
805	blk_status_t ret;
806
807	async = container_of(work, struct  async_submit_bio, work);
808	inode = async->inode;
809
810	/* If an error occurred we just want to clean up the bio and move on */
811	if (async->status) {
812		async->bio->bi_status = async->status;
813		bio_endio(async->bio);
814		return;
815	}
816
817	/*
818	 * All of the bios that pass through here are from async helpers.
819	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
820	 * This changes nothing when cgroups aren't in use.
821	 */
822	async->bio->bi_opf |= REQ_CGROUP_PUNT;
823	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
824	if (ret) {
825		async->bio->bi_status = ret;
826		bio_endio(async->bio);
827	}
828}
829
830static void run_one_async_free(struct btrfs_work *work)
831{
832	struct async_submit_bio *async;
833
834	async = container_of(work, struct  async_submit_bio, work);
835	kfree(async);
836}
837
838blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
839				 int mirror_num, unsigned long bio_flags,
840				 u64 dio_file_offset,
841				 extent_submit_bio_start_t *submit_bio_start)
842{
843	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
844	struct async_submit_bio *async;
845
846	async = kmalloc(sizeof(*async), GFP_NOFS);
847	if (!async)
848		return BLK_STS_RESOURCE;
849
850	async->inode = inode;
851	async->bio = bio;
852	async->mirror_num = mirror_num;
853	async->submit_bio_start = submit_bio_start;
854
855	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
856			run_one_async_free);
857
858	async->dio_file_offset = dio_file_offset;
859
860	async->status = 0;
861
862	if (op_is_sync(bio->bi_opf))
863		btrfs_set_work_high_priority(&async->work);
864
865	btrfs_queue_work(fs_info->workers, &async->work);
866	return 0;
867}
868
869static blk_status_t btree_csum_one_bio(struct bio *bio)
870{
871	struct bio_vec *bvec;
872	struct btrfs_root *root;
873	int ret = 0;
874	struct bvec_iter_all iter_all;
875
876	ASSERT(!bio_flagged(bio, BIO_CLONED));
877	bio_for_each_segment_all(bvec, bio, iter_all) {
878		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
879		ret = csum_dirty_buffer(root->fs_info, bvec);
880		if (ret)
881			break;
882	}
883
884	return errno_to_blk_status(ret);
885}
886
887static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
888					   u64 dio_file_offset)
889{
890	/*
891	 * when we're called for a write, we're already in the async
892	 * submission context.  Just jump into btrfs_map_bio
893	 */
894	return btree_csum_one_bio(bio);
895}
896
897static bool should_async_write(struct btrfs_fs_info *fs_info,
898			     struct btrfs_inode *bi)
899{
900	if (btrfs_is_zoned(fs_info))
901		return false;
902	if (atomic_read(&bi->sync_writers))
903		return false;
904	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
905		return false;
906	return true;
907}
908
909blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
910				       int mirror_num, unsigned long bio_flags)
911{
912	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
913	blk_status_t ret;
914
915	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
916		/*
917		 * called for a read, do the setup so that checksum validation
918		 * can happen in the async kernel threads
919		 */
920		ret = btrfs_bio_wq_end_io(fs_info, bio,
921					  BTRFS_WQ_ENDIO_METADATA);
922		if (ret)
923			goto out_w_error;
924		ret = btrfs_map_bio(fs_info, bio, mirror_num);
925	} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
926		ret = btree_csum_one_bio(bio);
927		if (ret)
928			goto out_w_error;
929		ret = btrfs_map_bio(fs_info, bio, mirror_num);
930	} else {
931		/*
932		 * kthread helpers are used to submit writes so that
933		 * checksumming can happen in parallel across all CPUs
934		 */
935		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
936					  0, btree_submit_bio_start);
937	}
938
939	if (ret)
940		goto out_w_error;
941	return 0;
942
943out_w_error:
944	bio->bi_status = ret;
945	bio_endio(bio);
946	return ret;
947}
948
949#ifdef CONFIG_MIGRATION
950static int btree_migratepage(struct address_space *mapping,
951			struct page *newpage, struct page *page,
952			enum migrate_mode mode)
953{
954	/*
955	 * we can't safely write a btree page from here,
956	 * we haven't done the locking hook
957	 */
958	if (PageDirty(page))
959		return -EAGAIN;
960	/*
961	 * Buffers may be managed in a filesystem specific way.
962	 * We must have no buffers or drop them.
963	 */
964	if (page_has_private(page) &&
965	    !try_to_release_page(page, GFP_KERNEL))
966		return -EAGAIN;
967	return migrate_page(mapping, newpage, page, mode);
968}
969#endif
970
971
972static int btree_writepages(struct address_space *mapping,
973			    struct writeback_control *wbc)
974{
975	struct btrfs_fs_info *fs_info;
976	int ret;
977
978	if (wbc->sync_mode == WB_SYNC_NONE) {
979
980		if (wbc->for_kupdate)
981			return 0;
982
983		fs_info = BTRFS_I(mapping->host)->root->fs_info;
984		/* this is a bit racy, but that's ok */
985		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
986					     BTRFS_DIRTY_METADATA_THRESH,
987					     fs_info->dirty_metadata_batch);
988		if (ret < 0)
989			return 0;
990	}
991	return btree_write_cache_pages(mapping, wbc);
992}
993
994static int btree_releasepage(struct page *page, gfp_t gfp_flags)
995{
996	if (PageWriteback(page) || PageDirty(page))
997		return 0;
998
999	return try_release_extent_buffer(page);
1000}
1001
1002static void btree_invalidatepage(struct page *page, unsigned int offset,
1003				 unsigned int length)
1004{
1005	struct extent_io_tree *tree;
1006	tree = &BTRFS_I(page->mapping->host)->io_tree;
1007	extent_invalidatepage(tree, page, offset);
1008	btree_releasepage(page, GFP_NOFS);
1009	if (PagePrivate(page)) {
1010		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
1011			   "page private not zero on page %llu",
1012			   (unsigned long long)page_offset(page));
1013		detach_page_private(page);
1014	}
1015}
1016
1017static int btree_set_page_dirty(struct page *page)
1018{
1019#ifdef DEBUG
1020	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
1021	struct btrfs_subpage *subpage;
1022	struct extent_buffer *eb;
1023	int cur_bit = 0;
1024	u64 page_start = page_offset(page);
1025
1026	if (fs_info->sectorsize == PAGE_SIZE) {
1027		BUG_ON(!PagePrivate(page));
1028		eb = (struct extent_buffer *)page->private;
1029		BUG_ON(!eb);
1030		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
1031		BUG_ON(!atomic_read(&eb->refs));
1032		btrfs_assert_tree_write_locked(eb);
1033		return __set_page_dirty_nobuffers(page);
1034	}
1035	ASSERT(PagePrivate(page) && page->private);
1036	subpage = (struct btrfs_subpage *)page->private;
1037
1038	ASSERT(subpage->dirty_bitmap);
1039	while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
1040		unsigned long flags;
1041		u64 cur;
1042		u16 tmp = (1 << cur_bit);
1043
1044		spin_lock_irqsave(&subpage->lock, flags);
1045		if (!(tmp & subpage->dirty_bitmap)) {
1046			spin_unlock_irqrestore(&subpage->lock, flags);
1047			cur_bit++;
1048			continue;
1049		}
1050		spin_unlock_irqrestore(&subpage->lock, flags);
1051		cur = page_start + cur_bit * fs_info->sectorsize;
1052
1053		eb = find_extent_buffer(fs_info, cur);
1054		ASSERT(eb);
1055		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
1056		ASSERT(atomic_read(&eb->refs));
1057		btrfs_assert_tree_write_locked(eb);
1058		free_extent_buffer(eb);
1059
1060		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
1061	}
1062#endif
1063	return __set_page_dirty_nobuffers(page);
1064}
1065
1066static const struct address_space_operations btree_aops = {
1067	.writepages	= btree_writepages,
1068	.releasepage	= btree_releasepage,
1069	.invalidatepage = btree_invalidatepage,
1070#ifdef CONFIG_MIGRATION
1071	.migratepage	= btree_migratepage,
1072#endif
1073	.set_page_dirty = btree_set_page_dirty,
1074};
1075
1076struct extent_buffer *btrfs_find_create_tree_block(
1077						struct btrfs_fs_info *fs_info,
1078						u64 bytenr, u64 owner_root,
1079						int level)
1080{
1081	if (btrfs_is_testing(fs_info))
1082		return alloc_test_extent_buffer(fs_info, bytenr);
1083	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
1084}
1085
1086/*
1087 * Read tree block at logical address @bytenr and do variant basic but critical
1088 * verification.
1089 *
1090 * @owner_root:		the objectid of the root owner for this block.
1091 * @parent_transid:	expected transid of this tree block, skip check if 0
1092 * @level:		expected level, mandatory check
1093 * @first_key:		expected key in slot 0, skip check if NULL
1094 */
1095struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
1096				      u64 owner_root, u64 parent_transid,
1097				      int level, struct btrfs_key *first_key)
1098{
1099	struct extent_buffer *buf = NULL;
1100	int ret;
1101
1102	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
1103	if (IS_ERR(buf))
1104		return buf;
1105
1106	ret = btree_read_extent_buffer_pages(buf, parent_transid,
1107					     level, first_key);
1108	if (ret) {
1109		free_extent_buffer_stale(buf);
1110		return ERR_PTR(ret);
1111	}
1112	return buf;
1113
1114}
1115
1116void btrfs_clean_tree_block(struct extent_buffer *buf)
1117{
1118	struct btrfs_fs_info *fs_info = buf->fs_info;
1119	if (btrfs_header_generation(buf) ==
1120	    fs_info->running_transaction->transid) {
1121		btrfs_assert_tree_write_locked(buf);
1122
1123		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1124			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
1125						 -buf->len,
1126						 fs_info->dirty_metadata_batch);
1127			clear_extent_buffer_dirty(buf);
1128		}
1129	}
1130}
1131
1132static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1133			 u64 objectid)
1134{
1135	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
1136
1137	memset(&root->root_key, 0, sizeof(root->root_key));
1138	memset(&root->root_item, 0, sizeof(root->root_item));
1139	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1140	root->fs_info = fs_info;
1141	root->root_key.objectid = objectid;
1142	root->node = NULL;
1143	root->commit_root = NULL;
1144	root->state = 0;
1145	RB_CLEAR_NODE(&root->rb_node);
1146
1147	root->last_trans = 0;
1148	root->free_objectid = 0;
1149	root->nr_delalloc_inodes = 0;
1150	root->nr_ordered_extents = 0;
1151	root->inode_tree = RB_ROOT;
1152	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1153
1154	btrfs_init_root_block_rsv(root);
1155
1156	INIT_LIST_HEAD(&root->dirty_list);
1157	INIT_LIST_HEAD(&root->root_list);
1158	INIT_LIST_HEAD(&root->delalloc_inodes);
1159	INIT_LIST_HEAD(&root->delalloc_root);
1160	INIT_LIST_HEAD(&root->ordered_extents);
1161	INIT_LIST_HEAD(&root->ordered_root);
1162	INIT_LIST_HEAD(&root->reloc_dirty_list);
1163	INIT_LIST_HEAD(&root->logged_list[0]);
1164	INIT_LIST_HEAD(&root->logged_list[1]);
1165	spin_lock_init(&root->inode_lock);
1166	spin_lock_init(&root->delalloc_lock);
1167	spin_lock_init(&root->ordered_extent_lock);
1168	spin_lock_init(&root->accounting_lock);
1169	spin_lock_init(&root->log_extents_lock[0]);
1170	spin_lock_init(&root->log_extents_lock[1]);
1171	spin_lock_init(&root->qgroup_meta_rsv_lock);
1172	mutex_init(&root->objectid_mutex);
1173	mutex_init(&root->log_mutex);
1174	mutex_init(&root->ordered_extent_mutex);
1175	mutex_init(&root->delalloc_mutex);
1176	init_waitqueue_head(&root->qgroup_flush_wait);
1177	init_waitqueue_head(&root->log_writer_wait);
1178	init_waitqueue_head(&root->log_commit_wait[0]);
1179	init_waitqueue_head(&root->log_commit_wait[1]);
1180	INIT_LIST_HEAD(&root->log_ctxs[0]);
1181	INIT_LIST_HEAD(&root->log_ctxs[1]);
1182	atomic_set(&root->log_commit[0], 0);
1183	atomic_set(&root->log_commit[1], 0);
1184	atomic_set(&root->log_writers, 0);
1185	atomic_set(&root->log_batch, 0);
1186	refcount_set(&root->refs, 1);
1187	atomic_set(&root->snapshot_force_cow, 0);
1188	atomic_set(&root->nr_swapfiles, 0);
1189	root->log_transid = 0;
1190	root->log_transid_committed = -1;
1191	root->last_log_commit = 0;
1192	root->anon_dev = 0;
1193	if (!dummy) {
1194		extent_io_tree_init(fs_info, &root->dirty_log_pages,
1195				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
1196		extent_io_tree_init(fs_info, &root->log_csum_range,
1197				    IO_TREE_LOG_CSUM_RANGE, NULL);
1198	}
1199
1200	spin_lock_init(&root->root_item_lock);
1201	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1202#ifdef CONFIG_BTRFS_DEBUG
1203	INIT_LIST_HEAD(&root->leak_list);
1204	spin_lock(&fs_info->fs_roots_radix_lock);
1205	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
1206	spin_unlock(&fs_info->fs_roots_radix_lock);
1207#endif
1208}
1209
1210static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1211					   u64 objectid, gfp_t flags)
1212{
1213	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1214	if (root)
1215		__setup_root(root, fs_info, objectid);
1216	return root;
1217}
1218
1219#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
1220/* Should only be used by the testing infrastructure */
1221struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
1222{
1223	struct btrfs_root *root;
1224
1225	if (!fs_info)
1226		return ERR_PTR(-EINVAL);
1227
1228	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
1229	if (!root)
1230		return ERR_PTR(-ENOMEM);
1231
1232	/* We don't use the stripesize in selftest, set it as sectorsize */
1233	root->alloc_bytenr = 0;
1234
1235	return root;
1236}
1237#endif
1238
1239static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
1240{
1241	const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
1242	const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
1243
1244	return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
1245}
1246
1247static int global_root_key_cmp(const void *k, const struct rb_node *node)
1248{
1249	const struct btrfs_key *key = k;
1250	const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
1251
1252	return btrfs_comp_cpu_keys(key, &root->root_key);
1253}
1254
1255int btrfs_global_root_insert(struct btrfs_root *root)
1256{
1257	struct btrfs_fs_info *fs_info = root->fs_info;
1258	struct rb_node *tmp;
1259
1260	write_lock(&fs_info->global_root_lock);
1261	tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
1262	write_unlock(&fs_info->global_root_lock);
1263	ASSERT(!tmp);
1264
1265	return tmp ? -EEXIST : 0;
1266}
1267
1268void btrfs_global_root_delete(struct btrfs_root *root)
1269{
1270	struct btrfs_fs_info *fs_info = root->fs_info;
1271
1272	write_lock(&fs_info->global_root_lock);
1273	rb_erase(&root->rb_node, &fs_info->global_root_tree);
1274	write_unlock(&fs_info->global_root_lock);
1275}
1276
1277struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
1278				     struct btrfs_key *key)
1279{
1280	struct rb_node *node;
1281	struct btrfs_root *root = NULL;
1282
1283	read_lock(&fs_info->global_root_lock);
1284	node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
1285	if (node)
1286		root = container_of(node, struct btrfs_root, rb_node);
1287	read_unlock(&fs_info->global_root_lock);
1288
1289	return root;
1290}
1291
1292struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1293{
1294	struct btrfs_key key = {
1295		.objectid = BTRFS_CSUM_TREE_OBJECTID,
1296		.type = BTRFS_ROOT_ITEM_KEY,
1297		.offset = 0,
1298	};
1299
1300	return btrfs_global_root(fs_info, &key);
1301}
1302
1303struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1304{
1305	struct btrfs_key key = {
1306		.objectid = BTRFS_EXTENT_TREE_OBJECTID,
1307		.type = BTRFS_ROOT_ITEM_KEY,
1308		.offset = 0,
1309	};
1310
1311	return btrfs_global_root(fs_info, &key);
1312}
1313
1314struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1315				     u64 objectid)
1316{
1317	struct btrfs_fs_info *fs_info = trans->fs_info;
1318	struct extent_buffer *leaf;
1319	struct btrfs_root *tree_root = fs_info->tree_root;
1320	struct btrfs_root *root;
1321	struct btrfs_key key;
1322	unsigned int nofs_flag;
1323	int ret = 0;
1324
1325	/*
1326	 * We're holding a transaction handle, so use a NOFS memory allocation
1327	 * context to avoid deadlock if reclaim happens.
1328	 */
1329	nofs_flag = memalloc_nofs_save();
1330	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
1331	memalloc_nofs_restore(nofs_flag);
1332	if (!root)
1333		return ERR_PTR(-ENOMEM);
1334
1335	root->root_key.objectid = objectid;
1336	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1337	root->root_key.offset = 0;
1338
1339	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
1340				      BTRFS_NESTING_NORMAL);
1341	if (IS_ERR(leaf)) {
1342		ret = PTR_ERR(leaf);
1343		leaf = NULL;
1344		goto fail_unlock;
1345	}
1346
1347	root->node = leaf;
1348	btrfs_mark_buffer_dirty(leaf);
1349
1350	root->commit_root = btrfs_root_node(root);
1351	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1352
1353	btrfs_set_root_flags(&root->root_item, 0);
1354	btrfs_set_root_limit(&root->root_item, 0);
1355	btrfs_set_root_bytenr(&root->root_item, leaf->start);
1356	btrfs_set_root_generation(&root->root_item, trans->transid);
1357	btrfs_set_root_level(&root->root_item, 0);
1358	btrfs_set_root_refs(&root->root_item, 1);
1359	btrfs_set_root_used(&root->root_item, leaf->len);
1360	btrfs_set_root_last_snapshot(&root->root_item, 0);
1361	btrfs_set_root_dirid(&root->root_item, 0);
1362	if (is_fstree(objectid))
1363		generate_random_guid(root->root_item.uuid);
1364	else
1365		export_guid(root->root_item.uuid, &guid_null);
1366	btrfs_set_root_drop_level(&root->root_item, 0);
1367
1368	btrfs_tree_unlock(leaf);
1369
1370	key.objectid = objectid;
1371	key.type = BTRFS_ROOT_ITEM_KEY;
1372	key.offset = 0;
1373	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1374	if (ret)
1375		goto fail;
1376
1377	return root;
1378
1379fail_unlock:
1380	if (leaf)
1381		btrfs_tree_unlock(leaf);
1382fail:
1383	btrfs_put_root(root);
1384
1385	return ERR_PTR(ret);
1386}
1387
1388static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1389					 struct btrfs_fs_info *fs_info)
1390{
1391	struct btrfs_root *root;
1392
1393	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
1394	if (!root)
1395		return ERR_PTR(-ENOMEM);
1396
1397	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1398	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1399	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1400
1401	return root;
1402}
1403
1404int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
1405			      struct btrfs_root *root)
1406{
1407	struct extent_buffer *leaf;
1408
1409	/*
1410	 * DON'T set SHAREABLE bit for log trees.
1411	 *
1412	 * Log trees are not exposed to user space thus can't be snapshotted,
1413	 * and they go away before a real commit is actually done.
1414	 *
1415	 * They do store pointers to file data extents, and those reference
1416	 * counts still get updated (along with back refs to the log tree).
1417	 */
1418
1419	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1420			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
1421	if (IS_ERR(leaf))
1422		return PTR_ERR(leaf);
1423
1424	root->node = leaf;
1425
1426	btrfs_mark_buffer_dirty(root->node);
1427	btrfs_tree_unlock(root->node);
1428
1429	return 0;
1430}
1431
1432int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1433			     struct btrfs_fs_info *fs_info)
1434{
1435	struct btrfs_root *log_root;
1436
1437	log_root = alloc_log_tree(trans, fs_info);
1438	if (IS_ERR(log_root))
1439		return PTR_ERR(log_root);
1440
1441	if (!btrfs_is_zoned(fs_info)) {
1442		int ret = btrfs_alloc_log_tree_node(trans, log_root);
1443
1444		if (ret) {
1445			btrfs_put_root(log_root);
1446			return ret;
1447		}
1448	}
1449
1450	WARN_ON(fs_info->log_root_tree);
1451	fs_info->log_root_tree = log_root;
1452	return 0;
1453}
1454
1455int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1456		       struct btrfs_root *root)
1457{
1458	struct btrfs_fs_info *fs_info = root->fs_info;
1459	struct btrfs_root *log_root;
1460	struct btrfs_inode_item *inode_item;
1461	int ret;
1462
1463	log_root = alloc_log_tree(trans, fs_info);
1464	if (IS_ERR(log_root))
1465		return PTR_ERR(log_root);
1466
1467	ret = btrfs_alloc_log_tree_node(trans, log_root);
1468	if (ret) {
1469		btrfs_put_root(log_root);
1470		return ret;
1471	}
1472
1473	log_root->last_trans = trans->transid;
1474	log_root->root_key.offset = root->root_key.objectid;
1475
1476	inode_item = &log_root->root_item.inode;
1477	btrfs_set_stack_inode_generation(inode_item, 1);
1478	btrfs_set_stack_inode_size(inode_item, 3);
1479	btrfs_set_stack_inode_nlink(inode_item, 1);
1480	btrfs_set_stack_inode_nbytes(inode_item,
1481				     fs_info->nodesize);
1482	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1483
1484	btrfs_set_root_node(&log_root->root_item, log_root->node);
1485
1486	WARN_ON(root->log_root);
1487	root->log_root = log_root;
1488	root->log_transid = 0;
1489	root->log_transid_committed = -1;
1490	root->last_log_commit = 0;
1491	return 0;
1492}
1493
1494static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1495					      struct btrfs_path *path,
1496					      struct btrfs_key *key)
1497{
1498	struct btrfs_root *root;
1499	struct btrfs_fs_info *fs_info = tree_root->fs_info;
1500	u64 generation;
1501	int ret;
1502	int level;
1503
1504	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1505	if (!root)
1506		return ERR_PTR(-ENOMEM);
1507
1508	ret = btrfs_find_root(tree_root, key, path,
1509			      &root->root_item, &root->root_key);
1510	if (ret) {
1511		if (ret > 0)
1512			ret = -ENOENT;
1513		goto fail;
1514	}
1515
1516	generation = btrfs_root_generation(&root->root_item);
1517	level = btrfs_root_level(&root->root_item);
1518	root->node = read_tree_block(fs_info,
1519				     btrfs_root_bytenr(&root->root_item),
1520				     key->objectid, generation, level, NULL);
1521	if (IS_ERR(root->node)) {
1522		ret = PTR_ERR(root->node);
1523		root->node = NULL;
1524		goto fail;
1525	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1526		ret = -EIO;
1527		goto fail;
1528	}
1529	root->commit_root = btrfs_root_node(root);
1530	return root;
1531fail:
1532	btrfs_put_root(root);
1533	return ERR_PTR(ret);
1534}
1535
1536struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1537					struct btrfs_key *key)
1538{
1539	struct btrfs_root *root;
1540	struct btrfs_path *path;
1541
1542	path = btrfs_alloc_path();
1543	if (!path)
1544		return ERR_PTR(-ENOMEM);
1545	root = read_tree_root_path(tree_root, path, key);
1546	btrfs_free_path(path);
1547
1548	return root;
1549}
1550
1551/*
1552 * Initialize subvolume root in-memory structure
1553 *
1554 * @anon_dev:	anonymous device to attach to the root, if zero, allocate new
1555 */
1556static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1557{
1558	int ret;
1559	unsigned int nofs_flag;
1560
1561	/*
1562	 * We might be called under a transaction (e.g. indirect backref
1563	 * resolution) which could deadlock if it triggers memory reclaim
1564	 */
1565	nofs_flag = memalloc_nofs_save();
1566	ret = btrfs_drew_lock_init(&root->snapshot_lock);
1567	memalloc_nofs_restore(nofs_flag);
1568	if (ret)
1569		goto fail;
1570
1571	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1572	    !btrfs_is_data_reloc_root(root)) {
1573		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1574		btrfs_check_and_init_root_item(&root->root_item);
1575	}
1576
1577	/*
1578	 * Don't assign anonymous block device to roots that are not exposed to
1579	 * userspace, the id pool is limited to 1M
1580	 */
1581	if (is_fstree(root->root_key.objectid) &&
1582	    btrfs_root_refs(&root->root_item) > 0) {
1583		if (!anon_dev) {
1584			ret = get_anon_bdev(&root->anon_dev);
1585			if (ret)
1586				goto fail;
1587		} else {
1588			root->anon_dev = anon_dev;
1589		}
1590	}
1591
1592	mutex_lock(&root->objectid_mutex);
1593	ret = btrfs_init_root_free_objectid(root);
1594	if (ret) {
1595		mutex_unlock(&root->objectid_mutex);
1596		goto fail;
1597	}
1598
1599	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1600
1601	mutex_unlock(&root->objectid_mutex);
1602
1603	return 0;
1604fail:
1605	/* The caller is responsible to call btrfs_free_fs_root */
1606	return ret;
1607}
1608
1609static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1610					       u64 root_id)
1611{
1612	struct btrfs_root *root;
1613
1614	spin_lock(&fs_info->fs_roots_radix_lock);
1615	root = radix_tree_lookup(&fs_info->fs_roots_radix,
1616				 (unsigned long)root_id);
1617	if (root)
1618		root = btrfs_grab_root(root);
1619	spin_unlock(&fs_info->fs_roots_radix_lock);
1620	return root;
1621}
1622
1623static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1624						u64 objectid)
1625{
1626	struct btrfs_key key = {
1627		.objectid = objectid,
1628		.type = BTRFS_ROOT_ITEM_KEY,
1629		.offset = 0,
1630	};
1631
1632	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
1633		return btrfs_grab_root(fs_info->tree_root);
1634	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
1635		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1636	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
1637		return btrfs_grab_root(fs_info->chunk_root);
1638	if (objectid == BTRFS_DEV_TREE_OBJECTID)
1639		return btrfs_grab_root(fs_info->dev_root);
1640	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
1641		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1642	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
1643		return btrfs_grab_root(fs_info->quota_root) ?
1644			fs_info->quota_root : ERR_PTR(-ENOENT);
1645	if (objectid == BTRFS_UUID_TREE_OBJECTID)
1646		return btrfs_grab_root(fs_info->uuid_root) ?
1647			fs_info->uuid_root : ERR_PTR(-ENOENT);
1648	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
1649		struct btrfs_root *root = btrfs_global_root(fs_info, &key);
1650
1651		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
1652	}
1653	return NULL;
1654}
1655
1656int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1657			 struct btrfs_root *root)
1658{
1659	int ret;
1660
1661	ret = radix_tree_preload(GFP_NOFS);
1662	if (ret)
1663		return ret;
1664
1665	spin_lock(&fs_info->fs_roots_radix_lock);
1666	ret = radix_tree_insert(&fs_info->fs_roots_radix,
1667				(unsigned long)root->root_key.objectid,
1668				root);
1669	if (ret == 0) {
1670		btrfs_grab_root(root);
1671		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1672	}
1673	spin_unlock(&fs_info->fs_roots_radix_lock);
1674	radix_tree_preload_end();
1675
1676	return ret;
1677}
1678
1679void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
1680{
1681#ifdef CONFIG_BTRFS_DEBUG
1682	struct btrfs_root *root;
1683
1684	while (!list_empty(&fs_info->allocated_roots)) {
1685		char buf[BTRFS_ROOT_NAME_BUF_LEN];
1686
1687		root = list_first_entry(&fs_info->allocated_roots,
1688					struct btrfs_root, leak_list);
1689		btrfs_err(fs_info, "leaked root %s refcount %d",
1690			  btrfs_root_name(&root->root_key, buf),
1691			  refcount_read(&root->refs));
1692		while (refcount_read(&root->refs) > 1)
1693			btrfs_put_root(root);
1694		btrfs_put_root(root);
1695	}
1696#endif
1697}
1698
1699static void free_global_roots(struct btrfs_fs_info *fs_info)
1700{
1701	struct btrfs_root *root;
1702	struct rb_node *node;
1703
1704	while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1705		root = rb_entry(node, struct btrfs_root, rb_node);
1706		rb_erase(&root->rb_node, &fs_info->global_root_tree);
1707		btrfs_put_root(root);
1708	}
1709}
1710
1711void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1712{
1713	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1714	percpu_counter_destroy(&fs_info->delalloc_bytes);
1715	percpu_counter_destroy(&fs_info->ordered_bytes);
1716	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1717	btrfs_free_csum_hash(fs_info);
1718	btrfs_free_stripe_hash_table(fs_info);
1719	btrfs_free_ref_cache(fs_info);
1720	kfree(fs_info->balance_ctl);
1721	kfree(fs_info->delayed_root);
1722	free_global_roots(fs_info);
1723	btrfs_put_root(fs_info->tree_root);
1724	btrfs_put_root(fs_info->chunk_root);
1725	btrfs_put_root(fs_info->dev_root);
1726	btrfs_put_root(fs_info->quota_root);
1727	btrfs_put_root(fs_info->uuid_root);
1728	btrfs_put_root(fs_info->fs_root);
1729	btrfs_put_root(fs_info->data_reloc_root);
1730	btrfs_check_leaked_roots(fs_info);
1731	btrfs_extent_buffer_leak_debug_check(fs_info);
1732	kfree(fs_info->super_copy);
1733	kfree(fs_info->super_for_commit);
1734	kfree(fs_info->subpage_info);
1735	kvfree(fs_info);
1736}
1737
1738
1739/*
1740 * Get an in-memory reference of a root structure.
1741 *
1742 * For essential trees like root/extent tree, we grab it from fs_info directly.
1743 * For subvolume trees, we check the cached filesystem roots first. If not
1744 * found, then read it from disk and add it to cached fs roots.
1745 *
1746 * Caller should release the root by calling btrfs_put_root() after the usage.
1747 *
1748 * NOTE: Reloc and log trees can't be read by this function as they share the
1749 *	 same root objectid.
1750 *
1751 * @objectid:	root id
1752 * @anon_dev:	preallocated anonymous block device number for new roots,
1753 * 		pass 0 for new allocation.
1754 * @check_ref:	whether to check root item references, If true, return -ENOENT
1755 *		for orphan roots
1756 */
1757static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1758					     u64 objectid, dev_t anon_dev,
1759					     bool check_ref)
1760{
1761	struct btrfs_root *root;
1762	struct btrfs_path *path;
1763	struct btrfs_key key;
1764	int ret;
1765
1766	root = btrfs_get_global_root(fs_info, objectid);
1767	if (root)
1768		return root;
1769again:
1770	root = btrfs_lookup_fs_root(fs_info, objectid);
1771	if (root) {
1772		/* Shouldn't get preallocated anon_dev for cached roots */
1773		ASSERT(!anon_dev);
1774		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1775			btrfs_put_root(root);
1776			return ERR_PTR(-ENOENT);
1777		}
1778		return root;
1779	}
1780
1781	key.objectid = objectid;
1782	key.type = BTRFS_ROOT_ITEM_KEY;
1783	key.offset = (u64)-1;
1784	root = btrfs_read_tree_root(fs_info->tree_root, &key);
1785	if (IS_ERR(root))
1786		return root;
1787
1788	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1789		ret = -ENOENT;
1790		goto fail;
1791	}
1792
1793	ret = btrfs_init_fs_root(root, anon_dev);
1794	if (ret)
1795		goto fail;
1796
1797	path = btrfs_alloc_path();
1798	if (!path) {
1799		ret = -ENOMEM;
1800		goto fail;
1801	}
1802	key.objectid = BTRFS_ORPHAN_OBJECTID;
1803	key.type = BTRFS_ORPHAN_ITEM_KEY;
1804	key.offset = objectid;
1805
1806	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1807	btrfs_free_path(path);
1808	if (ret < 0)
1809		goto fail;
1810	if (ret == 0)
1811		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1812
1813	ret = btrfs_insert_fs_root(fs_info, root);
1814	if (ret) {
1815		btrfs_put_root(root);
1816		if (ret == -EEXIST)
1817			goto again;
1818		goto fail;
1819	}
1820	return root;
1821fail:
1822	/*
1823	 * If our caller provided us an anonymous device, then it's his
1824	 * responsability to free it in case we fail. So we have to set our
1825	 * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
1826	 * and once again by our caller.
1827	 */
1828	if (anon_dev)
1829		root->anon_dev = 0;
1830	btrfs_put_root(root);
1831	return ERR_PTR(ret);
1832}
1833
1834/*
1835 * Get in-memory reference of a root structure
1836 *
1837 * @objectid:	tree objectid
1838 * @check_ref:	if set, verify that the tree exists and the item has at least
1839 *		one reference
1840 */
1841struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1842				     u64 objectid, bool check_ref)
1843{
1844	return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1845}
1846
1847/*
1848 * Get in-memory reference of a root structure, created as new, optionally pass
1849 * the anonymous block device id
1850 *
1851 * @objectid:	tree objectid
1852 * @anon_dev:	if zero, allocate a new anonymous block device or use the
1853 *		parameter value
1854 */
1855struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1856					 u64 objectid, dev_t anon_dev)
1857{
1858	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1859}
1860
1861/*
1862 * btrfs_get_fs_root_commit_root - return a root for the given objectid
1863 * @fs_info:	the fs_info
1864 * @objectid:	the objectid we need to lookup
1865 *
1866 * This is exclusively used for backref walking, and exists specifically because
1867 * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
1868 * creation time, which means we may have to read the tree_root in order to look
1869 * up a fs root that is not in memory.  If the root is not in memory we will
1870 * read the tree root commit root and look up the fs root from there.  This is a
1871 * temporary root, it will not be inserted into the radix tree as it doesn't
1872 * have the most uptodate information, it'll simply be discarded once the
1873 * backref code is finished using the root.
1874 */
1875struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1876						 struct btrfs_path *path,
1877						 u64 objectid)
1878{
1879	struct btrfs_root *root;
1880	struct btrfs_key key;
1881
1882	ASSERT(path->search_commit_root && path->skip_locking);
1883
1884	/*
1885	 * This can return -ENOENT if we ask for a root that doesn't exist, but
1886	 * since this is called via the backref walking code we won't be looking
1887	 * up a root that doesn't exist, unless there's corruption.  So if root
1888	 * != NULL just return it.
1889	 */
1890	root = btrfs_get_global_root(fs_info, objectid);
1891	if (root)
1892		return root;
1893
1894	root = btrfs_lookup_fs_root(fs_info, objectid);
1895	if (root)
1896		return root;
1897
1898	key.objectid = objectid;
1899	key.type = BTRFS_ROOT_ITEM_KEY;
1900	key.offset = (u64)-1;
1901	root = read_tree_root_path(fs_info->tree_root, path, &key);
1902	btrfs_release_path(path);
1903
1904	return root;
1905}
1906
1907/*
1908 * called by the kthread helper functions to finally call the bio end_io
1909 * functions.  This is where read checksum verification actually happens
1910 */
1911static void end_workqueue_fn(struct btrfs_work *work)
1912{
1913	struct bio *bio;
1914	struct btrfs_end_io_wq *end_io_wq;
1915
1916	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1917	bio = end_io_wq->bio;
1918
1919	bio->bi_status = end_io_wq->status;
1920	bio->bi_private = end_io_wq->private;
1921	bio->bi_end_io = end_io_wq->end_io;
1922	bio_endio(bio);
1923	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1924}
1925
1926static int cleaner_kthread(void *arg)
1927{
1928	struct btrfs_root *root = arg;
1929	struct btrfs_fs_info *fs_info = root->fs_info;
1930	int again;
1931
1932	while (1) {
1933		again = 0;
1934
1935		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1936
1937		/* Make the cleaner go to sleep early. */
1938		if (btrfs_need_cleaner_sleep(fs_info))
1939			goto sleep;
1940
1941		/*
1942		 * Do not do anything if we might cause open_ctree() to block
1943		 * before we have finished mounting the filesystem.
1944		 */
1945		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1946			goto sleep;
1947
1948		if (!mutex_trylock(&fs_info->cleaner_mutex))
1949			goto sleep;
1950
1951		/*
1952		 * Avoid the problem that we change the status of the fs
1953		 * during the above check and trylock.
1954		 */
1955		if (btrfs_need_cleaner_sleep(fs_info)) {
1956			mutex_unlock(&fs_info->cleaner_mutex);
1957			goto sleep;
1958		}
1959
1960		btrfs_run_delayed_iputs(fs_info);
1961
1962		again = btrfs_clean_one_deleted_snapshot(root);
1963		mutex_unlock(&fs_info->cleaner_mutex);
1964
1965		/*
1966		 * The defragger has dealt with the R/O remount and umount,
1967		 * needn't do anything special here.
1968		 */
1969		btrfs_run_defrag_inodes(fs_info);
1970
1971		/*
1972		 * Acquires fs_info->reclaim_bgs_lock to avoid racing
1973		 * with relocation (btrfs_relocate_chunk) and relocation
1974		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1975		 * after acquiring fs_info->reclaim_bgs_lock. So we
1976		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1977		 * unused block groups.
1978		 */
1979		btrfs_delete_unused_bgs(fs_info);
1980
1981		/*
1982		 * Reclaim block groups in the reclaim_bgs list after we deleted
1983		 * all unused block_groups. This possibly gives us some more free
1984		 * space.
1985		 */
1986		btrfs_reclaim_bgs(fs_info);
1987sleep:
1988		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1989		if (kthread_should_park())
1990			kthread_parkme();
1991		if (kthread_should_stop())
1992			return 0;
1993		if (!again) {
1994			set_current_state(TASK_INTERRUPTIBLE);
1995			schedule();
1996			__set_current_state(TASK_RUNNING);
1997		}
1998	}
1999}
2000
2001static int transaction_kthread(void *arg)
2002{
2003	struct btrfs_root *root = arg;
2004	struct btrfs_fs_info *fs_info = root->fs_info;
2005	struct btrfs_trans_handle *trans;
2006	struct btrfs_transaction *cur;
2007	u64 transid;
2008	time64_t delta;
2009	unsigned long delay;
2010	bool cannot_commit;
2011
2012	do {
2013		cannot_commit = false;
2014		delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
2015		mutex_lock(&fs_info->transaction_kthread_mutex);
2016
2017		spin_lock(&fs_info->trans_lock);
2018		cur = fs_info->running_transaction;
2019		if (!cur) {
2020			spin_unlock(&fs_info->trans_lock);
2021			goto sleep;
2022		}
2023
2024		delta = ktime_get_seconds() - cur->start_time;
2025		if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
2026		    cur->state < TRANS_STATE_COMMIT_START &&
2027		    delta < fs_info->commit_interval) {
2028			spin_unlock(&fs_info->trans_lock);
2029			delay -= msecs_to_jiffies((delta - 1) * 1000);
2030			delay = min(delay,
2031				    msecs_to_jiffies(fs_info->commit_interval * 1000));
2032			goto sleep;
2033		}
2034		transid = cur->transid;
2035		spin_unlock(&fs_info->trans_lock);
2036
2037		/* If the file system is aborted, this will always fail. */
2038		trans = btrfs_attach_transaction(root);
2039		if (IS_ERR(trans)) {
2040			if (PTR_ERR(trans) != -ENOENT)
2041				cannot_commit = true;
2042			goto sleep;
2043		}
2044		if (transid == trans->transid) {
2045			btrfs_commit_transaction(trans);
2046		} else {
2047			btrfs_end_transaction(trans);
2048		}
2049sleep:
2050		wake_up_process(fs_info->cleaner_kthread);
2051		mutex_unlock(&fs_info->transaction_kthread_mutex);
2052
2053		if (BTRFS_FS_ERROR(fs_info))
2054			btrfs_cleanup_transaction(fs_info);
2055		if (!kthread_should_stop() &&
2056				(!btrfs_transaction_blocked(fs_info) ||
2057				 cannot_commit))
2058			schedule_timeout_interruptible(delay);
2059	} while (!kthread_should_stop());
2060	return 0;
2061}
2062
2063/*
2064 * This will find the highest generation in the array of root backups.  The
2065 * index of the highest array is returned, or -EINVAL if we can't find
2066 * anything.
2067 *
2068 * We check to make sure the array is valid by comparing the
2069 * generation of the latest  root in the array with the generation
2070 * in the super block.  If they don't match we pitch it.
2071 */
2072static int find_newest_super_backup(struct btrfs_fs_info *info)
2073{
2074	const u64 newest_gen = btrfs_super_generation(info->super_copy);
2075	u64 cur;
2076	struct btrfs_root_backup *root_backup;
2077	int i;
2078
2079	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2080		root_backup = info->super_copy->super_roots + i;
2081		cur = btrfs_backup_tree_root_gen(root_backup);
2082		if (cur == newest_gen)
2083			return i;
2084	}
2085
2086	return -EINVAL;
2087}
2088
2089/*
2090 * copy all the root pointers into the super backup array.
2091 * this will bump the backup pointer by one when it is
2092 * done
2093 */
2094static void backup_super_roots(struct btrfs_fs_info *info)
2095{
2096	const int next_backup = info->backup_root_index;
2097	struct btrfs_root_backup *root_backup;
2098	struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
2099	struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
2100
2101	root_backup = info->super_for_commit->super_roots + next_backup;
2102
2103	/*
2104	 * make sure all of our padding and empty slots get zero filled
2105	 * regardless of which ones we use today
2106	 */
2107	memset(root_backup, 0, sizeof(*root_backup));
2108
2109	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
2110
2111	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
2112	btrfs_set_backup_tree_root_gen(root_backup,
2113			       btrfs_header_generation(info->tree_root->node));
2114
2115	btrfs_set_backup_tree_root_level(root_backup,
2116			       btrfs_header_level(info->tree_root->node));
2117
2118	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
2119	btrfs_set_backup_chunk_root_gen(root_backup,
2120			       btrfs_header_generation(info->chunk_root->node));
2121	btrfs_set_backup_chunk_root_level(root_backup,
2122			       btrfs_header_level(info->chunk_root->node));
2123
2124	btrfs_set_backup_extent_root(root_backup, extent_root->node->start);
2125	btrfs_set_backup_extent_root_gen(root_backup,
2126			       btrfs_header_generation(extent_root->node));
2127	btrfs_set_backup_extent_root_level(root_backup,
2128			       btrfs_header_level(extent_root->node));
2129
2130	/*
2131	 * we might commit during log recovery, which happens before we set
2132	 * the fs_root.  Make sure it is valid before we fill it in.
2133	 */
2134	if (info->fs_root && info->fs_root->node) {
2135		btrfs_set_backup_fs_root(root_backup,
2136					 info->fs_root->node->start);
2137		btrfs_set_backup_fs_root_gen(root_backup,
2138			       btrfs_header_generation(info->fs_root->node));
2139		btrfs_set_backup_fs_root_level(root_backup,
2140			       btrfs_header_level(info->fs_root->node));
2141	}
2142
2143	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2144	btrfs_set_backup_dev_root_gen(root_backup,
2145			       btrfs_header_generation(info->dev_root->node));
2146	btrfs_set_backup_dev_root_level(root_backup,
2147				       btrfs_header_level(info->dev_root->node));
2148
2149	btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
2150	btrfs_set_backup_csum_root_gen(root_backup,
2151				       btrfs_header_generation(csum_root->node));
2152	btrfs_set_backup_csum_root_level(root_backup,
2153					 btrfs_header_level(csum_root->node));
2154
2155	btrfs_set_backup_total_bytes(root_backup,
2156			     btrfs_super_total_bytes(info->super_copy));
2157	btrfs_set_backup_bytes_used(root_backup,
2158			     btrfs_super_bytes_used(info->super_copy));
2159	btrfs_set_backup_num_devices(root_backup,
2160			     btrfs_super_num_devices(info->super_copy));
2161
2162	/*
2163	 * if we don't copy this out to the super_copy, it won't get remembered
2164	 * for the next commit
2165	 */
2166	memcpy(&info->super_copy->super_roots,
2167	       &info->super_for_commit->super_roots,
2168	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2169}
2170
2171/*
2172 * read_backup_root - Reads a backup root based on the passed priority. Prio 0
2173 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
2174 *
2175 * fs_info - filesystem whose backup roots need to be read
2176 * priority - priority of backup root required
2177 *
2178 * Returns backup root index on success and -EINVAL otherwise.
2179 */
2180static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
2181{
2182	int backup_index = find_newest_super_backup(fs_info);
2183	struct btrfs_super_block *super = fs_info->super_copy;
2184	struct btrfs_root_backup *root_backup;
2185
2186	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
2187		if (priority == 0)
2188			return backup_index;
2189
2190		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
2191		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
2192	} else {
2193		return -EINVAL;
2194	}
2195
2196	root_backup = super->super_roots + backup_index;
2197
2198	btrfs_set_super_generation(super,
2199				   btrfs_backup_tree_root_gen(root_backup));
2200	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2201	btrfs_set_super_root_level(super,
2202				   btrfs_backup_tree_root_level(root_backup));
2203	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2204
2205	/*
2206	 * Fixme: the total bytes and num_devices need to match or we should
2207	 * need a fsck
2208	 */
2209	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2210	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2211
2212	return backup_index;
2213}
2214
2215/* helper to cleanup workers */
2216static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2217{
2218	btrfs_destroy_workqueue(fs_info->fixup_workers);
2219	btrfs_destroy_workqueue(fs_info->delalloc_workers);
2220	btrfs_destroy_workqueue(fs_info->workers);
2221	btrfs_destroy_workqueue(fs_info->endio_workers);
2222	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
2223	btrfs_destroy_workqueue(fs_info->rmw_workers);
2224	btrfs_destroy_workqueue(fs_info->endio_write_workers);
2225	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2226	btrfs_destroy_workqueue(fs_info->delayed_workers);
2227	btrfs_destroy_workqueue(fs_info->caching_workers);
2228	btrfs_destroy_workqueue(fs_info->flush_workers);
2229	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2230	if (fs_info->discard_ctl.discard_workers)
2231		destroy_workqueue(fs_info->discard_ctl.discard_workers);
2232	/*
2233	 * Now that all other work queues are destroyed, we can safely destroy
2234	 * the queues used for metadata I/O, since tasks from those other work
2235	 * queues can do metadata I/O operations.
2236	 */
2237	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
2238	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2239}
2240
2241static void free_root_extent_buffers(struct btrfs_root *root)
2242{
2243	if (root) {
2244		free_extent_buffer(root->node);
2245		free_extent_buffer(root->commit_root);
2246		root->node = NULL;
2247		root->commit_root = NULL;
2248	}
2249}
2250
2251static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
2252{
2253	struct btrfs_root *root, *tmp;
2254
2255	rbtree_postorder_for_each_entry_safe(root, tmp,
2256					     &fs_info->global_root_tree,
2257					     rb_node)
2258		free_root_extent_buffers(root);
2259}
2260
2261/* helper to cleanup tree roots */
2262static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
2263{
2264	free_root_extent_buffers(info->tree_root);
2265
2266	free_global_root_pointers(info);
2267	free_root_extent_buffers(info->dev_root);
2268	free_root_extent_buffers(info->quota_root);
2269	free_root_extent_buffers(info->uuid_root);
2270	free_root_extent_buffers(info->fs_root);
2271	free_root_extent_buffers(info->data_reloc_root);
2272	if (free_chunk_root)
2273		free_root_extent_buffers(info->chunk_root);
2274}
2275
2276void btrfs_put_root(struct btrfs_root *root)
2277{
2278	if (!root)
2279		return;
2280
2281	if (refcount_dec_and_test(&root->refs)) {
2282		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2283		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2284		if (root->anon_dev)
2285			free_anon_bdev(root->anon_dev);
2286		btrfs_drew_lock_destroy(&root->snapshot_lock);
2287		free_root_extent_buffers(root);
2288#ifdef CONFIG_BTRFS_DEBUG
2289		spin_lock(&root->fs_info->fs_roots_radix_lock);
2290		list_del_init(&root->leak_list);
2291		spin_unlock(&root->fs_info->fs_roots_radix_lock);
2292#endif
2293		kfree(root);
2294	}
2295}
2296
2297void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2298{
2299	int ret;
2300	struct btrfs_root *gang[8];
2301	int i;
2302
2303	while (!list_empty(&fs_info->dead_roots)) {
2304		gang[0] = list_entry(fs_info->dead_roots.next,
2305				     struct btrfs_root, root_list);
2306		list_del(&gang[0]->root_list);
2307
2308		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
2309			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2310		btrfs_put_root(gang[0]);
2311	}
2312
2313	while (1) {
2314		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2315					     (void **)gang, 0,
2316					     ARRAY_SIZE(gang));
2317		if (!ret)
2318			break;
2319		for (i = 0; i < ret; i++)
2320			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2321	}
2322}
2323
2324static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2325{
2326	mutex_init(&fs_info->scrub_lock);
2327	atomic_set(&fs_info->scrubs_running, 0);
2328	atomic_set(&fs_info->scrub_pause_req, 0);
2329	atomic_set(&fs_info->scrubs_paused, 0);
2330	atomic_set(&fs_info->scrub_cancel_req, 0);
2331	init_waitqueue_head(&fs_info->scrub_pause_wait);
2332	refcount_set(&fs_info->scrub_workers_refcnt, 0);
2333}
2334
2335static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2336{
2337	spin_lock_init(&fs_info->balance_lock);
2338	mutex_init(&fs_info->balance_mutex);
2339	atomic_set(&fs_info->balance_pause_req, 0);
2340	atomic_set(&fs_info->balance_cancel_req, 0);
2341	fs_info->balance_ctl = NULL;
2342	init_waitqueue_head(&fs_info->balance_wait_q);
2343	atomic_set(&fs_info->reloc_cancel_req, 0);
2344}
2345
2346static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2347{
2348	struct inode *inode = fs_info->btree_inode;
2349
2350	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2351	set_nlink(inode, 1);
2352	/*
2353	 * we set the i_size on the btree inode to the max possible int.
2354	 * the real end of the address space is determined by all of
2355	 * the devices in the system
2356	 */
2357	inode->i_size = OFFSET_MAX;
2358	inode->i_mapping->a_ops = &btree_aops;
2359
2360	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2361	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2362			    IO_TREE_BTREE_INODE_IO, inode);
2363	BTRFS_I(inode)->io_tree.track_uptodate = false;
2364	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2365
2366	BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
2367	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
2368	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
2369	btrfs_insert_inode_hash(inode);
2370}
2371
2372static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2373{
2374	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2375	init_rwsem(&fs_info->dev_replace.rwsem);
2376	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
2377}
2378
2379static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2380{
2381	spin_lock_init(&fs_info->qgroup_lock);
2382	mutex_init(&fs_info->qgroup_ioctl_lock);
2383	fs_info->qgroup_tree = RB_ROOT;
2384	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2385	fs_info->qgroup_seq = 1;
2386	fs_info->qgroup_ulist = NULL;
2387	fs_info->qgroup_rescan_running = false;
2388	mutex_init(&fs_info->qgroup_rescan_lock);
2389}
2390
2391static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
2392{
2393	u32 max_active = fs_info->thread_pool_size;
2394	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2395
2396	fs_info->workers =
2397		btrfs_alloc_workqueue(fs_info, "worker",
2398				      flags | WQ_HIGHPRI, max_active, 16);
2399
2400	fs_info->delalloc_workers =
2401		btrfs_alloc_workqueue(fs_info, "delalloc",
2402				      flags, max_active, 2);
2403
2404	fs_info->flush_workers =
2405		btrfs_alloc_workqueue(fs_info, "flush_delalloc",
2406				      flags, max_active, 0);
2407
2408	fs_info->caching_workers =
2409		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
2410
2411	fs_info->fixup_workers =
2412		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
2413
2414	/*
2415	 * endios are largely parallel and should have a very
2416	 * low idle thresh
2417	 */
2418	fs_info->endio_workers =
2419		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
2420	fs_info->endio_meta_workers =
2421		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
2422				      max_active, 4);
2423	fs_info->endio_meta_write_workers =
2424		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
2425				      max_active, 2);
2426	fs_info->endio_raid56_workers =
2427		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
2428				      max_active, 4);
2429	fs_info->rmw_workers =
2430		btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
2431	fs_info->endio_write_workers =
2432		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
2433				      max_active, 2);
2434	fs_info->endio_freespace_worker =
2435		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
2436				      max_active, 0);
2437	fs_info->delayed_workers =
2438		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
2439				      max_active, 0);
2440	fs_info->qgroup_rescan_workers =
2441		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2442	fs_info->discard_ctl.discard_workers =
2443		alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
2444
2445	if (!(fs_info->workers && fs_info->delalloc_workers &&
2446	      fs_info->flush_workers &&
2447	      fs_info->endio_workers && fs_info->endio_meta_workers &&
2448	      fs_info->endio_meta_write_workers &&
2449	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2450	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2451	      fs_info->caching_workers && fs_info->fixup_workers &&
2452	      fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
2453	      fs_info->discard_ctl.discard_workers)) {
2454		return -ENOMEM;
2455	}
2456
2457	return 0;
2458}
2459
2460static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2461{
2462	struct crypto_shash *csum_shash;
2463	const char *csum_driver = btrfs_super_csum_driver(csum_type);
2464
2465	csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2466
2467	if (IS_ERR(csum_shash)) {
2468		btrfs_err(fs_info, "error allocating %s hash for checksum",
2469			  csum_driver);
2470		return PTR_ERR(csum_shash);
2471	}
2472
2473	fs_info->csum_shash = csum_shash;
2474
2475	return 0;
2476}
2477
2478static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2479			    struct btrfs_fs_devices *fs_devices)
2480{
2481	int ret;
2482	struct btrfs_root *log_tree_root;
2483	struct btrfs_super_block *disk_super = fs_info->super_copy;
2484	u64 bytenr = btrfs_super_log_root(disk_super);
2485	int level = btrfs_super_log_root_level(disk_super);
2486
2487	if (fs_devices->rw_devices == 0) {
2488		btrfs_warn(fs_info, "log replay required on RO media");
2489		return -EIO;
2490	}
2491
2492	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2493					 GFP_KERNEL);
2494	if (!log_tree_root)
2495		return -ENOMEM;
2496
2497	log_tree_root->node = read_tree_block(fs_info, bytenr,
2498					      BTRFS_TREE_LOG_OBJECTID,
2499					      fs_info->generation + 1, level,
2500					      NULL);
2501	if (IS_ERR(log_tree_root->node)) {
2502		btrfs_warn(fs_info, "failed to read log tree");
2503		ret = PTR_ERR(log_tree_root->node);
2504		log_tree_root->node = NULL;
2505		btrfs_put_root(log_tree_root);
2506		return ret;
2507	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
2508		btrfs_err(fs_info, "failed to read log tree");
2509		btrfs_put_root(log_tree_root);
2510		return -EIO;
2511	}
2512	/* returns with log_tree_root freed on success */
2513	ret = btrfs_recover_log_trees(log_tree_root);
2514	if (ret) {
2515		btrfs_handle_fs_error(fs_info, ret,
2516				      "Failed to recover log tree");
2517		btrfs_put_root(log_tree_root);
2518		return ret;
2519	}
2520
2521	if (sb_rdonly(fs_info->sb)) {
2522		ret = btrfs_commit_super(fs_info);
2523		if (ret)
2524			return ret;
2525	}
2526
2527	return 0;
2528}
2529
2530static int load_global_roots_objectid(struct btrfs_root *tree_root,
2531				      struct btrfs_path *path, u64 objectid,
2532				      const char *name)
2533{
2534	struct btrfs_fs_info *fs_info = tree_root->fs_info;
2535	struct btrfs_root *root;
2536	int ret;
2537	struct btrfs_key key = {
2538		.objectid = objectid,
2539		.type = BTRFS_ROOT_ITEM_KEY,
2540		.offset = 0,
2541	};
2542	bool found = false;
2543
2544	/* If we have IGNOREDATACSUMS skip loading these roots. */
2545	if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2546	    btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2547		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2548		return 0;
2549	}
2550
2551	while (1) {
2552		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2553		if (ret < 0)
2554			break;
2555
2556		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2557			ret = btrfs_next_leaf(tree_root, path);
2558			if (ret) {
2559				if (ret > 0)
2560					ret = 0;
2561				break;
2562			}
2563		}
2564		ret = 0;
2565
2566		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2567		if (key.objectid != objectid)
2568			break;
2569		btrfs_release_path(path);
2570
2571		found = true;
2572		root = read_tree_root_path(tree_root, path, &key);
2573		if (IS_ERR(root)) {
2574			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2575				ret = PTR_ERR(root);
2576			break;
2577		}
2578		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2579		ret = btrfs_global_root_insert(root);
2580		if (ret) {
2581			btrfs_put_root(root);
2582			break;
2583		}
2584		key.offset++;
2585	}
2586	btrfs_release_path(path);
2587
2588	if (!found || ret) {
2589		if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2590			set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2591
2592		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2593			ret = ret ? ret : -ENOENT;
2594		else
2595			ret = 0;
2596		btrfs_err(fs_info, "failed to load root %s", name);
2597	}
2598	return ret;
2599}
2600
2601static int load_global_roots(struct btrfs_root *tree_root)
2602{
2603	struct btrfs_path *path;
2604	int ret = 0;
2605
2606	path = btrfs_alloc_path();
2607	if (!path)
2608		return -ENOMEM;
2609
2610	ret = load_global_roots_objectid(tree_root, path,
2611					 BTRFS_EXTENT_TREE_OBJECTID, "extent");
2612	if (ret)
2613		goto out;
2614	ret = load_global_roots_objectid(tree_root, path,
2615					 BTRFS_CSUM_TREE_OBJECTID, "csum");
2616	if (ret)
2617		goto out;
2618	if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2619		goto out;
2620	ret = load_global_roots_objectid(tree_root, path,
2621					 BTRFS_FREE_SPACE_TREE_OBJECTID,
2622					 "free space");
2623out:
2624	btrfs_free_path(path);
2625	return ret;
2626}
2627
2628static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2629{
2630	struct btrfs_root *tree_root = fs_info->tree_root;
2631	struct btrfs_root *root;
2632	struct btrfs_key location;
2633	int ret;
2634
2635	BUG_ON(!fs_info->tree_root);
2636
2637	ret = load_global_roots(tree_root);
2638	if (ret)
2639		return ret;
2640
2641	location.objectid = BTRFS_DEV_TREE_OBJECTID;
2642	location.type = BTRFS_ROOT_ITEM_KEY;
2643	location.offset = 0;
2644
2645	root = btrfs_read_tree_root(tree_root, &location);
2646	if (IS_ERR(root)) {
2647		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2648			ret = PTR_ERR(root);
2649			goto out;
2650		}
2651	} else {
2652		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2653		fs_info->dev_root = root;
2654	}
2655	/* Initialize fs_info for all devices in any case */
2656	btrfs_init_devices_late(fs_info);
2657
2658	/*
2659	 * This tree can share blocks with some other fs tree during relocation
2660	 * and we need a proper setup by btrfs_get_fs_root
2661	 */
2662	root = btrfs_get_fs_root(tree_root->fs_info,
2663				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2664	if (IS_ERR(root)) {
2665		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2666			ret = PTR_ERR(root);
2667			goto out;
2668		}
2669	} else {
2670		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2671		fs_info->data_reloc_root = root;
2672	}
2673
2674	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2675	root = btrfs_read_tree_root(tree_root, &location);
2676	if (!IS_ERR(root)) {
2677		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2678		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2679		fs_info->quota_root = root;
2680	}
2681
2682	location.objectid = BTRFS_UUID_TREE_OBJECTID;
2683	root = btrfs_read_tree_root(tree_root, &location);
2684	if (IS_ERR(root)) {
2685		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2686			ret = PTR_ERR(root);
2687			if (ret != -ENOENT)
2688				goto out;
2689		}
2690	} else {
2691		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2692		fs_info->uuid_root = root;
2693	}
2694
2695	return 0;
2696out:
2697	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2698		   location.objectid, ret);
2699	return ret;
2700}
2701
2702/*
2703 * Real super block validation
2704 * NOTE: super csum type and incompat features will not be checked here.
2705 *
2706 * @sb:		super block to check
2707 * @mirror_num:	the super block number to check its bytenr:
2708 * 		0	the primary (1st) sb
2709 * 		1, 2	2nd and 3rd backup copy
2710 * 	       -1	skip bytenr check
2711 */
2712static int validate_super(struct btrfs_fs_info *fs_info,
2713			    struct btrfs_super_block *sb, int mirror_num)
2714{
2715	u64 nodesize = btrfs_super_nodesize(sb);
2716	u64 sectorsize = btrfs_super_sectorsize(sb);
2717	int ret = 0;
2718
2719	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2720		btrfs_err(fs_info, "no valid FS found");
2721		ret = -EINVAL;
2722	}
2723	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2724		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2725				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2726		ret = -EINVAL;
2727	}
2728	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2729		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2730				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2731		ret = -EINVAL;
2732	}
2733	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2734		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2735				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2736		ret = -EINVAL;
2737	}
2738	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2739		btrfs_err(fs_info, "log_root level too big: %d >= %d",
2740				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2741		ret = -EINVAL;
2742	}
2743
2744	/*
2745	 * Check sectorsize and nodesize first, other check will need it.
2746	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
2747	 */
2748	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2749	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2750		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2751		ret = -EINVAL;
2752	}
2753
2754	/*
2755	 * For 4K page size, we only support 4K sector size.
2756	 * For 64K page size, we support 64K and 4K sector sizes.
2757	 */
2758	if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
2759	    (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
2760				     sectorsize != SZ_64K))) {
2761		btrfs_err(fs_info,
2762			"sectorsize %llu not yet supported for page size %lu",
2763			sectorsize, PAGE_SIZE);
2764		ret = -EINVAL;
2765	}
2766
2767	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2768	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2769		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2770		ret = -EINVAL;
2771	}
2772	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2773		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2774			  le32_to_cpu(sb->__unused_leafsize), nodesize);
2775		ret = -EINVAL;
2776	}
2777
2778	/* Root alignment check */
2779	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2780		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2781			   btrfs_super_root(sb));
2782		ret = -EINVAL;
2783	}
2784	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2785		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2786			   btrfs_super_chunk_root(sb));
2787		ret = -EINVAL;
2788	}
2789	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2790		btrfs_warn(fs_info, "log_root block unaligned: %llu",
2791			   btrfs_super_log_root(sb));
2792		ret = -EINVAL;
2793	}
2794
2795	if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2796		   BTRFS_FSID_SIZE)) {
2797		btrfs_err(fs_info,
2798		"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2799			fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
2800		ret = -EINVAL;
2801	}
2802
2803	if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
2804	    memcmp(fs_info->fs_devices->metadata_uuid,
2805		   fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
2806		btrfs_err(fs_info,
2807"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2808			fs_info->super_copy->metadata_uuid,
2809			fs_info->fs_devices->metadata_uuid);
2810		ret = -EINVAL;
2811	}
2812
2813	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2814		   BTRFS_FSID_SIZE) != 0) {
2815		btrfs_err(fs_info,
2816			"dev_item UUID does not match metadata fsid: %pU != %pU",
2817			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2818		ret = -EINVAL;
2819	}
2820
2821	/*
2822	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
2823	 * done later
2824	 */
2825	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2826		btrfs_err(fs_info, "bytes_used is too small %llu",
2827			  btrfs_super_bytes_used(sb));
2828		ret = -EINVAL;
2829	}
2830	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2831		btrfs_err(fs_info, "invalid stripesize %u",
2832			  btrfs_super_stripesize(sb));
2833		ret = -EINVAL;
2834	}
2835	if (btrfs_super_num_devices(sb) > (1UL << 31))
2836		btrfs_warn(fs_info, "suspicious number of devices: %llu",
2837			   btrfs_super_num_devices(sb));
2838	if (btrfs_super_num_devices(sb) == 0) {
2839		btrfs_err(fs_info, "number of devices is 0");
2840		ret = -EINVAL;
2841	}
2842
2843	if (mirror_num >= 0 &&
2844	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2845		btrfs_err(fs_info, "super offset mismatch %llu != %u",
2846			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2847		ret = -EINVAL;
2848	}
2849
2850	/*
2851	 * Obvious sys_chunk_array corruptions, it must hold at least one key
2852	 * and one chunk
2853	 */
2854	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2855		btrfs_err(fs_info, "system chunk array too big %u > %u",
2856			  btrfs_super_sys_array_size(sb),
2857			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2858		ret = -EINVAL;
2859	}
2860	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2861			+ sizeof(struct btrfs_chunk)) {
2862		btrfs_err(fs_info, "system chunk array too small %u < %zu",
2863			  btrfs_super_sys_array_size(sb),
2864			  sizeof(struct btrfs_disk_key)
2865			  + sizeof(struct btrfs_chunk));
2866		ret = -EINVAL;
2867	}
2868
2869	/*
2870	 * The generation is a global counter, we'll trust it more than the others
2871	 * but it's still possible that it's the one that's wrong.
2872	 */
2873	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2874		btrfs_warn(fs_info,
2875			"suspicious: generation < chunk_root_generation: %llu < %llu",
2876			btrfs_super_generation(sb),
2877			btrfs_super_chunk_root_generation(sb));
2878	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2879	    && btrfs_super_cache_generation(sb) != (u64)-1)
2880		btrfs_warn(fs_info,
2881			"suspicious: generation < cache_generation: %llu < %llu",
2882			btrfs_super_generation(sb),
2883			btrfs_super_cache_generation(sb));
2884
2885	return ret;
2886}
2887
2888/*
2889 * Validation of super block at mount time.
2890 * Some checks already done early at mount time, like csum type and incompat
2891 * flags will be skipped.
2892 */
2893static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2894{
2895	return validate_super(fs_info, fs_info->super_copy, 0);
2896}
2897
2898/*
2899 * Validation of super block at write time.
2900 * Some checks like bytenr check will be skipped as their values will be
2901 * overwritten soon.
2902 * Extra checks like csum type and incompat flags will be done here.
2903 */
2904static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2905				      struct btrfs_super_block *sb)
2906{
2907	int ret;
2908
2909	ret = validate_super(fs_info, sb, -1);
2910	if (ret < 0)
2911		goto out;
2912	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2913		ret = -EUCLEAN;
2914		btrfs_err(fs_info, "invalid csum type, has %u want %u",
2915			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2916		goto out;
2917	}
2918	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2919		ret = -EUCLEAN;
2920		btrfs_err(fs_info,
2921		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
2922			  btrfs_super_incompat_flags(sb),
2923			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2924		goto out;
2925	}
2926out:
2927	if (ret < 0)
2928		btrfs_err(fs_info,
2929		"super block corruption detected before writing it to disk");
2930	return ret;
2931}
2932
2933static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2934{
2935	int backup_index = find_newest_super_backup(fs_info);
2936	struct btrfs_super_block *sb = fs_info->super_copy;
2937	struct btrfs_root *tree_root = fs_info->tree_root;
2938	bool handle_error = false;
2939	int ret = 0;
2940	int i;
2941
2942	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2943		u64 generation;
2944		int level;
2945
2946		if (handle_error) {
2947			if (!IS_ERR(tree_root->node))
2948				free_extent_buffer(tree_root->node);
2949			tree_root->node = NULL;
2950
2951			if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2952				break;
2953
2954			free_root_pointers(fs_info, 0);
2955
2956			/*
2957			 * Don't use the log in recovery mode, it won't be
2958			 * valid
2959			 */
2960			btrfs_set_super_log_root(sb, 0);
2961
2962			/* We can't trust the free space cache either */
2963			btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2964
2965			ret = read_backup_root(fs_info, i);
2966			backup_index = ret;
2967			if (ret < 0)
2968				return ret;
2969		}
2970		generation = btrfs_super_generation(sb);
2971		level = btrfs_super_root_level(sb);
2972		tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
2973						  BTRFS_ROOT_TREE_OBJECTID,
2974						  generation, level, NULL);
2975		if (IS_ERR(tree_root->node)) {
2976			handle_error = true;
2977			ret = PTR_ERR(tree_root->node);
2978			tree_root->node = NULL;
2979			btrfs_warn(fs_info, "couldn't read tree root");
2980			continue;
2981
2982		} else if (!extent_buffer_uptodate(tree_root->node)) {
2983			handle_error = true;
2984			ret = -EIO;
2985			btrfs_warn(fs_info, "error while reading tree root");
2986			continue;
2987		}
2988
2989		btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2990		tree_root->commit_root = btrfs_root_node(tree_root);
2991		btrfs_set_root_refs(&tree_root->root_item, 1);
2992
2993		/*
2994		 * No need to hold btrfs_root::objectid_mutex since the fs
2995		 * hasn't been fully initialised and we are the only user
2996		 */
2997		ret = btrfs_init_root_free_objectid(tree_root);
2998		if (ret < 0) {
2999			handle_error = true;
3000			continue;
3001		}
3002
3003		ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
3004
3005		ret = btrfs_read_roots(fs_info);
3006		if (ret < 0) {
3007			handle_error = true;
3008			continue;
3009		}
3010
3011		/* All successful */
3012		fs_info->generation = generation;
3013		fs_info->last_trans_committed = generation;
3014		fs_info->last_reloc_trans = 0;
3015
3016		/* Always begin writing backup roots after the one being used */
3017		if (backup_index < 0) {
3018			fs_info->backup_root_index = 0;
3019		} else {
3020			fs_info->backup_root_index = backup_index + 1;
3021			fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
3022		}
3023		break;
3024	}
3025
3026	return ret;
3027}
3028
3029void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
3030{
3031	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
3032	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
3033	INIT_LIST_HEAD(&fs_info->trans_list);
3034	INIT_LIST_HEAD(&fs_info->dead_roots);
3035	INIT_LIST_HEAD(&fs_info->delayed_iputs);
3036	INIT_LIST_HEAD(&fs_info->delalloc_roots);
3037	INIT_LIST_HEAD(&fs_info->caching_block_groups);
3038	spin_lock_init(&fs_info->delalloc_root_lock);
3039	spin_lock_init(&fs_info->trans_lock);
3040	spin_lock_init(&fs_info->fs_roots_radix_lock);
3041	spin_lock_init(&fs_info->delayed_iput_lock);
3042	spin_lock_init(&fs_info->defrag_inodes_lock);
3043	spin_lock_init(&fs_info->super_lock);
3044	spin_lock_init(&fs_info->buffer_lock);
3045	spin_lock_init(&fs_info->unused_bgs_lock);
3046	spin_lock_init(&fs_info->treelog_bg_lock);
3047	spin_lock_init(&fs_info->zone_active_bgs_lock);
3048	spin_lock_init(&fs_info->relocation_bg_lock);
3049	rwlock_init(&fs_info->tree_mod_log_lock);
3050	rwlock_init(&fs_info->global_root_lock);
3051	mutex_init(&fs_info->unused_bg_unpin_mutex);
3052	mutex_init(&fs_info->reclaim_bgs_lock);
3053	mutex_init(&fs_info->reloc_mutex);
3054	mutex_init(&fs_info->delalloc_root_mutex);
3055	mutex_init(&fs_info->zoned_meta_io_lock);
3056	seqlock_init(&fs_info->profiles_lock);
3057
3058	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
3059	INIT_LIST_HEAD(&fs_info->space_info);
3060	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
3061	INIT_LIST_HEAD(&fs_info->unused_bgs);
3062	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
3063	INIT_LIST_HEAD(&fs_info->zone_active_bgs);
3064#ifdef CONFIG_BTRFS_DEBUG
3065	INIT_LIST_HEAD(&fs_info->allocated_roots);
3066	INIT_LIST_HEAD(&fs_info->allocated_ebs);
3067	spin_lock_init(&fs_info->eb_leak_lock);
3068#endif
3069	extent_map_tree_init(&fs_info->mapping_tree);
3070	btrfs_init_block_rsv(&fs_info->global_block_rsv,
3071			     BTRFS_BLOCK_RSV_GLOBAL);
3072	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
3073	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
3074	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
3075	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
3076			     BTRFS_BLOCK_RSV_DELOPS);
3077	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
3078			     BTRFS_BLOCK_RSV_DELREFS);
3079
3080	atomic_set(&fs_info->async_delalloc_pages, 0);
3081	atomic_set(&fs_info->defrag_running, 0);
3082	atomic_set(&fs_info->nr_delayed_iputs, 0);
3083	atomic64_set(&fs_info->tree_mod_seq, 0);
3084	fs_info->global_root_tree = RB_ROOT;
3085	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
3086	fs_info->metadata_ratio = 0;
3087	fs_info->defrag_inodes = RB_ROOT;
3088	atomic64_set(&fs_info->free_chunk_space, 0);
3089	fs_info->tree_mod_log = RB_ROOT;
3090	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
3091	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
3092	btrfs_init_ref_verify(fs_info);
3093
3094	fs_info->thread_pool_size = min_t(unsigned long,
3095					  num_online_cpus() + 2, 8);
3096
3097	INIT_LIST_HEAD(&fs_info->ordered_roots);
3098	spin_lock_init(&fs_info->ordered_root_lock);
3099
3100	btrfs_init_scrub(fs_info);
3101#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3102	fs_info->check_integrity_print_mask = 0;
3103#endif
3104	btrfs_init_balance(fs_info);
3105	btrfs_init_async_reclaim_work(fs_info);
3106
3107	spin_lock_init(&fs_info->block_group_cache_lock);
3108	fs_info->block_group_cache_tree = RB_ROOT;
3109	fs_info->first_logical_byte = (u64)-1;
3110
3111	extent_io_tree_init(fs_info, &fs_info->excluded_extents,
3112			    IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
3113
3114	mutex_init(&fs_info->ordered_operations_mutex);
3115	mutex_init(&fs_info->tree_log_mutex);
3116	mutex_init(&fs_info->chunk_mutex);
3117	mutex_init(&fs_info->transaction_kthread_mutex);
3118	mutex_init(&fs_info->cleaner_mutex);
3119	mutex_init(&fs_info->ro_block_group_mutex);
3120	init_rwsem(&fs_info->commit_root_sem);
3121	init_rwsem(&fs_info->cleanup_work_sem);
3122	init_rwsem(&fs_info->subvol_sem);
3123	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
3124
3125	btrfs_init_dev_replace_locks(fs_info);
3126	btrfs_init_qgroup(fs_info);
3127	btrfs_discard_init(fs_info);
3128
3129	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
3130	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
3131
3132	init_waitqueue_head(&fs_info->transaction_throttle);
3133	init_waitqueue_head(&fs_info->transaction_wait);
3134	init_waitqueue_head(&fs_info->transaction_blocked_wait);
3135	init_waitqueue_head(&fs_info->async_submit_wait);
3136	init_waitqueue_head(&fs_info->delayed_iputs_wait);
3137
3138	/* Usable values until the real ones are cached from the superblock */
3139	fs_info->nodesize = 4096;
3140	fs_info->sectorsize = 4096;
3141	fs_info->sectorsize_bits = ilog2(4096);
3142	fs_info->stripesize = 4096;
3143
3144	spin_lock_init(&fs_info->swapfile_pins_lock);
3145	fs_info->swapfile_pins = RB_ROOT;
3146
3147	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
3148	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
3149}
3150
3151static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
3152{
3153	int ret;
3154
3155	fs_info->sb = sb;
3156	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
3157	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
3158
3159	ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
3160	if (ret)
3161		return ret;
3162
3163	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
3164	if (ret)
3165		return ret;
3166
3167	fs_info->dirty_metadata_batch = PAGE_SIZE *
3168					(1 + ilog2(nr_cpu_ids));
3169
3170	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
3171	if (ret)
3172		return ret;
3173
3174	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
3175			GFP_KERNEL);
3176	if (ret)
3177		return ret;
3178
3179	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
3180					GFP_KERNEL);
3181	if (!fs_info->delayed_root)
3182		return -ENOMEM;
3183	btrfs_init_delayed_root(fs_info->delayed_root);
3184
3185	if (sb_rdonly(sb))
3186		set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
3187
3188	return btrfs_alloc_stripe_hash_table(fs_info);
3189}
3190
3191static int btrfs_uuid_rescan_kthread(void *data)
3192{
3193	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3194	int ret;
3195
3196	/*
3197	 * 1st step is to iterate through the existing UUID tree and
3198	 * to delete all entries that contain outdated data.
3199	 * 2nd step is to add all missing entries to the UUID tree.
3200	 */
3201	ret = btrfs_uuid_tree_iterate(fs_info);
3202	if (ret < 0) {
3203		if (ret != -EINTR)
3204			btrfs_warn(fs_info, "iterating uuid_tree failed %d",
3205				   ret);
3206		up(&fs_info->uuid_tree_rescan_sem);
3207		return ret;
3208	}
3209	return btrfs_uuid_scan_kthread(data);
3210}
3211
3212static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3213{
3214	struct task_struct *task;
3215
3216	down(&fs_info->uuid_tree_rescan_sem);
3217	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3218	if (IS_ERR(task)) {
3219		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
3220		btrfs_warn(fs_info, "failed to start uuid_rescan task");
3221		up(&fs_info->uuid_tree_rescan_sem);
3222		return PTR_ERR(task);
3223	}
3224
3225	return 0;
3226}
3227
3228/*
3229 * Some options only have meaning at mount time and shouldn't persist across
3230 * remounts, or be displayed. Clear these at the end of mount and remount
3231 * code paths.
3232 */
3233void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
3234{
3235	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3236	btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
3237}
3238
3239/*
3240 * Mounting logic specific to read-write file systems. Shared by open_ctree
3241 * and btrfs_remount when remounting from read-only to read-write.
3242 */
3243int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3244{
3245	int ret;
3246	const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3247	bool clear_free_space_tree = false;
3248
3249	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3250	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3251		clear_free_space_tree = true;
3252	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3253		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3254		btrfs_warn(fs_info, "free space tree is invalid");
3255		clear_free_space_tree = true;
3256	}
3257
3258	if (clear_free_space_tree) {
3259		btrfs_info(fs_info, "clearing free space tree");
3260		ret = btrfs_clear_free_space_tree(fs_info);
3261		if (ret) {
3262			btrfs_warn(fs_info,
3263				   "failed to clear free space tree: %d", ret);
3264			goto out;
3265		}
3266	}
3267
3268	/*
3269	 * btrfs_find_orphan_roots() is responsible for finding all the dead
3270	 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3271	 * them into the fs_info->fs_roots_radix tree. This must be done before
3272	 * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3273	 * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3274	 * item before the root's tree is deleted - this means that if we unmount
3275	 * or crash before the deletion completes, on the next mount we will not
3276	 * delete what remains of the tree because the orphan item does not
3277	 * exists anymore, which is what tells us we have a pending deletion.
3278	 */
3279	ret = btrfs_find_orphan_roots(fs_info);
3280	if (ret)
3281		goto out;
3282
3283	ret = btrfs_cleanup_fs_roots(fs_info);
3284	if (ret)
3285		goto out;
3286
3287	down_read(&fs_info->cleanup_work_sem);
3288	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3289	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3290		up_read(&fs_info->cleanup_work_sem);
3291		goto out;
3292	}
3293	up_read(&fs_info->cleanup_work_sem);
3294
3295	mutex_lock(&fs_info->cleaner_mutex);
3296	ret = btrfs_recover_relocation(fs_info->tree_root);
3297	mutex_unlock(&fs_info->cleaner_mutex);
3298	if (ret < 0) {
3299		btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3300		goto out;
3301	}
3302
3303	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3304	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3305		btrfs_info(fs_info, "creating free space tree");
3306		ret = btrfs_create_free_space_tree(fs_info);
3307		if (ret) {
3308			btrfs_warn(fs_info,
3309				"failed to create free space tree: %d", ret);
3310			goto out;
3311		}
3312	}
3313
3314	if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3315		ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3316		if (ret)
3317			goto out;
3318	}
3319
3320	ret = btrfs_resume_balance_async(fs_info);
3321	if (ret)
3322		goto out;
3323
3324	ret = btrfs_resume_dev_replace_async(fs_info);
3325	if (ret) {
3326		btrfs_warn(fs_info, "failed to resume dev_replace");
3327		goto out;
3328	}
3329
3330	btrfs_qgroup_rescan_resume(fs_info);
3331
3332	if (!fs_info->uuid_root) {
3333		btrfs_info(fs_info, "creating UUID tree");
3334		ret = btrfs_create_uuid_tree(fs_info);
3335		if (ret) {
3336			btrfs_warn(fs_info,
3337				   "failed to create the UUID tree %d", ret);
3338			goto out;
3339		}
3340	}
3341
3342out:
3343	return ret;
3344}
3345
3346int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
3347		      char *options)
3348{
3349	u32 sectorsize;
3350	u32 nodesize;
3351	u32 stripesize;
3352	u64 generation;
3353	u64 features;
3354	u16 csum_type;
3355	struct btrfs_super_block *disk_super;
3356	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3357	struct btrfs_root *tree_root;
3358	struct btrfs_root *chunk_root;
3359	int ret;
3360	int err = -EINVAL;
3361	int level;
3362
3363	ret = init_mount_fs_info(fs_info, sb);
3364	if (ret) {
3365		err = ret;
3366		goto fail;
3367	}
3368
3369	/* These need to be init'ed before we start creating inodes and such. */
3370	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3371				     GFP_KERNEL);
3372	fs_info->tree_root = tree_root;
3373	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3374				      GFP_KERNEL);
3375	fs_info->chunk_root = chunk_root;
3376	if (!tree_root || !chunk_root) {
3377		err = -ENOMEM;
3378		goto fail;
3379	}
3380
3381	fs_info->btree_inode = new_inode(sb);
3382	if (!fs_info->btree_inode) {
3383		err = -ENOMEM;
3384		goto fail;
3385	}
3386	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
3387	btrfs_init_btree_inode(fs_info);
3388
3389	invalidate_bdev(fs_devices->latest_dev->bdev);
3390
3391	/*
3392	 * Read super block and check the signature bytes only
3393	 */
3394	disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
3395	if (IS_ERR(disk_super)) {
3396		err = PTR_ERR(disk_super);
3397		goto fail_alloc;
3398	}
3399
3400	/*
3401	 * Verify the type first, if that or the checksum value are
3402	 * corrupted, we'll find out
3403	 */
3404	csum_type = btrfs_super_csum_type(disk_super);
3405	if (!btrfs_supported_super_csum(csum_type)) {
3406		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3407			  csum_type);
3408		err = -EINVAL;
3409		btrfs_release_disk_super(disk_super);
3410		goto fail_alloc;
3411	}
3412
3413	fs_info->csum_size = btrfs_super_csum_size(disk_super);
3414
3415	ret = btrfs_init_csum_hash(fs_info, csum_type);
3416	if (ret) {
3417		err = ret;
3418		btrfs_release_disk_super(disk_super);
3419		goto fail_alloc;
3420	}
3421
3422	/*
3423	 * We want to check superblock checksum, the type is stored inside.
3424	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3425	 */
3426	if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
3427		btrfs_err(fs_info, "superblock checksum mismatch");
3428		err = -EINVAL;
3429		btrfs_release_disk_super(disk_super);
3430		goto fail_alloc;
3431	}
3432
3433	/*
3434	 * super_copy is zeroed at allocation time and we never touch the
3435	 * following bytes up to INFO_SIZE, the checksum is calculated from
3436	 * the whole block of INFO_SIZE
3437	 */
3438	memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3439	btrfs_release_disk_super(disk_super);
3440
3441	disk_super = fs_info->super_copy;
3442
3443
3444	features = btrfs_super_flags(disk_super);
3445	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3446		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3447		btrfs_set_super_flags(disk_super, features);
3448		btrfs_info(fs_info,
3449			"found metadata UUID change in progress flag, clearing");
3450	}
3451
3452	memcpy(fs_info->super_for_commit, fs_info->super_copy,
3453	       sizeof(*fs_info->super_for_commit));
3454
3455	ret = btrfs_validate_mount_super(fs_info);
3456	if (ret) {
3457		btrfs_err(fs_info, "superblock contains fatal errors");
3458		err = -EINVAL;
3459		goto fail_alloc;
3460	}
3461
3462	if (!btrfs_super_root(disk_super))
3463		goto fail_alloc;
3464
3465	/* check FS state, whether FS is broken. */
3466	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3467		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3468
3469	/*
3470	 * In the long term, we'll store the compression type in the super
3471	 * block, and it'll be used for per file compression control.
3472	 */
3473	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3474
3475	/*
3476	 * Flag our filesystem as having big metadata blocks if they are bigger
3477	 * than the page size.
3478	 */
3479	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
3480		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
3481			btrfs_info(fs_info,
3482				"flagging fs with big metadata feature");
3483		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3484	}
3485
3486	/* Set up fs_info before parsing mount options */
3487	nodesize = btrfs_super_nodesize(disk_super);
3488	sectorsize = btrfs_super_sectorsize(disk_super);
3489	stripesize = sectorsize;
3490	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3491	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3492
3493	fs_info->nodesize = nodesize;
3494	fs_info->sectorsize = sectorsize;
3495	fs_info->sectorsize_bits = ilog2(sectorsize);
3496	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3497	fs_info->stripesize = stripesize;
3498
3499	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3500	if (ret) {
3501		err = ret;
3502		goto fail_alloc;
3503	}
3504
3505	features = btrfs_super_incompat_flags(disk_super) &
3506		~BTRFS_FEATURE_INCOMPAT_SUPP;
3507	if (features) {
3508		btrfs_err(fs_info,
3509		    "cannot mount because of unsupported optional features (%llx)",
3510		    features);
3511		err = -EINVAL;
3512		goto fail_alloc;
3513	}
3514
3515	features = btrfs_super_incompat_flags(disk_super);
3516	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3517	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3518		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3519	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3520		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3521
3522	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
3523		btrfs_info(fs_info, "has skinny extents");
3524
3525	/*
3526	 * mixed block groups end up with duplicate but slightly offset
3527	 * extent buffers for the same range.  It leads to corruptions
3528	 */
3529	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3530	    (sectorsize != nodesize)) {
3531		btrfs_err(fs_info,
3532"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3533			nodesize, sectorsize);
3534		goto fail_alloc;
3535	}
3536
3537	/*
3538	 * Needn't use the lock because there is no other task which will
3539	 * update the flag.
3540	 */
3541	btrfs_set_super_incompat_flags(disk_super, features);
3542
3543	features = btrfs_super_compat_ro_flags(disk_super) &
3544		~BTRFS_FEATURE_COMPAT_RO_SUPP;
3545	if (!sb_rdonly(sb) && features) {
3546		btrfs_err(fs_info,
3547	"cannot mount read-write because of unsupported optional features (%llx)",
3548		       features);
3549		err = -EINVAL;
3550		goto fail_alloc;
3551	}
3552
3553	if (sectorsize < PAGE_SIZE) {
3554		struct btrfs_subpage_info *subpage_info;
3555
3556		btrfs_warn(fs_info,
3557		"read-write for sector size %u with page size %lu is experimental",
3558			   sectorsize, PAGE_SIZE);
3559		if (btrfs_super_incompat_flags(fs_info->super_copy) &
3560			BTRFS_FEATURE_INCOMPAT_RAID56) {
3561			btrfs_err(fs_info,
3562		"RAID56 is not yet supported for sector size %u with page size %lu",
3563				sectorsize, PAGE_SIZE);
3564			err = -EINVAL;
3565			goto fail_alloc;
3566		}
3567		subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
3568		if (!subpage_info)
3569			goto fail_alloc;
3570		btrfs_init_subpage_info(subpage_info, sectorsize);
3571		fs_info->subpage_info = subpage_info;
3572	}
3573
3574	ret = btrfs_init_workqueues(fs_info);
3575	if (ret) {
3576		err = ret;
3577		goto fail_sb_buffer;
3578	}
3579
3580	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3581	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3582
3583	sb->s_blocksize = sectorsize;
3584	sb->s_blocksize_bits = blksize_bits(sectorsize);
3585	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3586
3587	mutex_lock(&fs_info->chunk_mutex);
3588	ret = btrfs_read_sys_array(fs_info);
3589	mutex_unlock(&fs_info->chunk_mutex);
3590	if (ret) {
3591		btrfs_err(fs_info, "failed to read the system array: %d", ret);
3592		goto fail_sb_buffer;
3593	}
3594
3595	generation = btrfs_super_chunk_root_generation(disk_super);
3596	level = btrfs_super_chunk_root_level(disk_super);
3597
3598	chunk_root->node = read_tree_block(fs_info,
3599					   btrfs_super_chunk_root(disk_super),
3600					   BTRFS_CHUNK_TREE_OBJECTID,
3601					   generation, level, NULL);
3602	if (IS_ERR(chunk_root->node) ||
3603	    !extent_buffer_uptodate(chunk_root->node)) {
3604		btrfs_err(fs_info, "failed to read chunk root");
3605		if (!IS_ERR(chunk_root->node))
3606			free_extent_buffer(chunk_root->node);
3607		chunk_root->node = NULL;
3608		goto fail_tree_roots;
3609	}
3610	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
3611	chunk_root->commit_root = btrfs_root_node(chunk_root);
3612
3613	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3614			   offsetof(struct btrfs_header, chunk_tree_uuid),
3615			   BTRFS_UUID_SIZE);
3616
3617	ret = btrfs_read_chunk_tree(fs_info);
3618	if (ret) {
3619		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3620		goto fail_tree_roots;
3621	}
3622
3623	/*
3624	 * At this point we know all the devices that make this filesystem,
3625	 * including the seed devices but we don't know yet if the replace
3626	 * target is required. So free devices that are not part of this
3627	 * filesystem but skip the replace target device which is checked
3628	 * below in btrfs_init_dev_replace().
3629	 */
3630	btrfs_free_extra_devids(fs_devices);
3631	if (!fs_devices->latest_dev->bdev) {
3632		btrfs_err(fs_info, "failed to read devices");
3633		goto fail_tree_roots;
3634	}
3635
3636	ret = init_tree_roots(fs_info);
3637	if (ret)
3638		goto fail_tree_roots;
3639
3640	/*
3641	 * Get zone type information of zoned block devices. This will also
3642	 * handle emulation of a zoned filesystem if a regular device has the
3643	 * zoned incompat feature flag set.
3644	 */
3645	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3646	if (ret) {
3647		btrfs_err(fs_info,
3648			  "zoned: failed to read device zone info: %d",
3649			  ret);
3650		goto fail_block_groups;
3651	}
3652
3653	/*
3654	 * If we have a uuid root and we're not being told to rescan we need to
3655	 * check the generation here so we can set the
3656	 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3657	 * transaction during a balance or the log replay without updating the
3658	 * uuid generation, and then if we crash we would rescan the uuid tree,
3659	 * even though it was perfectly fine.
3660	 */
3661	if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3662	    fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3663		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3664
3665	ret = btrfs_verify_dev_extents(fs_info);
3666	if (ret) {
3667		btrfs_err(fs_info,
3668			  "failed to verify dev extents against chunks: %d",
3669			  ret);
3670		goto fail_block_groups;
3671	}
3672	ret = btrfs_recover_balance(fs_info);
3673	if (ret) {
3674		btrfs_err(fs_info, "failed to recover balance: %d", ret);
3675		goto fail_block_groups;
3676	}
3677
3678	ret = btrfs_init_dev_stats(fs_info);
3679	if (ret) {
3680		btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3681		goto fail_block_groups;
3682	}
3683
3684	ret = btrfs_init_dev_replace(fs_info);
3685	if (ret) {
3686		btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3687		goto fail_block_groups;
3688	}
3689
3690	ret = btrfs_check_zoned_mode(fs_info);
3691	if (ret) {
3692		btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3693			  ret);
3694		goto fail_block_groups;
3695	}
3696
3697	ret = btrfs_sysfs_add_fsid(fs_devices);
3698	if (ret) {
3699		btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3700				ret);
3701		goto fail_block_groups;
3702	}
3703
3704	ret = btrfs_sysfs_add_mounted(fs_info);
3705	if (ret) {
3706		btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3707		goto fail_fsdev_sysfs;
3708	}
3709
3710	ret = btrfs_init_space_info(fs_info);
3711	if (ret) {
3712		btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3713		goto fail_sysfs;
3714	}
3715
3716	ret = btrfs_read_block_groups(fs_info);
3717	if (ret) {
3718		btrfs_err(fs_info, "failed to read block groups: %d", ret);
3719		goto fail_sysfs;
3720	}
3721
3722	btrfs_free_zone_cache(fs_info);
3723
3724	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3725	    !btrfs_check_rw_degradable(fs_info, NULL)) {
3726		btrfs_warn(fs_info,
3727		"writable mount is not allowed due to too many missing devices");
3728		goto fail_sysfs;
3729	}
3730
3731	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
3732					       "btrfs-cleaner");
3733	if (IS_ERR(fs_info->cleaner_kthread))
3734		goto fail_sysfs;
3735
3736	fs_info->transaction_kthread = kthread_run(transaction_kthread,
3737						   tree_root,
3738						   "btrfs-transaction");
3739	if (IS_ERR(fs_info->transaction_kthread))
3740		goto fail_cleaner;
3741
3742	if (!btrfs_test_opt(fs_info, NOSSD) &&
3743	    !fs_info->fs_devices->rotating) {
3744		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3745	}
3746
3747	/*
3748	 * Mount does not set all options immediately, we can do it now and do
3749	 * not have to wait for transaction commit
3750	 */
3751	btrfs_apply_pending_changes(fs_info);
3752
3753#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3754	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3755		ret = btrfsic_mount(fs_info, fs_devices,
3756				    btrfs_test_opt(fs_info,
3757					CHECK_INTEGRITY_DATA) ? 1 : 0,
3758				    fs_info->check_integrity_print_mask);
3759		if (ret)
3760			btrfs_warn(fs_info,
3761				"failed to initialize integrity check module: %d",
3762				ret);
3763	}
3764#endif
3765	ret = btrfs_read_qgroup_config(fs_info);
3766	if (ret)
3767		goto fail_trans_kthread;
3768
3769	if (btrfs_build_ref_tree(fs_info))
3770		btrfs_err(fs_info, "couldn't build ref tree");
3771
3772	/* do not make disk changes in broken FS or nologreplay is given */
3773	if (btrfs_super_log_root(disk_super) != 0 &&
3774	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3775		btrfs_info(fs_info, "start tree-log replay");
3776		ret = btrfs_replay_log(fs_info, fs_devices);
3777		if (ret) {
3778			err = ret;
3779			goto fail_qgroup;
3780		}
3781	}
3782
3783	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3784	if (IS_ERR(fs_info->fs_root)) {
3785		err = PTR_ERR(fs_info->fs_root);
3786		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3787		fs_info->fs_root = NULL;
3788		goto fail_qgroup;
3789	}
3790
3791	if (sb_rdonly(sb))
3792		goto clear_oneshot;
3793
3794	ret = btrfs_start_pre_rw_mount(fs_info);
3795	if (ret) {
3796		close_ctree(fs_info);
3797		return ret;
3798	}
3799	btrfs_discard_resume(fs_info);
3800
3801	if (fs_info->uuid_root &&
3802	    (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3803	     fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3804		btrfs_info(fs_info, "checking UUID tree");
3805		ret = btrfs_check_uuid_tree(fs_info);
3806		if (ret) {
3807			btrfs_warn(fs_info,
3808				"failed to check the UUID tree: %d", ret);
3809			close_ctree(fs_info);
3810			return ret;
3811		}
3812	}
3813
3814	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3815
3816clear_oneshot:
3817	btrfs_clear_oneshot_options(fs_info);
3818	return 0;
3819
3820fail_qgroup:
3821	btrfs_free_qgroup_config(fs_info);
3822fail_trans_kthread:
3823	kthread_stop(fs_info->transaction_kthread);
3824	btrfs_cleanup_transaction(fs_info);
3825	btrfs_free_fs_roots(fs_info);
3826fail_cleaner:
3827	kthread_stop(fs_info->cleaner_kthread);
3828
3829	/*
3830	 * make sure we're done with the btree inode before we stop our
3831	 * kthreads
3832	 */
3833	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3834
3835fail_sysfs:
3836	btrfs_sysfs_remove_mounted(fs_info);
3837
3838fail_fsdev_sysfs:
3839	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3840
3841fail_block_groups:
3842	btrfs_put_block_group_cache(fs_info);
3843
3844fail_tree_roots:
3845	if (fs_info->data_reloc_root)
3846		btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3847	free_root_pointers(fs_info, true);
3848	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3849
3850fail_sb_buffer:
3851	btrfs_stop_all_workers(fs_info);
3852	btrfs_free_block_groups(fs_info);
3853fail_alloc:
3854	btrfs_mapping_tree_free(&fs_info->mapping_tree);
3855
3856	iput(fs_info->btree_inode);
3857fail:
3858	btrfs_close_devices(fs_info->fs_devices);
3859	return err;
3860}
3861ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3862
3863static void btrfs_end_super_write(struct bio *bio)
3864{
3865	struct btrfs_device *device = bio->bi_private;
3866	struct bio_vec *bvec;
3867	struct bvec_iter_all iter_all;
3868	struct page *page;
3869
3870	bio_for_each_segment_all(bvec, bio, iter_all) {
3871		page = bvec->bv_page;
3872
3873		if (bio->bi_status) {
3874			btrfs_warn_rl_in_rcu(device->fs_info,
3875				"lost page write due to IO error on %s (%d)",
3876				rcu_str_deref(device->name),
3877				blk_status_to_errno(bio->bi_status));
3878			ClearPageUptodate(page);
3879			SetPageError(page);
3880			btrfs_dev_stat_inc_and_print(device,
3881						     BTRFS_DEV_STAT_WRITE_ERRS);
3882		} else {
3883			SetPageUptodate(page);
3884		}
3885
3886		put_page(page);
3887		unlock_page(page);
3888	}
3889
3890	bio_put(bio);
3891}
3892
3893struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3894						   int copy_num)
3895{
3896	struct btrfs_super_block *super;
3897	struct page *page;
3898	u64 bytenr, bytenr_orig;
3899	struct address_space *mapping = bdev->bd_inode->i_mapping;
3900	int ret;
3901
3902	bytenr_orig = btrfs_sb_offset(copy_num);
3903	ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
3904	if (ret == -ENOENT)
3905		return ERR_PTR(-EINVAL);
3906	else if (ret)
3907		return ERR_PTR(ret);
3908
3909	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
3910		return ERR_PTR(-EINVAL);
3911
3912	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3913	if (IS_ERR(page))
3914		return ERR_CAST(page);
3915
3916	super = page_address(page);
3917	if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3918		btrfs_release_disk_super(super);
3919		return ERR_PTR(-ENODATA);
3920	}
3921
3922	if (btrfs_super_bytenr(super) != bytenr_orig) {
3923		btrfs_release_disk_super(super);
3924		return ERR_PTR(-EINVAL);
3925	}
3926
3927	return super;
3928}
3929
3930
3931struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
3932{
3933	struct btrfs_super_block *super, *latest = NULL;
3934	int i;
3935	u64 transid = 0;
3936
3937	/* we would like to check all the supers, but that would make
3938	 * a btrfs mount succeed after a mkfs from a different FS.
3939	 * So, we need to add a special mount option to scan for
3940	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3941	 */
3942	for (i = 0; i < 1; i++) {
3943		super = btrfs_read_dev_one_super(bdev, i);
3944		if (IS_ERR(super))
3945			continue;
3946
3947		if (!latest || btrfs_super_generation(super) > transid) {
3948			if (latest)
3949				btrfs_release_disk_super(super);
3950
3951			latest = super;
3952			transid = btrfs_super_generation(super);
3953		}
3954	}
3955
3956	return super;
3957}
3958
3959/*
3960 * Write superblock @sb to the @device. Do not wait for completion, all the
3961 * pages we use for writing are locked.
3962 *
3963 * Write @max_mirrors copies of the superblock, where 0 means default that fit
3964 * the expected device size at commit time. Note that max_mirrors must be
3965 * same for write and wait phases.
3966 *
3967 * Return number of errors when page is not found or submission fails.
3968 */
3969static int write_dev_supers(struct btrfs_device *device,
3970			    struct btrfs_super_block *sb, int max_mirrors)
3971{
3972	struct btrfs_fs_info *fs_info = device->fs_info;
3973	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3974	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3975	int i;
3976	int errors = 0;
3977	int ret;
3978	u64 bytenr, bytenr_orig;
3979
3980	if (max_mirrors == 0)
3981		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3982
3983	shash->tfm = fs_info->csum_shash;
3984
3985	for (i = 0; i < max_mirrors; i++) {
3986		struct page *page;
3987		struct bio *bio;
3988		struct btrfs_super_block *disk_super;
3989
3990		bytenr_orig = btrfs_sb_offset(i);
3991		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
3992		if (ret == -ENOENT) {
3993			continue;
3994		} else if (ret < 0) {
3995			btrfs_err(device->fs_info,
3996				"couldn't get super block location for mirror %d",
3997				i);
3998			errors++;
3999			continue;
4000		}
4001		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4002		    device->commit_total_bytes)
4003			break;
4004
4005		btrfs_set_super_bytenr(sb, bytenr_orig);
4006
4007		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
4008				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
4009				    sb->csum);
4010
4011		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
4012					   GFP_NOFS);
4013		if (!page) {
4014			btrfs_err(device->fs_info,
4015			    "couldn't get super block page for bytenr %llu",
4016			    bytenr);
4017			errors++;
4018			continue;
4019		}
4020
4021		/* Bump the refcount for wait_dev_supers() */
4022		get_page(page);
4023
4024		disk_super = page_address(page);
4025		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
4026
4027		/*
4028		 * Directly use bios here instead of relying on the page cache
4029		 * to do I/O, so we don't lose the ability to do integrity
4030		 * checking.
4031		 */
4032		bio = bio_alloc(GFP_NOFS, 1);
4033		bio_set_dev(bio, device->bdev);
4034		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
4035		bio->bi_private = device;
4036		bio->bi_end_io = btrfs_end_super_write;
4037		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
4038			       offset_in_page(bytenr));
4039
4040		/*
4041		 * We FUA only the first super block.  The others we allow to
4042		 * go down lazy and there's a short window where the on-disk
4043		 * copies might still contain the older version.
4044		 */
4045		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
4046		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
4047			bio->bi_opf |= REQ_FUA;
4048
4049		btrfsic_submit_bio(bio);
4050
4051		if (btrfs_advance_sb_log(device, i))
4052			errors++;
4053	}
4054	return errors < i ? 0 : -1;
4055}
4056
4057/*
4058 * Wait for write completion of superblocks done by write_dev_supers,
4059 * @max_mirrors same for write and wait phases.
4060 *
4061 * Return number of errors when page is not found or not marked up to
4062 * date.
4063 */
4064static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
4065{
4066	int i;
4067	int errors = 0;
4068	bool primary_failed = false;
4069	int ret;
4070	u64 bytenr;
4071
4072	if (max_mirrors == 0)
4073		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
4074
4075	for (i = 0; i < max_mirrors; i++) {
4076		struct page *page;
4077
4078		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
4079		if (ret == -ENOENT) {
4080			break;
4081		} else if (ret < 0) {
4082			errors++;
4083			if (i == 0)
4084				primary_failed = true;
4085			continue;
4086		}
4087		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4088		    device->commit_total_bytes)
4089			break;
4090
4091		page = find_get_page(device->bdev->bd_inode->i_mapping,
4092				     bytenr >> PAGE_SHIFT);
4093		if (!page) {
4094			errors++;
4095			if (i == 0)
4096				primary_failed = true;
4097			continue;
4098		}
4099		/* Page is submitted locked and unlocked once the IO completes */
4100		wait_on_page_locked(page);
4101		if (PageError(page)) {
4102			errors++;
4103			if (i == 0)
4104				primary_failed = true;
4105		}
4106
4107		/* Drop our reference */
4108		put_page(page);
4109
4110		/* Drop the reference from the writing run */
4111		put_page(page);
4112	}
4113
4114	/* log error, force error return */
4115	if (primary_failed) {
4116		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
4117			  device->devid);
4118		return -1;
4119	}
4120
4121	return errors < i ? 0 : -1;
4122}
4123
4124/*
4125 * endio for the write_dev_flush, this will wake anyone waiting
4126 * for the barrier when it is done
4127 */
4128static void btrfs_end_empty_barrier(struct bio *bio)
4129{
4130	complete(bio->bi_private);
4131}
4132
4133/*
4134 * Submit a flush request to the device if it supports it. Error handling is
4135 * done in the waiting counterpart.
4136 */
4137static void write_dev_flush(struct btrfs_device *device)
4138{
4139	struct bio *bio = device->flush_bio;
4140
4141#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4142	/*
4143	 * When a disk has write caching disabled, we skip submission of a bio
4144	 * with flush and sync requests before writing the superblock, since
4145	 * it's not needed. However when the integrity checker is enabled, this
4146	 * results in reports that there are metadata blocks referred by a
4147	 * superblock that were not properly flushed. So don't skip the bio
4148	 * submission only when the integrity checker is enabled for the sake
4149	 * of simplicity, since this is a debug tool and not meant for use in
4150	 * non-debug builds.
4151	 */
4152	struct request_queue *q = bdev_get_queue(device->bdev);
4153	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
4154		return;
4155#endif
4156
4157	bio_reset(bio);
4158	bio->bi_end_io = btrfs_end_empty_barrier;
4159	bio_set_dev(bio, device->bdev);
4160	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
4161	init_completion(&device->flush_wait);
4162	bio->bi_private = &device->flush_wait;
4163
4164	btrfsic_submit_bio(bio);
4165	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4166}
4167
4168/*
4169 * If the flush bio has been submitted by write_dev_flush, wait for it.
4170 */
4171static blk_status_t wait_dev_flush(struct btrfs_device *device)
4172{
4173	struct bio *bio = device->flush_bio;
4174
4175	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
4176		return BLK_STS_OK;
4177
4178	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4179	wait_for_completion_io(&device->flush_wait);
4180
4181	return bio->bi_status;
4182}
4183
4184static int check_barrier_error(struct btrfs_fs_info *fs_info)
4185{
4186	if (!btrfs_check_rw_degradable(fs_info, NULL))
4187		return -EIO;
4188	return 0;
4189}
4190
4191/*
4192 * send an empty flush down to each device in parallel,
4193 * then wait for them
4194 */
4195static int barrier_all_devices(struct btrfs_fs_info *info)
4196{
4197	struct list_head *head;
4198	struct btrfs_device *dev;
4199	int errors_wait = 0;
4200	blk_status_t ret;
4201
4202	lockdep_assert_held(&info->fs_devices->device_list_mutex);
4203	/* send down all the barriers */
4204	head = &info->fs_devices->devices;
4205	list_for_each_entry(dev, head, dev_list) {
4206		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4207			continue;
4208		if (!dev->bdev)
4209			continue;
4210		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4211		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4212			continue;
4213
4214		write_dev_flush(dev);
4215		dev->last_flush_error = BLK_STS_OK;
4216	}
4217
4218	/* wait for all the barriers */
4219	list_for_each_entry(dev, head, dev_list) {
4220		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4221			continue;
4222		if (!dev->bdev) {
4223			errors_wait++;
4224			continue;
4225		}
4226		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4227		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4228			continue;
4229
4230		ret = wait_dev_flush(dev);
4231		if (ret) {
4232			dev->last_flush_error = ret;
4233			btrfs_dev_stat_inc_and_print(dev,
4234					BTRFS_DEV_STAT_FLUSH_ERRS);
4235			errors_wait++;
4236		}
4237	}
4238
4239	if (errors_wait) {
4240		/*
4241		 * At some point we need the status of all disks
4242		 * to arrive at the volume status. So error checking
4243		 * is being pushed to a separate loop.
4244		 */
4245		return check_barrier_error(info);
4246	}
4247	return 0;
4248}
4249
4250int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
4251{
4252	int raid_type;
4253	int min_tolerated = INT_MAX;
4254
4255	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
4256	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
4257		min_tolerated = min_t(int, min_tolerated,
4258				    btrfs_raid_array[BTRFS_RAID_SINGLE].
4259				    tolerated_failures);
4260
4261	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4262		if (raid_type == BTRFS_RAID_SINGLE)
4263			continue;
4264		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
4265			continue;
4266		min_tolerated = min_t(int, min_tolerated,
4267				    btrfs_raid_array[raid_type].
4268				    tolerated_failures);
4269	}
4270
4271	if (min_tolerated == INT_MAX) {
4272		pr_warn("BTRFS: unknown raid flag: %llu", flags);
4273		min_tolerated = 0;
4274	}
4275
4276	return min_tolerated;
4277}
4278
4279int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4280{
4281	struct list_head *head;
4282	struct btrfs_device *dev;
4283	struct btrfs_super_block *sb;
4284	struct btrfs_dev_item *dev_item;
4285	int ret;
4286	int do_barriers;
4287	int max_errors;
4288	int total_errors = 0;
4289	u64 flags;
4290
4291	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4292
4293	/*
4294	 * max_mirrors == 0 indicates we're from commit_transaction,
4295	 * not from fsync where the tree roots in fs_info have not
4296	 * been consistent on disk.
4297	 */
4298	if (max_mirrors == 0)
4299		backup_super_roots(fs_info);
4300
4301	sb = fs_info->super_for_commit;
4302	dev_item = &sb->dev_item;
4303
4304	mutex_lock(&fs_info->fs_devices->device_list_mutex);
4305	head = &fs_info->fs_devices->devices;
4306	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4307
4308	if (do_barriers) {
4309		ret = barrier_all_devices(fs_info);
4310		if (ret) {
4311			mutex_unlock(
4312				&fs_info->fs_devices->device_list_mutex);
4313			btrfs_handle_fs_error(fs_info, ret,
4314					      "errors while submitting device barriers.");
4315			return ret;
4316		}
4317	}
4318
4319	list_for_each_entry(dev, head, dev_list) {
4320		if (!dev->bdev) {
4321			total_errors++;
4322			continue;
4323		}
4324		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4325		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4326			continue;
4327
4328		btrfs_set_stack_device_generation(dev_item, 0);
4329		btrfs_set_stack_device_type(dev_item, dev->type);
4330		btrfs_set_stack_device_id(dev_item, dev->devid);
4331		btrfs_set_stack_device_total_bytes(dev_item,
4332						   dev->commit_total_bytes);
4333		btrfs_set_stack_device_bytes_used(dev_item,
4334						  dev->commit_bytes_used);
4335		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4336		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4337		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4338		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4339		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4340		       BTRFS_FSID_SIZE);
4341
4342		flags = btrfs_super_flags(sb);
4343		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4344
4345		ret = btrfs_validate_write_super(fs_info, sb);
4346		if (ret < 0) {
4347			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4348			btrfs_handle_fs_error(fs_info, -EUCLEAN,
4349				"unexpected superblock corruption detected");
4350			return -EUCLEAN;
4351		}
4352
4353		ret = write_dev_supers(dev, sb, max_mirrors);
4354		if (ret)
4355			total_errors++;
4356	}
4357	if (total_errors > max_errors) {
4358		btrfs_err(fs_info, "%d errors while writing supers",
4359			  total_errors);
4360		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4361
4362		/* FUA is masked off if unsupported and can't be the reason */
4363		btrfs_handle_fs_error(fs_info, -EIO,
4364				      "%d errors while writing supers",
4365				      total_errors);
4366		return -EIO;
4367	}
4368
4369	total_errors = 0;
4370	list_for_each_entry(dev, head, dev_list) {
4371		if (!dev->bdev)
4372			continue;
4373		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4374		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4375			continue;
4376
4377		ret = wait_dev_supers(dev, max_mirrors);
4378		if (ret)
4379			total_errors++;
4380	}
4381	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4382	if (total_errors > max_errors) {
4383		btrfs_handle_fs_error(fs_info, -EIO,
4384				      "%d errors while writing supers",
4385				      total_errors);
4386		return -EIO;
4387	}
4388	return 0;
4389}
4390
4391/* Drop a fs root from the radix tree and free it. */
4392void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4393				  struct btrfs_root *root)
4394{
4395	bool drop_ref = false;
4396
4397	spin_lock(&fs_info->fs_roots_radix_lock);
4398	radix_tree_delete(&fs_info->fs_roots_radix,
4399			  (unsigned long)root->root_key.objectid);
4400	if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4401		drop_ref = true;
4402	spin_unlock(&fs_info->fs_roots_radix_lock);
4403
4404	if (BTRFS_FS_ERROR(fs_info)) {
4405		ASSERT(root->log_root == NULL);
4406		if (root->reloc_root) {
4407			btrfs_put_root(root->reloc_root);
4408			root->reloc_root = NULL;
4409		}
4410	}
4411
4412	if (drop_ref)
4413		btrfs_put_root(root);
4414}
4415
4416int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
4417{
4418	u64 root_objectid = 0;
4419	struct btrfs_root *gang[8];
4420	int i = 0;
4421	int err = 0;
4422	unsigned int ret = 0;
4423
4424	while (1) {
4425		spin_lock(&fs_info->fs_roots_radix_lock);
4426		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4427					     (void **)gang, root_objectid,
4428					     ARRAY_SIZE(gang));
4429		if (!ret) {
4430			spin_unlock(&fs_info->fs_roots_radix_lock);
4431			break;
4432		}
4433		root_objectid = gang[ret - 1]->root_key.objectid + 1;
4434
4435		for (i = 0; i < ret; i++) {
4436			/* Avoid to grab roots in dead_roots */
4437			if (btrfs_root_refs(&gang[i]->root_item) == 0) {
4438				gang[i] = NULL;
4439				continue;
4440			}
4441			/* grab all the search result for later use */
4442			gang[i] = btrfs_grab_root(gang[i]);
4443		}
4444		spin_unlock(&fs_info->fs_roots_radix_lock);
4445
4446		for (i = 0; i < ret; i++) {
4447			if (!gang[i])
4448				continue;
4449			root_objectid = gang[i]->root_key.objectid;
4450			err = btrfs_orphan_cleanup(gang[i]);
4451			if (err)
4452				break;
4453			btrfs_put_root(gang[i]);
4454		}
4455		root_objectid++;
4456	}
4457
4458	/* release the uncleaned roots due to error */
4459	for (; i < ret; i++) {
4460		if (gang[i])
4461			btrfs_put_root(gang[i]);
4462	}
4463	return err;
4464}
4465
4466int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4467{
4468	struct btrfs_root *root = fs_info->tree_root;
4469	struct btrfs_trans_handle *trans;
4470
4471	mutex_lock(&fs_info->cleaner_mutex);
4472	btrfs_run_delayed_iputs(fs_info);
4473	mutex_unlock(&fs_info->cleaner_mutex);
4474	wake_up_process(fs_info->cleaner_kthread);
4475
4476	/* wait until ongoing cleanup work done */
4477	down_write(&fs_info->cleanup_work_sem);
4478	up_write(&fs_info->cleanup_work_sem);
4479
4480	trans = btrfs_join_transaction(root);
4481	if (IS_ERR(trans))
4482		return PTR_ERR(trans);
4483	return btrfs_commit_transaction(trans);
4484}
4485
4486static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4487{
4488	struct btrfs_transaction *trans;
4489	struct btrfs_transaction *tmp;
4490	bool found = false;
4491
4492	if (list_empty(&fs_info->trans_list))
4493		return;
4494
4495	/*
4496	 * This function is only called at the very end of close_ctree(),
4497	 * thus no other running transaction, no need to take trans_lock.
4498	 */
4499	ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4500	list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4501		struct extent_state *cached = NULL;
4502		u64 dirty_bytes = 0;
4503		u64 cur = 0;
4504		u64 found_start;
4505		u64 found_end;
4506
4507		found = true;
4508		while (!find_first_extent_bit(&trans->dirty_pages, cur,
4509			&found_start, &found_end, EXTENT_DIRTY, &cached)) {
4510			dirty_bytes += found_end + 1 - found_start;
4511			cur = found_end + 1;
4512		}
4513		btrfs_warn(fs_info,
4514	"transaction %llu (with %llu dirty metadata bytes) is not committed",
4515			   trans->transid, dirty_bytes);
4516		btrfs_cleanup_one_transaction(trans, fs_info);
4517
4518		if (trans == fs_info->running_transaction)
4519			fs_info->running_transaction = NULL;
4520		list_del_init(&trans->list);
4521
4522		btrfs_put_transaction(trans);
4523		trace_btrfs_transaction_commit(fs_info);
4524	}
4525	ASSERT(!found);
4526}
4527
4528void __cold close_ctree(struct btrfs_fs_info *fs_info)
4529{
4530	int ret;
4531
4532	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4533	/*
4534	 * We don't want the cleaner to start new transactions, add more delayed
4535	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
4536	 * because that frees the task_struct, and the transaction kthread might
4537	 * still try to wake up the cleaner.
4538	 */
4539	kthread_park(fs_info->cleaner_kthread);
4540
4541	/* wait for the qgroup rescan worker to stop */
4542	btrfs_qgroup_wait_for_completion(fs_info, false);
4543
4544	/* wait for the uuid_scan task to finish */
4545	down(&fs_info->uuid_tree_rescan_sem);
4546	/* avoid complains from lockdep et al., set sem back to initial state */
4547	up(&fs_info->uuid_tree_rescan_sem);
4548
4549	/* pause restriper - we want to resume on mount */
4550	btrfs_pause_balance(fs_info);
4551
4552	btrfs_dev_replace_suspend_for_unmount(fs_info);
4553
4554	btrfs_scrub_cancel(fs_info);
4555
4556	/* wait for any defraggers to finish */
4557	wait_event(fs_info->transaction_wait,
4558		   (atomic_read(&fs_info->defrag_running) == 0));
4559
4560	/* clear out the rbtree of defraggable inodes */
4561	btrfs_cleanup_defrag_inodes(fs_info);
4562
4563	cancel_work_sync(&fs_info->async_reclaim_work);
4564	cancel_work_sync(&fs_info->async_data_reclaim_work);
4565	cancel_work_sync(&fs_info->preempt_reclaim_work);
4566
4567	cancel_work_sync(&fs_info->reclaim_bgs_work);
4568
4569	/* Cancel or finish ongoing discard work */
4570	btrfs_discard_cleanup(fs_info);
4571
4572	if (!sb_rdonly(fs_info->sb)) {
4573		/*
4574		 * The cleaner kthread is stopped, so do one final pass over
4575		 * unused block groups.
4576		 */
4577		btrfs_delete_unused_bgs(fs_info);
4578
4579		/*
4580		 * There might be existing delayed inode workers still running
4581		 * and holding an empty delayed inode item. We must wait for
4582		 * them to complete first because they can create a transaction.
4583		 * This happens when someone calls btrfs_balance_delayed_items()
4584		 * and then a transaction commit runs the same delayed nodes
4585		 * before any delayed worker has done something with the nodes.
4586		 * We must wait for any worker here and not at transaction
4587		 * commit time since that could cause a deadlock.
4588		 * This is a very rare case.
4589		 */
4590		btrfs_flush_workqueue(fs_info->delayed_workers);
4591
4592		ret = btrfs_commit_super(fs_info);
4593		if (ret)
4594			btrfs_err(fs_info, "commit super ret %d", ret);
4595	}
4596
4597	if (BTRFS_FS_ERROR(fs_info))
4598		btrfs_error_commit_super(fs_info);
4599
4600	kthread_stop(fs_info->transaction_kthread);
4601	kthread_stop(fs_info->cleaner_kthread);
4602
4603	ASSERT(list_empty(&fs_info->delayed_iputs));
4604	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4605
4606	if (btrfs_check_quota_leak(fs_info)) {
4607		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4608		btrfs_err(fs_info, "qgroup reserved space leaked");
4609	}
4610
4611	btrfs_free_qgroup_config(fs_info);
4612	ASSERT(list_empty(&fs_info->delalloc_roots));
4613
4614	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4615		btrfs_info(fs_info, "at unmount delalloc count %lld",
4616		       percpu_counter_sum(&fs_info->delalloc_bytes));
4617	}
4618
4619	if (percpu_counter_sum(&fs_info->ordered_bytes))
4620		btrfs_info(fs_info, "at unmount dio bytes count %lld",
4621			   percpu_counter_sum(&fs_info->ordered_bytes));
4622
4623	btrfs_sysfs_remove_mounted(fs_info);
4624	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4625
4626	btrfs_put_block_group_cache(fs_info);
4627
4628	/*
4629	 * we must make sure there is not any read request to
4630	 * submit after we stopping all workers.
4631	 */
4632	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4633	btrfs_stop_all_workers(fs_info);
4634
4635	/* We shouldn't have any transaction open at this point */
4636	warn_about_uncommitted_trans(fs_info);
4637
4638	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4639	free_root_pointers(fs_info, true);
4640	btrfs_free_fs_roots(fs_info);
4641
4642	/*
4643	 * We must free the block groups after dropping the fs_roots as we could
4644	 * have had an IO error and have left over tree log blocks that aren't
4645	 * cleaned up until the fs roots are freed.  This makes the block group
4646	 * accounting appear to be wrong because there's pending reserved bytes,
4647	 * so make sure we do the block group cleanup afterwards.
4648	 */
4649	btrfs_free_block_groups(fs_info);
4650
4651	iput(fs_info->btree_inode);
4652
4653#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4654	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
4655		btrfsic_unmount(fs_info->fs_devices);
4656#endif
4657
4658	btrfs_mapping_tree_free(&fs_info->mapping_tree);
4659	btrfs_close_devices(fs_info->fs_devices);
4660}
4661
4662int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
4663			  int atomic)
4664{
4665	int ret;
4666	struct inode *btree_inode = buf->pages[0]->mapping->host;
4667
4668	ret = extent_buffer_uptodate(buf);
4669	if (!ret)
4670		return ret;
4671
4672	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
4673				    parent_transid, atomic);
4674	if (ret == -EAGAIN)
4675		return ret;
4676	return !ret;
4677}
4678
4679void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4680{
4681	struct btrfs_fs_info *fs_info = buf->fs_info;
4682	u64 transid = btrfs_header_generation(buf);
4683	int was_dirty;
4684
4685#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4686	/*
4687	 * This is a fast path so only do this check if we have sanity tests
4688	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
4689	 * outside of the sanity tests.
4690	 */
4691	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4692		return;
4693#endif
4694	btrfs_assert_tree_write_locked(buf);
4695	if (transid != fs_info->generation)
4696		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4697			buf->start, transid, fs_info->generation);
4698	was_dirty = set_extent_buffer_dirty(buf);
4699	if (!was_dirty)
4700		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4701					 buf->len,
4702					 fs_info->dirty_metadata_batch);
4703#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4704	/*
4705	 * Since btrfs_mark_buffer_dirty() can be called with item pointer set
4706	 * but item data not updated.
4707	 * So here we should only check item pointers, not item data.
4708	 */
4709	if (btrfs_header_level(buf) == 0 &&
4710	    btrfs_check_leaf_relaxed(buf)) {
4711		btrfs_print_leaf(buf);
4712		ASSERT(0);
4713	}
4714#endif
4715}
4716
4717static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4718					int flush_delayed)
4719{
4720	/*
4721	 * looks as though older kernels can get into trouble with
4722	 * this code, they end up stuck in balance_dirty_pages forever
4723	 */
4724	int ret;
4725
4726	if (current->flags & PF_MEMALLOC)
4727		return;
4728
4729	if (flush_delayed)
4730		btrfs_balance_delayed_items(fs_info);
4731
4732	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4733				     BTRFS_DIRTY_METADATA_THRESH,
4734				     fs_info->dirty_metadata_batch);
4735	if (ret > 0) {
4736		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4737	}
4738}
4739
4740void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4741{
4742	__btrfs_btree_balance_dirty(fs_info, 1);
4743}
4744
4745void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4746{
4747	__btrfs_btree_balance_dirty(fs_info, 0);
4748}
4749
4750int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
4751		      struct btrfs_key *first_key)
4752{
4753	return btree_read_extent_buffer_pages(buf, parent_transid,
4754					      level, first_key);
4755}
4756
4757static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4758{
4759	/* cleanup FS via transaction */
4760	btrfs_cleanup_transaction(fs_info);
4761
4762	mutex_lock(&fs_info->cleaner_mutex);
4763	btrfs_run_delayed_iputs(fs_info);
4764	mutex_unlock(&fs_info->cleaner_mutex);
4765
4766	down_write(&fs_info->cleanup_work_sem);
4767	up_write(&fs_info->cleanup_work_sem);
4768}
4769
4770static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4771{
4772	struct btrfs_root *gang[8];
4773	u64 root_objectid = 0;
4774	int ret;
4775
4776	spin_lock(&fs_info->fs_roots_radix_lock);
4777	while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4778					     (void **)gang, root_objectid,
4779					     ARRAY_SIZE(gang))) != 0) {
4780		int i;
4781
4782		for (i = 0; i < ret; i++)
4783			gang[i] = btrfs_grab_root(gang[i]);
4784		spin_unlock(&fs_info->fs_roots_radix_lock);
4785
4786		for (i = 0; i < ret; i++) {
4787			if (!gang[i])
4788				continue;
4789			root_objectid = gang[i]->root_key.objectid;
4790			btrfs_free_log(NULL, gang[i]);
4791			btrfs_put_root(gang[i]);
4792		}
4793		root_objectid++;
4794		spin_lock(&fs_info->fs_roots_radix_lock);
4795	}
4796	spin_unlock(&fs_info->fs_roots_radix_lock);
4797	btrfs_free_log_root_tree(NULL, fs_info);
4798}
4799
4800static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4801{
4802	struct btrfs_ordered_extent *ordered;
4803
4804	spin_lock(&root->ordered_extent_lock);
4805	/*
4806	 * This will just short circuit the ordered completion stuff which will
4807	 * make sure the ordered extent gets properly cleaned up.
4808	 */
4809	list_for_each_entry(ordered, &root->ordered_extents,
4810			    root_extent_list)
4811		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4812	spin_unlock(&root->ordered_extent_lock);
4813}
4814
4815static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4816{
4817	struct btrfs_root *root;
4818	struct list_head splice;
4819
4820	INIT_LIST_HEAD(&splice);
4821
4822	spin_lock(&fs_info->ordered_root_lock);
4823	list_splice_init(&fs_info->ordered_roots, &splice);
4824	while (!list_empty(&splice)) {
4825		root = list_first_entry(&splice, struct btrfs_root,
4826					ordered_root);
4827		list_move_tail(&root->ordered_root,
4828			       &fs_info->ordered_roots);
4829
4830		spin_unlock(&fs_info->ordered_root_lock);
4831		btrfs_destroy_ordered_extents(root);
4832
4833		cond_resched();
4834		spin_lock(&fs_info->ordered_root_lock);
4835	}
4836	spin_unlock(&fs_info->ordered_root_lock);
4837
4838	/*
4839	 * We need this here because if we've been flipped read-only we won't
4840	 * get sync() from the umount, so we need to make sure any ordered
4841	 * extents that haven't had their dirty pages IO start writeout yet
4842	 * actually get run and error out properly.
4843	 */
4844	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
4845}
4846
4847static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4848				      struct btrfs_fs_info *fs_info)
4849{
4850	struct rb_node *node;
4851	struct btrfs_delayed_ref_root *delayed_refs;
4852	struct btrfs_delayed_ref_node *ref;
4853	int ret = 0;
4854
4855	delayed_refs = &trans->delayed_refs;
4856
4857	spin_lock(&delayed_refs->lock);
4858	if (atomic_read(&delayed_refs->num_entries) == 0) {
4859		spin_unlock(&delayed_refs->lock);
4860		btrfs_debug(fs_info, "delayed_refs has NO entry");
4861		return ret;
4862	}
4863
4864	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
4865		struct btrfs_delayed_ref_head *head;
4866		struct rb_node *n;
4867		bool pin_bytes = false;
4868
4869		head = rb_entry(node, struct btrfs_delayed_ref_head,
4870				href_node);
4871		if (btrfs_delayed_ref_lock(delayed_refs, head))
4872			continue;
4873
4874		spin_lock(&head->lock);
4875		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4876			ref = rb_entry(n, struct btrfs_delayed_ref_node,
4877				       ref_node);
4878			ref->in_tree = 0;
4879			rb_erase_cached(&ref->ref_node, &head->ref_tree);
4880			RB_CLEAR_NODE(&ref->ref_node);
4881			if (!list_empty(&ref->add_list))
4882				list_del(&ref->add_list);
4883			atomic_dec(&delayed_refs->num_entries);
4884			btrfs_put_delayed_ref(ref);
4885		}
4886		if (head->must_insert_reserved)
4887			pin_bytes = true;
4888		btrfs_free_delayed_extent_op(head->extent_op);
4889		btrfs_delete_ref_head(delayed_refs, head);
4890		spin_unlock(&head->lock);
4891		spin_unlock(&delayed_refs->lock);
4892		mutex_unlock(&head->mutex);
4893
4894		if (pin_bytes) {
4895			struct btrfs_block_group *cache;
4896
4897			cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4898			BUG_ON(!cache);
4899
4900			spin_lock(&cache->space_info->lock);
4901			spin_lock(&cache->lock);
4902			cache->pinned += head->num_bytes;
4903			btrfs_space_info_update_bytes_pinned(fs_info,
4904				cache->space_info, head->num_bytes);
4905			cache->reserved -= head->num_bytes;
4906			cache->space_info->bytes_reserved -= head->num_bytes;
4907			spin_unlock(&cache->lock);
4908			spin_unlock(&cache->space_info->lock);
4909
4910			btrfs_put_block_group(cache);
4911
4912			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4913				head->bytenr + head->num_bytes - 1);
4914		}
4915		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
4916		btrfs_put_delayed_ref_head(head);
4917		cond_resched();
4918		spin_lock(&delayed_refs->lock);
4919	}
4920	btrfs_qgroup_destroy_extent_records(trans);
4921
4922	spin_unlock(&delayed_refs->lock);
4923
4924	return ret;
4925}
4926
4927static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4928{
4929	struct btrfs_inode *btrfs_inode;
4930	struct list_head splice;
4931
4932	INIT_LIST_HEAD(&splice);
4933
4934	spin_lock(&root->delalloc_lock);
4935	list_splice_init(&root->delalloc_inodes, &splice);
4936
4937	while (!list_empty(&splice)) {
4938		struct inode *inode = NULL;
4939		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4940					       delalloc_inodes);
4941		__btrfs_del_delalloc_inode(root, btrfs_inode);
4942		spin_unlock(&root->delalloc_lock);
4943
4944		/*
4945		 * Make sure we get a live inode and that it'll not disappear
4946		 * meanwhile.
4947		 */
4948		inode = igrab(&btrfs_inode->vfs_inode);
4949		if (inode) {
4950			invalidate_inode_pages2(inode->i_mapping);
4951			iput(inode);
4952		}
4953		spin_lock(&root->delalloc_lock);
4954	}
4955	spin_unlock(&root->delalloc_lock);
4956}
4957
4958static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4959{
4960	struct btrfs_root *root;
4961	struct list_head splice;
4962
4963	INIT_LIST_HEAD(&splice);
4964
4965	spin_lock(&fs_info->delalloc_root_lock);
4966	list_splice_init(&fs_info->delalloc_roots, &splice);
4967	while (!list_empty(&splice)) {
4968		root = list_first_entry(&splice, struct btrfs_root,
4969					 delalloc_root);
4970		root = btrfs_grab_root(root);
4971		BUG_ON(!root);
4972		spin_unlock(&fs_info->delalloc_root_lock);
4973
4974		btrfs_destroy_delalloc_inodes(root);
4975		btrfs_put_root(root);
4976
4977		spin_lock(&fs_info->delalloc_root_lock);
4978	}
4979	spin_unlock(&fs_info->delalloc_root_lock);
4980}
4981
4982static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4983					struct extent_io_tree *dirty_pages,
4984					int mark)
4985{
4986	int ret;
4987	struct extent_buffer *eb;
4988	u64 start = 0;
4989	u64 end;
4990
4991	while (1) {
4992		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4993					    mark, NULL);
4994		if (ret)
4995			break;
4996
4997		clear_extent_bits(dirty_pages, start, end, mark);
4998		while (start <= end) {
4999			eb = find_extent_buffer(fs_info, start);
5000			start += fs_info->nodesize;
5001			if (!eb)
5002				continue;
5003			wait_on_extent_buffer_writeback(eb);
5004
5005			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
5006					       &eb->bflags))
5007				clear_extent_buffer_dirty(eb);
5008			free_extent_buffer_stale(eb);
5009		}
5010	}
5011
5012	return ret;
5013}
5014
5015static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
5016				       struct extent_io_tree *unpin)
5017{
5018	u64 start;
5019	u64 end;
5020	int ret;
5021
5022	while (1) {
5023		struct extent_state *cached_state = NULL;
5024
5025		/*
5026		 * The btrfs_finish_extent_commit() may get the same range as
5027		 * ours between find_first_extent_bit and clear_extent_dirty.
5028		 * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
5029		 * the same extent range.
5030		 */
5031		mutex_lock(&fs_info->unused_bg_unpin_mutex);
5032		ret = find_first_extent_bit(unpin, 0, &start, &end,
5033					    EXTENT_DIRTY, &cached_state);
5034		if (ret) {
5035			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5036			break;
5037		}
5038
5039		clear_extent_dirty(unpin, start, end, &cached_state);
5040		free_extent_state(cached_state);
5041		btrfs_error_unpin_extent_range(fs_info, start, end);
5042		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5043		cond_resched();
5044	}
5045
5046	return 0;
5047}
5048
5049static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
5050{
5051	struct inode *inode;
5052
5053	inode = cache->io_ctl.inode;
5054	if (inode) {
5055		invalidate_inode_pages2(inode->i_mapping);
5056		BTRFS_I(inode)->generation = 0;
5057		cache->io_ctl.inode = NULL;
5058		iput(inode);
5059	}
5060	ASSERT(cache->io_ctl.pages == NULL);
5061	btrfs_put_block_group(cache);
5062}
5063
5064void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
5065			     struct btrfs_fs_info *fs_info)
5066{
5067	struct btrfs_block_group *cache;
5068
5069	spin_lock(&cur_trans->dirty_bgs_lock);
5070	while (!list_empty(&cur_trans->dirty_bgs)) {
5071		cache = list_first_entry(&cur_trans->dirty_bgs,
5072					 struct btrfs_block_group,
5073					 dirty_list);
5074
5075		if (!list_empty(&cache->io_list)) {
5076			spin_unlock(&cur_trans->dirty_bgs_lock);
5077			list_del_init(&cache->io_list);
5078			btrfs_cleanup_bg_io(cache);
5079			spin_lock(&cur_trans->dirty_bgs_lock);
5080		}
5081
5082		list_del_init(&cache->dirty_list);
5083		spin_lock(&cache->lock);
5084		cache->disk_cache_state = BTRFS_DC_ERROR;
5085		spin_unlock(&cache->lock);
5086
5087		spin_unlock(&cur_trans->dirty_bgs_lock);
5088		btrfs_put_block_group(cache);
5089		btrfs_delayed_refs_rsv_release(fs_info, 1);
5090		spin_lock(&cur_trans->dirty_bgs_lock);
5091	}
5092	spin_unlock(&cur_trans->dirty_bgs_lock);
5093
5094	/*
5095	 * Refer to the definition of io_bgs member for details why it's safe
5096	 * to use it without any locking
5097	 */
5098	while (!list_empty(&cur_trans->io_bgs)) {
5099		cache = list_first_entry(&cur_trans->io_bgs,
5100					 struct btrfs_block_group,
5101					 io_list);
5102
5103		list_del_init(&cache->io_list);
5104		spin_lock(&cache->lock);
5105		cache->disk_cache_state = BTRFS_DC_ERROR;
5106		spin_unlock(&cache->lock);
5107		btrfs_cleanup_bg_io(cache);
5108	}
5109}
5110
5111void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
5112				   struct btrfs_fs_info *fs_info)
5113{
5114	struct btrfs_device *dev, *tmp;
5115
5116	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
5117	ASSERT(list_empty(&cur_trans->dirty_bgs));
5118	ASSERT(list_empty(&cur_trans->io_bgs));
5119
5120	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
5121				 post_commit_list) {
5122		list_del_init(&dev->post_commit_list);
5123	}
5124
5125	btrfs_destroy_delayed_refs(cur_trans, fs_info);
5126
5127	cur_trans->state = TRANS_STATE_COMMIT_START;
5128	wake_up(&fs_info->transaction_blocked_wait);
5129
5130	cur_trans->state = TRANS_STATE_UNBLOCKED;
5131	wake_up(&fs_info->transaction_wait);
5132
5133	btrfs_destroy_delayed_inodes(fs_info);
5134
5135	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
5136				     EXTENT_DIRTY);
5137	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
5138
5139	btrfs_free_redirty_list(cur_trans);
5140
5141	cur_trans->state =TRANS_STATE_COMPLETED;
5142	wake_up(&cur_trans->commit_wait);
5143}
5144
5145static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
5146{
5147	struct btrfs_transaction *t;
5148
5149	mutex_lock(&fs_info->transaction_kthread_mutex);
5150
5151	spin_lock(&fs_info->trans_lock);
5152	while (!list_empty(&fs_info->trans_list)) {
5153		t = list_first_entry(&fs_info->trans_list,
5154				     struct btrfs_transaction, list);
5155		if (t->state >= TRANS_STATE_COMMIT_START) {
5156			refcount_inc(&t->use_count);
5157			spin_unlock(&fs_info->trans_lock);
5158			btrfs_wait_for_commit(fs_info, t->transid);
5159			btrfs_put_transaction(t);
5160			spin_lock(&fs_info->trans_lock);
5161			continue;
5162		}
5163		if (t == fs_info->running_transaction) {
5164			t->state = TRANS_STATE_COMMIT_DOING;
5165			spin_unlock(&fs_info->trans_lock);
5166			/*
5167			 * We wait for 0 num_writers since we don't hold a trans
5168			 * handle open currently for this transaction.
5169			 */
5170			wait_event(t->writer_wait,
5171				   atomic_read(&t->num_writers) == 0);
5172		} else {
5173			spin_unlock(&fs_info->trans_lock);
5174		}
5175		btrfs_cleanup_one_transaction(t, fs_info);
5176
5177		spin_lock(&fs_info->trans_lock);
5178		if (t == fs_info->running_transaction)
5179			fs_info->running_transaction = NULL;
5180		list_del_init(&t->list);
5181		spin_unlock(&fs_info->trans_lock);
5182
5183		btrfs_put_transaction(t);
5184		trace_btrfs_transaction_commit(fs_info);
5185		spin_lock(&fs_info->trans_lock);
5186	}
5187	spin_unlock(&fs_info->trans_lock);
5188	btrfs_destroy_all_ordered_extents(fs_info);
5189	btrfs_destroy_delayed_inodes(fs_info);
5190	btrfs_assert_delayed_root_empty(fs_info);
5191	btrfs_destroy_all_delalloc_inodes(fs_info);
5192	btrfs_drop_all_logs(fs_info);
5193	mutex_unlock(&fs_info->transaction_kthread_mutex);
5194
5195	return 0;
5196}
5197
5198int btrfs_init_root_free_objectid(struct btrfs_root *root)
5199{
5200	struct btrfs_path *path;
5201	int ret;
5202	struct extent_buffer *l;
5203	struct btrfs_key search_key;
5204	struct btrfs_key found_key;
5205	int slot;
5206
5207	path = btrfs_alloc_path();
5208	if (!path)
5209		return -ENOMEM;
5210
5211	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
5212	search_key.type = -1;
5213	search_key.offset = (u64)-1;
5214	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5215	if (ret < 0)
5216		goto error;
5217	BUG_ON(ret == 0); /* Corruption */
5218	if (path->slots[0] > 0) {
5219		slot = path->slots[0] - 1;
5220		l = path->nodes[0];
5221		btrfs_item_key_to_cpu(l, &found_key, slot);
5222		root->free_objectid = max_t(u64, found_key.objectid + 1,
5223					    BTRFS_FIRST_FREE_OBJECTID);
5224	} else {
5225		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
5226	}
5227	ret = 0;
5228error:
5229	btrfs_free_path(path);
5230	return ret;
5231}
5232
5233int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
5234{
5235	int ret;
5236	mutex_lock(&root->objectid_mutex);
5237
5238	if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
5239		btrfs_warn(root->fs_info,
5240			   "the objectid of root %llu reaches its highest value",
5241			   root->root_key.objectid);
5242		ret = -ENOSPC;
5243		goto out;
5244	}
5245
5246	*objectid = root->free_objectid++;
5247	ret = 0;
5248out:
5249	mutex_unlock(&root->objectid_mutex);
5250	return ret;
5251}
5252