// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   every nodesize fits inside one page, so we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and the kernel have enforced this for a while, thus only
 *   ancient filesystems could have such a problem.  For such cases, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   That means reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K        64K
 *   |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we would have greatly reduced concurrency or even
 *   deadlocks (holding one tree lock while trying to lock another tree block
 *   in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */

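/*
 * The btrfs_subpage structure itself is defined in subpage.h.  As a rough,
 * illustrative sketch (the header is authoritative), the fields this file
 * relies on are:
 *
 *	spinlock_t lock;	   protects the status bitmaps
 *	atomic_t eb_refs;	   metadata: extent buffers in the page
 *	atomic_t readers;	   data: outstanding read sectors
 *	atomic_t writers;	   data: outstanding write sectors
 *	unsigned long bitmaps[];   per-sector status bits, laid out according
 *				   to btrfs_subpage_info
 */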
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
	unsigned int cur = 0;
	unsigned int nr_bits;

	ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));

	nr_bits = PAGE_SIZE / sectorsize;
	subpage_info->bitmap_nr_bits = nr_bits;

	subpage_info->uptodate_offset = cur;
	cur += nr_bits;

	subpage_info->error_offset = cur;
	cur += nr_bits;

	subpage_info->dirty_offset = cur;
	cur += nr_bits;

	subpage_info->writeback_offset = cur;
	cur += nr_bits;

	subpage_info->ordered_offset = cur;
	cur += nr_bits;

	subpage_info->checked_offset = cur;
	cur += nr_bits;

	subpage_info->total_nr_bits = cur;
}

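/*
 * Worked example of the layout above: with a 4K sectorsize on a 64K page,
 * bitmap_nr_bits is 16 and the offsets into the shared bitmap become
 * uptodate = 0, error = 16, dirty = 32, writeback = 48, ordered = 64 and
 * checked = 80, for a total_nr_bits of 96.
 */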
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));

	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	subpage = btrfs_alloc_subpage(fs_info, type);
	if (IS_ERR(subpage))
		return PTR_ERR(subpage);

	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
					  enum btrfs_subpage_type type)
{
	struct btrfs_subpage *ret;
	unsigned int real_size;

	ASSERT(fs_info->sectorsize < PAGE_SIZE);

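	/*
	 * For the 4K-on-64K example above, total_nr_bits is 96, so the
	 * struct_size() call below appends BITS_TO_LONGS(96) longs of
	 * bitmap space (2 on a 64-bit kernel) to the base structure.
	 */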
	real_size = struct_size(ret, bitmaps,
			BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
	ret = kzalloc(real_size, GFP_NOFS);
	if (!ret)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&ret->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&ret->eb_refs, 0);
	} else {
		atomic_set(&ret->readers, 0);
		atomic_set(&ret->writers, 0);
	}
	return ret;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing
 * of the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're
 * still allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}
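/*
 * The expected pairing: eb_refs is increased under mapping->private_lock
 * before the extent buffer is inserted (see above), and the counterpart
 * decrement comes from the extent buffer release path, so the page private
 * is only detached once no extent buffer in the page still references it.
 */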

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * Please don't replace @last with an atomic_sub_and_test() call
	 * inside the if () condition, as the atomic_sub_and_test() must
	 * always be executed.
	 */
	if (is_data && last)
		unlock_page(page);
}
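/*
 * Example of the reader accounting above: with a 4K sectorsize, reading an
 * 8K range adds 2 to @readers in btrfs_subpage_start_reader().  Once the
 * matching btrfs_subpage_end_reader() calls bring the count back to zero,
 * a data page is unlocked; metadata pages are never unlocked here.
 */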

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	/*
	 * For certain call sites like btrfs_drop_pages(), we may have pages
	 * beyond the target range. In that case, just set @len to 0, subpage
	 * helpers can handle @len == 0 without any problem.
	 */
	if (page_offset(page) >= orig_start + orig_len)
		*len = 0;
	else
		*len = min_t(u64, page_offset(page) + PAGE_SIZE,
			     orig_start + orig_len) - *start;
}
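/*
 * Example: for a page covering file range [64K, 128K) and an input range of
 * [60K, 72K), the clamped result is *start = 64K and *len = 8K.  A page that
 * starts at or beyond the end of the input range gets *len = 0.
 */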

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,