space-info.c revision 93c60b17
1// SPDX-License-Identifier: GPL-2.0
2
3#include "misc.h"
4#include "ctree.h"
5#include "space-info.h"
6#include "sysfs.h"
7#include "volumes.h"
8#include "free-space-cache.h"
9#include "ordered-data.h"
10#include "transaction.h"
11#include "block-group.h"
12
13/*
14 * HOW DOES SPACE RESERVATION WORK
15 *
16 * If you want to know about delalloc specifically, there is a separate comment
17 * for that with the delalloc code.  This comment is about how the whole system
18 * works generally.
19 *
20 * BASIC CONCEPTS
21 *
22 *   1) space_info.  This is the ultimate arbiter of how much space we can use.
23 *   There's a description of the bytes_ fields with the struct declaration,
24 *   refer to that for specifics on each field.  Suffice it to say that for
25 *   reservations we care about total_bytes - SUM(space_info->bytes_) when
26 *   determining if there is space to make an allocation.  There is a space_info
27 *   for METADATA, SYSTEM, and DATA areas.
28 *
29 *   2) block_rsv's.  These are basically buckets for every different type of
30 *   metadata reservation we have.  You can see the comment in the block_rsv
31 *   code on the rules for each type, but generally block_rsv->reserved is how
32 *   much space is accounted for in space_info->bytes_may_use.
33 *
34 *   3) btrfs_calc*_size.  These are the worst case calculations we used based
35 *   on the number of items we will want to modify.  We have one for changing
36 *   items, and one for inserting new items.  Generally we use these helpers to
37 *   determine the size of the block reserves, and then use the actual bytes
38 *   values to adjust the space_info counters.
39 *
40 * MAKING RESERVATIONS, THE NORMAL CASE
41 *
42 *   We call into either btrfs_reserve_data_bytes() or
43 *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
44 *   num_bytes we want to reserve.
45 *
46 *   ->reserve
47 *     space_info->bytes_may_reserve += num_bytes
48 *
49 *   ->extent allocation
50 *     Call btrfs_add_reserved_bytes() which does
51 *     space_info->bytes_may_reserve -= num_bytes
52 *     space_info->bytes_reserved += extent_bytes
53 *
54 *   ->insert reference
55 *     Call btrfs_update_block_group() which does
56 *     space_info->bytes_reserved -= extent_bytes
57 *     space_info->bytes_used += extent_bytes
58 *
59 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
60 *
61 *   Assume we are unable to simply make the reservation because we do not have
62 *   enough space
63 *
64 *   -> __reserve_bytes
65 *     create a reserve_ticket with ->bytes set to our reservation, add it to
66 *     the tail of space_info->tickets, kick async flush thread
67 *
68 *   ->handle_reserve_ticket
69 *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
70 *     on the ticket.
71 *
72 *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
73 *     Flushes various things attempting to free up space.
74 *
75 *   -> btrfs_try_granting_tickets()
76 *     This is called by anything that either subtracts space from
77 *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
78 *     space_info->total_bytes.  This loops through the ->priority_tickets and
79 *     then the ->tickets list checking to see if the reservation can be
80 *     completed.  If it can the space is added to space_info->bytes_may_use and
81 *     the ticket is woken up.
82 *
83 *   -> ticket wakeup
84 *     Check if ->bytes == 0, if it does we got our reservation and we can carry
85 *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
86 *     were interrupted.)
87 *
88 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
89 *
90 *   Same as the above, except we add ourselves to the
91 *   space_info->priority_tickets, and we do not use ticket->wait, we simply
92 *   call flush_space() ourselves for the states that are safe for us to call
93 *   without deadlocking and hope for the best.
94 *
95 * THE FLUSHING STATES
96 *
97 *   Generally speaking we will have two cases for each state, a "nice" state
98 *   and a "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
99 *   reduce the locking over head on the various trees, and even to keep from
100 *   doing any work at all in the case of delayed refs.  Each of these delayed
101 *   things however hold reservations, and so letting them run allows us to
102 *   reclaim space so we can make new reservations.
103 *
104 *   FLUSH_DELAYED_ITEMS
105 *     Every inode has a delayed item to update the inode.  Take a simple write
106 *     for example, we would update the inode item at write time to update the
107 *     mtime, and then again at finish_ordered_io() time in order to update the
108 *     isize or bytes.  We keep these delayed items to coalesce these operations
109 *     into a single operation done on demand.  These are an easy way to reclaim
110 *     metadata space.
111 *
112 *   FLUSH_DELALLOC
113 *     Look at the delalloc comment to get an idea of how much space is reserved
114 *     for delayed allocation.  We can reclaim some of this space simply by
115 *     running delalloc, but usually we need to wait for ordered extents to
116 *     reclaim the bulk of this space.
117 *
118 *   FLUSH_DELAYED_REFS
119 *     We have a block reserve for the outstanding delayed refs space, and every
120 *     delayed ref operation holds a reservation.  Running these is a quick way
121 *     to reclaim space, but we want to hold this until the end because COW can
122 *     churn a lot and we can avoid making some extent tree modifications if we
123 *     are able to delay for as long as possible.
124 *
125 *   ALLOC_CHUNK
126 *     We will skip this the first time through space reservation, because of
127 *     overcommit and we don't want to have a lot of useless metadata space when
128 *     our worst case reservations will likely never come true.
129 *
130 *   RUN_DELAYED_IPUTS
131 *     If we're freeing inodes we're likely freeing checksums, file extent
132 *     items, and extent tree items.  Loads of space could be freed up by these
133 *     operations, however they won't be usable until the transaction commits.
134 *
135 *   COMMIT_TRANS
136 *     This will commit the transaction.  Historically we had a lot of logic
137 *     surrounding whether or not we'd commit the transaction, but this waits born
138 *     out of a pre-tickets era where we could end up committing the transaction
139 *     thousands of times in a row without making progress.  Now thanks to our
140 *     ticketing system we know if we're not making progress and can error
141 *     everybody out after a few commits rather than burning the disk hoping for
142 *     a different answer.
143 *
144 * OVERCOMMIT
145 *
146 *   Because we hold so many reservations for metadata we will allow you to
147 *   reserve more space than is currently free in the currently allocate
148 *   metadata space.  This only happens with metadata, data does not allow
149 *   overcommitting.
150 *
151 *   You can see the current logic for when we allow overcommit in
152 *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
153 *   is no unallocated space to be had, all reservations are kept within the
154 *   free space in the allocated metadata chunks.
155 *
156 *   Because of overcommitting, you generally want to use the
157 *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
158 *   thing with or without extra unallocated space.
159 */
160
161u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
162			  bool may_use_included)
163{
164	ASSERT(s_info);
165	return s_info->bytes_used + s_info->bytes_reserved +
166		s_info->bytes_pinned + s_info->bytes_readonly +
167		s_info->bytes_zone_unusable +
168		(may_use_included ? s_info->bytes_may_use : 0);
169}
170
171/*
172 * after adding space to the filesystem, we need to clear the full flags
173 * on all the space infos.
174 */
175void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
176{
177	struct list_head *head = &info->space_info;
178	struct btrfs_space_info *found;
179
180	list_for_each_entry(found, head, list)
181		found->full = 0;
182}
183
184static int create_space_info(struct btrfs_fs_info *info, u64 flags)
185{
186
187	struct btrfs_space_info *space_info;
188	int i;
189	int ret;
190
191	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
192	if (!space_info)
193		return -ENOMEM;
194
195	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
196		INIT_LIST_HEAD(&space_info->block_groups[i]);
197	init_rwsem(&space_info->groups_sem);
198	spin_lock_init(&space_info->lock);
199	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
200	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
201	INIT_LIST_HEAD(&space_info->ro_bgs);
202	INIT_LIST_HEAD(&space_info->tickets);
203	INIT_LIST_HEAD(&space_info->priority_tickets);
204	space_info->clamp = 1;
205
206	ret = btrfs_sysfs_add_space_info_type(info, space_info);
207	if (ret)
208		return ret;
209
210	list_add(&space_info->list, &info->space_info);
211	if (flags & BTRFS_BLOCK_GROUP_DATA)
212		info->data_sinfo = space_info;
213
214	return ret;
215}
216
217int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
218{
219	struct btrfs_super_block *disk_super;
220	u64 features;
221	u64 flags;
222	int mixed = 0;
223	int ret;
224
225	disk_super = fs_info->super_copy;
226	if (!btrfs_super_root(disk_super))
227		return -EINVAL;
228
229	features = btrfs_super_incompat_flags(disk_super);
230	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
231		mixed = 1;
232
233	flags = BTRFS_BLOCK_GROUP_SYSTEM;
234	ret = create_space_info(fs_info, flags);
235	if (ret)
236		goto out;
237
238	if (mixed) {
239		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
240		ret = create_space_info(fs_info, flags);
241	} else {
242		flags = BTRFS_BLOCK_GROUP_METADATA;
243		ret = create_space_info(fs_info, flags);
244		if (ret)
245			goto out;
246
247		flags = BTRFS_BLOCK_GROUP_DATA;
248		ret = create_space_info(fs_info, flags);
249	}
250out:
251	return ret;
252}
253
254void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
255			     u64 total_bytes, u64 bytes_used,
256			     u64 bytes_readonly, u64 bytes_zone_unusable,
257			     struct btrfs_space_info **space_info)
258{
259	struct btrfs_space_info *found;
260	int factor;
261
262	factor = btrfs_bg_type_to_factor(flags);
263
264	found = btrfs_find_space_info(info, flags);
265	ASSERT(found);
266	spin_lock(&found->lock);
267	found->total_bytes += total_bytes;
268	found->disk_total += total_bytes * factor;
269	found->bytes_used += bytes_used;
270	found->disk_used += bytes_used * factor;
271	found->bytes_readonly += bytes_readonly;
272	found->bytes_zone_unusable += bytes_zone_unusable;
273	if (total_bytes > 0)
274		found->full = 0;
275	btrfs_try_granting_tickets(info, found);
276	spin_unlock(&found->lock);
277	*space_info = found;
278}
279
280struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
281					       u64 flags)
282{
283	struct list_head *head = &info->space_info;
284	struct btrfs_space_info *found;
285
286	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
287
288	list_for_each_entry(found, head, list) {
289		if (found->flags & flags)
290			return found;
291	}
292	return NULL;
293}
294
295static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
296			  struct btrfs_space_info *space_info,
297			  enum btrfs_reserve_flush_enum flush)
298{
299	u64 profile;
300	u64 avail;
301	int factor;
302
303	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
304		profile = btrfs_system_alloc_profile(fs_info);
305	else
306		profile = btrfs_metadata_alloc_profile(fs_info);
307
308	avail = atomic64_read(&fs_info->free_chunk_space);
309
310	/*
311	 * If we have dup, raid1 or raid10 then only half of the free
312	 * space is actually usable.  For raid56, the space info used
313	 * doesn't include the parity drive, so we don't have to
314	 * change the math
315	 */
316	factor = btrfs_bg_type_to_factor(profile);
317	avail = div_u64(avail, factor);
318
319	/*
320	 * If we aren't flushing all things, let us overcommit up to
321	 * 1/2th of the space. If we can flush, don't let us overcommit
322	 * too much, let it overcommit up to 1/8 of the space.
323	 */
324	if (flush == BTRFS_RESERVE_FLUSH_ALL)
325		avail >>= 3;
326	else
327		avail >>= 1;
328	return avail;
329}
330
331int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
332			 struct btrfs_space_info *space_info, u64 bytes,
333			 enum btrfs_reserve_flush_enum flush)
334{
335	u64 avail;
336	u64 used;
337
338	/* Don't overcommit when in mixed mode */
339	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
340		return 0;
341
342	used = btrfs_space_info_used(space_info, true);
343	avail = calc_available_free_space(fs_info, space_info, flush);
344
345	if (used + bytes < space_info->total_bytes + avail)
346		return 1;
347	return 0;
348}
349
350static void remove_ticket(struct btrfs_space_info *space_info,
351			  struct reserve_ticket *ticket)
352{
353	if (!list_empty(&ticket->list)) {
354		list_del_init(&ticket->list);
355		ASSERT(space_info->reclaim_size >= ticket->bytes);
356		space_info->reclaim_size -= ticket->bytes;
357	}
358}
359
360/*
361 * This is for space we already have accounted in space_info->bytes_may_use, so
362 * basically when we're returning space from block_rsv's.
363 */
364void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
365				struct btrfs_space_info *space_info)
366{
367	struct list_head *head;
368	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
369
370	lockdep_assert_held(&space_info->lock);
371
372	head = &space_info->priority_tickets;
373again:
374	while (!list_empty(head)) {
375		struct reserve_ticket *ticket;
376		u64 used = btrfs_space_info_used(space_info, true);
377
378		ticket = list_first_entry(head, struct reserve_ticket, list);
379
380		/* Check and see if our ticket can be satisfied now. */
381		if ((used + ticket->bytes <= space_info->total_bytes) ||
382		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
383					 flush)) {
384			btrfs_space_info_update_bytes_may_use(fs_info,
385							      space_info,
386							      ticket->bytes);
387			remove_ticket(space_info, ticket);
388			ticket->bytes = 0;
389			space_info->tickets_id++;
390			wake_up(&ticket->wait);
391		} else {
392			break;
393		}
394	}
395
396	if (head == &space_info->priority_tickets) {
397		head = &space_info->tickets;
398		flush = BTRFS_RESERVE_FLUSH_ALL;
399		goto again;
400	}
401}
402
403#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
404do {									\
405	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
406	spin_lock(&__rsv->lock);					\
407	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
408		   __rsv->size, __rsv->reserved);			\
409	spin_unlock(&__rsv->lock);					\
410} while (0)
411
412static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
413				    struct btrfs_space_info *info)
414{
415	lockdep_assert_held(&info->lock);
416
417	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
418		   info->flags,
419		   info->total_bytes - btrfs_space_info_used(info, true),
420		   info->full ? "" : "not ");
421	btrfs_info(fs_info,
422		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
423		info->total_bytes, info->bytes_used, info->bytes_pinned,
424		info->bytes_reserved, info->bytes_may_use,
425		info->bytes_readonly, info->bytes_zone_unusable);
426
427	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
428	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
429	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
430	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
431	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
432
433}
434
435void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
436			   struct btrfs_space_info *info, u64 bytes,
437			   int dump_block_groups)
438{
439	struct btrfs_block_group *cache;
440	int index = 0;
441
442	spin_lock(&info->lock);
443	__btrfs_dump_space_info(fs_info, info);
444	spin_unlock(&info->lock);
445
446	if (!dump_block_groups)
447		return;
448
449	down_read(&info->groups_sem);
450again:
451	list_for_each_entry(cache, &info->block_groups[index], list) {
452		spin_lock(&cache->lock);
453		btrfs_info(fs_info,
454			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
455			cache->start, cache->length, cache->used, cache->pinned,
456			cache->reserved, cache->zone_unusable,
457			cache->ro ? "[readonly]" : "");
458		spin_unlock(&cache->lock);
459		btrfs_dump_free_space(cache, bytes);
460	}
461	if (++index < BTRFS_NR_RAID_TYPES)
462		goto again;
463	up_read(&info->groups_sem);
464}
465
466static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
467					u64 to_reclaim)
468{
469	u64 bytes;
470	u64 nr;
471
472	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
473	nr = div64_u64(to_reclaim, bytes);
474	if (!nr)
475		nr = 1;
476	return nr;
477}
478
479#define EXTENT_SIZE_PER_ITEM	SZ_256K
480
481/*
482 * shrink metadata reservation for delalloc
483 */
484static void shrink_delalloc(struct btrfs_fs_info *fs_info,
485			    struct btrfs_space_info *space_info,
486			    u64 to_reclaim, bool wait_ordered,
487			    bool for_preempt)
488{
489	struct btrfs_trans_handle *trans;
490	u64 delalloc_bytes;
491	u64 ordered_bytes;
492	u64 items;
493	long time_left;
494	int loops;
495
496	delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
497	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
498	if (delalloc_bytes == 0 && ordered_bytes == 0)
499		return;
500
501	/* Calc the number of the pages we need flush for space reservation */
502	if (to_reclaim == U64_MAX) {
503		items = U64_MAX;
504	} else {
505		/*
506		 * to_reclaim is set to however much metadata we need to
507		 * reclaim, but reclaiming that much data doesn't really track
508		 * exactly.  What we really want to do is reclaim full inode's
509		 * worth of reservations, however that's not available to us
510		 * here.  We will take a fraction of the delalloc bytes for our
511		 * flushing loops and hope for the best.  Delalloc will expand
512		 * the amount we write to cover an entire dirty extent, which
513		 * will reclaim the metadata reservation for that range.  If
514		 * it's not enough subsequent flush stages will be more
515		 * aggressive.
516		 */
517		to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
518		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
519	}
520
521	trans = (struct btrfs_trans_handle *)current->journal_info;
522
523	/*
524	 * If we are doing more ordered than delalloc we need to just wait on
525	 * ordered extents, otherwise we'll waste time trying to flush delalloc
526	 * that likely won't give us the space back we need.
527	 */
528	if (ordered_bytes > delalloc_bytes && !for_preempt)
529		wait_ordered = true;
530
531	loops = 0;
532	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
533		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
534		long nr_pages = min_t(u64, temp, LONG_MAX);
535		int async_pages;
536
537		btrfs_start_delalloc_roots(fs_info, nr_pages, true);
538
539		/*
540		 * We need to make sure any outstanding async pages are now
541		 * processed before we continue.  This is because things like
542		 * sync_inode() try to be smart and skip writing if the inode is
543		 * marked clean.  We don't use filemap_fwrite for flushing
544		 * because we want to control how many pages we write out at a
545		 * time, thus this is the only safe way to make sure we've
546		 * waited for outstanding compressed workers to have started
547		 * their jobs and thus have ordered extents set up properly.
548		 *
549		 * This exists because we do not want to wait for each
550		 * individual inode to finish its async work, we simply want to
551		 * start the IO on everybody, and then come back here and wait
552		 * for all of the async work to catch up.  Once we're done with
553		 * that we know we'll have ordered extents for everything and we
554		 * can decide if we wait for that or not.
555		 *
556		 * If we choose to replace this in the future, make absolutely
557		 * sure that the proper waiting is being done in the async case,
558		 * as there have been bugs in that area before.
559		 */
560		async_pages = atomic_read(&fs_info->async_delalloc_pages);
561		if (!async_pages)
562			goto skip_async;
563
564		/*
565		 * We don't want to wait forever, if we wrote less pages in this
566		 * loop than we have outstanding, only wait for that number of
567		 * pages, otherwise we can wait for all async pages to finish
568		 * before continuing.
569		 */
570		if (async_pages > nr_pages)
571			async_pages -= nr_pages;
572		else
573			async_pages = 0;
574		wait_event(fs_info->async_submit_wait,
575			   atomic_read(&fs_info->async_delalloc_pages) <=
576			   async_pages);
577skip_async:
578		loops++;
579		if (wait_ordered && !trans) {
580			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
581		} else {
582			time_left = schedule_timeout_killable(1);
583			if (time_left)
584				break;
585		}
586
587		/*
588		 * If we are for preemption we just want a one-shot of delalloc
589		 * flushing so we can stop flushing if we decide we don't need
590		 * to anymore.
591		 */
592		if (for_preempt)
593			break;
594
595		spin_lock(&space_info->lock);
596		if (list_empty(&space_info->tickets) &&
597		    list_empty(&space_info->priority_tickets)) {
598			spin_unlock(&space_info->lock);
599			break;
600		}
601		spin_unlock(&space_info->lock);
602
603		delalloc_bytes = percpu_counter_sum_positive(
604						&fs_info->delalloc_bytes);
605		ordered_bytes = percpu_counter_sum_positive(
606						&fs_info->ordered_bytes);
607	}
608}
609
610/*
611 * Try to flush some data based on policy set by @state. This is only advisory
612 * and may fail for various reasons. The caller is supposed to examine the
613 * state of @space_info to detect the outcome.
614 */
615static void flush_space(struct btrfs_fs_info *fs_info,
616		       struct btrfs_space_info *space_info, u64 num_bytes,
617		       enum btrfs_flush_state state, bool for_preempt)
618{
619	struct btrfs_root *root = fs_info->extent_root;
620	struct btrfs_trans_handle *trans;
621	int nr;
622	int ret = 0;
623
624	switch (state) {
625	case FLUSH_DELAYED_ITEMS_NR:
626	case FLUSH_DELAYED_ITEMS:
627		if (state == FLUSH_DELAYED_ITEMS_NR)
628			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
629		else
630			nr = -1;
631
632		trans = btrfs_join_transaction(root);
633		if (IS_ERR(trans)) {
634			ret = PTR_ERR(trans);
635			break;
636		}
637		ret = btrfs_run_delayed_items_nr(trans, nr);
638		btrfs_end_transaction(trans);
639		break;
640	case FLUSH_DELALLOC:
641	case FLUSH_DELALLOC_WAIT:
642	case FLUSH_DELALLOC_FULL:
643		if (state == FLUSH_DELALLOC_FULL)
644			num_bytes = U64_MAX;
645		shrink_delalloc(fs_info, space_info, num_bytes,
646				state != FLUSH_DELALLOC, for_preempt);
647		break;
648	case FLUSH_DELAYED_REFS_NR:
649	case FLUSH_DELAYED_REFS:
650		trans = btrfs_join_transaction(root);
651		if (IS_ERR(trans)) {
652			ret = PTR_ERR(trans);
653			break;
654		}
655		if (state == FLUSH_DELAYED_REFS_NR)
656			nr = calc_reclaim_items_nr(fs_info, num_bytes);
657		else
658			nr = 0;
659		btrfs_run_delayed_refs(trans, nr);
660		btrfs_end_transaction(trans);
661		break;
662	case ALLOC_CHUNK:
663	case ALLOC_CHUNK_FORCE:
664		trans = btrfs_join_transaction(root);
665		if (IS_ERR(trans)) {
666			ret = PTR_ERR(trans);
667			break;
668		}
669		ret = btrfs_chunk_alloc(trans,
670				btrfs_get_alloc_profile(fs_info, space_info->flags),
671				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
672					CHUNK_ALLOC_FORCE);
673		btrfs_end_transaction(trans);
674		if (ret > 0 || ret == -ENOSPC)
675			ret = 0;
676		break;
677	case RUN_DELAYED_IPUTS:
678		/*
679		 * If we have pending delayed iputs then we could free up a
680		 * bunch of pinned space, so make sure we run the iputs before
681		 * we do our pinned bytes check below.
682		 */
683		btrfs_run_delayed_iputs(fs_info);
684		btrfs_wait_on_delayed_iputs(fs_info);
685		break;
686	case COMMIT_TRANS:
687		ASSERT(current->journal_info == NULL);
688		trans = btrfs_join_transaction(root);
689		if (IS_ERR(trans)) {
690			ret = PTR_ERR(trans);
691			break;
692		}
693		ret = btrfs_commit_transaction(trans);
694		break;
695	default:
696		ret = -ENOSPC;
697		break;
698	}
699
700	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
701				ret, for_preempt);
702	return;
703}
704
705static inline u64
706btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
707				 struct btrfs_space_info *space_info)
708{
709	u64 used;
710	u64 avail;
711	u64 to_reclaim = space_info->reclaim_size;
712
713	lockdep_assert_held(&space_info->lock);
714
715	avail = calc_available_free_space(fs_info, space_info,
716					  BTRFS_RESERVE_FLUSH_ALL);
717	used = btrfs_space_info_used(space_info, true);
718
719	/*
720	 * We may be flushing because suddenly we have less space than we had
721	 * before, and now we're well over-committed based on our current free
722	 * space.  If that's the case add in our overage so we make sure to put
723	 * appropriate pressure on the flushing state machine.
724	 */
725	if (space_info->total_bytes + avail < used)
726		to_reclaim += used - (space_info->total_bytes + avail);
727
728	return to_reclaim;
729}
730
731static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
732				    struct btrfs_space_info *space_info)
733{
734	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
735	u64 ordered, delalloc;
736	u64 thresh = div_factor_fine(space_info->total_bytes, 90);
737	u64 used;
738
739	/* If we're just plain full then async reclaim just slows us down. */
740	if ((space_info->bytes_used + space_info->bytes_reserved +
741	     global_rsv_size) >= thresh)
742		return false;
743
744	/*
745	 * We have tickets queued, bail so we don't compete with the async
746	 * flushers.
747	 */
748	if (space_info->reclaim_size)
749		return false;
750
751	/*
752	 * If we have over half of the free space occupied by reservations or
753	 * pinned then we want to start flushing.
754	 *
755	 * We do not do the traditional thing here, which is to say
756	 *
757	 *   if (used >= ((total_bytes + avail) / 2))
758	 *     return 1;
759	 *
760	 * because this doesn't quite work how we want.  If we had more than 50%
761	 * of the space_info used by bytes_used and we had 0 available we'd just
762	 * constantly run the background flusher.  Instead we want it to kick in
763	 * if our reclaimable space exceeds our clamped free space.
764	 *
765	 * Our clamping range is 2^1 -> 2^8.  Practically speaking that means
766	 * the following:
767	 *
768	 * Amount of RAM        Minimum threshold       Maximum threshold
769	 *
770	 *        256GiB                     1GiB                  128GiB
771	 *        128GiB                   512MiB                   64GiB
772	 *         64GiB                   256MiB                   32GiB
773	 *         32GiB                   128MiB                   16GiB
774	 *         16GiB                    64MiB                    8GiB
775	 *
776	 * These are the range our thresholds will fall in, corresponding to how
777	 * much delalloc we need for the background flusher to kick in.
778	 */
779
780	thresh = calc_available_free_space(fs_info, space_info,
781					   BTRFS_RESERVE_FLUSH_ALL);
782	used = space_info->bytes_used + space_info->bytes_reserved +
783	       space_info->bytes_readonly + global_rsv_size;
784	if (used < space_info->total_bytes)
785		thresh += space_info->total_bytes - used;
786	thresh >>= space_info->clamp;
787
788	used = space_info->bytes_pinned;
789
790	/*
791	 * If we have more ordered bytes than delalloc bytes then we're either
792	 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
793	 * around.  Preemptive flushing is only useful in that it can free up
794	 * space before tickets need to wait for things to finish.  In the case
795	 * of ordered extents, preemptively waiting on ordered extents gets us
796	 * nothing, if our reservations are tied up in ordered extents we'll
797	 * simply have to slow down writers by forcing them to wait on ordered
798	 * extents.
799	 *
800	 * In the case that ordered is larger than delalloc, only include the
801	 * block reserves that we would actually be able to directly reclaim
802	 * from.  In this case if we're heavy on metadata operations this will
803	 * clearly be heavy enough to warrant preemptive flushing.  In the case
804	 * of heavy DIO or ordered reservations, preemptive flushing will just
805	 * waste time and cause us to slow down.
806	 *
807	 * We want to make sure we truly are maxed out on ordered however, so
808	 * cut ordered in half, and if it's still higher than delalloc then we
809	 * can keep flushing.  This is to avoid the case where we start
810	 * flushing, and now delalloc == ordered and we stop preemptively
811	 * flushing when we could still have several gigs of delalloc to flush.
812	 */
813	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
814	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
815	if (ordered >= delalloc)
816		used += fs_info->delayed_refs_rsv.reserved +
817			fs_info->delayed_block_rsv.reserved;
818	else
819		used += space_info->bytes_may_use - global_rsv_size;
820
821	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
822		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
823}
824
825static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
826				  struct btrfs_space_info *space_info,
827				  struct reserve_ticket *ticket)
828{
829	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
830	u64 min_bytes;
831
832	if (global_rsv->space_info != space_info)
833		return false;
834
835	spin_lock(&global_rsv->lock);
836	min_bytes = div_factor(global_rsv->size, 1);
837	if (global_rsv->reserved < min_bytes + ticket->bytes) {
838		spin_unlock(&global_rsv->lock);
839		return false;
840	}
841	global_rsv->reserved -= ticket->bytes;
842	remove_ticket(space_info, ticket);
843	ticket->bytes = 0;
844	wake_up(&ticket->wait);
845	space_info->tickets_id++;
846	if (global_rsv->reserved < global_rsv->size)
847		global_rsv->full = 0;
848	spin_unlock(&global_rsv->lock);
849
850	return true;
851}
852
853/*
854 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
855 * @fs_info - fs_info for this fs
856 * @space_info - the space info we were flushing
857 *
858 * We call this when we've exhausted our flushing ability and haven't made
859 * progress in satisfying tickets.  The reservation code handles tickets in
860 * order, so if there is a large ticket first and then smaller ones we could
861 * very well satisfy the smaller tickets.  This will attempt to wake up any
862 * tickets in the list to catch this case.
863 *
864 * This function returns true if it was able to make progress by clearing out
865 * other tickets, or if it stumbles across a ticket that was smaller than the
866 * first ticket.
867 */
868static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
869				   struct btrfs_space_info *space_info)
870{
871	struct reserve_ticket *ticket;
872	u64 tickets_id = space_info->tickets_id;
873
874	trace_btrfs_fail_all_tickets(fs_info, space_info);
875
876	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
877		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
878		__btrfs_dump_space_info(fs_info, space_info);
879	}
880
881	while (!list_empty(&space_info->tickets) &&
882	       tickets_id == space_info->tickets_id) {
883		ticket = list_first_entry(&space_info->tickets,
884					  struct reserve_ticket, list);
885
886		if (ticket->steal &&
887		    steal_from_global_rsv(fs_info, space_info, ticket))
888			return true;
889
890		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
891			btrfs_info(fs_info, "failing ticket with %llu bytes",
892				   ticket->bytes);
893
894		remove_ticket(space_info, ticket);
895		ticket->error = -ENOSPC;
896		wake_up(&ticket->wait);
897
898		/*
899		 * We're just throwing tickets away, so more flushing may not
900		 * trip over btrfs_try_granting_tickets, so we need to call it
901		 * here to see if we can make progress with the next ticket in
902		 * the list.
903		 */
904		btrfs_try_granting_tickets(fs_info, space_info);
905	}
906	return (tickets_id != space_info->tickets_id);
907}
908
909/*
910 * This is for normal flushers, we can wait all goddamned day if we want to.  We
911 * will loop and continuously try to flush as long as we are making progress.
912 * We count progress as clearing off tickets each time we have to loop.
913 */
914static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
915{
916	struct btrfs_fs_info *fs_info;
917	struct btrfs_space_info *space_info;
918	u64 to_reclaim;
919	enum btrfs_flush_state flush_state;
920	int commit_cycles = 0;
921	u64 last_tickets_id;
922
923	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
924	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
925
926	spin_lock(&space_info->lock);
927	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
928	if (!to_reclaim) {
929		space_info->flush = 0;
930		spin_unlock(&space_info->lock);
931		return;
932	}
933	last_tickets_id = space_info->tickets_id;
934	spin_unlock(&space_info->lock);
935
936	flush_state = FLUSH_DELAYED_ITEMS_NR;
937	do {
938		flush_space(fs_info, space_info, to_reclaim, flush_state, false);
939		spin_lock(&space_info->lock);
940		if (list_empty(&space_info->tickets)) {
941			space_info->flush = 0;
942			spin_unlock(&space_info->lock);
943			return;
944		}
945		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
946							      space_info);
947		if (last_tickets_id == space_info->tickets_id) {
948			flush_state++;
949		} else {
950			last_tickets_id = space_info->tickets_id;
951			flush_state = FLUSH_DELAYED_ITEMS_NR;
952			if (commit_cycles)
953				commit_cycles--;
954		}
955
956		/*
957		 * We do not want to empty the system of delalloc unless we're
958		 * under heavy pressure, so allow one trip through the flushing
959		 * logic before we start doing a FLUSH_DELALLOC_FULL.
960		 */
961		if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
962			flush_state++;
963
964		/*
965		 * We don't want to force a chunk allocation until we've tried
966		 * pretty hard to reclaim space.  Think of the case where we
967		 * freed up a bunch of space and so have a lot of pinned space
968		 * to reclaim.  We would rather use that than possibly create a
969		 * underutilized metadata chunk.  So if this is our first run
970		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
971		 * commit the transaction.  If nothing has changed the next go
972		 * around then we can force a chunk allocation.
973		 */
974		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
975			flush_state++;
976
977		if (flush_state > COMMIT_TRANS) {
978			commit_cycles++;
979			if (commit_cycles > 2) {
980				if (maybe_fail_all_tickets(fs_info, space_info)) {
981					flush_state = FLUSH_DELAYED_ITEMS_NR;
982					commit_cycles--;
983				} else {
984					space_info->flush = 0;
985				}
986			} else {
987				flush_state = FLUSH_DELAYED_ITEMS_NR;
988			}
989		}
990		spin_unlock(&space_info->lock);
991	} while (flush_state <= COMMIT_TRANS);
992}
993
994/*
995 * This handles pre-flushing of metadata space before we get to the point that
996 * we need to start blocking threads on tickets.  The logic here is different
997 * from the other flush paths because it doesn't rely on tickets to tell us how
998 * much we need to flush, instead it attempts to keep us below the 80% full
999 * watermark of space by flushing whichever reservation pool is currently the
1000 * largest.
1001 */
1002static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
1003{
1004	struct btrfs_fs_info *fs_info;
1005	struct btrfs_space_info *space_info;
1006	struct btrfs_block_rsv *delayed_block_rsv;
1007	struct btrfs_block_rsv *delayed_refs_rsv;
1008	struct btrfs_block_rsv *global_rsv;
1009	struct btrfs_block_rsv *trans_rsv;
1010	int loops = 0;
1011
1012	fs_info = container_of(work, struct btrfs_fs_info,
1013			       preempt_reclaim_work);
1014	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
1015	delayed_block_rsv = &fs_info->delayed_block_rsv;
1016	delayed_refs_rsv = &fs_info->delayed_refs_rsv;
1017	global_rsv = &fs_info->global_block_rsv;
1018	trans_rsv = &fs_info->trans_block_rsv;
1019
1020	spin_lock(&space_info->lock);
1021	while (need_preemptive_reclaim(fs_info, space_info)) {
1022		enum btrfs_flush_state flush;
1023		u64 delalloc_size = 0;
1024		u64 to_reclaim, block_rsv_size;
1025		u64 global_rsv_size = global_rsv->reserved;
1026
1027		loops++;
1028
1029		/*
1030		 * We don't have a precise counter for the metadata being
1031		 * reserved for delalloc, so we'll approximate it by subtracting
1032		 * out the block rsv's space from the bytes_may_use.  If that
1033		 * amount is higher than the individual reserves, then we can
1034		 * assume it's tied up in delalloc reservations.
1035		 */
1036		block_rsv_size = global_rsv_size +
1037			delayed_block_rsv->reserved +
1038			delayed_refs_rsv->reserved +
1039			trans_rsv->reserved;
1040		if (block_rsv_size < space_info->bytes_may_use)
1041			delalloc_size = space_info->bytes_may_use - block_rsv_size;
1042		spin_unlock(&space_info->lock);
1043
1044		/*
1045		 * We don't want to include the global_rsv in our calculation,
1046		 * because that's space we can't touch.  Subtract it from the
1047		 * block_rsv_size for the next checks.
1048		 */
1049		block_rsv_size -= global_rsv_size;
1050
1051		/*
1052		 * We really want to avoid flushing delalloc too much, as it
1053		 * could result in poor allocation patterns, so only flush it if
1054		 * it's larger than the rest of the pools combined.
1055		 */
1056		if (delalloc_size > block_rsv_size) {
1057			to_reclaim = delalloc_size;
1058			flush = FLUSH_DELALLOC;
1059		} else if (space_info->bytes_pinned >
1060			   (delayed_block_rsv->reserved +
1061			    delayed_refs_rsv->reserved)) {
1062			to_reclaim = space_info->bytes_pinned;
1063			flush = COMMIT_TRANS;
1064		} else if (delayed_block_rsv->reserved >
1065			   delayed_refs_rsv->reserved) {
1066			to_reclaim = delayed_block_rsv->reserved;
1067			flush = FLUSH_DELAYED_ITEMS_NR;
1068		} else {
1069			to_reclaim = delayed_refs_rsv->reserved;
1070			flush = FLUSH_DELAYED_REFS_NR;
1071		}
1072
1073		/*
1074		 * We don't want to reclaim everything, just a portion, so scale
1075		 * down the to_reclaim by 1/4.  If it takes us down to 0,
1076		 * reclaim 1 items worth.
1077		 */
1078		to_reclaim >>= 2;
1079		if (!to_reclaim)
1080			to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
1081		flush_space(fs_info, space_info, to_reclaim, flush, true);
1082		cond_resched();
1083		spin_lock(&space_info->lock);
1084	}
1085
1086	/* We only went through once, back off our clamping. */
1087	if (loops == 1 && !space_info->reclaim_size)
1088		space_info->clamp = max(1, space_info->clamp - 1);
1089	trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
1090	spin_unlock(&space_info->lock);
1091}
1092
1093/*
1094 * FLUSH_DELALLOC_WAIT:
1095 *   Space is freed from flushing delalloc in one of two ways.
1096 *
1097 *   1) compression is on and we allocate less space than we reserved
1098 *   2) we are overwriting existing space
1099 *
1100 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
1101 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
1102 *   length to ->bytes_reserved, and subtracts the reserved space from
1103 *   ->bytes_may_use.
1104 *
1105 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
1106 *   extent in the range we are overwriting, which creates a delayed ref for
1107 *   that freed extent.  This however is not reclaimed until the transaction
1108 *   commits, thus the next stages.
1109 *
1110 * RUN_DELAYED_IPUTS
1111 *   If we are freeing inodes, we want to make sure all delayed iputs have
1112 *   completed, because they could have been on an inode with i_nlink == 0, and
1113 *   thus have been truncated and freed up space.  But again this space is not
1114 *   immediately re-usable, it comes in the form of a delayed ref, which must be
1115 *   run and then the transaction must be committed.
1116 *
1117 * COMMIT_TRANS
1118 *   This is where we reclaim all of the pinned space generated by running the
1119 *   iputs
1120 *
1121 * ALLOC_CHUNK_FORCE
1122 *   For data we start with alloc chunk force, however we could have been full
1123 *   before, and then the transaction commit could have freed new block groups,
1124 *   so if we now have space to allocate do the force chunk allocation.
1125 */
1126static const enum btrfs_flush_state data_flush_states[] = {
1127	FLUSH_DELALLOC_FULL,
1128	RUN_DELAYED_IPUTS,
1129	COMMIT_TRANS,
1130	ALLOC_CHUNK_FORCE,
1131};
1132
1133static void btrfs_async_reclaim_data_space(struct work_struct *work)
1134{
1135	struct btrfs_fs_info *fs_info;
1136	struct btrfs_space_info *space_info;
1137	u64 last_tickets_id;
1138	enum btrfs_flush_state flush_state = 0;
1139
1140	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
1141	space_info = fs_info->data_sinfo;
1142
1143	spin_lock(&space_info->lock);
1144	if (list_empty(&space_info->tickets)) {
1145		space_info->flush = 0;
1146		spin_unlock(&space_info->lock);
1147		return;
1148	}
1149	last_tickets_id = space_info->tickets_id;
1150	spin_unlock(&space_info->lock);
1151
1152	while (!space_info->full) {
1153		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
1154		spin_lock(&space_info->lock);
1155		if (list_empty(&space_info->tickets)) {
1156			space_info->flush = 0;
1157			spin_unlock(&space_info->lock);
1158			return;
1159		}
1160		last_tickets_id = space_info->tickets_id;
1161		spin_unlock(&space_info->lock);
1162	}
1163
1164	while (flush_state < ARRAY_SIZE(data_flush_states)) {
1165		flush_space(fs_info, space_info, U64_MAX,
1166			    data_flush_states[flush_state], false);
1167		spin_lock(&space_info->lock);
1168		if (list_empty(&space_info->tickets)) {
1169			space_info->flush = 0;
1170			spin_unlock(&space_info->lock);
1171			return;
1172		}
1173
1174		if (last_tickets_id == space_info->tickets_id) {
1175			flush_state++;
1176		} else {
1177			last_tickets_id = space_info->tickets_id;
1178			flush_state = 0;
1179		}
1180
1181		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
1182			if (space_info->full) {
1183				if (maybe_fail_all_tickets(fs_info, space_info))
1184					flush_state = 0;
1185				else
1186					space_info->flush = 0;
1187			} else {
1188				flush_state = 0;
1189			}
1190		}
1191		spin_unlock(&space_info->lock);
1192	}
1193}
1194
1195void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
1196{
1197	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
1198	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
1199	INIT_WORK(&fs_info->preempt_reclaim_work,
1200		  btrfs_preempt_reclaim_metadata_space);
1201}
1202
1203static const enum btrfs_flush_state priority_flush_states[] = {
1204	FLUSH_DELAYED_ITEMS_NR,
1205	FLUSH_DELAYED_ITEMS,
1206	ALLOC_CHUNK,
1207};
1208
1209static const enum btrfs_flush_state evict_flush_states[] = {
1210	FLUSH_DELAYED_ITEMS_NR,
1211	FLUSH_DELAYED_ITEMS,
1212	FLUSH_DELAYED_REFS_NR,
1213	FLUSH_DELAYED_REFS,
1214	FLUSH_DELALLOC,
1215	FLUSH_DELALLOC_WAIT,
1216	FLUSH_DELALLOC_FULL,
1217	ALLOC_CHUNK,
1218	COMMIT_TRANS,
1219};
1220
1221static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
1222				struct btrfs_space_info *space_info,
1223				struct reserve_ticket *ticket,
1224				const enum btrfs_flush_state *states,
1225				int states_nr)
1226{
1227	u64 to_reclaim;
1228	int flush_state;
1229
1230	spin_lock(&space_info->lock);
1231	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
1232	if (!to_reclaim) {
1233		spin_unlock(&space_info->lock);
1234		return;
1235	}
1236	spin_unlock(&space_info->lock);
1237
1238	flush_state = 0;
1239	do {
1240		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
1241			    false);
1242		flush_state++;
1243		spin_lock(&space_info->lock);
1244		if (ticket->bytes == 0) {
1245			spin_unlock(&space_info->lock);
1246			return;
1247		}
1248		spin_unlock(&space_info->lock);
1249	} while (flush_state < states_nr);
1250}
1251
1252static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
1253					struct btrfs_space_info *space_info,
1254					struct reserve_ticket *ticket)
1255{
1256	while (!space_info->full) {
1257		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
1258		spin_lock(&space_info->lock);
1259		if (ticket->bytes == 0) {
1260			spin_unlock(&space_info->lock);
1261			return;
1262		}
1263		spin_unlock(&space_info->lock);
1264	}
1265}
1266
1267static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
1268				struct btrfs_space_info *space_info,
1269				struct reserve_ticket *ticket)
1270
1271{
1272	DEFINE_WAIT(wait);
1273	int ret = 0;
1274
1275	spin_lock(&space_info->lock);
1276	while (ticket->bytes > 0 && ticket->error == 0) {
1277		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
1278		if (ret) {
1279			/*
1280			 * Delete us from the list. After we unlock the space
1281			 * info, we don't want the async reclaim job to reserve
1282			 * space for this ticket. If that would happen, then the
1283			 * ticket's task would not known that space was reserved
1284			 * despite getting an error, resulting in a space leak
1285			 * (bytes_may_use counter of our space_info).
1286			 */
1287			remove_ticket(space_info, ticket);
1288			ticket->error = -EINTR;
1289			break;
1290		}
1291		spin_unlock(&space_info->lock);
1292
1293		schedule();
1294
1295		finish_wait(&ticket->wait, &wait);
1296		spin_lock(&space_info->lock);
1297	}
1298	spin_unlock(&space_info->lock);
1299}
1300
1301/**
1302 * Do the appropriate flushing and waiting for a ticket
1303 *
1304 * @fs_info:    the filesystem
1305 * @space_info: space info for the reservation
1306 * @ticket:     ticket for the reservation
1307 * @start_ns:   timestamp when the reservation started
1308 * @orig_bytes: amount of bytes originally reserved
1309 * @flush:      how much we can flush
1310 *
1311 * This does the work of figuring out how to flush for the ticket, waiting for
1312 * the reservation, and returning the appropriate error if there is one.
1313 */
1314static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
1315				 struct btrfs_space_info *space_info,
1316				 struct reserve_ticket *ticket,
1317				 u64 start_ns, u64 orig_bytes,
1318				 enum btrfs_reserve_flush_enum flush)
1319{
1320	int ret;
1321
1322	switch (flush) {
1323	case BTRFS_RESERVE_FLUSH_DATA:
1324	case BTRFS_RESERVE_FLUSH_ALL:
1325	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
1326		wait_reserve_ticket(fs_info, space_info, ticket);
1327		break;
1328	case BTRFS_RESERVE_FLUSH_LIMIT:
1329		priority_reclaim_metadata_space(fs_info, space_info, ticket,
1330						priority_flush_states,
1331						ARRAY_SIZE(priority_flush_states));
1332		break;
1333	case BTRFS_RESERVE_FLUSH_EVICT:
1334		priority_reclaim_metadata_space(fs_info, space_info, ticket,
1335						evict_flush_states,
1336						ARRAY_SIZE(evict_flush_states));
1337		break;
1338	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
1339		priority_reclaim_data_space(fs_info, space_info, ticket);
1340		break;
1341	default:
1342		ASSERT(0);
1343		break;
1344	}
1345
1346	spin_lock(&space_info->lock);
1347	ret = ticket->error;
1348	if (ticket->bytes || ticket->error) {
1349		/*
1350		 * We were a priority ticket, so we need to delete ourselves
1351		 * from the list.  Because we could have other priority tickets
1352		 * behind us that require less space, run
1353		 * btrfs_try_granting_tickets() to see if their reservations can
1354		 * now be made.
1355		 */
1356		if (!list_empty(&ticket->list)) {
1357			remove_ticket(space_info, ticket);
1358			btrfs_try_granting_tickets(fs_info, space_info);
1359		}
1360
1361		if (!ret)
1362			ret = -ENOSPC;
1363	}
1364	spin_unlock(&space_info->lock);
1365	ASSERT(list_empty(&ticket->list));
1366	/*
1367	 * Check that we can't have an error set if the reservation succeeded,
1368	 * as that would confuse tasks and lead them to error out without
1369	 * releasing reserved space (if an error happens the expectation is that
1370	 * space wasn't reserved at all).
1371	 */
1372	ASSERT(!(ticket->bytes == 0 && ticket->error));
1373	trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
1374				   start_ns, flush, ticket->error);
1375	return ret;
1376}
1377
1378/*
1379 * This returns true if this flush state will go through the ordinary flushing
1380 * code.
1381 */
1382static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
1383{
1384	return	(flush == BTRFS_RESERVE_FLUSH_ALL) ||
1385		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
1386}
1387
1388static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
1389				       struct btrfs_space_info *space_info)
1390{
1391	u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
1392	u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
1393
1394	/*
1395	 * If we're heavy on ordered operations then clamping won't help us.  We
1396	 * need to clamp specifically to keep up with dirty'ing buffered
1397	 * writers, because there's not a 1:1 correlation of writing delalloc
1398	 * and freeing space, like there is with flushing delayed refs or
1399	 * delayed nodes.  If we're already more ordered than delalloc then
1400	 * we're keeping up, otherwise we aren't and should probably clamp.
1401	 */
1402	if (ordered < delalloc)
1403		space_info->clamp = min(space_info->clamp + 1, 8);
1404}
1405
1406/**
1407 * Try to reserve bytes from the block_rsv's space
1408 *
1409 * @fs_info:    the filesystem
1410 * @space_info: space info we want to allocate from
1411 * @orig_bytes: number of bytes we want
1412 * @flush:      whether or not we can flush to make our reservation
1413 *
1414 * This will reserve orig_bytes number of bytes from the space info associated
1415 * with the block_rsv.  If there is not enough space it will make an attempt to
1416 * flush out space to make room.  It will do this by flushing delalloc if
1417 * possible or committing the transaction.  If flush is 0 then no attempts to
1418 * regain reservations will be made and this will fail if there is not enough
1419 * space already.
1420 */
1421static int __reserve_bytes(struct btrfs_fs_info *fs_info,
1422			   struct btrfs_space_info *space_info, u64 orig_bytes,
1423			   enum btrfs_reserve_flush_enum flush)
1424{
1425	struct work_struct *async_work;
1426	struct reserve_ticket ticket;
1427	u64 start_ns = 0;
1428	u64 used;
1429	int ret = 0;
1430	bool pending_tickets;
1431
1432	ASSERT(orig_bytes);
1433	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
1434
1435	if (flush == BTRFS_RESERVE_FLUSH_DATA)
1436		async_work = &fs_info->async_data_reclaim_work;
1437	else
1438		async_work = &fs_info->async_reclaim_work;
1439
1440	spin_lock(&space_info->lock);
1441	ret = -ENOSPC;
1442	used = btrfs_space_info_used(space_info, true);
1443
1444	/*
1445	 * We don't want NO_FLUSH allocations to jump everybody, they can
1446	 * generally handle ENOSPC in a different way, so treat them the same as
1447	 * normal flushers when it comes to skipping pending tickets.
1448	 */
1449	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
1450		pending_tickets = !list_empty(&space_info->tickets) ||
1451			!list_empty(&space_info->priority_tickets);
1452	else
1453		pending_tickets = !list_empty(&space_info->priority_tickets);
1454
1455	/*
1456	 * Carry on if we have enough space (short-circuit) OR call
1457	 * can_overcommit() to ensure we can overcommit to continue.
1458	 */
1459	if (!pending_tickets &&
1460	    ((used + orig_bytes <= space_info->total_bytes) ||
1461	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
1462		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
1463						      orig_bytes);
1464		ret = 0;
1465	}
1466
1467	/*
1468	 * If we couldn't make a reservation then setup our reservation ticket
1469	 * and kick the async worker if it's not already running.
1470	 *
1471	 * If we are a priority flusher then we just need to add our ticket to
1472	 * the list and we will do our own flushing further down.
1473	 */
1474	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
1475		ticket.bytes = orig_bytes;
1476		ticket.error = 0;
1477		space_info->reclaim_size += ticket.bytes;
1478		init_waitqueue_head(&ticket.wait);
1479		ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
1480		if (trace_btrfs_reserve_ticket_enabled())
1481			start_ns = ktime_get_ns();
1482
1483		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
1484		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
1485		    flush == BTRFS_RESERVE_FLUSH_DATA) {
1486			list_add_tail(&ticket.list, &space_info->tickets);
1487			if (!space_info->flush) {
1488				/*
1489				 * We were forced to add a reserve ticket, so
1490				 * our preemptive flushing is unable to keep
1491				 * up.  Clamp down on the threshold for the
1492				 * preemptive flushing in order to keep up with
1493				 * the workload.
1494				 */
1495				maybe_clamp_preempt(fs_info, space_info);
1496
1497				space_info->flush = 1;
1498				trace_btrfs_trigger_flush(fs_info,
1499							  space_info->flags,
1500							  orig_bytes, flush,
1501							  "enospc");
1502				queue_work(system_unbound_wq, async_work);
1503			}
1504		} else {
1505			list_add_tail(&ticket.list,
1506				      &space_info->priority_tickets);
1507		}
1508	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1509		used += orig_bytes;
1510		/*
1511		 * We will do the space reservation dance during log replay,
1512		 * which means we won't have fs_info->fs_root set, so don't do
1513		 * the async reclaim as we will panic.
1514		 */
1515		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1516		    !work_busy(&fs_info->preempt_reclaim_work) &&
1517		    need_preemptive_reclaim(fs_info, space_info)) {
1518			trace_btrfs_trigger_flush(fs_info, space_info->flags,
1519						  orig_bytes, flush, "preempt");
1520			queue_work(system_unbound_wq,
1521				   &fs_info->preempt_reclaim_work);
1522		}
1523	}
1524	spin_unlock(&space_info->lock);
1525	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1526		return ret;
1527
1528	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
1529				     orig_bytes, flush);
1530}
1531
1532/**
1533 * Trye to reserve metadata bytes from the block_rsv's space
1534 *
1535 * @root:       the root we're allocating for
1536 * @block_rsv:  block_rsv we're allocating for
1537 * @orig_bytes: number of bytes we want
1538 * @flush:      whether or not we can flush to make our reservation
1539 *
1540 * This will reserve orig_bytes number of bytes from the space info associated
1541 * with the block_rsv.  If there is not enough space it will make an attempt to
1542 * flush out space to make room.  It will do this by flushing delalloc if
1543 * possible or committing the transaction.  If flush is 0 then no attempts to
1544 * regain reservations will be made and this will fail if there is not enough
1545 * space already.
1546 */
1547int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1548				 struct btrfs_block_rsv *block_rsv,
1549				 u64 orig_bytes,
1550				 enum btrfs_reserve_flush_enum flush)
1551{
1552	struct btrfs_fs_info *fs_info = root->fs_info;
1553	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1554	int ret;
1555
1556	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
1557	if (ret == -ENOSPC &&
1558	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1559		if (block_rsv != global_rsv &&
1560		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1561			ret = 0;
1562	}
1563	if (ret == -ENOSPC) {
1564		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1565					      block_rsv->space_info->flags,
1566					      orig_bytes, 1);
1567
1568		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1569			btrfs_dump_space_info(fs_info, block_rsv->space_info,
1570					      orig_bytes, 0);
1571	}
1572	return ret;
1573}
1574
1575/**
1576 * Try to reserve data bytes for an allocation
1577 *
1578 * @fs_info: the filesystem
1579 * @bytes:   number of bytes we need
1580 * @flush:   how we are allowed to flush
1581 *
1582 * This will reserve bytes from the data space info.  If there is not enough
1583 * space then we will attempt to flush space as specified by flush.
1584 */
1585int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
1586			     enum btrfs_reserve_flush_enum flush)
1587{
1588	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
1589	int ret;
1590
1591	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
1592	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
1593	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
1594
1595	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
1596	if (ret == -ENOSPC) {
1597		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1598					      data_sinfo->flags, bytes, 1);
1599		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1600			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
1601	}
1602	return ret;
1603}
1604