space-info.c revision fcdef39c
1// SPDX-License-Identifier: GPL-2.0
2
3#include "misc.h"
4#include "ctree.h"
5#include "space-info.h"
6#include "sysfs.h"
7#include "volumes.h"
8#include "free-space-cache.h"
9#include "ordered-data.h"
10#include "transaction.h"
11#include "block-group.h"
12
13/*
14 * HOW DOES SPACE RESERVATION WORK
15 *
16 * If you want to know about delalloc specifically, there is a separate comment
17 * for that with the delalloc code.  This comment is about how the whole system
18 * works generally.
19 *
20 * BASIC CONCEPTS
21 *
22 *   1) space_info.  This is the ultimate arbiter of how much space we can use.
23 *   There's a description of the bytes_ fields with the struct declaration,
24 *   refer to that for specifics on each field.  Suffice it to say that for
25 *   reservations we care about total_bytes - SUM(space_info->bytes_) when
26 *   determining if there is space to make an allocation.  There is a space_info
27 *   for METADATA, SYSTEM, and DATA areas.
28 *
29 *   2) block_rsv's.  These are basically buckets for every different type of
30 *   metadata reservation we have.  You can see the comment in the block_rsv
31 *   code on the rules for each type, but generally block_rsv->reserved is how
32 *   much space is accounted for in space_info->bytes_may_use.
33 *
 *   3) btrfs_calc*_size.  These are the worst case calculations we use based
35 *   on the number of items we will want to modify.  We have one for changing
36 *   items, and one for inserting new items.  Generally we use these helpers to
37 *   determine the size of the block reserves, and then use the actual bytes
38 *   values to adjust the space_info counters.
39 *
40 * MAKING RESERVATIONS, THE NORMAL CASE
41 *
42 *   We call into either btrfs_reserve_data_bytes() or
43 *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
44 *   num_bytes we want to reserve.
45 *
46 *   ->reserve
 *     space_info->bytes_may_use += num_bytes
48 *
49 *   ->extent allocation
50 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_use -= num_bytes
52 *     space_info->bytes_reserved += extent_bytes
53 *
54 *   ->insert reference
55 *     Call btrfs_update_block_group() which does
56 *     space_info->bytes_reserved -= extent_bytes
57 *     space_info->bytes_used += extent_bytes
58 *
59 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
60 *
61 *   Assume we are unable to simply make the reservation because we do not have
62 *   enough space
63 *
64 *   -> __reserve_bytes
65 *     create a reserve_ticket with ->bytes set to our reservation, add it to
66 *     the tail of space_info->tickets, kick async flush thread
67 *
68 *   ->handle_reserve_ticket
69 *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
70 *     on the ticket.
71 *
72 *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
73 *     Flushes various things attempting to free up space.
74 *
75 *   -> btrfs_try_granting_tickets()
76 *     This is called by anything that either subtracts space from
77 *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
78 *     space_info->total_bytes.  This loops through the ->priority_tickets and
79 *     then the ->tickets list checking to see if the reservation can be
80 *     completed.  If it can the space is added to space_info->bytes_may_use and
81 *     the ticket is woken up.
82 *
83 *   -> ticket wakeup
 *     Check if ->bytes == 0; if so we got our reservation and we can carry
85 *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
86 *     were interrupted.)
87 *
88 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
89 *
90 *   Same as the above, except we add ourselves to the
91 *   space_info->priority_tickets, and we do not use ticket->wait, we simply
92 *   call flush_space() ourselves for the states that are safe for us to call
93 *   without deadlocking and hope for the best.
94 *
95 * THE FLUSHING STATES
96 *
97 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and an "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking overhead on the various trees, and even to keep from
100 *   doing any work at all in the case of delayed refs.  Each of these delayed
101 *   things however hold reservations, and so letting them run allows us to
102 *   reclaim space so we can make new reservations.
103 *
104 *   FLUSH_DELAYED_ITEMS
105 *     Every inode has a delayed item to update the inode.  Take a simple write
106 *     for example, we would update the inode item at write time to update the
107 *     mtime, and then again at finish_ordered_io() time in order to update the
108 *     isize or bytes.  We keep these delayed items to coalesce these operations
109 *     into a single operation done on demand.  These are an easy way to reclaim
110 *     metadata space.
111 *
112 *   FLUSH_DELALLOC
113 *     Look at the delalloc comment to get an idea of how much space is reserved
114 *     for delayed allocation.  We can reclaim some of this space simply by
115 *     running delalloc, but usually we need to wait for ordered extents to
116 *     reclaim the bulk of this space.
117 *
118 *   FLUSH_DELAYED_REFS
119 *     We have a block reserve for the outstanding delayed refs space, and every
120 *     delayed ref operation holds a reservation.  Running these is a quick way
121 *     to reclaim space, but we want to hold this until the end because COW can
122 *     churn a lot and we can avoid making some extent tree modifications if we
123 *     are able to delay for as long as possible.
124 *
125 *   ALLOC_CHUNK
126 *     We will skip this the first time through space reservation, because of
127 *     overcommit and we don't want to have a lot of useless metadata space when
128 *     our worst case reservations will likely never come true.
129 *
130 *   RUN_DELAYED_IPUTS
131 *     If we're freeing inodes we're likely freeing checksums, file extent
132 *     items, and extent tree items.  Loads of space could be freed up by these
133 *     operations, however they won't be usable until the transaction commits.
134 *
135 *   COMMIT_TRANS
136 *     This will commit the transaction.  Historically we had a lot of logic
 *     surrounding whether or not we'd commit the transaction, but this was born
138 *     out of a pre-tickets era where we could end up committing the transaction
139 *     thousands of times in a row without making progress.  Now thanks to our
140 *     ticketing system we know if we're not making progress and can error
141 *     everybody out after a few commits rather than burning the disk hoping for
142 *     a different answer.
143 *
144 * OVERCOMMIT
145 *
146 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
148 *   metadata space.  This only happens with metadata, data does not allow
149 *   overcommitting.
150 *
151 *   You can see the current logic for when we allow overcommit in
152 *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
153 *   is no unallocated space to be had, all reservations are kept within the
154 *   free space in the allocated metadata chunks.
155 *
156 *   Because of overcommitting, you generally want to use the
157 *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
158 *   thing with or without extra unallocated space.
159 */
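
/*
 * A condensed, illustrative sketch of the normal case above (simplified
 * pseudocode, not code that is built as part of this file; the helper names
 * are the real ones, everything else is hypothetical):
 *
 *	// Worst case size for inserting nr_items tree items.
 *	num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr_items);
 *
 *	// Reserve: on success space_info->bytes_may_use += num_bytes.
 *	btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 *
 *	// Extent allocation: btrfs_add_reserved_bytes() moves the actual
 *	// extent size from bytes_may_use to bytes_reserved.
 *
 *	// Reference insertion: btrfs_update_block_group() moves it from
 *	// bytes_reserved to bytes_used.
 */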
160
161u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
162			  bool may_use_included)
163{
164	ASSERT(s_info);
165	return s_info->bytes_used + s_info->bytes_reserved +
166		s_info->bytes_pinned + s_info->bytes_readonly +
167		s_info->bytes_zone_unusable +
168		(may_use_included ? s_info->bytes_may_use : 0);
169}
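
/*
 * Illustrative note: the space still available for new reservations is
 * normally derived from this helper as
 *
 *	space_info->total_bytes - btrfs_space_info_used(space_info, true)
 *
 * which is the "total_bytes - SUM(space_info->bytes_)" check described at the
 * top of this file (see __btrfs_dump_space_info() for one such user).
 */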
170
171/*
172 * after adding space to the filesystem, we need to clear the full flags
173 * on all the space infos.
174 */
175void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
176{
177	struct list_head *head = &info->space_info;
178	struct btrfs_space_info *found;
179
180	list_for_each_entry(found, head, list)
181		found->full = 0;
182}
183
184static int create_space_info(struct btrfs_fs_info *info, u64 flags)
185{
187	struct btrfs_space_info *space_info;
188	int i;
189	int ret;
190
191	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
192	if (!space_info)
193		return -ENOMEM;
194
195	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
196		INIT_LIST_HEAD(&space_info->block_groups[i]);
197	init_rwsem(&space_info->groups_sem);
198	spin_lock_init(&space_info->lock);
199	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
200	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
201	INIT_LIST_HEAD(&space_info->ro_bgs);
202	INIT_LIST_HEAD(&space_info->tickets);
203	INIT_LIST_HEAD(&space_info->priority_tickets);
204	space_info->clamp = 1;
205
206	ret = btrfs_sysfs_add_space_info_type(info, space_info);
207	if (ret)
208		return ret;
209
210	list_add(&space_info->list, &info->space_info);
211	if (flags & BTRFS_BLOCK_GROUP_DATA)
212		info->data_sinfo = space_info;
213
214	return ret;
215}
216
217int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
218{
219	struct btrfs_super_block *disk_super;
220	u64 features;
221	u64 flags;
222	int mixed = 0;
223	int ret;
224
225	disk_super = fs_info->super_copy;
226	if (!btrfs_super_root(disk_super))
227		return -EINVAL;
228
229	features = btrfs_super_incompat_flags(disk_super);
230	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
231		mixed = 1;
232
233	flags = BTRFS_BLOCK_GROUP_SYSTEM;
234	ret = create_space_info(fs_info, flags);
235	if (ret)
236		goto out;
237
238	if (mixed) {
239		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
240		ret = create_space_info(fs_info, flags);
241	} else {
242		flags = BTRFS_BLOCK_GROUP_METADATA;
243		ret = create_space_info(fs_info, flags);
244		if (ret)
245			goto out;
246
247		flags = BTRFS_BLOCK_GROUP_DATA;
248		ret = create_space_info(fs_info, flags);
249	}
250out:
251	return ret;
252}
253
254void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
255			     u64 total_bytes, u64 bytes_used,
256			     u64 bytes_readonly, u64 bytes_zone_unusable,
257			     struct btrfs_space_info **space_info)
258{
259	struct btrfs_space_info *found;
260	int factor;
261
262	factor = btrfs_bg_type_to_factor(flags);
263
264	found = btrfs_find_space_info(info, flags);
265	ASSERT(found);
266	spin_lock(&found->lock);
267	found->total_bytes += total_bytes;
268	found->disk_total += total_bytes * factor;
269	found->bytes_used += bytes_used;
270	found->disk_used += bytes_used * factor;
271	found->bytes_readonly += bytes_readonly;
272	found->bytes_zone_unusable += bytes_zone_unusable;
273	if (total_bytes > 0)
274		found->full = 0;
275	btrfs_try_granting_tickets(info, found);
276	spin_unlock(&found->lock);
277	*space_info = found;
278}
279
280struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
281					       u64 flags)
282{
283	struct list_head *head = &info->space_info;
284	struct btrfs_space_info *found;
285
286	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
287
288	list_for_each_entry(found, head, list) {
289		if (found->flags & flags)
290			return found;
291	}
292	return NULL;
293}
294
295static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
296			  struct btrfs_space_info *space_info,
297			  enum btrfs_reserve_flush_enum flush)
298{
299	u64 profile;
300	u64 avail;
301	int factor;
302
303	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
304		profile = btrfs_system_alloc_profile(fs_info);
305	else
306		profile = btrfs_metadata_alloc_profile(fs_info);
307
308	avail = atomic64_read(&fs_info->free_chunk_space);
309
310	/*
311	 * If we have dup, raid1 or raid10 then only half of the free
312	 * space is actually usable.  For raid56, the space info used
313	 * doesn't include the parity drive, so we don't have to
314	 * change the math
315	 */
316	factor = btrfs_bg_type_to_factor(profile);
317	avail = div_u64(avail, factor);
318
319	/*
320	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space. If we can flush, don't let us overcommit
322	 * too much, let it overcommit up to 1/8 of the space.
323	 */
324	if (flush == BTRFS_RESERVE_FLUSH_ALL)
325		avail >>= 3;
326	else
327		avail >>= 1;
328	return avail;
329}
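
/*
 * Worked example (illustrative numbers): with 100GiB of unallocated space and
 * a RAID1 metadata profile only 100GiB / 2 = 50GiB is usable, so this returns
 * 50GiB >> 3 = 6.25GiB of overcommit headroom for BTRFS_RESERVE_FLUSH_ALL and
 * 50GiB >> 1 = 25GiB for the other flush modes.
 */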
330
331int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
332			 struct btrfs_space_info *space_info, u64 bytes,
333			 enum btrfs_reserve_flush_enum flush)
334{
335	u64 avail;
336	u64 used;
337
338	/* Don't overcommit when in mixed mode */
339	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
340		return 0;
341
342	used = btrfs_space_info_used(space_info, true);
343	avail = calc_available_free_space(fs_info, space_info, flush);
344
345	if (used + bytes < space_info->total_bytes + avail)
346		return 1;
347	return 0;
348}
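
/*
 * Illustrative usage, mirroring the check done in btrfs_try_granting_tickets()
 * and __reserve_bytes(): a reservation is granted when either the accounted
 * bytes leave enough room or overcommit is allowed.
 *
 *	used = btrfs_space_info_used(space_info, true);
 *	if (used + bytes <= space_info->total_bytes ||
 *	    btrfs_can_overcommit(fs_info, space_info, bytes, flush))
 *		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
 *						      bytes);
 */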
349
350static void remove_ticket(struct btrfs_space_info *space_info,
351			  struct reserve_ticket *ticket)
352{
353	if (!list_empty(&ticket->list)) {
354		list_del_init(&ticket->list);
355		ASSERT(space_info->reclaim_size >= ticket->bytes);
356		space_info->reclaim_size -= ticket->bytes;
357	}
358}
359
360/*
361 * This is for space we already have accounted in space_info->bytes_may_use, so
362 * basically when we're returning space from block_rsv's.
363 */
364void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
365				struct btrfs_space_info *space_info)
366{
367	struct list_head *head;
368	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
369
370	lockdep_assert_held(&space_info->lock);
371
372	head = &space_info->priority_tickets;
373again:
374	while (!list_empty(head)) {
375		struct reserve_ticket *ticket;
376		u64 used = btrfs_space_info_used(space_info, true);
377
378		ticket = list_first_entry(head, struct reserve_ticket, list);
379
380		/* Check and see if our ticket can be satisfied now. */
381		if ((used + ticket->bytes <= space_info->total_bytes) ||
382		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
383					 flush)) {
384			btrfs_space_info_update_bytes_may_use(fs_info,
385							      space_info,
386							      ticket->bytes);
387			remove_ticket(space_info, ticket);
388			ticket->bytes = 0;
389			space_info->tickets_id++;
390			wake_up(&ticket->wait);
391		} else {
392			break;
393		}
394	}
395
396	if (head == &space_info->priority_tickets) {
397		head = &space_info->tickets;
398		flush = BTRFS_RESERVE_FLUSH_ALL;
399		goto again;
400	}
401}
402
403#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
404do {									\
405	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
406	spin_lock(&__rsv->lock);					\
407	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
408		   __rsv->size, __rsv->reserved);			\
409	spin_unlock(&__rsv->lock);					\
410} while (0)
411
412static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
413				    struct btrfs_space_info *info)
414{
415	lockdep_assert_held(&info->lock);
416
417	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
418		   info->flags,
419		   info->total_bytes - btrfs_space_info_used(info, true),
420		   info->full ? "" : "not ");
421	btrfs_info(fs_info,
422		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
423		info->total_bytes, info->bytes_used, info->bytes_pinned,
424		info->bytes_reserved, info->bytes_may_use,
425		info->bytes_readonly, info->bytes_zone_unusable);
426
427	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
428	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
429	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
430	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
431	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
433}
434
435void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
436			   struct btrfs_space_info *info, u64 bytes,
437			   int dump_block_groups)
438{
439	struct btrfs_block_group *cache;
440	int index = 0;
441
442	spin_lock(&info->lock);
443	__btrfs_dump_space_info(fs_info, info);
444	spin_unlock(&info->lock);
445
446	if (!dump_block_groups)
447		return;
448
449	down_read(&info->groups_sem);
450again:
451	list_for_each_entry(cache, &info->block_groups[index], list) {
452		spin_lock(&cache->lock);
453		btrfs_info(fs_info,
454			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
455			cache->start, cache->length, cache->used, cache->pinned,
456			cache->reserved, cache->zone_unusable,
457			cache->ro ? "[readonly]" : "");
458		spin_unlock(&cache->lock);
459		btrfs_dump_free_space(cache, bytes);
460	}
461	if (++index < BTRFS_NR_RAID_TYPES)
462		goto again;
463	up_read(&info->groups_sem);
464}
465
466static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
467					u64 to_reclaim)
468{
469	u64 bytes;
470	u64 nr;
471
472	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
473	nr = div64_u64(to_reclaim, bytes);
474	if (!nr)
475		nr = 1;
476	return nr;
477}
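
/*
 * Worked example: assuming the common 16KiB nodesize,
 * btrfs_calc_insert_metadata_size(fs_info, 1) charges
 * nodesize * BTRFS_MAX_LEVEL * 2 = 256KiB per item, so a to_reclaim of 1MiB
 * maps to 4 items here.
 */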
478
479#define EXTENT_SIZE_PER_ITEM	SZ_256K
480
481/*
482 * shrink metadata reservation for delalloc
483 */
484static void shrink_delalloc(struct btrfs_fs_info *fs_info,
485			    struct btrfs_space_info *space_info,
486			    u64 to_reclaim, bool wait_ordered,
487			    bool for_preempt)
488{
489	struct btrfs_trans_handle *trans;
490	u64 delalloc_bytes;
491	u64 ordered_bytes;
492	u64 items;
493	long time_left;
494	int loops;
495
	/* Calculate the number of pages we need to flush for space reservation. */
497	if (to_reclaim == U64_MAX) {
498		items = U64_MAX;
499	} else {
500		/*
501		 * to_reclaim is set to however much metadata we need to
502		 * reclaim, but reclaiming that much data doesn't really track
503		 * exactly, so increase the amount to reclaim by 2x in order to
504		 * make sure we're flushing enough delalloc to hopefully reclaim
505		 * some metadata reservations.
506		 */
507		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
508		to_reclaim = items * EXTENT_SIZE_PER_ITEM;
509	}
510
511	trans = (struct btrfs_trans_handle *)current->journal_info;
512
513	delalloc_bytes = percpu_counter_sum_positive(
514						&fs_info->delalloc_bytes);
515	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
516	if (delalloc_bytes == 0 && ordered_bytes == 0)
517		return;
518
519	/*
520	 * If we are doing more ordered than delalloc we need to just wait on
521	 * ordered extents, otherwise we'll waste time trying to flush delalloc
522	 * that likely won't give us the space back we need.
523	 */
524	if (ordered_bytes > delalloc_bytes && !for_preempt)
525		wait_ordered = true;
526
527	loops = 0;
528	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
529		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
530		long nr_pages = min_t(u64, temp, LONG_MAX);
531
532		btrfs_start_delalloc_roots(fs_info, nr_pages, true);
533
534		loops++;
535		if (wait_ordered && !trans) {
536			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
537		} else {
538			time_left = schedule_timeout_killable(1);
539			if (time_left)
540				break;
541		}
542
543		/*
		 * If we are flushing for preemption we just want a one-shot of
		 * delalloc flushing so we can stop flushing if we decide we
		 * don't need to anymore.
547		 */
548		if (for_preempt)
549			break;
550
551		spin_lock(&space_info->lock);
552		if (list_empty(&space_info->tickets) &&
553		    list_empty(&space_info->priority_tickets)) {
554			spin_unlock(&space_info->lock);
555			break;
556		}
557		spin_unlock(&space_info->lock);
558
559		delalloc_bytes = percpu_counter_sum_positive(
560						&fs_info->delalloc_bytes);
561		ordered_bytes = percpu_counter_sum_positive(
562						&fs_info->ordered_bytes);
563	}
564}
565
566/*
567 * Try to flush some data based on policy set by @state. This is only advisory
568 * and may fail for various reasons. The caller is supposed to examine the
569 * state of @space_info to detect the outcome.
570 */
571static void flush_space(struct btrfs_fs_info *fs_info,
572		       struct btrfs_space_info *space_info, u64 num_bytes,
573		       enum btrfs_flush_state state, bool for_preempt)
574{
575	struct btrfs_root *root = fs_info->extent_root;
576	struct btrfs_trans_handle *trans;
577	int nr;
578	int ret = 0;
579
580	switch (state) {
581	case FLUSH_DELAYED_ITEMS_NR:
582	case FLUSH_DELAYED_ITEMS:
583		if (state == FLUSH_DELAYED_ITEMS_NR)
584			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
585		else
586			nr = -1;
587
588		trans = btrfs_join_transaction(root);
589		if (IS_ERR(trans)) {
590			ret = PTR_ERR(trans);
591			break;
592		}
593		ret = btrfs_run_delayed_items_nr(trans, nr);
594		btrfs_end_transaction(trans);
595		break;
596	case FLUSH_DELALLOC:
597	case FLUSH_DELALLOC_WAIT:
598		shrink_delalloc(fs_info, space_info, num_bytes,
599				state == FLUSH_DELALLOC_WAIT, for_preempt);
600		break;
601	case FLUSH_DELAYED_REFS_NR:
602	case FLUSH_DELAYED_REFS:
603		trans = btrfs_join_transaction(root);
604		if (IS_ERR(trans)) {
605			ret = PTR_ERR(trans);
606			break;
607		}
608		if (state == FLUSH_DELAYED_REFS_NR)
609			nr = calc_reclaim_items_nr(fs_info, num_bytes);
610		else
611			nr = 0;
612		btrfs_run_delayed_refs(trans, nr);
613		btrfs_end_transaction(trans);
614		break;
615	case ALLOC_CHUNK:
616	case ALLOC_CHUNK_FORCE:
617		trans = btrfs_join_transaction(root);
618		if (IS_ERR(trans)) {
619			ret = PTR_ERR(trans);
620			break;
621		}
622		ret = btrfs_chunk_alloc(trans,
623				btrfs_get_alloc_profile(fs_info, space_info->flags),
624				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
625					CHUNK_ALLOC_FORCE);
626		btrfs_end_transaction(trans);
627		if (ret > 0 || ret == -ENOSPC)
628			ret = 0;
629		break;
630	case RUN_DELAYED_IPUTS:
631		/*
632		 * If we have pending delayed iputs then we could free up a
633		 * bunch of pinned space, so make sure we run the iputs before
634		 * we do our pinned bytes check below.
635		 */
636		btrfs_run_delayed_iputs(fs_info);
637		btrfs_wait_on_delayed_iputs(fs_info);
638		break;
639	case COMMIT_TRANS:
640		ASSERT(current->journal_info == NULL);
641		trans = btrfs_join_transaction(root);
642		if (IS_ERR(trans)) {
643			ret = PTR_ERR(trans);
644			break;
645		}
646		ret = btrfs_commit_transaction(trans);
647		break;
648	default:
649		ret = -ENOSPC;
650		break;
651	}
652
653	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
654				ret, for_preempt);
655	return;
656}
657
658static inline u64
659btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
660				 struct btrfs_space_info *space_info)
661{
662	u64 used;
663	u64 avail;
664	u64 to_reclaim = space_info->reclaim_size;
665
666	lockdep_assert_held(&space_info->lock);
667
668	avail = calc_available_free_space(fs_info, space_info,
669					  BTRFS_RESERVE_FLUSH_ALL);
670	used = btrfs_space_info_used(space_info, true);
671
672	/*
673	 * We may be flushing because suddenly we have less space than we had
674	 * before, and now we're well over-committed based on our current free
675	 * space.  If that's the case add in our overage so we make sure to put
676	 * appropriate pressure on the flushing state machine.
677	 */
678	if (space_info->total_bytes + avail < used)
679		to_reclaim += used - (space_info->total_bytes + avail);
680
681	return to_reclaim;
682}
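
/*
 * Worked example (illustrative numbers): with 256MiB of queued tickets in
 * reclaim_size, total_bytes of 8GiB, 1GiB of overcommit headroom from
 * calc_available_free_space() and 10GiB accounted in the bytes_ counters, we
 * are 10GiB - (8GiB + 1GiB) = 1GiB over-committed, so this returns
 * 256MiB + 1GiB.
 */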
683
684static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
685				    struct btrfs_space_info *space_info)
686{
687	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
688	u64 ordered, delalloc;
689	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
690	u64 used;
691
	/*
	 * If we're just plain full (98% or more of the space is taken) then
	 * async reclaim just slows us down.
	 */
693	if ((space_info->bytes_used + space_info->bytes_reserved +
694	     global_rsv_size) >= thresh)
695		return false;
696
697	/*
698	 * We have tickets queued, bail so we don't compete with the async
699	 * flushers.
700	 */
701	if (space_info->reclaim_size)
702		return false;
703
704	/*
705	 * If we have over half of the free space occupied by reservations or
706	 * pinned then we want to start flushing.
707	 *
708	 * We do not do the traditional thing here, which is to say
709	 *
710	 *   if (used >= ((total_bytes + avail) / 2))
711	 *     return 1;
712	 *
713	 * because this doesn't quite work how we want.  If we had more than 50%
714	 * of the space_info used by bytes_used and we had 0 available we'd just
715	 * constantly run the background flusher.  Instead we want it to kick in
716	 * if our reclaimable space exceeds our clamped free space.
717	 *
718	 * Our clamping range is 2^1 -> 2^8.  Practically speaking that means
719	 * the following:
720	 *
721	 * Amount of RAM        Minimum threshold       Maximum threshold
722	 *
723	 *        256GiB                     1GiB                  128GiB
724	 *        128GiB                   512MiB                   64GiB
725	 *         64GiB                   256MiB                   32GiB
726	 *         32GiB                   128MiB                   16GiB
727	 *         16GiB                    64MiB                    8GiB
728	 *
729	 * These are the range our thresholds will fall in, corresponding to how
730	 * much delalloc we need for the background flusher to kick in.
731	 */
732
733	thresh = calc_available_free_space(fs_info, space_info,
734					   BTRFS_RESERVE_FLUSH_ALL);
735	used = space_info->bytes_used + space_info->bytes_reserved +
736	       space_info->bytes_readonly + global_rsv_size;
737	if (used < space_info->total_bytes)
738		thresh += space_info->total_bytes - used;
739	thresh >>= space_info->clamp;
740
741	used = space_info->bytes_pinned;
742
743	/*
744	 * If we have more ordered bytes than delalloc bytes then we're either
745	 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
746	 * around.  Preemptive flushing is only useful in that it can free up
747	 * space before tickets need to wait for things to finish.  In the case
748	 * of ordered extents, preemptively waiting on ordered extents gets us
749	 * nothing, if our reservations are tied up in ordered extents we'll
750	 * simply have to slow down writers by forcing them to wait on ordered
751	 * extents.
752	 *
753	 * In the case that ordered is larger than delalloc, only include the
754	 * block reserves that we would actually be able to directly reclaim
755	 * from.  In this case if we're heavy on metadata operations this will
756	 * clearly be heavy enough to warrant preemptive flushing.  In the case
757	 * of heavy DIO or ordered reservations, preemptive flushing will just
758	 * waste time and cause us to slow down.
759	 *
760	 * We want to make sure we truly are maxed out on ordered however, so
761	 * cut ordered in half, and if it's still higher than delalloc then we
762	 * can keep flushing.  This is to avoid the case where we start
763	 * flushing, and now delalloc == ordered and we stop preemptively
764	 * flushing when we could still have several gigs of delalloc to flush.
765	 */
766	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
767	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
768	if (ordered >= delalloc)
769		used += fs_info->delayed_refs_rsv.reserved +
770			fs_info->delayed_block_rsv.reserved;
771	else
772		used += space_info->bytes_may_use - global_rsv_size;
773
774	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
775		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
776}
777
778static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
779				  struct btrfs_space_info *space_info,
780				  struct reserve_ticket *ticket)
781{
782	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
783	u64 min_bytes;
784
785	if (global_rsv->space_info != space_info)
786		return false;
787
788	spin_lock(&global_rsv->lock);
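	/*
	 * div_factor(size, 1) is 10% of the global rsv size: only steal if at
	 * least that much would still remain reserved after handing out this
	 * ticket's bytes.
	 */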
789	min_bytes = div_factor(global_rsv->size, 1);
790	if (global_rsv->reserved < min_bytes + ticket->bytes) {
791		spin_unlock(&global_rsv->lock);
792		return false;
793	}
794	global_rsv->reserved -= ticket->bytes;
795	remove_ticket(space_info, ticket);
796	ticket->bytes = 0;
797	wake_up(&ticket->wait);
798	space_info->tickets_id++;
799	if (global_rsv->reserved < global_rsv->size)
800		global_rsv->full = 0;
801	spin_unlock(&global_rsv->lock);
802
803	return true;
804}
805
806/*
807 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info:    fs_info for this fs
 * @space_info: the space info we were flushing
810 *
811 * We call this when we've exhausted our flushing ability and haven't made
812 * progress in satisfying tickets.  The reservation code handles tickets in
813 * order, so if there is a large ticket first and then smaller ones we could
814 * very well satisfy the smaller tickets.  This will attempt to wake up any
815 * tickets in the list to catch this case.
816 *
817 * This function returns true if it was able to make progress by clearing out
818 * other tickets, or if it stumbles across a ticket that was smaller than the
819 * first ticket.
820 */
821static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
822				   struct btrfs_space_info *space_info)
823{
824	struct reserve_ticket *ticket;
825	u64 tickets_id = space_info->tickets_id;
826
827	trace_btrfs_fail_all_tickets(fs_info, space_info);
828
829	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
830		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
831		__btrfs_dump_space_info(fs_info, space_info);
832	}
833
834	while (!list_empty(&space_info->tickets) &&
835	       tickets_id == space_info->tickets_id) {
836		ticket = list_first_entry(&space_info->tickets,
837					  struct reserve_ticket, list);
838
839		if (ticket->steal &&
840		    steal_from_global_rsv(fs_info, space_info, ticket))
841			return true;
842
843		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
844			btrfs_info(fs_info, "failing ticket with %llu bytes",
845				   ticket->bytes);
846
847		remove_ticket(space_info, ticket);
848		ticket->error = -ENOSPC;
849		wake_up(&ticket->wait);
850
851		/*
852		 * We're just throwing tickets away, so more flushing may not
853		 * trip over btrfs_try_granting_tickets, so we need to call it
854		 * here to see if we can make progress with the next ticket in
855		 * the list.
856		 */
857		btrfs_try_granting_tickets(fs_info, space_info);
858	}
859	return (tickets_id != space_info->tickets_id);
860}
861
862/*
863 * This is for normal flushers, we can wait all goddamned day if we want to.  We
864 * will loop and continuously try to flush as long as we are making progress.
865 * We count progress as clearing off tickets each time we have to loop.
866 */
867static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
868{
869	struct btrfs_fs_info *fs_info;
870	struct btrfs_space_info *space_info;
871	u64 to_reclaim;
872	enum btrfs_flush_state flush_state;
873	int commit_cycles = 0;
874	u64 last_tickets_id;
875
876	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
877	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
878
879	spin_lock(&space_info->lock);
880	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
881	if (!to_reclaim) {
882		space_info->flush = 0;
883		spin_unlock(&space_info->lock);
884		return;
885	}
886	last_tickets_id = space_info->tickets_id;
887	spin_unlock(&space_info->lock);
888
889	flush_state = FLUSH_DELAYED_ITEMS_NR;
890	do {
891		flush_space(fs_info, space_info, to_reclaim, flush_state, false);
892		spin_lock(&space_info->lock);
893		if (list_empty(&space_info->tickets)) {
894			space_info->flush = 0;
895			spin_unlock(&space_info->lock);
896			return;
897		}
898		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
899							      space_info);
900		if (last_tickets_id == space_info->tickets_id) {
901			flush_state++;
902		} else {
903			last_tickets_id = space_info->tickets_id;
904			flush_state = FLUSH_DELAYED_ITEMS_NR;
905			if (commit_cycles)
906				commit_cycles--;
907		}
908
909		/*
910		 * We don't want to force a chunk allocation until we've tried
911		 * pretty hard to reclaim space.  Think of the case where we
912		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create an
914		 * underutilized metadata chunk.  So if this is our first run
915		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
916		 * commit the transaction.  If nothing has changed the next go
917		 * around then we can force a chunk allocation.
918		 */
919		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
920			flush_state++;
921
922		if (flush_state > COMMIT_TRANS) {
923			commit_cycles++;
924			if (commit_cycles > 2) {
925				if (maybe_fail_all_tickets(fs_info, space_info)) {
926					flush_state = FLUSH_DELAYED_ITEMS_NR;
927					commit_cycles--;
928				} else {
929					space_info->flush = 0;
930				}
931			} else {
932				flush_state = FLUSH_DELAYED_ITEMS_NR;
933			}
934		}
935		spin_unlock(&space_info->lock);
936	} while (flush_state <= COMMIT_TRANS);
937}
938
939/*
940 * This handles pre-flushing of metadata space before we get to the point that
941 * we need to start blocking threads on tickets.  The logic here is different
942 * from the other flush paths because it doesn't rely on tickets to tell us how
943 * much we need to flush, instead it attempts to keep us below the 80% full
944 * watermark of space by flushing whichever reservation pool is currently the
945 * largest.
946 */
947static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
948{
949	struct btrfs_fs_info *fs_info;
950	struct btrfs_space_info *space_info;
951	struct btrfs_block_rsv *delayed_block_rsv;
952	struct btrfs_block_rsv *delayed_refs_rsv;
953	struct btrfs_block_rsv *global_rsv;
954	struct btrfs_block_rsv *trans_rsv;
955	int loops = 0;
956
957	fs_info = container_of(work, struct btrfs_fs_info,
958			       preempt_reclaim_work);
959	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
960	delayed_block_rsv = &fs_info->delayed_block_rsv;
961	delayed_refs_rsv = &fs_info->delayed_refs_rsv;
962	global_rsv = &fs_info->global_block_rsv;
963	trans_rsv = &fs_info->trans_block_rsv;
964
965	spin_lock(&space_info->lock);
966	while (need_preemptive_reclaim(fs_info, space_info)) {
967		enum btrfs_flush_state flush;
968		u64 delalloc_size = 0;
969		u64 to_reclaim, block_rsv_size;
970		u64 global_rsv_size = global_rsv->reserved;
971
972		loops++;
973
974		/*
975		 * We don't have a precise counter for the metadata being
976		 * reserved for delalloc, so we'll approximate it by subtracting
977		 * out the block rsv's space from the bytes_may_use.  If that
978		 * amount is higher than the individual reserves, then we can
979		 * assume it's tied up in delalloc reservations.
980		 */
981		block_rsv_size = global_rsv_size +
982			delayed_block_rsv->reserved +
983			delayed_refs_rsv->reserved +
984			trans_rsv->reserved;
985		if (block_rsv_size < space_info->bytes_may_use)
986			delalloc_size = space_info->bytes_may_use - block_rsv_size;
987		spin_unlock(&space_info->lock);
988
989		/*
990		 * We don't want to include the global_rsv in our calculation,
991		 * because that's space we can't touch.  Subtract it from the
992		 * block_rsv_size for the next checks.
993		 */
994		block_rsv_size -= global_rsv_size;
995
996		/*
997		 * We really want to avoid flushing delalloc too much, as it
998		 * could result in poor allocation patterns, so only flush it if
999		 * it's larger than the rest of the pools combined.
1000		 */
1001		if (delalloc_size > block_rsv_size) {
1002			to_reclaim = delalloc_size;
1003			flush = FLUSH_DELALLOC;
1004		} else if (space_info->bytes_pinned >
1005			   (delayed_block_rsv->reserved +
1006			    delayed_refs_rsv->reserved)) {
1007			to_reclaim = space_info->bytes_pinned;
1008			flush = COMMIT_TRANS;
1009		} else if (delayed_block_rsv->reserved >
1010			   delayed_refs_rsv->reserved) {
1011			to_reclaim = delayed_block_rsv->reserved;
1012			flush = FLUSH_DELAYED_ITEMS_NR;
1013		} else {
1014			to_reclaim = delayed_refs_rsv->reserved;
1015			flush = FLUSH_DELAYED_REFS_NR;
1016		}
1017
1018		/*
1019		 * We don't want to reclaim everything, just a portion, so scale
		 * to_reclaim down to a quarter of its value.  If that takes it
		 * down to 0, reclaim one item's worth.
1022		 */
1023		to_reclaim >>= 2;
1024		if (!to_reclaim)
1025			to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
1026		flush_space(fs_info, space_info, to_reclaim, flush, true);
1027		cond_resched();
1028		spin_lock(&space_info->lock);
1029	}
1030
1031	/* We only went through once, back off our clamping. */
1032	if (loops == 1 && !space_info->reclaim_size)
1033		space_info->clamp = max(1, space_info->clamp - 1);
1034	trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
1035	spin_unlock(&space_info->lock);
1036}
1037
1038/*
1039 * FLUSH_DELALLOC_WAIT:
1040 *   Space is freed from flushing delalloc in one of two ways.
1041 *
1042 *   1) compression is on and we allocate less space than we reserved
1043 *   2) we are overwriting existing space
1044 *
1045 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
1046 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
1047 *   length to ->bytes_reserved, and subtracts the reserved space from
1048 *   ->bytes_may_use.
1049 *
1050 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
1051 *   extent in the range we are overwriting, which creates a delayed ref for
1052 *   that freed extent.  This however is not reclaimed until the transaction
1053 *   commits, thus the next stages.
1054 *
1055 * RUN_DELAYED_IPUTS
1056 *   If we are freeing inodes, we want to make sure all delayed iputs have
1057 *   completed, because they could have been on an inode with i_nlink == 0, and
1058 *   thus have been truncated and freed up space.  But again this space is not
1059 *   immediately re-usable, it comes in the form of a delayed ref, which must be
1060 *   run and then the transaction must be committed.
1061 *
1062 * COMMIT_TRANS
1063 *   This is where we reclaim all of the pinned space generated by running the
1064 *   iputs
1065 *
1066 * ALLOC_CHUNK_FORCE
1067 *   For data we start with alloc chunk force, however we could have been full
1068 *   before, and then the transaction commit could have freed new block groups,
1069 *   so if we now have space to allocate do the force chunk allocation.
1070 */
1071static const enum btrfs_flush_state data_flush_states[] = {
1072	FLUSH_DELALLOC_WAIT,
1073	RUN_DELAYED_IPUTS,
1074	COMMIT_TRANS,
1075	ALLOC_CHUNK_FORCE,
1076};
1077
1078static void btrfs_async_reclaim_data_space(struct work_struct *work)
1079{
1080	struct btrfs_fs_info *fs_info;
1081	struct btrfs_space_info *space_info;
1082	u64 last_tickets_id;
1083	enum btrfs_flush_state flush_state = 0;
1084
1085	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
1086	space_info = fs_info->data_sinfo;
1087
1088	spin_lock(&space_info->lock);
1089	if (list_empty(&space_info->tickets)) {
1090		space_info->flush = 0;
1091		spin_unlock(&space_info->lock);
1092		return;
1093	}
1094	last_tickets_id = space_info->tickets_id;
1095	spin_unlock(&space_info->lock);
1096
1097	while (!space_info->full) {
1098		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
1099		spin_lock(&space_info->lock);
1100		if (list_empty(&space_info->tickets)) {
1101			space_info->flush = 0;
1102			spin_unlock(&space_info->lock);
1103			return;
1104		}
1105		last_tickets_id = space_info->tickets_id;
1106		spin_unlock(&space_info->lock);
1107	}
1108
1109	while (flush_state < ARRAY_SIZE(data_flush_states)) {
1110		flush_space(fs_info, space_info, U64_MAX,
1111			    data_flush_states[flush_state], false);
1112		spin_lock(&space_info->lock);
1113		if (list_empty(&space_info->tickets)) {
1114			space_info->flush = 0;
1115			spin_unlock(&space_info->lock);
1116			return;
1117		}
1118
1119		if (last_tickets_id == space_info->tickets_id) {
1120			flush_state++;
1121		} else {
1122			last_tickets_id = space_info->tickets_id;
1123			flush_state = 0;
1124		}
1125
1126		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
1127			if (space_info->full) {
1128				if (maybe_fail_all_tickets(fs_info, space_info))
1129					flush_state = 0;
1130				else
1131					space_info->flush = 0;
1132			} else {
1133				flush_state = 0;
1134			}
1135		}
1136		spin_unlock(&space_info->lock);
1137	}
1138}
1139
1140void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
1141{
1142	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
1143	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
1144	INIT_WORK(&fs_info->preempt_reclaim_work,
1145		  btrfs_preempt_reclaim_metadata_space);
1146}
1147
1148static const enum btrfs_flush_state priority_flush_states[] = {
1149	FLUSH_DELAYED_ITEMS_NR,
1150	FLUSH_DELAYED_ITEMS,
1151	ALLOC_CHUNK,
1152};
1153
1154static const enum btrfs_flush_state evict_flush_states[] = {
1155	FLUSH_DELAYED_ITEMS_NR,
1156	FLUSH_DELAYED_ITEMS,
1157	FLUSH_DELAYED_REFS_NR,
1158	FLUSH_DELAYED_REFS,
1159	FLUSH_DELALLOC,
1160	FLUSH_DELALLOC_WAIT,
1161	ALLOC_CHUNK,
1162	COMMIT_TRANS,
1163};
1164
1165static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
1166				struct btrfs_space_info *space_info,
1167				struct reserve_ticket *ticket,
1168				const enum btrfs_flush_state *states,
1169				int states_nr)
1170{
1171	u64 to_reclaim;
1172	int flush_state;
1173
1174	spin_lock(&space_info->lock);
1175	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
1176	if (!to_reclaim) {
1177		spin_unlock(&space_info->lock);
1178		return;
1179	}
1180	spin_unlock(&space_info->lock);
1181
1182	flush_state = 0;
1183	do {
1184		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
1185			    false);
1186		flush_state++;
1187		spin_lock(&space_info->lock);
1188		if (ticket->bytes == 0) {
1189			spin_unlock(&space_info->lock);
1190			return;
1191		}
1192		spin_unlock(&space_info->lock);
1193	} while (flush_state < states_nr);
1194}
1195
1196static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
1197					struct btrfs_space_info *space_info,
1198					struct reserve_ticket *ticket)
1199{
1200	while (!space_info->full) {
1201		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
1202		spin_lock(&space_info->lock);
1203		if (ticket->bytes == 0) {
1204			spin_unlock(&space_info->lock);
1205			return;
1206		}
1207		spin_unlock(&space_info->lock);
1208	}
1209}
1210
1211static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
1212				struct btrfs_space_info *space_info,
1213				struct reserve_ticket *ticket)
1215{
1216	DEFINE_WAIT(wait);
1217	int ret = 0;
1218
1219	spin_lock(&space_info->lock);
1220	while (ticket->bytes > 0 && ticket->error == 0) {
1221		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
1222		if (ret) {
1223			/*
1224			 * Delete us from the list. After we unlock the space
1225			 * info, we don't want the async reclaim job to reserve
1226			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
1228			 * despite getting an error, resulting in a space leak
1229			 * (bytes_may_use counter of our space_info).
1230			 */
1231			remove_ticket(space_info, ticket);
1232			ticket->error = -EINTR;
1233			break;
1234		}
1235		spin_unlock(&space_info->lock);
1236
1237		schedule();
1238
1239		finish_wait(&ticket->wait, &wait);
1240		spin_lock(&space_info->lock);
1241	}
1242	spin_unlock(&space_info->lock);
1243}
1244
1245/**
1246 * Do the appropriate flushing and waiting for a ticket
1247 *
1248 * @fs_info:    the filesystem
1249 * @space_info: space info for the reservation
1250 * @ticket:     ticket for the reservation
1251 * @start_ns:   timestamp when the reservation started
1252 * @orig_bytes: amount of bytes originally reserved
1253 * @flush:      how much we can flush
1254 *
1255 * This does the work of figuring out how to flush for the ticket, waiting for
1256 * the reservation, and returning the appropriate error if there is one.
1257 */
1258static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
1259				 struct btrfs_space_info *space_info,
1260				 struct reserve_ticket *ticket,
1261				 u64 start_ns, u64 orig_bytes,
1262				 enum btrfs_reserve_flush_enum flush)
1263{
1264	int ret;
1265
1266	switch (flush) {
1267	case BTRFS_RESERVE_FLUSH_DATA:
1268	case BTRFS_RESERVE_FLUSH_ALL:
1269	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
1270		wait_reserve_ticket(fs_info, space_info, ticket);
1271		break;
1272	case BTRFS_RESERVE_FLUSH_LIMIT:
1273		priority_reclaim_metadata_space(fs_info, space_info, ticket,
1274						priority_flush_states,
1275						ARRAY_SIZE(priority_flush_states));
1276		break;
1277	case BTRFS_RESERVE_FLUSH_EVICT:
1278		priority_reclaim_metadata_space(fs_info, space_info, ticket,
1279						evict_flush_states,
1280						ARRAY_SIZE(evict_flush_states));
1281		break;
1282	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
1283		priority_reclaim_data_space(fs_info, space_info, ticket);
1284		break;
1285	default:
1286		ASSERT(0);
1287		break;
1288	}
1289
1290	spin_lock(&space_info->lock);
1291	ret = ticket->error;
1292	if (ticket->bytes || ticket->error) {
1293		/*
1294		 * We were a priority ticket, so we need to delete ourselves
1295		 * from the list.  Because we could have other priority tickets
1296		 * behind us that require less space, run
1297		 * btrfs_try_granting_tickets() to see if their reservations can
1298		 * now be made.
1299		 */
1300		if (!list_empty(&ticket->list)) {
1301			remove_ticket(space_info, ticket);
1302			btrfs_try_granting_tickets(fs_info, space_info);
1303		}
1304
1305		if (!ret)
1306			ret = -ENOSPC;
1307	}
1308	spin_unlock(&space_info->lock);
1309	ASSERT(list_empty(&ticket->list));
1310	/*
1311	 * Check that we can't have an error set if the reservation succeeded,
1312	 * as that would confuse tasks and lead them to error out without
1313	 * releasing reserved space (if an error happens the expectation is that
1314	 * space wasn't reserved at all).
1315	 */
1316	ASSERT(!(ticket->bytes == 0 && ticket->error));
1317	trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
1318				   start_ns, flush, ticket->error);
1319	return ret;
1320}
1321
1322/*
1323 * This returns true if this flush state will go through the ordinary flushing
1324 * code.
1325 */
1326static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
1327{
1328	return	(flush == BTRFS_RESERVE_FLUSH_ALL) ||
1329		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
1330}
1331
1332static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
1333				       struct btrfs_space_info *space_info)
1334{
1335	u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
1336	u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
1337
1338	/*
1339	 * If we're heavy on ordered operations then clamping won't help us.  We
	 * need to clamp specifically to keep up with dirtying buffered
1341	 * writers, because there's not a 1:1 correlation of writing delalloc
1342	 * and freeing space, like there is with flushing delayed refs or
1343	 * delayed nodes.  If we're already more ordered than delalloc then
1344	 * we're keeping up, otherwise we aren't and should probably clamp.
1345	 */
1346	if (ordered < delalloc)
1347		space_info->clamp = min(space_info->clamp + 1, 8);
1348}
1349
1350/**
1351 * Try to reserve bytes from the block_rsv's space
1352 *
1353 * @fs_info:    the filesystem
1354 * @space_info: space info we want to allocate from
1355 * @orig_bytes: number of bytes we want
1356 * @flush:      whether or not we can flush to make our reservation
1357 *
1358 * This will reserve orig_bytes number of bytes from the space info associated
1359 * with the block_rsv.  If there is not enough space it will make an attempt to
1360 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to
1362 * regain reservations will be made and this will fail if there is not enough
1363 * space already.
1364 */
1365static int __reserve_bytes(struct btrfs_fs_info *fs_info,
1366			   struct btrfs_space_info *space_info, u64 orig_bytes,
1367			   enum btrfs_reserve_flush_enum flush)
1368{
1369	struct work_struct *async_work;
1370	struct reserve_ticket ticket;
1371	u64 start_ns = 0;
1372	u64 used;
1373	int ret = 0;
1374	bool pending_tickets;
1375
1376	ASSERT(orig_bytes);
1377	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
1378
1379	if (flush == BTRFS_RESERVE_FLUSH_DATA)
1380		async_work = &fs_info->async_data_reclaim_work;
1381	else
1382		async_work = &fs_info->async_reclaim_work;
1383
1384	spin_lock(&space_info->lock);
1385	ret = -ENOSPC;
1386	used = btrfs_space_info_used(space_info, true);
1387
1388	/*
1389	 * We don't want NO_FLUSH allocations to jump everybody, they can
1390	 * generally handle ENOSPC in a different way, so treat them the same as
1391	 * normal flushers when it comes to skipping pending tickets.
1392	 */
1393	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
1394		pending_tickets = !list_empty(&space_info->tickets) ||
1395			!list_empty(&space_info->priority_tickets);
1396	else
1397		pending_tickets = !list_empty(&space_info->priority_tickets);
1398
1399	/*
1400	 * Carry on if we have enough space (short-circuit) OR call
1401	 * can_overcommit() to ensure we can overcommit to continue.
1402	 */
1403	if (!pending_tickets &&
1404	    ((used + orig_bytes <= space_info->total_bytes) ||
1405	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
1406		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
1407						      orig_bytes);
1408		ret = 0;
1409	}
1410
1411	/*
1412	 * If we couldn't make a reservation then setup our reservation ticket
1413	 * and kick the async worker if it's not already running.
1414	 *
1415	 * If we are a priority flusher then we just need to add our ticket to
1416	 * the list and we will do our own flushing further down.
1417	 */
1418	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
1419		ticket.bytes = orig_bytes;
1420		ticket.error = 0;
1421		space_info->reclaim_size += ticket.bytes;
1422		init_waitqueue_head(&ticket.wait);
1423		ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
1424		if (trace_btrfs_reserve_ticket_enabled())
1425			start_ns = ktime_get_ns();
1426
1427		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
1428		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
1429		    flush == BTRFS_RESERVE_FLUSH_DATA) {
1430			list_add_tail(&ticket.list, &space_info->tickets);
1431			if (!space_info->flush) {
1432				/*
1433				 * We were forced to add a reserve ticket, so
1434				 * our preemptive flushing is unable to keep
1435				 * up.  Clamp down on the threshold for the
1436				 * preemptive flushing in order to keep up with
1437				 * the workload.
1438				 */
1439				maybe_clamp_preempt(fs_info, space_info);
1440
1441				space_info->flush = 1;
1442				trace_btrfs_trigger_flush(fs_info,
1443							  space_info->flags,
1444							  orig_bytes, flush,
1445							  "enospc");
1446				queue_work(system_unbound_wq, async_work);
1447			}
1448		} else {
1449			list_add_tail(&ticket.list,
1450				      &space_info->priority_tickets);
1451		}
1452	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1453		used += orig_bytes;
1454		/*
1455		 * We will do the space reservation dance during log replay,
1456		 * which means we won't have fs_info->fs_root set, so don't do
1457		 * the async reclaim as we will panic.
1458		 */
1459		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1460		    !work_busy(&fs_info->preempt_reclaim_work) &&
1461		    need_preemptive_reclaim(fs_info, space_info)) {
1462			trace_btrfs_trigger_flush(fs_info, space_info->flags,
1463						  orig_bytes, flush, "preempt");
1464			queue_work(system_unbound_wq,
1465				   &fs_info->preempt_reclaim_work);
1466		}
1467	}
1468	spin_unlock(&space_info->lock);
1469	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1470		return ret;
1471
1472	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
1473				     orig_bytes, flush);
1474}
1475
1476/**
 * Try to reserve metadata bytes from the block_rsv's space
1478 *
1479 * @root:       the root we're allocating for
1480 * @block_rsv:  block_rsv we're allocating for
1481 * @orig_bytes: number of bytes we want
1482 * @flush:      whether or not we can flush to make our reservation
1483 *
1484 * This will reserve orig_bytes number of bytes from the space info associated
1485 * with the block_rsv.  If there is not enough space it will make an attempt to
1486 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to
1488 * regain reservations will be made and this will fail if there is not enough
1489 * space already.
1490 */
1491int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1492				 struct btrfs_block_rsv *block_rsv,
1493				 u64 orig_bytes,
1494				 enum btrfs_reserve_flush_enum flush)
1495{
1496	struct btrfs_fs_info *fs_info = root->fs_info;
1497	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1498	int ret;
1499
1500	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
1501	if (ret == -ENOSPC &&
1502	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1503		if (block_rsv != global_rsv &&
1504		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1505			ret = 0;
1506	}
1507	if (ret == -ENOSPC) {
1508		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1509					      block_rsv->space_info->flags,
1510					      orig_bytes, 1);
1511
1512		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1513			btrfs_dump_space_info(fs_info, block_rsv->space_info,
1514					      orig_bytes, 0);
1515	}
1516	return ret;
1517}
1518
1519/**
1520 * Try to reserve data bytes for an allocation
1521 *
1522 * @fs_info: the filesystem
1523 * @bytes:   number of bytes we need
1524 * @flush:   how we are allowed to flush
1525 *
1526 * This will reserve bytes from the data space info.  If there is not enough
1527 * space then we will attempt to flush space as specified by flush.
1528 */
1529int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
1530			     enum btrfs_reserve_flush_enum flush)
1531{
1532	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
1533	int ret;
1534
1535	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
1536	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
1537	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
1538
1539	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
1540	if (ret == -ENOSPC) {
1541		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1542					      data_sinfo->flags, bytes, 1);
1543		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1544			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
1545	}
1546	return ret;
1547}
1548