segment.c revision 52118743
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * fs/f2fs/segment.c
4 *
5 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
6 *             http://www.samsung.com/
7 */
8#include <linux/fs.h>
9#include <linux/f2fs_fs.h>
10#include <linux/bio.h>
11#include <linux/blkdev.h>
12#include <linux/prefetch.h>
13#include <linux/kthread.h>
14#include <linux/swap.h>
15#include <linux/timer.h>
16#include <linux/freezer.h>
17#include <linux/sched/signal.h>
18
19#include "f2fs.h"
20#include "segment.h"
21#include "node.h"
22#include "gc.h"
23#include "iostat.h"
24#include <trace/events/f2fs.h>
25
26#define __reverse_ffz(x) __reverse_ffs(~(x))
27
28static struct kmem_cache *discard_entry_slab;
29static struct kmem_cache *discard_cmd_slab;
30static struct kmem_cache *sit_entry_set_slab;
31static struct kmem_cache *inmem_entry_slab;
32
33static unsigned long __reverse_ulong(unsigned char *str)
34{
35	unsigned long tmp = 0;
36	int shift = 24, idx = 0;
37
38#if BITS_PER_LONG == 64
39	shift = 56;
40#endif
41	while (shift >= 0) {
42		tmp |= (unsigned long)str[idx++] << shift;
43		shift -= BITS_PER_BYTE;
44	}
45	return tmp;
46}
47
48/*
49 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
50 * MSB and LSB are reversed in a byte by f2fs_set_bit.
51 */
52static inline unsigned long __reverse_ffs(unsigned long word)
53{
54	int num = 0;
55
56#if BITS_PER_LONG == 64
57	if ((word & 0xffffffff00000000UL) == 0)
58		num += 32;
59	else
60		word >>= 32;
61#endif
62	if ((word & 0xffff0000) == 0)
63		num += 16;
64	else
65		word >>= 16;
66
67	if ((word & 0xff00) == 0)
68		num += 8;
69	else
70		word >>= 8;
71
72	if ((word & 0xf0) == 0)
73		num += 4;
74	else
75		word >>= 4;
76
77	if ((word & 0xc) == 0)
78		num += 2;
79	else
80		word >>= 2;
81
82	if ((word & 0x2) == 0)
83		num += 1;
84	return num;
85}
86
87/*
88 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
89 * f2fs_set_bit makes MSB and LSB reversed in a byte.
90 * @size must be integral times of unsigned long.
91 * Example:
92 *                             MSB <--> LSB
93 *   f2fs_set_bit(0, bitmap) => 1000 0000
94 *   f2fs_set_bit(7, bitmap) => 0000 0001
95 */
96static unsigned long __find_rev_next_bit(const unsigned long *addr,
97			unsigned long size, unsigned long offset)
98{
99	const unsigned long *p = addr + BIT_WORD(offset);
100	unsigned long result = size;
101	unsigned long tmp;
102
103	if (offset >= size)
104		return size;
105
106	size -= (offset & ~(BITS_PER_LONG - 1));
107	offset %= BITS_PER_LONG;
108
109	while (1) {
110		if (*p == 0)
111			goto pass;
112
113		tmp = __reverse_ulong((unsigned char *)p);
114
115		tmp &= ~0UL >> offset;
116		if (size < BITS_PER_LONG)
117			tmp &= (~0UL << (BITS_PER_LONG - size));
118		if (tmp)
119			goto found;
120pass:
121		if (size <= BITS_PER_LONG)
122			break;
123		size -= BITS_PER_LONG;
124		offset = 0;
125		p++;
126	}
127	return result;
128found:
129	return result - size + __reverse_ffs(tmp);
130}
131
132static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
133			unsigned long size, unsigned long offset)
134{
135	const unsigned long *p = addr + BIT_WORD(offset);
136	unsigned long result = size;
137	unsigned long tmp;
138
139	if (offset >= size)
140		return size;
141
142	size -= (offset & ~(BITS_PER_LONG - 1));
143	offset %= BITS_PER_LONG;
144
145	while (1) {
146		if (*p == ~0UL)
147			goto pass;
148
149		tmp = __reverse_ulong((unsigned char *)p);
150
151		if (offset)
152			tmp |= ~0UL << (BITS_PER_LONG - offset);
153		if (size < BITS_PER_LONG)
154			tmp |= ~0UL >> size;
155		if (tmp != ~0UL)
156			goto found;
157pass:
158		if (size <= BITS_PER_LONG)
159			break;
160		size -= BITS_PER_LONG;
161		offset = 0;
162		p++;
163	}
164	return result;
165found:
166	return result - size + __reverse_ffz(tmp);
167}
168
169bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
170{
171	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
172	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
173	int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
174
175	if (f2fs_lfs_mode(sbi))
176		return false;
177	if (sbi->gc_mode == GC_URGENT_HIGH)
178		return true;
179	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
180		return true;
181
182	return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
183			SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
184}
185
186void f2fs_register_inmem_page(struct inode *inode, struct page *page)
187{
188	struct inmem_pages *new;
189
190	set_page_private_atomic(page);
191
192	new = f2fs_kmem_cache_alloc(inmem_entry_slab,
193					GFP_NOFS, true, NULL);
194
195	/* add atomic page indices to the list */
196	new->page = page;
197	INIT_LIST_HEAD(&new->list);
198
199	/* increase reference count with clean state */
200	get_page(page);
201	mutex_lock(&F2FS_I(inode)->inmem_lock);
202	list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages);
203	inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
204	mutex_unlock(&F2FS_I(inode)->inmem_lock);
205
206	trace_f2fs_register_inmem_page(page, INMEM);
207}
208
209static int __revoke_inmem_pages(struct inode *inode,
210				struct list_head *head, bool drop, bool recover,
211				bool trylock)
212{
213	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
214	struct inmem_pages *cur, *tmp;
215	int err = 0;
216
217	list_for_each_entry_safe(cur, tmp, head, list) {
218		struct page *page = cur->page;
219
220		if (drop)
221			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
222
223		if (trylock) {
224			/*
225			 * to avoid deadlock in between page lock and
226			 * inmem_lock.
227			 */
228			if (!trylock_page(page))
229				continue;
230		} else {
231			lock_page(page);
232		}
233
234		f2fs_wait_on_page_writeback(page, DATA, true, true);
235
236		if (recover) {
237			struct dnode_of_data dn;
238			struct node_info ni;
239
240			trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
241retry:
242			set_new_dnode(&dn, inode, NULL, NULL, 0);
243			err = f2fs_get_dnode_of_data(&dn, page->index,
244								LOOKUP_NODE);
245			if (err) {
246				if (err == -ENOMEM) {
247					congestion_wait(BLK_RW_ASYNC,
248							DEFAULT_IO_TIMEOUT);
249					cond_resched();
250					goto retry;
251				}
252				err = -EAGAIN;
253				goto next;
254			}
255
256			err = f2fs_get_node_info(sbi, dn.nid, &ni);
257			if (err) {
258				f2fs_put_dnode(&dn);
259				return err;
260			}
261
262			if (cur->old_addr == NEW_ADDR) {
263				f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
264				f2fs_update_data_blkaddr(&dn, NEW_ADDR);
265			} else
266				f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
267					cur->old_addr, ni.version, true, true);
268			f2fs_put_dnode(&dn);
269		}
270next:
271		/* we don't need to invalidate this in the sccessful status */
272		if (drop || recover) {
273			ClearPageUptodate(page);
274			clear_page_private_gcing(page);
275		}
276		detach_page_private(page);
277		set_page_private(page, 0);
278		f2fs_put_page(page, 1);
279
280		list_del(&cur->list);
281		kmem_cache_free(inmem_entry_slab, cur);
282		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
283	}
284	return err;
285}
286
287void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
288{
289	struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
290	struct inode *inode;
291	struct f2fs_inode_info *fi;
292	unsigned int count = sbi->atomic_files;
293	unsigned int looped = 0;
294next:
295	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
296	if (list_empty(head)) {
297		spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
298		return;
299	}
300	fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
301	inode = igrab(&fi->vfs_inode);
302	if (inode)
303		list_move_tail(&fi->inmem_ilist, head);
304	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
305
306	if (inode) {
307		if (gc_failure) {
308			if (!fi->i_gc_failures[GC_FAILURE_ATOMIC])
309				goto skip;
310		}
311		set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
312		f2fs_drop_inmem_pages(inode);
313skip:
314		iput(inode);
315	}
316	congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
317	cond_resched();
318	if (gc_failure) {
319		if (++looped >= count)
320			return;
321	}
322	goto next;
323}
324
325void f2fs_drop_inmem_pages(struct inode *inode)
326{
327	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
328	struct f2fs_inode_info *fi = F2FS_I(inode);
329
330	do {
331		mutex_lock(&fi->inmem_lock);
332		if (list_empty(&fi->inmem_pages)) {
333			fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
334
335			spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
336			if (!list_empty(&fi->inmem_ilist))
337				list_del_init(&fi->inmem_ilist);
338			if (f2fs_is_atomic_file(inode)) {
339				clear_inode_flag(inode, FI_ATOMIC_FILE);
340				sbi->atomic_files--;
341			}
342			spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
343
344			mutex_unlock(&fi->inmem_lock);
345			break;
346		}
347		__revoke_inmem_pages(inode, &fi->inmem_pages,
348						true, false, true);
349		mutex_unlock(&fi->inmem_lock);
350	} while (1);
351}
352
353void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
354{
355	struct f2fs_inode_info *fi = F2FS_I(inode);
356	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
357	struct list_head *head = &fi->inmem_pages;
358	struct inmem_pages *cur = NULL;
359
360	f2fs_bug_on(sbi, !page_private_atomic(page));
361
362	mutex_lock(&fi->inmem_lock);
363	list_for_each_entry(cur, head, list) {
364		if (cur->page == page)
365			break;
366	}
367
368	f2fs_bug_on(sbi, list_empty(head) || cur->page != page);
369	list_del(&cur->list);
370	mutex_unlock(&fi->inmem_lock);
371
372	dec_page_count(sbi, F2FS_INMEM_PAGES);
373	kmem_cache_free(inmem_entry_slab, cur);
374
375	ClearPageUptodate(page);
376	clear_page_private_atomic(page);
377	f2fs_put_page(page, 0);
378
379	detach_page_private(page);
380	set_page_private(page, 0);
381
382	trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
383}
384
385static int __f2fs_commit_inmem_pages(struct inode *inode)
386{
387	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
388	struct f2fs_inode_info *fi = F2FS_I(inode);
389	struct inmem_pages *cur, *tmp;
390	struct f2fs_io_info fio = {
391		.sbi = sbi,
392		.ino = inode->i_ino,
393		.type = DATA,
394		.op = REQ_OP_WRITE,
395		.op_flags = REQ_SYNC | REQ_PRIO,
396		.io_type = FS_DATA_IO,
397	};
398	struct list_head revoke_list;
399	bool submit_bio = false;
400	int err = 0;
401
402	INIT_LIST_HEAD(&revoke_list);
403
404	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
405		struct page *page = cur->page;
406
407		lock_page(page);
408		if (page->mapping == inode->i_mapping) {
409			trace_f2fs_commit_inmem_page(page, INMEM);
410
411			f2fs_wait_on_page_writeback(page, DATA, true, true);
412
413			set_page_dirty(page);
414			if (clear_page_dirty_for_io(page)) {
415				inode_dec_dirty_pages(inode);
416				f2fs_remove_dirty_inode(inode);
417			}
418retry:
419			fio.page = page;
420			fio.old_blkaddr = NULL_ADDR;
421			fio.encrypted_page = NULL;
422			fio.need_lock = LOCK_DONE;
423			err = f2fs_do_write_data_page(&fio);
424			if (err) {
425				if (err == -ENOMEM) {
426					congestion_wait(BLK_RW_ASYNC,
427							DEFAULT_IO_TIMEOUT);
428					cond_resched();
429					goto retry;
430				}
431				unlock_page(page);
432				break;
433			}
434			/* record old blkaddr for revoking */
435			cur->old_addr = fio.old_blkaddr;
436			submit_bio = true;
437		}
438		unlock_page(page);
439		list_move_tail(&cur->list, &revoke_list);
440	}
441
442	if (submit_bio)
443		f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA);
444
445	if (err) {
446		/*
447		 * try to revoke all committed pages, but still we could fail
448		 * due to no memory or other reason, if that happened, EAGAIN
449		 * will be returned, which means in such case, transaction is
450		 * already not integrity, caller should use journal to do the
451		 * recovery or rewrite & commit last transaction. For other
452		 * error number, revoking was done by filesystem itself.
453		 */
454		err = __revoke_inmem_pages(inode, &revoke_list,
455						false, true, false);
456
457		/* drop all uncommitted pages */
458		__revoke_inmem_pages(inode, &fi->inmem_pages,
459						true, false, false);
460	} else {
461		__revoke_inmem_pages(inode, &revoke_list,
462						false, false, false);
463	}
464
465	return err;
466}
467
468int f2fs_commit_inmem_pages(struct inode *inode)
469{
470	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
471	struct f2fs_inode_info *fi = F2FS_I(inode);
472	int err;
473
474	f2fs_balance_fs(sbi, true);
475
476	down_write(&fi->i_gc_rwsem[WRITE]);
477
478	f2fs_lock_op(sbi);
479	set_inode_flag(inode, FI_ATOMIC_COMMIT);
480
481	mutex_lock(&fi->inmem_lock);
482	err = __f2fs_commit_inmem_pages(inode);
483	mutex_unlock(&fi->inmem_lock);
484
485	clear_inode_flag(inode, FI_ATOMIC_COMMIT);
486
487	f2fs_unlock_op(sbi);
488	up_write(&fi->i_gc_rwsem[WRITE]);
489
490	return err;
491}
492
493/*
494 * This function balances dirty node and dentry pages.
495 * In addition, it controls garbage collection.
496 */
497void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
498{
499	if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
500		f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
501		f2fs_stop_checkpoint(sbi, false);
502	}
503
504	/* balance_fs_bg is able to be pending */
505	if (need && excess_cached_nats(sbi))
506		f2fs_balance_fs_bg(sbi, false);
507
508	if (!f2fs_is_checkpoint_ready(sbi))
509		return;
510
511	/*
512	 * We should do GC or end up with checkpoint, if there are so many dirty
513	 * dir/node pages without enough free segments.
514	 */
515	if (has_not_enough_free_secs(sbi, 0, 0)) {
516		if (test_opt(sbi, GC_MERGE) && sbi->gc_thread &&
517					sbi->gc_thread->f2fs_gc_task) {
518			DEFINE_WAIT(wait);
519
520			prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait,
521						TASK_UNINTERRUPTIBLE);
522			wake_up(&sbi->gc_thread->gc_wait_queue_head);
523			io_schedule();
524			finish_wait(&sbi->gc_thread->fggc_wq, &wait);
525		} else {
526			down_write(&sbi->gc_lock);
527			f2fs_gc(sbi, false, false, false, NULL_SEGNO);
528		}
529	}
530}
531
532void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
533{
534	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
535		return;
536
537	/* try to shrink extent cache when there is no enough memory */
538	if (!f2fs_available_free_memory(sbi, EXTENT_CACHE))
539		f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
540
541	/* check the # of cached NAT entries */
542	if (!f2fs_available_free_memory(sbi, NAT_ENTRIES))
543		f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
544
545	if (!f2fs_available_free_memory(sbi, FREE_NIDS))
546		f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS);
547	else
548		f2fs_build_free_nids(sbi, false, false);
549
550	if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) ||
551		excess_prefree_segs(sbi))
552		goto do_sync;
553
554	/* there is background inflight IO or foreground operation recently */
555	if (is_inflight_io(sbi, REQ_TIME) ||
556		(!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
557		return;
558
559	/* exceed periodical checkpoint timeout threshold */
560	if (f2fs_time_over(sbi, CP_TIME))
561		goto do_sync;
562
563	/* checkpoint is the only way to shrink partial cached entries */
564	if (f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
565		f2fs_available_free_memory(sbi, INO_ENTRIES))
566		return;
567
568do_sync:
569	if (test_opt(sbi, DATA_FLUSH) && from_bg) {
570		struct blk_plug plug;
571
572		mutex_lock(&sbi->flush_lock);
573
574		blk_start_plug(&plug);
575		f2fs_sync_dirty_inodes(sbi, FILE_INODE);
576		blk_finish_plug(&plug);
577
578		mutex_unlock(&sbi->flush_lock);
579	}
580	f2fs_sync_fs(sbi->sb, true);
581	stat_inc_bg_cp_count(sbi->stat_info);
582}
583
584static int __submit_flush_wait(struct f2fs_sb_info *sbi,
585				struct block_device *bdev)
586{
587	int ret = blkdev_issue_flush(bdev);
588
589	trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
590				test_opt(sbi, FLUSH_MERGE), ret);
591	return ret;
592}
593
594static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino)
595{
596	int ret = 0;
597	int i;
598
599	if (!f2fs_is_multi_device(sbi))
600		return __submit_flush_wait(sbi, sbi->sb->s_bdev);
601
602	for (i = 0; i < sbi->s_ndevs; i++) {
603		if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO))
604			continue;
605		ret = __submit_flush_wait(sbi, FDEV(i).bdev);
606		if (ret)
607			break;
608	}
609	return ret;
610}
611
612static int issue_flush_thread(void *data)
613{
614	struct f2fs_sb_info *sbi = data;
615	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
616	wait_queue_head_t *q = &fcc->flush_wait_queue;
617repeat:
618	if (kthread_should_stop())
619		return 0;
620
621	if (!llist_empty(&fcc->issue_list)) {
622		struct flush_cmd *cmd, *next;
623		int ret;
624
625		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
626		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
627
628		cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode);
629
630		ret = submit_flush_wait(sbi, cmd->ino);
631		atomic_inc(&fcc->issued_flush);
632
633		llist_for_each_entry_safe(cmd, next,
634					  fcc->dispatch_list, llnode) {
635			cmd->ret = ret;
636			complete(&cmd->wait);
637		}
638		fcc->dispatch_list = NULL;
639	}
640
641	wait_event_interruptible(*q,
642		kthread_should_stop() || !llist_empty(&fcc->issue_list));
643	goto repeat;
644}
645
646int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
647{
648	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
649	struct flush_cmd cmd;
650	int ret;
651
652	if (test_opt(sbi, NOBARRIER))
653		return 0;
654
655	if (!test_opt(sbi, FLUSH_MERGE)) {
656		atomic_inc(&fcc->queued_flush);
657		ret = submit_flush_wait(sbi, ino);
658		atomic_dec(&fcc->queued_flush);
659		atomic_inc(&fcc->issued_flush);
660		return ret;
661	}
662
663	if (atomic_inc_return(&fcc->queued_flush) == 1 ||
664	    f2fs_is_multi_device(sbi)) {
665		ret = submit_flush_wait(sbi, ino);
666		atomic_dec(&fcc->queued_flush);
667
668		atomic_inc(&fcc->issued_flush);
669		return ret;
670	}
671
672	cmd.ino = ino;
673	init_completion(&cmd.wait);
674
675	llist_add(&cmd.llnode, &fcc->issue_list);
676
677	/*
678	 * update issue_list before we wake up issue_flush thread, this
679	 * smp_mb() pairs with another barrier in ___wait_event(), see
680	 * more details in comments of waitqueue_active().
681	 */
682	smp_mb();
683
684	if (waitqueue_active(&fcc->flush_wait_queue))
685		wake_up(&fcc->flush_wait_queue);
686
687	if (fcc->f2fs_issue_flush) {
688		wait_for_completion(&cmd.wait);
689		atomic_dec(&fcc->queued_flush);
690	} else {
691		struct llist_node *list;
692
693		list = llist_del_all(&fcc->issue_list);
694		if (!list) {
695			wait_for_completion(&cmd.wait);
696			atomic_dec(&fcc->queued_flush);
697		} else {
698			struct flush_cmd *tmp, *next;
699
700			ret = submit_flush_wait(sbi, ino);
701
702			llist_for_each_entry_safe(tmp, next, list, llnode) {
703				if (tmp == &cmd) {
704					cmd.ret = ret;
705					atomic_dec(&fcc->queued_flush);
706					continue;
707				}
708				tmp->ret = ret;
709				complete(&tmp->wait);
710			}
711		}
712	}
713
714	return cmd.ret;
715}
716
717int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
718{
719	dev_t dev = sbi->sb->s_bdev->bd_dev;
720	struct flush_cmd_control *fcc;
721	int err = 0;
722
723	if (SM_I(sbi)->fcc_info) {
724		fcc = SM_I(sbi)->fcc_info;
725		if (fcc->f2fs_issue_flush)
726			return err;
727		goto init_thread;
728	}
729
730	fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL);
731	if (!fcc)
732		return -ENOMEM;
733	atomic_set(&fcc->issued_flush, 0);
734	atomic_set(&fcc->queued_flush, 0);
735	init_waitqueue_head(&fcc->flush_wait_queue);
736	init_llist_head(&fcc->issue_list);
737	SM_I(sbi)->fcc_info = fcc;
738	if (!test_opt(sbi, FLUSH_MERGE))
739		return err;
740
741init_thread:
742	fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
743				"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
744	if (IS_ERR(fcc->f2fs_issue_flush)) {
745		err = PTR_ERR(fcc->f2fs_issue_flush);
746		kfree(fcc);
747		SM_I(sbi)->fcc_info = NULL;
748		return err;
749	}
750
751	return err;
752}
753
754void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
755{
756	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
757
758	if (fcc && fcc->f2fs_issue_flush) {
759		struct task_struct *flush_thread = fcc->f2fs_issue_flush;
760
761		fcc->f2fs_issue_flush = NULL;
762		kthread_stop(flush_thread);
763	}
764	if (free) {
765		kfree(fcc);
766		SM_I(sbi)->fcc_info = NULL;
767	}
768}
769
770int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
771{
772	int ret = 0, i;
773
774	if (!f2fs_is_multi_device(sbi))
775		return 0;
776
777	if (test_opt(sbi, NOBARRIER))
778		return 0;
779
780	for (i = 1; i < sbi->s_ndevs; i++) {
781		int count = DEFAULT_RETRY_IO_COUNT;
782
783		if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
784			continue;
785
786		do {
787			ret = __submit_flush_wait(sbi, FDEV(i).bdev);
788			if (ret)
789				congestion_wait(BLK_RW_ASYNC,
790						DEFAULT_IO_TIMEOUT);
791		} while (ret && --count);
792
793		if (ret) {
794			f2fs_stop_checkpoint(sbi, false);
795			break;
796		}
797
798		spin_lock(&sbi->dev_lock);
799		f2fs_clear_bit(i, (char *)&sbi->dirty_device);
800		spin_unlock(&sbi->dev_lock);
801	}
802
803	return ret;
804}
805
806static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
807		enum dirty_type dirty_type)
808{
809	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
810
811	/* need not be added */
812	if (IS_CURSEG(sbi, segno))
813		return;
814
815	if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
816		dirty_i->nr_dirty[dirty_type]++;
817
818	if (dirty_type == DIRTY) {
819		struct seg_entry *sentry = get_seg_entry(sbi, segno);
820		enum dirty_type t = sentry->type;
821
822		if (unlikely(t >= DIRTY)) {
823			f2fs_bug_on(sbi, 1);
824			return;
825		}
826		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
827			dirty_i->nr_dirty[t]++;
828
829		if (__is_large_section(sbi)) {
830			unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
831			block_t valid_blocks =
832				get_valid_blocks(sbi, segno, true);
833
834			f2fs_bug_on(sbi, unlikely(!valid_blocks ||
835					valid_blocks == BLKS_PER_SEC(sbi)));
836
837			if (!IS_CURSEC(sbi, secno))
838				set_bit(secno, dirty_i->dirty_secmap);
839		}
840	}
841}
842
843static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
844		enum dirty_type dirty_type)
845{
846	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
847	block_t valid_blocks;
848
849	if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
850		dirty_i->nr_dirty[dirty_type]--;
851
852	if (dirty_type == DIRTY) {
853		struct seg_entry *sentry = get_seg_entry(sbi, segno);
854		enum dirty_type t = sentry->type;
855
856		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
857			dirty_i->nr_dirty[t]--;
858
859		valid_blocks = get_valid_blocks(sbi, segno, true);
860		if (valid_blocks == 0) {
861			clear_bit(GET_SEC_FROM_SEG(sbi, segno),
862						dirty_i->victim_secmap);
863#ifdef CONFIG_F2FS_CHECK_FS
864			clear_bit(segno, SIT_I(sbi)->invalid_segmap);
865#endif
866		}
867		if (__is_large_section(sbi)) {
868			unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
869
870			if (!valid_blocks ||
871					valid_blocks == BLKS_PER_SEC(sbi)) {
872				clear_bit(secno, dirty_i->dirty_secmap);
873				return;
874			}
875
876			if (!IS_CURSEC(sbi, secno))
877				set_bit(secno, dirty_i->dirty_secmap);
878		}
879	}
880}
881
882/*
883 * Should not occur error such as -ENOMEM.
884 * Adding dirty entry into seglist is not critical operation.
885 * If a given segment is one of current working segments, it won't be added.
886 */
887static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
888{
889	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
890	unsigned short valid_blocks, ckpt_valid_blocks;
891	unsigned int usable_blocks;
892
893	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
894		return;
895
896	usable_blocks = f2fs_usable_blks_in_seg(sbi, segno);
897	mutex_lock(&dirty_i->seglist_lock);
898
899	valid_blocks = get_valid_blocks(sbi, segno, false);
900	ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false);
901
902	if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
903		ckpt_valid_blocks == usable_blocks)) {
904		__locate_dirty_segment(sbi, segno, PRE);
905		__remove_dirty_segment(sbi, segno, DIRTY);
906	} else if (valid_blocks < usable_blocks) {
907		__locate_dirty_segment(sbi, segno, DIRTY);
908	} else {
909		/* Recovery routine with SSR needs this */
910		__remove_dirty_segment(sbi, segno, DIRTY);
911	}
912
913	mutex_unlock(&dirty_i->seglist_lock);
914}
915
916/* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */
917void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi)
918{
919	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
920	unsigned int segno;
921
922	mutex_lock(&dirty_i->seglist_lock);
923	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
924		if (get_valid_blocks(sbi, segno, false))
925			continue;
926		if (IS_CURSEG(sbi, segno))
927			continue;
928		__locate_dirty_segment(sbi, segno, PRE);
929		__remove_dirty_segment(sbi, segno, DIRTY);
930	}
931	mutex_unlock(&dirty_i->seglist_lock);
932}
933
934block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
935{
936	int ovp_hole_segs =
937		(overprovision_segments(sbi) - reserved_segments(sbi));
938	block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg;
939	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
940	block_t holes[2] = {0, 0};	/* DATA and NODE */
941	block_t unusable;
942	struct seg_entry *se;
943	unsigned int segno;
944
945	mutex_lock(&dirty_i->seglist_lock);
946	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
947		se = get_seg_entry(sbi, segno);
948		if (IS_NODESEG(se->type))
949			holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) -
950							se->valid_blocks;
951		else
952			holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) -
953							se->valid_blocks;
954	}
955	mutex_unlock(&dirty_i->seglist_lock);
956
957	unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE];
958	if (unusable > ovp_holes)
959		return unusable - ovp_holes;
960	return 0;
961}
962
963int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
964{
965	int ovp_hole_segs =
966		(overprovision_segments(sbi) - reserved_segments(sbi));
967	if (unusable > F2FS_OPTION(sbi).unusable_cap)
968		return -EAGAIN;
969	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
970		dirty_segments(sbi) > ovp_hole_segs)
971		return -EAGAIN;
972	return 0;
973}
974
975/* This is only used by SBI_CP_DISABLED */
976static unsigned int get_free_segment(struct f2fs_sb_info *sbi)
977{
978	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
979	unsigned int segno = 0;
980
981	mutex_lock(&dirty_i->seglist_lock);
982	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
983		if (get_valid_blocks(sbi, segno, false))
984			continue;
985		if (get_ckpt_valid_blocks(sbi, segno, false))
986			continue;
987		mutex_unlock(&dirty_i->seglist_lock);
988		return segno;
989	}
990	mutex_unlock(&dirty_i->seglist_lock);
991	return NULL_SEGNO;
992}
993
994static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
995		struct block_device *bdev, block_t lstart,
996		block_t start, block_t len)
997{
998	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
999	struct list_head *pend_list;
1000	struct discard_cmd *dc;
1001
1002	f2fs_bug_on(sbi, !len);
1003
1004	pend_list = &dcc->pend_list[plist_idx(len)];
1005
1006	dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL);
1007	INIT_LIST_HEAD(&dc->list);
1008	dc->bdev = bdev;
1009	dc->lstart = lstart;
1010	dc->start = start;
1011	dc->len = len;
1012	dc->ref = 0;
1013	dc->state = D_PREP;
1014	dc->queued = 0;
1015	dc->error = 0;
1016	init_completion(&dc->wait);
1017	list_add_tail(&dc->list, pend_list);
1018	spin_lock_init(&dc->lock);
1019	dc->bio_ref = 0;
1020	atomic_inc(&dcc->discard_cmd_cnt);
1021	dcc->undiscard_blks += len;
1022
1023	return dc;
1024}
1025
1026static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
1027				struct block_device *bdev, block_t lstart,
1028				block_t start, block_t len,
1029				struct rb_node *parent, struct rb_node **p,
1030				bool leftmost)
1031{
1032	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1033	struct discard_cmd *dc;
1034
1035	dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
1036
1037	rb_link_node(&dc->rb_node, parent, p);
1038	rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost);
1039
1040	return dc;
1041}
1042
1043static void __detach_discard_cmd(struct discard_cmd_control *dcc,
1044							struct discard_cmd *dc)
1045{
1046	if (dc->state == D_DONE)
1047		atomic_sub(dc->queued, &dcc->queued_discard);
1048
1049	list_del(&dc->list);
1050	rb_erase_cached(&dc->rb_node, &dcc->root);
1051	dcc->undiscard_blks -= dc->len;
1052
1053	kmem_cache_free(discard_cmd_slab, dc);
1054
1055	atomic_dec(&dcc->discard_cmd_cnt);
1056}
1057
1058static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
1059							struct discard_cmd *dc)
1060{
1061	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1062	unsigned long flags;
1063
1064	trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len);
1065
1066	spin_lock_irqsave(&dc->lock, flags);
1067	if (dc->bio_ref) {
1068		spin_unlock_irqrestore(&dc->lock, flags);
1069		return;
1070	}
1071	spin_unlock_irqrestore(&dc->lock, flags);
1072
1073	f2fs_bug_on(sbi, dc->ref);
1074
1075	if (dc->error == -EOPNOTSUPP)
1076		dc->error = 0;
1077
1078	if (dc->error)
1079		printk_ratelimited(
1080			"%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
1081			KERN_INFO, sbi->sb->s_id,
1082			dc->lstart, dc->start, dc->len, dc->error);
1083	__detach_discard_cmd(dcc, dc);
1084}
1085
1086static void f2fs_submit_discard_endio(struct bio *bio)
1087{
1088	struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
1089	unsigned long flags;
1090
1091	spin_lock_irqsave(&dc->lock, flags);
1092	if (!dc->error)
1093		dc->error = blk_status_to_errno(bio->bi_status);
1094	dc->bio_ref--;
1095	if (!dc->bio_ref && dc->state == D_SUBMIT) {
1096		dc->state = D_DONE;
1097		complete_all(&dc->wait);
1098	}
1099	spin_unlock_irqrestore(&dc->lock, flags);
1100	bio_put(bio);
1101}
1102
1103static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
1104				block_t start, block_t end)
1105{
1106#ifdef CONFIG_F2FS_CHECK_FS
1107	struct seg_entry *sentry;
1108	unsigned int segno;
1109	block_t blk = start;
1110	unsigned long offset, size, max_blocks = sbi->blocks_per_seg;
1111	unsigned long *map;
1112
1113	while (blk < end) {
1114		segno = GET_SEGNO(sbi, blk);
1115		sentry = get_seg_entry(sbi, segno);
1116		offset = GET_BLKOFF_FROM_SEG0(sbi, blk);
1117
1118		if (end < START_BLOCK(sbi, segno + 1))
1119			size = GET_BLKOFF_FROM_SEG0(sbi, end);
1120		else
1121			size = max_blocks;
1122		map = (unsigned long *)(sentry->cur_valid_map);
1123		offset = __find_rev_next_bit(map, size, offset);
1124		f2fs_bug_on(sbi, offset != size);
1125		blk = START_BLOCK(sbi, segno + 1);
1126	}
1127#endif
1128}
1129
1130static void __init_discard_policy(struct f2fs_sb_info *sbi,
1131				struct discard_policy *dpolicy,
1132				int discard_type, unsigned int granularity)
1133{
1134	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1135
1136	/* common policy */
1137	dpolicy->type = discard_type;
1138	dpolicy->sync = true;
1139	dpolicy->ordered = false;
1140	dpolicy->granularity = granularity;
1141
1142	dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
1143	dpolicy->io_aware_gran = MAX_PLIST_NUM;
1144	dpolicy->timeout = false;
1145
1146	if (discard_type == DPOLICY_BG) {
1147		dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
1148		dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
1149		dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
1150		dpolicy->io_aware = true;
1151		dpolicy->sync = false;
1152		dpolicy->ordered = true;
1153		if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
1154			dpolicy->granularity = 1;
1155			if (atomic_read(&dcc->discard_cmd_cnt))
1156				dpolicy->max_interval =
1157					DEF_MIN_DISCARD_ISSUE_TIME;
1158		}
1159	} else if (discard_type == DPOLICY_FORCE) {
1160		dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
1161		dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
1162		dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
1163		dpolicy->io_aware = false;
1164	} else if (discard_type == DPOLICY_FSTRIM) {
1165		dpolicy->io_aware = false;
1166	} else if (discard_type == DPOLICY_UMOUNT) {
1167		dpolicy->io_aware = false;
1168		/* we need to issue all to keep CP_TRIMMED_FLAG */
1169		dpolicy->granularity = 1;
1170		dpolicy->timeout = true;
1171	}
1172}
1173
1174static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
1175				struct block_device *bdev, block_t lstart,
1176				block_t start, block_t len);
1177/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
1178static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
1179						struct discard_policy *dpolicy,
1180						struct discard_cmd *dc,
1181						unsigned int *issued)
1182{
1183	struct block_device *bdev = dc->bdev;
1184	struct request_queue *q = bdev_get_queue(bdev);
1185	unsigned int max_discard_blocks =
1186			SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
1187	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1188	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
1189					&(dcc->fstrim_list) : &(dcc->wait_list);
1190	int flag = dpolicy->sync ? REQ_SYNC : 0;
1191	block_t lstart, start, len, total_len;
1192	int err = 0;
1193
1194	if (dc->state != D_PREP)
1195		return 0;
1196
1197	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
1198		return 0;
1199
1200	trace_f2fs_issue_discard(bdev, dc->start, dc->len);
1201
1202	lstart = dc->lstart;
1203	start = dc->start;
1204	len = dc->len;
1205	total_len = len;
1206
1207	dc->len = 0;
1208
1209	while (total_len && *issued < dpolicy->max_requests && !err) {
1210		struct bio *bio = NULL;
1211		unsigned long flags;
1212		bool last = true;
1213
1214		if (len > max_discard_blocks) {
1215			len = max_discard_blocks;
1216			last = false;
1217		}
1218
1219		(*issued)++;
1220		if (*issued == dpolicy->max_requests)
1221			last = true;
1222
1223		dc->len += len;
1224
1225		if (time_to_inject(sbi, FAULT_DISCARD)) {
1226			f2fs_show_injection_info(sbi, FAULT_DISCARD);
1227			err = -EIO;
1228			goto submit;
1229		}
1230		err = __blkdev_issue_discard(bdev,
1231					SECTOR_FROM_BLOCK(start),
1232					SECTOR_FROM_BLOCK(len),
1233					GFP_NOFS, 0, &bio);
1234submit:
1235		if (err) {
1236			spin_lock_irqsave(&dc->lock, flags);
1237			if (dc->state == D_PARTIAL)
1238				dc->state = D_SUBMIT;
1239			spin_unlock_irqrestore(&dc->lock, flags);
1240
1241			break;
1242		}
1243
1244		f2fs_bug_on(sbi, !bio);
1245
1246		/*
1247		 * should keep before submission to avoid D_DONE
1248		 * right away
1249		 */
1250		spin_lock_irqsave(&dc->lock, flags);
1251		if (last)
1252			dc->state = D_SUBMIT;
1253		else
1254			dc->state = D_PARTIAL;
1255		dc->bio_ref++;
1256		spin_unlock_irqrestore(&dc->lock, flags);
1257
1258		atomic_inc(&dcc->queued_discard);
1259		dc->queued++;
1260		list_move_tail(&dc->list, wait_list);
1261
1262		/* sanity check on discard range */
1263		__check_sit_bitmap(sbi, lstart, lstart + len);
1264
1265		bio->bi_private = dc;
1266		bio->bi_end_io = f2fs_submit_discard_endio;
1267		bio->bi_opf |= flag;
1268		submit_bio(bio);
1269
1270		atomic_inc(&dcc->issued_discard);
1271
1272		f2fs_update_iostat(sbi, FS_DISCARD, 1);
1273
1274		lstart += len;
1275		start += len;
1276		total_len -= len;
1277		len = total_len;
1278	}
1279
1280	if (!err && len) {
1281		dcc->undiscard_blks -= len;
1282		__update_discard_tree_range(sbi, bdev, lstart, start, len);
1283	}
1284	return err;
1285}
1286
1287static void __insert_discard_tree(struct f2fs_sb_info *sbi,
1288				struct block_device *bdev, block_t lstart,
1289				block_t start, block_t len,
1290				struct rb_node **insert_p,
1291				struct rb_node *insert_parent)
1292{
1293	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1294	struct rb_node **p;
1295	struct rb_node *parent = NULL;
1296	bool leftmost = true;
1297
1298	if (insert_p && insert_parent) {
1299		parent = insert_parent;
1300		p = insert_p;
1301		goto do_insert;
1302	}
1303
1304	p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent,
1305							lstart, &leftmost);
1306do_insert:
1307	__attach_discard_cmd(sbi, bdev, lstart, start, len, parent,
1308								p, leftmost);
1309}
1310
1311static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
1312						struct discard_cmd *dc)
1313{
1314	list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]);
1315}
1316
1317static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
1318				struct discard_cmd *dc, block_t blkaddr)
1319{
1320	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1321	struct discard_info di = dc->di;
1322	bool modified = false;
1323
1324	if (dc->state == D_DONE || dc->len == 1) {
1325		__remove_discard_cmd(sbi, dc);
1326		return;
1327	}
1328
1329	dcc->undiscard_blks -= di.len;
1330
1331	if (blkaddr > di.lstart) {
1332		dc->len = blkaddr - dc->lstart;
1333		dcc->undiscard_blks += dc->len;
1334		__relocate_discard_cmd(dcc, dc);
1335		modified = true;
1336	}
1337
1338	if (blkaddr < di.lstart + di.len - 1) {
1339		if (modified) {
1340			__insert_discard_tree(sbi, dc->bdev, blkaddr + 1,
1341					di.start + blkaddr + 1 - di.lstart,
1342					di.lstart + di.len - 1 - blkaddr,
1343					NULL, NULL);
1344		} else {
1345			dc->lstart++;
1346			dc->len--;
1347			dc->start++;
1348			dcc->undiscard_blks += dc->len;
1349			__relocate_discard_cmd(dcc, dc);
1350		}
1351	}
1352}
1353
1354static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
1355				struct block_device *bdev, block_t lstart,
1356				block_t start, block_t len)
1357{
1358	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1359	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
1360	struct discard_cmd *dc;
1361	struct discard_info di = {0};
1362	struct rb_node **insert_p = NULL, *insert_parent = NULL;
1363	struct request_queue *q = bdev_get_queue(bdev);
1364	unsigned int max_discard_blocks =
1365			SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
1366	block_t end = lstart + len;
1367
1368	dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
1369					NULL, lstart,
1370					(struct rb_entry **)&prev_dc,
1371					(struct rb_entry **)&next_dc,
1372					&insert_p, &insert_parent, true, NULL);
1373	if (dc)
1374		prev_dc = dc;
1375
1376	if (!prev_dc) {
1377		di.lstart = lstart;
1378		di.len = next_dc ? next_dc->lstart - lstart : len;
1379		di.len = min(di.len, len);
1380		di.start = start;
1381	}
1382
1383	while (1) {
1384		struct rb_node *node;
1385		bool merged = false;
1386		struct discard_cmd *tdc = NULL;
1387
1388		if (prev_dc) {
1389			di.lstart = prev_dc->lstart + prev_dc->len;
1390			if (di.lstart < lstart)
1391				di.lstart = lstart;
1392			if (di.lstart >= end)
1393				break;
1394
1395			if (!next_dc || next_dc->lstart > end)
1396				di.len = end - di.lstart;
1397			else
1398				di.len = next_dc->lstart - di.lstart;
1399			di.start = start + di.lstart - lstart;
1400		}
1401
1402		if (!di.len)
1403			goto next;
1404
1405		if (prev_dc && prev_dc->state == D_PREP &&
1406			prev_dc->bdev == bdev &&
1407			__is_discard_back_mergeable(&di, &prev_dc->di,
1408							max_discard_blocks)) {
1409			prev_dc->di.len += di.len;
1410			dcc->undiscard_blks += di.len;
1411			__relocate_discard_cmd(dcc, prev_dc);
1412			di = prev_dc->di;
1413			tdc = prev_dc;
1414			merged = true;
1415		}
1416
1417		if (next_dc && next_dc->state == D_PREP &&
1418			next_dc->bdev == bdev &&
1419			__is_discard_front_mergeable(&di, &next_dc->di,
1420							max_discard_blocks)) {
1421			next_dc->di.lstart = di.lstart;
1422			next_dc->di.len += di.len;
1423			next_dc->di.start = di.start;
1424			dcc->undiscard_blks += di.len;
1425			__relocate_discard_cmd(dcc, next_dc);
1426			if (tdc)
1427				__remove_discard_cmd(sbi, tdc);
1428			merged = true;
1429		}
1430
1431		if (!merged) {
1432			__insert_discard_tree(sbi, bdev, di.lstart, di.start,
1433							di.len, NULL, NULL);
1434		}
1435 next:
1436		prev_dc = next_dc;
1437		if (!prev_dc)
1438			break;
1439
1440		node = rb_next(&prev_dc->rb_node);
1441		next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
1442	}
1443}
1444
1445static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
1446		struct block_device *bdev, block_t blkstart, block_t blklen)
1447{
1448	block_t lblkstart = blkstart;
1449
1450	if (!f2fs_bdev_support_discard(bdev))
1451		return 0;
1452
1453	trace_f2fs_queue_discard(bdev, blkstart, blklen);
1454
1455	if (f2fs_is_multi_device(sbi)) {
1456		int devi = f2fs_target_device_index(sbi, blkstart);
1457
1458		blkstart -= FDEV(devi).start_blk;
1459	}
1460	mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
1461	__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
1462	mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
1463	return 0;
1464}
1465
1466static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
1467					struct discard_policy *dpolicy)
1468{
1469	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1470	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
1471	struct rb_node **insert_p = NULL, *insert_parent = NULL;
1472	struct discard_cmd *dc;
1473	struct blk_plug plug;
1474	unsigned int pos = dcc->next_pos;
1475	unsigned int issued = 0;
1476	bool io_interrupted = false;
1477
1478	mutex_lock(&dcc->cmd_lock);
1479	dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
1480					NULL, pos,
1481					(struct rb_entry **)&prev_dc,
1482					(struct rb_entry **)&next_dc,
1483					&insert_p, &insert_parent, true, NULL);
1484	if (!dc)
1485		dc = next_dc;
1486
1487	blk_start_plug(&plug);
1488
1489	while (dc) {
1490		struct rb_node *node;
1491		int err = 0;
1492
1493		if (dc->state != D_PREP)
1494			goto next;
1495
1496		if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) {
1497			io_interrupted = true;
1498			break;
1499		}
1500
1501		dcc->next_pos = dc->lstart + dc->len;
1502		err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
1503
1504		if (issued >= dpolicy->max_requests)
1505			break;
1506next:
1507		node = rb_next(&dc->rb_node);
1508		if (err)
1509			__remove_discard_cmd(sbi, dc);
1510		dc = rb_entry_safe(node, struct discard_cmd, rb_node);
1511	}
1512
1513	blk_finish_plug(&plug);
1514
1515	if (!dc)
1516		dcc->next_pos = 0;
1517
1518	mutex_unlock(&dcc->cmd_lock);
1519
1520	if (!issued && io_interrupted)
1521		issued = -1;
1522
1523	return issued;
1524}
1525static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
1526					struct discard_policy *dpolicy);
1527
1528static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
1529					struct discard_policy *dpolicy)
1530{
1531	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1532	struct list_head *pend_list;
1533	struct discard_cmd *dc, *tmp;
1534	struct blk_plug plug;
1535	int i, issued;
1536	bool io_interrupted = false;
1537
1538	if (dpolicy->timeout)
1539		f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
1540
1541retry:
1542	issued = 0;
1543	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
1544		if (dpolicy->timeout &&
1545				f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
1546			break;
1547
1548		if (i + 1 < dpolicy->granularity)
1549			break;
1550
1551		if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
1552			return __issue_discard_cmd_orderly(sbi, dpolicy);
1553
1554		pend_list = &dcc->pend_list[i];
1555
1556		mutex_lock(&dcc->cmd_lock);
1557		if (list_empty(pend_list))
1558			goto next;
1559		if (unlikely(dcc->rbtree_check))
1560			f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
1561							&dcc->root, false));
1562		blk_start_plug(&plug);
1563		list_for_each_entry_safe(dc, tmp, pend_list, list) {
1564			f2fs_bug_on(sbi, dc->state != D_PREP);
1565
1566			if (dpolicy->timeout &&
1567				f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
1568				break;
1569
1570			if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
1571						!is_idle(sbi, DISCARD_TIME)) {
1572				io_interrupted = true;
1573				break;
1574			}
1575
1576			__submit_discard_cmd(sbi, dpolicy, dc, &issued);
1577
1578			if (issued >= dpolicy->max_requests)
1579				break;
1580		}
1581		blk_finish_plug(&plug);
1582next:
1583		mutex_unlock(&dcc->cmd_lock);
1584
1585		if (issued >= dpolicy->max_requests || io_interrupted)
1586			break;
1587	}
1588
1589	if (dpolicy->type == DPOLICY_UMOUNT && issued) {
1590		__wait_all_discard_cmd(sbi, dpolicy);
1591		goto retry;
1592	}
1593
1594	if (!issued && io_interrupted)
1595		issued = -1;
1596
1597	return issued;
1598}
1599
1600static bool __drop_discard_cmd(struct f2fs_sb_info *sbi)
1601{
1602	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1603	struct list_head *pend_list;
1604	struct discard_cmd *dc, *tmp;
1605	int i;
1606	bool dropped = false;
1607
1608	mutex_lock(&dcc->cmd_lock);
1609	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
1610		pend_list = &dcc->pend_list[i];
1611		list_for_each_entry_safe(dc, tmp, pend_list, list) {
1612			f2fs_bug_on(sbi, dc->state != D_PREP);
1613			__remove_discard_cmd(sbi, dc);
1614			dropped = true;
1615		}
1616	}
1617	mutex_unlock(&dcc->cmd_lock);
1618
1619	return dropped;
1620}
1621
1622void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi)
1623{
1624	__drop_discard_cmd(sbi);
1625}
1626
1627static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi,
1628							struct discard_cmd *dc)
1629{
1630	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1631	unsigned int len = 0;
1632
1633	wait_for_completion_io(&dc->wait);
1634	mutex_lock(&dcc->cmd_lock);
1635	f2fs_bug_on(sbi, dc->state != D_DONE);
1636	dc->ref--;
1637	if (!dc->ref) {
1638		if (!dc->error)
1639			len = dc->len;
1640		__remove_discard_cmd(sbi, dc);
1641	}
1642	mutex_unlock(&dcc->cmd_lock);
1643
1644	return len;
1645}
1646
1647static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
1648						struct discard_policy *dpolicy,
1649						block_t start, block_t end)
1650{
1651	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1652	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
1653					&(dcc->fstrim_list) : &(dcc->wait_list);
1654	struct discard_cmd *dc, *tmp;
1655	bool need_wait;
1656	unsigned int trimmed = 0;
1657
1658next:
1659	need_wait = false;
1660
1661	mutex_lock(&dcc->cmd_lock);
1662	list_for_each_entry_safe(dc, tmp, wait_list, list) {
1663		if (dc->lstart + dc->len <= start || end <= dc->lstart)
1664			continue;
1665		if (dc->len < dpolicy->granularity)
1666			continue;
1667		if (dc->state == D_DONE && !dc->ref) {
1668			wait_for_completion_io(&dc->wait);
1669			if (!dc->error)
1670				trimmed += dc->len;
1671			__remove_discard_cmd(sbi, dc);
1672		} else {
1673			dc->ref++;
1674			need_wait = true;
1675			break;
1676		}
1677	}
1678	mutex_unlock(&dcc->cmd_lock);
1679
1680	if (need_wait) {
1681		trimmed += __wait_one_discard_bio(sbi, dc);
1682		goto next;
1683	}
1684
1685	return trimmed;
1686}
1687
1688static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
1689						struct discard_policy *dpolicy)
1690{
1691	struct discard_policy dp;
1692	unsigned int discard_blks;
1693
1694	if (dpolicy)
1695		return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX);
1696
1697	/* wait all */
1698	__init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1);
1699	discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
1700	__init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1);
1701	discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
1702
1703	return discard_blks;
1704}
1705
1706/* This should be covered by global mutex, &sit_i->sentry_lock */
1707static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
1708{
1709	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1710	struct discard_cmd *dc;
1711	bool need_wait = false;
1712
1713	mutex_lock(&dcc->cmd_lock);
1714	dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root,
1715							NULL, blkaddr);
1716	if (dc) {
1717		if (dc->state == D_PREP) {
1718			__punch_discard_cmd(sbi, dc, blkaddr);
1719		} else {
1720			dc->ref++;
1721			need_wait = true;
1722		}
1723	}
1724	mutex_unlock(&dcc->cmd_lock);
1725
1726	if (need_wait)
1727		__wait_one_discard_bio(sbi, dc);
1728}
1729
1730void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
1731{
1732	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1733
1734	if (dcc && dcc->f2fs_issue_discard) {
1735		struct task_struct *discard_thread = dcc->f2fs_issue_discard;
1736
1737		dcc->f2fs_issue_discard = NULL;
1738		kthread_stop(discard_thread);
1739	}
1740}
1741
1742/* This comes from f2fs_put_super */
1743bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
1744{
1745	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1746	struct discard_policy dpolicy;
1747	bool dropped;
1748
1749	__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
1750					dcc->discard_granularity);
1751	__issue_discard_cmd(sbi, &dpolicy);
1752	dropped = __drop_discard_cmd(sbi);
1753
1754	/* just to make sure there is no pending discard commands */
1755	__wait_all_discard_cmd(sbi, NULL);
1756
1757	f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt));
1758	return dropped;
1759}
1760
1761static int issue_discard_thread(void *data)
1762{
1763	struct f2fs_sb_info *sbi = data;
1764	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1765	wait_queue_head_t *q = &dcc->discard_wait_queue;
1766	struct discard_policy dpolicy;
1767	unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
1768	int issued;
1769
1770	set_freezable();
1771
1772	do {
1773		if (sbi->gc_mode == GC_URGENT_HIGH ||
1774			!f2fs_available_free_memory(sbi, DISCARD_CACHE))
1775			__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
1776		else
1777			__init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
1778						dcc->discard_granularity);
1779
1780		if (!atomic_read(&dcc->discard_cmd_cnt))
1781		       wait_ms = dpolicy.max_interval;
1782
1783		wait_event_interruptible_timeout(*q,
1784				kthread_should_stop() || freezing(current) ||
1785				dcc->discard_wake,
1786				msecs_to_jiffies(wait_ms));
1787
1788		if (dcc->discard_wake)
1789			dcc->discard_wake = 0;
1790
1791		/* clean up pending candidates before going to sleep */
1792		if (atomic_read(&dcc->queued_discard))
1793			__wait_all_discard_cmd(sbi, NULL);
1794
1795		if (try_to_freeze())
1796			continue;
1797		if (f2fs_readonly(sbi->sb))
1798			continue;
1799		if (kthread_should_stop())
1800			return 0;
1801		if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
1802			wait_ms = dpolicy.max_interval;
1803			continue;
1804		}
1805		if (!atomic_read(&dcc->discard_cmd_cnt))
1806			continue;
1807
1808		sb_start_intwrite(sbi->sb);
1809
1810		issued = __issue_discard_cmd(sbi, &dpolicy);
1811		if (issued > 0) {
1812			__wait_all_discard_cmd(sbi, &dpolicy);
1813			wait_ms = dpolicy.min_interval;
1814		} else if (issued == -1) {
1815			wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME);
1816			if (!wait_ms)
1817				wait_ms = dpolicy.mid_interval;
1818		} else {
1819			wait_ms = dpolicy.max_interval;
1820		}
1821
1822		sb_end_intwrite(sbi->sb);
1823
1824	} while (!kthread_should_stop());
1825	return 0;
1826}
1827
1828#ifdef CONFIG_BLK_DEV_ZONED
1829static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
1830		struct block_device *bdev, block_t blkstart, block_t blklen)
1831{
1832	sector_t sector, nr_sects;
1833	block_t lblkstart = blkstart;
1834	int devi = 0;
1835
1836	if (f2fs_is_multi_device(sbi)) {
1837		devi = f2fs_target_device_index(sbi, blkstart);
1838		if (blkstart < FDEV(devi).start_blk ||
1839		    blkstart > FDEV(devi).end_blk) {
1840			f2fs_err(sbi, "Invalid block %x", blkstart);
1841			return -EIO;
1842		}
1843		blkstart -= FDEV(devi).start_blk;
1844	}
1845
1846	/* For sequential zones, reset the zone write pointer */
1847	if (f2fs_blkz_is_seq(sbi, devi, blkstart)) {
1848		sector = SECTOR_FROM_BLOCK(blkstart);
1849		nr_sects = SECTOR_FROM_BLOCK(blklen);
1850
1851		if (sector & (bdev_zone_sectors(bdev) - 1) ||
1852				nr_sects != bdev_zone_sectors(bdev)) {
1853			f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)",
1854				 devi, sbi->s_ndevs ? FDEV(devi).path : "",
1855				 blkstart, blklen);
1856			return -EIO;
1857		}
1858		trace_f2fs_issue_reset_zone(bdev, blkstart);
1859		return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1860					sector, nr_sects, GFP_NOFS);
1861	}
1862
1863	/* For conventional zones, use regular discard if supported */
1864	return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
1865}
1866#endif
1867
1868static int __issue_discard_async(struct f2fs_sb_info *sbi,
1869		struct block_device *bdev, block_t blkstart, block_t blklen)
1870{
1871#ifdef CONFIG_BLK_DEV_ZONED
1872	if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev))
1873		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
1874#endif
1875	return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
1876}
1877
1878static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
1879				block_t blkstart, block_t blklen)
1880{
1881	sector_t start = blkstart, len = 0;
1882	struct block_device *bdev;
1883	struct seg_entry *se;
1884	unsigned int offset;
1885	block_t i;
1886	int err = 0;
1887
1888	bdev = f2fs_target_device(sbi, blkstart, NULL);
1889
1890	for (i = blkstart; i < blkstart + blklen; i++, len++) {
1891		if (i != start) {
1892			struct block_device *bdev2 =
1893				f2fs_target_device(sbi, i, NULL);
1894
1895			if (bdev2 != bdev) {
1896				err = __issue_discard_async(sbi, bdev,
1897						start, len);
1898				if (err)
1899					return err;
1900				bdev = bdev2;
1901				start = i;
1902				len = 0;
1903			}
1904		}
1905
1906		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
1907		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
1908
1909		if (f2fs_block_unit_discard(sbi) &&
1910				!f2fs_test_and_set_bit(offset, se->discard_map))
1911			sbi->discard_blks--;
1912	}
1913
1914	if (len)
1915		err = __issue_discard_async(sbi, bdev, start, len);
1916	return err;
1917}
1918
1919static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
1920							bool check_only)
1921{
1922	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
1923	int max_blocks = sbi->blocks_per_seg;
1924	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
1925	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
1926	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
1927	unsigned long *discard_map = (unsigned long *)se->discard_map;
1928	unsigned long *dmap = SIT_I(sbi)->tmp_map;
1929	unsigned int start = 0, end = -1;
1930	bool force = (cpc->reason & CP_DISCARD);
1931	struct discard_entry *de = NULL;
1932	struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
1933	int i;
1934
1935	if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
1936			!f2fs_block_unit_discard(sbi))
1937		return false;
1938
1939	if (!force) {
1940		if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
1941			SM_I(sbi)->dcc_info->nr_discards >=
1942				SM_I(sbi)->dcc_info->max_discards)
1943			return false;
1944	}
1945
1946	/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
1947	for (i = 0; i < entries; i++)
1948		dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] :
1949				(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
1950
1951	while (force || SM_I(sbi)->dcc_info->nr_discards <=
1952				SM_I(sbi)->dcc_info->max_discards) {
1953		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
1954		if (start >= max_blocks)
1955			break;
1956
1957		end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
1958		if (force && start && end != max_blocks
1959					&& (end - start) < cpc->trim_minlen)
1960			continue;
1961
1962		if (check_only)
1963			return true;
1964
1965		if (!de) {
1966			de = f2fs_kmem_cache_alloc(discard_entry_slab,
1967						GFP_F2FS_ZERO, true, NULL);
1968			de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
1969			list_add_tail(&de->list, head);
1970		}
1971
1972		for (i = start; i < end; i++)
1973			__set_bit_le(i, (void *)de->discard_map);
1974
1975		SM_I(sbi)->dcc_info->nr_discards += end - start;
1976	}
1977	return false;
1978}
1979
1980static void release_discard_addr(struct discard_entry *entry)
1981{
1982	list_del(&entry->list);
1983	kmem_cache_free(discard_entry_slab, entry);
1984}
1985
1986void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi)
1987{
1988	struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
1989	struct discard_entry *entry, *this;
1990
1991	/* drop caches */
1992	list_for_each_entry_safe(entry, this, head, list)
1993		release_discard_addr(entry);
1994}
1995
1996/*
1997 * Should call f2fs_clear_prefree_segments after checkpoint is done.
1998 */
1999static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
2000{
2001	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
2002	unsigned int segno;
2003
2004	mutex_lock(&dirty_i->seglist_lock);
2005	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
2006		__set_test_and_free(sbi, segno, false);
2007	mutex_unlock(&dirty_i->seglist_lock);
2008}
2009
2010void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
2011						struct cp_control *cpc)
2012{
2013	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
2014	struct list_head *head = &dcc->entry_list;
2015	struct discard_entry *entry, *this;
2016	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
2017	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
2018	unsigned int start = 0, end = -1;
2019	unsigned int secno, start_segno;
2020	bool force = (cpc->reason & CP_DISCARD);
2021	bool section_alignment = F2FS_OPTION(sbi).discard_unit ==
2022						DISCARD_UNIT_SECTION;
2023
2024	if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
2025		section_alignment = true;
2026
2027	mutex_lock(&dirty_i->seglist_lock);
2028
2029	while (1) {
2030		int i;
2031
2032		if (section_alignment && end != -1)
2033			end--;
2034		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
2035		if (start >= MAIN_SEGS(sbi))
2036			break;
2037		end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
2038								start + 1);
2039
2040		if (section_alignment) {
2041			start = rounddown(start, sbi->segs_per_sec);
2042			end = roundup(end, sbi->segs_per_sec);
2043		}
2044
2045		for (i = start; i < end; i++) {
2046			if (test_and_clear_bit(i, prefree_map))
2047				dirty_i->nr_dirty[PRE]--;
2048		}
2049
2050		if (!f2fs_realtime_discard_enable(sbi))
2051			continue;
2052
2053		if (force && start >= cpc->trim_start &&
2054					(end - 1) <= cpc->trim_end)
2055				continue;
2056
2057		if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) {
2058			f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
2059				(end - start) << sbi->log_blocks_per_seg);
2060			continue;
2061		}
2062next:
2063		secno = GET_SEC_FROM_SEG(sbi, start);
2064		start_segno = GET_SEG_FROM_SEC(sbi, secno);
2065		if (!IS_CURSEC(sbi, secno) &&
2066			!get_valid_blocks(sbi, start, true))
2067			f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
2068				sbi->segs_per_sec << sbi->log_blocks_per_seg);
2069
2070		start = start_segno + sbi->segs_per_sec;
2071		if (start < end)
2072			goto next;
2073		else
2074			end = start - 1;
2075	}
2076	mutex_unlock(&dirty_i->seglist_lock);
2077
2078	if (!f2fs_block_unit_discard(sbi))
2079		goto wakeup;
2080
2081	/* send small discards */
2082	list_for_each_entry_safe(entry, this, head, list) {
2083		unsigned int cur_pos = 0, next_pos, len, total_len = 0;
2084		bool is_valid = test_bit_le(0, entry->discard_map);
2085
2086find_next:
2087		if (is_valid) {
2088			next_pos = find_next_zero_bit_le(entry->discard_map,
2089					sbi->blocks_per_seg, cur_pos);
2090			len = next_pos - cur_pos;
2091
2092			if (f2fs_sb_has_blkzoned(sbi) ||
2093			    (force && len < cpc->trim_minlen))
2094				goto skip;
2095
2096			f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
2097									len);
2098			total_len += len;
2099		} else {
2100			next_pos = find_next_bit_le(entry->discard_map,
2101					sbi->blocks_per_seg, cur_pos);
2102		}
2103skip:
2104		cur_pos = next_pos;
2105		is_valid = !is_valid;
2106
2107		if (cur_pos < sbi->blocks_per_seg)
2108			goto find_next;
2109
2110		release_discard_addr(entry);
2111		dcc->nr_discards -= total_len;
2112	}
2113
2114wakeup:
2115	wake_up_discard_thread(sbi, false);
2116}
2117
2118static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
2119{
2120	dev_t dev = sbi->sb->s_bdev->bd_dev;
2121	struct discard_cmd_control *dcc;
2122	int err = 0, i;
2123
2124	if (SM_I(sbi)->dcc_info) {
2125		dcc = SM_I(sbi)->dcc_info;
2126		goto init_thread;
2127	}
2128
2129	dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL);
2130	if (!dcc)
2131		return -ENOMEM;
2132
2133	dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
2134	if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
2135		dcc->discard_granularity = sbi->blocks_per_seg;
2136	else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
2137		dcc->discard_granularity = BLKS_PER_SEC(sbi);
2138
2139	INIT_LIST_HEAD(&dcc->entry_list);
2140	for (i = 0; i < MAX_PLIST_NUM; i++)
2141		INIT_LIST_HEAD(&dcc->pend_list[i]);
2142	INIT_LIST_HEAD(&dcc->wait_list);
2143	INIT_LIST_HEAD(&dcc->fstrim_list);
2144	mutex_init(&dcc->cmd_lock);
2145	atomic_set(&dcc->issued_discard, 0);
2146	atomic_set(&dcc->queued_discard, 0);
2147	atomic_set(&dcc->discard_cmd_cnt, 0);
2148	dcc->nr_discards = 0;
2149	dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
2150	dcc->undiscard_blks = 0;
2151	dcc->next_pos = 0;
2152	dcc->root = RB_ROOT_CACHED;
2153	dcc->rbtree_check = false;
2154
2155	init_waitqueue_head(&dcc->discard_wait_queue);
2156	SM_I(sbi)->dcc_info = dcc;
2157init_thread:
2158	dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
2159				"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
2160	if (IS_ERR(dcc->f2fs_issue_discard)) {
2161		err = PTR_ERR(dcc->f2fs_issue_discard);
2162		kfree(dcc);
2163		SM_I(sbi)->dcc_info = NULL;
2164		return err;
2165	}
2166
2167	return err;
2168}
2169
2170static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
2171{
2172	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
2173
2174	if (!dcc)
2175		return;
2176
2177	f2fs_stop_discard_thread(sbi);
2178
2179	/*
2180	 * Recovery can cache discard commands, so in error path of
2181	 * fill_super(), it needs to give a chance to handle them.
2182	 */
2183	if (unlikely(atomic_read(&dcc->discard_cmd_cnt)))
2184		f2fs_issue_discard_timeout(sbi);
2185
2186	kfree(dcc);
2187	SM_I(sbi)->dcc_info = NULL;
2188}
2189
2190static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
2191{
2192	struct sit_info *sit_i = SIT_I(sbi);
2193
2194	if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
2195		sit_i->dirty_sentries++;
2196		return false;
2197	}
2198
2199	return true;
2200}
2201
2202static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
2203					unsigned int segno, int modified)
2204{
2205	struct seg_entry *se = get_seg_entry(sbi, segno);
2206
2207	se->type = type;
2208	if (modified)
2209		__mark_sit_entry_dirty(sbi, segno);
2210}
2211
2212static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi,
2213								block_t blkaddr)
2214{
2215	unsigned int segno = GET_SEGNO(sbi, blkaddr);
2216
2217	if (segno == NULL_SEGNO)
2218		return 0;
2219	return get_seg_entry(sbi, segno)->mtime;
2220}
2221
2222static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr,
2223						unsigned long long old_mtime)
2224{
2225	struct seg_entry *se;
2226	unsigned int segno = GET_SEGNO(sbi, blkaddr);
2227	unsigned long long ctime = get_mtime(sbi, false);
2228	unsigned long long mtime = old_mtime ? old_mtime : ctime;
2229
2230	if (segno == NULL_SEGNO)
2231		return;
2232
2233	se = get_seg_entry(sbi, segno);
2234
2235	if (!se->mtime)
2236		se->mtime = mtime;
2237	else
2238		se->mtime = div_u64(se->mtime * se->valid_blocks + mtime,
2239						se->valid_blocks + 1);
2240
2241	if (ctime > SIT_I(sbi)->max_mtime)
2242		SIT_I(sbi)->max_mtime = ctime;
2243}
2244
2245static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
2246{
2247	struct seg_entry *se;
2248	unsigned int segno, offset;
2249	long int new_vblocks;
2250	bool exist;
2251#ifdef CONFIG_F2FS_CHECK_FS
2252	bool mir_exist;
2253#endif
2254
2255	segno = GET_SEGNO(sbi, blkaddr);
2256
2257	se = get_seg_entry(sbi, segno);
2258	new_vblocks = se->valid_blocks + del;
2259	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
2260
2261	f2fs_bug_on(sbi, (new_vblocks < 0 ||
2262			(new_vblocks > f2fs_usable_blks_in_seg(sbi, segno))));
2263
2264	se->valid_blocks = new_vblocks;
2265
2266	/* Update valid block bitmap */
2267	if (del > 0) {
2268		exist = f2fs_test_and_set_bit(offset, se->cur_valid_map);
2269#ifdef CONFIG_F2FS_CHECK_FS
2270		mir_exist = f2fs_test_and_set_bit(offset,
2271						se->cur_valid_map_mir);
2272		if (unlikely(exist != mir_exist)) {
2273			f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d",
2274				 blkaddr, exist);
2275			f2fs_bug_on(sbi, 1);
2276		}
2277#endif
2278		if (unlikely(exist)) {
2279			f2fs_err(sbi, "Bitmap was wrongly set, blk:%u",
2280				 blkaddr);
2281			f2fs_bug_on(sbi, 1);
2282			se->valid_blocks--;
2283			del = 0;
2284		}
2285
2286		if (f2fs_block_unit_discard(sbi) &&
2287				!f2fs_test_and_set_bit(offset, se->discard_map))
2288			sbi->discard_blks--;
2289
2290		/*
2291		 * SSR should never reuse block which is checkpointed
2292		 * or newly invalidated.
2293		 */
2294		if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
2295			if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
2296				se->ckpt_valid_blocks++;
2297		}
2298	} else {
2299		exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map);
2300#ifdef CONFIG_F2FS_CHECK_FS
2301		mir_exist = f2fs_test_and_clear_bit(offset,
2302						se->cur_valid_map_mir);
2303		if (unlikely(exist != mir_exist)) {
2304			f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d",
2305				 blkaddr, exist);
2306			f2fs_bug_on(sbi, 1);
2307		}
2308#endif
2309		if (unlikely(!exist)) {
2310			f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u",
2311				 blkaddr);
2312			f2fs_bug_on(sbi, 1);
2313			se->valid_blocks++;
2314			del = 0;
2315		} else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
2316			/*
2317			 * If checkpoints are off, we must not reuse data that
2318			 * was used in the previous checkpoint. If it was used
2319			 * before, we must track that to know how much space we
2320			 * really have.
2321			 */
2322			if (f2fs_test_bit(offset, se->ckpt_valid_map)) {
2323				spin_lock(&sbi->stat_lock);
2324				sbi->unusable_block_count++;
2325				spin_unlock(&sbi->stat_lock);
2326			}
2327		}
2328
2329		if (f2fs_block_unit_discard(sbi) &&
2330			f2fs_test_and_clear_bit(offset, se->discard_map))
2331			sbi->discard_blks++;
2332	}
2333	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
2334		se->ckpt_valid_blocks += del;
2335
2336	__mark_sit_entry_dirty(sbi, segno);
2337
2338	/* update total number of valid blocks to be written in ckpt area */
2339	SIT_I(sbi)->written_valid_blocks += del;
2340
2341	if (__is_large_section(sbi))
2342		get_sec_entry(sbi, segno)->valid_blocks += del;
2343}
2344
2345void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
2346{
2347	unsigned int segno = GET_SEGNO(sbi, addr);
2348	struct sit_info *sit_i = SIT_I(sbi);
2349
2350	f2fs_bug_on(sbi, addr == NULL_ADDR);
2351	if (addr == NEW_ADDR || addr == COMPRESS_ADDR)
2352		return;
2353
2354	invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
2355	f2fs_invalidate_compress_page(sbi, addr);
2356
2357	/* add it into sit main buffer */
2358	down_write(&sit_i->sentry_lock);
2359
2360	update_segment_mtime(sbi, addr, 0);
2361	update_sit_entry(sbi, addr, -1);
2362
2363	/* add it into dirty seglist */
2364	locate_dirty_segment(sbi, segno);
2365
2366	up_write(&sit_i->sentry_lock);
2367}
2368
2369bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
2370{
2371	struct sit_info *sit_i = SIT_I(sbi);
2372	unsigned int segno, offset;
2373	struct seg_entry *se;
2374	bool is_cp = false;
2375
2376	if (!__is_valid_data_blkaddr(blkaddr))
2377		return true;
2378
2379	down_read(&sit_i->sentry_lock);
2380
2381	segno = GET_SEGNO(sbi, blkaddr);
2382	se = get_seg_entry(sbi, segno);
2383	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
2384
2385	if (f2fs_test_bit(offset, se->ckpt_valid_map))
2386		is_cp = true;
2387
2388	up_read(&sit_i->sentry_lock);
2389
2390	return is_cp;
2391}
2392
2393/*
2394 * This function should be resided under the curseg_mutex lock
2395 */
2396static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
2397					struct f2fs_summary *sum)
2398{
2399	struct curseg_info *curseg = CURSEG_I(sbi, type);
2400	void *addr = curseg->sum_blk;
2401
2402	addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
2403	memcpy(addr, sum, sizeof(struct f2fs_summary));
2404}
2405
2406/*
2407 * Calculate the number of current summary pages for writing
2408 */
2409int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
2410{
2411	int valid_sum_count = 0;
2412	int i, sum_in_page;
2413
2414	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
2415		if (sbi->ckpt->alloc_type[i] == SSR)
2416			valid_sum_count += sbi->blocks_per_seg;
2417		else {
2418			if (for_ra)
2419				valid_sum_count += le16_to_cpu(
2420					F2FS_CKPT(sbi)->cur_data_blkoff[i]);
2421			else
2422				valid_sum_count += curseg_blkoff(sbi, i);
2423		}
2424	}
2425
2426	sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
2427			SUM_FOOTER_SIZE) / SUMMARY_SIZE;
2428	if (valid_sum_count <= sum_in_page)
2429		return 1;
2430	else if ((valid_sum_count - sum_in_page) <=
2431		(PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
2432		return 2;
2433	return 3;
2434}
2435
2436/*
2437 * Caller should put this summary page
2438 */
2439struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
2440{
2441	if (unlikely(f2fs_cp_error(sbi)))
2442		return ERR_PTR(-EIO);
2443	return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno));
2444}
2445
2446void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
2447					void *src, block_t blk_addr)
2448{
2449	struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
2450
2451	memcpy(page_address(page), src, PAGE_SIZE);
2452	set_page_dirty(page);
2453	f2fs_put_page(page, 1);
2454}
2455
2456static void write_sum_page(struct f2fs_sb_info *sbi,
2457			struct f2fs_summary_block *sum_blk, block_t blk_addr)
2458{
2459	f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr);
2460}
2461
2462static void write_current_sum_page(struct f2fs_sb_info *sbi,
2463						int type, block_t blk_addr)
2464{
2465	struct curseg_info *curseg = CURSEG_I(sbi, type);
2466	struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
2467	struct f2fs_summary_block *src = curseg->sum_blk;
2468	struct f2fs_summary_block *dst;
2469
2470	dst = (struct f2fs_summary_block *)page_address(page);
2471	memset(dst, 0, PAGE_SIZE);
2472
2473	mutex_lock(&curseg->curseg_mutex);
2474
2475	down_read(&curseg->journal_rwsem);
2476	memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
2477	up_read(&curseg->journal_rwsem);
2478
2479	memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
2480	memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
2481
2482	mutex_unlock(&curseg->curseg_mutex);
2483
2484	set_page_dirty(page);
2485	f2fs_put_page(page, 1);
2486}
2487
2488static int is_next_segment_free(struct f2fs_sb_info *sbi,
2489				struct curseg_info *curseg, int type)
2490{
2491	unsigned int segno = curseg->segno + 1;
2492	struct free_segmap_info *free_i = FREE_I(sbi);
2493
2494	if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
2495		return !test_bit(segno, free_i->free_segmap);
2496	return 0;
2497}
2498
2499/*
2500 * Find a new segment from the free segments bitmap to right order
2501 * This function should be returned with success, otherwise BUG
2502 */
2503static void get_new_segment(struct f2fs_sb_info *sbi,
2504			unsigned int *newseg, bool new_sec, int dir)
2505{
2506	struct free_segmap_info *free_i = FREE_I(sbi);
2507	unsigned int segno, secno, zoneno;
2508	unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
2509	unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
2510	unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
2511	unsigned int left_start = hint;
2512	bool init = true;
2513	int go_left = 0;
2514	int i;
2515
2516	spin_lock(&free_i->segmap_lock);
2517
2518	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
2519		segno = find_next_zero_bit(free_i->free_segmap,
2520			GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
2521		if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
2522			goto got_it;
2523	}
2524find_other_zone:
2525	secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
2526	if (secno >= MAIN_SECS(sbi)) {
2527		if (dir == ALLOC_RIGHT) {
2528			secno = find_next_zero_bit(free_i->free_secmap,
2529							MAIN_SECS(sbi), 0);
2530			f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
2531		} else {
2532			go_left = 1;
2533			left_start = hint - 1;
2534		}
2535	}
2536	if (go_left == 0)
2537		goto skip_left;
2538
2539	while (test_bit(left_start, free_i->free_secmap)) {
2540		if (left_start > 0) {
2541			left_start--;
2542			continue;
2543		}
2544		left_start = find_next_zero_bit(free_i->free_secmap,
2545							MAIN_SECS(sbi), 0);
2546		f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
2547		break;
2548	}
2549	secno = left_start;
2550skip_left:
2551	segno = GET_SEG_FROM_SEC(sbi, secno);
2552	zoneno = GET_ZONE_FROM_SEC(sbi, secno);
2553
2554	/* give up on finding another zone */
2555	if (!init)
2556		goto got_it;
2557	if (sbi->secs_per_zone == 1)
2558		goto got_it;
2559	if (zoneno == old_zoneno)
2560		goto got_it;
2561	if (dir == ALLOC_LEFT) {
2562		if (!go_left && zoneno + 1 >= total_zones)
2563			goto got_it;
2564		if (go_left && zoneno == 0)
2565			goto got_it;
2566	}
2567	for (i = 0; i < NR_CURSEG_TYPE; i++)
2568		if (CURSEG_I(sbi, i)->zone == zoneno)
2569			break;
2570
2571	if (i < NR_CURSEG_TYPE) {
2572		/* zone is in user, try another */
2573		if (go_left)
2574			hint = zoneno * sbi->secs_per_zone - 1;
2575		else if (zoneno + 1 >= total_zones)
2576			hint = 0;
2577		else
2578			hint = (zoneno + 1) * sbi->secs_per_zone;
2579		init = false;
2580		goto find_other_zone;
2581	}
2582got_it:
2583	/* set it as dirty segment in free segmap */
2584	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
2585	__set_inuse(sbi, segno);
2586	*newseg = segno;
2587	spin_unlock(&free_i->segmap_lock);
2588}
2589
2590static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
2591{
2592	struct curseg_info *curseg = CURSEG_I(sbi, type);
2593	struct summary_footer *sum_footer;
2594	unsigned short seg_type = curseg->seg_type;
2595
2596	curseg->inited = true;
2597	curseg->segno = curseg->next_segno;
2598	curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
2599	curseg->next_blkoff = 0;
2600	curseg->next_segno = NULL_SEGNO;
2601
2602	sum_footer = &(curseg->sum_blk->footer);
2603	memset(sum_footer, 0, sizeof(struct summary_footer));
2604
2605	sanity_check_seg_type(sbi, seg_type);
2606
2607	if (IS_DATASEG(seg_type))
2608		SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
2609	if (IS_NODESEG(seg_type))
2610		SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
2611	__set_sit_entry_type(sbi, seg_type, curseg->segno, modified);
2612}
2613
2614static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
2615{
2616	struct curseg_info *curseg = CURSEG_I(sbi, type);
2617	unsigned short seg_type = curseg->seg_type;
2618
2619	sanity_check_seg_type(sbi, seg_type);
2620
2621	/* if segs_per_sec is large than 1, we need to keep original policy. */
2622	if (__is_large_section(sbi))
2623		return curseg->segno;
2624
2625	/* inmem log may not locate on any segment after mount */
2626	if (!curseg->inited)
2627		return 0;
2628
2629	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
2630		return 0;
2631
2632	if (test_opt(sbi, NOHEAP) &&
2633		(seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)))
2634		return 0;
2635
2636	if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
2637		return SIT_I(sbi)->last_victim[ALLOC_NEXT];
2638
2639	/* find segments from 0 to reuse freed segments */
2640	if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
2641		return 0;
2642
2643	return curseg->segno;
2644}
2645
2646/*
2647 * Allocate a current working segment.
2648 * This function always allocates a free segment in LFS manner.
2649 */
2650static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
2651{
2652	struct curseg_info *curseg = CURSEG_I(sbi, type);
2653	unsigned short seg_type = curseg->seg_type;
2654	unsigned int segno = curseg->segno;
2655	int dir = ALLOC_LEFT;
2656
2657	if (curseg->inited)
2658		write_sum_page(sbi, curseg->sum_blk,
2659				GET_SUM_BLOCK(sbi, segno));
2660	if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA)
2661		dir = ALLOC_RIGHT;
2662
2663	if (test_opt(sbi, NOHEAP))
2664		dir = ALLOC_RIGHT;
2665
2666	segno = __get_next_segno(sbi, type);
2667	get_new_segment(sbi, &segno, new_sec, dir);
2668	curseg->next_segno = segno;
2669	reset_curseg(sbi, type, 1);
2670	curseg->alloc_type = LFS;
2671}
2672
2673static int __next_free_blkoff(struct f2fs_sb_info *sbi,
2674					int segno, block_t start)
2675{
2676	struct seg_entry *se = get_seg_entry(sbi, segno);
2677	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
2678	unsigned long *target_map = SIT_I(sbi)->tmp_map;
2679	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
2680	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
2681	int i;
2682
2683	for (i = 0; i < entries; i++)
2684		target_map[i] = ckpt_map[i] | cur_map[i];
2685
2686	return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
2687}
2688
2689/*
2690 * If a segment is written by LFS manner, next block offset is just obtained
2691 * by increasing the current block offset. However, if a segment is written by
2692 * SSR manner, next block offset obtained by calling __next_free_blkoff
2693 */
2694static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
2695				struct curseg_info *seg)
2696{
2697	if (seg->alloc_type == SSR)
2698		seg->next_blkoff =
2699			__next_free_blkoff(sbi, seg->segno,
2700						seg->next_blkoff + 1);
2701	else
2702		seg->next_blkoff++;
2703}
2704
2705bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
2706{
2707	return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
2708}
2709
2710/*
2711 * This function always allocates a used segment(from dirty seglist) by SSR
2712 * manner, so it should recover the existing segment information of valid blocks
2713 */
2714static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
2715{
2716	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
2717	struct curseg_info *curseg = CURSEG_I(sbi, type);
2718	unsigned int new_segno = curseg->next_segno;
2719	struct f2fs_summary_block *sum_node;
2720	struct page *sum_page;
2721
2722	if (flush)
2723		write_sum_page(sbi, curseg->sum_blk,
2724					GET_SUM_BLOCK(sbi, curseg->segno));
2725
2726	__set_test_and_inuse(sbi, new_segno);
2727
2728	mutex_lock(&dirty_i->seglist_lock);
2729	__remove_dirty_segment(sbi, new_segno, PRE);
2730	__remove_dirty_segment(sbi, new_segno, DIRTY);
2731	mutex_unlock(&dirty_i->seglist_lock);
2732
2733	reset_curseg(sbi, type, 1);
2734	curseg->alloc_type = SSR;
2735	curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0);
2736
2737	sum_page = f2fs_get_sum_page(sbi, new_segno);
2738	if (IS_ERR(sum_page)) {
2739		/* GC won't be able to use stale summary pages by cp_error */
2740		memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
2741		return;
2742	}
2743	sum_node = (struct f2fs_summary_block *)page_address(sum_page);
2744	memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
2745	f2fs_put_page(sum_page, 1);
2746}
2747
2748static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
2749				int alloc_mode, unsigned long long age);
2750
2751static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
2752					int target_type, int alloc_mode,
2753					unsigned long long age)
2754{
2755	struct curseg_info *curseg = CURSEG_I(sbi, type);
2756
2757	curseg->seg_type = target_type;
2758
2759	if (get_ssr_segment(sbi, type, alloc_mode, age)) {
2760		struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
2761
2762		curseg->seg_type = se->type;
2763		change_curseg(sbi, type, true);
2764	} else {
2765		/* allocate cold segment by default */
2766		curseg->seg_type = CURSEG_COLD_DATA;
2767		new_curseg(sbi, type, true);
2768	}
2769	stat_inc_seg_type(sbi, curseg);
2770}
2771
2772static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
2773{
2774	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC);
2775
2776	if (!sbi->am.atgc_enabled)
2777		return;
2778
2779	down_read(&SM_I(sbi)->curseg_lock);
2780
2781	mutex_lock(&curseg->curseg_mutex);
2782	down_write(&SIT_I(sbi)->sentry_lock);
2783
2784	get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0);
2785
2786	up_write(&SIT_I(sbi)->sentry_lock);
2787	mutex_unlock(&curseg->curseg_mutex);
2788
2789	up_read(&SM_I(sbi)->curseg_lock);
2790
2791}
2792void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
2793{
2794	__f2fs_init_atgc_curseg(sbi);
2795}
2796
2797static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
2798{
2799	struct curseg_info *curseg = CURSEG_I(sbi, type);
2800
2801	mutex_lock(&curseg->curseg_mutex);
2802	if (!curseg->inited)
2803		goto out;
2804
2805	if (get_valid_blocks(sbi, curseg->segno, false)) {
2806		write_sum_page(sbi, curseg->sum_blk,
2807				GET_SUM_BLOCK(sbi, curseg->segno));
2808	} else {
2809		mutex_lock(&DIRTY_I(sbi)->seglist_lock);
2810		__set_test_and_free(sbi, curseg->segno, true);
2811		mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
2812	}
2813out:
2814	mutex_unlock(&curseg->curseg_mutex);
2815}
2816
2817void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi)
2818{
2819	__f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);
2820
2821	if (sbi->am.atgc_enabled)
2822		__f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC);
2823}
2824
2825static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type)
2826{
2827	struct curseg_info *curseg = CURSEG_I(sbi, type);
2828
2829	mutex_lock(&curseg->curseg_mutex);
2830	if (!curseg->inited)
2831		goto out;
2832	if (get_valid_blocks(sbi, curseg->segno, false))
2833		goto out;
2834
2835	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
2836	__set_test_and_inuse(sbi, curseg->segno);
2837	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
2838out:
2839	mutex_unlock(&curseg->curseg_mutex);
2840}
2841
2842void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi)
2843{
2844	__f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);
2845
2846	if (sbi->am.atgc_enabled)
2847		__f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC);
2848}
2849
2850static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
2851				int alloc_mode, unsigned long long age)
2852{
2853	struct curseg_info *curseg = CURSEG_I(sbi, type);
2854	const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
2855	unsigned segno = NULL_SEGNO;
2856	unsigned short seg_type = curseg->seg_type;
2857	int i, cnt;
2858	bool reversed = false;
2859
2860	sanity_check_seg_type(sbi, seg_type);
2861
2862	/* f2fs_need_SSR() already forces to do this */
2863	if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) {
2864		curseg->next_segno = segno;
2865		return 1;
2866	}
2867
2868	/* For node segments, let's do SSR more intensively */
2869	if (IS_NODESEG(seg_type)) {
2870		if (seg_type >= CURSEG_WARM_NODE) {
2871			reversed = true;
2872			i = CURSEG_COLD_NODE;
2873		} else {
2874			i = CURSEG_HOT_NODE;
2875		}
2876		cnt = NR_CURSEG_NODE_TYPE;
2877	} else {
2878		if (seg_type >= CURSEG_WARM_DATA) {
2879			reversed = true;
2880			i = CURSEG_COLD_DATA;
2881		} else {
2882			i = CURSEG_HOT_DATA;
2883		}
2884		cnt = NR_CURSEG_DATA_TYPE;
2885	}
2886
2887	for (; cnt-- > 0; reversed ? i-- : i++) {
2888		if (i == seg_type)
2889			continue;
2890		if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) {
2891			curseg->next_segno = segno;
2892			return 1;
2893		}
2894	}
2895
2896	/* find valid_blocks=0 in dirty list */
2897	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
2898		segno = get_free_segment(sbi);
2899		if (segno != NULL_SEGNO) {
2900			curseg->next_segno = segno;
2901			return 1;
2902		}
2903	}
2904	return 0;
2905}
2906
2907/*
2908 * flush out current segment and replace it with new segment
2909 * This function should be returned with success, otherwise BUG
2910 */
2911static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
2912						int type, bool force)
2913{
2914	struct curseg_info *curseg = CURSEG_I(sbi, type);
2915
2916	if (force)
2917		new_curseg(sbi, type, true);
2918	else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
2919					curseg->seg_type == CURSEG_WARM_NODE)
2920		new_curseg(sbi, type, false);
2921	else if (curseg->alloc_type == LFS &&
2922			is_next_segment_free(sbi, curseg, type) &&
2923			likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
2924		new_curseg(sbi, type, false);
2925	else if (f2fs_need_SSR(sbi) &&
2926			get_ssr_segment(sbi, type, SSR, 0))
2927		change_curseg(sbi, type, true);
2928	else
2929		new_curseg(sbi, type, false);
2930
2931	stat_inc_seg_type(sbi, curseg);
2932}
2933
2934void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
2935					unsigned int start, unsigned int end)
2936{
2937	struct curseg_info *curseg = CURSEG_I(sbi, type);
2938	unsigned int segno;
2939
2940	down_read(&SM_I(sbi)->curseg_lock);
2941	mutex_lock(&curseg->curseg_mutex);
2942	down_write(&SIT_I(sbi)->sentry_lock);
2943
2944	segno = CURSEG_I(sbi, type)->segno;
2945	if (segno < start || segno > end)
2946		goto unlock;
2947
2948	if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
2949		change_curseg(sbi, type, true);
2950	else
2951		new_curseg(sbi, type, true);
2952
2953	stat_inc_seg_type(sbi, curseg);
2954
2955	locate_dirty_segment(sbi, segno);
2956unlock:
2957	up_write(&SIT_I(sbi)->sentry_lock);
2958
2959	if (segno != curseg->segno)
2960		f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u",
2961			    type, segno, curseg->segno);
2962
2963	mutex_unlock(&curseg->curseg_mutex);
2964	up_read(&SM_I(sbi)->curseg_lock);
2965}
2966
2967static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
2968						bool new_sec, bool force)
2969{
2970	struct curseg_info *curseg = CURSEG_I(sbi, type);
2971	unsigned int old_segno;
2972
2973	if (!curseg->inited)
2974		goto alloc;
2975
2976	if (force || curseg->next_blkoff ||
2977		get_valid_blocks(sbi, curseg->segno, new_sec))
2978		goto alloc;
2979
2980	if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
2981		return;
2982alloc:
2983	old_segno = curseg->segno;
2984	SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
2985	locate_dirty_segment(sbi, old_segno);
2986}
2987
2988static void __allocate_new_section(struct f2fs_sb_info *sbi,
2989						int type, bool force)
2990{
2991	__allocate_new_segment(sbi, type, true, force);
2992}
2993
2994void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
2995{
2996	down_read(&SM_I(sbi)->curseg_lock);
2997	down_write(&SIT_I(sbi)->sentry_lock);
2998	__allocate_new_section(sbi, type, force);
2999	up_write(&SIT_I(sbi)->sentry_lock);
3000	up_read(&SM_I(sbi)->curseg_lock);
3001}
3002
3003void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
3004{
3005	int i;
3006
3007	down_read(&SM_I(sbi)->curseg_lock);
3008	down_write(&SIT_I(sbi)->sentry_lock);
3009	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
3010		__allocate_new_segment(sbi, i, false, false);
3011	up_write(&SIT_I(sbi)->sentry_lock);
3012	up_read(&SM_I(sbi)->curseg_lock);
3013}
3014
3015static const struct segment_allocation default_salloc_ops = {
3016	.allocate_segment = allocate_segment_by_default,
3017};
3018
3019bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
3020						struct cp_control *cpc)
3021{
3022	__u64 trim_start = cpc->trim_start;
3023	bool has_candidate = false;
3024
3025	down_write(&SIT_I(sbi)->sentry_lock);
3026	for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) {
3027		if (add_discard_addrs(sbi, cpc, true)) {
3028			has_candidate = true;
3029			break;
3030		}
3031	}
3032	up_write(&SIT_I(sbi)->sentry_lock);
3033
3034	cpc->trim_start = trim_start;
3035	return has_candidate;
3036}
3037
3038static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi,
3039					struct discard_policy *dpolicy,
3040					unsigned int start, unsigned int end)
3041{
3042	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
3043	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
3044	struct rb_node **insert_p = NULL, *insert_parent = NULL;
3045	struct discard_cmd *dc;
3046	struct blk_plug plug;
3047	int issued;
3048	unsigned int trimmed = 0;
3049
3050next:
3051	issued = 0;
3052
3053	mutex_lock(&dcc->cmd_lock);
3054	if (unlikely(dcc->rbtree_check))
3055		f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
3056							&dcc->root, false));
3057
3058	dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
3059					NULL, start,
3060					(struct rb_entry **)&prev_dc,
3061					(struct rb_entry **)&next_dc,
3062					&insert_p, &insert_parent, true, NULL);
3063	if (!dc)
3064		dc = next_dc;
3065
3066	blk_start_plug(&plug);
3067
3068	while (dc && dc->lstart <= end) {
3069		struct rb_node *node;
3070		int err = 0;
3071
3072		if (dc->len < dpolicy->granularity)
3073			goto skip;
3074
3075		if (dc->state != D_PREP) {
3076			list_move_tail(&dc->list, &dcc->fstrim_list);
3077			goto skip;
3078		}
3079
3080		err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
3081
3082		if (issued >= dpolicy->max_requests) {
3083			start = dc->lstart + dc->len;
3084
3085			if (err)
3086				__remove_discard_cmd(sbi, dc);
3087
3088			blk_finish_plug(&plug);
3089			mutex_unlock(&dcc->cmd_lock);
3090			trimmed += __wait_all_discard_cmd(sbi, NULL);
3091			congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
3092			goto next;
3093		}
3094skip:
3095		node = rb_next(&dc->rb_node);
3096		if (err)
3097			__remove_discard_cmd(sbi, dc);
3098		dc = rb_entry_safe(node, struct discard_cmd, rb_node);
3099
3100		if (fatal_signal_pending(current))
3101			break;
3102	}
3103
3104	blk_finish_plug(&plug);
3105	mutex_unlock(&dcc->cmd_lock);
3106
3107	return trimmed;
3108}
3109
3110int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
3111{
3112	__u64 start = F2FS_BYTES_TO_BLK(range->start);
3113	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
3114	unsigned int start_segno, end_segno;
3115	block_t start_block, end_block;
3116	struct cp_control cpc;
3117	struct discard_policy dpolicy;
3118	unsigned long long trimmed = 0;
3119	int err = 0;
3120	bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
3121
3122	if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
3123		return -EINVAL;
3124
3125	if (end < MAIN_BLKADDR(sbi))
3126		goto out;
3127
3128	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
3129		f2fs_warn(sbi, "Found FS corruption, run fsck to fix.");
3130		return -EFSCORRUPTED;
3131	}
3132
3133	/* start/end segment number in main_area */
3134	start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
3135	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
3136						GET_SEGNO(sbi, end);
3137	if (need_align) {
3138		start_segno = rounddown(start_segno, sbi->segs_per_sec);
3139		end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1;
3140	}
3141
3142	cpc.reason = CP_DISCARD;
3143	cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
3144	cpc.trim_start = start_segno;
3145	cpc.trim_end = end_segno;
3146
3147	if (sbi->discard_blks == 0)
3148		goto out;
3149
3150	down_write(&sbi->gc_lock);
3151	err = f2fs_write_checkpoint(sbi, &cpc);
3152	up_write(&sbi->gc_lock);
3153	if (err)
3154		goto out;
3155
3156	/*
3157	 * We filed discard candidates, but actually we don't need to wait for
3158	 * all of them, since they'll be issued in idle time along with runtime
3159	 * discard option. User configuration looks like using runtime discard
3160	 * or periodic fstrim instead of it.
3161	 */
3162	if (f2fs_realtime_discard_enable(sbi))
3163		goto out;
3164
3165	start_block = START_BLOCK(sbi, start_segno);
3166	end_block = START_BLOCK(sbi, end_segno + 1);
3167
3168	__init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen);
3169	trimmed = __issue_discard_cmd_range(sbi, &dpolicy,
3170					start_block, end_block);
3171
3172	trimmed += __wait_discard_cmd_range(sbi, &dpolicy,
3173					start_block, end_block);
3174out:
3175	if (!err)
3176		range->len = F2FS_BLK_TO_BYTES(trimmed);
3177	return err;
3178}
3179
3180static bool __has_curseg_space(struct f2fs_sb_info *sbi,
3181					struct curseg_info *curseg)
3182{
3183	return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi,
3184							curseg->segno);
3185}
3186
3187int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
3188{
3189	switch (hint) {
3190	case WRITE_LIFE_SHORT:
3191		return CURSEG_HOT_DATA;
3192	case WRITE_LIFE_EXTREME:
3193		return CURSEG_COLD_DATA;
3194	default:
3195		return CURSEG_WARM_DATA;
3196	}
3197}
3198
3199/* This returns write hints for each segment type. This hints will be
3200 * passed down to block layer. There are mapping tables which depend on
3201 * the mount option 'whint_mode'.
3202 *
3203 * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
3204 *
3205 * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
3206 *
3207 * User                  F2FS                     Block
3208 * ----                  ----                     -----
3209 *                       META                     WRITE_LIFE_NOT_SET
3210 *                       HOT_NODE                 "
3211 *                       WARM_NODE                "
3212 *                       COLD_NODE                "
3213 * ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
3214 * extension list        "                        "
3215 *
3216 * -- buffered io
3217 * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
3218 * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
3219 * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
3220 * WRITE_LIFE_NONE       "                        "
3221 * WRITE_LIFE_MEDIUM     "                        "
3222 * WRITE_LIFE_LONG       "                        "
3223 *
3224 * -- direct io
3225 * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
3226 * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
3227 * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
3228 * WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
3229 * WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
3230 * WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
3231 *
3232 * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
3233 *
3234 * User                  F2FS                     Block
3235 * ----                  ----                     -----
3236 *                       META                     WRITE_LIFE_MEDIUM;
3237 *                       HOT_NODE                 WRITE_LIFE_NOT_SET
3238 *                       WARM_NODE                "
3239 *                       COLD_NODE                WRITE_LIFE_NONE
3240 * ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
3241 * extension list        "                        "
3242 *
3243 * -- buffered io
3244 * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
3245 * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
3246 * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_LONG
3247 * WRITE_LIFE_NONE       "                        "
3248 * WRITE_LIFE_MEDIUM     "                        "
3249 * WRITE_LIFE_LONG       "                        "
3250 *
3251 * -- direct io
3252 * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
3253 * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
3254 * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
3255 * WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
3256 * WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
3257 * WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
3258 */
3259
3260enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
3261				enum page_type type, enum temp_type temp)
3262{
3263	if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
3264		if (type == DATA) {
3265			if (temp == WARM)
3266				return WRITE_LIFE_NOT_SET;
3267			else if (temp == HOT)
3268				return WRITE_LIFE_SHORT;
3269			else if (temp == COLD)
3270				return WRITE_LIFE_EXTREME;
3271		} else {
3272			return WRITE_LIFE_NOT_SET;
3273		}
3274	} else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
3275		if (type == DATA) {
3276			if (temp == WARM)
3277				return WRITE_LIFE_LONG;
3278			else if (temp == HOT)
3279				return WRITE_LIFE_SHORT;
3280			else if (temp == COLD)
3281				return WRITE_LIFE_EXTREME;
3282		} else if (type == NODE) {
3283			if (temp == WARM || temp == HOT)
3284				return WRITE_LIFE_NOT_SET;
3285			else if (temp == COLD)
3286				return WRITE_LIFE_NONE;
3287		} else if (type == META) {
3288			return WRITE_LIFE_MEDIUM;
3289		}
3290	}
3291	return WRITE_LIFE_NOT_SET;
3292}
3293
3294static int __get_segment_type_2(struct f2fs_io_info *fio)
3295{
3296	if (fio->type == DATA)
3297		return CURSEG_HOT_DATA;
3298	else
3299		return CURSEG_HOT_NODE;
3300}
3301
3302static int __get_segment_type_4(struct f2fs_io_info *fio)
3303{
3304	if (fio->type == DATA) {
3305		struct inode *inode = fio->page->mapping->host;
3306
3307		if (S_ISDIR(inode->i_mode))
3308			return CURSEG_HOT_DATA;
3309		else
3310			return CURSEG_COLD_DATA;
3311	} else {
3312		if (IS_DNODE(fio->page) && is_cold_node(fio->page))
3313			return CURSEG_WARM_NODE;
3314		else
3315			return CURSEG_COLD_NODE;
3316	}
3317}
3318
3319static int __get_segment_type_6(struct f2fs_io_info *fio)
3320{
3321	if (fio->type == DATA) {
3322		struct inode *inode = fio->page->mapping->host;
3323
3324		if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
3325			return CURSEG_COLD_DATA_PINNED;
3326
3327		if (page_private_gcing(fio->page)) {
3328			if (fio->sbi->am.atgc_enabled &&
3329				(fio->io_type == FS_DATA_IO) &&
3330				(fio->sbi->gc_mode != GC_URGENT_HIGH))
3331				return CURSEG_ALL_DATA_ATGC;
3332			else
3333				return CURSEG_COLD_DATA;
3334		}
3335		if (file_is_cold(inode) || f2fs_need_compress_data(inode))
3336			return CURSEG_COLD_DATA;
3337		if (file_is_hot(inode) ||
3338				is_inode_flag_set(inode, FI_HOT_DATA) ||
3339				f2fs_is_atomic_file(inode) ||
3340				f2fs_is_volatile_file(inode))
3341			return CURSEG_HOT_DATA;
3342		return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
3343	} else {
3344		if (IS_DNODE(fio->page))
3345			return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
3346						CURSEG_HOT_NODE;
3347		return CURSEG_COLD_NODE;
3348	}
3349}
3350
3351static int __get_segment_type(struct f2fs_io_info *fio)
3352{
3353	int type = 0;
3354
3355	switch (F2FS_OPTION(fio->sbi).active_logs) {
3356	case 2:
3357		type = __get_segment_type_2(fio);
3358		break;
3359	case 4:
3360		type = __get_segment_type_4(fio);
3361		break;
3362	case 6:
3363		type = __get_segment_type_6(fio);
3364		break;
3365	default:
3366		f2fs_bug_on(fio->sbi, true);
3367	}
3368
3369	if (IS_HOT(type))
3370		fio->temp = HOT;
3371	else if (IS_WARM(type))
3372		fio->temp = WARM;
3373	else
3374		fio->temp = COLD;
3375	return type;
3376}
3377
3378void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
3379		block_t old_blkaddr, block_t *new_blkaddr,
3380		struct f2fs_summary *sum, int type,
3381		struct f2fs_io_info *fio)
3382{
3383	struct sit_info *sit_i = SIT_I(sbi);
3384	struct curseg_info *curseg = CURSEG_I(sbi, type);
3385	unsigned long long old_mtime;
3386	bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
3387	struct seg_entry *se = NULL;
3388
3389	down_read(&SM_I(sbi)->curseg_lock);
3390
3391	mutex_lock(&curseg->curseg_mutex);
3392	down_write(&sit_i->sentry_lock);
3393
3394	if (from_gc) {
3395		f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
3396		se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
3397		sanity_check_seg_type(sbi, se->type);
3398		f2fs_bug_on(sbi, IS_NODESEG(se->type));
3399	}
3400	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
3401
3402	f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
3403
3404	f2fs_wait_discard_bio(sbi, *new_blkaddr);
3405
3406	/*
3407	 * __add_sum_entry should be resided under the curseg_mutex
3408	 * because, this function updates a summary entry in the
3409	 * current summary block.
3410	 */
3411	__add_sum_entry(sbi, type, sum);
3412
3413	__refresh_next_blkoff(sbi, curseg);
3414
3415	stat_inc_block_count(sbi, curseg);
3416
3417	if (from_gc) {
3418		old_mtime = get_segment_mtime(sbi, old_blkaddr);
3419	} else {
3420		update_segment_mtime(sbi, old_blkaddr, 0);
3421		old_mtime = 0;
3422	}
3423	update_segment_mtime(sbi, *new_blkaddr, old_mtime);
3424
3425	/*
3426	 * SIT information should be updated before segment allocation,
3427	 * since SSR needs latest valid block information.
3428	 */
3429	update_sit_entry(sbi, *new_blkaddr, 1);
3430	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
3431		update_sit_entry(sbi, old_blkaddr, -1);
3432
3433	if (!__has_curseg_space(sbi, curseg)) {
3434		if (from_gc)
3435			get_atssr_segment(sbi, type, se->type,
3436						AT_SSR, se->mtime);
3437		else
3438			sit_i->s_ops->allocate_segment(sbi, type, false);
3439	}
3440	/*
3441	 * segment dirty status should be updated after segment allocation,
3442	 * so we just need to update status only one time after previous
3443	 * segment being closed.
3444	 */
3445	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
3446	locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
3447
3448	up_write(&sit_i->sentry_lock);
3449
3450	if (page && IS_NODESEG(type)) {
3451		fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
3452
3453		f2fs_inode_chksum_set(sbi, page);
3454	}
3455
3456	if (fio) {
3457		struct f2fs_bio_info *io;
3458
3459		if (F2FS_IO_ALIGNED(sbi))
3460			fio->retry = false;
3461
3462		INIT_LIST_HEAD(&fio->list);
3463		fio->in_list = true;
3464		io = sbi->write_io[fio->type] + fio->temp;
3465		spin_lock(&io->io_lock);
3466		list_add_tail(&fio->list, &io->io_list);
3467		spin_unlock(&io->io_lock);
3468	}
3469
3470	mutex_unlock(&curseg->curseg_mutex);
3471
3472	up_read(&SM_I(sbi)->curseg_lock);
3473}
3474
3475static void update_device_state(struct f2fs_io_info *fio)
3476{
3477	struct f2fs_sb_info *sbi = fio->sbi;
3478	unsigned int devidx;
3479
3480	if (!f2fs_is_multi_device(sbi))
3481		return;
3482
3483	devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
3484
3485	/* update device state for fsync */
3486	f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
3487
3488	/* update device state for checkpoint */
3489	if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
3490		spin_lock(&sbi->dev_lock);
3491		f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
3492		spin_unlock(&sbi->dev_lock);
3493	}
3494}
3495
3496static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
3497{
3498	int type = __get_segment_type(fio);
3499	bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
3500
3501	if (keep_order)
3502		down_read(&fio->sbi->io_order_lock);
3503reallocate:
3504	f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
3505			&fio->new_blkaddr, sum, type, fio);
3506	if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) {
3507		invalidate_mapping_pages(META_MAPPING(fio->sbi),
3508					fio->old_blkaddr, fio->old_blkaddr);
3509		f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr);
3510	}
3511
3512	/* writeout dirty page into bdev */
3513	f2fs_submit_page_write(fio);
3514	if (fio->retry) {
3515		fio->old_blkaddr = fio->new_blkaddr;
3516		goto reallocate;
3517	}
3518
3519	update_device_state(fio);
3520
3521	if (keep_order)
3522		up_read(&fio->sbi->io_order_lock);
3523}
3524
3525void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
3526					enum iostat_type io_type)
3527{
3528	struct f2fs_io_info fio = {
3529		.sbi = sbi,
3530		.type = META,
3531		.temp = HOT,
3532		.op = REQ_OP_WRITE,
3533		.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
3534		.old_blkaddr = page->index,
3535		.new_blkaddr = page->index,
3536		.page = page,
3537		.encrypted_page = NULL,
3538		.in_list = false,
3539	};
3540
3541	if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
3542		fio.op_flags &= ~REQ_META;
3543
3544	set_page_writeback(page);
3545	ClearPageError(page);
3546	f2fs_submit_page_write(&fio);
3547
3548	stat_inc_meta_count(sbi, page->index);
3549	f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
3550}
3551
3552void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
3553{
3554	struct f2fs_summary sum;
3555
3556	set_summary(&sum, nid, 0, 0);
3557	do_write_page(&sum, fio);
3558
3559	f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
3560}
3561
3562void f2fs_outplace_write_data(struct dnode_of_data *dn,
3563					struct f2fs_io_info *fio)
3564{
3565	struct f2fs_sb_info *sbi = fio->sbi;
3566	struct f2fs_summary sum;
3567
3568	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
3569	set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version);
3570	do_write_page(&sum, fio);
3571	f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
3572
3573	f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
3574}
3575
3576int f2fs_inplace_write_data(struct f2fs_io_info *fio)
3577{
3578	int err;
3579	struct f2fs_sb_info *sbi = fio->sbi;
3580	unsigned int segno;
3581
3582	fio->new_blkaddr = fio->old_blkaddr;
3583	/* i/o temperature is needed for passing down write hints */
3584	__get_segment_type(fio);
3585
3586	segno = GET_SEGNO(sbi, fio->new_blkaddr);
3587
3588	if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) {
3589		set_sbi_flag(sbi, SBI_NEED_FSCK);
3590		f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
3591			  __func__, segno);
3592		err = -EFSCORRUPTED;
3593		goto drop_bio;
3594	}
3595
3596	if (f2fs_cp_error(sbi)) {
3597		err = -EIO;
3598		goto drop_bio;
3599	}
3600
3601	stat_inc_inplace_blocks(fio->sbi);
3602
3603	if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE)))
3604		err = f2fs_merge_page_bio(fio);
3605	else
3606		err = f2fs_submit_page_bio(fio);
3607	if (!err) {
3608		update_device_state(fio);
3609		f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
3610	}
3611
3612	return err;
3613drop_bio:
3614	if (fio->bio && *(fio->bio)) {
3615		struct bio *bio = *(fio->bio);
3616
3617		bio->bi_status = BLK_STS_IOERR;
3618		bio_endio(bio);
3619		*(fio->bio) = NULL;
3620	}
3621	return err;
3622}
3623
3624static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
3625						unsigned int segno)
3626{
3627	int i;
3628
3629	for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
3630		if (CURSEG_I(sbi, i)->segno == segno)
3631			break;
3632	}
3633	return i;
3634}
3635
3636void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
3637				block_t old_blkaddr, block_t new_blkaddr,
3638				bool recover_curseg, bool recover_newaddr,
3639				bool from_gc)
3640{
3641	struct sit_info *sit_i = SIT_I(sbi);
3642	struct curseg_info *curseg;
3643	unsigned int segno, old_cursegno;
3644	struct seg_entry *se;
3645	int type;
3646	unsigned short old_blkoff;
3647	unsigned char old_alloc_type;
3648
3649	segno = GET_SEGNO(sbi, new_blkaddr);
3650	se = get_seg_entry(sbi, segno);
3651	type = se->type;
3652
3653	down_write(&SM_I(sbi)->curseg_lock);
3654
3655	if (!recover_curseg) {
3656		/* for recovery flow */
3657		if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
3658			if (old_blkaddr == NULL_ADDR)
3659				type = CURSEG_COLD_DATA;
3660			else
3661				type = CURSEG_WARM_DATA;
3662		}
3663	} else {
3664		if (IS_CURSEG(sbi, segno)) {
3665			/* se->type is volatile as SSR allocation */
3666			type = __f2fs_get_curseg(sbi, segno);
3667			f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
3668		} else {
3669			type = CURSEG_WARM_DATA;
3670		}
3671	}
3672
3673	f2fs_bug_on(sbi, !IS_DATASEG(type));
3674	curseg = CURSEG_I(sbi, type);
3675
3676	mutex_lock(&curseg->curseg_mutex);
3677	down_write(&sit_i->sentry_lock);
3678
3679	old_cursegno = curseg->segno;
3680	old_blkoff = curseg->next_blkoff;
3681	old_alloc_type = curseg->alloc_type;
3682
3683	/* change the current segment */
3684	if (segno != curseg->segno) {
3685		curseg->next_segno = segno;
3686		change_curseg(sbi, type, true);
3687	}
3688
3689	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
3690	__add_sum_entry(sbi, type, sum);
3691
3692	if (!recover_curseg || recover_newaddr) {
3693		if (!from_gc)
3694			update_segment_mtime(sbi, new_blkaddr, 0);
3695		update_sit_entry(sbi, new_blkaddr, 1);
3696	}
3697	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
3698		invalidate_mapping_pages(META_MAPPING(sbi),
3699					old_blkaddr, old_blkaddr);
3700		f2fs_invalidate_compress_page(sbi, old_blkaddr);
3701		if (!from_gc)
3702			update_segment_mtime(sbi, old_blkaddr, 0);
3703		update_sit_entry(sbi, old_blkaddr, -1);
3704	}
3705
3706	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
3707	locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
3708
3709	locate_dirty_segment(sbi, old_cursegno);
3710
3711	if (recover_curseg) {
3712		if (old_cursegno != curseg->segno) {
3713			curseg->next_segno = old_cursegno;
3714			change_curseg(sbi, type, true);
3715		}
3716		curseg->next_blkoff = old_blkoff;
3717		curseg->alloc_type = old_alloc_type;
3718	}
3719
3720	up_write(&sit_i->sentry_lock);
3721	mutex_unlock(&curseg->curseg_mutex);
3722	up_write(&SM_I(sbi)->curseg_lock);
3723}
3724
3725void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
3726				block_t old_addr, block_t new_addr,
3727				unsigned char version, bool recover_curseg,
3728				bool recover_newaddr)
3729{
3730	struct f2fs_summary sum;
3731
3732	set_summary(&sum, dn->nid, dn->ofs_in_node, version);
3733
3734	f2fs_do_replace_block(sbi, &sum, old_addr, new_addr,
3735					recover_curseg, recover_newaddr, false);
3736
3737	f2fs_update_data_blkaddr(dn, new_addr);
3738}
3739
3740void f2fs_wait_on_page_writeback(struct page *page,
3741				enum page_type type, bool ordered, bool locked)
3742{
3743	if (PageWriteback(page)) {
3744		struct f2fs_sb_info *sbi = F2FS_P_SB(page);
3745
3746		/* submit cached LFS IO */
3747		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
3748		/* sbumit cached IPU IO */
3749		f2fs_submit_merged_ipu_write(sbi, NULL, page);
3750		if (ordered) {
3751			wait_on_page_writeback(page);
3752			f2fs_bug_on(sbi, locked && PageWriteback(page));
3753		} else {
3754			wait_for_stable_page(page);
3755		}
3756	}
3757}
3758
3759void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
3760{
3761	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
3762	struct page *cpage;
3763
3764	if (!f2fs_post_read_required(inode))
3765		return;
3766
3767	if (!__is_valid_data_blkaddr(blkaddr))
3768		return;
3769
3770	cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
3771	if (cpage) {
3772		f2fs_wait_on_page_writeback(cpage, DATA, true, true);
3773		f2fs_put_page(cpage, 1);
3774	}
3775}
3776
3777void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
3778								block_t len)
3779{
3780	block_t i;
3781
3782	for (i = 0; i < len; i++)
3783		f2fs_wait_on_block_writeback(inode, blkaddr + i);
3784}
3785
3786static int read_compacted_summaries(struct f2fs_sb_info *sbi)
3787{
3788	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
3789	struct curseg_info *seg_i;
3790	unsigned char *kaddr;
3791	struct page *page;
3792	block_t start;
3793	int i, j, offset;
3794
3795	start = start_sum_block(sbi);
3796
3797	page = f2fs_get_meta_page(sbi, start++);
3798	if (IS_ERR(page))
3799		return PTR_ERR(page);
3800	kaddr = (unsigned char *)page_address(page);
3801
3802	/* Step 1: restore nat cache */
3803	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
3804	memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
3805
3806	/* Step 2: restore sit cache */
3807	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
3808	memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
3809	offset = 2 * SUM_JOURNAL_SIZE;
3810
3811	/* Step 3: restore summary entries */
3812	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
3813		unsigned short blk_off;
3814		unsigned int segno;
3815
3816		seg_i = CURSEG_I(sbi, i);
3817		segno = le32_to_cpu(ckpt->cur_data_segno[i]);
3818		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
3819		seg_i->next_segno = segno;
3820		reset_curseg(sbi, i, 0);
3821		seg_i->alloc_type = ckpt->alloc_type[i];
3822		seg_i->next_blkoff = blk_off;
3823
3824		if (seg_i->alloc_type == SSR)
3825			blk_off = sbi->blocks_per_seg;
3826
3827		for (j = 0; j < blk_off; j++) {
3828			struct f2fs_summary *s;
3829
3830			s = (struct f2fs_summary *)(kaddr + offset);
3831			seg_i->sum_blk->entries[j] = *s;
3832			offset += SUMMARY_SIZE;
3833			if (offset + SUMMARY_SIZE <= PAGE_SIZE -
3834						SUM_FOOTER_SIZE)
3835				continue;
3836
3837			f2fs_put_page(page, 1);
3838			page = NULL;
3839
3840			page = f2fs_get_meta_page(sbi, start++);
3841			if (IS_ERR(page))
3842				return PTR_ERR(page);
3843			kaddr = (unsigned char *)page_address(page);
3844			offset = 0;
3845		}
3846	}
3847	f2fs_put_page(page, 1);
3848	return 0;
3849}
3850
3851static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
3852{
3853	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
3854	struct f2fs_summary_block *sum;
3855	struct curseg_info *curseg;
3856	struct page *new;
3857	unsigned short blk_off;
3858	unsigned int segno = 0;
3859	block_t blk_addr = 0;
3860	int err = 0;
3861
3862	/* get segment number and block addr */
3863	if (IS_DATASEG(type)) {
3864		segno = le32_to_cpu(ckpt->cur_data_segno[type]);
3865		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
3866							CURSEG_HOT_DATA]);
3867		if (__exist_node_summaries(sbi))
3868			blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type);
3869		else
3870			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
3871	} else {
3872		segno = le32_to_cpu(ckpt->cur_node_segno[type -
3873							CURSEG_HOT_NODE]);
3874		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
3875							CURSEG_HOT_NODE]);
3876		if (__exist_node_summaries(sbi))
3877			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
3878							type - CURSEG_HOT_NODE);
3879		else
3880			blk_addr = GET_SUM_BLOCK(sbi, segno);
3881	}
3882
3883	new = f2fs_get_meta_page(sbi, blk_addr);
3884	if (IS_ERR(new))
3885		return PTR_ERR(new);
3886	sum = (struct f2fs_summary_block *)page_address(new);
3887
3888	if (IS_NODESEG(type)) {
3889		if (__exist_node_summaries(sbi)) {
3890			struct f2fs_summary *ns = &sum->entries[0];
3891			int i;
3892
3893			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
3894				ns->version = 0;
3895				ns->ofs_in_node = 0;
3896			}
3897		} else {
3898			err = f2fs_restore_node_summary(sbi, segno, sum);
3899			if (err)
3900				goto out;
3901		}
3902	}
3903
3904	/* set uncompleted segment to curseg */
3905	curseg = CURSEG_I(sbi, type);
3906	mutex_lock(&curseg->curseg_mutex);
3907
3908	/* update journal info */
3909	down_write(&curseg->journal_rwsem);
3910	memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
3911	up_write(&curseg->journal_rwsem);
3912
3913	memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
3914	memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
3915	curseg->next_segno = segno;
3916	reset_curseg(sbi, type, 0);
3917	curseg->alloc_type = ckpt->alloc_type[type];
3918	curseg->next_blkoff = blk_off;
3919	mutex_unlock(&curseg->curseg_mutex);
3920out:
3921	f2fs_put_page(new, 1);
3922	return err;
3923}
3924
3925static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
3926{
3927	struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal;
3928	struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal;
3929	int type = CURSEG_HOT_DATA;
3930	int err;
3931
3932	if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) {
3933		int npages = f2fs_npages_for_summary_flush(sbi, true);
3934
3935		if (npages >= 2)
3936			f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages,
3937							META_CP, true);
3938
3939		/* restore for compacted data summary */
3940		err = read_compacted_summaries(sbi);
3941		if (err)
3942			return err;
3943		type = CURSEG_HOT_NODE;
3944	}
3945
3946	if (__exist_node_summaries(sbi))
3947		f2fs_ra_meta_pages(sbi,
3948				sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type),
3949				NR_CURSEG_PERSIST_TYPE - type, META_CP, true);
3950
3951	for (; type <= CURSEG_COLD_NODE; type++) {
3952		err = read_normal_summaries(sbi, type);
3953		if (err)
3954			return err;
3955	}
3956
3957	/* sanity check for summary blocks */
3958	if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES ||
3959			sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) {
3960		f2fs_err(sbi, "invalid journal entries nats %u sits %u",
3961			 nats_in_cursum(nat_j), sits_in_cursum(sit_j));
3962		return -EINVAL;
3963	}
3964
3965	return 0;
3966}
3967
3968static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
3969{
3970	struct page *page;
3971	unsigned char *kaddr;
3972	struct f2fs_summary *summary;
3973	struct curseg_info *seg_i;
3974	int written_size = 0;
3975	int i, j;
3976
3977	page = f2fs_grab_meta_page(sbi, blkaddr++);
3978	kaddr = (unsigned char *)page_address(page);
3979	memset(kaddr, 0, PAGE_SIZE);
3980
3981	/* Step 1: write nat cache */
3982	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
3983	memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
3984	written_size += SUM_JOURNAL_SIZE;
3985
3986	/* Step 2: write sit cache */
3987	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
3988	memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
3989	written_size += SUM_JOURNAL_SIZE;
3990
3991	/* Step 3: write summary entries */
3992	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
3993		unsigned short blkoff;
3994
3995		seg_i = CURSEG_I(sbi, i);
3996		if (sbi->ckpt->alloc_type[i] == SSR)
3997			blkoff = sbi->blocks_per_seg;
3998		else
3999			blkoff = curseg_blkoff(sbi, i);
4000
4001		for (j = 0; j < blkoff; j++) {
4002			if (!page) {
4003				page = f2fs_grab_meta_page(sbi, blkaddr++);
4004				kaddr = (unsigned char *)page_address(page);
4005				memset(kaddr, 0, PAGE_SIZE);
4006				written_size = 0;
4007			}
4008			summary = (struct f2fs_summary *)(kaddr + written_size);
4009			*summary = seg_i->sum_blk->entries[j];
4010			written_size += SUMMARY_SIZE;
4011
4012			if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
4013							SUM_FOOTER_SIZE)
4014				continue;
4015
4016			set_page_dirty(page);
4017			f2fs_put_page(page, 1);
4018			page = NULL;
4019		}
4020	}
4021	if (page) {
4022		set_page_dirty(page);
4023		f2fs_put_page(page, 1);
4024	}
4025}
4026
4027static void write_normal_summaries(struct f2fs_sb_info *sbi,
4028					block_t blkaddr, int type)
4029{
4030	int i, end;
4031
4032	if (IS_DATASEG(type))
4033		end = type + NR_CURSEG_DATA_TYPE;
4034	else
4035		end = type + NR_CURSEG_NODE_TYPE;
4036
4037	for (i = type; i < end; i++)
4038		write_current_sum_page(sbi, i, blkaddr + (i - type));
4039}
4040
4041void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
4042{
4043	if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG))
4044		write_compacted_summaries(sbi, start_blk);
4045	else
4046		write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
4047}
4048
4049void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
4050{
4051	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
4052}
4053
4054int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
4055					unsigned int val, int alloc)
4056{
4057	int i;
4058
4059	if (type == NAT_JOURNAL) {
4060		for (i = 0; i < nats_in_cursum(journal); i++) {
4061			if (le32_to_cpu(nid_in_journal(journal, i)) == val)
4062				return i;
4063		}
4064		if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
4065			return update_nats_in_cursum(journal, 1);
4066	} else if (type == SIT_JOURNAL) {
4067		for (i = 0; i < sits_in_cursum(journal); i++)
4068			if (le32_to_cpu(segno_in_journal(journal, i)) == val)
4069				return i;
4070		if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
4071			return update_sits_in_cursum(journal, 1);
4072	}
4073	return -1;
4074}
4075
4076static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
4077					unsigned int segno)
4078{
4079	return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno));
4080}
4081
4082static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
4083					unsigned int start)
4084{
4085	struct sit_info *sit_i = SIT_I(sbi);
4086	struct page *page;
4087	pgoff_t src_off, dst_off;
4088
4089	src_off = current_sit_addr(sbi, start);
4090	dst_off = next_sit_addr(sbi, src_off);
4091
4092	page = f2fs_grab_meta_page(sbi, dst_off);
4093	seg_info_to_sit_page(sbi, page, start);
4094
4095	set_page_dirty(page);
4096	set_to_next_sit(sit_i, start);
4097
4098	return page;
4099}
4100
4101static struct sit_entry_set *grab_sit_entry_set(void)
4102{
4103	struct sit_entry_set *ses =
4104			f2fs_kmem_cache_alloc(sit_entry_set_slab,
4105						GFP_NOFS, true, NULL);
4106
4107	ses->entry_cnt = 0;
4108	INIT_LIST_HEAD(&ses->set_list);
4109	return ses;
4110}
4111
4112static void release_sit_entry_set(struct sit_entry_set *ses)
4113{
4114	list_del(&ses->set_list);
4115	kmem_cache_free(sit_entry_set_slab, ses);
4116}
4117
4118static void adjust_sit_entry_set(struct sit_entry_set *ses,
4119						struct list_head *head)
4120{
4121	struct sit_entry_set *next = ses;
4122
4123	if (list_is_last(&ses->set_list, head))
4124		return;
4125
4126	list_for_each_entry_continue(next, head, set_list)
4127		if (ses->entry_cnt <= next->entry_cnt)
4128			break;
4129
4130	list_move_tail(&ses->set_list, &next->set_list);
4131}
4132
4133static void add_sit_entry(unsigned int segno, struct list_head *head)
4134{
4135	struct sit_entry_set *ses;
4136	unsigned int start_segno = START_SEGNO(segno);
4137
4138	list_for_each_entry(ses, head, set_list) {
4139		if (ses->start_segno == start_segno) {
4140			ses->entry_cnt++;
4141			adjust_sit_entry_set(ses, head);
4142			return;
4143		}
4144	}
4145
4146	ses = grab_sit_entry_set();
4147
4148	ses->start_segno = start_segno;
4149	ses->entry_cnt++;
4150	list_add(&ses->set_list, head);
4151}
4152
4153static void add_sits_in_set(struct f2fs_sb_info *sbi)
4154{
4155	struct f2fs_sm_info *sm_info = SM_I(sbi);
4156	struct list_head *set_list = &sm_info->sit_entry_set;
4157	unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
4158	unsigned int segno;
4159
4160	for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
4161		add_sit_entry(segno, set_list);
4162}
4163
4164static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
4165{
4166	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
4167	struct f2fs_journal *journal = curseg->journal;
4168	int i;
4169
4170	down_write(&curseg->journal_rwsem);
4171	for (i = 0; i < sits_in_cursum(journal); i++) {
4172		unsigned int segno;
4173		bool dirtied;
4174
4175		segno = le32_to_cpu(segno_in_journal(journal, i));
4176		dirtied = __mark_sit_entry_dirty(sbi, segno);
4177
4178		if (!dirtied)
4179			add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
4180	}
4181	update_sits_in_cursum(journal, -i);
4182	up_write(&curseg->journal_rwsem);
4183}
4184
4185/*
4186 * CP calls this function, which flushes SIT entries including sit_journal,
4187 * and moves prefree segs to free segs.
4188 */
4189void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
4190{
4191	struct sit_info *sit_i = SIT_I(sbi);
4192	unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
4193	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
4194	struct f2fs_journal *journal = curseg->journal;
4195	struct sit_entry_set *ses, *tmp;
4196	struct list_head *head = &SM_I(sbi)->sit_entry_set;
4197	bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS);
4198	struct seg_entry *se;
4199
4200	down_write(&sit_i->sentry_lock);
4201
4202	if (!sit_i->dirty_sentries)
4203		goto out;
4204
4205	/*
4206	 * add and account sit entries of dirty bitmap in sit entry
4207	 * set temporarily
4208	 */
4209	add_sits_in_set(sbi);
4210
4211	/*
4212	 * if there are no enough space in journal to store dirty sit
4213	 * entries, remove all entries from journal and add and account
4214	 * them in sit entry set.
4215	 */
4216	if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) ||
4217								!to_journal)
4218		remove_sits_in_journal(sbi);
4219
4220	/*
4221	 * there are two steps to flush sit entries:
4222	 * #1, flush sit entries to journal in current cold data summary block.
4223	 * #2, flush sit entries to sit page.
4224	 */
4225	list_for_each_entry_safe(ses, tmp, head, set_list) {
4226		struct page *page = NULL;
4227		struct f2fs_sit_block *raw_sit = NULL;
4228		unsigned int start_segno = ses->start_segno;
4229		unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
4230						(unsigned long)MAIN_SEGS(sbi));
4231		unsigned int segno = start_segno;
4232
4233		if (to_journal &&
4234			!__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
4235			to_journal = false;
4236
4237		if (to_journal) {
4238			down_write(&curseg->journal_rwsem);
4239		} else {
4240			page = get_next_sit_page(sbi, start_segno);
4241			raw_sit = page_address(page);
4242		}
4243
4244		/* flush dirty sit entries in region of current sit set */
4245		for_each_set_bit_from(segno, bitmap, end) {
4246			int offset, sit_offset;
4247
4248			se = get_seg_entry(sbi, segno);
4249#ifdef CONFIG_F2FS_CHECK_FS
4250			if (memcmp(se->cur_valid_map, se->cur_valid_map_mir,
4251						SIT_VBLOCK_MAP_SIZE))
4252				f2fs_bug_on(sbi, 1);
4253#endif
4254
4255			/* add discard candidates */
4256			if (!(cpc->reason & CP_DISCARD)) {
4257				cpc->trim_start = segno;
4258				add_discard_addrs(sbi, cpc, false);
4259			}
4260
4261			if (to_journal) {
4262				offset = f2fs_lookup_journal_in_cursum(journal,
4263							SIT_JOURNAL, segno, 1);
4264				f2fs_bug_on(sbi, offset < 0);
4265				segno_in_journal(journal, offset) =
4266							cpu_to_le32(segno);
4267				seg_info_to_raw_sit(se,
4268					&sit_in_journal(journal, offset));
4269				check_block_count(sbi, segno,
4270					&sit_in_journal(journal, offset));
4271			} else {
4272				sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
4273				seg_info_to_raw_sit(se,
4274						&raw_sit->entries[sit_offset]);
4275				check_block_count(sbi, segno,
4276						&raw_sit->entries[sit_offset]);
4277			}
4278
4279			__clear_bit(segno, bitmap);
4280			sit_i->dirty_sentries--;
4281			ses->entry_cnt--;
4282		}
4283
4284		if (to_journal)
4285			up_write(&curseg->journal_rwsem);
4286		else
4287			f2fs_put_page(page, 1);
4288
4289		f2fs_bug_on(sbi, ses->entry_cnt);
4290		release_sit_entry_set(ses);
4291	}
4292
4293	f2fs_bug_on(sbi, !list_empty(head));
4294	f2fs_bug_on(sbi, sit_i->dirty_sentries);
4295out:
4296	if (cpc->reason & CP_DISCARD) {
4297		__u64 trim_start = cpc->trim_start;
4298
4299		for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
4300			add_discard_addrs(sbi, cpc, false);
4301
4302		cpc->trim_start = trim_start;
4303	}
4304	up_write(&sit_i->sentry_lock);
4305
4306	set_prefree_as_free_segments(sbi);
4307}
4308
4309static int build_sit_info(struct f2fs_sb_info *sbi)
4310{
4311	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
4312	struct sit_info *sit_i;
4313	unsigned int sit_segs, start;
4314	char *src_bitmap, *bitmap;
4315	unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size;
4316	unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0;
4317
4318	/* allocate memory for SIT information */
4319	sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL);
4320	if (!sit_i)
4321		return -ENOMEM;
4322
4323	SM_I(sbi)->sit_info = sit_i;
4324
4325	sit_i->sentries =
4326		f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry),
4327					      MAIN_SEGS(sbi)),
4328			      GFP_KERNEL);
4329	if (!sit_i->sentries)
4330		return -ENOMEM;
4331
4332	main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
4333	sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size,
4334								GFP_KERNEL);
4335	if (!sit_i->dirty_sentries_bitmap)
4336		return -ENOMEM;
4337
4338#ifdef CONFIG_F2FS_CHECK_FS
4339	bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map);
4340#else
4341	bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map);
4342#endif
4343	sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
4344	if (!sit_i->bitmap)
4345		return -ENOMEM;
4346
4347	bitmap = sit_i->bitmap;
4348
4349	for (start = 0; start < MAIN_SEGS(sbi); start++) {
4350		sit_i->sentries[start].cur_valid_map = bitmap;
4351		bitmap += SIT_VBLOCK_MAP_SIZE;
4352
4353		sit_i->sentries[start].ckpt_valid_map = bitmap;
4354		bitmap += SIT_VBLOCK_MAP_SIZE;
4355
4356#ifdef CONFIG_F2FS_CHECK_FS
4357		sit_i->sentries[start].cur_valid_map_mir = bitmap;
4358		bitmap += SIT_VBLOCK_MAP_SIZE;
4359#endif
4360
4361		if (discard_map) {
4362			sit_i->sentries[start].discard_map = bitmap;
4363			bitmap += SIT_VBLOCK_MAP_SIZE;
4364		}
4365	}
4366
4367	sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
4368	if (!sit_i->tmp_map)
4369		return -ENOMEM;
4370
4371	if (__is_large_section(sbi)) {
4372		sit_i->sec_entries =
4373			f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry),
4374						      MAIN_SECS(sbi)),
4375				      GFP_KERNEL);
4376		if (!sit_i->sec_entries)
4377			return -ENOMEM;
4378	}
4379
4380	/* get information related with SIT */
4381	sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
4382
4383	/* setup SIT bitmap from ckeckpoint pack */
4384	sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
4385	src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
4386
4387	sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL);
4388	if (!sit_i->sit_bitmap)
4389		return -ENOMEM;
4390
4391#ifdef CONFIG_F2FS_CHECK_FS
4392	sit_i->sit_bitmap_mir = kmemdup(src_bitmap,
4393					sit_bitmap_size, GFP_KERNEL);
4394	if (!sit_i->sit_bitmap_mir)
4395		return -ENOMEM;
4396
4397	sit_i->invalid_segmap = f2fs_kvzalloc(sbi,
4398					main_bitmap_size, GFP_KERNEL);
4399	if (!sit_i->invalid_segmap)
4400		return -ENOMEM;
4401#endif
4402
4403	/* init SIT information */
4404	sit_i->s_ops = &default_salloc_ops;
4405
4406	sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
4407	sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
4408	sit_i->written_valid_blocks = 0;
4409	sit_i->bitmap_size = sit_bitmap_size;
4410	sit_i->dirty_sentries = 0;
4411	sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
4412	sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
4413	sit_i->mounted_time = ktime_get_boottime_seconds();
4414	init_rwsem(&sit_i->sentry_lock);
4415	return 0;
4416}
4417
4418static int build_free_segmap(struct f2fs_sb_info *sbi)
4419{
4420	struct free_segmap_info *free_i;
4421	unsigned int bitmap_size, sec_bitmap_size;
4422
4423	/* allocate memory for free segmap information */
4424	free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL);
4425	if (!free_i)
4426		return -ENOMEM;
4427
4428	SM_I(sbi)->free_info = free_i;
4429
4430	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
4431	free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL);
4432	if (!free_i->free_segmap)
4433		return -ENOMEM;
4434
4435	sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
4436	free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL);
4437	if (!free_i->free_secmap)
4438		return -ENOMEM;
4439
4440	/* set all segments as dirty temporarily */
4441	memset(free_i->free_segmap, 0xff, bitmap_size);
4442	memset(free_i->free_secmap, 0xff, sec_bitmap_size);
4443
4444	/* init free segmap information */
4445	free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
4446	free_i->free_segments = 0;
4447	free_i->free_sections = 0;
4448	spin_lock_init(&free_i->segmap_lock);
4449	return 0;
4450}
4451
4452static int build_curseg(struct f2fs_sb_info *sbi)
4453{
4454	struct curseg_info *array;
4455	int i;
4456
4457	array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE,
4458					sizeof(*array)), GFP_KERNEL);
4459	if (!array)
4460		return -ENOMEM;
4461
4462	SM_I(sbi)->curseg_array = array;
4463
4464	for (i = 0; i < NO_CHECK_TYPE; i++) {
4465		mutex_init(&array[i].curseg_mutex);
4466		array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
4467		if (!array[i].sum_blk)
4468			return -ENOMEM;
4469		init_rwsem(&array[i].journal_rwsem);
4470		array[i].journal = f2fs_kzalloc(sbi,
4471				sizeof(struct f2fs_journal), GFP_KERNEL);
4472		if (!array[i].journal)
4473			return -ENOMEM;
4474		if (i < NR_PERSISTENT_LOG)
4475			array[i].seg_type = CURSEG_HOT_DATA + i;
4476		else if (i == CURSEG_COLD_DATA_PINNED)
4477			array[i].seg_type = CURSEG_COLD_DATA;
4478		else if (i == CURSEG_ALL_DATA_ATGC)
4479			array[i].seg_type = CURSEG_COLD_DATA;
4480		array[i].segno = NULL_SEGNO;
4481		array[i].next_blkoff = 0;
4482		array[i].inited = false;
4483	}
4484	return restore_curseg_summaries(sbi);
4485}
4486
4487static int build_sit_entries(struct f2fs_sb_info *sbi)
4488{
4489	struct sit_info *sit_i = SIT_I(sbi);
4490	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
4491	struct f2fs_journal *journal = curseg->journal;
4492	struct seg_entry *se;
4493	struct f2fs_sit_entry sit;
4494	int sit_blk_cnt = SIT_BLK_CNT(sbi);
4495	unsigned int i, start, end;
4496	unsigned int readed, start_blk = 0;
4497	int err = 0;
4498	block_t total_node_blocks = 0;
4499
4500	do {
4501		readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS,
4502							META_SIT, true);
4503
4504		start = start_blk * sit_i->sents_per_block;
4505		end = (start_blk + readed) * sit_i->sents_per_block;
4506
4507		for (; start < end && start < MAIN_SEGS(sbi); start++) {
4508			struct f2fs_sit_block *sit_blk;
4509			struct page *page;
4510
4511			se = &sit_i->sentries[start];
4512			page = get_current_sit_page(sbi, start);
4513			if (IS_ERR(page))
4514				return PTR_ERR(page);
4515			sit_blk = (struct f2fs_sit_block *)page_address(page);
4516			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
4517			f2fs_put_page(page, 1);
4518
4519			err = check_block_count(sbi, start, &sit);
4520			if (err)
4521				return err;
4522			seg_info_from_raw_sit(se, &sit);
4523			if (IS_NODESEG(se->type))
4524				total_node_blocks += se->valid_blocks;
4525
4526			if (f2fs_block_unit_discard(sbi)) {
4527				/* build discard map only one time */
4528				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
4529					memset(se->discard_map, 0xff,
4530						SIT_VBLOCK_MAP_SIZE);
4531				} else {
4532					memcpy(se->discard_map,
4533						se->cur_valid_map,
4534						SIT_VBLOCK_MAP_SIZE);
4535					sbi->discard_blks +=
4536						sbi->blocks_per_seg -
4537						se->valid_blocks;
4538				}
4539			}
4540
4541			if (__is_large_section(sbi))
4542				get_sec_entry(sbi, start)->valid_blocks +=
4543							se->valid_blocks;
4544		}
4545		start_blk += readed;
4546	} while (start_blk < sit_blk_cnt);
4547
4548	down_read(&curseg->journal_rwsem);
4549	for (i = 0; i < sits_in_cursum(journal); i++) {
4550		unsigned int old_valid_blocks;
4551
4552		start = le32_to_cpu(segno_in_journal(journal, i));
4553		if (start >= MAIN_SEGS(sbi)) {
4554			f2fs_err(sbi, "Wrong journal entry on segno %u",
4555				 start);
4556			err = -EFSCORRUPTED;
4557			break;
4558		}
4559
4560		se = &sit_i->sentries[start];
4561		sit = sit_in_journal(journal, i);
4562
4563		old_valid_blocks = se->valid_blocks;
4564		if (IS_NODESEG(se->type))
4565			total_node_blocks -= old_valid_blocks;
4566
4567		err = check_block_count(sbi, start, &sit);
4568		if (err)
4569			break;
4570		seg_info_from_raw_sit(se, &sit);
4571		if (IS_NODESEG(se->type))
4572			total_node_blocks += se->valid_blocks;
4573
4574		if (f2fs_block_unit_discard(sbi)) {
4575			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
4576				memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
4577			} else {
4578				memcpy(se->discard_map, se->cur_valid_map,
4579							SIT_VBLOCK_MAP_SIZE);
4580				sbi->discard_blks += old_valid_blocks;
4581				sbi->discard_blks -= se->valid_blocks;
4582			}
4583		}
4584
4585		if (__is_large_section(sbi)) {
4586			get_sec_entry(sbi, start)->valid_blocks +=
4587							se->valid_blocks;
4588			get_sec_entry(sbi, start)->valid_blocks -=
4589							old_valid_blocks;
4590		}
4591	}
4592	up_read(&curseg->journal_rwsem);
4593
4594	if (!err && total_node_blocks != valid_node_count(sbi)) {
4595		f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
4596			 total_node_blocks, valid_node_count(sbi));
4597		err = -EFSCORRUPTED;
4598	}
4599
4600	return err;
4601}
4602
4603static void init_free_segmap(struct f2fs_sb_info *sbi)
4604{
4605	unsigned int start;
4606	int type;
4607	struct seg_entry *sentry;
4608
4609	for (start = 0; start < MAIN_SEGS(sbi); start++) {
4610		if (f2fs_usable_blks_in_seg(sbi, start) == 0)
4611			continue;
4612		sentry = get_seg_entry(sbi, start);
4613		if (!sentry->valid_blocks)
4614			__set_free(sbi, start);
4615		else
4616			SIT_I(sbi)->written_valid_blocks +=
4617						sentry->valid_blocks;
4618	}
4619
4620	/* set use the current segments */
4621	for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
4622		struct curseg_info *curseg_t = CURSEG_I(sbi, type);
4623
4624		__set_test_and_inuse(sbi, curseg_t->segno);
4625	}
4626}
4627
4628static void init_dirty_segmap(struct f2fs_sb_info *sbi)
4629{
4630	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
4631	struct free_segmap_info *free_i = FREE_I(sbi);
4632	unsigned int segno = 0, offset = 0, secno;
4633	block_t valid_blocks, usable_blks_in_seg;
4634	block_t blks_per_sec = BLKS_PER_SEC(sbi);
4635
4636	while (1) {
4637		/* find dirty segment based on free segmap */
4638		segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
4639		if (segno >= MAIN_SEGS(sbi))
4640			break;
4641		offset = segno + 1;
4642		valid_blocks = get_valid_blocks(sbi, segno, false);
4643		usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
4644		if (valid_blocks == usable_blks_in_seg || !valid_blocks)
4645			continue;
4646		if (valid_blocks > usable_blks_in_seg) {
4647			f2fs_bug_on(sbi, 1);
4648			continue;
4649		}
4650		mutex_lock(&dirty_i->seglist_lock);
4651		__locate_dirty_segment(sbi, segno, DIRTY);
4652		mutex_unlock(&dirty_i->seglist_lock);
4653	}
4654
4655	if (!__is_large_section(sbi))
4656		return;
4657
4658	mutex_lock(&dirty_i->seglist_lock);
4659	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
4660		valid_blocks = get_valid_blocks(sbi, segno, true);
4661		secno = GET_SEC_FROM_SEG(sbi, segno);
4662
4663		if (!valid_blocks || valid_blocks == blks_per_sec)
4664			continue;
4665		if (IS_CURSEC(sbi, secno))
4666			continue;
4667		set_bit(secno, dirty_i->dirty_secmap);
4668	}
4669	mutex_unlock(&dirty_i->seglist_lock);
4670}
4671
4672static int init_victim_secmap(struct f2fs_sb_info *sbi)
4673{
4674	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
4675	unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
4676
4677	dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
4678	if (!dirty_i->victim_secmap)
4679		return -ENOMEM;
4680	return 0;
4681}
4682
4683static int build_dirty_segmap(struct f2fs_sb_info *sbi)
4684{
4685	struct dirty_seglist_info *dirty_i;
4686	unsigned int bitmap_size, i;
4687
4688	/* allocate memory for dirty segments list information */
4689	dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info),
4690								GFP_KERNEL);
4691	if (!dirty_i)
4692		return -ENOMEM;
4693
4694	SM_I(sbi)->dirty_info = dirty_i;
4695	mutex_init(&dirty_i->seglist_lock);
4696
4697	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
4698
4699	for (i = 0; i < NR_DIRTY_TYPE; i++) {
4700		dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size,
4701								GFP_KERNEL);
4702		if (!dirty_i->dirty_segmap[i])
4703			return -ENOMEM;
4704	}
4705
4706	if (__is_large_section(sbi)) {
4707		bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
4708		dirty_i->dirty_secmap = f2fs_kvzalloc(sbi,
4709						bitmap_size, GFP_KERNEL);
4710		if (!dirty_i->dirty_secmap)
4711			return -ENOMEM;
4712	}
4713
4714	init_dirty_segmap(sbi);
4715	return init_victim_secmap(sbi);
4716}
4717
4718static int sanity_check_curseg(struct f2fs_sb_info *sbi)
4719{
4720	int i;
4721
4722	/*
4723	 * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr;
4724	 * In LFS curseg, all blkaddr after .next_blkoff should be unused.
4725	 */
4726	for (i = 0; i < NR_PERSISTENT_LOG; i++) {
4727		struct curseg_info *curseg = CURSEG_I(sbi, i);
4728		struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
4729		unsigned int blkofs = curseg->next_blkoff;
4730
4731		if (f2fs_sb_has_readonly(sbi) &&
4732			i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE)
4733			continue;
4734
4735		sanity_check_seg_type(sbi, curseg->seg_type);
4736
4737		if (f2fs_test_bit(blkofs, se->cur_valid_map))
4738			goto out;
4739
4740		if (curseg->alloc_type == SSR)
4741			continue;
4742
4743		for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
4744			if (!f2fs_test_bit(blkofs, se->cur_valid_map))
4745				continue;
4746out:
4747			f2fs_err(sbi,
4748				 "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u",
4749				 i, curseg->segno, curseg->alloc_type,
4750				 curseg->next_blkoff, blkofs);
4751			return -EFSCORRUPTED;
4752		}
4753	}
4754	return 0;
4755}
4756
4757#ifdef CONFIG_BLK_DEV_ZONED
4758
4759static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
4760				    struct f2fs_dev_info *fdev,
4761				    struct blk_zone *zone)
4762{
4763	unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno;
4764	block_t zone_block, wp_block, last_valid_block;
4765	unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
4766	int i, s, b, ret;
4767	struct seg_entry *se;
4768
4769	if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
4770		return 0;
4771
4772	wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block);
4773	wp_segno = GET_SEGNO(sbi, wp_block);
4774	wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
4775	zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block);
4776	zone_segno = GET_SEGNO(sbi, zone_block);
4777	zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno);
4778
4779	if (zone_segno >= MAIN_SEGS(sbi))
4780		return 0;
4781
4782	/*
4783	 * Skip check of zones cursegs point to, since
4784	 * fix_curseg_write_pointer() checks them.
4785	 */
4786	for (i = 0; i < NO_CHECK_TYPE; i++)
4787		if (zone_secno == GET_SEC_FROM_SEG(sbi,
4788						   CURSEG_I(sbi, i)->segno))
4789			return 0;
4790
4791	/*
4792	 * Get last valid block of the zone.
4793	 */
4794	last_valid_block = zone_block - 1;
4795	for (s = sbi->segs_per_sec - 1; s >= 0; s--) {
4796		segno = zone_segno + s;
4797		se = get_seg_entry(sbi, segno);
4798		for (b = sbi->blocks_per_seg - 1; b >= 0; b--)
4799			if (f2fs_test_bit(b, se->cur_valid_map)) {
4800				last_valid_block = START_BLOCK(sbi, segno) + b;
4801				break;
4802			}
4803		if (last_valid_block >= zone_block)
4804			break;
4805	}
4806
4807	/*
4808	 * If last valid block is beyond the write pointer, report the
4809	 * inconsistency. This inconsistency does not cause write error
4810	 * because the zone will not be selected for write operation until
4811	 * it get discarded. Just report it.
4812	 */
4813	if (last_valid_block >= wp_block) {
4814		f2fs_notice(sbi, "Valid block beyond write pointer: "
4815			    "valid block[0x%x,0x%x] wp[0x%x,0x%x]",
4816			    GET_SEGNO(sbi, last_valid_block),
4817			    GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
4818			    wp_segno, wp_blkoff);
4819		return 0;
4820	}
4821
4822	/*
4823	 * If there is no valid block in the zone and if write pointer is
4824	 * not at zone start, reset the write pointer.
4825	 */
4826	if (last_valid_block + 1 == zone_block && zone->wp != zone->start) {
4827		f2fs_notice(sbi,
4828			    "Zone without valid block has non-zero write "
4829			    "pointer. Reset the write pointer: wp[0x%x,0x%x]",
4830			    wp_segno, wp_blkoff);
4831		ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
4832					zone->len >> log_sectors_per_block);
4833		if (ret) {
4834			f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
4835				 fdev->path, ret);
4836			return ret;
4837		}
4838	}
4839
4840	return 0;
4841}
4842
4843static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi,
4844						  block_t zone_blkaddr)
4845{
4846	int i;
4847
4848	for (i = 0; i < sbi->s_ndevs; i++) {
4849		if (!bdev_is_zoned(FDEV(i).bdev))
4850			continue;
4851		if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr &&
4852				zone_blkaddr <= FDEV(i).end_blk))
4853			return &FDEV(i);
4854	}
4855
4856	return NULL;
4857}
4858
4859static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
4860			      void *data)
4861{
4862	memcpy(data, zone, sizeof(struct blk_zone));
4863	return 0;
4864}
4865
4866static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
4867{
4868	struct curseg_info *cs = CURSEG_I(sbi, type);
4869	struct f2fs_dev_info *zbd;
4870	struct blk_zone zone;
4871	unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off;
4872	block_t cs_zone_block, wp_block;
4873	unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
4874	sector_t zone_sector;
4875	int err;
4876
4877	cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
4878	cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
4879
4880	zbd = get_target_zoned_dev(sbi, cs_zone_block);
4881	if (!zbd)
4882		return 0;
4883
4884	/* report zone for the sector the curseg points to */
4885	zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
4886		<< log_sectors_per_block;
4887	err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
4888				  report_one_zone_cb, &zone);
4889	if (err != 1) {
4890		f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
4891			 zbd->path, err);
4892		return err;
4893	}
4894
4895	if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
4896		return 0;
4897
4898	wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
4899	wp_segno = GET_SEGNO(sbi, wp_block);
4900	wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
4901	wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
4902
4903	if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
4904		wp_sector_off == 0)
4905		return 0;
4906
4907	f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
4908		    "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
4909		    type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
4910
4911	f2fs_notice(sbi, "Assign new section to curseg[%d]: "
4912		    "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
4913
4914	f2fs_allocate_new_section(sbi, type, true);
4915
4916	/* check consistency of the zone curseg pointed to */
4917	if (check_zone_write_pointer(sbi, zbd, &zone))
4918		return -EIO;
4919
4920	/* check newly assigned zone */
4921	cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
4922	cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
4923
4924	zbd = get_target_zoned_dev(sbi, cs_zone_block);
4925	if (!zbd)
4926		return 0;
4927
4928	zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
4929		<< log_sectors_per_block;
4930	err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
4931				  report_one_zone_cb, &zone);
4932	if (err != 1) {
4933		f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
4934			 zbd->path, err);
4935		return err;
4936	}
4937
4938	if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
4939		return 0;
4940
4941	if (zone.wp != zone.start) {
4942		f2fs_notice(sbi,
4943			    "New zone for curseg[%d] is not yet discarded. "
4944			    "Reset the zone: curseg[0x%x,0x%x]",
4945			    type, cs->segno, cs->next_blkoff);
4946		err = __f2fs_issue_discard_zone(sbi, zbd->bdev,
4947				zone_sector >> log_sectors_per_block,
4948				zone.len >> log_sectors_per_block);
4949		if (err) {
4950			f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
4951				 zbd->path, err);
4952			return err;
4953		}
4954	}
4955
4956	return 0;
4957}
4958
4959int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
4960{
4961	int i, ret;
4962
4963	for (i = 0; i < NR_PERSISTENT_LOG; i++) {
4964		ret = fix_curseg_write_pointer(sbi, i);
4965		if (ret)
4966			return ret;
4967	}
4968
4969	return 0;
4970}
4971
4972struct check_zone_write_pointer_args {
4973	struct f2fs_sb_info *sbi;
4974	struct f2fs_dev_info *fdev;
4975};
4976
4977static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx,
4978				      void *data)
4979{
4980	struct check_zone_write_pointer_args *args;
4981
4982	args = (struct check_zone_write_pointer_args *)data;
4983
4984	return check_zone_write_pointer(args->sbi, args->fdev, zone);
4985}
4986
4987int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
4988{
4989	int i, ret;
4990	struct check_zone_write_pointer_args args;
4991
4992	for (i = 0; i < sbi->s_ndevs; i++) {
4993		if (!bdev_is_zoned(FDEV(i).bdev))
4994			continue;
4995
4996		args.sbi = sbi;
4997		args.fdev = &FDEV(i);
4998		ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES,
4999					  check_zone_write_pointer_cb, &args);
5000		if (ret < 0)
5001			return ret;
5002	}
5003
5004	return 0;
5005}
5006
5007static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx,
5008						unsigned int dev_idx)
5009{
5010	if (!bdev_is_zoned(FDEV(dev_idx).bdev))
5011		return true;
5012	return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq);
5013}
5014
5015/* Return the zone index in the given device */
5016static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno,
5017					int dev_idx)
5018{
5019	block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
5020
5021	return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >>
5022						sbi->log_blocks_per_blkz;
5023}
5024
5025/*
5026 * Return the usable segments in a section based on the zone's
5027 * corresponding zone capacity. Zone is equal to a section.
5028 */
5029static inline unsigned int f2fs_usable_zone_segs_in_sec(
5030		struct f2fs_sb_info *sbi, unsigned int segno)
5031{
5032	unsigned int dev_idx, zone_idx, unusable_segs_in_sec;
5033
5034	dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno));
5035	zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx);
5036
5037	/* Conventional zone's capacity is always equal to zone size */
5038	if (is_conv_zone(sbi, zone_idx, dev_idx))
5039		return sbi->segs_per_sec;
5040
5041	/*
5042	 * If the zone_capacity_blocks array is NULL, then zone capacity
5043	 * is equal to the zone size for all zones
5044	 */
5045	if (!FDEV(dev_idx).zone_capacity_blocks)
5046		return sbi->segs_per_sec;
5047
5048	/* Get the segment count beyond zone capacity block */
5049	unusable_segs_in_sec = (sbi->blocks_per_blkz -
5050				FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >>
5051				sbi->log_blocks_per_seg;
5052	return sbi->segs_per_sec - unusable_segs_in_sec;
5053}
5054
5055/*
5056 * Return the number of usable blocks in a segment. The number of blocks
5057 * returned is always equal to the number of blocks in a segment for
5058 * segments fully contained within a sequential zone capacity or a
5059 * conventional zone. For segments partially contained in a sequential
5060 * zone capacity, the number of usable blocks up to the zone capacity
5061 * is returned. 0 is returned in all other cases.
5062 */
5063static inline unsigned int f2fs_usable_zone_blks_in_seg(
5064			struct f2fs_sb_info *sbi, unsigned int segno)
5065{
5066	block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr;
5067	unsigned int zone_idx, dev_idx, secno;
5068
5069	secno = GET_SEC_FROM_SEG(sbi, segno);
5070	seg_start = START_BLOCK(sbi, segno);
5071	dev_idx = f2fs_target_device_index(sbi, seg_start);
5072	zone_idx = get_zone_idx(sbi, secno, dev_idx);
5073
5074	/*
5075	 * Conventional zone's capacity is always equal to zone size,
5076	 * so, blocks per segment is unchanged.
5077	 */
5078	if (is_conv_zone(sbi, zone_idx, dev_idx))
5079		return sbi->blocks_per_seg;
5080
5081	if (!FDEV(dev_idx).zone_capacity_blocks)
5082		return sbi->blocks_per_seg;
5083
5084	sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
5085	sec_cap_blkaddr = sec_start_blkaddr +
5086				FDEV(dev_idx).zone_capacity_blocks[zone_idx];
5087
5088	/*
5089	 * If segment starts before zone capacity and spans beyond
5090	 * zone capacity, then usable blocks are from seg start to
5091	 * zone capacity. If the segment starts after the zone capacity,
5092	 * then there are no usable blocks.
5093	 */
5094	if (seg_start >= sec_cap_blkaddr)
5095		return 0;
5096	if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr)
5097		return sec_cap_blkaddr - seg_start;
5098
5099	return sbi->blocks_per_seg;
5100}
5101#else
5102int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
5103{
5104	return 0;
5105}
5106
5107int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
5108{
5109	return 0;
5110}
5111
5112static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi,
5113							unsigned int segno)
5114{
5115	return 0;
5116}
5117
5118static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi,
5119							unsigned int segno)
5120{
5121	return 0;
5122}
5123#endif
5124unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
5125					unsigned int segno)
5126{
5127	if (f2fs_sb_has_blkzoned(sbi))
5128		return f2fs_usable_zone_blks_in_seg(sbi, segno);
5129
5130	return sbi->blocks_per_seg;
5131}
5132
5133unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
5134					unsigned int segno)
5135{
5136	if (f2fs_sb_has_blkzoned(sbi))
5137		return f2fs_usable_zone_segs_in_sec(sbi, segno);
5138
5139	return sbi->segs_per_sec;
5140}
5141
5142/*
5143 * Update min, max modified time for cost-benefit GC algorithm
5144 */
5145static void init_min_max_mtime(struct f2fs_sb_info *sbi)
5146{
5147	struct sit_info *sit_i = SIT_I(sbi);
5148	unsigned int segno;
5149
5150	down_write(&sit_i->sentry_lock);
5151
5152	sit_i->min_mtime = ULLONG_MAX;
5153
5154	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
5155		unsigned int i;
5156		unsigned long long mtime = 0;
5157
5158		for (i = 0; i < sbi->segs_per_sec; i++)
5159			mtime += get_seg_entry(sbi, segno + i)->mtime;
5160
5161		mtime = div_u64(mtime, sbi->segs_per_sec);
5162
5163		if (sit_i->min_mtime > mtime)
5164			sit_i->min_mtime = mtime;
5165	}
5166	sit_i->max_mtime = get_mtime(sbi, false);
5167	sit_i->dirty_max_mtime = 0;
5168	up_write(&sit_i->sentry_lock);
5169}
5170
5171int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
5172{
5173	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
5174	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
5175	struct f2fs_sm_info *sm_info;
5176	int err;
5177
5178	sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL);
5179	if (!sm_info)
5180		return -ENOMEM;
5181
5182	/* init sm info */
5183	sbi->sm_info = sm_info;
5184	sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
5185	sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
5186	sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
5187	sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
5188	sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
5189	sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
5190	sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
5191	sm_info->rec_prefree_segments = sm_info->main_segments *
5192					DEF_RECLAIM_PREFREE_SEGMENTS / 100;
5193	if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
5194		sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
5195
5196	if (!f2fs_lfs_mode(sbi))
5197		sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
5198	sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
5199	sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
5200	sm_info->min_seq_blocks = sbi->blocks_per_seg;
5201	sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
5202	sm_info->min_ssr_sections = reserved_sections(sbi);
5203
5204	INIT_LIST_HEAD(&sm_info->sit_entry_set);
5205
5206	init_rwsem(&sm_info->curseg_lock);
5207
5208	if (!f2fs_readonly(sbi->sb)) {
5209		err = f2fs_create_flush_cmd_control(sbi);
5210		if (err)
5211			return err;
5212	}
5213
5214	err = create_discard_cmd_control(sbi);
5215	if (err)
5216		return err;
5217
5218	err = build_sit_info(sbi);
5219	if (err)
5220		return err;
5221	err = build_free_segmap(sbi);
5222	if (err)
5223		return err;
5224	err = build_curseg(sbi);
5225	if (err)
5226		return err;
5227
5228	/* reinit free segmap based on SIT */
5229	err = build_sit_entries(sbi);
5230	if (err)
5231		return err;
5232
5233	init_free_segmap(sbi);
5234	err = build_dirty_segmap(sbi);
5235	if (err)
5236		return err;
5237
5238	err = sanity_check_curseg(sbi);
5239	if (err)
5240		return err;
5241
5242	init_min_max_mtime(sbi);
5243	return 0;
5244}
5245
5246static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
5247		enum dirty_type dirty_type)
5248{
5249	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
5250
5251	mutex_lock(&dirty_i->seglist_lock);
5252	kvfree(dirty_i->dirty_segmap[dirty_type]);
5253	dirty_i->nr_dirty[dirty_type] = 0;
5254	mutex_unlock(&dirty_i->seglist_lock);
5255}
5256
5257static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
5258{
5259	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
5260
5261	kvfree(dirty_i->victim_secmap);
5262}
5263
5264static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
5265{
5266	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
5267	int i;
5268
5269	if (!dirty_i)
5270		return;
5271
5272	/* discard pre-free/dirty segments list */
5273	for (i = 0; i < NR_DIRTY_TYPE; i++)
5274		discard_dirty_segmap(sbi, i);
5275
5276	if (__is_large_section(sbi)) {
5277		mutex_lock(&dirty_i->seglist_lock);
5278		kvfree(dirty_i->dirty_secmap);
5279		mutex_unlock(&dirty_i->seglist_lock);
5280	}
5281
5282	destroy_victim_secmap(sbi);
5283	SM_I(sbi)->dirty_info = NULL;
5284	kfree(dirty_i);
5285}
5286
5287static void destroy_curseg(struct f2fs_sb_info *sbi)
5288{
5289	struct curseg_info *array = SM_I(sbi)->curseg_array;
5290	int i;
5291
5292	if (!array)
5293		return;
5294	SM_I(sbi)->curseg_array = NULL;
5295	for (i = 0; i < NR_CURSEG_TYPE; i++) {
5296		kfree(array[i].sum_blk);
5297		kfree(array[i].journal);
5298	}
5299	kfree(array);
5300}
5301
5302static void destroy_free_segmap(struct f2fs_sb_info *sbi)
5303{
5304	struct free_segmap_info *free_i = SM_I(sbi)->free_info;
5305
5306	if (!free_i)
5307		return;
5308	SM_I(sbi)->free_info = NULL;
5309	kvfree(free_i->free_segmap);
5310	kvfree(free_i->free_secmap);
5311	kfree(free_i);
5312}
5313
5314static void destroy_sit_info(struct f2fs_sb_info *sbi)
5315{
5316	struct sit_info *sit_i = SIT_I(sbi);
5317
5318	if (!sit_i)
5319		return;
5320
5321	if (sit_i->sentries)
5322		kvfree(sit_i->bitmap);
5323	kfree(sit_i->tmp_map);
5324
5325	kvfree(sit_i->sentries);
5326	kvfree(sit_i->sec_entries);
5327	kvfree(sit_i->dirty_sentries_bitmap);
5328
5329	SM_I(sbi)->sit_info = NULL;
5330	kvfree(sit_i->sit_bitmap);
5331#ifdef CONFIG_F2FS_CHECK_FS
5332	kvfree(sit_i->sit_bitmap_mir);
5333	kvfree(sit_i->invalid_segmap);
5334#endif
5335	kfree(sit_i);
5336}
5337
5338void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
5339{
5340	struct f2fs_sm_info *sm_info = SM_I(sbi);
5341
5342	if (!sm_info)
5343		return;
5344	f2fs_destroy_flush_cmd_control(sbi, true);
5345	destroy_discard_cmd_control(sbi);
5346	destroy_dirty_segmap(sbi);
5347	destroy_curseg(sbi);
5348	destroy_free_segmap(sbi);
5349	destroy_sit_info(sbi);
5350	sbi->sm_info = NULL;
5351	kfree(sm_info);
5352}
5353
5354int __init f2fs_create_segment_manager_caches(void)
5355{
5356	discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry",
5357			sizeof(struct discard_entry));
5358	if (!discard_entry_slab)
5359		goto fail;
5360
5361	discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd",
5362			sizeof(struct discard_cmd));
5363	if (!discard_cmd_slab)
5364		goto destroy_discard_entry;
5365
5366	sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set",
5367			sizeof(struct sit_entry_set));
5368	if (!sit_entry_set_slab)
5369		goto destroy_discard_cmd;
5370
5371	inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
5372			sizeof(struct inmem_pages));
5373	if (!inmem_entry_slab)
5374		goto destroy_sit_entry_set;
5375	return 0;
5376
5377destroy_sit_entry_set:
5378	kmem_cache_destroy(sit_entry_set_slab);
5379destroy_discard_cmd:
5380	kmem_cache_destroy(discard_cmd_slab);
5381destroy_discard_entry:
5382	kmem_cache_destroy(discard_entry_slab);
5383fail:
5384	return -ENOMEM;
5385}
5386
5387void f2fs_destroy_segment_manager_caches(void)
5388{
5389	kmem_cache_destroy(sit_entry_set_slab);
5390	kmem_cache_destroy(discard_cmd_slab);
5391	kmem_cache_destroy(discard_entry_slab);
5392	kmem_cache_destroy(inmem_entry_slab);
5393}
5394