// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	- July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - May 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-ioprio.h"

struct dentry *blk_debugfs_root;

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);

static DEFINE_IDA(blk_queue_ida);

/*
 * For queue allocation
 */
static struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

/**
 * blk_queue_flag_set - atomically set a queue flag
 * @flag: flag to be set
 * @q: request queue
 */
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
	set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_set);

/**
 * blk_queue_flag_clear - atomically clear a queue flag
 * @flag: flag to be cleared
 * @q: request queue
 */
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
	clear_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_clear);

/**
 * blk_queue_flag_test_and_set - atomically test and set a queue flag
 * @flag: flag to be set
 * @q: request queue
 *
 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
 * the flag was already set.
 */
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
{
	return test_and_set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
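
/*
 * Example usage (illustrative sketch, not taken from an in-tree caller):
 * the test-and-set variant lets a caller do one-time work when a flag
 * first transitions, e.g.:
 *
 *	if (!blk_queue_flag_test_and_set(QUEUE_FLAG_SAME_COMP, q))
 *		pr_debug("flag was previously clear, do one-time setup\n");
 *
 * QUEUE_FLAG_SAME_COMP is only a stand-in; any queue flag behaves the same
 * way, with the return value giving the flag's previous state.
 */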

#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
	REQ_OP_NAME(READ),
	REQ_OP_NAME(WRITE),
	REQ_OP_NAME(FLUSH),
	REQ_OP_NAME(DISCARD),
	REQ_OP_NAME(SECURE_ERASE),
	REQ_OP_NAME(ZONE_RESET),
	REQ_OP_NAME(ZONE_RESET_ALL),
	REQ_OP_NAME(ZONE_OPEN),
	REQ_OP_NAME(ZONE_CLOSE),
	REQ_OP_NAME(ZONE_FINISH),
	REQ_OP_NAME(ZONE_APPEND),
	REQ_OP_NAME(WRITE_ZEROES),
	REQ_OP_NAME(DRV_IN),
	REQ_OP_NAME(DRV_OUT),
};
#undef REQ_OP_NAME

/**
 * blk_op_str - return the string name of a REQ_OP_XXX operation
 * @op: REQ_OP_XXX.
 *
 * Description: Centralized block layer helper to convert a REQ_OP_XXX value
 * into its string representation. Useful when debugging and tracing bios or
 * requests. For an invalid REQ_OP_XXX it returns the string "UNKNOWN".
 */
inline const char *blk_op_str(enum req_op op)
{
	const char *op_str = "UNKNOWN";

	if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
		op_str = blk_op_name[op];

	return op_str;
}
EXPORT_SYMBOL_GPL(blk_op_str);

static const struct {
	int		errno;
	const char	*name;
} blk_errors[] = {
	[BLK_STS_OK]		= { 0,		"" },
	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
	[BLK_STS_RESV_CONFLICT]	= { -EBADE,	"reservation conflict" },
	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
	[BLK_STS_DEV_RESOURCE]	= { -EBUSY,	"device resource" },
	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
	[BLK_STS_OFFLINE]	= { -ENODEV,	"device offline" },

	/* device mapper special case, should not leak out: */
	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },

	/* zone device specific errors */
	[BLK_STS_ZONE_OPEN_RESOURCE]	= { -ETOOMANYREFS, "open zones exceeded" },
	[BLK_STS_ZONE_ACTIVE_RESOURCE]	= { -EOVERFLOW, "active zones exceeded" },

	/* Command duration limit device-side timeout */
	[BLK_STS_DURATION_LIMIT]	= { -ETIME, "duration limit exceeded" },

	/* everything else not covered above: */
	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
};

blk_status_t errno_to_blk_status(int errno)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
		if (blk_errors[i].errno == errno)
			return (__force blk_status_t)i;
	}

	return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);

int blk_status_to_errno(blk_status_t status)
{
	int idx = (__force int)status;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return -EIO;
	return blk_errors[idx].errno;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);

const char *blk_status_to_str(blk_status_t status)
{
	int idx = (__force int)status;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return "<null>";
	return blk_errors[idx].name;
}
EXPORT_SYMBOL_GPL(blk_status_to_str);
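
/*
 * Example usage (illustrative sketch, not taken from an in-tree caller):
 * drivers typically move between the errno and blk_status_t spaces with
 * the helpers above, e.g.:
 *
 *	blk_status_t sts = errno_to_blk_status(-ENOSPC);
 *
 *	pr_debug("request failed: %s (%d)\n",
 *		 blk_status_to_str(sts), blk_status_to_errno(sts));
 *
 * Unknown errno values fall back to BLK_STS_IOERR, and out-of-range status
 * values map back to -EIO / "<null>".
 */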

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->submit_bio will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);
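
/*
 * Illustrative sketch (not taken from an in-tree caller): a driver tearing
 * down a disk typically stops new submissions first and only then cancels
 * the remaining asynchronous work, roughly:
 *
 *	del_gendisk(disk);
 *	blk_sync_queue(disk->queue);
 *
 * The exact teardown sequence is driver specific; this only shows where
 * blk_sync_queue() fits relative to stopping new I/O.
 */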

/**
 * blk_set_pm_only - increment pm_only counter
 * @q: request queue pointer
 */
void blk_set_pm_only(struct request_queue *q)
{
	atomic_inc(&q->pm_only);
}
EXPORT_SYMBOL_GPL(blk_set_pm_only);

void blk_clear_pm_only(struct request_queue *q)
{
	int pm_only;

	pm_only = atomic_dec_return(&q->pm_only);
	WARN_ON_ONCE(pm_only < 0);
	if (pm_only == 0)
		wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);

static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
	struct request_queue *q = container_of(rcu_head,
			struct request_queue, rcu_head);

	percpu_ref_exit(&q->q_usage_counter);
	kmem_cache_free(blk_requestq_cachep, q);
}

static void blk_free_queue(struct request_queue *q)
{
	blk_free_queue_stats(q->stats);
	if (queue_is_mq(q))
		blk_mq_release(q);

	ida_free(&blk_queue_ida, q->id);
	call_rcu(&q->rcu_head, blk_free_queue_rcu);
}

/**
 * blk_put_queue - decrement the request_queue refcount
 * @q: the request_queue structure to decrement the refcount for
 *
 * Decrements the refcount of the request_queue and frees it when the refcount
 * reaches 0.
 */
void blk_put_queue(struct request_queue *q)
{
	if (refcount_dec_and_test(&q->refs))
		blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);
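
/*
 * Example usage (illustrative sketch, not taken from an in-tree caller):
 * code that stashes a request_queue pointer pairs the reference helpers:
 *
 *	if (blk_get_queue(q)) {
 *		...use q, it is guaranteed to stay allocated here...
 *		blk_put_queue(q);
 *	}
 *
 * blk_get_queue() fails once the queue is dying, so callers must handle a
 * false return.
 */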

void blk_queue_start_drain(struct request_queue *q)
{
	/*
	 * When the queue DYING flag is set, we need to block new requests
	 * from entering the queue, so we call blk_freeze_queue_start() to
	 * prevent I/O from crossing blk_queue_enter().
	 */
	blk_freeze_queue_start(q);
	if (queue_is_mq(q))
		blk_mq_wake_waiters(q);
	/* Make blk_queue_enter() reexamine the DYING flag. */
	wake_up_all(&q->mq_freeze_wq);
}

/**
 * blk_queue_enter() - try to increase q->q_usage_counter
 * @q: request queue pointer
 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
 */
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
	const bool pm = flags & BLK_MQ_REQ_PM;

	while (!blk_try_enter_queue(q, pm)) {
		if (flags & BLK_MQ_REQ_NOWAIT)
			return -EAGAIN;

		/*
		 * This is the read side pairing with the barrier in
		 * blk_freeze_queue_start(): we need to order reading the
		 * __PERCPU_REF_DEAD flag of .q_usage_counter against reading
		 * .mq_freeze_depth or the queue dying flag, otherwise the
		 * following wait may never return if the two reads are
		 * reordered.
		 */
		smp_rmb();
		wait_event(q->mq_freeze_wq,
			   (!q->mq_freeze_depth &&
			    blk_pm_resume_queue(pm, q)) ||
			   blk_queue_dying(q));
		if (blk_queue_dying(q))
			return -ENODEV;
	}

	return 0;
}

int __bio_queue_enter(struct request_queue *q, struct bio *bio)
{
	while (!blk_try_enter_queue(q, false)) {
		struct gendisk *disk = bio->bi_bdev->bd_disk;

		if (bio->bi_opf & REQ_NOWAIT) {
			if (test_bit(GD_DEAD, &disk->state))
				goto dead;
			bio_wouldblock_error(bio);
			return -EAGAIN;
		}

		/*
		 * This is the read side pairing with the barrier in
		 * blk_freeze_queue_start(): we need to order reading the
		 * __PERCPU_REF_DEAD flag of .q_usage_counter against reading
		 * .mq_freeze_depth or the queue dying flag, otherwise the
		 * following wait may never return if the two reads are
		 * reordered.
		 */
		smp_rmb();
		wait_event(q->mq_freeze_wq,
			   (!q->mq_freeze_depth &&
			    blk_pm_resume_queue(false, q)) ||
			   test_bit(GD_DEAD, &disk->state));
		if (test_bit(GD_DEAD, &disk->state))
			goto dead;
	}

	return 0;
dead:
	bio_io_error(bio);
	return -ENODEV;
}

void blk_queue_exit(struct request_queue *q)
{
	percpu_ref_put(&q->q_usage_counter);
}

static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, q_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}

static void blk_rq_timed_out_timer(struct timer_list *t)
{
	struct request_queue *q = from_timer(q, t, timeout);

	kblockd_schedule_work(&q->timeout_work);
}

static void blk_timeout_work(struct work_struct *work)
{
}

struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
	struct request_queue *q;
	int error;

	q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
				  node_id);
	if (!q)
		return ERR_PTR(-ENOMEM);

	q->last_merge = NULL;

	q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
	if (q->id < 0) {
		error = q->id;
		goto fail_q;
	}

	q->stats = blk_alloc_queue_stats();
	if (!q->stats) {
		error = -ENOMEM;
		goto fail_id;
	}

	error = blk_set_default_limits(lim);
	if (error)
		goto fail_stats;
	q->limits = *lim;

	q->node = node_id;

	atomic_set(&q->nr_active_requests_shared_tags, 0);

	timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
	INIT_WORK(&q->timeout_work, blk_timeout_work);
	INIT_LIST_HEAD(&q->icq_list);

	refcount_set(&q->refs, 1);
	mutex_init(&q->debugfs_mutex);
	mutex_init(&q->sysfs_lock);
	mutex_init(&q->sysfs_dir_lock);
	mutex_init(&q->limits_lock);
	mutex_init(&q->rq_qos_mutex);
	spin_lock_init(&q->queue_lock);

	init_waitqueue_head(&q->mq_freeze_wq);
	mutex_init(&q->mq_freeze_lock);

	blkg_init_queue(q);

	/*
	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
	 * See blk_register_queue() for details.
	 */
	error = percpu_ref_init(&q->q_usage_counter,
				blk_queue_usage_counter_release,
				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
	if (error)
		goto fail_stats;

	q->nr_requests = BLKDEV_DEFAULT_RQ;

	return q;

fail_stats:
	blk_free_queue_stats(q->stats);
fail_id:
	ida_free(&blk_queue_ida, q->id);
fail_q:
	kmem_cache_free(blk_requestq_cachep, q);
	return ERR_PTR(error);
}

/**
 * blk_get_queue - increment the request_queue refcount
 * @q: the request_queue structure to increment the refcount for
 *
 * Increment the refcount of the request_queue kobject.
 *
 * Context: Any context.
 */
bool blk_get_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_dying(q)))
		return false;
	refcount_inc(&q->refs);
	return true;
}
EXPORT_SYMBOL(blk_get_queue);

#ifdef CONFIG_FAIL_MAKE_REQUEST

static DECLARE_FAULT_ATTR(fail_make_request);

static int __init setup_fail_make_request(char *str)
{
	return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);

bool should_fail_request(struct block_device *part, unsigned int bytes)
{
	return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
}

static int __init fail_make_request_debugfs(void)
{
	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
						NULL, &fail_make_request);

	return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_make_request_debugfs);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
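
/*
 * Illustrative sketch: with CONFIG_FAIL_MAKE_REQUEST enabled, the fault
 * attributes above are normally tuned from user space (see
 * Documentation/fault-injection/fault-injection.rst), e.g.:
 *
 *	echo 10 > /sys/kernel/debug/fail_make_request/probability
 *	echo 100 > /sys/kernel/debug/fail_make_request/times
 *	echo 1 > /sys/block/sda/sda1/make-it-fail
 *
 * or via the "fail_make_request=<interval>,<probability>,<space>,<times>"
 * boot parameter parsed by setup_fail_make_request() above.
 */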

static inline void bio_check_ro(struct bio *bio)
{
	if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
			return;

		if (bio->bi_bdev->bd_ro_warned)
			return;

		bio->bi_bdev->bd_ro_warned = true;
		/*
		 * Using an ioctl to set the underlying disk of a raid/dm
		 * device to read-only will trigger this.
		 */
		pr_warn("Trying to write to read-only block-device %pg\n",
			bio->bi_bdev);
	}
}

static noinline int should_fail_bio(struct bio *bio)
{
	if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
		return -EIO;
	return 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

/*
 * Check whether this bio extends beyond the end of the device or partition.
 * This may well happen - the kernel calls bread() without checking the size of
 * the device, e.g., when mounting a file system.
 */
static inline int bio_check_eod(struct bio *bio)
{
	sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
	unsigned int nr_sectors = bio_sectors(bio);

	if (nr_sectors &&
	    (nr_sectors > maxsector ||
	     bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
		pr_info_ratelimited("%s: attempt to access beyond end of device\n"
				    "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
				    current->comm, bio->bi_bdev, bio->bi_opf,
				    bio->bi_iter.bi_sector, nr_sectors, maxsector);
		return -EIO;
	}
	return 0;
}

/*
 * Remap block n of partition p to block n+start(p) of the disk.
 */
static int blk_partition_remap(struct bio *bio)
{
	struct block_device *p = bio->bi_bdev;

	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
		return -EIO;
	if (bio_sectors(bio)) {
		bio->bi_iter.bi_sector += p->bd_start_sect;
		trace_block_bio_remap(bio, p->bd_dev,
				      bio->bi_iter.bi_sector -
				      p->bd_start_sect);
	}
	bio_set_flag(bio, BIO_REMAPPED);
	return 0;
}

/*
 * Check a zone append write to a zoned block device.
 */
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
						 struct bio *bio)
{
	int nr_sectors = bio_sectors(bio);

	/* Only applicable to zoned block devices */
	if (!bdev_is_zoned(bio->bi_bdev))
		return BLK_STS_NOTSUPP;

	/* The bio sector must point to the start of a sequential zone */
	if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
	    !bio_zone_is_seq(bio))
		return BLK_STS_IOERR;

	/*
	 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
	 * split and could result in non-contiguous sectors being written in
	 * different zones.
	 */
	if (nr_sectors > q->limits.chunk_sectors)
		return BLK_STS_IOERR;

	/* Make sure the BIO is small enough and will not get split */
	if (nr_sectors > q->limits.max_zone_append_sectors)
		return BLK_STS_IOERR;

	bio->bi_opf |= REQ_NOMERGE;

	return BLK_STS_OK;
}

static void __submit_bio(struct bio *bio)
{
	if (unlikely(!blk_crypto_bio_prep(&bio)))
		return;

	if (!bio->bi_bdev->bd_has_submit_bio) {
		blk_mq_submit_bio(bio);
	} else if (likely(bio_queue_enter(bio) == 0)) {
		struct gendisk *disk = bio->bi_bdev->bd_disk;

		disk->fops->submit_bio(bio);
		blk_queue_exit(disk->queue);
	}
}

/*
 * The loop in this function may be a bit non-obvious, and so deserves some
 * explanation:
 *
 *  - Before entering the loop, bio->bi_next is NULL (as all callers ensure
 *    that), so we have a list with a single bio.
 *  - We pretend that we have just taken it off a longer list, so we assign
 *    bio_list to a pointer to the bio_list_on_stack, thus initialising the
 *    bio_list of new bios to be added.  ->submit_bio() may indeed add some more
 *    bios through a recursive call to submit_bio_noacct.  If it did, we find a
 *    non-NULL value in bio_list and re-enter the loop from the top.
 *  - In this case we really did just take the bio off the top of the list (no
 *    pretending) and so remove it from bio_list, and call into ->submit_bio()
 *    again.
 *
 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
 * bio_list_on_stack[1] contains bios that were submitted before the current
 *	->submit_bio, but that haven't been processed yet.
 */
static void __submit_bio_noacct(struct bio *bio)
{
	struct bio_list bio_list_on_stack[2];

	BUG_ON(bio->bi_next);

	bio_list_init(&bio_list_on_stack[0]);
	current->bio_list = bio_list_on_stack;

	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
		struct bio_list lower, same;

		/*
		 * Create a fresh bio_list for all subordinate requests.
		 */
		bio_list_on_stack[1] = bio_list_on_stack[0];
		bio_list_init(&bio_list_on_stack[0]);

		__submit_bio(bio);

		/*
		 * Sort new bios into those for a lower level and those for the
		 * same level.
		 */
		bio_list_init(&lower);
		bio_list_init(&same);
		while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
			if (q == bdev_get_queue(bio->bi_bdev))
				bio_list_add(&same, bio);
			else
				bio_list_add(&lower, bio);

		/*
		 * Now assemble so we handle the lowest level first.
		 */
		bio_list_merge(&bio_list_on_stack[0], &lower);
		bio_list_merge(&bio_list_on_stack[0], &same);
		bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));

	current->bio_list = NULL;
}

static void __submit_bio_noacct_mq(struct bio *bio)
{
	struct bio_list bio_list[2] = { };

	current->bio_list = bio_list;

	do {
		__submit_bio(bio);
	} while ((bio = bio_list_pop(&bio_list[0])));

	current->bio_list = NULL;
}

void submit_bio_noacct_nocheck(struct bio *bio)
{
	blk_cgroup_bio_start(bio);
	blkcg_bio_issue_init(bio);

	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
		trace_block_bio_queue(bio);
		/*
		 * Now that enqueuing has been traced, we need to trace
		 * completion as well.
		 */
		bio_set_flag(bio, BIO_TRACE_COMPLETION);
	}

	/*
	 * We only want one ->submit_bio to be active at a time, else stack
	 * usage with stacked devices could be a problem.  Use current->bio_list
	 * to collect a list of requests submitted by a ->submit_bio method
	 * while it is active, and then process them after it has returned.
	 */
	if (current->bio_list)
		bio_list_add(&current->bio_list[0], bio);
	else if (!bio->bi_bdev->bd_has_submit_bio)
		__submit_bio_noacct_mq(bio);
	else
		__submit_bio_noacct(bio);
}

/**
 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * This is a version of submit_bio() that shall only be used for I/O that is
 * resubmitted to lower level drivers by stacking block drivers.  All file
 * systems and other upper level users of the block layer should use
 * submit_bio() instead.
 */
void submit_bio_noacct(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	blk_status_t status = BLK_STS_IOERR;

	might_sleep();

	/*
	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
	 * if the queue does not support NOWAIT.
	 */
	if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
		goto not_supported;

	if (should_fail_bio(bio))
		goto end_io;
	bio_check_ro(bio);
	if (!bio_flagged(bio, BIO_REMAPPED)) {
		if (unlikely(bio_check_eod(bio)))
			goto end_io;
		if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
			goto end_io;
	}

	/*
	 * Filter flush bios early so that bio based drivers without flush
	 * support don't have to worry about them.
	 */
	if (op_is_flush(bio->bi_opf)) {
		if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
				 bio_op(bio) != REQ_OP_ZONE_APPEND))
			goto end_io;
		if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
			bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
			if (!bio_sectors(bio)) {
				status = BLK_STS_OK;
				goto end_io;
			}
		}
	}

	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
		bio_clear_polled(bio);

	switch (bio_op(bio)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		break;
	case REQ_OP_FLUSH:
		/*
		 * REQ_OP_FLUSH can't be submitted through bios, it is only
		 * synthesized in struct request by the flush state machine.
		 */
		goto not_supported;
	case REQ_OP_DISCARD:
		if (!bdev_max_discard_sectors(bdev))
			goto not_supported;
		break;
	case REQ_OP_SECURE_ERASE:
		if (!bdev_max_secure_erase_sectors(bdev))
			goto not_supported;
		break;
	case REQ_OP_ZONE_APPEND:
		status = blk_check_zone_append(q, bio);
		if (status != BLK_STS_OK)
			goto end_io;
		break;
	case REQ_OP_WRITE_ZEROES:
		if (!q->limits.max_write_zeroes_sectors)
			goto not_supported;
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		if (!bdev_is_zoned(bio->bi_bdev))
			goto not_supported;
		break;
	case REQ_OP_ZONE_RESET_ALL:
		if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
			goto not_supported;
		break;
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		/*
		 * Driver private operations are only used with passthrough
		 * requests.
		 */
		fallthrough;
	default:
		goto not_supported;
	}

	if (blk_throtl_bio(bio))
		return;
	submit_bio_noacct_nocheck(bio);
	return;

not_supported:
	status = BLK_STS_NOTSUPP;
end_io:
	bio->bi_status = status;
	bio_endio(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);

static void bio_set_ioprio(struct bio *bio)
{
	/* Nobody set ioprio so far? Initialize it based on the task's nice value */
	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
		bio->bi_ioprio = get_current_ioprio();
	blkcg_set_ioprio(bio);
}

/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done.  The
 * bio will be sent to the device described by the bi_bdev field.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 */
void submit_bio(struct bio *bio)
{
	if (bio_op(bio) == REQ_OP_READ) {
		task_io_account_read(bio->bi_iter.bi_size);
		count_vm_events(PGPGIN, bio_sectors(bio));
	} else if (bio_op(bio) == REQ_OP_WRITE) {
		count_vm_events(PGPGOUT, bio_sectors(bio));
	}

	bio_set_ioprio(bio);
	submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
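
/*
 * Example usage (illustrative sketch, not taken from an in-tree caller):
 * a simple caller reading one page from sector 0 of a block device looks
 * roughly like this (error handling omitted, "my_end_io" is a hypothetical
 * completion callback):
 *
 *	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
 *
 *	bio->bi_iter.bi_sector = 0;
 *	__bio_add_page(bio, page, PAGE_SIZE, 0);
 *	bio->bi_end_io = my_end_io;
 *	submit_bio(bio);
 *
 * As documented above, the bio must not be touched again until my_end_io()
 * has been called.
 */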

/**
 * bio_poll - poll for BIO completions
 * @bio: bio to poll for
 * @iob: batches of IO
 * @flags: BLK_POLL_* flags that control the behavior
 *
 * Poll for completions on the queue associated with the bio. Returns number of
 * completed entries found.
 *
 * Note: the caller must either be the context that submitted @bio, or
 * be in a RCU critical section to prevent freeing of @bio.
 */
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
	struct block_device *bdev;
	struct request_queue *q;
	int ret = 0;

	bdev = READ_ONCE(bio->bi_bdev);
	if (!bdev)
		return 0;

	q = bdev_get_queue(bdev);
	if (cookie == BLK_QC_T_NONE ||
	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
		return 0;

	/*
	 * As the requests that require a zone lock are not plugged in the
	 * first place, directly accessing the plug instead of using
	 * blk_mq_plug() should not have any consequences during flushing for
	 * zoned devices.
	 */
	blk_flush_plug(current->plug, false);

	/*
	 * We need to be able to enter a frozen queue, similar to how
	 * timeouts also need to do that. If that is blocked, then we can
	 * have pending IO when a queue freeze is started, and then the
	 * wait for the freeze to finish will wait for polled requests to
	 * time out as the poller is prevented from entering the queue and
	 * completing them. As long as we prevent new IO from being queued,
	 * that should be all that matters.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return 0;
	if (queue_is_mq(q)) {
		ret = blk_mq_poll(q, cookie, iob, flags);
	} else {
		struct gendisk *disk = q->disk;

		if (disk && disk->fops->poll_bio)
			ret = disk->fops->poll_bio(bio, iob, flags);
	}
	blk_queue_exit(q);
	return ret;
}
EXPORT_SYMBOL_GPL(bio_poll);

/*
 * Helper to implement file_operations.iopoll.  Requires the bio to be stored
 * in iocb->private, and cleared before freeing the bio.
 */
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
		    unsigned int flags)
{
	struct bio *bio;
	int ret = 0;

	/*
	 * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
	 * point to a freshly allocated bio at this point.  If that happens
	 * we have a few cases to consider:
	 *
	 *  1) the bio is being initialized and bi_bdev is NULL.  We simply
	 *     do nothing in this case
	 *  2) the bio points to a device that is not poll enabled.  bio_poll
	 *     will catch this and return 0
	 *  3) the bio points to a poll capable device, including but not
	 *     limited to the one that the original bio pointed to.  In this
	 *     case we will call into the actual poll method and poll for I/O,
	 *     even if we don't need to, but it won't cause harm either.
	 *
	 * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
	 * is still allocated. Because partitions hold a reference to the whole
	 * device bdev and thus disk, the disk is also still valid.  Grabbing
	 * a reference to the queue in bio_poll() ensures the hctxs and requests
	 * are still valid as well.
	 */
	rcu_read_lock();
	bio = READ_ONCE(kiocb->private);
	if (bio)
		ret = bio_poll(bio, iob, flags);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
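
/*
 * Illustrative sketch (hypothetical "foo_fops", not taken from an in-tree
 * user): a file_operations instance backed by bios can wire this helper up
 * directly, provided the driver stores the in-flight bio in iocb->private
 * on submission and clears it before freeing the bio:
 *
 *	static const struct file_operations foo_fops = {
 *		...
 *		.iopoll		= iocb_bio_iopoll,
 *	};
 */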

void update_io_ticks(struct block_device *part, unsigned long now, bool end)
{
	unsigned long stamp;
again:
	stamp = READ_ONCE(part->bd_stamp);
	if (unlikely(time_after(now, stamp))) {
		if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
	}
	if (part->bd_partno) {
		part = bdev_whole(part);
		goto again;
	}
}

unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
				 unsigned long start_time)
{
	part_stat_lock();
	update_io_ticks(bdev, start_time, false);
	part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
	part_stat_unlock();

	return start_time;
}
EXPORT_SYMBOL(bdev_start_io_acct);

/**
 * bio_start_io_acct - start I/O accounting for bio based drivers
 * @bio:	bio to start account for
 *
 * Returns the start time that should be passed back to bio_end_io_acct().
 */
unsigned long bio_start_io_acct(struct bio *bio)
{
	return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);

void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
		      unsigned int sectors, unsigned long start_time)
{
	const int sgrp = op_stat_group(op);
	unsigned long now = READ_ONCE(jiffies);
	unsigned long duration = now - start_time;

	part_stat_lock();
	update_io_ticks(bdev, now, true);
	part_stat_inc(bdev, ios[sgrp]);
	part_stat_add(bdev, sectors[sgrp], sectors);
	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
	part_stat_unlock();
}
EXPORT_SYMBOL(bdev_end_io_acct);

void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
			      struct block_device *orig_bdev)
{
	bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
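
/*
 * Example usage (illustrative sketch, not taken from an in-tree driver):
 * a bio based driver that remaps bios brackets each I/O with the
 * accounting helpers, roughly:
 *
 *	struct block_device *orig_bdev = bio->bi_bdev;
 *	unsigned long start = bio_start_io_acct(bio);
 *
 *	...remap bio->bi_bdev and submit...
 *
 * and from its completion path:
 *
 *	bio_end_io_acct_remapped(bio, start, orig_bdev);
 */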

/**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked
 *
 * Description:
 *    Check if underlying low-level drivers of a device are busy.
 *    If the drivers want to export their busy state, they must set their own
 *    exporting function using blk_queue_lld_busy() first.
 *
 *    Basically, this function is used only by request stacking drivers
 *    to stop dispatching requests to underlying devices when underlying
 *    devices are busy.  This behavior helps more I/O merging on the queue
 *    of the request stacking driver and prevents I/O throughput regression
 *    on burst I/O load.
 *
 * Return:
 *    0 - Not busy (The request stacking driver should dispatch request)
 *    1 - Busy (The request stacking driver should stop dispatching request)
 */
int blk_lld_busy(struct request_queue *q)
{
	if (queue_is_mq(q) && q->mq_ops->busy)
		return q->mq_ops->busy(q);

	return 0;
}
EXPORT_SYMBOL_GPL(blk_lld_busy);

int kblockd_schedule_work(struct work_struct *work)
{
	return queue_work(kblockd_workqueue, work);
}
EXPORT_SYMBOL(kblockd_schedule_work);

int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
				unsigned long delay)
{
	return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);

void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
{
	struct task_struct *tsk = current;

	/*
	 * If this is a nested plug, don't actually assign it.
	 */
	if (tsk->plug)
		return;

	plug->cur_ktime = 0;
	plug->mq_list = NULL;
	plug->cached_rq = NULL;
	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
	plug->rq_count = 0;
	plug->multiple_queues = false;
	plug->has_elevator = false;
	INIT_LIST_HEAD(&plug->cb_list);

	/*
	 * Store ordering should not be needed here, since a potential
	 * preempt will imply a full memory barrier
	 */
	tsk->plug = plug;
}

/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:	The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
	blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);
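
/*
 * Example usage (illustrative sketch): a typical submission path brackets
 * a batch of bios with a plug so they can be dispatched together:
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	submit_bio(bio);		(repeated for each bio in the batch)
 *	blk_finish_plug(&plug);
 *
 * The plug lives on the caller's stack and must be finished by the same
 * task that started it.
 */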

static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
	LIST_HEAD(callbacks);

	while (!list_empty(&plug->cb_list)) {
		list_splice_init(&plug->cb_list, &callbacks);

		while (!list_empty(&callbacks)) {
			struct blk_plug_cb *cb = list_first_entry(&callbacks,
							  struct blk_plug_cb,
							  list);
			list_del(&cb->list);
			cb->callback(cb, from_schedule);
		}
	}
}

struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
				      int size)
{
	struct blk_plug *plug = current->plug;
	struct blk_plug_cb *cb;

	if (!plug)
		return NULL;

	list_for_each_entry(cb, &plug->cb_list, list)
		if (cb->callback == unplug && cb->data == data)
			return cb;

	/* Not currently on the callback list */
	BUG_ON(size < sizeof(*cb));
	cb = kzalloc(size, GFP_ATOMIC);
	if (cb) {
		cb->data = data;
		cb->callback = unplug;
		list_add(&cb->list, &plug->cb_list);
	}
	return cb;
}
EXPORT_SYMBOL(blk_check_plugged);
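
/*
 * Example usage (illustrative sketch; "my_unplug" and "my_data" are
 * hypothetical names): stacking drivers such as md attach a private unplug
 * callback so per-plug state is flushed when the plug is released:
 *
 *	static void my_unplug(struct blk_plug_cb *cb, bool from_schedule)
 *	{
 *		...flush driver-private batched work, then free cb...
 *	}
 *
 *	cb = blk_check_plugged(my_unplug, my_data, sizeof(*cb));
 *	if (!cb)
 *		...no plug active, dispatch immediately...
 *
 * The callback runs from flush_plug_callbacks() above with the cb already
 * removed from the plug's cb_list.
 */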

void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
	if (!list_empty(&plug->cb_list))
		flush_plug_callbacks(plug, from_schedule);
	blk_mq_flush_plug_list(plug, from_schedule);
	/*
	 * Unconditionally flush out cached requests, even if the unplug
	 * event came from schedule. Since we now hold references to the
	 * queue for cached requests, we don't want a blocked task holding
	 * up a queue freeze/quiesce event.
	 */
	if (unlikely(!rq_list_empty(plug->cached_rq)))
		blk_mq_free_plug_rqs(plug);

	plug->cur_ktime = 0;
	current->flags &= ~PF_BLOCK_TS;
}

/**
 * blk_finish_plug - mark the end of a batch of submitted I/O
 * @plug:	The &struct blk_plug passed to blk_start_plug()
 *
 * Description:
 * Indicate that a batch of I/O submissions is complete.  This function
 * must be paired with an initial call to blk_start_plug().  The intent
 * is to allow the block layer to optimize I/O submission.  See the
 * documentation for blk_start_plug() for more information.
 */
void blk_finish_plug(struct blk_plug *plug)
{
	if (plug == current->plug) {
		__blk_flush_plug(plug, false);
		current->plug = NULL;
	}
}
EXPORT_SYMBOL(blk_finish_plug);

void blk_io_schedule(void)
{
	/* Prevent hang_check timer from firing at us during very long I/O */
	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

	if (timeout)
		io_schedule_timeout(timeout);
	else
		io_schedule();
}
EXPORT_SYMBOL_GPL(blk_io_schedule);

int __init blk_dev_init(void)
{
	BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			sizeof_field(struct request, cmd_flags));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			sizeof_field(struct bio, bi_opf));

	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
	kblockd_workqueue = alloc_workqueue("kblockd",
					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

	blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);

	blk_debugfs_root = debugfs_create_dir("block", NULL);

	return 0;
}