io_uring.c revision 89b263f6
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqe (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
29 *
30 * Also see the examples in the liburing library:
31 *
32 *	git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
40 * Copyright (c) 2018-2019 Christoph Hellwig
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <net/compat.h>
48#include <linux/refcount.h>
49#include <linux/uio.h>
50#include <linux/bits.h>
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
60#include <linux/blkdev.h>
61#include <linux/bvec.h>
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
65#include <net/scm.h>
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
72#include <linux/highmem.h>
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
75#include <linux/fadvise.h>
76#include <linux/eventpoll.h>
77#include <linux/splice.h>
78#include <linux/task_work.h>
79#include <linux/pagemap.h>
80#include <linux/io_uring.h>
81#include <linux/tracehook.h>
82
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
86#include <uapi/linux/io_uring.h>
87
88#include "internal.h"
89#include "io-wq.h"
90
91#define IORING_MAX_ENTRIES	32768
92#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
94
95/* 512 entries per page on 64-bit archs, 64 pages max */
96#define IORING_MAX_FIXED_FILES	(1U << 15)
97#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
98				 IORING_REGISTER_LAST + IORING_OP_LAST)
99
100#define IO_RSRC_TAG_TABLE_SHIFT	9
101#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
102#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
103
104#define IORING_MAX_REG_BUFFERS	(1U << 14)
105
106#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
107				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
108				IOSQE_BUFFER_SELECT)
109#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
110				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
111
112#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
113
114struct io_uring {
115	u32 head ____cacheline_aligned_in_smp;
116	u32 tail ____cacheline_aligned_in_smp;
117};
118
119/*
120 * This data is shared with the application through the mmap at offsets
121 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
122 *
123 * The offsets to the member fields are published through struct
124 * io_sqring_offsets when calling io_uring_setup.
125 */
126struct io_rings {
127	/*
128	 * Head and tail offsets into the ring; the offsets need to be
129	 * masked to get valid indices.
130	 *
131	 * The kernel controls head of the sq ring and the tail of the cq ring,
132	 * and the application controls tail of the sq ring and the head of the
133	 * cq ring.
134	 */
135	struct io_uring		sq, cq;
136	/*
137	 * Bitmasks to apply to head and tail offsets (constant, equals
138	 * ring_entries - 1)
139	 */
140	u32			sq_ring_mask, cq_ring_mask;
141	/* Ring sizes (constant, power of 2) */
142	u32			sq_ring_entries, cq_ring_entries;
143	/*
144	 * Number of invalid entries dropped by the kernel due to
145	 * invalid index stored in array
146	 *
147	 * Written by the kernel, shouldn't be modified by the
148	 * application (i.e. get number of "new events" by comparing to
149	 * cached value).
150	 *
151	 * After a new SQ head value was read by the application this
152	 * counter includes all submissions that were dropped reaching
153	 * the new SQ head (and possibly more).
154	 */
155	u32			sq_dropped;
156	/*
157	 * Runtime SQ flags
158	 *
159	 * Written by the kernel, shouldn't be modified by the
160	 * application.
161	 *
162	 * The application needs a full memory barrier before checking
163	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
164	 */
165	u32			sq_flags;
166	/*
167	 * Runtime CQ flags
168	 *
169	 * Written by the application, shouldn't be modified by the
170	 * kernel.
171	 */
172	u32			cq_flags;
173	/*
174	 * Number of completion events lost because the queue was full;
175	 * this should be avoided by the application by making sure
176	 * there are not more requests pending than there is space in
177	 * the completion queue.
178	 *
179	 * Written by the kernel, shouldn't be modified by the
180	 * application (i.e. get number of "new events" by comparing to
181	 * cached value).
182	 *
183	 * As completion events come in out of order this counter is not
184	 * ordered with any other data.
185	 */
186	u32			cq_overflow;
187	/*
188	 * Ring buffer of completion events.
189	 *
190	 * The kernel writes completion events fresh every time they are
191	 * produced, so the application is allowed to modify pending
192	 * entries.
193	 */
194	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
195};
196
197enum io_uring_cmd_flags {
198	IO_URING_F_NONBLOCK		= 1,
199	IO_URING_F_COMPLETE_DEFER	= 2,
200};
201
202struct io_mapped_ubuf {
203	u64		ubuf;
204	u64		ubuf_end;
205	unsigned int	nr_bvecs;
206	unsigned long	acct_pages;
207	struct bio_vec	bvec[];
208};
209
210struct io_ring_ctx;
211
212struct io_overflow_cqe {
213	struct io_uring_cqe cqe;
214	struct list_head list;
215};
216
217struct io_fixed_file {
218	/* file * with additional FFS_* flags */
219	unsigned long file_ptr;
220};
221
222struct io_rsrc_put {
223	struct list_head list;
224	u64 tag;
225	union {
226		void *rsrc;
227		struct file *file;
228		struct io_mapped_ubuf *buf;
229	};
230};
231
232struct io_file_table {
233	struct io_fixed_file *files;
234};
235
236struct io_rsrc_node {
237	struct percpu_ref		refs;
238	struct list_head		node;
239	struct list_head		rsrc_list;
240	struct io_rsrc_data		*rsrc_data;
241	struct llist_node		llist;
242	bool				done;
243};
244
245typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
246
247struct io_rsrc_data {
248	struct io_ring_ctx		*ctx;
249
250	u64				**tags;
251	unsigned int			nr;
252	rsrc_put_fn			*do_put;
253	atomic_t			refs;
254	struct completion		done;
255	bool				quiesce;
256};
257
258struct io_buffer {
259	struct list_head list;
260	__u64 addr;
261	__u32 len;
262	__u16 bid;
263};
264
265struct io_restriction {
266	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
267	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
268	u8 sqe_flags_allowed;
269	u8 sqe_flags_required;
270	bool registered;
271};
272
273enum {
274	IO_SQ_THREAD_SHOULD_STOP = 0,
275	IO_SQ_THREAD_SHOULD_PARK,
276};
277
278struct io_sq_data {
279	refcount_t		refs;
280	atomic_t		park_pending;
281	struct mutex		lock;
282
283	/* ctx's that are using this sqd */
284	struct list_head	ctx_list;
285
286	struct task_struct	*thread;
287	struct wait_queue_head	wait;
288
289	unsigned		sq_thread_idle;
290	int			sq_cpu;
291	pid_t			task_pid;
292	pid_t			task_tgid;
293
294	unsigned long		state;
295	struct completion	exited;
296};
297
298#define IO_COMPL_BATCH			32
299#define IO_REQ_CACHE_SIZE		32
300#define IO_REQ_ALLOC_BATCH		8
301
302struct io_submit_link {
303	struct io_kiocb		*head;
304	struct io_kiocb		*last;
305};
306
307struct io_submit_state {
308	struct blk_plug		plug;
309	struct io_submit_link	link;
310
311	/*
312	 * io_kiocb alloc cache
313	 */
314	void			*reqs[IO_REQ_CACHE_SIZE];
315	unsigned int		free_reqs;
316
317	bool			plug_started;
318
319	/*
320	 * Batch completion logic
321	 */
322	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
323	unsigned int		compl_nr;
324	/* inline/task_work completion list, under ->uring_lock */
325	struct list_head	free_list;
326
327	unsigned int		ios_left;
328};
329
330struct io_ring_ctx {
331	/* const or read-mostly hot data */
332	struct {
333		struct percpu_ref	refs;
334
335		struct io_rings		*rings;
336		unsigned int		flags;
337		unsigned int		compat: 1;
338		unsigned int		drain_next: 1;
339		unsigned int		eventfd_async: 1;
340		unsigned int		restricted: 1;
341		unsigned int		off_timeout_used: 1;
342		unsigned int		drain_active: 1;
343	} ____cacheline_aligned_in_smp;
344
345	/* submission data */
346	struct {
347		struct mutex		uring_lock;
348
349		/*
350		 * Ring buffer of indices into array of io_uring_sqe, which is
351		 * mmapped by the application using the IORING_OFF_SQES offset.
352		 *
353		 * This indirection could e.g. be used to assign fixed
354		 * io_uring_sqe entries to operations and only submit them to
355		 * the queue when needed.
356		 *
357		 * The kernel modifies neither the indices array nor the entries
358		 * array.
359		 */
360		u32			*sq_array;
361		struct io_uring_sqe	*sq_sqes;
362		unsigned		cached_sq_head;
363		unsigned		sq_entries;
364		struct list_head	defer_list;
365
366		/*
367		 * Fixed resources fast path, should be accessed only under
368		 * uring_lock, and updated through io_uring_register(2)
369		 */
370		struct io_rsrc_node	*rsrc_node;
371		struct io_file_table	file_table;
372		unsigned		nr_user_files;
373		unsigned		nr_user_bufs;
374		struct io_mapped_ubuf	**user_bufs;
375
376		struct io_submit_state	submit_state;
377		struct list_head	timeout_list;
378		struct list_head	cq_overflow_list;
379		struct xarray		io_buffers;
380		struct xarray		personalities;
381		u32			pers_next;
382		unsigned		sq_thread_idle;
383	} ____cacheline_aligned_in_smp;
384
385	/* IRQ completion list, under ->completion_lock */
386	struct list_head	locked_free_list;
387	unsigned int		locked_free_nr;
388
389	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
390	struct io_sq_data	*sq_data;	/* if using sq thread polling */
391
392	struct wait_queue_head	sqo_sq_wait;
393	struct list_head	sqd_list;
394
395	unsigned long		check_cq_overflow;
396
397	struct {
398		unsigned		cached_cq_tail;
399		unsigned		cq_entries;
400		struct eventfd_ctx	*cq_ev_fd;
401		struct wait_queue_head	poll_wait;
402		struct wait_queue_head	cq_wait;
403		unsigned		cq_extra;
404		atomic_t		cq_timeouts;
405		struct fasync_struct	*cq_fasync;
406		unsigned		cq_last_tm_flush;
407	} ____cacheline_aligned_in_smp;
408
409	struct {
410		spinlock_t		completion_lock;
411
412		spinlock_t		timeout_lock;
413
414		/*
415		 * ->iopoll_list is protected by the ctx->uring_lock for
416		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
417		 * For SQPOLL, only the single threaded io_sq_thread() will
418		 * manipulate the list, hence no extra locking is needed there.
419		 */
420		struct list_head	iopoll_list;
421		struct hlist_head	*cancel_hash;
422		unsigned		cancel_hash_bits;
423		bool			poll_multi_queue;
424	} ____cacheline_aligned_in_smp;
425
426	struct io_restriction		restrictions;
427
428	/* slow path rsrc auxilary data, used by update/register */
429	struct {
430		struct io_rsrc_node		*rsrc_backup_node;
431		struct io_mapped_ubuf		*dummy_ubuf;
432		struct io_rsrc_data		*file_data;
433		struct io_rsrc_data		*buf_data;
434
435		struct delayed_work		rsrc_put_work;
436		struct llist_head		rsrc_put_llist;
437		struct list_head		rsrc_ref_list;
438		spinlock_t			rsrc_ref_lock;
439	};
440
441	/* Keep this last, we don't need it for the fast path */
442	struct {
443		#if defined(CONFIG_UNIX)
444			struct socket		*ring_sock;
445		#endif
446		/* hashed buffered write serialization */
447		struct io_wq_hash		*hash_map;
448
449		/* Only used for accounting purposes */
450		struct user_struct		*user;
451		struct mm_struct		*mm_account;
452
453		/* ctx exit and cancelation */
454		struct llist_head		fallback_llist;
455		struct delayed_work		fallback_work;
456		struct work_struct		exit_work;
457		struct list_head		tctx_list;
458		struct completion		ref_comp;
459	};
460};
461
462struct io_uring_task {
463	/* submission side */
464	int			cached_refs;
465	struct xarray		xa;
466	struct wait_queue_head	wait;
467	const struct io_ring_ctx *last;
468	struct io_wq		*io_wq;
469	struct percpu_counter	inflight;
470	atomic_t		inflight_tracked;
471	atomic_t		in_idle;
472
473	spinlock_t		task_lock;
474	struct io_wq_work_list	task_list;
475	struct callback_head	task_work;
476	bool			task_running;
477};
478
479/*
480 * First field must be the file pointer in all the
481 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
482 */
483struct io_poll_iocb {
484	struct file			*file;
485	struct wait_queue_head		*head;
486	__poll_t			events;
487	bool				done;
488	bool				canceled;
489	struct wait_queue_entry		wait;
490};
491
492struct io_poll_update {
493	struct file			*file;
494	u64				old_user_data;
495	u64				new_user_data;
496	__poll_t			events;
497	bool				update_events;
498	bool				update_user_data;
499};
500
501struct io_close {
502	struct file			*file;
503	int				fd;
504};
505
506struct io_timeout_data {
507	struct io_kiocb			*req;
508	struct hrtimer			timer;
509	struct timespec64		ts;
510	enum hrtimer_mode		mode;
511};
512
513struct io_accept {
514	struct file			*file;
515	struct sockaddr __user		*addr;
516	int __user			*addr_len;
517	int				flags;
518	unsigned long			nofile;
519};
520
521struct io_sync {
522	struct file			*file;
523	loff_t				len;
524	loff_t				off;
525	int				flags;
526	int				mode;
527};
528
529struct io_cancel {
530	struct file			*file;
531	u64				addr;
532};
533
534struct io_timeout {
535	struct file			*file;
536	u32				off;
537	u32				target_seq;
538	struct list_head		list;
539	/* head of the link, used by linked timeouts only */
540	struct io_kiocb			*head;
541	/* for linked completions */
542	struct io_kiocb			*prev;
543};
544
545struct io_timeout_rem {
546	struct file			*file;
547	u64				addr;
548
549	/* timeout update */
550	struct timespec64		ts;
551	u32				flags;
552};
553
554struct io_rw {
555	/* NOTE: kiocb has the file as the first member, so don't do it here */
556	struct kiocb			kiocb;
557	u64				addr;
558	u64				len;
559};
560
561struct io_connect {
562	struct file			*file;
563	struct sockaddr __user		*addr;
564	int				addr_len;
565};
566
567struct io_sr_msg {
568	struct file			*file;
569	union {
570		struct compat_msghdr __user	*umsg_compat;
571		struct user_msghdr __user	*umsg;
572		void __user			*buf;
573	};
574	int				msg_flags;
575	int				bgid;
576	size_t				len;
577	struct io_buffer		*kbuf;
578};
579
580struct io_open {
581	struct file			*file;
582	int				dfd;
583	struct filename			*filename;
584	struct open_how			how;
585	unsigned long			nofile;
586};
587
588struct io_rsrc_update {
589	struct file			*file;
590	u64				arg;
591	u32				nr_args;
592	u32				offset;
593};
594
595struct io_fadvise {
596	struct file			*file;
597	u64				offset;
598	u32				len;
599	u32				advice;
600};
601
602struct io_madvise {
603	struct file			*file;
604	u64				addr;
605	u32				len;
606	u32				advice;
607};
608
609struct io_epoll {
610	struct file			*file;
611	int				epfd;
612	int				op;
613	int				fd;
614	struct epoll_event		event;
615};
616
617struct io_splice {
618	struct file			*file_out;
619	struct file			*file_in;
620	loff_t				off_out;
621	loff_t				off_in;
622	u64				len;
623	unsigned int			flags;
624};
625
626struct io_provide_buf {
627	struct file			*file;
628	__u64				addr;
629	__u32				len;
630	__u32				bgid;
631	__u16				nbufs;
632	__u16				bid;
633};
634
635struct io_statx {
636	struct file			*file;
637	int				dfd;
638	unsigned int			mask;
639	unsigned int			flags;
640	const char __user		*filename;
641	struct statx __user		*buffer;
642};
643
644struct io_shutdown {
645	struct file			*file;
646	int				how;
647};
648
649struct io_rename {
650	struct file			*file;
651	int				old_dfd;
652	int				new_dfd;
653	struct filename			*oldpath;
654	struct filename			*newpath;
655	int				flags;
656};
657
658struct io_unlink {
659	struct file			*file;
660	int				dfd;
661	int				flags;
662	struct filename			*filename;
663};
664
665struct io_completion {
666	struct file			*file;
667	u32				cflags;
668};
669
670struct io_async_connect {
671	struct sockaddr_storage		address;
672};
673
674struct io_async_msghdr {
675	struct iovec			fast_iov[UIO_FASTIOV];
676	/* points to an allocated iov, if NULL we use fast_iov instead */
677	struct iovec			*free_iov;
678	struct sockaddr __user		*uaddr;
679	struct msghdr			msg;
680	struct sockaddr_storage		addr;
681};
682
683struct io_async_rw {
684	struct iovec			fast_iov[UIO_FASTIOV];
685	const struct iovec		*free_iovec;
686	struct iov_iter			iter;
687	size_t				bytes_done;
688	struct wait_page_queue		wpq;
689};
690
691enum {
692	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
693	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
694	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
695	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
696	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
697	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
698
699	/* first byte is taken by user flags, shift it to not overlap */
700	REQ_F_FAIL_BIT		= 8,
701	REQ_F_INFLIGHT_BIT,
702	REQ_F_CUR_POS_BIT,
703	REQ_F_NOWAIT_BIT,
704	REQ_F_LINK_TIMEOUT_BIT,
705	REQ_F_NEED_CLEANUP_BIT,
706	REQ_F_POLLED_BIT,
707	REQ_F_BUFFER_SELECTED_BIT,
708	REQ_F_LTIMEOUT_ACTIVE_BIT,
709	REQ_F_COMPLETE_INLINE_BIT,
710	REQ_F_REISSUE_BIT,
711	REQ_F_DONT_REISSUE_BIT,
712	REQ_F_CREDS_BIT,
713	/* keep async read/write and isreg together and in order */
714	REQ_F_NOWAIT_READ_BIT,
715	REQ_F_NOWAIT_WRITE_BIT,
716	REQ_F_ISREG_BIT,
717
718	/* not a real bit, just to check we're not overflowing the space */
719	__REQ_F_LAST_BIT,
720};
721
722enum {
723	/* ctx owns file */
724	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
725	/* drain existing IO first */
726	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
727	/* linked sqes */
728	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
729	/* doesn't sever on completion < 0 */
730	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
731	/* IOSQE_ASYNC */
732	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
733	/* IOSQE_BUFFER_SELECT */
734	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
735
736	/* fail rest of links */
737	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
738	/* on inflight list, should be cancelled and waited on exit reliably */
739	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
740	/* read/write uses file position */
741	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
742	/* must not punt to workers */
743	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
744	/* has or had linked timeout */
745	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
746	/* needs cleanup */
747	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
748	/* already went through poll handler */
749	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
750	/* buffer already selected */
751	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
752	/* linked timeout is active, i.e. prepared by link's head */
753	REQ_F_LTIMEOUT_ACTIVE	= BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
754	/* completion is deferred through io_comp_state */
755	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
756	/* caller should reissue async */
757	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
758	/* don't attempt request reissue, see io_rw_reissue() */
759	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
760	/* supports async reads */
761	REQ_F_NOWAIT_READ	= BIT(REQ_F_NOWAIT_READ_BIT),
762	/* supports async writes */
763	REQ_F_NOWAIT_WRITE	= BIT(REQ_F_NOWAIT_WRITE_BIT),
764	/* regular file */
765	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
766	/* has creds assigned */
767	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
768};
769
770struct async_poll {
771	struct io_poll_iocb	poll;
772	struct io_poll_iocb	*double_poll;
773};
774
775typedef void (*io_req_tw_func_t)(struct io_kiocb *req);
776
777struct io_task_work {
778	union {
779		struct io_wq_work_node	node;
780		struct llist_node	fallback_node;
781	};
782	io_req_tw_func_t		func;
783};
784
785enum {
786	IORING_RSRC_FILE		= 0,
787	IORING_RSRC_BUFFER		= 1,
788};
789
790/*
791 * NOTE! Each of the iocb union members has the file pointer
792 * as the first entry in their struct definition. So you can
793 * access the file pointer through any of the sub-structs,
794 * or directly as just 'ki_filp' in this struct.
795 */
796struct io_kiocb {
797	union {
798		struct file		*file;
799		struct io_rw		rw;
800		struct io_poll_iocb	poll;
801		struct io_poll_update	poll_update;
802		struct io_accept	accept;
803		struct io_sync		sync;
804		struct io_cancel	cancel;
805		struct io_timeout	timeout;
806		struct io_timeout_rem	timeout_rem;
807		struct io_connect	connect;
808		struct io_sr_msg	sr_msg;
809		struct io_open		open;
810		struct io_close		close;
811		struct io_rsrc_update	rsrc_update;
812		struct io_fadvise	fadvise;
813		struct io_madvise	madvise;
814		struct io_epoll		epoll;
815		struct io_splice	splice;
816		struct io_provide_buf	pbuf;
817		struct io_statx		statx;
818		struct io_shutdown	shutdown;
819		struct io_rename	rename;
820		struct io_unlink	unlink;
821		/* use only after cleaning per-op data, see io_clean_op() */
822		struct io_completion	compl;
823	};
824
825	/* opcode allocated if it needs to store data for async defer */
826	void				*async_data;
827	u8				opcode;
828	/* polled IO has completed */
829	u8				iopoll_completed;
830
831	u16				buf_index;
832	u32				result;
833
834	struct io_ring_ctx		*ctx;
835	unsigned int			flags;
836	atomic_t			refs;
837	struct task_struct		*task;
838	u64				user_data;
839
840	struct io_kiocb			*link;
841	struct percpu_ref		*fixed_rsrc_refs;
842
843	/* used with ctx->iopoll_list with reads/writes */
844	struct list_head		inflight_entry;
845	struct io_task_work		io_task_work;
846	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
847	struct hlist_node		hash_node;
848	struct async_poll		*apoll;
849	struct io_wq_work		work;
850	const struct cred		*creds;
851
852	/* store used ubuf, so we can prevent reloading */
853	struct io_mapped_ubuf		*imu;
854};
855
856struct io_tctx_node {
857	struct list_head	ctx_node;
858	struct task_struct	*task;
859	struct io_ring_ctx	*ctx;
860};
861
862struct io_defer_entry {
863	struct list_head	list;
864	struct io_kiocb		*req;
865	u32			seq;
866};
867
868struct io_op_def {
869	/* needs req->file assigned */
870	unsigned		needs_file : 1;
871	/* hash wq insertion if file is a regular file */
872	unsigned		hash_reg_file : 1;
873	/* unbound wq insertion if file is a non-regular file */
874	unsigned		unbound_nonreg_file : 1;
875	/* opcode is not supported by this kernel */
876	unsigned		not_supported : 1;
877	/* set if opcode supports polled "wait" */
878	unsigned		pollin : 1;
879	unsigned		pollout : 1;
880	/* op supports buffer selection */
881	unsigned		buffer_select : 1;
882	/* do prep async if is going to be punted */
883	unsigned		needs_async_setup : 1;
884	/* should block plug */
885	unsigned		plug : 1;
886	/* size of async data needed, if any */
887	unsigned short		async_size;
888};
889
890static const struct io_op_def io_op_defs[] = {
891	[IORING_OP_NOP] = {},
892	[IORING_OP_READV] = {
893		.needs_file		= 1,
894		.unbound_nonreg_file	= 1,
895		.pollin			= 1,
896		.buffer_select		= 1,
897		.needs_async_setup	= 1,
898		.plug			= 1,
899		.async_size		= sizeof(struct io_async_rw),
900	},
901	[IORING_OP_WRITEV] = {
902		.needs_file		= 1,
903		.hash_reg_file		= 1,
904		.unbound_nonreg_file	= 1,
905		.pollout		= 1,
906		.needs_async_setup	= 1,
907		.plug			= 1,
908		.async_size		= sizeof(struct io_async_rw),
909	},
910	[IORING_OP_FSYNC] = {
911		.needs_file		= 1,
912	},
913	[IORING_OP_READ_FIXED] = {
914		.needs_file		= 1,
915		.unbound_nonreg_file	= 1,
916		.pollin			= 1,
917		.plug			= 1,
918		.async_size		= sizeof(struct io_async_rw),
919	},
920	[IORING_OP_WRITE_FIXED] = {
921		.needs_file		= 1,
922		.hash_reg_file		= 1,
923		.unbound_nonreg_file	= 1,
924		.pollout		= 1,
925		.plug			= 1,
926		.async_size		= sizeof(struct io_async_rw),
927	},
928	[IORING_OP_POLL_ADD] = {
929		.needs_file		= 1,
930		.unbound_nonreg_file	= 1,
931	},
932	[IORING_OP_POLL_REMOVE] = {},
933	[IORING_OP_SYNC_FILE_RANGE] = {
934		.needs_file		= 1,
935	},
936	[IORING_OP_SENDMSG] = {
937		.needs_file		= 1,
938		.unbound_nonreg_file	= 1,
939		.pollout		= 1,
940		.needs_async_setup	= 1,
941		.async_size		= sizeof(struct io_async_msghdr),
942	},
943	[IORING_OP_RECVMSG] = {
944		.needs_file		= 1,
945		.unbound_nonreg_file	= 1,
946		.pollin			= 1,
947		.buffer_select		= 1,
948		.needs_async_setup	= 1,
949		.async_size		= sizeof(struct io_async_msghdr),
950	},
951	[IORING_OP_TIMEOUT] = {
952		.async_size		= sizeof(struct io_timeout_data),
953	},
954	[IORING_OP_TIMEOUT_REMOVE] = {
955		/* used by timeout updates' prep() */
956	},
957	[IORING_OP_ACCEPT] = {
958		.needs_file		= 1,
959		.unbound_nonreg_file	= 1,
960		.pollin			= 1,
961	},
962	[IORING_OP_ASYNC_CANCEL] = {},
963	[IORING_OP_LINK_TIMEOUT] = {
964		.async_size		= sizeof(struct io_timeout_data),
965	},
966	[IORING_OP_CONNECT] = {
967		.needs_file		= 1,
968		.unbound_nonreg_file	= 1,
969		.pollout		= 1,
970		.needs_async_setup	= 1,
971		.async_size		= sizeof(struct io_async_connect),
972	},
973	[IORING_OP_FALLOCATE] = {
974		.needs_file		= 1,
975	},
976	[IORING_OP_OPENAT] = {},
977	[IORING_OP_CLOSE] = {},
978	[IORING_OP_FILES_UPDATE] = {},
979	[IORING_OP_STATX] = {},
980	[IORING_OP_READ] = {
981		.needs_file		= 1,
982		.unbound_nonreg_file	= 1,
983		.pollin			= 1,
984		.buffer_select		= 1,
985		.plug			= 1,
986		.async_size		= sizeof(struct io_async_rw),
987	},
988	[IORING_OP_WRITE] = {
989		.needs_file		= 1,
990		.unbound_nonreg_file	= 1,
991		.pollout		= 1,
992		.plug			= 1,
993		.async_size		= sizeof(struct io_async_rw),
994	},
995	[IORING_OP_FADVISE] = {
996		.needs_file		= 1,
997	},
998	[IORING_OP_MADVISE] = {},
999	[IORING_OP_SEND] = {
1000		.needs_file		= 1,
1001		.unbound_nonreg_file	= 1,
1002		.pollout		= 1,
1003	},
1004	[IORING_OP_RECV] = {
1005		.needs_file		= 1,
1006		.unbound_nonreg_file	= 1,
1007		.pollin			= 1,
1008		.buffer_select		= 1,
1009	},
1010	[IORING_OP_OPENAT2] = {
1011	},
1012	[IORING_OP_EPOLL_CTL] = {
1013		.unbound_nonreg_file	= 1,
1014	},
1015	[IORING_OP_SPLICE] = {
1016		.needs_file		= 1,
1017		.hash_reg_file		= 1,
1018		.unbound_nonreg_file	= 1,
1019	},
1020	[IORING_OP_PROVIDE_BUFFERS] = {},
1021	[IORING_OP_REMOVE_BUFFERS] = {},
1022	[IORING_OP_TEE] = {
1023		.needs_file		= 1,
1024		.hash_reg_file		= 1,
1025		.unbound_nonreg_file	= 1,
1026	},
1027	[IORING_OP_SHUTDOWN] = {
1028		.needs_file		= 1,
1029	},
1030	[IORING_OP_RENAMEAT] = {},
1031	[IORING_OP_UNLINKAT] = {},
1032};
1033
1034static bool io_disarm_next(struct io_kiocb *req);
1035static void io_uring_del_tctx_node(unsigned long index);
1036static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1037					 struct task_struct *task,
1038					 bool cancel_all);
1039static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1040
1041static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1042				 long res, unsigned int cflags);
1043static void io_put_req(struct io_kiocb *req);
1044static void io_put_req_deferred(struct io_kiocb *req, int nr);
1045static void io_dismantle_req(struct io_kiocb *req);
1046static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
1047static void io_queue_linked_timeout(struct io_kiocb *req);
1048static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
1049				     struct io_uring_rsrc_update2 *up,
1050				     unsigned nr_args);
1051static void io_clean_op(struct io_kiocb *req);
1052static struct file *io_file_get(struct io_ring_ctx *ctx,
1053				struct io_kiocb *req, int fd, bool fixed);
1054static void __io_queue_sqe(struct io_kiocb *req);
1055static void io_rsrc_put_work(struct work_struct *work);
1056
1057static void io_req_task_queue(struct io_kiocb *req);
1058static void io_submit_flush_completions(struct io_ring_ctx *ctx);
1059static int io_req_prep_async(struct io_kiocb *req);
1060
1061static struct kmem_cache *req_cachep;
1062
1063static const struct file_operations io_uring_fops;
1064
1065struct sock *io_uring_get_socket(struct file *file)
1066{
1067#if defined(CONFIG_UNIX)
1068	if (file->f_op == &io_uring_fops) {
1069		struct io_ring_ctx *ctx = file->private_data;
1070
1071		return ctx->ring_sock->sk;
1072	}
1073#endif
1074	return NULL;
1075}
1076EXPORT_SYMBOL(io_uring_get_socket);
1077
1078#define io_for_each_link(pos, head) \
1079	for (pos = (head); pos; pos = pos->link)
1080
1081static inline void io_req_set_rsrc_node(struct io_kiocb *req)
1082{
1083	struct io_ring_ctx *ctx = req->ctx;
1084
1085	if (!req->fixed_rsrc_refs) {
1086		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1087		percpu_ref_get(req->fixed_rsrc_refs);
1088	}
1089}
1090
1091static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1092{
1093	bool got = percpu_ref_tryget(ref);
1094
1095	/* already at zero, wait for ->release() */
1096	if (!got)
1097		wait_for_completion(compl);
1098	percpu_ref_resurrect(ref);
1099	if (got)
1100		percpu_ref_put(ref);
1101}
1102
1103static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1104			  bool cancel_all)
1105{
1106	struct io_kiocb *req;
1107
1108	if (task && head->task != task)
1109		return false;
1110	if (cancel_all)
1111		return true;
1112
1113	io_for_each_link(req, head) {
1114		if (req->flags & REQ_F_INFLIGHT)
1115			return true;
1116	}
1117	return false;
1118}
1119
1120static inline void req_set_fail(struct io_kiocb *req)
1121{
1122	req->flags |= REQ_F_FAIL;
1123}
1124
1125static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1126{
1127	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1128
1129	complete(&ctx->ref_comp);
1130}
1131
1132static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1133{
1134	return !req->timeout.off;
1135}
1136
1137static void io_fallback_req_func(struct work_struct *work)
1138{
1139	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1140						fallback_work.work);
1141	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1142	struct io_kiocb *req, *tmp;
1143
1144	percpu_ref_get(&ctx->refs);
1145	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1146		req->io_task_work.func(req);
1147	percpu_ref_put(&ctx->refs);
1148}
1149
1150static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1151{
1152	struct io_ring_ctx *ctx;
1153	int hash_bits;
1154
1155	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1156	if (!ctx)
1157		return NULL;
1158
1159	/*
1160	 * Use 5 bits less than the max cq entries, that should give us around
1161	 * 32 entries per hash list if totally full and uniformly spread.
1162	 */
1163	hash_bits = ilog2(p->cq_entries);
1164	hash_bits -= 5;
1165	if (hash_bits <= 0)
1166		hash_bits = 1;
1167	ctx->cancel_hash_bits = hash_bits;
1168	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1169					GFP_KERNEL);
1170	if (!ctx->cancel_hash)
1171		goto err;
1172	__hash_init(ctx->cancel_hash, 1U << hash_bits);
1173
1174	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1175	if (!ctx->dummy_ubuf)
1176		goto err;
1177	/* set invalid range, so io_import_fixed() fails meeting it */
1178	ctx->dummy_ubuf->ubuf = -1UL;
1179
1180	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1181			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1182		goto err;
1183
1184	ctx->flags = p->flags;
1185	init_waitqueue_head(&ctx->sqo_sq_wait);
1186	INIT_LIST_HEAD(&ctx->sqd_list);
1187	init_waitqueue_head(&ctx->poll_wait);
1188	INIT_LIST_HEAD(&ctx->cq_overflow_list);
1189	init_completion(&ctx->ref_comp);
1190	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1191	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1192	mutex_init(&ctx->uring_lock);
1193	init_waitqueue_head(&ctx->cq_wait);
1194	spin_lock_init(&ctx->completion_lock);
1195	spin_lock_init(&ctx->timeout_lock);
1196	INIT_LIST_HEAD(&ctx->iopoll_list);
1197	INIT_LIST_HEAD(&ctx->defer_list);
1198	INIT_LIST_HEAD(&ctx->timeout_list);
1199	spin_lock_init(&ctx->rsrc_ref_lock);
1200	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1201	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1202	init_llist_head(&ctx->rsrc_put_llist);
1203	INIT_LIST_HEAD(&ctx->tctx_list);
1204	INIT_LIST_HEAD(&ctx->submit_state.free_list);
1205	INIT_LIST_HEAD(&ctx->locked_free_list);
1206	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1207	return ctx;
1208err:
1209	kfree(ctx->dummy_ubuf);
1210	kfree(ctx->cancel_hash);
1211	kfree(ctx);
1212	return NULL;
1213}
1214
1215static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1216{
1217	struct io_rings *r = ctx->rings;
1218
1219	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1220	ctx->cq_extra--;
1221}
1222
1223static bool req_need_defer(struct io_kiocb *req, u32 seq)
1224{
1225	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1226		struct io_ring_ctx *ctx = req->ctx;
1227
1228		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1229	}
1230
1231	return false;
1232}
1233
1234#define FFS_ASYNC_READ		0x1UL
1235#define FFS_ASYNC_WRITE		0x2UL
1236#ifdef CONFIG_64BIT
1237#define FFS_ISREG		0x4UL
1238#else
1239#define FFS_ISREG		0x0UL
1240#endif
1241#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
1242
1243static inline bool io_req_ffs_set(struct io_kiocb *req)
1244{
1245	return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1246}
1247
1248static void io_req_track_inflight(struct io_kiocb *req)
1249{
1250	if (!(req->flags & REQ_F_INFLIGHT)) {
1251		req->flags |= REQ_F_INFLIGHT;
1252		atomic_inc(&current->io_uring->inflight_tracked);
1253	}
1254}
1255
1256static void io_prep_async_work(struct io_kiocb *req)
1257{
1258	const struct io_op_def *def = &io_op_defs[req->opcode];
1259	struct io_ring_ctx *ctx = req->ctx;
1260
1261	if (!(req->flags & REQ_F_CREDS)) {
1262		req->flags |= REQ_F_CREDS;
1263		req->creds = get_current_cred();
1264	}
1265
1266	req->work.list.next = NULL;
1267	req->work.flags = 0;
1268	if (req->flags & REQ_F_FORCE_ASYNC)
1269		req->work.flags |= IO_WQ_WORK_CONCURRENT;
1270
1271	if (req->flags & REQ_F_ISREG) {
1272		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1273			io_wq_hash_work(&req->work, file_inode(req->file));
1274	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1275		if (def->unbound_nonreg_file)
1276			req->work.flags |= IO_WQ_WORK_UNBOUND;
1277	}
1278
1279	switch (req->opcode) {
1280	case IORING_OP_SPLICE:
1281	case IORING_OP_TEE:
1282		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1283			req->work.flags |= IO_WQ_WORK_UNBOUND;
1284		break;
1285	}
1286}
1287
1288static void io_prep_async_link(struct io_kiocb *req)
1289{
1290	struct io_kiocb *cur;
1291
1292	if (req->flags & REQ_F_LINK_TIMEOUT) {
1293		struct io_ring_ctx *ctx = req->ctx;
1294
1295		spin_lock_irq(&ctx->completion_lock);
1296		io_for_each_link(cur, req)
1297			io_prep_async_work(cur);
1298		spin_unlock_irq(&ctx->completion_lock);
1299	} else {
1300		io_for_each_link(cur, req)
1301			io_prep_async_work(cur);
1302	}
1303}
1304
1305static void io_queue_async_work(struct io_kiocb *req)
1306{
1307	struct io_ring_ctx *ctx = req->ctx;
1308	struct io_kiocb *link = io_prep_linked_timeout(req);
1309	struct io_uring_task *tctx = req->task->io_uring;
1310
1311	BUG_ON(!tctx);
1312	BUG_ON(!tctx->io_wq);
1313
1314	/* init ->work of the whole link before punting */
1315	io_prep_async_link(req);
1316
1317	/*
1318	 * Not expected to happen, but if we do have a bug where this _can_
1319	 * happen, catch it here and ensure the request is marked as
1320	 * canceled. That will make io-wq go through the usual work cancel
1321	 * procedure rather than attempt to run this request (or create a new
1322	 * worker for it).
1323	 */
1324	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1325		req->work.flags |= IO_WQ_WORK_CANCEL;
1326
1327	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1328					&req->work, req->flags);
1329	io_wq_enqueue(tctx->io_wq, &req->work);
1330	if (link)
1331		io_queue_linked_timeout(link);
1332}
1333
1334static void io_kill_timeout(struct io_kiocb *req, int status)
1335	__must_hold(&req->ctx->completion_lock)
1336	__must_hold(&req->ctx->timeout_lock)
1337{
1338	struct io_timeout_data *io = req->async_data;
1339
1340	if (hrtimer_try_to_cancel(&io->timer) != -1) {
1341		atomic_set(&req->ctx->cq_timeouts,
1342			atomic_read(&req->ctx->cq_timeouts) + 1);
1343		list_del_init(&req->timeout.list);
1344		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
1345		io_put_req_deferred(req, 1);
1346	}
1347}
1348
1349static void io_queue_deferred(struct io_ring_ctx *ctx)
1350{
1351	while (!list_empty(&ctx->defer_list)) {
1352		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1353						struct io_defer_entry, list);
1354
1355		if (req_need_defer(de->req, de->seq))
1356			break;
1357		list_del_init(&de->list);
1358		io_req_task_queue(de->req);
1359		kfree(de);
1360	}
1361}
1362
1363static void io_flush_timeouts(struct io_ring_ctx *ctx)
1364	__must_hold(&ctx->completion_lock)
1365{
1366	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1367	unsigned long flags;
1368
1369	spin_lock_irqsave(&ctx->timeout_lock, flags);
1370	while (!list_empty(&ctx->timeout_list)) {
1371		u32 events_needed, events_got;
1372		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1373						struct io_kiocb, timeout.list);
1374
1375		if (io_is_timeout_noseq(req))
1376			break;
1377
1378		/*
1379		 * Since seq can easily wrap around over time, subtract
1380		 * the last seq at which timeouts were flushed before comparing.
1381		 * Assuming not more than 2^31-1 events have happened since,
1382		 * these subtractions won't have wrapped, so we can check if
1383		 * target is in [last_seq, current_seq] by comparing the two.
1384		 */
1385		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1386		events_got = seq - ctx->cq_last_tm_flush;
1387		if (events_got < events_needed)
1388			break;
1389
1390		list_del_init(&req->timeout.list);
1391		io_kill_timeout(req, 0);
1392	}
1393	ctx->cq_last_tm_flush = seq;
1394	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
1395}
1396
1397static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1398{
1399	if (ctx->off_timeout_used)
1400		io_flush_timeouts(ctx);
1401	if (ctx->drain_active)
1402		io_queue_deferred(ctx);
1403}
1404
1405static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1406{
1407	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1408		__io_commit_cqring_flush(ctx);
1409	/* order cqe stores with ring update */
1410	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1411}
1412
1413static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1414{
1415	struct io_rings *r = ctx->rings;
1416
1417	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1418}
1419
1420static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1421{
1422	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1423}
1424
1425static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1426{
1427	struct io_rings *rings = ctx->rings;
1428	unsigned tail, mask = ctx->cq_entries - 1;
1429
1430	/*
1431	 * writes to the cq entry need to come after reading head; the
1432	 * control dependency is enough as we're using WRITE_ONCE to
1433	 * fill the cq entry
1434	 */
1435	if (__io_cqring_events(ctx) == ctx->cq_entries)
1436		return NULL;
1437
1438	tail = ctx->cached_cq_tail++;
1439	return &rings->cqes[tail & mask];
1440}
1441
1442static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1443{
1444	if (likely(!ctx->cq_ev_fd))
1445		return false;
1446	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1447		return false;
1448	return !ctx->eventfd_async || io_wq_current_is_worker();
1449}
1450
1451static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1452{
1453	/*
1454	 * wake_up_all() may seem excessive, but io_wake_function() and
1455	 * io_should_wake() handle the termination of the loop and only
1456	 * wake as many waiters as we need to.
1457	 */
1458	if (wq_has_sleeper(&ctx->cq_wait))
1459		wake_up_all(&ctx->cq_wait);
1460	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1461		wake_up(&ctx->sq_data->wait);
1462	if (io_should_trigger_evfd(ctx))
1463		eventfd_signal(ctx->cq_ev_fd, 1);
1464	if (waitqueue_active(&ctx->poll_wait)) {
1465		wake_up_interruptible(&ctx->poll_wait);
1466		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1467	}
1468}
1469
1470static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1471{
1472	if (ctx->flags & IORING_SETUP_SQPOLL) {
1473		if (wq_has_sleeper(&ctx->cq_wait))
1474			wake_up_all(&ctx->cq_wait);
1475	}
1476	if (io_should_trigger_evfd(ctx))
1477		eventfd_signal(ctx->cq_ev_fd, 1);
1478	if (waitqueue_active(&ctx->poll_wait)) {
1479		wake_up_interruptible(&ctx->poll_wait);
1480		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1481	}
1482}
1483
1484/* Returns true if there are no backlogged entries after the flush */
1485static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1486{
1487	unsigned long flags;
1488	bool all_flushed, posted;
1489
1490	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1491		return false;
1492
1493	posted = false;
1494	spin_lock_irqsave(&ctx->completion_lock, flags);
1495	while (!list_empty(&ctx->cq_overflow_list)) {
1496		struct io_uring_cqe *cqe = io_get_cqe(ctx);
1497		struct io_overflow_cqe *ocqe;
1498
1499		if (!cqe && !force)
1500			break;
1501		ocqe = list_first_entry(&ctx->cq_overflow_list,
1502					struct io_overflow_cqe, list);
1503		if (cqe)
1504			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1505		else
1506			io_account_cq_overflow(ctx);
1507
1508		posted = true;
1509		list_del(&ocqe->list);
1510		kfree(ocqe);
1511	}
1512
1513	all_flushed = list_empty(&ctx->cq_overflow_list);
1514	if (all_flushed) {
1515		clear_bit(0, &ctx->check_cq_overflow);
1516		WRITE_ONCE(ctx->rings->sq_flags,
1517			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1518	}
1519
1520	if (posted)
1521		io_commit_cqring(ctx);
1522	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1523	if (posted)
1524		io_cqring_ev_posted(ctx);
1525	return all_flushed;
1526}
1527
1528static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1529{
1530	bool ret = true;
1531
1532	if (test_bit(0, &ctx->check_cq_overflow)) {
1533		/* iopoll syncs against uring_lock, not completion_lock */
1534		if (ctx->flags & IORING_SETUP_IOPOLL)
1535			mutex_lock(&ctx->uring_lock);
1536		ret = __io_cqring_overflow_flush(ctx, false);
1537		if (ctx->flags & IORING_SETUP_IOPOLL)
1538			mutex_unlock(&ctx->uring_lock);
1539	}
1540
1541	return ret;
1542}
1543
1544/*
1545 * Shamelessly stolen from the mm implementation of page reference checking,
1546 * see commit f958d7b528b1 for details.
1547 */
1548#define req_ref_zero_or_close_to_overflow(req)	\
1549	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
1550
1551static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1552{
1553	return atomic_inc_not_zero(&req->refs);
1554}
1555
1556static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
1557{
1558	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1559	return atomic_sub_and_test(refs, &req->refs);
1560}
1561
1562static inline bool req_ref_put_and_test(struct io_kiocb *req)
1563{
1564	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1565	return atomic_dec_and_test(&req->refs);
1566}
1567
1568static inline void req_ref_put(struct io_kiocb *req)
1569{
1570	WARN_ON_ONCE(req_ref_put_and_test(req));
1571}
1572
1573static inline void req_ref_get(struct io_kiocb *req)
1574{
1575	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1576	atomic_inc(&req->refs);
1577}
1578
1579/* must to be called somewhat shortly after putting a request */
1580static inline void io_put_task(struct task_struct *task, int nr)
1581{
1582	struct io_uring_task *tctx = task->io_uring;
1583
1584	percpu_counter_sub(&tctx->inflight, nr);
1585	if (unlikely(atomic_read(&tctx->in_idle)))
1586		wake_up(&tctx->wait);
1587	put_task_struct_many(task, nr);
1588}
1589
1590static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1591				     long res, unsigned int cflags)
1592{
1593	struct io_overflow_cqe *ocqe;
1594
1595	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1596	if (!ocqe) {
1597		/*
1598		 * If we're in ring overflow flush mode, or in task cancel mode,
1599		 * or cannot allocate an overflow entry, then we need to drop it
1600		 * on the floor.
1601		 */
1602		io_account_cq_overflow(ctx);
1603		return false;
1604	}
1605	if (list_empty(&ctx->cq_overflow_list)) {
1606		set_bit(0, &ctx->check_cq_overflow);
1607		WRITE_ONCE(ctx->rings->sq_flags,
1608			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1609
1610	}
1611	ocqe->cqe.user_data = user_data;
1612	ocqe->cqe.res = res;
1613	ocqe->cqe.flags = cflags;
1614	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1615	return true;
1616}
1617
1618static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1619					  long res, unsigned int cflags)
1620{
1621	struct io_uring_cqe *cqe;
1622
1623	trace_io_uring_complete(ctx, user_data, res, cflags);
1624
1625	/*
1626	 * If we can't get a cq entry, userspace overflowed the
1627	 * submission (by quite a lot). Increment the overflow count in
1628	 * the ring.
1629	 */
1630	cqe = io_get_cqe(ctx);
1631	if (likely(cqe)) {
1632		WRITE_ONCE(cqe->user_data, user_data);
1633		WRITE_ONCE(cqe->res, res);
1634		WRITE_ONCE(cqe->flags, cflags);
1635		return true;
1636	}
1637	return io_cqring_event_overflow(ctx, user_data, res, cflags);
1638}
1639
1640/* not as hot to bloat with inlining */
1641static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1642					  long res, unsigned int cflags)
1643{
1644	return __io_cqring_fill_event(ctx, user_data, res, cflags);
1645}
1646
1647static void io_req_complete_post(struct io_kiocb *req, long res,
1648				 unsigned int cflags)
1649{
1650	struct io_ring_ctx *ctx = req->ctx;
1651	unsigned long flags;
1652
1653	spin_lock_irqsave(&ctx->completion_lock, flags);
1654	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
1655	/*
1656	 * If we're the last reference to this request, add to our locked
1657	 * free_list cache.
1658	 */
1659	if (req_ref_put_and_test(req)) {
1660		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1661			if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
1662				io_disarm_next(req);
1663			if (req->link) {
1664				io_req_task_queue(req->link);
1665				req->link = NULL;
1666			}
1667		}
1668		io_dismantle_req(req);
1669		io_put_task(req->task, 1);
1670		list_add(&req->inflight_entry, &ctx->locked_free_list);
1671		ctx->locked_free_nr++;
1672	} else {
1673		if (!percpu_ref_tryget(&ctx->refs))
1674			req = NULL;
1675	}
1676	io_commit_cqring(ctx);
1677	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1678
1679	if (req) {
1680		io_cqring_ev_posted(ctx);
1681		percpu_ref_put(&ctx->refs);
1682	}
1683}
1684
1685static inline bool io_req_needs_clean(struct io_kiocb *req)
1686{
1687	return req->flags & IO_REQ_CLEAN_FLAGS;
1688}
1689
1690static void io_req_complete_state(struct io_kiocb *req, long res,
1691				  unsigned int cflags)
1692{
1693	if (io_req_needs_clean(req))
1694		io_clean_op(req);
1695	req->result = res;
1696	req->compl.cflags = cflags;
1697	req->flags |= REQ_F_COMPLETE_INLINE;
1698}
1699
1700static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1701				     long res, unsigned cflags)
1702{
1703	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1704		io_req_complete_state(req, res, cflags);
1705	else
1706		io_req_complete_post(req, res, cflags);
1707}
1708
1709static inline void io_req_complete(struct io_kiocb *req, long res)
1710{
1711	__io_req_complete(req, 0, res, 0);
1712}
1713
1714static void io_req_complete_failed(struct io_kiocb *req, long res)
1715{
1716	req_set_fail(req);
1717	io_put_req(req);
1718	io_req_complete_post(req, res, 0);
1719}
1720
1721/*
1722 * Don't initialise the fields below on every allocation, but do that in
1723 * advance and keep them valid across allocations.
1724 */
1725static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1726{
1727	req->ctx = ctx;
1728	req->link = NULL;
1729	req->async_data = NULL;
1730	/* not necessary, but safer to zero */
1731	req->result = 0;
1732}
1733
1734static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1735					struct io_submit_state *state)
1736{
1737	spin_lock_irq(&ctx->completion_lock);
1738	list_splice_init(&ctx->locked_free_list, &state->free_list);
1739	ctx->locked_free_nr = 0;
1740	spin_unlock_irq(&ctx->completion_lock);
1741}
1742
1743/* Returns true IFF there are requests in the cache */
1744static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1745{
1746	struct io_submit_state *state = &ctx->submit_state;
1747	int nr;
1748
1749	/*
1750	 * If we have more than a batch's worth of requests in our IRQ side
1751	 * locked cache, grab the lock and move them over to our submission
1752	 * side cache.
1753	 */
1754	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1755		io_flush_cached_locked_reqs(ctx, state);
1756
1757	nr = state->free_reqs;
1758	while (!list_empty(&state->free_list)) {
1759		struct io_kiocb *req = list_first_entry(&state->free_list,
1760					struct io_kiocb, inflight_entry);
1761
1762		list_del(&req->inflight_entry);
1763		state->reqs[nr++] = req;
1764		if (nr == ARRAY_SIZE(state->reqs))
1765			break;
1766	}
1767
1768	state->free_reqs = nr;
1769	return nr != 0;
1770}
1771
1772static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1773{
1774	struct io_submit_state *state = &ctx->submit_state;
1775	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1776	int ret, i;
1777
1778	BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
1779
1780	if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
1781		goto got_req;
1782
1783	ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1784				    state->reqs);
1785
1786	/*
1787	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1788	 * retry single alloc to be on the safe side.
1789	 */
1790	if (unlikely(ret <= 0)) {
1791		state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1792		if (!state->reqs[0])
1793			return NULL;
1794		ret = 1;
1795	}
1796
1797	for (i = 0; i < ret; i++)
1798		io_preinit_req(state->reqs[i], ctx);
1799	state->free_reqs = ret;
1800got_req:
1801	state->free_reqs--;
1802	return state->reqs[state->free_reqs];
1803}
1804
1805static inline void io_put_file(struct file *file)
1806{
1807	if (file)
1808		fput(file);
1809}
1810
1811static void io_dismantle_req(struct io_kiocb *req)
1812{
1813	unsigned int flags = req->flags;
1814
1815	if (io_req_needs_clean(req))
1816		io_clean_op(req);
1817	if (!(flags & REQ_F_FIXED_FILE))
1818		io_put_file(req->file);
1819	if (req->fixed_rsrc_refs)
1820		percpu_ref_put(req->fixed_rsrc_refs);
1821	if (req->async_data) {
1822		kfree(req->async_data);
1823		req->async_data = NULL;
1824	}
1825}
1826
1827static void __io_free_req(struct io_kiocb *req)
1828{
1829	struct io_ring_ctx *ctx = req->ctx;
1830	unsigned long flags;
1831
1832	io_dismantle_req(req);
1833	io_put_task(req->task, 1);
1834
1835	spin_lock_irqsave(&ctx->completion_lock, flags);
1836	list_add(&req->inflight_entry, &ctx->locked_free_list);
1837	ctx->locked_free_nr++;
1838	spin_unlock_irqrestore(&ctx->completion_lock, flags);
1839
1840	percpu_ref_put(&ctx->refs);
1841}
1842
1843static inline void io_remove_next_linked(struct io_kiocb *req)
1844{
1845	struct io_kiocb *nxt = req->link;
1846
1847	req->link = nxt->link;
1848	nxt->link = NULL;
1849}
1850
1851static bool io_kill_linked_timeout(struct io_kiocb *req)
1852	__must_hold(&req->ctx->completion_lock)
1853	__must_hold(&req->ctx->timeout_lock)
1854{
1855	struct io_kiocb *link = req->link;
1856
1857	/*
1858	 * Can happen if a linked timeout fired and link had been like
1859	 * req -> link t-out -> link t-out [-> ...]
1860	 */
1861	if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1862		struct io_timeout_data *io = link->async_data;
1863
1864		io_remove_next_linked(req);
1865		link->timeout.head = NULL;
1866		if (hrtimer_try_to_cancel(&io->timer) != -1) {
1867			io_cqring_fill_event(link->ctx, link->user_data,
1868					     -ECANCELED, 0);
1869			io_put_req_deferred(link, 1);
1870			return true;
1871		}
1872	}
1873	return false;
1874}
1875
1876static void io_fail_links(struct io_kiocb *req)
1877	__must_hold(&req->ctx->completion_lock)
1878{
1879	struct io_kiocb *nxt, *link = req->link;
1880
1881	req->link = NULL;
1882	while (link) {
1883		nxt = link->link;
1884		link->link = NULL;
1885
1886		trace_io_uring_fail_link(req, link);
1887		io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
1888		io_put_req_deferred(link, 2);
1889		link = nxt;
1890	}
1891}
1892
1893static bool io_disarm_next(struct io_kiocb *req)
1894	__must_hold(&req->ctx->completion_lock)
1895{
1896	bool posted = false;
1897
1898	if (likely(req->flags & REQ_F_LINK_TIMEOUT)) {
1899		struct io_ring_ctx *ctx = req->ctx;
1900
1901		spin_lock_irq(&ctx->timeout_lock);
1902		posted = io_kill_linked_timeout(req);
1903		spin_unlock_irq(&ctx->timeout_lock);
1904	}
1905	if (unlikely((req->flags & REQ_F_FAIL) &&
1906		     !(req->flags & REQ_F_HARDLINK))) {
1907		posted |= (req->link != NULL);
1908		io_fail_links(req);
1909	}
1910	return posted;
1911}
1912
1913static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1914{
1915	struct io_kiocb *nxt;
1916
1917	/*
1918	 * If LINK is set, we have dependent requests in this chain. If we
1919	 * didn't fail this request, queue the first one up, moving any other
1920	 * dependencies to the next request. In case of failure, fail the rest
1921	 * of the chain.
1922	 */
1923	if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
1924		struct io_ring_ctx *ctx = req->ctx;
1925		unsigned long flags;
1926		bool posted;
1927
1928		spin_lock_irqsave(&ctx->completion_lock, flags);
1929		posted = io_disarm_next(req);
1930		if (posted)
1931			io_commit_cqring(req->ctx);
1932		spin_unlock_irqrestore(&ctx->completion_lock, flags);
1933		if (posted)
1934			io_cqring_ev_posted(ctx);
1935	}
1936	nxt = req->link;
1937	req->link = NULL;
1938	return nxt;
1939}
1940
1941static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1942{
1943	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
1944		return NULL;
1945	return __io_req_find_next(req);
1946}
1947
1948static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1949{
1950	if (!ctx)
1951		return;
1952	if (ctx->submit_state.compl_nr) {
1953		mutex_lock(&ctx->uring_lock);
1954		io_submit_flush_completions(ctx);
1955		mutex_unlock(&ctx->uring_lock);
1956	}
1957	percpu_ref_put(&ctx->refs);
1958}
1959
1960static void tctx_task_work(struct callback_head *cb)
1961{
1962	struct io_ring_ctx *ctx = NULL;
1963	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
1964						  task_work);
1965
1966	while (1) {
1967		struct io_wq_work_node *node;
1968
1969		spin_lock_irq(&tctx->task_lock);
1970		node = tctx->task_list.first;
1971		INIT_WQ_LIST(&tctx->task_list);
1972		if (!node)
1973			tctx->task_running = false;
1974		spin_unlock_irq(&tctx->task_lock);
1975		if (!node)
1976			break;
1977
1978		do {
1979			struct io_wq_work_node *next = node->next;
1980			struct io_kiocb *req = container_of(node, struct io_kiocb,
1981							    io_task_work.node);
1982
1983			if (req->ctx != ctx) {
1984				ctx_flush_and_put(ctx);
1985				ctx = req->ctx;
1986				percpu_ref_get(&ctx->refs);
1987			}
1988			req->io_task_work.func(req);
1989			node = next;
1990		} while (node);
1991
1992		cond_resched();
1993	}
1994
1995	ctx_flush_and_put(ctx);
1996}
1997
1998static void io_req_task_work_add(struct io_kiocb *req)
1999{
2000	struct task_struct *tsk = req->task;
2001	struct io_uring_task *tctx = tsk->io_uring;
2002	enum task_work_notify_mode notify;
2003	struct io_wq_work_node *node;
2004	unsigned long flags;
2005	bool running;
2006
2007	WARN_ON_ONCE(!tctx);
2008
2009	spin_lock_irqsave(&tctx->task_lock, flags);
2010	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2011	running = tctx->task_running;
2012	if (!running)
2013		tctx->task_running = true;
2014	spin_unlock_irqrestore(&tctx->task_lock, flags);
2015
2016	/* task_work already pending, we're done */
2017	if (running)
2018		return;
2019
2020	/*
2021	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2022	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2023	 * processing task_work. There's no reliable way to tell if TWA_RESUME
2024	 * will do the job.
2025	 */
2026	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2027	if (!task_work_add(tsk, &tctx->task_work, notify)) {
2028		wake_up_process(tsk);
2029		return;
2030	}
2031
2032	spin_lock_irqsave(&tctx->task_lock, flags);
2033	tctx->task_running = false;
2034	node = tctx->task_list.first;
2035	INIT_WQ_LIST(&tctx->task_list);
2036	spin_unlock_irqrestore(&tctx->task_lock, flags);
2037
2038	while (node) {
2039		req = container_of(node, struct io_kiocb, io_task_work.node);
2040		node = node->next;
2041		if (llist_add(&req->io_task_work.fallback_node,
2042			      &req->ctx->fallback_llist))
2043			schedule_delayed_work(&req->ctx->fallback_work, 1);
2044	}
2045}
2046
2047static void io_req_task_cancel(struct io_kiocb *req)
2048{
2049	struct io_ring_ctx *ctx = req->ctx;
2050
2051	/* ctx is guaranteed to stay alive while we hold uring_lock */
2052	mutex_lock(&ctx->uring_lock);
2053	io_req_complete_failed(req, req->result);
2054	mutex_unlock(&ctx->uring_lock);
2055}
2056
2057static void io_req_task_submit(struct io_kiocb *req)
2058{
2059	struct io_ring_ctx *ctx = req->ctx;
2060
2061	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
2062	mutex_lock(&ctx->uring_lock);
2063	if (likely(!(req->task->flags & PF_EXITING)))
2064		__io_queue_sqe(req);
2065	else
2066		io_req_complete_failed(req, -EFAULT);
2067	mutex_unlock(&ctx->uring_lock);
2068}
2069
2070static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2071{
2072	req->result = ret;
2073	req->io_task_work.func = io_req_task_cancel;
2074	io_req_task_work_add(req);
2075}
2076
2077static void io_req_task_queue(struct io_kiocb *req)
2078{
2079	req->io_task_work.func = io_req_task_submit;
2080	io_req_task_work_add(req);
2081}
2082
2083static void io_req_task_queue_reissue(struct io_kiocb *req)
2084{
2085	req->io_task_work.func = io_queue_async_work;
2086	io_req_task_work_add(req);
2087}
2088
2089static inline void io_queue_next(struct io_kiocb *req)
2090{
2091	struct io_kiocb *nxt = io_req_find_next(req);
2092
2093	if (nxt)
2094		io_req_task_queue(nxt);
2095}
2096
2097static void io_free_req(struct io_kiocb *req)
2098{
2099	io_queue_next(req);
2100	__io_free_req(req);
2101}
2102
2103struct req_batch {
2104	struct task_struct	*task;
2105	int			task_refs;
2106	int			ctx_refs;
2107};
2108
2109static inline void io_init_req_batch(struct req_batch *rb)
2110{
2111	rb->task_refs = 0;
2112	rb->ctx_refs = 0;
2113	rb->task = NULL;
2114}
2115
2116static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2117				     struct req_batch *rb)
2118{
2119	if (rb->ctx_refs)
2120		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2121	if (rb->task == current)
2122		current->io_uring->cached_refs += rb->task_refs;
2123	else if (rb->task)
2124		io_put_task(rb->task, rb->task_refs);
2125}
2126
2127static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2128			      struct io_submit_state *state)
2129{
2130	io_queue_next(req);
2131	io_dismantle_req(req);
2132
2133	if (req->task != rb->task) {
2134		if (rb->task)
2135			io_put_task(rb->task, rb->task_refs);
2136		rb->task = req->task;
2137		rb->task_refs = 0;
2138	}
2139	rb->task_refs++;
2140	rb->ctx_refs++;
2141
2142	if (state->free_reqs != ARRAY_SIZE(state->reqs))
2143		state->reqs[state->free_reqs++] = req;
2144	else
2145		list_add(&req->inflight_entry, &state->free_list);
2146}
2147
2148static void io_submit_flush_completions(struct io_ring_ctx *ctx)
2149	__must_hold(&req->ctx->uring_lock)
2150{
2151	struct io_submit_state *state = &ctx->submit_state;
2152	int i, nr = state->compl_nr;
2153	struct req_batch rb;
2154
2155	spin_lock_irq(&ctx->completion_lock);
2156	for (i = 0; i < nr; i++) {
2157		struct io_kiocb *req = state->compl_reqs[i];
2158
2159		__io_cqring_fill_event(ctx, req->user_data, req->result,
2160					req->compl.cflags);
2161	}
2162	io_commit_cqring(ctx);
2163	spin_unlock_irq(&ctx->completion_lock);
2164	io_cqring_ev_posted(ctx);
2165
2166	io_init_req_batch(&rb);
2167	for (i = 0; i < nr; i++) {
2168		struct io_kiocb *req = state->compl_reqs[i];
2169
2170		/* submission and completion refs */
2171		if (req_ref_sub_and_test(req, 2))
2172			io_req_free_batch(&rb, req, &ctx->submit_state);
2173	}
2174
2175	io_req_free_batch_finish(ctx, &rb);
2176	state->compl_nr = 0;
2177}
2178
2179/*
2180 * Drop reference to request, return next in chain (if there is one) if this
2181 * was the last reference to this request.
2182 */
2183static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2184{
2185	struct io_kiocb *nxt = NULL;
2186
2187	if (req_ref_put_and_test(req)) {
2188		nxt = io_req_find_next(req);
2189		__io_free_req(req);
2190	}
2191	return nxt;
2192}
2193
2194static inline void io_put_req(struct io_kiocb *req)
2195{
2196	if (req_ref_put_and_test(req))
2197		io_free_req(req);
2198}
2199
2200static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2201{
2202	if (req_ref_sub_and_test(req, refs)) {
2203		req->io_task_work.func = io_free_req;
2204		io_req_task_work_add(req);
2205	}
2206}
2207
2208static unsigned io_cqring_events(struct io_ring_ctx *ctx)
2209{
2210	/* See comment at the top of this file */
2211	smp_rmb();
2212	return __io_cqring_events(ctx);
2213}
2214
2215static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2216{
2217	struct io_rings *rings = ctx->rings;
2218
2219	/* make sure SQ entry isn't read before tail */
2220	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2221}
2222
2223static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2224{
2225	unsigned int cflags;
2226
2227	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2228	cflags |= IORING_CQE_F_BUFFER;
2229	req->flags &= ~REQ_F_BUFFER_SELECTED;
2230	kfree(kbuf);
2231	return cflags;
2232}
2233
2234static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2235{
2236	struct io_buffer *kbuf;
2237
2238	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2239	return io_put_kbuf(req, kbuf);
2240}
2241
2242static inline bool io_run_task_work(void)
2243{
2244	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2245		__set_current_state(TASK_RUNNING);
2246		tracehook_notify_signal();
2247		return true;
2248	}
2249
2250	return false;
2251}
2252
2253/*
2254 * Find and free completed poll iocbs
2255 */
2256static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2257			       struct list_head *done, bool resubmit)
2258{
2259	struct req_batch rb;
2260	struct io_kiocb *req;
2261
2262	/* order with ->result store in io_complete_rw_iopoll() */
2263	smp_rmb();
2264
2265	io_init_req_batch(&rb);
2266	while (!list_empty(done)) {
2267		int cflags = 0;
2268
2269		req = list_first_entry(done, struct io_kiocb, inflight_entry);
2270		list_del(&req->inflight_entry);
2271
2272		if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
2273		    !(req->flags & REQ_F_DONT_REISSUE)) {
2274			req->iopoll_completed = 0;
2275			req_ref_get(req);
2276			io_req_task_queue_reissue(req);
2277			continue;
2278		}
2279
2280		if (req->flags & REQ_F_BUFFER_SELECTED)
2281			cflags = io_put_rw_kbuf(req);
2282
2283		__io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
2284		(*nr_events)++;
2285
2286		if (req_ref_put_and_test(req))
2287			io_req_free_batch(&rb, req, &ctx->submit_state);
2288	}
2289
2290	io_commit_cqring(ctx);
2291	io_cqring_ev_posted_iopoll(ctx);
2292	io_req_free_batch_finish(ctx, &rb);
2293}
2294
2295static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2296			long min, bool resubmit)
2297{
2298	struct io_kiocb *req, *tmp;
2299	LIST_HEAD(done);
2300	bool spin;
2301
2302	/*
2303	 * Only spin for completions if we don't have multiple devices hanging
2304	 * off our complete list, and we're under the requested amount.
2305	 */
2306	spin = !ctx->poll_multi_queue && *nr_events < min;
2307
2308	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2309		struct kiocb *kiocb = &req->rw.kiocb;
2310		int ret;
2311
2312		/*
2313		 * Move completed and retryable entries to our local lists.
2314		 * If we find a request that requires polling, break out
2315		 * and complete those lists first, if we have entries there.
2316		 */
2317		if (READ_ONCE(req->iopoll_completed)) {
2318			list_move_tail(&req->inflight_entry, &done);
2319			continue;
2320		}
2321		if (!list_empty(&done))
2322			break;
2323
2324		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2325		if (unlikely(ret < 0))
2326			return ret;
2327		else if (ret)
2328			spin = false;
2329
2330		/* iopoll may have completed current req */
2331		if (READ_ONCE(req->iopoll_completed))
2332			list_move_tail(&req->inflight_entry, &done);
2333	}
2334
2335	if (!list_empty(&done))
2336		io_iopoll_complete(ctx, nr_events, &done, resubmit);
2337
2338	return 0;
2339}
2340
2341/*
2342 * We can't just wait for polled events to come to us, we have to actively
2343 * find and complete them.
2344 */
2345static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2346{
2347	if (!(ctx->flags & IORING_SETUP_IOPOLL))
2348		return;
2349
2350	mutex_lock(&ctx->uring_lock);
2351	while (!list_empty(&ctx->iopoll_list)) {
2352		unsigned int nr_events = 0;
2353
2354		io_do_iopoll(ctx, &nr_events, 0, false);
2355
2356		/* let it sleep and repeat later if can't complete a request */
2357		if (nr_events == 0)
2358			break;
2359		/*
2360		 * Ensure we allow local-to-the-cpu processing to take place,
2361		 * in this case we need to ensure that we reap all events.
2362		 * Also let task_work, etc. to progress by releasing the mutex
2363		 */
2364		if (need_resched()) {
2365			mutex_unlock(&ctx->uring_lock);
2366			cond_resched();
2367			mutex_lock(&ctx->uring_lock);
2368		}
2369	}
2370	mutex_unlock(&ctx->uring_lock);
2371}
2372
2373static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2374{
2375	unsigned int nr_events = 0;
2376	int ret = 0;
2377
2378	/*
2379	 * We disallow the app entering submit/complete with polling, but we
2380	 * still need to lock the ring to prevent racing with polled issue
2381	 * that got punted to a workqueue.
2382	 */
2383	mutex_lock(&ctx->uring_lock);
2384	/*
2385	 * Don't enter poll loop if we already have events pending.
2386	 * If we do, we can potentially be spinning for commands that
2387	 * already triggered a CQE (eg in error).
2388	 */
2389	if (test_bit(0, &ctx->check_cq_overflow))
2390		__io_cqring_overflow_flush(ctx, false);
2391	if (io_cqring_events(ctx))
2392		goto out;
2393	do {
2394		/*
2395		 * If a submit got punted to a workqueue, we can have the
2396		 * application entering polling for a command before it gets
2397		 * issued. That app will hold the uring_lock for the duration
2398		 * of the poll right here, so we need to take a breather every
2399		 * now and then to ensure that the issue has a chance to add
2400		 * the poll to the issued list. Otherwise we can spin here
2401		 * forever, while the workqueue is stuck trying to acquire the
2402		 * very same mutex.
2403		 */
2404		if (list_empty(&ctx->iopoll_list)) {
2405			u32 tail = ctx->cached_cq_tail;
2406
2407			mutex_unlock(&ctx->uring_lock);
2408			io_run_task_work();
2409			mutex_lock(&ctx->uring_lock);
2410
2411			/* some requests don't go through iopoll_list */
2412			if (tail != ctx->cached_cq_tail ||
2413			    list_empty(&ctx->iopoll_list))
2414				break;
2415		}
2416		ret = io_do_iopoll(ctx, &nr_events, min, true);
2417	} while (!ret && nr_events < min && !need_resched());
2418out:
2419	mutex_unlock(&ctx->uring_lock);
2420	return ret;
2421}
2422
2423static void kiocb_end_write(struct io_kiocb *req)
2424{
2425	/*
2426	 * Tell lockdep we inherited freeze protection from submission
2427	 * thread.
2428	 */
2429	if (req->flags & REQ_F_ISREG) {
2430		struct super_block *sb = file_inode(req->file)->i_sb;
2431
2432		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
2433		sb_end_write(sb);
2434	}
2435}
2436
2437#ifdef CONFIG_BLOCK
2438static bool io_resubmit_prep(struct io_kiocb *req)
2439{
2440	struct io_async_rw *rw = req->async_data;
2441
2442	if (!rw)
2443		return !io_req_prep_async(req);
2444	/* may have left rw->iter inconsistent on -EIOCBQUEUED */
2445	iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
2446	return true;
2447}
2448
2449static bool io_rw_should_reissue(struct io_kiocb *req)
2450{
2451	umode_t mode = file_inode(req->file)->i_mode;
2452	struct io_ring_ctx *ctx = req->ctx;
2453
2454	if (!S_ISBLK(mode) && !S_ISREG(mode))
2455		return false;
2456	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2457	    !(ctx->flags & IORING_SETUP_IOPOLL)))
2458		return false;
2459	/*
2460	 * If ref is dying, we might be running poll reap from the exit work.
2461	 * Don't attempt to reissue from that path, just let it fail with
2462	 * -EAGAIN.
2463	 */
2464	if (percpu_ref_is_dying(&ctx->refs))
2465		return false;
2466	/*
2467	 * Play it safe and assume not safe to re-import and reissue if we're
2468	 * not in the original thread group (or in task context).
2469	 */
2470	if (!same_thread_group(req->task, current) || !in_task())
2471		return false;
2472	return true;
2473}
2474#else
2475static bool io_resubmit_prep(struct io_kiocb *req)
2476{
2477	return false;
2478}
2479static bool io_rw_should_reissue(struct io_kiocb *req)
2480{
2481	return false;
2482}
2483#endif
2484
2485static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2486			     unsigned int issue_flags)
2487{
2488	int cflags = 0;
2489
2490	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2491		kiocb_end_write(req);
2492	if (res != req->result) {
2493		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2494		    io_rw_should_reissue(req)) {
2495			req->flags |= REQ_F_REISSUE;
2496			return;
2497		}
2498		req_set_fail(req);
2499	}
2500	if (req->flags & REQ_F_BUFFER_SELECTED)
2501		cflags = io_put_rw_kbuf(req);
2502	__io_req_complete(req, issue_flags, res, cflags);
2503}
2504
2505static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2506{
2507	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2508
2509	__io_complete_rw(req, res, res2, 0);
2510}
2511
2512static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2513{
2514	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2515
2516	if (kiocb->ki_flags & IOCB_WRITE)
2517		kiocb_end_write(req);
2518	if (unlikely(res != req->result)) {
2519		if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
2520		    io_resubmit_prep(req))) {
2521			req_set_fail(req);
2522			req->flags |= REQ_F_DONT_REISSUE;
2523		}
2524	}
2525
2526	WRITE_ONCE(req->result, res);
2527	/* order with io_iopoll_complete() checking ->result */
2528	smp_wmb();
2529	WRITE_ONCE(req->iopoll_completed, 1);
2530}
2531
2532/*
2533 * After the iocb has been issued, it's safe to be found on the poll list.
2534 * Adding the kiocb to the list AFTER submission ensures that we don't
2535 * find it from a io_do_iopoll() thread before the issuer is done
2536 * accessing the kiocb cookie.
2537 */
2538static void io_iopoll_req_issued(struct io_kiocb *req)
2539{
2540	struct io_ring_ctx *ctx = req->ctx;
2541	const bool in_async = io_wq_current_is_worker();
2542
2543	/* workqueue context doesn't hold uring_lock, grab it now */
2544	if (unlikely(in_async))
2545		mutex_lock(&ctx->uring_lock);
2546
2547	/*
2548	 * Track whether we have multiple files in our lists. This will impact
2549	 * how we do polling eventually, not spinning if we're on potentially
2550	 * different devices.
2551	 */
2552	if (list_empty(&ctx->iopoll_list)) {
2553		ctx->poll_multi_queue = false;
2554	} else if (!ctx->poll_multi_queue) {
2555		struct io_kiocb *list_req;
2556		unsigned int queue_num0, queue_num1;
2557
2558		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2559						inflight_entry);
2560
2561		if (list_req->file != req->file) {
2562			ctx->poll_multi_queue = true;
2563		} else {
2564			queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
2565			queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
2566			if (queue_num0 != queue_num1)
2567				ctx->poll_multi_queue = true;
2568		}
2569	}
2570
2571	/*
2572	 * For fast devices, IO may have already completed. If it has, add
2573	 * it to the front so we find it first.
2574	 */
2575	if (READ_ONCE(req->iopoll_completed))
2576		list_add(&req->inflight_entry, &ctx->iopoll_list);
2577	else
2578		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2579
2580	if (unlikely(in_async)) {
2581		/*
2582		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
2583		 * in sq thread task context or in io worker task context. If
2584		 * current task context is sq thread, we don't need to check
2585		 * whether should wake up sq thread.
2586		 */
2587		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2588		    wq_has_sleeper(&ctx->sq_data->wait))
2589			wake_up(&ctx->sq_data->wait);
2590
2591		mutex_unlock(&ctx->uring_lock);
2592	}
2593}
2594
2595static bool io_bdev_nowait(struct block_device *bdev)
2596{
2597	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2598}
2599
2600/*
2601 * If we tracked the file through the SCM inflight mechanism, we could support
2602 * any file. For now, just ensure that anything potentially problematic is done
2603 * inline.
2604 */
2605static bool __io_file_supports_nowait(struct file *file, int rw)
2606{
2607	umode_t mode = file_inode(file)->i_mode;
2608
2609	if (S_ISBLK(mode)) {
2610		if (IS_ENABLED(CONFIG_BLOCK) &&
2611		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2612			return true;
2613		return false;
2614	}
2615	if (S_ISSOCK(mode))
2616		return true;
2617	if (S_ISREG(mode)) {
2618		if (IS_ENABLED(CONFIG_BLOCK) &&
2619		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2620		    file->f_op != &io_uring_fops)
2621			return true;
2622		return false;
2623	}
2624
2625	/* any ->read/write should understand O_NONBLOCK */
2626	if (file->f_flags & O_NONBLOCK)
2627		return true;
2628
2629	if (!(file->f_mode & FMODE_NOWAIT))
2630		return false;
2631
2632	if (rw == READ)
2633		return file->f_op->read_iter != NULL;
2634
2635	return file->f_op->write_iter != NULL;
2636}
2637
2638static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
2639{
2640	if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
2641		return true;
2642	else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
2643		return true;
2644
2645	return __io_file_supports_nowait(req->file, rw);
2646}
2647
2648static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2649{
2650	struct io_ring_ctx *ctx = req->ctx;
2651	struct kiocb *kiocb = &req->rw.kiocb;
2652	struct file *file = req->file;
2653	unsigned ioprio;
2654	int ret;
2655
2656	if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
2657		req->flags |= REQ_F_ISREG;
2658
2659	kiocb->ki_pos = READ_ONCE(sqe->off);
2660	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2661		req->flags |= REQ_F_CUR_POS;
2662		kiocb->ki_pos = file->f_pos;
2663	}
2664	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2665	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2666	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2667	if (unlikely(ret))
2668		return ret;
2669
2670	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2671	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2672		req->flags |= REQ_F_NOWAIT;
2673
2674	ioprio = READ_ONCE(sqe->ioprio);
2675	if (ioprio) {
2676		ret = ioprio_check_cap(ioprio);
2677		if (ret)
2678			return ret;
2679
2680		kiocb->ki_ioprio = ioprio;
2681	} else
2682		kiocb->ki_ioprio = get_current_ioprio();
2683
2684	if (ctx->flags & IORING_SETUP_IOPOLL) {
2685		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2686		    !kiocb->ki_filp->f_op->iopoll)
2687			return -EOPNOTSUPP;
2688
2689		kiocb->ki_flags |= IOCB_HIPRI;
2690		kiocb->ki_complete = io_complete_rw_iopoll;
2691		req->iopoll_completed = 0;
2692	} else {
2693		if (kiocb->ki_flags & IOCB_HIPRI)
2694			return -EINVAL;
2695		kiocb->ki_complete = io_complete_rw;
2696	}
2697
2698	if (req->opcode == IORING_OP_READ_FIXED ||
2699	    req->opcode == IORING_OP_WRITE_FIXED) {
2700		req->imu = NULL;
2701		io_req_set_rsrc_node(req);
2702	}
2703
2704	req->rw.addr = READ_ONCE(sqe->addr);
2705	req->rw.len = READ_ONCE(sqe->len);
2706	req->buf_index = READ_ONCE(sqe->buf_index);
2707	return 0;
2708}
2709
2710static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2711{
2712	switch (ret) {
2713	case -EIOCBQUEUED:
2714		break;
2715	case -ERESTARTSYS:
2716	case -ERESTARTNOINTR:
2717	case -ERESTARTNOHAND:
2718	case -ERESTART_RESTARTBLOCK:
2719		/*
2720		 * We can't just restart the syscall, since previously
2721		 * submitted sqes may already be in progress. Just fail this
2722		 * IO with EINTR.
2723		 */
2724		ret = -EINTR;
2725		fallthrough;
2726	default:
2727		kiocb->ki_complete(kiocb, ret, 0);
2728	}
2729}
2730
2731static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2732		       unsigned int issue_flags)
2733{
2734	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2735	struct io_async_rw *io = req->async_data;
2736	bool check_reissue = kiocb->ki_complete == io_complete_rw;
2737
2738	/* add previously done IO, if any */
2739	if (io && io->bytes_done > 0) {
2740		if (ret < 0)
2741			ret = io->bytes_done;
2742		else
2743			ret += io->bytes_done;
2744	}
2745
2746	if (req->flags & REQ_F_CUR_POS)
2747		req->file->f_pos = kiocb->ki_pos;
2748	if (ret >= 0 && check_reissue)
2749		__io_complete_rw(req, ret, 0, issue_flags);
2750	else
2751		io_rw_done(kiocb, ret);
2752
2753	if (check_reissue && (req->flags & REQ_F_REISSUE)) {
2754		req->flags &= ~REQ_F_REISSUE;
2755		if (io_resubmit_prep(req)) {
2756			req_ref_get(req);
2757			io_req_task_queue_reissue(req);
2758		} else {
2759			int cflags = 0;
2760
2761			req_set_fail(req);
2762			if (req->flags & REQ_F_BUFFER_SELECTED)
2763				cflags = io_put_rw_kbuf(req);
2764			__io_req_complete(req, issue_flags, ret, cflags);
2765		}
2766	}
2767}
2768
2769static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
2770			     struct io_mapped_ubuf *imu)
2771{
2772	size_t len = req->rw.len;
2773	u64 buf_end, buf_addr = req->rw.addr;
2774	size_t offset;
2775
2776	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
2777		return -EFAULT;
2778	/* not inside the mapped region */
2779	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
2780		return -EFAULT;
2781
2782	/*
2783	 * May not be a start of buffer, set size appropriately
2784	 * and advance us to the beginning.
2785	 */
2786	offset = buf_addr - imu->ubuf;
2787	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2788
2789	if (offset) {
2790		/*
2791		 * Don't use iov_iter_advance() here, as it's really slow for
2792		 * using the latter parts of a big fixed buffer - it iterates
2793		 * over each segment manually. We can cheat a bit here, because
2794		 * we know that:
2795		 *
2796		 * 1) it's a BVEC iter, we set it up
2797		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2798		 *    first and last bvec
2799		 *
2800		 * So just find our index, and adjust the iterator afterwards.
2801		 * If the offset is within the first bvec (or the whole first
2802		 * bvec, just use iov_iter_advance(). This makes it easier
2803		 * since we can just skip the first segment, which may not
2804		 * be PAGE_SIZE aligned.
2805		 */
2806		const struct bio_vec *bvec = imu->bvec;
2807
2808		if (offset <= bvec->bv_len) {
2809			iov_iter_advance(iter, offset);
2810		} else {
2811			unsigned long seg_skip;
2812
2813			/* skip first vec */
2814			offset -= bvec->bv_len;
2815			seg_skip = 1 + (offset >> PAGE_SHIFT);
2816
2817			iter->bvec = bvec + seg_skip;
2818			iter->nr_segs -= seg_skip;
2819			iter->count -= bvec->bv_len + offset;
2820			iter->iov_offset = offset & ~PAGE_MASK;
2821		}
2822	}
2823
2824	return 0;
2825}
2826
2827static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2828{
2829	struct io_ring_ctx *ctx = req->ctx;
2830	struct io_mapped_ubuf *imu = req->imu;
2831	u16 index, buf_index = req->buf_index;
2832
2833	if (likely(!imu)) {
2834		if (unlikely(buf_index >= ctx->nr_user_bufs))
2835			return -EFAULT;
2836		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2837		imu = READ_ONCE(ctx->user_bufs[index]);
2838		req->imu = imu;
2839	}
2840	return __io_import_fixed(req, rw, iter, imu);
2841}
2842
2843static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2844{
2845	if (needs_lock)
2846		mutex_unlock(&ctx->uring_lock);
2847}
2848
2849static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2850{
2851	/*
2852	 * "Normal" inline submissions always hold the uring_lock, since we
2853	 * grab it from the system call. Same is true for the SQPOLL offload.
2854	 * The only exception is when we've detached the request and issue it
2855	 * from an async worker thread, grab the lock for that case.
2856	 */
2857	if (needs_lock)
2858		mutex_lock(&ctx->uring_lock);
2859}
2860
2861static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2862					  int bgid, struct io_buffer *kbuf,
2863					  bool needs_lock)
2864{
2865	struct io_buffer *head;
2866
2867	if (req->flags & REQ_F_BUFFER_SELECTED)
2868		return kbuf;
2869
2870	io_ring_submit_lock(req->ctx, needs_lock);
2871
2872	lockdep_assert_held(&req->ctx->uring_lock);
2873
2874	head = xa_load(&req->ctx->io_buffers, bgid);
2875	if (head) {
2876		if (!list_empty(&head->list)) {
2877			kbuf = list_last_entry(&head->list, struct io_buffer,
2878							list);
2879			list_del(&kbuf->list);
2880		} else {
2881			kbuf = head;
2882			xa_erase(&req->ctx->io_buffers, bgid);
2883		}
2884		if (*len > kbuf->len)
2885			*len = kbuf->len;
2886	} else {
2887		kbuf = ERR_PTR(-ENOBUFS);
2888	}
2889
2890	io_ring_submit_unlock(req->ctx, needs_lock);
2891
2892	return kbuf;
2893}
2894
2895static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2896					bool needs_lock)
2897{
2898	struct io_buffer *kbuf;
2899	u16 bgid;
2900
2901	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2902	bgid = req->buf_index;
2903	kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2904	if (IS_ERR(kbuf))
2905		return kbuf;
2906	req->rw.addr = (u64) (unsigned long) kbuf;
2907	req->flags |= REQ_F_BUFFER_SELECTED;
2908	return u64_to_user_ptr(kbuf->addr);
2909}
2910
2911#ifdef CONFIG_COMPAT
2912static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2913				bool needs_lock)
2914{
2915	struct compat_iovec __user *uiov;
2916	compat_ssize_t clen;
2917	void __user *buf;
2918	ssize_t len;
2919
2920	uiov = u64_to_user_ptr(req->rw.addr);
2921	if (!access_ok(uiov, sizeof(*uiov)))
2922		return -EFAULT;
2923	if (__get_user(clen, &uiov->iov_len))
2924		return -EFAULT;
2925	if (clen < 0)
2926		return -EINVAL;
2927
2928	len = clen;
2929	buf = io_rw_buffer_select(req, &len, needs_lock);
2930	if (IS_ERR(buf))
2931		return PTR_ERR(buf);
2932	iov[0].iov_base = buf;
2933	iov[0].iov_len = (compat_size_t) len;
2934	return 0;
2935}
2936#endif
2937
2938static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2939				      bool needs_lock)
2940{
2941	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2942	void __user *buf;
2943	ssize_t len;
2944
2945	if (copy_from_user(iov, uiov, sizeof(*uiov)))
2946		return -EFAULT;
2947
2948	len = iov[0].iov_len;
2949	if (len < 0)
2950		return -EINVAL;
2951	buf = io_rw_buffer_select(req, &len, needs_lock);
2952	if (IS_ERR(buf))
2953		return PTR_ERR(buf);
2954	iov[0].iov_base = buf;
2955	iov[0].iov_len = len;
2956	return 0;
2957}
2958
2959static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2960				    bool needs_lock)
2961{
2962	if (req->flags & REQ_F_BUFFER_SELECTED) {
2963		struct io_buffer *kbuf;
2964
2965		kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2966		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2967		iov[0].iov_len = kbuf->len;
2968		return 0;
2969	}
2970	if (req->rw.len != 1)
2971		return -EINVAL;
2972
2973#ifdef CONFIG_COMPAT
2974	if (req->ctx->compat)
2975		return io_compat_import(req, iov, needs_lock);
2976#endif
2977
2978	return __io_iov_buffer_select(req, iov, needs_lock);
2979}
2980
2981static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
2982			   struct iov_iter *iter, bool needs_lock)
2983{
2984	void __user *buf = u64_to_user_ptr(req->rw.addr);
2985	size_t sqe_len = req->rw.len;
2986	u8 opcode = req->opcode;
2987	ssize_t ret;
2988
2989	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2990		*iovec = NULL;
2991		return io_import_fixed(req, rw, iter);
2992	}
2993
2994	/* buffer index only valid with fixed read/write, or buffer select  */
2995	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2996		return -EINVAL;
2997
2998	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2999		if (req->flags & REQ_F_BUFFER_SELECT) {
3000			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3001			if (IS_ERR(buf))
3002				return PTR_ERR(buf);
3003			req->rw.len = sqe_len;
3004		}
3005
3006		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3007		*iovec = NULL;
3008		return ret;
3009	}
3010
3011	if (req->flags & REQ_F_BUFFER_SELECT) {
3012		ret = io_iov_buffer_select(req, *iovec, needs_lock);
3013		if (!ret)
3014			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3015		*iovec = NULL;
3016		return ret;
3017	}
3018
3019	return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3020			      req->ctx->compat);
3021}
3022
3023static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3024{
3025	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3026}
3027
3028/*
3029 * For files that don't have ->read_iter() and ->write_iter(), handle them
3030 * by looping over ->read() or ->write() manually.
3031 */
3032static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3033{
3034	struct kiocb *kiocb = &req->rw.kiocb;
3035	struct file *file = req->file;
3036	ssize_t ret = 0;
3037
3038	/*
3039	 * Don't support polled IO through this interface, and we can't
3040	 * support non-blocking either. For the latter, this just causes
3041	 * the kiocb to be handled from an async context.
3042	 */
3043	if (kiocb->ki_flags & IOCB_HIPRI)
3044		return -EOPNOTSUPP;
3045	if (kiocb->ki_flags & IOCB_NOWAIT)
3046		return -EAGAIN;
3047
3048	while (iov_iter_count(iter)) {
3049		struct iovec iovec;
3050		ssize_t nr;
3051
3052		if (!iov_iter_is_bvec(iter)) {
3053			iovec = iov_iter_iovec(iter);
3054		} else {
3055			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3056			iovec.iov_len = req->rw.len;
3057		}
3058
3059		if (rw == READ) {
3060			nr = file->f_op->read(file, iovec.iov_base,
3061					      iovec.iov_len, io_kiocb_ppos(kiocb));
3062		} else {
3063			nr = file->f_op->write(file, iovec.iov_base,
3064					       iovec.iov_len, io_kiocb_ppos(kiocb));
3065		}
3066
3067		if (nr < 0) {
3068			if (!ret)
3069				ret = nr;
3070			break;
3071		}
3072		ret += nr;
3073		if (nr != iovec.iov_len)
3074			break;
3075		req->rw.len -= nr;
3076		req->rw.addr += nr;
3077		iov_iter_advance(iter, nr);
3078	}
3079
3080	return ret;
3081}
3082
3083static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3084			  const struct iovec *fast_iov, struct iov_iter *iter)
3085{
3086	struct io_async_rw *rw = req->async_data;
3087
3088	memcpy(&rw->iter, iter, sizeof(*iter));
3089	rw->free_iovec = iovec;
3090	rw->bytes_done = 0;
3091	/* can only be fixed buffers, no need to do anything */
3092	if (iov_iter_is_bvec(iter))
3093		return;
3094	if (!iovec) {
3095		unsigned iov_off = 0;
3096
3097		rw->iter.iov = rw->fast_iov;
3098		if (iter->iov != fast_iov) {
3099			iov_off = iter->iov - fast_iov;
3100			rw->iter.iov += iov_off;
3101		}
3102		if (rw->fast_iov != fast_iov)
3103			memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3104			       sizeof(struct iovec) * iter->nr_segs);
3105	} else {
3106		req->flags |= REQ_F_NEED_CLEANUP;
3107	}
3108}
3109
3110static inline int io_alloc_async_data(struct io_kiocb *req)
3111{
3112	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3113	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3114	return req->async_data == NULL;
3115}
3116
3117static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3118			     const struct iovec *fast_iov,
3119			     struct iov_iter *iter, bool force)
3120{
3121	if (!force && !io_op_defs[req->opcode].needs_async_setup)
3122		return 0;
3123	if (!req->async_data) {
3124		if (io_alloc_async_data(req)) {
3125			kfree(iovec);
3126			return -ENOMEM;
3127		}
3128
3129		io_req_map_rw(req, iovec, fast_iov, iter);
3130	}
3131	return 0;
3132}
3133
3134static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3135{
3136	struct io_async_rw *iorw = req->async_data;
3137	struct iovec *iov = iorw->fast_iov;
3138	int ret;
3139
3140	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3141	if (unlikely(ret < 0))
3142		return ret;
3143
3144	iorw->bytes_done = 0;
3145	iorw->free_iovec = iov;
3146	if (iov)
3147		req->flags |= REQ_F_NEED_CLEANUP;
3148	return 0;
3149}
3150
3151static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3152{
3153	if (unlikely(!(req->file->f_mode & FMODE_READ)))
3154		return -EBADF;
3155	return io_prep_rw(req, sqe);
3156}
3157
3158/*
3159 * This is our waitqueue callback handler, registered through lock_page_async()
3160 * when we initially tried to do the IO with the iocb armed our waitqueue.
3161 * This gets called when the page is unlocked, and we generally expect that to
3162 * happen when the page IO is completed and the page is now uptodate. This will
3163 * queue a task_work based retry of the operation, attempting to copy the data
3164 * again. If the latter fails because the page was NOT uptodate, then we will
3165 * do a thread based blocking retry of the operation. That's the unexpected
3166 * slow path.
3167 */
3168static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3169			     int sync, void *arg)
3170{
3171	struct wait_page_queue *wpq;
3172	struct io_kiocb *req = wait->private;
3173	struct wait_page_key *key = arg;
3174
3175	wpq = container_of(wait, struct wait_page_queue, wait);
3176
3177	if (!wake_page_match(wpq, key))
3178		return 0;
3179
3180	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3181	list_del_init(&wait->entry);
3182
3183	/* submit ref gets dropped, acquire a new one */
3184	req_ref_get(req);
3185	io_req_task_queue(req);
3186	return 1;
3187}
3188
3189/*
3190 * This controls whether a given IO request should be armed for async page
3191 * based retry. If we return false here, the request is handed to the async
3192 * worker threads for retry. If we're doing buffered reads on a regular file,
3193 * we prepare a private wait_page_queue entry and retry the operation. This
3194 * will either succeed because the page is now uptodate and unlocked, or it
3195 * will register a callback when the page is unlocked at IO completion. Through
3196 * that callback, io_uring uses task_work to setup a retry of the operation.
3197 * That retry will attempt the buffered read again. The retry will generally
3198 * succeed, or in rare cases where it fails, we then fall back to using the
3199 * async worker threads for a blocking retry.
3200 */
3201static bool io_rw_should_retry(struct io_kiocb *req)
3202{
3203	struct io_async_rw *rw = req->async_data;
3204	struct wait_page_queue *wait = &rw->wpq;
3205	struct kiocb *kiocb = &req->rw.kiocb;
3206
3207	/* never retry for NOWAIT, we just complete with -EAGAIN */
3208	if (req->flags & REQ_F_NOWAIT)
3209		return false;
3210
3211	/* Only for buffered IO */
3212	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3213		return false;
3214
3215	/*
3216	 * just use poll if we can, and don't attempt if the fs doesn't
3217	 * support callback based unlocks
3218	 */
3219	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3220		return false;
3221
3222	wait->wait.func = io_async_buf_func;
3223	wait->wait.private = req;
3224	wait->wait.flags = 0;
3225	INIT_LIST_HEAD(&wait->wait.entry);
3226	kiocb->ki_flags |= IOCB_WAITQ;
3227	kiocb->ki_flags &= ~IOCB_NOWAIT;
3228	kiocb->ki_waitq = wait;
3229	return true;
3230}
3231
3232static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3233{
3234	if (req->file->f_op->read_iter)
3235		return call_read_iter(req->file, &req->rw.kiocb, iter);
3236	else if (req->file->f_op->read)
3237		return loop_rw_iter(READ, req, iter);
3238	else
3239		return -EINVAL;
3240}
3241
3242static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3243{
3244	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3245	struct kiocb *kiocb = &req->rw.kiocb;
3246	struct iov_iter __iter, *iter = &__iter;
3247	struct io_async_rw *rw = req->async_data;
3248	ssize_t io_size, ret, ret2;
3249	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3250
3251	if (rw) {
3252		iter = &rw->iter;
3253		iovec = NULL;
3254	} else {
3255		ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3256		if (ret < 0)
3257			return ret;
3258	}
3259	io_size = iov_iter_count(iter);
3260	req->result = io_size;
3261
3262	/* Ensure we clear previously set non-block flag */
3263	if (!force_nonblock)
3264		kiocb->ki_flags &= ~IOCB_NOWAIT;
3265	else
3266		kiocb->ki_flags |= IOCB_NOWAIT;
3267
3268	/* If the file doesn't support async, just async punt */
3269	if (force_nonblock && !io_file_supports_nowait(req, READ)) {
3270		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3271		return ret ?: -EAGAIN;
3272	}
3273
3274	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
3275	if (unlikely(ret)) {
3276		kfree(iovec);
3277		return ret;
3278	}
3279
3280	ret = io_iter_do_read(req, iter);
3281
3282	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3283		req->flags &= ~REQ_F_REISSUE;
3284		/* IOPOLL retry should happen for io-wq threads */
3285		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3286			goto done;
3287		/* no retry on NONBLOCK nor RWF_NOWAIT */
3288		if (req->flags & REQ_F_NOWAIT)
3289			goto done;
3290		/* some cases will consume bytes even on error returns */
3291		iov_iter_revert(iter, io_size - iov_iter_count(iter));
3292		ret = 0;
3293	} else if (ret == -EIOCBQUEUED) {
3294		goto out_free;
3295	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
3296		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
3297		/* read all, failed, already did sync or don't want to retry */
3298		goto done;
3299	}
3300
3301	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3302	if (ret2)
3303		return ret2;
3304
3305	iovec = NULL;
3306	rw = req->async_data;
3307	/* now use our persistent iterator, if we aren't already */
3308	iter = &rw->iter;
3309
3310	do {
3311		io_size -= ret;
3312		rw->bytes_done += ret;
3313		/* if we can retry, do so with the callbacks armed */
3314		if (!io_rw_should_retry(req)) {
3315			kiocb->ki_flags &= ~IOCB_WAITQ;
3316			return -EAGAIN;
3317		}
3318
3319		/*
3320		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3321		 * we get -EIOCBQUEUED, then we'll get a notification when the
3322		 * desired page gets unlocked. We can also get a partial read
3323		 * here, and if we do, then just retry at the new offset.
3324		 */
3325		ret = io_iter_do_read(req, iter);
3326		if (ret == -EIOCBQUEUED)
3327			return 0;
3328		/* we got some bytes, but not all. retry. */
3329		kiocb->ki_flags &= ~IOCB_WAITQ;
3330	} while (ret > 0 && ret < io_size);
3331done:
3332	kiocb_done(kiocb, ret, issue_flags);
3333out_free:
3334	/* it's faster to check here then delegate to kfree */
3335	if (iovec)
3336		kfree(iovec);
3337	return 0;
3338}
3339
3340static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3341{
3342	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3343		return -EBADF;
3344	return io_prep_rw(req, sqe);
3345}
3346
3347static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3348{
3349	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3350	struct kiocb *kiocb = &req->rw.kiocb;
3351	struct iov_iter __iter, *iter = &__iter;
3352	struct io_async_rw *rw = req->async_data;
3353	ssize_t ret, ret2, io_size;
3354	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3355
3356	if (rw) {
3357		iter = &rw->iter;
3358		iovec = NULL;
3359	} else {
3360		ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3361		if (ret < 0)
3362			return ret;
3363	}
3364	io_size = iov_iter_count(iter);
3365	req->result = io_size;
3366
3367	/* Ensure we clear previously set non-block flag */
3368	if (!force_nonblock)
3369		kiocb->ki_flags &= ~IOCB_NOWAIT;
3370	else
3371		kiocb->ki_flags |= IOCB_NOWAIT;
3372
3373	/* If the file doesn't support async, just async punt */
3374	if (force_nonblock && !io_file_supports_nowait(req, WRITE))
3375		goto copy_iov;
3376
3377	/* file path doesn't support NOWAIT for non-direct_IO */
3378	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3379	    (req->flags & REQ_F_ISREG))
3380		goto copy_iov;
3381
3382	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3383	if (unlikely(ret))
3384		goto out_free;
3385
3386	/*
3387	 * Open-code file_start_write here to grab freeze protection,
3388	 * which will be released by another thread in
3389	 * io_complete_rw().  Fool lockdep by telling it the lock got
3390	 * released so that it doesn't complain about the held lock when
3391	 * we return to userspace.
3392	 */
3393	if (req->flags & REQ_F_ISREG) {
3394		sb_start_write(file_inode(req->file)->i_sb);
3395		__sb_writers_release(file_inode(req->file)->i_sb,
3396					SB_FREEZE_WRITE);
3397	}
3398	kiocb->ki_flags |= IOCB_WRITE;
3399
3400	if (req->file->f_op->write_iter)
3401		ret2 = call_write_iter(req->file, kiocb, iter);
3402	else if (req->file->f_op->write)
3403		ret2 = loop_rw_iter(WRITE, req, iter);
3404	else
3405		ret2 = -EINVAL;
3406
3407	if (req->flags & REQ_F_REISSUE) {
3408		req->flags &= ~REQ_F_REISSUE;
3409		ret2 = -EAGAIN;
3410	}
3411
3412	/*
3413	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3414	 * retry them without IOCB_NOWAIT.
3415	 */
3416	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3417		ret2 = -EAGAIN;
3418	/* no retry on NONBLOCK nor RWF_NOWAIT */
3419	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3420		goto done;
3421	if (!force_nonblock || ret2 != -EAGAIN) {
3422		/* IOPOLL retry should happen for io-wq threads */
3423		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3424			goto copy_iov;
3425done:
3426		kiocb_done(kiocb, ret2, issue_flags);
3427	} else {
3428copy_iov:
3429		/* some cases will consume bytes even on error returns */
3430		iov_iter_revert(iter, io_size - iov_iter_count(iter));
3431		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3432		return ret ?: -EAGAIN;
3433	}
3434out_free:
3435	/* it's reportedly faster than delegating the null check to kfree() */
3436	if (iovec)
3437		kfree(iovec);
3438	return ret;
3439}
3440
3441static int io_renameat_prep(struct io_kiocb *req,
3442			    const struct io_uring_sqe *sqe)
3443{
3444	struct io_rename *ren = &req->rename;
3445	const char __user *oldf, *newf;
3446
3447	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3448		return -EINVAL;
3449	if (sqe->ioprio || sqe->buf_index)
3450		return -EINVAL;
3451	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3452		return -EBADF;
3453
3454	ren->old_dfd = READ_ONCE(sqe->fd);
3455	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3456	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3457	ren->new_dfd = READ_ONCE(sqe->len);
3458	ren->flags = READ_ONCE(sqe->rename_flags);
3459
3460	ren->oldpath = getname(oldf);
3461	if (IS_ERR(ren->oldpath))
3462		return PTR_ERR(ren->oldpath);
3463
3464	ren->newpath = getname(newf);
3465	if (IS_ERR(ren->newpath)) {
3466		putname(ren->oldpath);
3467		return PTR_ERR(ren->newpath);
3468	}
3469
3470	req->flags |= REQ_F_NEED_CLEANUP;
3471	return 0;
3472}
3473
3474static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3475{
3476	struct io_rename *ren = &req->rename;
3477	int ret;
3478
3479	if (issue_flags & IO_URING_F_NONBLOCK)
3480		return -EAGAIN;
3481
3482	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3483				ren->newpath, ren->flags);
3484
3485	req->flags &= ~REQ_F_NEED_CLEANUP;
3486	if (ret < 0)
3487		req_set_fail(req);
3488	io_req_complete(req, ret);
3489	return 0;
3490}
3491
3492static int io_unlinkat_prep(struct io_kiocb *req,
3493			    const struct io_uring_sqe *sqe)
3494{
3495	struct io_unlink *un = &req->unlink;
3496	const char __user *fname;
3497
3498	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3499		return -EINVAL;
3500	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
3501		return -EINVAL;
3502	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3503		return -EBADF;
3504
3505	un->dfd = READ_ONCE(sqe->fd);
3506
3507	un->flags = READ_ONCE(sqe->unlink_flags);
3508	if (un->flags & ~AT_REMOVEDIR)
3509		return -EINVAL;
3510
3511	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3512	un->filename = getname(fname);
3513	if (IS_ERR(un->filename))
3514		return PTR_ERR(un->filename);
3515
3516	req->flags |= REQ_F_NEED_CLEANUP;
3517	return 0;
3518}
3519
3520static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3521{
3522	struct io_unlink *un = &req->unlink;
3523	int ret;
3524
3525	if (issue_flags & IO_URING_F_NONBLOCK)
3526		return -EAGAIN;
3527
3528	if (un->flags & AT_REMOVEDIR)
3529		ret = do_rmdir(un->dfd, un->filename);
3530	else
3531		ret = do_unlinkat(un->dfd, un->filename);
3532
3533	req->flags &= ~REQ_F_NEED_CLEANUP;
3534	if (ret < 0)
3535		req_set_fail(req);
3536	io_req_complete(req, ret);
3537	return 0;
3538}
3539
3540static int io_shutdown_prep(struct io_kiocb *req,
3541			    const struct io_uring_sqe *sqe)
3542{
3543#if defined(CONFIG_NET)
3544	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3545		return -EINVAL;
3546	if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3547	    sqe->buf_index)
3548		return -EINVAL;
3549
3550	req->shutdown.how = READ_ONCE(sqe->len);
3551	return 0;
3552#else
3553	return -EOPNOTSUPP;
3554#endif
3555}
3556
3557static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3558{
3559#if defined(CONFIG_NET)
3560	struct socket *sock;
3561	int ret;
3562
3563	if (issue_flags & IO_URING_F_NONBLOCK)
3564		return -EAGAIN;
3565
3566	sock = sock_from_file(req->file);
3567	if (unlikely(!sock))
3568		return -ENOTSOCK;
3569
3570	ret = __sys_shutdown_sock(sock, req->shutdown.how);
3571	if (ret < 0)
3572		req_set_fail(req);
3573	io_req_complete(req, ret);
3574	return 0;
3575#else
3576	return -EOPNOTSUPP;
3577#endif
3578}
3579
3580static int __io_splice_prep(struct io_kiocb *req,
3581			    const struct io_uring_sqe *sqe)
3582{
3583	struct io_splice *sp = &req->splice;
3584	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3585
3586	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3587		return -EINVAL;
3588
3589	sp->file_in = NULL;
3590	sp->len = READ_ONCE(sqe->len);
3591	sp->flags = READ_ONCE(sqe->splice_flags);
3592
3593	if (unlikely(sp->flags & ~valid_flags))
3594		return -EINVAL;
3595
3596	sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
3597				  (sp->flags & SPLICE_F_FD_IN_FIXED));
3598	if (!sp->file_in)
3599		return -EBADF;
3600	req->flags |= REQ_F_NEED_CLEANUP;
3601	return 0;
3602}
3603
3604static int io_tee_prep(struct io_kiocb *req,
3605		       const struct io_uring_sqe *sqe)
3606{
3607	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3608		return -EINVAL;
3609	return __io_splice_prep(req, sqe);
3610}
3611
3612static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3613{
3614	struct io_splice *sp = &req->splice;
3615	struct file *in = sp->file_in;
3616	struct file *out = sp->file_out;
3617	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3618	long ret = 0;
3619
3620	if (issue_flags & IO_URING_F_NONBLOCK)
3621		return -EAGAIN;
3622	if (sp->len)
3623		ret = do_tee(in, out, sp->len, flags);
3624
3625	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3626		io_put_file(in);
3627	req->flags &= ~REQ_F_NEED_CLEANUP;
3628
3629	if (ret != sp->len)
3630		req_set_fail(req);
3631	io_req_complete(req, ret);
3632	return 0;
3633}
3634
3635static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3636{
3637	struct io_splice *sp = &req->splice;
3638
3639	sp->off_in = READ_ONCE(sqe->splice_off_in);
3640	sp->off_out = READ_ONCE(sqe->off);
3641	return __io_splice_prep(req, sqe);
3642}
3643
3644static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3645{
3646	struct io_splice *sp = &req->splice;
3647	struct file *in = sp->file_in;
3648	struct file *out = sp->file_out;
3649	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3650	loff_t *poff_in, *poff_out;
3651	long ret = 0;
3652
3653	if (issue_flags & IO_URING_F_NONBLOCK)
3654		return -EAGAIN;
3655
3656	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3657	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3658
3659	if (sp->len)
3660		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3661
3662	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3663		io_put_file(in);
3664	req->flags &= ~REQ_F_NEED_CLEANUP;
3665
3666	if (ret != sp->len)
3667		req_set_fail(req);
3668	io_req_complete(req, ret);
3669	return 0;
3670}
3671
3672/*
3673 * IORING_OP_NOP just posts a completion event, nothing else.
3674 */
3675static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3676{
3677	struct io_ring_ctx *ctx = req->ctx;
3678
3679	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3680		return -EINVAL;
3681
3682	__io_req_complete(req, issue_flags, 0, 0);
3683	return 0;
3684}
3685
3686static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3687{
3688	struct io_ring_ctx *ctx = req->ctx;
3689
3690	if (!req->file)
3691		return -EBADF;
3692
3693	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3694		return -EINVAL;
3695	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3696		return -EINVAL;
3697
3698	req->sync.flags = READ_ONCE(sqe->fsync_flags);
3699	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3700		return -EINVAL;
3701
3702	req->sync.off = READ_ONCE(sqe->off);
3703	req->sync.len = READ_ONCE(sqe->len);
3704	return 0;
3705}
3706
3707static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3708{
3709	loff_t end = req->sync.off + req->sync.len;
3710	int ret;
3711
3712	/* fsync always requires a blocking context */
3713	if (issue_flags & IO_URING_F_NONBLOCK)
3714		return -EAGAIN;
3715
3716	ret = vfs_fsync_range(req->file, req->sync.off,
3717				end > 0 ? end : LLONG_MAX,
3718				req->sync.flags & IORING_FSYNC_DATASYNC);
3719	if (ret < 0)
3720		req_set_fail(req);
3721	io_req_complete(req, ret);
3722	return 0;
3723}
3724
3725static int io_fallocate_prep(struct io_kiocb *req,
3726			     const struct io_uring_sqe *sqe)
3727{
3728	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3729		return -EINVAL;
3730	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3731		return -EINVAL;
3732
3733	req->sync.off = READ_ONCE(sqe->off);
3734	req->sync.len = READ_ONCE(sqe->addr);
3735	req->sync.mode = READ_ONCE(sqe->len);
3736	return 0;
3737}
3738
3739static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3740{
3741	int ret;
3742
3743	/* fallocate always requiring blocking context */
3744	if (issue_flags & IO_URING_F_NONBLOCK)
3745		return -EAGAIN;
3746	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3747				req->sync.len);
3748	if (ret < 0)
3749		req_set_fail(req);
3750	io_req_complete(req, ret);
3751	return 0;
3752}
3753
3754static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3755{
3756	const char __user *fname;
3757	int ret;
3758
3759	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3760		return -EINVAL;
3761	if (unlikely(sqe->ioprio || sqe->buf_index))
3762		return -EINVAL;
3763	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3764		return -EBADF;
3765
3766	/* open.how should be already initialised */
3767	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3768		req->open.how.flags |= O_LARGEFILE;
3769
3770	req->open.dfd = READ_ONCE(sqe->fd);
3771	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3772	req->open.filename = getname(fname);
3773	if (IS_ERR(req->open.filename)) {
3774		ret = PTR_ERR(req->open.filename);
3775		req->open.filename = NULL;
3776		return ret;
3777	}
3778	req->open.nofile = rlimit(RLIMIT_NOFILE);
3779	req->flags |= REQ_F_NEED_CLEANUP;
3780	return 0;
3781}
3782
3783static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3784{
3785	u64 mode = READ_ONCE(sqe->len);
3786	u64 flags = READ_ONCE(sqe->open_flags);
3787
3788	req->open.how = build_open_how(flags, mode);
3789	return __io_openat_prep(req, sqe);
3790}
3791
3792static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3793{
3794	struct open_how __user *how;
3795	size_t len;
3796	int ret;
3797
3798	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3799	len = READ_ONCE(sqe->len);
3800	if (len < OPEN_HOW_SIZE_VER0)
3801		return -EINVAL;
3802
3803	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3804					len);
3805	if (ret)
3806		return ret;
3807
3808	return __io_openat_prep(req, sqe);
3809}
3810
3811static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3812{
3813	struct open_flags op;
3814	struct file *file;
3815	bool nonblock_set;
3816	bool resolve_nonblock;
3817	int ret;
3818
3819	ret = build_open_flags(&req->open.how, &op);
3820	if (ret)
3821		goto err;
3822	nonblock_set = op.open_flag & O_NONBLOCK;
3823	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3824	if (issue_flags & IO_URING_F_NONBLOCK) {
3825		/*
3826		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3827		 * it'll always -EAGAIN
3828		 */
3829		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3830			return -EAGAIN;
3831		op.lookup_flags |= LOOKUP_CACHED;
3832		op.open_flag |= O_NONBLOCK;
3833	}
3834
3835	ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3836	if (ret < 0)
3837		goto err;
3838
3839	file = do_filp_open(req->open.dfd, req->open.filename, &op);
3840	if (IS_ERR(file)) {
3841		/*
3842		 * We could hang on to this 'fd' on retrying, but seems like
3843		 * marginal gain for something that is now known to be a slower
3844		 * path. So just put it, and we'll get a new one when we retry.
3845		 */
3846		put_unused_fd(ret);
3847
3848		ret = PTR_ERR(file);
3849		/* only retry if RESOLVE_CACHED wasn't already set by application */
3850		if (ret == -EAGAIN &&
3851		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
3852			return -EAGAIN;
3853		goto err;
3854	}
3855
3856	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3857		file->f_flags &= ~O_NONBLOCK;
3858	fsnotify_open(file);
3859	fd_install(ret, file);
3860err:
3861	putname(req->open.filename);
3862	req->flags &= ~REQ_F_NEED_CLEANUP;
3863	if (ret < 0)
3864		req_set_fail(req);
3865	__io_req_complete(req, issue_flags, ret, 0);
3866	return 0;
3867}
3868
3869static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
3870{
3871	return io_openat2(req, issue_flags);
3872}
3873
3874static int io_remove_buffers_prep(struct io_kiocb *req,
3875				  const struct io_uring_sqe *sqe)
3876{
3877	struct io_provide_buf *p = &req->pbuf;
3878	u64 tmp;
3879
3880	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3881		return -EINVAL;
3882
3883	tmp = READ_ONCE(sqe->fd);
3884	if (!tmp || tmp > USHRT_MAX)
3885		return -EINVAL;
3886
3887	memset(p, 0, sizeof(*p));
3888	p->nbufs = tmp;
3889	p->bgid = READ_ONCE(sqe->buf_group);
3890	return 0;
3891}
3892
3893static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3894			       int bgid, unsigned nbufs)
3895{
3896	unsigned i = 0;
3897
3898	/* shouldn't happen */
3899	if (!nbufs)
3900		return 0;
3901
3902	/* the head kbuf is the list itself */
3903	while (!list_empty(&buf->list)) {
3904		struct io_buffer *nxt;
3905
3906		nxt = list_first_entry(&buf->list, struct io_buffer, list);
3907		list_del(&nxt->list);
3908		kfree(nxt);
3909		if (++i == nbufs)
3910			return i;
3911	}
3912	i++;
3913	kfree(buf);
3914	xa_erase(&ctx->io_buffers, bgid);
3915
3916	return i;
3917}
3918
3919static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3920{
3921	struct io_provide_buf *p = &req->pbuf;
3922	struct io_ring_ctx *ctx = req->ctx;
3923	struct io_buffer *head;
3924	int ret = 0;
3925	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3926
3927	io_ring_submit_lock(ctx, !force_nonblock);
3928
3929	lockdep_assert_held(&ctx->uring_lock);
3930
3931	ret = -ENOENT;
3932	head = xa_load(&ctx->io_buffers, p->bgid);
3933	if (head)
3934		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3935	if (ret < 0)
3936		req_set_fail(req);
3937
3938	/* complete before unlock, IOPOLL may need the lock */
3939	__io_req_complete(req, issue_flags, ret, 0);
3940	io_ring_submit_unlock(ctx, !force_nonblock);
3941	return 0;
3942}
3943
3944static int io_provide_buffers_prep(struct io_kiocb *req,
3945				   const struct io_uring_sqe *sqe)
3946{
3947	unsigned long size, tmp_check;
3948	struct io_provide_buf *p = &req->pbuf;
3949	u64 tmp;
3950
3951	if (sqe->ioprio || sqe->rw_flags)
3952		return -EINVAL;
3953
3954	tmp = READ_ONCE(sqe->fd);
3955	if (!tmp || tmp > USHRT_MAX)
3956		return -E2BIG;
3957	p->nbufs = tmp;
3958	p->addr = READ_ONCE(sqe->addr);
3959	p->len = READ_ONCE(sqe->len);
3960
3961	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
3962				&size))
3963		return -EOVERFLOW;
3964	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
3965		return -EOVERFLOW;
3966
3967	size = (unsigned long)p->len * p->nbufs;
3968	if (!access_ok(u64_to_user_ptr(p->addr), size))
3969		return -EFAULT;
3970
3971	p->bgid = READ_ONCE(sqe->buf_group);
3972	tmp = READ_ONCE(sqe->off);
3973	if (tmp > USHRT_MAX)
3974		return -E2BIG;
3975	p->bid = tmp;
3976	return 0;
3977}
3978
3979static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3980{
3981	struct io_buffer *buf;
3982	u64 addr = pbuf->addr;
3983	int i, bid = pbuf->bid;
3984
3985	for (i = 0; i < pbuf->nbufs; i++) {
3986		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3987		if (!buf)
3988			break;
3989
3990		buf->addr = addr;
3991		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
3992		buf->bid = bid;
3993		addr += pbuf->len;
3994		bid++;
3995		if (!*head) {
3996			INIT_LIST_HEAD(&buf->list);
3997			*head = buf;
3998		} else {
3999			list_add_tail(&buf->list, &(*head)->list);
4000		}
4001	}
4002
4003	return i ? i : -ENOMEM;
4004}
4005
4006static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4007{
4008	struct io_provide_buf *p = &req->pbuf;
4009	struct io_ring_ctx *ctx = req->ctx;
4010	struct io_buffer *head, *list;
4011	int ret = 0;
4012	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4013
4014	io_ring_submit_lock(ctx, !force_nonblock);
4015
4016	lockdep_assert_held(&ctx->uring_lock);
4017
4018	list = head = xa_load(&ctx->io_buffers, p->bgid);
4019
4020	ret = io_add_buffers(p, &head);
4021	if (ret >= 0 && !list) {
4022		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4023		if (ret < 0)
4024			__io_remove_buffers(ctx, head, p->bgid, -1U);
4025	}
4026	if (ret < 0)
4027		req_set_fail(req);
4028	/* complete before unlock, IOPOLL may need the lock */
4029	__io_req_complete(req, issue_flags, ret, 0);
4030	io_ring_submit_unlock(ctx, !force_nonblock);
4031	return 0;
4032}
4033
4034static int io_epoll_ctl_prep(struct io_kiocb *req,
4035			     const struct io_uring_sqe *sqe)
4036{
4037#if defined(CONFIG_EPOLL)
4038	if (sqe->ioprio || sqe->buf_index)
4039		return -EINVAL;
4040	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4041		return -EINVAL;
4042
4043	req->epoll.epfd = READ_ONCE(sqe->fd);
4044	req->epoll.op = READ_ONCE(sqe->len);
4045	req->epoll.fd = READ_ONCE(sqe->off);
4046
4047	if (ep_op_has_event(req->epoll.op)) {
4048		struct epoll_event __user *ev;
4049
4050		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4051		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4052			return -EFAULT;
4053	}
4054
4055	return 0;
4056#else
4057	return -EOPNOTSUPP;
4058#endif
4059}
4060
4061static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4062{
4063#if defined(CONFIG_EPOLL)
4064	struct io_epoll *ie = &req->epoll;
4065	int ret;
4066	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4067
4068	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4069	if (force_nonblock && ret == -EAGAIN)
4070		return -EAGAIN;
4071
4072	if (ret < 0)
4073		req_set_fail(req);
4074	__io_req_complete(req, issue_flags, ret, 0);
4075	return 0;
4076#else
4077	return -EOPNOTSUPP;
4078#endif
4079}
4080
4081static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4082{
4083#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4084	if (sqe->ioprio || sqe->buf_index || sqe->off)
4085		return -EINVAL;
4086	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4087		return -EINVAL;
4088
4089	req->madvise.addr = READ_ONCE(sqe->addr);
4090	req->madvise.len = READ_ONCE(sqe->len);
4091	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4092	return 0;
4093#else
4094	return -EOPNOTSUPP;
4095#endif
4096}
4097
4098static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4099{
4100#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4101	struct io_madvise *ma = &req->madvise;
4102	int ret;
4103
4104	if (issue_flags & IO_URING_F_NONBLOCK)
4105		return -EAGAIN;
4106
4107	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4108	if (ret < 0)
4109		req_set_fail(req);
4110	io_req_complete(req, ret);
4111	return 0;
4112#else
4113	return -EOPNOTSUPP;
4114#endif
4115}
4116
4117static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4118{
4119	if (sqe->ioprio || sqe->buf_index || sqe->addr)
4120		return -EINVAL;
4121	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4122		return -EINVAL;
4123
4124	req->fadvise.offset = READ_ONCE(sqe->off);
4125	req->fadvise.len = READ_ONCE(sqe->len);
4126	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4127	return 0;
4128}
4129
4130static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4131{
4132	struct io_fadvise *fa = &req->fadvise;
4133	int ret;
4134
4135	if (issue_flags & IO_URING_F_NONBLOCK) {
4136		switch (fa->advice) {
4137		case POSIX_FADV_NORMAL:
4138		case POSIX_FADV_RANDOM:
4139		case POSIX_FADV_SEQUENTIAL:
4140			break;
4141		default:
4142			return -EAGAIN;
4143		}
4144	}
4145
4146	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4147	if (ret < 0)
4148		req_set_fail(req);
4149	__io_req_complete(req, issue_flags, ret, 0);
4150	return 0;
4151}
4152
4153static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4154{
4155	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4156		return -EINVAL;
4157	if (sqe->ioprio || sqe->buf_index)
4158		return -EINVAL;
4159	if (req->flags & REQ_F_FIXED_FILE)
4160		return -EBADF;
4161
4162	req->statx.dfd = READ_ONCE(sqe->fd);
4163	req->statx.mask = READ_ONCE(sqe->len);
4164	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4165	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4166	req->statx.flags = READ_ONCE(sqe->statx_flags);
4167
4168	return 0;
4169}
4170
4171static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4172{
4173	struct io_statx *ctx = &req->statx;
4174	int ret;
4175
4176	if (issue_flags & IO_URING_F_NONBLOCK)
4177		return -EAGAIN;
4178
4179	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4180		       ctx->buffer);
4181
4182	if (ret < 0)
4183		req_set_fail(req);
4184	io_req_complete(req, ret);
4185	return 0;
4186}
4187
4188static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4189{
4190	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4191		return -EINVAL;
4192	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4193	    sqe->rw_flags || sqe->buf_index)
4194		return -EINVAL;
4195	if (req->flags & REQ_F_FIXED_FILE)
4196		return -EBADF;
4197
4198	req->close.fd = READ_ONCE(sqe->fd);
4199	return 0;
4200}
4201
4202static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4203{
4204	struct files_struct *files = current->files;
4205	struct io_close *close = &req->close;
4206	struct fdtable *fdt;
4207	struct file *file = NULL;
4208	int ret = -EBADF;
4209
4210	spin_lock(&files->file_lock);
4211	fdt = files_fdtable(files);
4212	if (close->fd >= fdt->max_fds) {
4213		spin_unlock(&files->file_lock);
4214		goto err;
4215	}
4216	file = fdt->fd[close->fd];
4217	if (!file || file->f_op == &io_uring_fops) {
4218		spin_unlock(&files->file_lock);
4219		file = NULL;
4220		goto err;
4221	}
4222
4223	/* if the file has a flush method, be safe and punt to async */
4224	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4225		spin_unlock(&files->file_lock);
4226		return -EAGAIN;
4227	}
4228
4229	ret = __close_fd_get_file(close->fd, &file);
4230	spin_unlock(&files->file_lock);
4231	if (ret < 0) {
4232		if (ret == -ENOENT)
4233			ret = -EBADF;
4234		goto err;
4235	}
4236
4237	/* No ->flush() or already async, safely close from here */
4238	ret = filp_close(file, current->files);
4239err:
4240	if (ret < 0)
4241		req_set_fail(req);
4242	if (file)
4243		fput(file);
4244	__io_req_complete(req, issue_flags, ret, 0);
4245	return 0;
4246}
4247
4248static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4249{
4250	struct io_ring_ctx *ctx = req->ctx;
4251
4252	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4253		return -EINVAL;
4254	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4255		return -EINVAL;
4256
4257	req->sync.off = READ_ONCE(sqe->off);
4258	req->sync.len = READ_ONCE(sqe->len);
4259	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4260	return 0;
4261}
4262
4263static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4264{
4265	int ret;
4266
4267	/* sync_file_range always requires a blocking context */
4268	if (issue_flags & IO_URING_F_NONBLOCK)
4269		return -EAGAIN;
4270
4271	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4272				req->sync.flags);
4273	if (ret < 0)
4274		req_set_fail(req);
4275	io_req_complete(req, ret);
4276	return 0;
4277}
4278
4279#if defined(CONFIG_NET)
4280static int io_setup_async_msg(struct io_kiocb *req,
4281			      struct io_async_msghdr *kmsg)
4282{
4283	struct io_async_msghdr *async_msg = req->async_data;
4284
4285	if (async_msg)
4286		return -EAGAIN;
4287	if (io_alloc_async_data(req)) {
4288		kfree(kmsg->free_iov);
4289		return -ENOMEM;
4290	}
4291	async_msg = req->async_data;
4292	req->flags |= REQ_F_NEED_CLEANUP;
4293	memcpy(async_msg, kmsg, sizeof(*kmsg));
4294	async_msg->msg.msg_name = &async_msg->addr;
4295	/* if were using fast_iov, set it to the new one */
4296	if (!async_msg->free_iov)
4297		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4298
4299	return -EAGAIN;
4300}
4301
4302static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4303			       struct io_async_msghdr *iomsg)
4304{
4305	iomsg->msg.msg_name = &iomsg->addr;
4306	iomsg->free_iov = iomsg->fast_iov;
4307	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4308				   req->sr_msg.msg_flags, &iomsg->free_iov);
4309}
4310
4311static int io_sendmsg_prep_async(struct io_kiocb *req)
4312{
4313	int ret;
4314
4315	ret = io_sendmsg_copy_hdr(req, req->async_data);
4316	if (!ret)
4317		req->flags |= REQ_F_NEED_CLEANUP;
4318	return ret;
4319}
4320
4321static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4322{
4323	struct io_sr_msg *sr = &req->sr_msg;
4324
4325	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4326		return -EINVAL;
4327
4328	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4329	sr->len = READ_ONCE(sqe->len);
4330	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4331	if (sr->msg_flags & MSG_DONTWAIT)
4332		req->flags |= REQ_F_NOWAIT;
4333
4334#ifdef CONFIG_COMPAT
4335	if (req->ctx->compat)
4336		sr->msg_flags |= MSG_CMSG_COMPAT;
4337#endif
4338	return 0;
4339}
4340
4341static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4342{
4343	struct io_async_msghdr iomsg, *kmsg;
4344	struct socket *sock;
4345	unsigned flags;
4346	int min_ret = 0;
4347	int ret;
4348
4349	sock = sock_from_file(req->file);
4350	if (unlikely(!sock))
4351		return -ENOTSOCK;
4352
4353	kmsg = req->async_data;
4354	if (!kmsg) {
4355		ret = io_sendmsg_copy_hdr(req, &iomsg);
4356		if (ret)
4357			return ret;
4358		kmsg = &iomsg;
4359	}
4360
4361	flags = req->sr_msg.msg_flags;
4362	if (issue_flags & IO_URING_F_NONBLOCK)
4363		flags |= MSG_DONTWAIT;
4364	if (flags & MSG_WAITALL)
4365		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4366
4367	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4368	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4369		return io_setup_async_msg(req, kmsg);
4370	if (ret == -ERESTARTSYS)
4371		ret = -EINTR;
4372
4373	/* fast path, check for non-NULL to avoid function call */
4374	if (kmsg->free_iov)
4375		kfree(kmsg->free_iov);
4376	req->flags &= ~REQ_F_NEED_CLEANUP;
4377	if (ret < min_ret)
4378		req_set_fail(req);
4379	__io_req_complete(req, issue_flags, ret, 0);
4380	return 0;
4381}
4382
4383static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4384{
4385	struct io_sr_msg *sr = &req->sr_msg;
4386	struct msghdr msg;
4387	struct iovec iov;
4388	struct socket *sock;
4389	unsigned flags;
4390	int min_ret = 0;
4391	int ret;
4392
4393	sock = sock_from_file(req->file);
4394	if (unlikely(!sock))
4395		return -ENOTSOCK;
4396
4397	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4398	if (unlikely(ret))
4399		return ret;
4400
4401	msg.msg_name = NULL;
4402	msg.msg_control = NULL;
4403	msg.msg_controllen = 0;
4404	msg.msg_namelen = 0;
4405
4406	flags = req->sr_msg.msg_flags;
4407	if (issue_flags & IO_URING_F_NONBLOCK)
4408		flags |= MSG_DONTWAIT;
4409	if (flags & MSG_WAITALL)
4410		min_ret = iov_iter_count(&msg.msg_iter);
4411
4412	msg.msg_flags = flags;
4413	ret = sock_sendmsg(sock, &msg);
4414	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4415		return -EAGAIN;
4416	if (ret == -ERESTARTSYS)
4417		ret = -EINTR;
4418
4419	if (ret < min_ret)
4420		req_set_fail(req);
4421	__io_req_complete(req, issue_flags, ret, 0);
4422	return 0;
4423}
4424
4425static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4426				 struct io_async_msghdr *iomsg)
4427{
4428	struct io_sr_msg *sr = &req->sr_msg;
4429	struct iovec __user *uiov;
4430	size_t iov_len;
4431	int ret;
4432
4433	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4434					&iomsg->uaddr, &uiov, &iov_len);
4435	if (ret)
4436		return ret;
4437
4438	if (req->flags & REQ_F_BUFFER_SELECT) {
4439		if (iov_len > 1)
4440			return -EINVAL;
4441		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4442			return -EFAULT;
4443		sr->len = iomsg->fast_iov[0].iov_len;
4444		iomsg->free_iov = NULL;
4445	} else {
4446		iomsg->free_iov = iomsg->fast_iov;
4447		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4448				     &iomsg->free_iov, &iomsg->msg.msg_iter,
4449				     false);
4450		if (ret > 0)
4451			ret = 0;
4452	}
4453
4454	return ret;
4455}
4456
4457#ifdef CONFIG_COMPAT
4458static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4459					struct io_async_msghdr *iomsg)
4460{
4461	struct io_sr_msg *sr = &req->sr_msg;
4462	struct compat_iovec __user *uiov;
4463	compat_uptr_t ptr;
4464	compat_size_t len;
4465	int ret;
4466
4467	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4468				  &ptr, &len);
4469	if (ret)
4470		return ret;
4471
4472	uiov = compat_ptr(ptr);
4473	if (req->flags & REQ_F_BUFFER_SELECT) {
4474		compat_ssize_t clen;
4475
4476		if (len > 1)
4477			return -EINVAL;
4478		if (!access_ok(uiov, sizeof(*uiov)))
4479			return -EFAULT;
4480		if (__get_user(clen, &uiov->iov_len))
4481			return -EFAULT;
4482		if (clen < 0)
4483			return -EINVAL;
4484		sr->len = clen;
4485		iomsg->free_iov = NULL;
4486	} else {
4487		iomsg->free_iov = iomsg->fast_iov;
4488		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4489				   UIO_FASTIOV, &iomsg->free_iov,
4490				   &iomsg->msg.msg_iter, true);
4491		if (ret < 0)
4492			return ret;
4493	}
4494
4495	return 0;
4496}
4497#endif
4498
4499static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4500			       struct io_async_msghdr *iomsg)
4501{
4502	iomsg->msg.msg_name = &iomsg->addr;
4503
4504#ifdef CONFIG_COMPAT
4505	if (req->ctx->compat)
4506		return __io_compat_recvmsg_copy_hdr(req, iomsg);
4507#endif
4508
4509	return __io_recvmsg_copy_hdr(req, iomsg);
4510}
4511
4512static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4513					       bool needs_lock)
4514{
4515	struct io_sr_msg *sr = &req->sr_msg;
4516	struct io_buffer *kbuf;
4517
4518	kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4519	if (IS_ERR(kbuf))
4520		return kbuf;
4521
4522	sr->kbuf = kbuf;
4523	req->flags |= REQ_F_BUFFER_SELECTED;
4524	return kbuf;
4525}
4526
4527static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4528{
4529	return io_put_kbuf(req, req->sr_msg.kbuf);
4530}
4531
4532static int io_recvmsg_prep_async(struct io_kiocb *req)
4533{
4534	int ret;
4535
4536	ret = io_recvmsg_copy_hdr(req, req->async_data);
4537	if (!ret)
4538		req->flags |= REQ_F_NEED_CLEANUP;
4539	return ret;
4540}
4541
4542static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4543{
4544	struct io_sr_msg *sr = &req->sr_msg;
4545
4546	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4547		return -EINVAL;
4548
4549	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4550	sr->len = READ_ONCE(sqe->len);
4551	sr->bgid = READ_ONCE(sqe->buf_group);
4552	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4553	if (sr->msg_flags & MSG_DONTWAIT)
4554		req->flags |= REQ_F_NOWAIT;
4555
4556#ifdef CONFIG_COMPAT
4557	if (req->ctx->compat)
4558		sr->msg_flags |= MSG_CMSG_COMPAT;
4559#endif
4560	return 0;
4561}
4562
4563static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4564{
4565	struct io_async_msghdr iomsg, *kmsg;
4566	struct socket *sock;
4567	struct io_buffer *kbuf;
4568	unsigned flags;
4569	int min_ret = 0;
4570	int ret, cflags = 0;
4571	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4572
4573	sock = sock_from_file(req->file);
4574	if (unlikely(!sock))
4575		return -ENOTSOCK;
4576
4577	kmsg = req->async_data;
4578	if (!kmsg) {
4579		ret = io_recvmsg_copy_hdr(req, &iomsg);
4580		if (ret)
4581			return ret;
4582		kmsg = &iomsg;
4583	}
4584
4585	if (req->flags & REQ_F_BUFFER_SELECT) {
4586		kbuf = io_recv_buffer_select(req, !force_nonblock);
4587		if (IS_ERR(kbuf))
4588			return PTR_ERR(kbuf);
4589		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4590		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4591		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4592				1, req->sr_msg.len);
4593	}
4594
4595	flags = req->sr_msg.msg_flags;
4596	if (force_nonblock)
4597		flags |= MSG_DONTWAIT;
4598	if (flags & MSG_WAITALL)
4599		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4600
4601	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4602					kmsg->uaddr, flags);
4603	if (force_nonblock && ret == -EAGAIN)
4604		return io_setup_async_msg(req, kmsg);
4605	if (ret == -ERESTARTSYS)
4606		ret = -EINTR;
4607
4608	if (req->flags & REQ_F_BUFFER_SELECTED)
4609		cflags = io_put_recv_kbuf(req);
4610	/* fast path, check for non-NULL to avoid function call */
4611	if (kmsg->free_iov)
4612		kfree(kmsg->free_iov);
4613	req->flags &= ~REQ_F_NEED_CLEANUP;
4614	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4615		req_set_fail(req);
4616	__io_req_complete(req, issue_flags, ret, cflags);
4617	return 0;
4618}
4619
4620static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4621{
4622	struct io_buffer *kbuf;
4623	struct io_sr_msg *sr = &req->sr_msg;
4624	struct msghdr msg;
4625	void __user *buf = sr->buf;
4626	struct socket *sock;
4627	struct iovec iov;
4628	unsigned flags;
4629	int min_ret = 0;
4630	int ret, cflags = 0;
4631	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4632
4633	sock = sock_from_file(req->file);
4634	if (unlikely(!sock))
4635		return -ENOTSOCK;
4636
4637	if (req->flags & REQ_F_BUFFER_SELECT) {
4638		kbuf = io_recv_buffer_select(req, !force_nonblock);
4639		if (IS_ERR(kbuf))
4640			return PTR_ERR(kbuf);
4641		buf = u64_to_user_ptr(kbuf->addr);
4642	}
4643
4644	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4645	if (unlikely(ret))
4646		goto out_free;
4647
4648	msg.msg_name = NULL;
4649	msg.msg_control = NULL;
4650	msg.msg_controllen = 0;
4651	msg.msg_namelen = 0;
4652	msg.msg_iocb = NULL;
4653	msg.msg_flags = 0;
4654
4655	flags = req->sr_msg.msg_flags;
4656	if (force_nonblock)
4657		flags |= MSG_DONTWAIT;
4658	if (flags & MSG_WAITALL)
4659		min_ret = iov_iter_count(&msg.msg_iter);
4660
4661	ret = sock_recvmsg(sock, &msg, flags);
4662	if (force_nonblock && ret == -EAGAIN)
4663		return -EAGAIN;
4664	if (ret == -ERESTARTSYS)
4665		ret = -EINTR;
4666out_free:
4667	if (req->flags & REQ_F_BUFFER_SELECTED)
4668		cflags = io_put_recv_kbuf(req);
4669	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4670		req_set_fail(req);
4671	__io_req_complete(req, issue_flags, ret, cflags);
4672	return 0;
4673}
4674
4675static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4676{
4677	struct io_accept *accept = &req->accept;
4678
4679	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4680		return -EINVAL;
4681	if (sqe->ioprio || sqe->len || sqe->buf_index)
4682		return -EINVAL;
4683
4684	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4685	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4686	accept->flags = READ_ONCE(sqe->accept_flags);
4687	accept->nofile = rlimit(RLIMIT_NOFILE);
4688	return 0;
4689}
4690
4691static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4692{
4693	struct io_accept *accept = &req->accept;
4694	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4695	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4696	int ret;
4697
4698	if (req->file->f_flags & O_NONBLOCK)
4699		req->flags |= REQ_F_NOWAIT;
4700
4701	ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4702					accept->addr_len, accept->flags,
4703					accept->nofile);
4704	if (ret == -EAGAIN && force_nonblock)
4705		return -EAGAIN;
4706	if (ret < 0) {
4707		if (ret == -ERESTARTSYS)
4708			ret = -EINTR;
4709		req_set_fail(req);
4710	}
4711	__io_req_complete(req, issue_flags, ret, 0);
4712	return 0;
4713}
4714
4715static int io_connect_prep_async(struct io_kiocb *req)
4716{
4717	struct io_async_connect *io = req->async_data;
4718	struct io_connect *conn = &req->connect;
4719
4720	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4721}
4722
4723static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4724{
4725	struct io_connect *conn = &req->connect;
4726
4727	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4728		return -EINVAL;
4729	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4730		return -EINVAL;
4731
4732	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4733	conn->addr_len =  READ_ONCE(sqe->addr2);
4734	return 0;
4735}
4736
4737static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4738{
4739	struct io_async_connect __io, *io;
4740	unsigned file_flags;
4741	int ret;
4742	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4743
4744	if (req->async_data) {
4745		io = req->async_data;
4746	} else {
4747		ret = move_addr_to_kernel(req->connect.addr,
4748						req->connect.addr_len,
4749						&__io.address);
4750		if (ret)
4751			goto out;
4752		io = &__io;
4753	}
4754
4755	file_flags = force_nonblock ? O_NONBLOCK : 0;
4756
4757	ret = __sys_connect_file(req->file, &io->address,
4758					req->connect.addr_len, file_flags);
4759	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4760		if (req->async_data)
4761			return -EAGAIN;
4762		if (io_alloc_async_data(req)) {
4763			ret = -ENOMEM;
4764			goto out;
4765		}
4766		memcpy(req->async_data, &__io, sizeof(__io));
4767		return -EAGAIN;
4768	}
4769	if (ret == -ERESTARTSYS)
4770		ret = -EINTR;
4771out:
4772	if (ret < 0)
4773		req_set_fail(req);
4774	__io_req_complete(req, issue_flags, ret, 0);
4775	return 0;
4776}
4777#else /* !CONFIG_NET */
4778#define IO_NETOP_FN(op)							\
4779static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
4780{									\
4781	return -EOPNOTSUPP;						\
4782}
4783
4784#define IO_NETOP_PREP(op)						\
4785IO_NETOP_FN(op)								\
4786static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4787{									\
4788	return -EOPNOTSUPP;						\
4789}									\
4790
4791#define IO_NETOP_PREP_ASYNC(op)						\
4792IO_NETOP_PREP(op)							\
4793static int io_##op##_prep_async(struct io_kiocb *req)			\
4794{									\
4795	return -EOPNOTSUPP;						\
4796}
4797
4798IO_NETOP_PREP_ASYNC(sendmsg);
4799IO_NETOP_PREP_ASYNC(recvmsg);
4800IO_NETOP_PREP_ASYNC(connect);
4801IO_NETOP_PREP(accept);
4802IO_NETOP_FN(send);
4803IO_NETOP_FN(recv);
4804#endif /* CONFIG_NET */
4805
4806struct io_poll_table {
4807	struct poll_table_struct pt;
4808	struct io_kiocb *req;
4809	int nr_entries;
4810	int error;
4811};
4812
4813static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4814			   __poll_t mask, io_req_tw_func_t func)
4815{
4816	/* for instances that support it check for an event match first: */
4817	if (mask && !(mask & poll->events))
4818		return 0;
4819
4820	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4821
4822	list_del_init(&poll->wait.entry);
4823
4824	req->result = mask;
4825	req->io_task_work.func = func;
4826
4827	/*
4828	 * If this fails, then the task is exiting. When a task exits, the
4829	 * work gets canceled, so just cancel this request as well instead
4830	 * of executing it. We can't safely execute it anyway, as we may not
4831	 * have the needed state needed for it anyway.
4832	 */
4833	io_req_task_work_add(req);
4834	return 1;
4835}
4836
4837static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4838	__acquires(&req->ctx->completion_lock)
4839{
4840	struct io_ring_ctx *ctx = req->ctx;
4841
4842	if (unlikely(req->task->flags & PF_EXITING))
4843		WRITE_ONCE(poll->canceled, true);
4844
4845	if (!req->result && !READ_ONCE(poll->canceled)) {
4846		struct poll_table_struct pt = { ._key = poll->events };
4847
4848		req->result = vfs_poll(req->file, &pt) & poll->events;
4849	}
4850
4851	spin_lock_irq(&ctx->completion_lock);
4852	if (!req->result && !READ_ONCE(poll->canceled)) {
4853		add_wait_queue(poll->head, &poll->wait);
4854		return true;
4855	}
4856
4857	return false;
4858}
4859
4860static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4861{
4862	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4863	if (req->opcode == IORING_OP_POLL_ADD)
4864		return req->async_data;
4865	return req->apoll->double_poll;
4866}
4867
4868static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4869{
4870	if (req->opcode == IORING_OP_POLL_ADD)
4871		return &req->poll;
4872	return &req->apoll->poll;
4873}
4874
4875static void io_poll_remove_double(struct io_kiocb *req)
4876	__must_hold(&req->ctx->completion_lock)
4877{
4878	struct io_poll_iocb *poll = io_poll_get_double(req);
4879
4880	lockdep_assert_held(&req->ctx->completion_lock);
4881
4882	if (poll && poll->head) {
4883		struct wait_queue_head *head = poll->head;
4884
4885		spin_lock(&head->lock);
4886		list_del_init(&poll->wait.entry);
4887		if (poll->wait.private)
4888			req_ref_put(req);
4889		poll->head = NULL;
4890		spin_unlock(&head->lock);
4891	}
4892}
4893
4894static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
4895	__must_hold(&req->ctx->completion_lock)
4896{
4897	struct io_ring_ctx *ctx = req->ctx;
4898	unsigned flags = IORING_CQE_F_MORE;
4899	int error;
4900
4901	if (READ_ONCE(req->poll.canceled)) {
4902		error = -ECANCELED;
4903		req->poll.events |= EPOLLONESHOT;
4904	} else {
4905		error = mangle_poll(mask);
4906	}
4907	if (req->poll.events & EPOLLONESHOT)
4908		flags = 0;
4909	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
4910		req->poll.done = true;
4911		flags = 0;
4912	}
4913	if (flags & IORING_CQE_F_MORE)
4914		ctx->cq_extra++;
4915
4916	io_commit_cqring(ctx);
4917	return !(flags & IORING_CQE_F_MORE);
4918}
4919
4920static void io_poll_task_func(struct io_kiocb *req)
4921{
4922	struct io_ring_ctx *ctx = req->ctx;
4923	struct io_kiocb *nxt;
4924
4925	if (io_poll_rewait(req, &req->poll)) {
4926		spin_unlock_irq(&ctx->completion_lock);
4927	} else {
4928		bool done;
4929
4930		done = io_poll_complete(req, req->result);
4931		if (done) {
4932			io_poll_remove_double(req);
4933			hash_del(&req->hash_node);
4934		} else {
4935			req->result = 0;
4936			add_wait_queue(req->poll.head, &req->poll.wait);
4937		}
4938		spin_unlock_irq(&ctx->completion_lock);
4939		io_cqring_ev_posted(ctx);
4940
4941		if (done) {
4942			nxt = io_put_req_find_next(req);
4943			if (nxt)
4944				io_req_task_submit(nxt);
4945		}
4946	}
4947}
4948
4949static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4950			       int sync, void *key)
4951{
4952	struct io_kiocb *req = wait->private;
4953	struct io_poll_iocb *poll = io_poll_get_single(req);
4954	__poll_t mask = key_to_poll(key);
4955
4956	/* for instances that support it check for an event match first: */
4957	if (mask && !(mask & poll->events))
4958		return 0;
4959	if (!(poll->events & EPOLLONESHOT))
4960		return poll->wait.func(&poll->wait, mode, sync, key);
4961
4962	list_del_init(&wait->entry);
4963
4964	if (poll->head) {
4965		bool done;
4966
4967		spin_lock(&poll->head->lock);
4968		done = list_empty(&poll->wait.entry);
4969		if (!done)
4970			list_del_init(&poll->wait.entry);
4971		/* make sure double remove sees this as being gone */
4972		wait->private = NULL;
4973		spin_unlock(&poll->head->lock);
4974		if (!done) {
4975			/* use wait func handler, so it matches the rq type */
4976			poll->wait.func(&poll->wait, mode, sync, key);
4977		}
4978	}
4979	req_ref_put(req);
4980	return 1;
4981}
4982
4983static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4984			      wait_queue_func_t wake_func)
4985{
4986	poll->head = NULL;
4987	poll->done = false;
4988	poll->canceled = false;
4989#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
4990	/* mask in events that we always want/need */
4991	poll->events = events | IO_POLL_UNMASK;
4992	INIT_LIST_HEAD(&poll->wait.entry);
4993	init_waitqueue_func_entry(&poll->wait, wake_func);
4994}
4995
4996static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4997			    struct wait_queue_head *head,
4998			    struct io_poll_iocb **poll_ptr)
4999{
5000	struct io_kiocb *req = pt->req;
5001
5002	/*
5003	 * The file being polled uses multiple waitqueues for poll handling
5004	 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
5005	 * if this happens.
5006	 */
5007	if (unlikely(pt->nr_entries)) {
5008		struct io_poll_iocb *poll_one = poll;
5009
5010		/* already have a 2nd entry, fail a third attempt */
5011		if (*poll_ptr) {
5012			pt->error = -EINVAL;
5013			return;
5014		}
5015		/*
5016		 * Can't handle multishot for double wait for now, turn it
5017		 * into one-shot mode.
5018		 */
5019		if (!(poll_one->events & EPOLLONESHOT))
5020			poll_one->events |= EPOLLONESHOT;
5021		/* double add on the same waitqueue head, ignore */
5022		if (poll_one->head == head)
5023			return;
5024		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5025		if (!poll) {
5026			pt->error = -ENOMEM;
5027			return;
5028		}
5029		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
5030		req_ref_get(req);
5031		poll->wait.private = req;
5032		*poll_ptr = poll;
5033	}
5034
5035	pt->nr_entries++;
5036	poll->head = head;
5037
5038	if (poll->events & EPOLLEXCLUSIVE)
5039		add_wait_queue_exclusive(head, &poll->wait);
5040	else
5041		add_wait_queue(head, &poll->wait);
5042}
5043
5044static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5045			       struct poll_table_struct *p)
5046{
5047	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5048	struct async_poll *apoll = pt->req->apoll;
5049
5050	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5051}
5052
5053static void io_async_task_func(struct io_kiocb *req)
5054{
5055	struct async_poll *apoll = req->apoll;
5056	struct io_ring_ctx *ctx = req->ctx;
5057
5058	trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
5059
5060	if (io_poll_rewait(req, &apoll->poll)) {
5061		spin_unlock_irq(&ctx->completion_lock);
5062		return;
5063	}
5064
5065	hash_del(&req->hash_node);
5066	io_poll_remove_double(req);
5067	spin_unlock_irq(&ctx->completion_lock);
5068
5069	if (!READ_ONCE(apoll->poll.canceled))
5070		io_req_task_submit(req);
5071	else
5072		io_req_complete_failed(req, -ECANCELED);
5073}
5074
5075static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5076			void *key)
5077{
5078	struct io_kiocb *req = wait->private;
5079	struct io_poll_iocb *poll = &req->apoll->poll;
5080
5081	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5082					key_to_poll(key));
5083
5084	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5085}
5086
5087static void io_poll_req_insert(struct io_kiocb *req)
5088{
5089	struct io_ring_ctx *ctx = req->ctx;
5090	struct hlist_head *list;
5091
5092	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5093	hlist_add_head(&req->hash_node, list);
5094}
5095
5096static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5097				      struct io_poll_iocb *poll,
5098				      struct io_poll_table *ipt, __poll_t mask,
5099				      wait_queue_func_t wake_func)
5100	__acquires(&ctx->completion_lock)
5101{
5102	struct io_ring_ctx *ctx = req->ctx;
5103	bool cancel = false;
5104
5105	INIT_HLIST_NODE(&req->hash_node);
5106	io_init_poll_iocb(poll, mask, wake_func);
5107	poll->file = req->file;
5108	poll->wait.private = req;
5109
5110	ipt->pt._key = mask;
5111	ipt->req = req;
5112	ipt->error = 0;
5113	ipt->nr_entries = 0;
5114
5115	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5116	if (unlikely(!ipt->nr_entries) && !ipt->error)
5117		ipt->error = -EINVAL;
5118
5119	spin_lock_irq(&ctx->completion_lock);
5120	if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
5121		io_poll_remove_double(req);
5122	if (likely(poll->head)) {
5123		spin_lock(&poll->head->lock);
5124		if (unlikely(list_empty(&poll->wait.entry))) {
5125			if (ipt->error)
5126				cancel = true;
5127			ipt->error = 0;
5128			mask = 0;
5129		}
5130		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
5131			list_del_init(&poll->wait.entry);
5132		else if (cancel)
5133			WRITE_ONCE(poll->canceled, true);
5134		else if (!poll->done) /* actually waiting for an event */
5135			io_poll_req_insert(req);
5136		spin_unlock(&poll->head->lock);
5137	}
5138
5139	return mask;
5140}
5141
5142enum {
5143	IO_APOLL_OK,
5144	IO_APOLL_ABORTED,
5145	IO_APOLL_READY
5146};
5147
5148static int io_arm_poll_handler(struct io_kiocb *req)
5149{
5150	const struct io_op_def *def = &io_op_defs[req->opcode];
5151	struct io_ring_ctx *ctx = req->ctx;
5152	struct async_poll *apoll;
5153	struct io_poll_table ipt;
5154	__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
5155	int rw;
5156
5157	if (!req->file || !file_can_poll(req->file))
5158		return IO_APOLL_ABORTED;
5159	if (req->flags & REQ_F_POLLED)
5160		return IO_APOLL_ABORTED;
5161	if (!def->pollin && !def->pollout)
5162		return IO_APOLL_ABORTED;
5163
5164	if (def->pollin) {
5165		rw = READ;
5166		mask |= POLLIN | POLLRDNORM;
5167
5168		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5169		if ((req->opcode == IORING_OP_RECVMSG) &&
5170		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5171			mask &= ~POLLIN;
5172	} else {
5173		rw = WRITE;
5174		mask |= POLLOUT | POLLWRNORM;
5175	}
5176
5177	/* if we can't nonblock try, then no point in arming a poll handler */
5178	if (!io_file_supports_nowait(req, rw))
5179		return IO_APOLL_ABORTED;
5180
5181	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5182	if (unlikely(!apoll))
5183		return IO_APOLL_ABORTED;
5184	apoll->double_poll = NULL;
5185	req->apoll = apoll;
5186	req->flags |= REQ_F_POLLED;
5187	ipt.pt._qproc = io_async_queue_proc;
5188
5189	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5190					io_async_wake);
5191	if (ret || ipt.error) {
5192		spin_unlock_irq(&ctx->completion_lock);
5193		if (ret)
5194			return IO_APOLL_READY;
5195		return IO_APOLL_ABORTED;
5196	}
5197	spin_unlock_irq(&ctx->completion_lock);
5198	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5199				mask, apoll->poll.events);
5200	return IO_APOLL_OK;
5201}
5202
5203static bool __io_poll_remove_one(struct io_kiocb *req,
5204				 struct io_poll_iocb *poll, bool do_cancel)
5205	__must_hold(&req->ctx->completion_lock)
5206{
5207	bool do_complete = false;
5208
5209	if (!poll->head)
5210		return false;
5211	spin_lock(&poll->head->lock);
5212	if (do_cancel)
5213		WRITE_ONCE(poll->canceled, true);
5214	if (!list_empty(&poll->wait.entry)) {
5215		list_del_init(&poll->wait.entry);
5216		do_complete = true;
5217	}
5218	spin_unlock(&poll->head->lock);
5219	hash_del(&req->hash_node);
5220	return do_complete;
5221}
5222
5223static bool io_poll_remove_one(struct io_kiocb *req)
5224	__must_hold(&req->ctx->completion_lock)
5225{
5226	int refs;
5227	bool do_complete;
5228
5229	io_poll_remove_double(req);
5230	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
5231
5232	if (do_complete) {
5233		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
5234		io_commit_cqring(req->ctx);
5235		req_set_fail(req);
5236
5237		/* non-poll requests have submit ref still */
5238		refs = 1 + (req->opcode != IORING_OP_POLL_ADD);
5239		io_put_req_deferred(req, refs);
5240	}
5241	return do_complete;
5242}
5243
5244/*
5245 * Returns true if we found and killed one or more poll requests
5246 */
5247static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5248			       bool cancel_all)
5249{
5250	struct hlist_node *tmp;
5251	struct io_kiocb *req;
5252	int posted = 0, i;
5253
5254	spin_lock_irq(&ctx->completion_lock);
5255	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5256		struct hlist_head *list;
5257
5258		list = &ctx->cancel_hash[i];
5259		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5260			if (io_match_task(req, tsk, cancel_all))
5261				posted += io_poll_remove_one(req);
5262		}
5263	}
5264	spin_unlock_irq(&ctx->completion_lock);
5265
5266	if (posted)
5267		io_cqring_ev_posted(ctx);
5268
5269	return posted != 0;
5270}
5271
5272static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5273				     bool poll_only)
5274	__must_hold(&ctx->completion_lock)
5275{
5276	struct hlist_head *list;
5277	struct io_kiocb *req;
5278
5279	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5280	hlist_for_each_entry(req, list, hash_node) {
5281		if (sqe_addr != req->user_data)
5282			continue;
5283		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5284			continue;
5285		return req;
5286	}
5287	return NULL;
5288}
5289
5290static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5291			  bool poll_only)
5292	__must_hold(&ctx->completion_lock)
5293{
5294	struct io_kiocb *req;
5295
5296	req = io_poll_find(ctx, sqe_addr, poll_only);
5297	if (!req)
5298		return -ENOENT;
5299	if (io_poll_remove_one(req))
5300		return 0;
5301
5302	return -EALREADY;
5303}
5304
5305static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5306				     unsigned int flags)
5307{
5308	u32 events;
5309
5310	events = READ_ONCE(sqe->poll32_events);
5311#ifdef __BIG_ENDIAN
5312	events = swahw32(events);
5313#endif
5314	if (!(flags & IORING_POLL_ADD_MULTI))
5315		events |= EPOLLONESHOT;
5316	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5317}
5318
5319static int io_poll_update_prep(struct io_kiocb *req,
5320			       const struct io_uring_sqe *sqe)
5321{
5322	struct io_poll_update *upd = &req->poll_update;
5323	u32 flags;
5324
5325	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5326		return -EINVAL;
5327	if (sqe->ioprio || sqe->buf_index)
5328		return -EINVAL;
5329	flags = READ_ONCE(sqe->len);
5330	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5331		      IORING_POLL_ADD_MULTI))
5332		return -EINVAL;
5333	/* meaningless without update */
5334	if (flags == IORING_POLL_ADD_MULTI)
5335		return -EINVAL;
5336
5337	upd->old_user_data = READ_ONCE(sqe->addr);
5338	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5339	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5340
5341	upd->new_user_data = READ_ONCE(sqe->off);
5342	if (!upd->update_user_data && upd->new_user_data)
5343		return -EINVAL;
5344	if (upd->update_events)
5345		upd->events = io_poll_parse_events(sqe, flags);
5346	else if (sqe->poll32_events)
5347		return -EINVAL;
5348
5349	return 0;
5350}
5351
5352static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5353			void *key)
5354{
5355	struct io_kiocb *req = wait->private;
5356	struct io_poll_iocb *poll = &req->poll;
5357
5358	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5359}
5360
5361static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5362			       struct poll_table_struct *p)
5363{
5364	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5365
5366	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5367}
5368
5369static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5370{
5371	struct io_poll_iocb *poll = &req->poll;
5372	u32 flags;
5373
5374	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5375		return -EINVAL;
5376	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5377		return -EINVAL;
5378	flags = READ_ONCE(sqe->len);
5379	if (flags & ~IORING_POLL_ADD_MULTI)
5380		return -EINVAL;
5381
5382	poll->events = io_poll_parse_events(sqe, flags);
5383	return 0;
5384}
5385
5386static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5387{
5388	struct io_poll_iocb *poll = &req->poll;
5389	struct io_ring_ctx *ctx = req->ctx;
5390	struct io_poll_table ipt;
5391	__poll_t mask;
5392
5393	ipt.pt._qproc = io_poll_queue_proc;
5394
5395	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5396					io_poll_wake);
5397
5398	if (mask) { /* no async, we'd stolen it */
5399		ipt.error = 0;
5400		io_poll_complete(req, mask);
5401	}
5402	spin_unlock_irq(&ctx->completion_lock);
5403
5404	if (mask) {
5405		io_cqring_ev_posted(ctx);
5406		if (poll->events & EPOLLONESHOT)
5407			io_put_req(req);
5408	}
5409	return ipt.error;
5410}
5411
5412static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
5413{
5414	struct io_ring_ctx *ctx = req->ctx;
5415	struct io_kiocb *preq;
5416	bool completing;
5417	int ret;
5418
5419	spin_lock_irq(&ctx->completion_lock);
5420	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
5421	if (!preq) {
5422		ret = -ENOENT;
5423		goto err;
5424	}
5425
5426	if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5427		completing = true;
5428		ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5429		goto err;
5430	}
5431
5432	/*
5433	 * Don't allow racy completion with singleshot, as we cannot safely
5434	 * update those. For multishot, if we're racing with completion, just
5435	 * let completion re-add it.
5436	 */
5437	completing = !__io_poll_remove_one(preq, &preq->poll, false);
5438	if (completing && (preq->poll.events & EPOLLONESHOT)) {
5439		ret = -EALREADY;
5440		goto err;
5441	}
5442	/* we now have a detached poll request. reissue. */
5443	ret = 0;
5444err:
5445	if (ret < 0) {
5446		spin_unlock_irq(&ctx->completion_lock);
5447		req_set_fail(req);
5448		io_req_complete(req, ret);
5449		return 0;
5450	}
5451	/* only mask one event flags, keep behavior flags */
5452	if (req->poll_update.update_events) {
5453		preq->poll.events &= ~0xffff;
5454		preq->poll.events |= req->poll_update.events & 0xffff;
5455		preq->poll.events |= IO_POLL_UNMASK;
5456	}
5457	if (req->poll_update.update_user_data)
5458		preq->user_data = req->poll_update.new_user_data;
5459	spin_unlock_irq(&ctx->completion_lock);
5460
5461	/* complete update request, we're done with it */
5462	io_req_complete(req, ret);
5463
5464	if (!completing) {
5465		ret = io_poll_add(preq, issue_flags);
5466		if (ret < 0) {
5467			req_set_fail(preq);
5468			io_req_complete(preq, ret);
5469		}
5470	}
5471	return 0;
5472}
5473
5474static void io_req_task_timeout(struct io_kiocb *req)
5475{
5476	struct io_ring_ctx *ctx = req->ctx;
5477
5478	spin_lock_irq(&ctx->completion_lock);
5479	io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
5480	io_commit_cqring(ctx);
5481	spin_unlock_irq(&ctx->completion_lock);
5482
5483	io_cqring_ev_posted(ctx);
5484	req_set_fail(req);
5485	io_put_req(req);
5486}
5487
5488static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5489{
5490	struct io_timeout_data *data = container_of(timer,
5491						struct io_timeout_data, timer);
5492	struct io_kiocb *req = data->req;
5493	struct io_ring_ctx *ctx = req->ctx;
5494	unsigned long flags;
5495
5496	spin_lock_irqsave(&ctx->timeout_lock, flags);
5497	list_del_init(&req->timeout.list);
5498	atomic_set(&req->ctx->cq_timeouts,
5499		atomic_read(&req->ctx->cq_timeouts) + 1);
5500	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
5501
5502	req->io_task_work.func = io_req_task_timeout;
5503	io_req_task_work_add(req);
5504	return HRTIMER_NORESTART;
5505}
5506
5507static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5508					   __u64 user_data)
5509	__must_hold(&ctx->timeout_lock)
5510{
5511	struct io_timeout_data *io;
5512	struct io_kiocb *req;
5513	bool found = false;
5514
5515	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5516		found = user_data == req->user_data;
5517		if (found)
5518			break;
5519	}
5520	if (!found)
5521		return ERR_PTR(-ENOENT);
5522
5523	io = req->async_data;
5524	if (hrtimer_try_to_cancel(&io->timer) == -1)
5525		return ERR_PTR(-EALREADY);
5526	list_del_init(&req->timeout.list);
5527	return req;
5528}
5529
5530static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5531	__must_hold(&ctx->timeout_lock)
5532{
5533	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5534
5535	if (IS_ERR(req))
5536		return PTR_ERR(req);
5537
5538	req_set_fail(req);
5539	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
5540	io_put_req_deferred(req, 1);
5541	return 0;
5542}
5543
5544static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5545			     struct timespec64 *ts, enum hrtimer_mode mode)
5546	__must_hold(&ctx->timeout_lock)
5547{
5548	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5549	struct io_timeout_data *data;
5550
5551	if (IS_ERR(req))
5552		return PTR_ERR(req);
5553
5554	req->timeout.off = 0; /* noseq */
5555	data = req->async_data;
5556	list_add_tail(&req->timeout.list, &ctx->timeout_list);
5557	hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5558	data->timer.function = io_timeout_fn;
5559	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5560	return 0;
5561}
5562
5563static int io_timeout_remove_prep(struct io_kiocb *req,
5564				  const struct io_uring_sqe *sqe)
5565{
5566	struct io_timeout_rem *tr = &req->timeout_rem;
5567
5568	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5569		return -EINVAL;
5570	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5571		return -EINVAL;
5572	if (sqe->ioprio || sqe->buf_index || sqe->len)
5573		return -EINVAL;
5574
5575	tr->addr = READ_ONCE(sqe->addr);
5576	tr->flags = READ_ONCE(sqe->timeout_flags);
5577	if (tr->flags & IORING_TIMEOUT_UPDATE) {
5578		if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5579			return -EINVAL;
5580		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5581			return -EFAULT;
5582	} else if (tr->flags) {
5583		/* timeout removal doesn't support flags */
5584		return -EINVAL;
5585	}
5586
5587	return 0;
5588}
5589
5590static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5591{
5592	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5593					    : HRTIMER_MODE_REL;
5594}
5595
5596/*
5597 * Remove or update an existing timeout command
5598 */
5599static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5600{
5601	struct io_timeout_rem *tr = &req->timeout_rem;
5602	struct io_ring_ctx *ctx = req->ctx;
5603	int ret;
5604
5605	spin_lock_irq(&ctx->timeout_lock);
5606	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5607		ret = io_timeout_cancel(ctx, tr->addr);
5608	else
5609		ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5610					io_translate_timeout_mode(tr->flags));
5611	spin_unlock_irq(&ctx->timeout_lock);
5612
5613	spin_lock_irq(&ctx->completion_lock);
5614	io_cqring_fill_event(ctx, req->user_data, ret, 0);
5615	io_commit_cqring(ctx);
5616	spin_unlock_irq(&ctx->completion_lock);
5617	io_cqring_ev_posted(ctx);
5618	if (ret < 0)
5619		req_set_fail(req);
5620	io_put_req(req);
5621	return 0;
5622}
5623
5624static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5625			   bool is_timeout_link)
5626{
5627	struct io_timeout_data *data;
5628	unsigned flags;
5629	u32 off = READ_ONCE(sqe->off);
5630
5631	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5632		return -EINVAL;
5633	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5634		return -EINVAL;
5635	if (off && is_timeout_link)
5636		return -EINVAL;
5637	flags = READ_ONCE(sqe->timeout_flags);
5638	if (flags & ~IORING_TIMEOUT_ABS)
5639		return -EINVAL;
5640
5641	req->timeout.off = off;
5642	if (unlikely(off && !req->ctx->off_timeout_used))
5643		req->ctx->off_timeout_used = true;
5644
5645	if (!req->async_data && io_alloc_async_data(req))
5646		return -ENOMEM;
5647
5648	data = req->async_data;
5649	data->req = req;
5650
5651	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5652		return -EFAULT;
5653
5654	data->mode = io_translate_timeout_mode(flags);
5655	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5656	if (is_timeout_link)
5657		io_req_track_inflight(req);
5658	return 0;
5659}
5660
5661static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
5662{
5663	struct io_ring_ctx *ctx = req->ctx;
5664	struct io_timeout_data *data = req->async_data;
5665	struct list_head *entry;
5666	u32 tail, off = req->timeout.off;
5667
5668	spin_lock_irq(&ctx->timeout_lock);
5669
5670	/*
5671	 * sqe->off holds how many events that need to occur for this
5672	 * timeout event to be satisfied. If it isn't set, then this is
5673	 * a pure timeout request, sequence isn't used.
5674	 */
5675	if (io_is_timeout_noseq(req)) {
5676		entry = ctx->timeout_list.prev;
5677		goto add;
5678	}
5679
5680	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5681	req->timeout.target_seq = tail + off;
5682
5683	/* Update the last seq here in case io_flush_timeouts() hasn't.
5684	 * This is safe because ->completion_lock is held, and submissions
5685	 * and completions are never mixed in the same ->completion_lock section.
5686	 */
5687	ctx->cq_last_tm_flush = tail;
5688
5689	/*
5690	 * Insertion sort, ensuring the first entry in the list is always
5691	 * the one we need first.
5692	 */
5693	list_for_each_prev(entry, &ctx->timeout_list) {
5694		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5695						  timeout.list);
5696
5697		if (io_is_timeout_noseq(nxt))
5698			continue;
5699		/* nxt.seq is behind @tail, otherwise would've been completed */
5700		if (off >= nxt->timeout.target_seq - tail)
5701			break;
5702	}
5703add:
5704	list_add(&req->timeout.list, entry);
5705	data->timer.function = io_timeout_fn;
5706	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5707	spin_unlock_irq(&ctx->timeout_lock);
5708	return 0;
5709}
5710
5711struct io_cancel_data {
5712	struct io_ring_ctx *ctx;
5713	u64 user_data;
5714};
5715
5716static bool io_cancel_cb(struct io_wq_work *work, void *data)
5717{
5718	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5719	struct