io_uring.c revision 14afdd6e
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqe (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * in between.
29 *
30 * Also see the examples in the liburing library:
31 *
32 *	git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
40 * Copyright (c) 2018-2019 Christoph Hellwig
41 */
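/*
 * Illustrative sketch (not part of this file): roughly how the application
 * side could honour the ordering rules above when submitting one SQE and
 * reaping CQEs. All names (sqe, sq_tail, sq_head, sq_mask, sq_entries,
 * sq_array, sqes, cq_head, cq_tail, cq_mask, cqes, handle_cqe) are
 * hypothetical stand-ins for the mmap'ed ring fields, and the GCC __atomic
 * builtins stand in for the smp_load_acquire()/smp_store_release() pairing
 * that liburing provides:
 *
 *	// submit: the SQE and sq_array stores must be visible before the
 *	// new tail is published, hence the release store on the tail
 *	unsigned tail = *sq_tail;
 *	unsigned head = __atomic_load_n(sq_head, __ATOMIC_ACQUIRE);
 *
 *	if (tail - head < sq_entries) {
 *		unsigned idx = tail & sq_mask;
 *
 *		sqes[idx] = *sqe;
 *		sq_array[idx] = idx;
 *		__atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);
 *	}
 *
 *	// reap: acquire-load the CQ tail written by the kernel, consume
 *	// the entries, then publish the new head with a release store
 *	unsigned chead = *cq_head;
 *	unsigned ctail = __atomic_load_n(cq_tail, __ATOMIC_ACQUIRE);
 *
 *	while (chead != ctail)
 *		handle_cqe(&cqes[chead++ & cq_mask]);
 *	__atomic_store_n(cq_head, chead, __ATOMIC_RELEASE);
 */
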
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <net/compat.h>
48#include <linux/refcount.h>
49#include <linux/uio.h>
50#include <linux/bits.h>
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
60#include <linux/blkdev.h>
61#include <linux/bvec.h>
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
65#include <net/scm.h>
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
72#include <linux/highmem.h>
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
75#include <linux/fadvise.h>
76#include <linux/eventpoll.h>
77#include <linux/splice.h>
78#include <linux/task_work.h>
79#include <linux/pagemap.h>
80#include <linux/io_uring.h>
81#include <linux/tracehook.h>
82
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
86#include <uapi/linux/io_uring.h>
87
88#include "internal.h"
89#include "io-wq.h"
90
91#define IORING_MAX_ENTRIES	32768
92#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
94
95/* only define max */
96#define IORING_MAX_FIXED_FILES	(1U << 15)
97#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
98				 IORING_REGISTER_LAST + IORING_OP_LAST)
99
100#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
101#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
102#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
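/*
 * Worked example, assuming 4K pages (PAGE_SHIFT == 12): the shift becomes
 * 12 - 3 = 9, so one tag table holds 1 << 9 = 512 u64 tags, i.e. exactly one
 * page. A resource index i then selects table i >> IO_RSRC_TAG_TABLE_SHIFT
 * and slot i & IO_RSRC_TAG_TABLE_MASK, e.g. i = 1000 maps to table 1, slot 488.
 */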
103
104#define IORING_MAX_REG_BUFFERS	(1U << 14)
105
106#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
107				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
108				IOSQE_BUFFER_SELECT)
109#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
110				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
111
112#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
113
114struct io_uring {
115	u32 head ____cacheline_aligned_in_smp;
116	u32 tail ____cacheline_aligned_in_smp;
117};
118
119/*
120 * This data is shared with the application through the mmap at offsets
121 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
122 *
123 * The offsets to the member fields are published through struct
124 * io_sqring_offsets when calling io_uring_setup.
125 */
126struct io_rings {
127	/*
128	 * Head and tail offsets into the ring; the offsets need to be
129	 * masked to get valid indices.
130	 *
131	 * The kernel controls the head of the sq ring and the tail of the cq
132	 * ring, and the application controls the tail of the sq ring and the
133	 * head of the cq ring.
134	 */
135	struct io_uring		sq, cq;
136	/*
137	 * Bitmasks to apply to head and tail offsets (constant, equals
138	 * ring_entries - 1)
139	 */
140	u32			sq_ring_mask, cq_ring_mask;
141	/* Ring sizes (constant, power of 2) */
142	u32			sq_ring_entries, cq_ring_entries;
143	/*
144	 * Number of invalid entries dropped by the kernel due to
145	 * an invalid index stored in the array
146	 *
147	 * Written by the kernel, shouldn't be modified by the
148	 * application (i.e. get number of "new events" by comparing to
149	 * cached value).
150	 *
151	 * After the application has read a new SQ head value, this
152	 * counter includes all submissions that were dropped while
153	 * reaching the new SQ head (and possibly more).
154	 */
155	u32			sq_dropped;
156	/*
157	 * Runtime SQ flags
158	 *
159	 * Written by the kernel, shouldn't be modified by the
160	 * application.
161	 *
162	 * The application needs a full memory barrier before checking
163	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
164	 */
165	u32			sq_flags;
166	/*
167	 * Runtime CQ flags
168	 *
169	 * Written by the application, shouldn't be modified by the
170	 * kernel.
171	 */
172	u32			cq_flags;
173	/*
174	 * Number of completion events lost because the queue was full;
175	 * this should be avoided by the application by making sure
176	 * there are not more requests pending than there is space in
177	 * the completion queue.
178	 *
179	 * Written by the kernel, shouldn't be modified by the
180	 * application (i.e. get number of "new events" by comparing to
181	 * cached value).
182	 *
183	 * As completion events come in out of order this counter is not
184	 * ordered with any other data.
185	 */
186	u32			cq_overflow;
187	/*
188	 * Ring buffer of completion events.
189	 *
190	 * The kernel writes completion events fresh every time they are
191	 * produced, so the application is allowed to modify pending
192	 * entries.
193	 */
194	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
195};
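/*
 * Illustrative sketch (not part of this file): since the ring sizes are
 * powers of two, head and tail are free-running u32 counters that are only
 * masked on access, so the number of pending entries is a plain unsigned
 * subtraction even across wrap-around. With a local "rings" pointer and
 * locally read head/tail values (hypothetical names):
 *
 *	unsigned pending = tail - head;
 *	struct io_uring_cqe *cqe = &rings->cqes[head & rings->cq_ring_mask];
 */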
196
197enum io_uring_cmd_flags {
198	IO_URING_F_NONBLOCK		= 1,
199	IO_URING_F_COMPLETE_DEFER	= 2,
200};
201
202struct io_mapped_ubuf {
203	u64		ubuf;
204	u64		ubuf_end;
205	unsigned int	nr_bvecs;
206	unsigned long	acct_pages;
207	struct bio_vec	bvec[];
208};
209
210struct io_ring_ctx;
211
212struct io_overflow_cqe {
213	struct io_uring_cqe cqe;
214	struct list_head list;
215};
216
217struct io_fixed_file {
218	/* file * with additional FFS_* flags */
219	unsigned long file_ptr;
220};
221
222struct io_rsrc_put {
223	struct list_head list;
224	u64 tag;
225	union {
226		void *rsrc;
227		struct file *file;
228		struct io_mapped_ubuf *buf;
229	};
230};
231
232struct io_file_table {
233	struct io_fixed_file *files;
234};
235
236struct io_rsrc_node {
237	struct percpu_ref		refs;
238	struct list_head		node;
239	struct list_head		rsrc_list;
240	struct io_rsrc_data		*rsrc_data;
241	struct llist_node		llist;
242	bool				done;
243};
244
245typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
246
247struct io_rsrc_data {
248	struct io_ring_ctx		*ctx;
249
250	u64				**tags;
251	unsigned int			nr;
252	rsrc_put_fn			*do_put;
253	atomic_t			refs;
254	struct completion		done;
255	bool				quiesce;
256};
257
258struct io_buffer {
259	struct list_head list;
260	__u64 addr;
261	__u32 len;
262	__u16 bid;
263};
264
265struct io_restriction {
266	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
267	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
268	u8 sqe_flags_allowed;
269	u8 sqe_flags_required;
270	bool registered;
271};
272
273enum {
274	IO_SQ_THREAD_SHOULD_STOP = 0,
275	IO_SQ_THREAD_SHOULD_PARK,
276};
277
278struct io_sq_data {
279	refcount_t		refs;
280	atomic_t		park_pending;
281	struct mutex		lock;
282
283	/* ctx's that are using this sqd */
284	struct list_head	ctx_list;
285
286	struct task_struct	*thread;
287	struct wait_queue_head	wait;
288
289	unsigned		sq_thread_idle;
290	int			sq_cpu;
291	pid_t			task_pid;
292	pid_t			task_tgid;
293
294	unsigned long		state;
295	struct completion	exited;
296};
297
298#define IO_COMPL_BATCH			32
299#define IO_REQ_CACHE_SIZE		32
300#define IO_REQ_ALLOC_BATCH		8
301
302struct io_submit_link {
303	struct io_kiocb		*head;
304	struct io_kiocb		*last;
305};
306
307struct io_submit_state {
308	struct blk_plug		plug;
309	struct io_submit_link	link;
310
311	/*
312	 * io_kiocb alloc cache
313	 */
314	void			*reqs[IO_REQ_CACHE_SIZE];
315	unsigned int		free_reqs;
316
317	bool			plug_started;
318
319	/*
320	 * Batch completion logic
321	 */
322	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
323	unsigned int		compl_nr;
324	/* inline/task_work completion list, under ->uring_lock */
325	struct list_head	free_list;
326
327	unsigned int		ios_left;
328};
329
330struct io_ring_ctx {
331	/* const or read-mostly hot data */
332	struct {
333		struct percpu_ref	refs;
334
335		struct io_rings		*rings;
336		unsigned int		flags;
337		unsigned int		compat: 1;
338		unsigned int		drain_next: 1;
339		unsigned int		eventfd_async: 1;
340		unsigned int		restricted: 1;
341		unsigned int		off_timeout_used: 1;
342		unsigned int		drain_active: 1;
343	} ____cacheline_aligned_in_smp;
344
345	/* submission data */
346	struct {
347		struct mutex		uring_lock;
348
349		/*
350		 * Ring buffer of indices into array of io_uring_sqe, which is
351		 * mmapped by the application using the IORING_OFF_SQES offset.
352		 *
353		 * This indirection could e.g. be used to assign fixed
354		 * io_uring_sqe entries to operations and only submit them to
355		 * the queue when needed.
356		 *
357		 * The kernel modifies neither the indices array nor the entries
358		 * array.
359		 */
360		u32			*sq_array;
361		struct io_uring_sqe	*sq_sqes;
362		unsigned		cached_sq_head;
363		unsigned		sq_entries;
364		struct list_head	defer_list;
365
366		/*
367		 * Fixed resources fast path, should be accessed only under
368		 * uring_lock, and updated through io_uring_register(2)
369		 */
370		struct io_rsrc_node	*rsrc_node;
371		struct io_file_table	file_table;
372		unsigned		nr_user_files;
373		unsigned		nr_user_bufs;
374		struct io_mapped_ubuf	**user_bufs;
375
376		struct io_submit_state	submit_state;
377		struct list_head	timeout_list;
378		struct list_head	cq_overflow_list;
379		struct xarray		io_buffers;
380		struct xarray		personalities;
381		u32			pers_next;
382		unsigned		sq_thread_idle;
383	} ____cacheline_aligned_in_smp;
384
385	/* IRQ completion list, under ->completion_lock */
386	struct list_head	locked_free_list;
387	unsigned int		locked_free_nr;
388
389	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
390	struct io_sq_data	*sq_data;	/* if using sq thread polling */
391
392	struct wait_queue_head	sqo_sq_wait;
393	struct list_head	sqd_list;
394
395	unsigned long		check_cq_overflow;
396
397	struct {
398		unsigned		cached_cq_tail;
399		unsigned		cq_entries;
400		struct eventfd_ctx	*cq_ev_fd;
401		struct wait_queue_head	poll_wait;
402		struct wait_queue_head	cq_wait;
403		unsigned		cq_extra;
404		atomic_t		cq_timeouts;
405		struct fasync_struct	*cq_fasync;
406		unsigned		cq_last_tm_flush;
407	} ____cacheline_aligned_in_smp;
408
409	struct {
410		spinlock_t		completion_lock;
411
412		spinlock_t		timeout_lock;
413
414		/*
415		 * ->iopoll_list is protected by the ctx->uring_lock for
416		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
417		 * For SQPOLL, only the single threaded io_sq_thread() will
418		 * manipulate the list, hence no extra locking is needed there.
419		 */
420		struct list_head	iopoll_list;
421		struct hlist_head	*cancel_hash;
422		unsigned		cancel_hash_bits;
423		bool			poll_multi_queue;
424	} ____cacheline_aligned_in_smp;
425
426	struct io_restriction		restrictions;
427
428	/* slow path rsrc auxiliary data, used by update/register */
429	struct {
430		struct io_rsrc_node		*rsrc_backup_node;
431		struct io_mapped_ubuf		*dummy_ubuf;
432		struct io_rsrc_data		*file_data;
433		struct io_rsrc_data		*buf_data;
434
435		struct delayed_work		rsrc_put_work;
436		struct llist_head		rsrc_put_llist;
437		struct list_head		rsrc_ref_list;
438		spinlock_t			rsrc_ref_lock;
439	};
440
441	/* Keep this last, we don't need it for the fast path */
442	struct {
443		#if defined(CONFIG_UNIX)
444			struct socket		*ring_sock;
445		#endif
446		/* hashed buffered write serialization */
447		struct io_wq_hash		*hash_map;
448
449		/* Only used for accounting purposes */
450		struct user_struct		*user;
451		struct mm_struct		*mm_account;
452
453		/* ctx exit and cancelation */
454		struct llist_head		fallback_llist;
455		struct delayed_work		fallback_work;
456		struct work_struct		exit_work;
457		struct list_head		tctx_list;
458		struct completion		ref_comp;
459	};
460};
461
462struct io_uring_task {
463	/* submission side */
464	int			cached_refs;
465	struct xarray		xa;
466	struct wait_queue_head	wait;
467	const struct io_ring_ctx *last;
468	struct io_wq		*io_wq;
469	struct percpu_counter	inflight;
470	atomic_t		inflight_tracked;
471	atomic_t		in_idle;
472
473	spinlock_t		task_lock;
474	struct io_wq_work_list	task_list;
475	struct callback_head	task_work;
476	bool			task_running;
477};
478
479/*
480 * First field must be the file pointer in all the
481 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
482 */
483struct io_poll_iocb {
484	struct file			*file;
485	struct wait_queue_head		*head;
486	__poll_t			events;
487	bool				done;
488	bool				canceled;
489	struct wait_queue_entry		wait;
490};
491
492struct io_poll_update {
493	struct file			*file;
494	u64				old_user_data;
495	u64				new_user_data;
496	__poll_t			events;
497	bool				update_events;
498	bool				update_user_data;
499};
500
501struct io_close {
502	struct file			*file;
503	int				fd;
504};
505
506struct io_timeout_data {
507	struct io_kiocb			*req;
508	struct hrtimer			timer;
509	struct timespec64		ts;
510	enum hrtimer_mode		mode;
511};
512
513struct io_accept {
514	struct file			*file;
515	struct sockaddr __user		*addr;
516	int __user			*addr_len;
517	int				flags;
518	u32				file_slot;
519	unsigned long			nofile;
520};
521
522struct io_sync {
523	struct file			*file;
524	loff_t				len;
525	loff_t				off;
526	int				flags;
527	int				mode;
528};
529
530struct io_cancel {
531	struct file			*file;
532	u64				addr;
533};
534
535struct io_timeout {
536	struct file			*file;
537	u32				off;
538	u32				target_seq;
539	struct list_head		list;
540	/* head of the link, used by linked timeouts only */
541	struct io_kiocb			*head;
542	/* for linked completions */
543	struct io_kiocb			*prev;
544};
545
546struct io_timeout_rem {
547	struct file			*file;
548	u64				addr;
549
550	/* timeout update */
551	struct timespec64		ts;
552	u32				flags;
553};
554
555struct io_rw {
556	/* NOTE: kiocb has the file as the first member, so don't do it here */
557	struct kiocb			kiocb;
558	u64				addr;
559	u64				len;
560};
561
562struct io_connect {
563	struct file			*file;
564	struct sockaddr __user		*addr;
565	int				addr_len;
566};
567
568struct io_sr_msg {
569	struct file			*file;
570	union {
571		struct compat_msghdr __user	*umsg_compat;
572		struct user_msghdr __user	*umsg;
573		void __user			*buf;
574	};
575	int				msg_flags;
576	int				bgid;
577	size_t				len;
578	struct io_buffer		*kbuf;
579};
580
581struct io_open {
582	struct file			*file;
583	int				dfd;
584	u32				file_slot;
585	struct filename			*filename;
586	struct open_how			how;
587	unsigned long			nofile;
588};
589
590struct io_rsrc_update {
591	struct file			*file;
592	u64				arg;
593	u32				nr_args;
594	u32				offset;
595};
596
597struct io_fadvise {
598	struct file			*file;
599	u64				offset;
600	u32				len;
601	u32				advice;
602};
603
604struct io_madvise {
605	struct file			*file;
606	u64				addr;
607	u32				len;
608	u32				advice;
609};
610
611struct io_epoll {
612	struct file			*file;
613	int				epfd;
614	int				op;
615	int				fd;
616	struct epoll_event		event;
617};
618
619struct io_splice {
620	struct file			*file_out;
621	struct file			*file_in;
622	loff_t				off_out;
623	loff_t				off_in;
624	u64				len;
625	unsigned int			flags;
626};
627
628struct io_provide_buf {
629	struct file			*file;
630	__u64				addr;
631	__u32				len;
632	__u32				bgid;
633	__u16				nbufs;
634	__u16				bid;
635};
636
637struct io_statx {
638	struct file			*file;
639	int				dfd;
640	unsigned int			mask;
641	unsigned int			flags;
642	const char __user		*filename;
643	struct statx __user		*buffer;
644};
645
646struct io_shutdown {
647	struct file			*file;
648	int				how;
649};
650
651struct io_rename {
652	struct file			*file;
653	int				old_dfd;
654	int				new_dfd;
655	struct filename			*oldpath;
656	struct filename			*newpath;
657	int				flags;
658};
659
660struct io_unlink {
661	struct file			*file;
662	int				dfd;
663	int				flags;
664	struct filename			*filename;
665};
666
667struct io_completion {
668	struct file			*file;
669	u32				cflags;
670};
671
672struct io_async_connect {
673	struct sockaddr_storage		address;
674};
675
676struct io_async_msghdr {
677	struct iovec			fast_iov[UIO_FASTIOV];
678	/* points to an allocated iov, if NULL we use fast_iov instead */
679	struct iovec			*free_iov;
680	struct sockaddr __user		*uaddr;
681	struct msghdr			msg;
682	struct sockaddr_storage		addr;
683};
684
685struct io_async_rw {
686	struct iovec			fast_iov[UIO_FASTIOV];
687	const struct iovec		*free_iovec;
688	struct iov_iter			iter;
689	size_t				bytes_done;
690	struct wait_page_queue		wpq;
691};
692
693enum {
694	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
695	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
696	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
697	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
698	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
699	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
700
701	/* first byte is taken by user flags, shift it to not overlap */
702	REQ_F_FAIL_BIT		= 8,
703	REQ_F_INFLIGHT_BIT,
704	REQ_F_CUR_POS_BIT,
705	REQ_F_NOWAIT_BIT,
706	REQ_F_LINK_TIMEOUT_BIT,
707	REQ_F_NEED_CLEANUP_BIT,
708	REQ_F_POLLED_BIT,
709	REQ_F_BUFFER_SELECTED_BIT,
710	REQ_F_COMPLETE_INLINE_BIT,
711	REQ_F_REISSUE_BIT,
712	REQ_F_DONT_REISSUE_BIT,
713	REQ_F_CREDS_BIT,
714	REQ_F_REFCOUNT_BIT,
715	REQ_F_ARM_LTIMEOUT_BIT,
716	/* keep async read/write and isreg together and in order */
717	REQ_F_NOWAIT_READ_BIT,
718	REQ_F_NOWAIT_WRITE_BIT,
719	REQ_F_ISREG_BIT,
720
721	/* not a real bit, just to check we're not overflowing the space */
722	__REQ_F_LAST_BIT,
723};
724
725enum {
726	/* ctx owns file */
727	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
728	/* drain existing IO first */
729	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
730	/* linked sqes */
731	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
732	/* doesn't sever on completion < 0 */
733	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
734	/* IOSQE_ASYNC */
735	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
736	/* IOSQE_BUFFER_SELECT */
737	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
738
739	/* fail rest of links */
740	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
741	/* on inflight list, should be cancelled and waited on exit reliably */
742	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
743	/* read/write uses file position */
744	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
745	/* must not punt to workers */
746	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
747	/* has or had linked timeout */
748	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
749	/* needs cleanup */
750	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
751	/* already went through poll handler */
752	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
753	/* buffer already selected */
754	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
755	/* completion is deferred through io_comp_state */
756	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
757	/* caller should reissue async */
758	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
759	/* don't attempt request reissue, see io_rw_reissue() */
760	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
761	/* supports async reads */
762	REQ_F_NOWAIT_READ	= BIT(REQ_F_NOWAIT_READ_BIT),
763	/* supports async writes */
764	REQ_F_NOWAIT_WRITE	= BIT(REQ_F_NOWAIT_WRITE_BIT),
765	/* regular file */
766	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
767	/* has creds assigned */
768	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
769	/* skip refcounting if not set */
770	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
771	/* there is a linked timeout that has to be armed */
772	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
773};
774
775struct async_poll {
776	struct io_poll_iocb	poll;
777	struct io_poll_iocb	*double_poll;
778};
779
780typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
781
782struct io_task_work {
783	union {
784		struct io_wq_work_node	node;
785		struct llist_node	fallback_node;
786	};
787	io_req_tw_func_t		func;
788};
789
790enum {
791	IORING_RSRC_FILE		= 0,
792	IORING_RSRC_BUFFER		= 1,
793};
794
795/*
796 * NOTE! Each of the iocb union members has the file pointer
797 * as the first entry in their struct definition. So you can
798 * access the file pointer through any of the sub-structs,
799 * or directly as just 'ki_filp' in this struct.
800 */
801struct io_kiocb {
802	union {
803		struct file		*file;
804		struct io_rw		rw;
805		struct io_poll_iocb	poll;
806		struct io_poll_update	poll_update;
807		struct io_accept	accept;
808		struct io_sync		sync;
809		struct io_cancel	cancel;
810		struct io_timeout	timeout;
811		struct io_timeout_rem	timeout_rem;
812		struct io_connect	connect;
813		struct io_sr_msg	sr_msg;
814		struct io_open		open;
815		struct io_close		close;
816		struct io_rsrc_update	rsrc_update;
817		struct io_fadvise	fadvise;
818		struct io_madvise	madvise;
819		struct io_epoll		epoll;
820		struct io_splice	splice;
821		struct io_provide_buf	pbuf;
822		struct io_statx		statx;
823		struct io_shutdown	shutdown;
824		struct io_rename	rename;
825		struct io_unlink	unlink;
826		/* use only after cleaning per-op data, see io_clean_op() */
827		struct io_completion	compl;
828	};
829
830	/* opcode allocated if it needs to store data for async defer */
831	void				*async_data;
832	u8				opcode;
833	/* polled IO has completed */
834	u8				iopoll_completed;
835
836	u16				buf_index;
837	u32				result;
838
839	struct io_ring_ctx		*ctx;
840	unsigned int			flags;
841	atomic_t			refs;
842	struct task_struct		*task;
843	u64				user_data;
844
845	struct io_kiocb			*link;
846	struct percpu_ref		*fixed_rsrc_refs;
847
848	/* used with ctx->iopoll_list with reads/writes */
849	struct list_head		inflight_entry;
850	struct io_task_work		io_task_work;
851	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
852	struct hlist_node		hash_node;
853	struct async_poll		*apoll;
854	struct io_wq_work		work;
855	const struct cred		*creds;
856
857	/* store used ubuf, so we can prevent reloading */
858	struct io_mapped_ubuf		*imu;
859};
860
861struct io_tctx_node {
862	struct list_head	ctx_node;
863	struct task_struct	*task;
864	struct io_ring_ctx	*ctx;
865};
866
867struct io_defer_entry {
868	struct list_head	list;
869	struct io_kiocb		*req;
870	u32			seq;
871};
872
873struct io_op_def {
874	/* needs req->file assigned */
875	unsigned		needs_file : 1;
876	/* hash wq insertion if file is a regular file */
877	unsigned		hash_reg_file : 1;
878	/* unbound wq insertion if file is a non-regular file */
879	unsigned		unbound_nonreg_file : 1;
880	/* opcode is not supported by this kernel */
881	unsigned		not_supported : 1;
882	/* set if opcode supports polled "wait" */
883	unsigned		pollin : 1;
884	unsigned		pollout : 1;
885	/* op supports buffer selection */
886	unsigned		buffer_select : 1;
887	/* do prep async if is going to be punted */
888	unsigned		needs_async_setup : 1;
889	/* should block plug */
890	unsigned		plug : 1;
891	/* size of async data needed, if any */
892	unsigned short		async_size;
893};
894
895static const struct io_op_def io_op_defs[] = {
896	[IORING_OP_NOP] = {},
897	[IORING_OP_READV] = {
898		.needs_file		= 1,
899		.unbound_nonreg_file	= 1,
900		.pollin			= 1,
901		.buffer_select		= 1,
902		.needs_async_setup	= 1,
903		.plug			= 1,
904		.async_size		= sizeof(struct io_async_rw),
905	},
906	[IORING_OP_WRITEV] = {
907		.needs_file		= 1,
908		.hash_reg_file		= 1,
909		.unbound_nonreg_file	= 1,
910		.pollout		= 1,
911		.needs_async_setup	= 1,
912		.plug			= 1,
913		.async_size		= sizeof(struct io_async_rw),
914	},
915	[IORING_OP_FSYNC] = {
916		.needs_file		= 1,
917	},
918	[IORING_OP_READ_FIXED] = {
919		.needs_file		= 1,
920		.unbound_nonreg_file	= 1,
921		.pollin			= 1,
922		.plug			= 1,
923		.async_size		= sizeof(struct io_async_rw),
924	},
925	[IORING_OP_WRITE_FIXED] = {
926		.needs_file		= 1,
927		.hash_reg_file		= 1,
928		.unbound_nonreg_file	= 1,
929		.pollout		= 1,
930		.plug			= 1,
931		.async_size		= sizeof(struct io_async_rw),
932	},
933	[IORING_OP_POLL_ADD] = {
934		.needs_file		= 1,
935		.unbound_nonreg_file	= 1,
936	},
937	[IORING_OP_POLL_REMOVE] = {},
938	[IORING_OP_SYNC_FILE_RANGE] = {
939		.needs_file		= 1,
940	},
941	[IORING_OP_SENDMSG] = {
942		.needs_file		= 1,
943		.unbound_nonreg_file	= 1,
944		.pollout		= 1,
945		.needs_async_setup	= 1,
946		.async_size		= sizeof(struct io_async_msghdr),
947	},
948	[IORING_OP_RECVMSG] = {
949		.needs_file		= 1,
950		.unbound_nonreg_file	= 1,
951		.pollin			= 1,
952		.buffer_select		= 1,
953		.needs_async_setup	= 1,
954		.async_size		= sizeof(struct io_async_msghdr),
955	},
956	[IORING_OP_TIMEOUT] = {
957		.async_size		= sizeof(struct io_timeout_data),
958	},
959	[IORING_OP_TIMEOUT_REMOVE] = {
960		/* used by timeout updates' prep() */
961	},
962	[IORING_OP_ACCEPT] = {
963		.needs_file		= 1,
964		.unbound_nonreg_file	= 1,
965		.pollin			= 1,
966	},
967	[IORING_OP_ASYNC_CANCEL] = {},
968	[IORING_OP_LINK_TIMEOUT] = {
969		.async_size		= sizeof(struct io_timeout_data),
970	},
971	[IORING_OP_CONNECT] = {
972		.needs_file		= 1,
973		.unbound_nonreg_file	= 1,
974		.pollout		= 1,
975		.needs_async_setup	= 1,
976		.async_size		= sizeof(struct io_async_connect),
977	},
978	[IORING_OP_FALLOCATE] = {
979		.needs_file		= 1,
980	},
981	[IORING_OP_OPENAT] = {},
982	[IORING_OP_CLOSE] = {},
983	[IORING_OP_FILES_UPDATE] = {},
984	[IORING_OP_STATX] = {},
985	[IORING_OP_READ] = {
986		.needs_file		= 1,
987		.unbound_nonreg_file	= 1,
988		.pollin			= 1,
989		.buffer_select		= 1,
990		.plug			= 1,
991		.async_size		= sizeof(struct io_async_rw),
992	},
993	[IORING_OP_WRITE] = {
994		.needs_file		= 1,
995		.unbound_nonreg_file	= 1,
996		.pollout		= 1,
997		.plug			= 1,
998		.async_size		= sizeof(struct io_async_rw),
999	},
1000	[IORING_OP_FADVISE] = {
1001		.needs_file		= 1,
1002	},
1003	[IORING_OP_MADVISE] = {},
1004	[IORING_OP_SEND] = {
1005		.needs_file		= 1,
1006		.unbound_nonreg_file	= 1,
1007		.pollout		= 1,
1008	},
1009	[IORING_OP_RECV] = {
1010		.needs_file		= 1,
1011		.unbound_nonreg_file	= 1,
1012		.pollin			= 1,
1013		.buffer_select		= 1,
1014	},
1015	[IORING_OP_OPENAT2] = {
1016	},
1017	[IORING_OP_EPOLL_CTL] = {
1018		.unbound_nonreg_file	= 1,
1019	},
1020	[IORING_OP_SPLICE] = {
1021		.needs_file		= 1,
1022		.hash_reg_file		= 1,
1023		.unbound_nonreg_file	= 1,
1024	},
1025	[IORING_OP_PROVIDE_BUFFERS] = {},
1026	[IORING_OP_REMOVE_BUFFERS] = {},
1027	[IORING_OP_TEE] = {
1028		.needs_file		= 1,
1029		.hash_reg_file		= 1,
1030		.unbound_nonreg_file	= 1,
1031	},
1032	[IORING_OP_SHUTDOWN] = {
1033		.needs_file		= 1,
1034	},
1035	[IORING_OP_RENAMEAT] = {},
1036	[IORING_OP_UNLINKAT] = {},
1037};
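/*
 * Illustrative sketch (not part of this file): the table above is indexed
 * directly by opcode to drive per-op decisions at prep/issue time. The
 * helper below is a hypothetical example of such a lookup, not a function
 * used elsewhere in this file:
 *
 *	static bool example_op_has_async_data(u8 opcode)
 *	{
 *		return io_op_defs[opcode].async_size != 0;
 *	}
 */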
1038
1039/* requests with any of those set should undergo io_disarm_next() */
1040#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
1041
1042static bool io_disarm_next(struct io_kiocb *req);
1043static void io_uring_del_tctx_node(unsigned long index);
1044static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1045					 struct task_struct *task,
1046					 bool cancel_all);
1047static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1048
1049static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1050				 long res, unsigned int cflags);
1051static void io_put_req(struct io_kiocb *req);
1052static void io_put_req_deferred(struct io_kiocb *req);
1053static void io_dismantle_req(struct io_kiocb *req);
1054static void io_queue_linked_timeout(struct io_kiocb *req);
1055static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
1056				     struct io_uring_rsrc_update2 *up,
1057				     unsigned nr_args);
1058static void io_clean_op(struct io_kiocb *req);
1059static struct file *io_file_get(struct io_ring_ctx *ctx,
1060				struct io_kiocb *req, int fd, bool fixed);
1061static void __io_queue_sqe(struct io_kiocb *req);
1062static void io_rsrc_put_work(struct work_struct *work);
1063
1064static void io_req_task_queue(struct io_kiocb *req);
1065static void io_submit_flush_completions(struct io_ring_ctx *ctx);
1066static int io_req_prep_async(struct io_kiocb *req);
1067
1068static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1069				 unsigned int issue_flags, u32 slot_index);
1070
1071static struct kmem_cache *req_cachep;
1072
1073static const struct file_operations io_uring_fops;
1074
1075struct sock *io_uring_get_socket(struct file *file)
1076{
1077#if defined(CONFIG_UNIX)
1078	if (file->f_op == &io_uring_fops) {
1079		struct io_ring_ctx *ctx = file->private_data;
1080
1081		return ctx->ring_sock->sk;
1082	}
1083#endif
1084	return NULL;
1085}
1086EXPORT_SYMBOL(io_uring_get_socket);
1087
1088static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1089{
1090	if (!*locked) {
1091		mutex_lock(&ctx->uring_lock);
1092		*locked = true;
1093	}
1094}
1095
1096#define io_for_each_link(pos, head) \
1097	for (pos = (head); pos; pos = pos->link)
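/*
 * Illustrative usage (not part of this file): walking a link chain from its
 * head request, as the callers below (e.g. io_match_task()) do; "head" is a
 * hypothetical struct io_kiocb pointer here:
 *
 *	struct io_kiocb *pos;
 *
 *	io_for_each_link(pos, head)
 *		pr_debug("link chain opcode %u\n", pos->opcode);
 */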
1098
1099/*
1100 * Shamelessly stolen from the mm implementation of page reference checking,
1101 * see commit f958d7b528b1 for details.
1102 */
1103#define req_ref_zero_or_close_to_overflow(req)	\
1104	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
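/*
 * Worked example of the check above, with refs read as a 32-bit unsigned
 * value: refs + 127 <= 127 holds only for refs == 0 (0 + 127 == 127) or for
 * counts within 127 of wrapping, e.g. 0xffffff90 + 127 wraps to 0x0f. An
 * underflowed count of -1 reads as 0xffffffff and wraps to 126, so it is
 * caught as well, while a normal count such as 2 gives 129 and is not
 * flagged.
 */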
1105
1106static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1107{
1108	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1109	return atomic_inc_not_zero(&req->refs);
1110}
1111
1112static inline bool req_ref_put_and_test(struct io_kiocb *req)
1113{
1114	if (likely(!(req->flags & REQ_F_REFCOUNT)))
1115		return true;
1116
1117	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1118	return atomic_dec_and_test(&req->refs);
1119}
1120
1121static inline void req_ref_put(struct io_kiocb *req)
1122{
1123	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1124	WARN_ON_ONCE(req_ref_put_and_test(req));
1125}
1126
1127static inline void req_ref_get(struct io_kiocb *req)
1128{
1129	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1130	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1131	atomic_inc(&req->refs);
1132}
1133
1134static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1135{
1136	if (!(req->flags & REQ_F_REFCOUNT)) {
1137		req->flags |= REQ_F_REFCOUNT;
1138		atomic_set(&req->refs, nr);
1139	}
1140}
1141
1142static inline void io_req_set_refcount(struct io_kiocb *req)
1143{
1144	__io_req_set_refcount(req, 1);
1145}
1146
1147static inline void io_req_set_rsrc_node(struct io_kiocb *req)
1148{
1149	struct io_ring_ctx *ctx = req->ctx;
1150
1151	if (!req->fixed_rsrc_refs) {
1152		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1153		percpu_ref_get(req->fixed_rsrc_refs);
1154	}
1155}
1156
1157static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1158{
1159	bool got = percpu_ref_tryget(ref);
1160
1161	/* already at zero, wait for ->release() */
1162	if (!got)
1163		wait_for_completion(compl);
1164	percpu_ref_resurrect(ref);
1165	if (got)
1166		percpu_ref_put(ref);
1167}
1168
1169static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1170			  bool cancel_all)
1171{
1172	struct io_kiocb *req;
1173
1174	if (task && head->task != task)
1175		return false;
1176	if (cancel_all)
1177		return true;
1178
1179	io_for_each_link(req, head) {
1180		if (req->flags & REQ_F_INFLIGHT)
1181			return true;
1182	}
1183	return false;
1184}
1185
1186static inline void req_set_fail(struct io_kiocb *req)
1187{
1188	req->flags |= REQ_F_FAIL;
1189}
1190
1191static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1192{
1193	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1194
1195	complete(&ctx->ref_comp);
1196}
1197
1198static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1199{
1200	return !req->timeout.off;
1201}
1202
1203static void io_fallback_req_func(struct work_struct *work)
1204{
1205	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1206						fallback_work.work);
1207	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1208	struct io_kiocb *req, *tmp;
1209	bool locked = false;
1210
1211	percpu_ref_get(&ctx->refs);
1212	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1213		req->io_task_work.func(req, &locked);
1214
1215	if (locked) {
1216		if (ctx->submit_state.compl_nr)
1217			io_submit_flush_completions(ctx);
1218		mutex_unlock(&ctx->uring_lock);
1219	}
1220	percpu_ref_put(&ctx->refs);
1221
1222}
1223
1224static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1225{
1226	struct io_ring_ctx *ctx;
1227	int hash_bits;
1228
1229	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1230	if (!ctx)
1231		return NULL;
1232
1233	/*
1234	 * Use 5 bits less than the max cq entries; that should give us around
1235	 * 32 entries per hash list if totally full and uniformly spread.
1236	 */
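	/*
	 * Worked example (values invented): for p->cq_entries == 4096,
	 * ilog2(4096) == 12, so hash_bits becomes 7 and the table gets
	 * 1 << 7 == 128 buckets, i.e. 4096 / 128 == 32 entries per list
	 * when completely full and uniformly spread.
	 */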
1237	hash_bits = ilog2(p->cq_entries);
1238	hash_bits -= 5;
1239	if (hash_bits <= 0)
1240		hash_bits = 1;
1241	ctx->cancel_hash_bits = hash_bits;
1242	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1243					GFP_KERNEL);
1244	if (!ctx->cancel_hash)
1245		goto err;
1246	__hash_init(ctx->cancel_hash, 1U << hash_bits);
1247
1248	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1249	if (!ctx->dummy_ubuf)
1250		goto err;
1251	/* set an invalid range so that io_import_fixed() fails on it */
1252	ctx->dummy_ubuf->ubuf = -1UL;
1253
1254	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1255			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1256		goto err;
1257
1258	ctx->flags = p->flags;
1259	init_waitqueue_head(&ctx->sqo_sq_wait);
1260	INIT_LIST_HEAD(&ctx->sqd_list);
1261	init_waitqueue_head(&ctx->poll_wait);
1262	INIT_LIST_HEAD(&ctx->cq_overflow_list);
1263	init_completion(&ctx->ref_comp);
1264	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1265	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1266	mutex_init(&ctx->uring_lock);
1267	init_waitqueue_head(&ctx->cq_wait);
1268	spin_lock_init(&ctx->completion_lock);
1269	spin_lock_init(&ctx->timeout_lock);
1270	INIT_LIST_HEAD(&ctx->iopoll_list);
1271	INIT_LIST_HEAD(&ctx->defer_list);
1272	INIT_LIST_HEAD(&ctx->timeout_list);
1273	spin_lock_init(&ctx->rsrc_ref_lock);
1274	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1275	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1276	init_llist_head(&ctx->rsrc_put_llist);
1277	INIT_LIST_HEAD(&ctx->tctx_list);
1278	INIT_LIST_HEAD(&ctx->submit_state.free_list);
1279	INIT_LIST_HEAD(&ctx->locked_free_list);
1280	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1281	return ctx;
1282err:
1283	kfree(ctx->dummy_ubuf);
1284	kfree(ctx->cancel_hash);
1285	kfree(ctx);
1286	return NULL;
1287}
1288
1289static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1290{
1291	struct io_rings *r = ctx->rings;
1292
1293	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1294	ctx->cq_extra--;
1295}
1296
1297static bool req_need_defer(struct io_kiocb *req, u32 seq)
1298{
1299	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1300		struct io_ring_ctx *ctx = req->ctx;
1301
1302		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1303	}
1304
1305	return false;
1306}
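/*
 * Worked example for the check above (values invented): a drained request
 * that recorded seq == 10 at submission stays deferred while
 * seq + cq_extra != cached_cq_tail. With cq_extra == 2 it becomes runnable
 * once cached_cq_tail reaches 12, since 10 + 2 == 12; at any smaller tail
 * value req_need_defer() keeps returning true.
 */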
1307
1308#define FFS_ASYNC_READ		0x1UL
1309#define FFS_ASYNC_WRITE		0x2UL
1310#ifdef CONFIG_64BIT
1311#define FFS_ISREG		0x4UL
1312#else
1313#define FFS_ISREG		0x0UL
1314#endif
1315#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
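/*
 * Illustrative sketch (not part of this file): the FFS_* bits sit in the
 * low, always-zero bits of a struct file pointer, so a fixed-file slot can
 * carry both the pointer and its capability flags in one unsigned long.
 * The variables below are hypothetical; the real packing and unpacking is
 * done by the fixed-file helpers later in this file:
 *
 *	unsigned long file_ptr = (unsigned long)file | FFS_ASYNC_READ;
 *	struct file *f = (struct file *)(file_ptr & FFS_MASK);
 *	bool async_read = file_ptr & FFS_ASYNC_READ;
 */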
1316
1317static inline bool io_req_ffs_set(struct io_kiocb *req)
1318{
1319	return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1320}
1321
1322static void io_req_track_inflight(struct io_kiocb *req)
1323{
1324	if (!(req->flags & REQ_F_INFLIGHT)) {
1325		req->flags |= REQ_F_INFLIGHT;
1326		atomic_inc(&current->io_uring->inflight_tracked);
1327	}
1328}
1329
1330static inline void io_unprep_linked_timeout(struct io_kiocb *req)
1331{
1332	req->flags &= ~REQ_F_LINK_TIMEOUT;
1333}
1334
1335static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1336{
1337	if (WARN_ON_ONCE(!req->link))
1338		return NULL;
1339
1340	req->flags &= ~REQ_F_ARM_LTIMEOUT;
1341	req->flags |= REQ_F_LINK_TIMEOUT;
1342
1343	/* linked timeouts should have two refs once prep'ed */
1344	io_req_set_refcount(req);
1345	__io_req_set_refcount(req->link, 2);
1346	return req->link;
1347}
1348
1349static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1350{
1351	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1352		return NULL;
1353	return __io_prep_linked_timeout(req);
1354}
1355
1356static void io_prep_async_work(struct io_kiocb *req)
1357{
1358	const struct io_op_def *def = &io_op_defs[req->opcode];
1359	struct io_ring_ctx *ctx = req->ctx;
1360
1361	if (!(req->flags & REQ_F_CREDS)) {
1362		req->flags |= REQ_F_CREDS;
1363		req->creds = get_current_cred();
1364	}
1365
1366	req->work.list.next = NULL;
1367	req->work.flags = 0;
1368	if (req->flags & REQ_F_FORCE_ASYNC)
1369		req->work.flags |= IO_WQ_WORK_CONCURRENT;
1370
1371	if (req->flags & REQ_F_ISREG) {
1372		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1373			io_wq_hash_work(&req->work, file_inode(req->file));
1374	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1375		if (def->unbound_nonreg_file)
1376			req->work.flags |= IO_WQ_WORK_UNBOUND;
1377	}
1378
1379	switch (req->opcode) {
1380	case IORING_OP_SPLICE:
1381	case IORING_OP_TEE:
1382		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1383			req->work.flags |= IO_WQ_WORK_UNBOUND;
1384		break;
1385	}
1386}
1387
1388static void io_prep_async_link(struct io_kiocb *req)
1389{
1390	struct io_kiocb *cur;
1391
1392	if (req->flags & REQ_F_LINK_TIMEOUT) {
1393		struct io_ring_ctx *ctx = req->ctx;
1394
1395		spin_lock(&ctx->completion_lock);
1396		io_for_each_link(cur, req)
1397			io_prep_async_work(cur);
1398		spin_unlock(&ctx->completion_lock);
1399	} else {
1400		io_for_each_link(cur, req)
1401			io_prep_async_work(cur);
1402	}
1403}
1404
1405static void io_queue_async_work(struct io_kiocb *req, bool *locked)
1406{
1407	struct io_ring_ctx *ctx = req->ctx;
1408	struct io_kiocb *link = io_prep_linked_timeout(req);
1409	struct io_uring_task *tctx = req->task->io_uring;
1410
1411	/* must not take the lock, NULL it as a precaution */
1412	locked = NULL;
1413
1414	BUG_ON(!tctx);
1415	BUG_ON(!tctx->io_wq);
1416
1417	/* init ->work of the whole link before punting */
1418	io_prep_async_link(req);
1419
1420	/*
1421	 * Not expected to happen, but if we do have a bug where this _can_
1422	 * happen, catch it here and ensure the request is marked as
1423	 * canceled. That will make io-wq go through the usual work cancel
1424	 * procedure rather than attempt to run this request (or create a new
1425	 * worker for it).
1426	 */
1427	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1428		req->work.flags |= IO_WQ_WORK_CANCEL;
1429
1430	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1431					&req->work, req->flags);
1432	io_wq_enqueue(tctx->io_wq, &req->work);
1433	if (link)
1434		io_queue_linked_timeout(link);
1435}
1436
1437static void io_kill_timeout(struct io_kiocb *req, int status)
1438	__must_hold(&req->ctx->completion_lock)
1439	__must_hold(&req->ctx->timeout_lock)
1440{
1441	struct io_timeout_data *io = req->async_data;
1442
1443	if (hrtimer_try_to_cancel(&io->timer) != -1) {
1444		atomic_set(&req->ctx->cq_timeouts,
1445			atomic_read(&req->ctx->cq_timeouts) + 1);
1446		list_del_init(&req->timeout.list);
1447		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
1448		io_put_req_deferred(req);
1449	}
1450}
1451
1452static void io_queue_deferred(struct io_ring_ctx *ctx)
1453{
1454	while (!list_empty(&ctx->defer_list)) {
1455		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1456						struct io_defer_entry, list);
1457
1458		if (req_need_defer(de->req, de->seq))
1459			break;
1460		list_del_init(&de->list);
1461		io_req_task_queue(de->req);
1462		kfree(de);
1463	}
1464}
1465
1466static void io_flush_timeouts(struct io_ring_ctx *ctx)
1467	__must_hold(&ctx->completion_lock)
1468{
1469	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1470
1471	spin_lock_irq(&ctx->timeout_lock);
1472	while (!list_empty(&ctx->timeout_list)) {
1473		u32 events_needed, events_got;
1474		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1475						struct io_kiocb, timeout.list);
1476
1477		if (io_is_timeout_noseq(req))
1478			break;
1479
1480		/*
1481		 * Since seq can easily wrap around over time, subtract
1482		 * the last seq at which timeouts were flushed before comparing.
1483		 * Assuming not more than 2^31-1 events have happened since,
1484		 * these subtractions won't have wrapped, so we can check if
1485		 * target is in [last_seq, current_seq] by comparing the two.
1486		 */
1487		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1488		events_got = seq - ctx->cq_last_tm_flush;
1489		if (events_got < events_needed)
1490			break;
1491
1492		list_del_init(&req->timeout.list);
1493		io_kill_timeout(req, 0);
1494	}
1495	ctx->cq_last_tm_flush = seq;
1496	spin_unlock_irq(&ctx->timeout_lock);
1497}
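/*
 * Worked example for the wrap-around comparison above (values invented):
 * with cq_last_tm_flush == 0xfffffff0, a timeout armed for
 * target_seq == 0x00000010 yields events_needed == 0x20, and once seq has
 * advanced to 0x00000005 we get events_got == 0x15. Both subtractions wrap
 * the same way, so 0x15 < 0x20 correctly reports the timeout as not yet
 * due, even though both values are numerically far below cq_last_tm_flush.
 */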
1498
1499static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1500{
1501	if (ctx->off_timeout_used)
1502		io_flush_timeouts(ctx);
1503	if (ctx->drain_active)
1504		io_queue_deferred(ctx);
1505}
1506
1507static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1508{
1509	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1510		__io_commit_cqring_flush(ctx);
1511	/* order cqe stores with ring update */
1512	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1513}
1514
1515static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1516{
1517	struct io_rings *r = ctx->rings;
1518
1519	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1520}
1521
1522static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1523{
1524	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1525}
1526
1527static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1528{
1529	struct io_rings *rings = ctx->rings;
1530	unsigned tail, mask = ctx->cq_entries - 1;
1531
1532	/*
1533	 * writes to the cq entry need to come after reading head; the
1534	 * control dependency is enough as we're using WRITE_ONCE to
1535	 * fill the cq entry
1536	 */
1537	if (__io_cqring_events(ctx) == ctx->cq_entries)
1538		return NULL;
1539
1540	tail = ctx->cached_cq_tail++;
1541	return &rings->cqes[tail & mask];
1542}
1543
1544static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1545{
1546	if (likely(!ctx->cq_ev_fd))
1547		return false;
1548	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1549		return false;
1550	return !ctx->eventfd_async || io_wq_current_is_worker();
1551}
1552
1553/*
1554 * This should only get called when at least one event has been posted.
1555 * Some applications rely on the eventfd notification count only changing
1556 * IFF a new CQE has been added to the CQ ring. There's no dependency on
1557 * a 1:1 relationship between how many times this function is called (and
1558 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
1559 */
1560static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1561{
1562	/*
1563	 * wake_up_all() may seem excessive, but io_wake_function() and
1564	 * io_should_wake() handle the termination of the loop and only
1565	 * wake as many waiters as we need to.
1566	 */
1567	if (wq_has_sleeper(&ctx->cq_wait))
1568		wake_up_all(&ctx->cq_wait);
1569	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1570		wake_up(&ctx->sq_data->wait);
1571	if (io_should_trigger_evfd(ctx))
1572		eventfd_signal(ctx->cq_ev_fd, 1);
1573	if (waitqueue_active(&ctx->poll_wait)) {
1574		wake_up_interruptible(&ctx->poll_wait);
1575		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1576	}
1577}
1578
1579static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1580{
1581	if (ctx->flags & IORING_SETUP_SQPOLL) {
1582		if (wq_has_sleeper(&ctx->cq_wait))
1583			wake_up_all(&ctx->cq_wait);
1584	}
1585	if (io_should_trigger_evfd(ctx))
1586		eventfd_signal(ctx->cq_ev_fd, 1);
1587	if (waitqueue_active(&ctx->poll_wait)) {
1588		wake_up_interruptible(&ctx->poll_wait);
1589		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1590	}
1591}
1592
1593/* Returns true if there are no backlogged entries after the flush */
1594static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1595{
1596	bool all_flushed, posted;
1597
1598	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1599		return false;
1600
1601	posted = false;
1602	spin_lock(&ctx->completion_lock);
1603	while (!list_empty(&ctx->cq_overflow_list)) {
1604		struct io_uring_cqe *cqe = io_get_cqe(ctx);
1605		struct io_overflow_cqe *ocqe;
1606
1607		if (!cqe && !force)
1608			break;
1609		ocqe = list_first_entry(&ctx->cq_overflow_list,
1610					struct io_overflow_cqe, list);
1611		if (cqe)
1612			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1613		else
1614			io_account_cq_overflow(ctx);
1615
1616		posted = true;
1617		list_del(&ocqe->list);
1618		kfree(ocqe);
1619	}
1620
1621	all_flushed = list_empty(&ctx->cq_overflow_list);
1622	if (all_flushed) {
1623		clear_bit(0, &ctx->check_cq_overflow);
1624		WRITE_ONCE(ctx->rings->sq_flags,
1625			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1626	}
1627
1628	if (posted)
1629		io_commit_cqring(ctx);
1630	spin_unlock(&ctx->completion_lock);
1631	if (posted)
1632		io_cqring_ev_posted(ctx);
1633	return all_flushed;
1634}
1635
1636static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1637{
1638	bool ret = true;
1639
1640	if (test_bit(0, &ctx->check_cq_overflow)) {
1641		/* iopoll syncs against uring_lock, not completion_lock */
1642		if (ctx->flags & IORING_SETUP_IOPOLL)
1643			mutex_lock(&ctx->uring_lock);
1644		ret = __io_cqring_overflow_flush(ctx, false);
1645		if (ctx->flags & IORING_SETUP_IOPOLL)
1646			mutex_unlock(&ctx->uring_lock);
1647	}
1648
1649	return ret;
1650}
1651
1652/* must be called shortly after putting a request */
1653static inline void io_put_task(struct task_struct *task, int nr)
1654{
1655	struct io_uring_task *tctx = task->io_uring;
1656
1657	if (likely(task == current)) {
1658		tctx->cached_refs += nr;
1659	} else {
1660		percpu_counter_sub(&tctx->inflight, nr);
1661		if (unlikely(atomic_read(&tctx->in_idle)))
1662			wake_up(&tctx->wait);
1663		put_task_struct_many(task, nr);
1664	}
1665}
1666
1667static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1668				     long res, unsigned int cflags)
1669{
1670	struct io_overflow_cqe *ocqe;
1671
1672	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1673	if (!ocqe) {
1674		/*
1675		 * If we're in ring overflow flush mode, or in task cancel mode,
1676		 * or cannot allocate an overflow entry, then we need to drop it
1677		 * on the floor.
1678		 */
1679		io_account_cq_overflow(ctx);
1680		return false;
1681	}
1682	if (list_empty(&ctx->cq_overflow_list)) {
1683		set_bit(0, &ctx->check_cq_overflow);
1684		WRITE_ONCE(ctx->rings->sq_flags,
1685			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1686
1687	}
1688	ocqe->cqe.user_data = user_data;
1689	ocqe->cqe.res = res;
1690	ocqe->cqe.flags = cflags;
1691	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1692	return true;
1693}
1694
1695static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1696					  long res, unsigned int cflags)
1697{
1698	struct io_uring_cqe *cqe;
1699
1700	trace_io_uring_complete(ctx, user_data, res, cflags);
1701
1702	/*
1703	 * If we can't get a cq entry, userspace overflowed the
1704	 * submission (by quite a lot). Increment the overflow count in
1705	 * the ring.
1706	 */
1707	cqe = io_get_cqe(ctx);
1708	if (likely(cqe)) {
1709		WRITE_ONCE(cqe->user_data, user_data);
1710		WRITE_ONCE(cqe->res, res);
1711		WRITE_ONCE(cqe->flags, cflags);
1712		return true;
1713	}
1714	return io_cqring_event_overflow(ctx, user_data, res, cflags);
1715}
1716
1717/* not as hot to bloat with inlining */
1718static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1719					  long res, unsigned int cflags)
1720{
1721	return __io_cqring_fill_event(ctx, user_data, res, cflags);
1722}
1723
1724static void io_req_complete_post(struct io_kiocb *req, long res,
1725				 unsigned int cflags)
1726{
1727	struct io_ring_ctx *ctx = req->ctx;
1728
1729	spin_lock(&ctx->completion_lock);
1730	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
1731	/*
1732	 * If we're the last reference to this request, add to our locked
1733	 * free_list cache.
1734	 */
1735	if (req_ref_put_and_test(req)) {
1736		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1737			if (req->flags & IO_DISARM_MASK)
1738				io_disarm_next(req);
1739			if (req->link) {
1740				io_req_task_queue(req->link);
1741				req->link = NULL;
1742			}
1743		}
1744		io_dismantle_req(req);
1745		io_put_task(req->task, 1);
1746		list_add(&req->inflight_entry, &ctx->locked_free_list);
1747		ctx->locked_free_nr++;
1748	} else {
1749		if (!percpu_ref_tryget(&ctx->refs))
1750			req = NULL;
1751	}
1752	io_commit_cqring(ctx);
1753	spin_unlock(&ctx->completion_lock);
1754
1755	if (req) {
1756		io_cqring_ev_posted(ctx);
1757		percpu_ref_put(&ctx->refs);
1758	}
1759}
1760
1761static inline bool io_req_needs_clean(struct io_kiocb *req)
1762{
1763	return req->flags & IO_REQ_CLEAN_FLAGS;
1764}
1765
1766static void io_req_complete_state(struct io_kiocb *req, long res,
1767				  unsigned int cflags)
1768{
1769	if (io_req_needs_clean(req))
1770		io_clean_op(req);
1771	req->result = res;
1772	req->compl.cflags = cflags;
1773	req->flags |= REQ_F_COMPLETE_INLINE;
1774}
1775
1776static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1777				     long res, unsigned cflags)
1778{
1779	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1780		io_req_complete_state(req, res, cflags);
1781	else
1782		io_req_complete_post(req, res, cflags);
1783}
1784
1785static inline void io_req_complete(struct io_kiocb *req, long res)
1786{
1787	__io_req_complete(req, 0, res, 0);
1788}
1789
1790static void io_req_complete_failed(struct io_kiocb *req, long res)
1791{
1792	req_set_fail(req);
1793	io_req_complete_post(req, res, 0);
1794}
1795
1796/*
1797 * Don't initialise the fields below on every allocation, but do that in
1798 * advance and keep them valid across allocations.
1799 */
1800static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1801{
1802	req->ctx = ctx;
1803	req->link = NULL;
1804	req->async_data = NULL;
1805	/* not necessary, but safer to zero */
1806	req->result = 0;
1807}
1808
1809static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1810					struct io_submit_state *state)
1811{
1812	spin_lock(&ctx->completion_lock);
1813	list_splice_init(&ctx->locked_free_list, &state->free_list);
1814	ctx->locked_free_nr = 0;
1815	spin_unlock(&ctx->completion_lock);
1816}
1817
1818/* Returns true IFF there are requests in the cache */
1819static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1820{
1821	struct io_submit_state *state = &ctx->submit_state;
1822	int nr;
1823
1824	/*
1825	 * If we have more than a batch's worth of requests in our IRQ side
1826	 * locked cache, grab the lock and move them over to our submission
1827	 * side cache.
1828	 */
1829	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1830		io_flush_cached_locked_reqs(ctx, state);
1831
1832	nr = state->free_reqs;
1833	while (!list_empty(&state->free_list)) {
1834		struct io_kiocb *req = list_first_entry(&state->free_list,
1835					struct io_kiocb, inflight_entry);
1836
1837		list_del(&req->inflight_entry);
1838		state->reqs[nr++] = req;
1839		if (nr == ARRAY_SIZE(state->reqs))
1840			break;
1841	}
1842
1843	state->free_reqs = nr;
1844	return nr != 0;
1845}
1846
1847/*
1848 * A request might get retired back into the request caches even before opcode
1849 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1850 * Because of that, io_alloc_req() should be called only under ->uring_lock
1851 * and with extra caution not to return a request that is still being worked on.
1852 */
1853static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1854	__must_hold(&ctx->uring_lock)
1855{
1856	struct io_submit_state *state = &ctx->submit_state;
1857	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1858	int ret, i;
1859
1860	BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
1861
1862	if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
1863		goto got_req;
1864
1865	ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1866				    state->reqs);
1867
1868	/*
1869	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1870	 * retry single alloc to be on the safe side.
1871	 */
1872	if (unlikely(ret <= 0)) {
1873		state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1874		if (!state->reqs[0])
1875			return NULL;
1876		ret = 1;
1877	}
1878
1879	for (i = 0; i < ret; i++)
1880		io_preinit_req(state->reqs[i], ctx);
1881	state->free_reqs = ret;
1882got_req:
1883	state->free_reqs--;
1884	return state->reqs[state->free_reqs];
1885}
1886
1887static inline void io_put_file(struct file *file)
1888{
1889	if (file)
1890		fput(file);
1891}
1892
1893static void io_dismantle_req(struct io_kiocb *req)
1894{
1895	unsigned int flags = req->flags;
1896
1897	if (io_req_needs_clean(req))
1898		io_clean_op(req);
1899	if (!(flags & REQ_F_FIXED_FILE))
1900		io_put_file(req->file);
1901	if (req->fixed_rsrc_refs)
1902		percpu_ref_put(req->fixed_rsrc_refs);
1903	if (req->async_data) {
1904		kfree(req->async_data);
1905		req->async_data = NULL;
1906	}
1907}
1908
1909static void __io_free_req(struct io_kiocb *req)
1910{
1911	struct io_ring_ctx *ctx = req->ctx;
1912
1913	io_dismantle_req(req);
1914	io_put_task(req->task, 1);
1915
1916	spin_lock(&ctx->completion_lock);
1917	list_add(&req->inflight_entry, &ctx->locked_free_list);
1918	ctx->locked_free_nr++;
1919	spin_unlock(&ctx->completion_lock);
1920
1921	percpu_ref_put(&ctx->refs);
1922}
1923
1924static inline void io_remove_next_linked(struct io_kiocb *req)
1925{
1926	struct io_kiocb *nxt = req->link;
1927
1928	req->link = nxt->link;
1929	nxt->link = NULL;
1930}
1931
1932static bool io_kill_linked_timeout(struct io_kiocb *req)
1933	__must_hold(&req->ctx->completion_lock)
1934	__must_hold(&req->ctx->timeout_lock)
1935{
1936	struct io_kiocb *link = req->link;
1937
1938	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
1939		struct io_timeout_data *io = link->async_data;
1940
1941		io_remove_next_linked(req);
1942		link->timeout.head = NULL;
1943		if (hrtimer_try_to_cancel(&io->timer) != -1) {
1944			io_cqring_fill_event(link->ctx, link->user_data,
1945					     -ECANCELED, 0);
1946			io_put_req_deferred(link);
1947			return true;
1948		}
1949	}
1950	return false;
1951}
1952
1953static void io_fail_links(struct io_kiocb *req)
1954	__must_hold(&req->ctx->completion_lock)
1955{
1956	struct io_kiocb *nxt, *link = req->link;
1957
1958	req->link = NULL;
1959	while (link) {
1960		nxt = link->link;
1961		link->link = NULL;
1962
1963		trace_io_uring_fail_link(req, link);
1964		io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
1965		io_put_req_deferred(link);
1966		link = nxt;
1967	}
1968}
1969
1970static bool io_disarm_next(struct io_kiocb *req)
1971	__must_hold(&req->ctx->completion_lock)
1972{
1973	bool posted = false;
1974
1975	if (req->flags & REQ_F_ARM_LTIMEOUT) {
1976		struct io_kiocb *link = req->link;
1977
1978		req->flags &= ~REQ_F_ARM_LTIMEOUT;
1979		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
1980			io_remove_next_linked(req);
1981			io_cqring_fill_event(link->ctx, link->user_data,
1982					     -ECANCELED, 0);
1983			io_put_req_deferred(link);
1984			posted = true;
1985		}
1986	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
1987		struct io_ring_ctx *ctx = req->ctx;
1988
1989		spin_lock_irq(&ctx->timeout_lock);
1990		posted = io_kill_linked_timeout(req);
1991		spin_unlock_irq(&ctx->timeout_lock);
1992	}
1993	if (unlikely((req->flags & REQ_F_FAIL) &&
1994		     !(req->flags & REQ_F_HARDLINK))) {
1995		posted |= (req->link != NULL);
1996		io_fail_links(req);
1997	}
1998	return posted;
1999}
2000
2001static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
2002{
2003	struct io_kiocb *nxt;
2004
2005	/*
2006	 * If LINK is set, we have dependent requests in this chain. If we
2007	 * didn't fail this request, queue the first one up, moving any other
2008	 * dependencies to the next request. In case of failure, fail the rest
2009	 * of the chain.
2010	 */
2011	if (req->flags & IO_DISARM_MASK) {
2012		struct io_ring_ctx *ctx = req->ctx;
2013		bool posted;
2014
2015		spin_lock(&ctx->completion_lock);
2016		posted = io_disarm_next(req);
2017		if (posted)
2018			io_commit_cqring(req->ctx);
2019		spin_unlock(&ctx->completion_lock);
2020		if (posted)
2021			io_cqring_ev_posted(ctx);
2022	}
2023	nxt = req->link;
2024	req->link = NULL;
2025	return nxt;
2026}
2027
2028static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2029{
2030	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2031		return NULL;
2032	return __io_req_find_next(req);
2033}
2034
2035static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2036{
2037	if (!ctx)
2038		return;
2039	if (*locked) {
2040		if (ctx->submit_state.compl_nr)
2041			io_submit_flush_completions(ctx);
2042		mutex_unlock(&ctx->uring_lock);
2043		*locked = false;
2044	}
2045	percpu_ref_put(&ctx->refs);
2046}
2047
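/*
 * task_work handler for io_uring: splice the per-task work list and run each
 * request's callback, batching by ctx; the uring_lock is taken via trylock
 * when possible so completions can be flushed in batches.
 */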
2048static void tctx_task_work(struct callback_head *cb)
2049{
2050	bool locked = false;
2051	struct io_ring_ctx *ctx = NULL;
2052	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2053						  task_work);
2054
2055	while (1) {
2056		struct io_wq_work_node *node;
2057
2058		spin_lock_irq(&tctx->task_lock);
2059		node = tctx->task_list.first;
2060		INIT_WQ_LIST(&tctx->task_list);
2061		if (!node)
2062			tctx->task_running = false;
2063		spin_unlock_irq(&tctx->task_lock);
2064		if (!node)
2065			break;
2066
2067		do {
2068			struct io_wq_work_node *next = node->next;
2069			struct io_kiocb *req = container_of(node, struct io_kiocb,
2070							    io_task_work.node);
2071
2072			if (req->ctx != ctx) {
2073				ctx_flush_and_put(ctx, &locked);
2074				ctx = req->ctx;
2075				/* if not contended, grab and improve batching */
2076				locked = mutex_trylock(&ctx->uring_lock);
2077				percpu_ref_get(&ctx->refs);
2078			}
2079			req->io_task_work.func(req, &locked);
2080			node = next;
2081		} while (node);
2082
2083		cond_resched();
2084	}
2085
2086	ctx_flush_and_put(ctx, &locked);
2087}
2088
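/*
 * Queue @req on its task's io_uring task_work list. If no task_work was
 * pending yet, notify the task (TWA_SIGNAL, or just a wakeup for SQPOLL).
 * If task_work_add() fails (the task is exiting), punt everything that was
 * queued to the ctx's delayed fallback work instead.
 */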
2089static void io_req_task_work_add(struct io_kiocb *req)
2090{
2091	struct task_struct *tsk = req->task;
2092	struct io_uring_task *tctx = tsk->io_uring;
2093	enum task_work_notify_mode notify;
2094	struct io_wq_work_node *node;
2095	unsigned long flags;
2096	bool running;
2097
2098	WARN_ON_ONCE(!tctx);
2099
2100	spin_lock_irqsave(&tctx->task_lock, flags);
2101	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2102	running = tctx->task_running;
2103	if (!running)
2104		tctx->task_running = true;
2105	spin_unlock_irqrestore(&tctx->task_lock, flags);
2106
2107	/* task_work already pending, we're done */
2108	if (running)
2109		return;
2110
2111	/*
2112	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2113	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2114	 * processing task_work. There's no reliable way to tell if TWA_RESUME
2115	 * will do the job.
2116	 */
2117	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2118	if (!task_work_add(tsk, &tctx->task_work, notify)) {
2119		wake_up_process(tsk);
2120		return;
2121	}
2122
2123	spin_lock_irqsave(&tctx->task_lock, flags);
2124	tctx->task_running = false;
2125	node = tctx->task_list.first;
2126	INIT_WQ_LIST(&tctx->task_list);
2127	spin_unlock_irqrestore(&tctx->task_lock, flags);
2128
2129	while (node) {
2130		req = container_of(node, struct io_kiocb, io_task_work.node);
2131		node = node->next;
2132		if (llist_add(&req->io_task_work.fallback_node,
2133			      &req->ctx->fallback_llist))
2134			schedule_delayed_work(&req->ctx->fallback_work, 1);
2135	}
2136}
2137
2138static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
2139{
2140	struct io_ring_ctx *ctx = req->ctx;
2141
2142	/* ctx is guaranteed to stay alive while we hold uring_lock */
2143	io_tw_lock(ctx, locked);
2144	io_req_complete_failed(req, req->result);
2145}
2146
2147static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2148{
2149	struct io_ring_ctx *ctx = req->ctx;
2150
2151	/* ctx stays valid until unlock, even if we drop all our ctx->refs */
2152	io_tw_lock(ctx, locked);
2153	/* req->task == current here, checking PF_EXITING is safe */
2154	if (likely(!(req->task->flags & PF_EXITING)))
2155		__io_queue_sqe(req);
2156	else
2157		io_req_complete_failed(req, -EFAULT);
2158}
2159
2160static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2161{
2162	req->result = ret;
2163	req->io_task_work.func = io_req_task_cancel;
2164	io_req_task_work_add(req);
2165}
2166
2167static void io_req_task_queue(struct io_kiocb *req)
2168{
2169	req->io_task_work.func = io_req_task_submit;
2170	io_req_task_work_add(req);
2171}
2172
2173static void io_req_task_queue_reissue(struct io_kiocb *req)
2174{
2175	req->io_task_work.func = io_queue_async_work;
2176	io_req_task_work_add(req);
2177}
2178
2179static inline void io_queue_next(struct io_kiocb *req)
2180{
2181	struct io_kiocb *nxt = io_req_find_next(req);
2182
2183	if (nxt)
2184		io_req_task_queue(nxt);
2185}
2186
2187static void io_free_req(struct io_kiocb *req)
2188{
2189	io_queue_next(req);
2190	__io_free_req(req);
2191}
2192
2193static void io_free_req_work(struct io_kiocb *req, bool *locked)
2194{
2195	io_free_req(req);
2196}
2197
2198struct req_batch {
2199	struct task_struct	*task;
2200	int			task_refs;
2201	int			ctx_refs;
2202};
2203
2204static inline void io_init_req_batch(struct req_batch *rb)
2205{
2206	rb->task_refs = 0;
2207	rb->ctx_refs = 0;
2208	rb->task = NULL;
2209}
2210
2211static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2212				     struct req_batch *rb)
2213{
2214	if (rb->ctx_refs)
2215		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2216	if (rb->task)
2217		io_put_task(rb->task, rb->task_refs);
2218}
2219
2220static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2221			      struct io_submit_state *state)
2222{
2223	io_queue_next(req);
2224	io_dismantle_req(req);
2225
2226	if (req->task != rb->task) {
2227		if (rb->task)
2228			io_put_task(rb->task, rb->task_refs);
2229		rb->task = req->task;
2230		rb->task_refs = 0;
2231	}
2232	rb->task_refs++;
2233	rb->ctx_refs++;
2234
2235	if (state->free_reqs != ARRAY_SIZE(state->reqs))
2236		state->reqs[state->free_reqs++] = req;
2237	else
2238		list_add(&req->inflight_entry, &state->free_list);
2239}
2240
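/*
 * Post CQEs for all requests batched in submit_state->compl_reqs, then drop
 * the submission references and recycle any requests whose refcount hit zero.
 */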
2241static void io_submit_flush_completions(struct io_ring_ctx *ctx)
2242	__must_hold(&ctx->uring_lock)
2243{
2244	struct io_submit_state *state = &ctx->submit_state;
2245	int i, nr = state->compl_nr;
2246	struct req_batch rb;
2247
2248	spin_lock(&ctx->completion_lock);
2249	for (i = 0; i < nr; i++) {
2250		struct io_kiocb *req = state->compl_reqs[i];
2251
2252		__io_cqring_fill_event(ctx, req->user_data, req->result,
2253					req->compl.cflags);
2254	}
2255	io_commit_cqring(ctx);
2256	spin_unlock(&ctx->completion_lock);
2257	io_cqring_ev_posted(ctx);
2258
2259	io_init_req_batch(&rb);
2260	for (i = 0; i < nr; i++) {
2261		struct io_kiocb *req = state->compl_reqs[i];
2262
2263		if (req_ref_put_and_test(req))
2264			io_req_free_batch(&rb, req, &ctx->submit_state);
2265	}
2266
2267	io_req_free_batch_finish(ctx, &rb);
2268	state->compl_nr = 0;
2269}
2270
2271/*
2272 * Drop reference to request, return next in chain (if there is one) if this
2273 * was the last reference to this request.
2274 */
2275static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2276{
2277	struct io_kiocb *nxt = NULL;
2278
2279	if (req_ref_put_and_test(req)) {
2280		nxt = io_req_find_next(req);
2281		__io_free_req(req);
2282	}
2283	return nxt;
2284}
2285
2286static inline void io_put_req(struct io_kiocb *req)
2287{
2288	if (req_ref_put_and_test(req))
2289		io_free_req(req);
2290}
2291
2292static inline void io_put_req_deferred(struct io_kiocb *req)
2293{
2294	if (req_ref_put_and_test(req)) {
2295		req->io_task_work.func = io_free_req_work;
2296		io_req_task_work_add(req);
2297	}
2298}
2299
2300static unsigned io_cqring_events(struct io_ring_ctx *ctx)
2301{
2302	/* See comment at the top of this file */
2303	smp_rmb();
2304	return __io_cqring_events(ctx);
2305}
2306
2307static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2308{
2309	struct io_rings *rings = ctx->rings;
2310
2311	/* make sure SQ entry isn't read before tail */
2312	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2313}
2314
2315static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2316{
2317	unsigned int cflags;
2318
2319	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2320	cflags |= IORING_CQE_F_BUFFER;
2321	req->flags &= ~REQ_F_BUFFER_SELECTED;
2322	kfree(kbuf);
2323	return cflags;
2324}
2325
2326static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2327{
2328	struct io_buffer *kbuf;
2329
2330	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2331		return 0;
2332	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2333	return io_put_kbuf(req, kbuf);
2334}
2335
2336static inline bool io_run_task_work(void)
2337{
2338	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2339		__set_current_state(TASK_RUNNING);
2340		tracehook_notify_signal();
2341		return true;
2342	}
2343
2344	return false;
2345}
2346
2347/*
2348 * Find and free completed poll iocbs
2349 */
2350static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2351			       struct list_head *done)
2352{
2353	struct req_batch rb;
2354	struct io_kiocb *req;
2355
2356	/* order with ->result store in io_complete_rw_iopoll() */
2357	smp_rmb();
2358
2359	io_init_req_batch(&rb);
2360	while (!list_empty(done)) {
2361		req = list_first_entry(done, struct io_kiocb, inflight_entry);
2362		list_del(&req->inflight_entry);
2363
2364		if (READ_ONCE(req->result) == -EAGAIN &&
2365		    !(req->flags & REQ_F_DONT_REISSUE)) {
2366			req->iopoll_completed = 0;
2367			io_req_task_queue_reissue(req);
2368			continue;
2369		}
2370
2371		__io_cqring_fill_event(ctx, req->user_data, req->result,
2372					io_put_rw_kbuf(req));
2373		(*nr_events)++;
2374
2375		if (req_ref_put_and_test(req))
2376			io_req_free_batch(&rb, req, &ctx->submit_state);
2377	}
2378
2379	io_commit_cqring(ctx);
2380	io_cqring_ev_posted_iopoll(ctx);
2381	io_req_free_batch_finish(ctx, &rb);
2382}
2383
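/*
 * Poll for completions on the ctx iopoll list: already-completed requests
 * are moved to a local done list, pending ones are polled via ->iopoll(),
 * and the done list is then reaped, bumping *nr_events per completion.
 */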
2384static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2385			long min)
2386{
2387	struct io_kiocb *req, *tmp;
2388	LIST_HEAD(done);
2389	bool spin;
2390
2391	/*
2392	 * Only spin for completions if we don't have multiple devices hanging
2393	 * off our complete list, and we're under the requested amount.
2394	 */
2395	spin = !ctx->poll_multi_queue && *nr_events < min;
2396
2397	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2398		struct kiocb *kiocb = &req->rw.kiocb;
2399		int ret;
2400
2401		/*
2402		 * Move completed and retryable entries to our local lists.
2403		 * If we find a request that requires polling, break out
2404		 * and complete those lists first, if we have entries there.
2405		 */
2406		if (READ_ONCE(req->iopoll_completed)) {
2407			list_move_tail(&req->inflight_entry, &done);
2408			continue;
2409		}
2410		if (!list_empty(&done))
2411			break;
2412
2413		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2414		if (unlikely(ret < 0))
2415			return ret;
2416		else if (ret)
2417			spin = false;
2418
2419		/* iopoll may have completed current req */
2420		if (READ_ONCE(req->iopoll_completed))
2421			list_move_tail(&req->inflight_entry, &done);
2422	}
2423
2424	if (!list_empty(&done))
2425		io_iopoll_complete(ctx, nr_events, &done);
2426
2427	return 0;
2428}
2429
2430/*
2431 * We can't just wait for polled events to come to us, we have to actively
2432 * find and complete them.
2433 */
2434static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2435{
2436	if (!(ctx->flags & IORING_SETUP_IOPOLL))
2437		return;
2438
2439	mutex_lock(&ctx->uring_lock);
2440	while (!list_empty(&ctx->iopoll_list)) {
2441		unsigned int nr_events = 0;
2442
2443		io_do_iopoll(ctx, &nr_events, 0);
2444
2445		/* let it sleep and repeat later if can't complete a request */
2446		if (nr_events == 0)
2447			break;
2448		/*
2449		 * Ensure we allow local-to-the-cpu processing to take place;
2450		 * in this case we need to ensure that we reap all events.
2451		 * Also let task_work, etc. make progress by releasing the mutex.
2452		 */
2453		if (need_resched()) {
2454			mutex_unlock(&ctx->uring_lock);
2455			cond_resched();
2456			mutex_lock(&ctx->uring_lock);
2457		}
2458	}
2459	mutex_unlock(&ctx->uring_lock);
2460}
2461
2462static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2463{
2464	unsigned int nr_events = 0;
2465	int ret = 0;
2466
2467	/*
2468	 * We disallow the app entering submit/complete with polling, but we
2469	 * still need to lock the ring to prevent racing with polled issue
2470	 * that got punted to a workqueue.
2471	 */
2472	mutex_lock(&ctx->uring_lock);
2473	/*
2474	 * Don't enter poll loop if we already have events pending.
2475	 * If we do, we can potentially be spinning for commands that
2476	 * already triggered a CQE (eg in error).
2477	 */
2478	if (test_bit(0, &ctx->check_cq_overflow))
2479		__io_cqring_overflow_flush(ctx, false);
2480	if (io_cqring_events(ctx))
2481		goto out;
2482	do {
2483		/*
2484		 * If a submit got punted to a workqueue, we can have the
2485		 * application entering polling for a command before it gets
2486		 * issued. That app will hold the uring_lock for the duration
2487		 * of the poll right here, so we need to take a breather every
2488		 * now and then to ensure that the issue has a chance to add
2489		 * the poll to the issued list. Otherwise we can spin here
2490		 * forever, while the workqueue is stuck trying to acquire the
2491		 * very same mutex.
2492		 */
2493		if (list_empty(&ctx->iopoll_list)) {
2494			u32 tail = ctx->cached_cq_tail;
2495
2496			mutex_unlock(&ctx->uring_lock);
2497			io_run_task_work();
2498			mutex_lock(&ctx->uring_lock);
2499
2500			/* some requests don't go through iopoll_list */
2501			if (tail != ctx->cached_cq_tail ||
2502			    list_empty(&ctx->iopoll_list))
2503				break;
2504		}
2505		ret = io_do_iopoll(ctx, &nr_events, min);
2506	} while (!ret && nr_events < min && !need_resched());
2507out:
2508	mutex_unlock(&ctx->uring_lock);
2509	return ret;
2510}
2511
2512static void kiocb_end_write(struct io_kiocb *req)
2513{
2514	/*
2515	 * Tell lockdep we inherited freeze protection from submission
2516	 * thread.
2517	 */
2518	if (req->flags & REQ_F_ISREG) {
2519		struct super_block *sb = file_inode(req->file)->i_sb;
2520
2521		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
2522		sb_end_write(sb);
2523	}
2524}
2525
2526#ifdef CONFIG_BLOCK
2527static bool io_resubmit_prep(struct io_kiocb *req)
2528{
2529	struct io_async_rw *rw = req->async_data;
2530
2531	if (!rw)
2532		return !io_req_prep_async(req);
2533	/* may have left rw->iter inconsistent on -EIOCBQUEUED */
2534	iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
2535	return true;
2536}
2537
2538static bool io_rw_should_reissue(struct io_kiocb *req)
2539{
2540	umode_t mode = file_inode(req->file)->i_mode;
2541	struct io_ring_ctx *ctx = req->ctx;
2542
2543	if (!S_ISBLK(mode) && !S_ISREG(mode))
2544		return false;
2545	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2546	    !(ctx->flags & IORING_SETUP_IOPOLL)))
2547		return false;
2548	/*
2549	 * If ref is dying, we might be running poll reap from the exit work.
2550	 * Don't attempt to reissue from that path, just let it fail with
2551	 * -EAGAIN.
2552	 */
2553	if (percpu_ref_is_dying(&ctx->refs))
2554		return false;
2555	/*
2556	 * Play it safe: assume it's not safe to re-import and reissue if we're
2557	 * not in the original thread group, or not in task context.
2558	 */
2559	if (!same_thread_group(req->task, current) || !in_task())
2560		return false;
2561	return true;
2562}
2563#else
2564static bool io_resubmit_prep(struct io_kiocb *req)
2565{
2566	return false;
2567}
2568static bool io_rw_should_reissue(struct io_kiocb *req)
2569{
2570	return false;
2571}
2572#endif
2573
2574static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2575{
2576	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2577		kiocb_end_write(req);
2578	if (res != req->result) {
2579		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2580		    io_rw_should_reissue(req)) {
2581			req->flags |= REQ_F_REISSUE;
2582			return true;
2583		}
2584		req_set_fail(req);
2585		req->result = res;
2586	}
2587	return false;
2588}
2589
2590static void io_req_task_complete(struct io_kiocb *req, bool *locked)
2591{
2592	unsigned int cflags = io_put_rw_kbuf(req);
2593	long res = req->result;
2594
2595	if (*locked) {
2596		struct io_ring_ctx *ctx = req->ctx;
2597		struct io_submit_state *state = &ctx->submit_state;
2598
2599		io_req_complete_state(req, res, cflags);
2600		state->compl_reqs[state->compl_nr++] = req;
2601		if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
2602			io_submit_flush_completions(ctx);
2603	} else {
2604		io_req_complete_post(req, res, cflags);
2605	}
2606}
2607
2608static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2609			     unsigned int issue_flags)
2610{
2611	if (__io_complete_rw_common(req, res))
2612		return;
2613	__io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
2614}
2615
2616static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2617{
2618	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2619
2620	if (__io_complete_rw_common(req, res))
2621		return;
2622	req->result = res;
2623	req->io_task_work.func = io_req_task_complete;
2624	io_req_task_work_add(req);
2625}
2626
2627static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2628{
2629	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2630
2631	if (kiocb->ki_flags & IOCB_WRITE)
2632		kiocb_end_write(req);
2633	if (unlikely(res != req->result)) {
2634		if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
2635		    io_resubmit_prep(req))) {
2636			req_set_fail(req);
2637			req->flags |= REQ_F_DONT_REISSUE;
2638		}
2639	}
2640
2641	WRITE_ONCE(req->result, res);
2642	/* order with io_iopoll_complete() checking ->result */
2643	smp_wmb();
2644	WRITE_ONCE(req->iopoll_completed, 1);
2645}
2646
2647/*
2648 * After the iocb has been issued, it's safe to be found on the poll list.
2649 * Adding the kiocb to the list AFTER submission ensures that we don't
2650 * find it from an io_do_iopoll() thread before the issuer is done
2651 * accessing the kiocb cookie.
2652 */
2653static void io_iopoll_req_issued(struct io_kiocb *req)
2654{
2655	struct io_ring_ctx *ctx = req->ctx;
2656	const bool in_async = io_wq_current_is_worker();
2657
2658	/* workqueue context doesn't hold uring_lock, grab it now */
2659	if (unlikely(in_async))
2660		mutex_lock(&ctx->uring_lock);
2661
2662	/*
2663	 * Track whether we have multiple files in our lists. This will impact
2664	 * how we do polling eventually, not spinning if we're on potentially
2665	 * different devices.
2666	 */
2667	if (list_empty(&ctx->iopoll_list)) {
2668		ctx->poll_multi_queue = false;
2669	} else if (!ctx->poll_multi_queue) {
2670		struct io_kiocb *list_req;
2671		unsigned int queue_num0, queue_num1;
2672
2673		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2674						inflight_entry);
2675
2676		if (list_req->file != req->file) {
2677			ctx->poll_multi_queue = true;
2678		} else {
2679			queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
2680			queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
2681			if (queue_num0 != queue_num1)
2682				ctx->poll_multi_queue = true;
2683		}
2684	}
2685
2686	/*
2687	 * For fast devices, IO may have already completed. If it has, add
2688	 * it to the front so we find it first.
2689	 */
2690	if (READ_ONCE(req->iopoll_completed))
2691		list_add(&req->inflight_entry, &ctx->iopoll_list);
2692	else
2693		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2694
2695	if (unlikely(in_async)) {
2696		/*
2697		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
2698		 * in sq thread task context or in io worker task context. If
2699		 * the current task context is the sq thread, we don't need to
2700		 * check whether we should wake up the sq thread.
2701		 */
2702		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2703		    wq_has_sleeper(&ctx->sq_data->wait))
2704			wake_up(&ctx->sq_data->wait);
2705
2706		mutex_unlock(&ctx->uring_lock);
2707	}
2708}
2709
2710static bool io_bdev_nowait(struct block_device *bdev)
2711{
2712	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2713}
2714
2715/*
2716 * If we tracked the file through the SCM inflight mechanism, we could support
2717 * any file. For now, just ensure that anything potentially problematic is done
2718 * inline.
2719 */
2720static bool __io_file_supports_nowait(struct file *file, int rw)
2721{
2722	umode_t mode = file_inode(file)->i_mode;
2723
2724	if (S_ISBLK(mode)) {
2725		if (IS_ENABLED(CONFIG_BLOCK) &&
2726		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2727			return true;
2728		return false;
2729	}
2730	if (S_ISSOCK(mode))
2731		return true;
2732	if (S_ISREG(mode)) {
2733		if (IS_ENABLED(CONFIG_BLOCK) &&
2734		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2735		    file->f_op != &io_uring_fops)
2736			return true;
2737		return false;
2738	}
2739
2740	/* any ->read/write should understand O_NONBLOCK */
2741	if (file->f_flags & O_NONBLOCK)
2742		return true;
2743
2744	if (!(file->f_mode & FMODE_NOWAIT))
2745		return false;
2746
2747	if (rw == READ)
2748		return file->f_op->read_iter != NULL;
2749
2750	return file->f_op->write_iter != NULL;
2751}
2752
2753static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
2754{
2755	if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
2756		return true;
2757	else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
2758		return true;
2759
2760	return __io_file_supports_nowait(req->file, rw);
2761}
2762
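/*
 * Prepare the kiocb for a read/write request from the SQE: file position
 * (current position for offset -1 on non-stream files), rw flags, ioprio,
 * and the completion handler (iopoll vs. task_work based). Fixed-buffer
 * opcodes also pin the rsrc node here.
 */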
2763static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2764{
2765	struct io_ring_ctx *ctx = req->ctx;
2766	struct kiocb *kiocb = &req->rw.kiocb;
2767	struct file *file = req->file;
2768	unsigned ioprio;
2769	int ret;
2770
2771	if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
2772		req->flags |= REQ_F_ISREG;
2773
2774	kiocb->ki_pos = READ_ONCE(sqe->off);
2775	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2776		req->flags |= REQ_F_CUR_POS;
2777		kiocb->ki_pos = file->f_pos;
2778	}
2779	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2780	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2781	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2782	if (unlikely(ret))
2783		return ret;
2784
2785	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2786	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2787		req->flags |= REQ_F_NOWAIT;
2788
2789	ioprio = READ_ONCE(sqe->ioprio);
2790	if (ioprio) {
2791		ret = ioprio_check_cap(ioprio);
2792		if (ret)
2793			return ret;
2794
2795		kiocb->ki_ioprio = ioprio;
2796	} else
2797		kiocb->ki_ioprio = get_current_ioprio();
2798
2799	if (ctx->flags & IORING_SETUP_IOPOLL) {
2800		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2801		    !kiocb->ki_filp->f_op->iopoll)
2802			return -EOPNOTSUPP;
2803
2804		kiocb->ki_flags |= IOCB_HIPRI;
2805		kiocb->ki_complete = io_complete_rw_iopoll;
2806		req->iopoll_completed = 0;
2807	} else {
2808		if (kiocb->ki_flags & IOCB_HIPRI)
2809			return -EINVAL;
2810		kiocb->ki_complete = io_complete_rw;
2811	}
2812
2813	if (req->opcode == IORING_OP_READ_FIXED ||
2814	    req->opcode == IORING_OP_WRITE_FIXED) {
2815		req->imu = NULL;
2816		io_req_set_rsrc_node(req);
2817	}
2818
2819	req->rw.addr = READ_ONCE(sqe->addr);
2820	req->rw.len = READ_ONCE(sqe->len);
2821	req->buf_index = READ_ONCE(sqe->buf_index);
2822	return 0;
2823}
2824
2825static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2826{
2827	switch (ret) {
2828	case -EIOCBQUEUED:
2829		break;
2830	case -ERESTARTSYS:
2831	case -ERESTARTNOINTR:
2832	case -ERESTARTNOHAND:
2833	case -ERESTART_RESTARTBLOCK:
2834		/*
2835		 * We can't just restart the syscall, since previously
2836		 * submitted sqes may already be in progress. Just fail this
2837		 * IO with EINTR.
2838		 */
2839		ret = -EINTR;
2840		fallthrough;
2841	default:
2842		kiocb->ki_complete(kiocb, ret, 0);
2843	}
2844}
2845
2846static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2847		       unsigned int issue_flags)
2848{
2849	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2850	struct io_async_rw *io = req->async_data;
2851	bool check_reissue = kiocb->ki_complete == io_complete_rw;
2852
2853	/* add previously done IO, if any */
2854	if (io && io->bytes_done > 0) {
2855		if (ret < 0)
2856			ret = io->bytes_done;
2857		else
2858			ret += io->bytes_done;
2859	}
2860
2861	if (req->flags & REQ_F_CUR_POS)
2862		req->file->f_pos = kiocb->ki_pos;
2863	if (ret >= 0 && check_reissue)
2864		__io_complete_rw(req, ret, 0, issue_flags);
2865	else
2866		io_rw_done(kiocb, ret);
2867
2868	if (check_reissue && (req->flags & REQ_F_REISSUE)) {
2869		req->flags &= ~REQ_F_REISSUE;
2870		if (io_resubmit_prep(req)) {
2871			io_req_task_queue_reissue(req);
2872		} else {
2873			req_set_fail(req);
2874			__io_req_complete(req, issue_flags, ret,
2875					  io_put_rw_kbuf(req));
2876		}
2877	}
2878}
2879
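/*
 * Set up a bvec iterator over a registered (fixed) buffer, after checking
 * that the request's address range lies fully inside the mapped region.
 */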
2880static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
2881			     struct io_mapped_ubuf *imu)
2882{
2883	size_t len = req->rw.len;
2884	u64 buf_end, buf_addr = req->rw.addr;
2885	size_t offset;
2886
2887	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
2888		return -EFAULT;
2889	/* not inside the mapped region */
2890	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
2891		return -EFAULT;
2892
2893	/*
2894	 * May not be a start of buffer, set size appropriately
2895	 * and advance us to the beginning.
2896	 */
2897	offset = buf_addr - imu->ubuf;
2898	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2899
2900	if (offset) {
2901		/*
2902		 * Don't use iov_iter_advance() here, as it's really slow for
2903		 * using the latter parts of a big fixed buffer - it iterates
2904		 * over each segment manually. We can cheat a bit here, because
2905		 * we know that:
2906		 *
2907		 * 1) it's a BVEC iter, we set it up
2908		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2909		 *    first and last bvec
2910		 *
2911		 * So just find our index, and adjust the iterator afterwards.
2912		 * If the offset is within the first bvec (or the whole first
2913		 * bvec), just use iov_iter_advance(). This makes it easier
2914		 * since we can just skip the first segment, which may not
2915		 * be PAGE_SIZE aligned.
2916		 */
2917		const struct bio_vec *bvec = imu->bvec;
2918
2919		if (offset <= bvec->bv_len) {
2920			iov_iter_advance(iter, offset);
2921		} else {
2922			unsigned long seg_skip;
2923
2924			/* skip first vec */
2925			offset -= bvec->bv_len;
2926			seg_skip = 1 + (offset >> PAGE_SHIFT);
2927
2928			iter->bvec = bvec + seg_skip;
2929			iter->nr_segs -= seg_skip;
2930			iter->count -= bvec->bv_len + offset;
2931			iter->iov_offset = offset & ~PAGE_MASK;
2932		}
2933	}
2934
2935	return 0;
2936}
2937
2938static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2939{
2940	struct io_ring_ctx *ctx = req->ctx;
2941	struct io_mapped_ubuf *imu = req->imu;
2942	u16 index, buf_index = req->buf_index;
2943
2944	if (likely(!imu)) {
2945		if (unlikely(buf_index >= ctx->nr_user_bufs))
2946			return -EFAULT;
2947		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2948		imu = READ_ONCE(ctx->user_bufs[index]);
2949		req->imu = imu;
2950	}
2951	return __io_import_fixed(req, rw, iter, imu);
2952}
2953
2954static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2955{
2956	if (needs_lock)
2957		mutex_unlock(&ctx->uring_lock);
2958}
2959
2960static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2961{
2962	/*
2963	 * "Normal" inline submissions always hold the uring_lock, since we
2964	 * grab it from the system call. Same is true for the SQPOLL offload.
2965	 * The only exception is when we've detached the request and issue it
2966	 * from an async worker thread, grab the lock for that case.
2967	 */
2968	if (needs_lock)
2969		mutex_lock(&ctx->uring_lock);
2970}
2971
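/*
 * Pick a provided buffer from the group identified by @bgid, clamping *len
 * to the buffer size. Returns ERR_PTR(-ENOBUFS) if the group is empty, or
 * the already-selected buffer if one was picked earlier for this request.
 */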
2972static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2973					  int bgid, struct io_buffer *kbuf,
2974					  bool needs_lock)
2975{
2976	struct io_buffer *head;
2977
2978	if (req->flags & REQ_F_BUFFER_SELECTED)
2979		return kbuf;
2980
2981	io_ring_submit_lock(req->ctx, needs_lock);
2982
2983	lockdep_assert_held(&req->ctx->uring_lock);
2984
2985	head = xa_load(&req->ctx->io_buffers, bgid);
2986	if (head) {
2987		if (!list_empty(&head->list)) {
2988			kbuf = list_last_entry(&head->list, struct io_buffer,
2989							list);
2990			list_del(&kbuf->list);
2991		} else {
2992			kbuf = head;
2993			xa_erase(&req->ctx->io_buffers, bgid);
2994		}
2995		if (*len > kbuf->len)
2996			*len = kbuf->len;
2997	} else {
2998		kbuf = ERR_PTR(-ENOBUFS);
2999	}
3000
3001	io_ring_submit_unlock(req->ctx, needs_lock);
3002
3003	return kbuf;
3004}
3005
3006static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3007					bool needs_lock)
3008{
3009	struct io_buffer *kbuf;
3010	u16 bgid;
3011
3012	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3013	bgid = req->buf_index;
3014	kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
3015	if (IS_ERR(kbuf))
3016		return kbuf;
3017	req->rw.addr = (u64) (unsigned long) kbuf;
3018	req->flags |= REQ_F_BUFFER_SELECTED;
3019	return u64_to_user_ptr(kbuf->addr);
3020}
3021
3022#ifdef CONFIG_COMPAT
3023static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3024				bool needs_lock)
3025{
3026	struct compat_iovec __user *uiov;
3027	compat_ssize_t clen;
3028	void __user *buf;
3029	ssize_t len;
3030
3031	uiov = u64_to_user_ptr(req->rw.addr);
3032	if (!access_ok(uiov, sizeof(*uiov)))
3033		return -EFAULT;
3034	if (__get_user(clen, &uiov->iov_len))
3035		return -EFAULT;
3036	if (clen < 0)
3037		return -EINVAL;
3038
3039	len = clen;
3040	buf = io_rw_buffer_select(req, &len, needs_lock);
3041	if (IS_ERR(buf))
3042		return PTR_ERR(buf);
3043	iov[0].iov_base = buf;
3044	iov[0].iov_len = (compat_size_t) len;
3045	return 0;
3046}
3047#endif
3048
3049static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3050				      bool needs_lock)
3051{
3052	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3053	void __user *buf;
3054	ssize_t len;
3055
3056	if (copy_from_user(iov, uiov, sizeof(*uiov)))
3057		return -EFAULT;
3058
3059	len = iov[0].iov_len;
3060	if (len < 0)
3061		return -EINVAL;
3062	buf = io_rw_buffer_select(req, &len, needs_lock);
3063	if (IS_ERR(buf))
3064		return PTR_ERR(buf);
3065	iov[0].iov_base = buf;
3066	iov[0].iov_len = len;
3067	return 0;
3068}
3069
3070static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3071				    bool needs_lock)
3072{
3073	if (req->flags & REQ_F_BUFFER_SELECTED) {
3074		struct io_buffer *kbuf;
3075
3076		kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3077		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3078		iov[0].iov_len = kbuf->len;
3079		return 0;
3080	}
3081	if (req->rw.len != 1)
3082		return -EINVAL;
3083
3084#ifdef CONFIG_COMPAT
3085	if (req->ctx->compat)
3086		return io_compat_import(req, iov, needs_lock);
3087#endif
3088
3089	return __io_iov_buffer_select(req, iov, needs_lock);
3090}
3091
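/*
 * Build the iov_iter for a read/write request: fixed buffers, provided
 * (selected) buffers, the single-range READ/WRITE opcodes, or a full iovec
 * import from userspace for readv/writev style requests.
 */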
3092static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3093			   struct iov_iter *iter, bool needs_lock)
3094{
3095	void __user *buf = u64_to_user_ptr(req->rw.addr);
3096	size_t sqe_len = req->rw.len;
3097	u8 opcode = req->opcode;
3098	ssize_t ret;
3099
3100	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3101		*iovec = NULL;
3102		return io_import_fixed(req, rw, iter);
3103	}
3104
3105	/* buffer index only valid with fixed read/write, or buffer select  */
3106	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
3107		return -EINVAL;
3108
3109	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3110		if (req->flags & REQ_F_BUFFER_SELECT) {
3111			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3112			if (IS_ERR(buf))
3113				return PTR_ERR(buf);
3114			req->rw.len = sqe_len;
3115		}
3116
3117		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3118		*iovec = NULL;
3119		return ret;
3120	}
3121
3122	if (req->flags & REQ_F_BUFFER_SELECT) {
3123		ret = io_iov_buffer_select(req, *iovec, needs_lock);
3124		if (!ret)
3125			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3126		*iovec = NULL;
3127		return ret;
3128	}
3129
3130	return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3131			      req->ctx->compat);
3132}
3133
3134static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3135{
3136	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3137}
3138
3139/*
3140 * For files that don't have ->read_iter() and ->write_iter(), handle them
3141 * by looping over ->read() or ->write() manually.
3142 */
3143static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3144{
3145	struct kiocb *kiocb = &req->rw.kiocb;
3146	struct file *file = req->file;
3147	ssize_t ret = 0;
3148
3149	/*
3150	 * Don't support polled IO through this interface, and we can't
3151	 * support non-blocking either. For the latter, this just causes
3152	 * the kiocb to be handled from an async context.
3153	 */
3154	if (kiocb->ki_flags & IOCB_HIPRI)
3155		return -EOPNOTSUPP;
3156	if (kiocb->ki_flags & IOCB_NOWAIT)
3157		return -EAGAIN;
3158
3159	while (iov_iter_count(iter)) {
3160		struct iovec iovec;
3161		ssize_t nr;
3162
3163		if (!iov_iter_is_bvec(iter)) {
3164			iovec = iov_iter_iovec(iter);
3165		} else {
3166			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3167			iovec.iov_len = req->rw.len;
3168		}
3169
3170		if (rw == READ) {
3171			nr = file->f_op->read(file, iovec.iov_base,
3172					      iovec.iov_len, io_kiocb_ppos(kiocb));
3173		} else {
3174			nr = file->f_op->write(file, iovec.iov_base,
3175					       iovec.iov_len, io_kiocb_ppos(kiocb));
3176		}
3177
3178		if (nr < 0) {
3179			if (!ret)
3180				ret = nr;
3181			break;
3182		}
3183		ret += nr;
3184		if (nr != iovec.iov_len)
3185			break;
3186		req->rw.len -= nr;
3187		req->rw.addr += nr;
3188		iov_iter_advance(iter, nr);
3189	}
3190
3191	return ret;
3192}
3193
3194static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3195			  const struct iovec *fast_iov, struct iov_iter *iter)
3196{
3197	struct io_async_rw *rw = req->async_data;
3198
3199	memcpy(&rw->iter, iter, sizeof(*iter));
3200	rw->free_iovec = iovec;
3201	rw->bytes_done = 0;
3202	/* can only be fixed buffers, no need to do anything */
3203	if (iov_iter_is_bvec(iter))
3204		return;
3205	if (!iovec) {
3206		unsigned iov_off = 0;
3207
3208		rw->iter.iov = rw->fast_iov;
3209		if (iter->iov != fast_iov) {
3210			iov_off = iter->iov - fast_iov;
3211			rw->iter.iov += iov_off;
3212		}
3213		if (rw->fast_iov != fast_iov)
3214			memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3215			       sizeof(struct iovec) * iter->nr_segs);
3216	} else {
3217		req->flags |= REQ_F_NEED_CLEANUP;
3218	}
3219}
3220
3221static inline int io_alloc_async_data(struct io_kiocb *req)
3222{
3223	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3224	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3225	return req->async_data == NULL;
3226}
3227
3228static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3229			     const struct iovec *fast_iov,
3230			     struct iov_iter *iter, bool force)
3231{
3232	if (!force && !io_op_defs[req->opcode].needs_async_setup)
3233		return 0;
3234	if (!req->async_data) {
3235		if (io_alloc_async_data(req)) {
3236			kfree(iovec);
3237			return -ENOMEM;
3238		}
3239
3240		io_req_map_rw(req, iovec, fast_iov, iter);
3241	}
3242	return 0;
3243}
3244
3245static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3246{
3247	struct io_async_rw *iorw = req->async_data;
3248	struct iovec *iov = iorw->fast_iov;
3249	int ret;
3250
3251	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3252	if (unlikely(ret < 0))
3253		return ret;
3254
3255	iorw->bytes_done = 0;
3256	iorw->free_iovec = iov;
3257	if (iov)
3258		req->flags |= REQ_F_NEED_CLEANUP;
3259	return 0;
3260}
3261
3262static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3263{
3264	if (unlikely(!(req->file->f_mode & FMODE_READ)))
3265		return -EBADF;
3266	return io_prep_rw(req, sqe);
3267}
3268
3269/*
3270 * This is our waitqueue callback handler, registered through lock_page_async()
3271 * when we initially tried to do the IO with the iocb and armed our waitqueue.
3272 * This gets called when the page is unlocked, and we generally expect that to
3273 * happen when the page IO is completed and the page is now uptodate. This will
3274 * queue a task_work based retry of the operation, attempting to copy the data
3275 * again. If the latter fails because the page was NOT uptodate, then we will
3276 * do a thread based blocking retry of the operation. That's the unexpected
3277 * slow path.
3278 */
3279static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3280			     int sync, void *arg)
3281{
3282	struct wait_page_queue *wpq;
3283	struct io_kiocb *req = wait->private;
3284	struct wait_page_key *key = arg;
3285
3286	wpq = container_of(wait, struct wait_page_queue, wait);
3287
3288	if (!wake_page_match(wpq, key))
3289		return 0;
3290
3291	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3292	list_del_init(&wait->entry);
3293	io_req_task_queue(req);
3294	return 1;
3295}
3296
3297/*
3298 * This controls whether a given IO request should be armed for async page
3299 * based retry. If we return false here, the request is handed to the async
3300 * worker threads for retry. If we're doing buffered reads on a regular file,
3301 * we prepare a private wait_page_queue entry and retry the operation. This
3302 * will either succeed because the page is now uptodate and unlocked, or it
3303 * will register a callback when the page is unlocked at IO completion. Through
3304 * that callback, io_uring uses task_work to setup a retry of the operation.
3305 * That retry will attempt the buffered read again. The retry will generally
3306 * succeed, or in rare cases where it fails, we then fall back to using the
3307 * async worker threads for a blocking retry.
3308 */
3309static bool io_rw_should_retry(struct io_kiocb *req)
3310{
3311	struct io_async_rw *rw = req->async_data;
3312	struct wait_page_queue *wait = &rw->wpq;
3313	struct kiocb *kiocb = &req->rw.kiocb;
3314
3315	/* never retry for NOWAIT, we just complete with -EAGAIN */
3316	if (req->flags & REQ_F_NOWAIT)
3317		return false;
3318
3319	/* Only for buffered IO */
3320	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3321		return false;
3322
3323	/*
3324	 * just use poll if we can, and don't attempt if the fs doesn't
3325	 * support callback based unlocks
3326	 */
3327	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3328		return false;
3329
3330	wait->wait.func = io_async_buf_func;
3331	wait->wait.private = req;
3332	wait->wait.flags = 0;
3333	INIT_LIST_HEAD(&wait->wait.entry);
3334	kiocb->ki_flags |= IOCB_WAITQ;
3335	kiocb->ki_flags &= ~IOCB_NOWAIT;
3336	kiocb->ki_waitq = wait;
3337	return true;
3338}
3339
3340static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3341{
3342	if (req->file->f_op->read_iter)
3343		return call_read_iter(req->file, &req->rw.kiocb, iter);
3344	else if (req->file->f_op->read)
3345		return loop_rw_iter(READ, req, iter);
3346	else
3347		return -EINVAL;
3348}
3349
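/*
 * Issue a read. A nonblocking attempt that returns -EAGAIN is either punted
 * back for blocking execution or, for buffered reads on files supporting
 * FMODE_BUF_RASYNC, retried through the page-unlock waitqueue (IOCB_WAITQ)
 * machinery set up in io_rw_should_retry().
 */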
3350static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3351{
3352	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3353	struct kiocb *kiocb = &req->rw.kiocb;
3354	struct iov_iter __iter, *iter = &__iter;
3355	struct io_async_rw *rw = req->async_data;
3356	ssize_t io_size, ret, ret2;
3357	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3358
3359	if (rw) {
3360		iter = &rw->iter;
3361		iovec = NULL;
3362	} else {
3363		ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3364		if (ret < 0)
3365			return ret;
3366	}
3367	io_size = iov_iter_count(iter);
3368	req->result = io_size;
3369
3370	/* Ensure we clear previously set non-block flag */
3371	if (!force_nonblock)
3372		kiocb->ki_flags &= ~IOCB_NOWAIT;
3373	else
3374		kiocb->ki_flags |= IOCB_NOWAIT;
3375
3376	/* If the file doesn't support async, just async punt */
3377	if (force_nonblock && !io_file_supports_nowait(req, READ)) {
3378		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3379		return ret ?: -EAGAIN;
3380	}
3381
3382	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
3383	if (unlikely(ret)) {
3384		kfree(iovec);
3385		return ret;
3386	}
3387
3388	ret = io_iter_do_read(req, iter);
3389
3390	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3391		req->flags &= ~REQ_F_REISSUE;
3392		/* IOPOLL retry should happen for io-wq threads */
3393		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3394			goto done;
3395		/* no retry on NONBLOCK nor RWF_NOWAIT */
3396		if (req->flags & REQ_F_NOWAIT)
3397			goto done;
3398		/* some cases will consume bytes even on error returns */
3399		iov_iter_revert(iter, io_size - iov_iter_count(iter));
3400		ret = 0;
3401	} else if (ret == -EIOCBQUEUED) {
3402		goto out_free;
3403	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
3404		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
3405		/* read all, failed, already did sync or don't want to retry */
3406		goto done;
3407	}
3408
3409	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3410	if (ret2)
3411		return ret2;
3412
3413	iovec = NULL;
3414	rw = req->async_data;
3415	/* now use our persistent iterator, if we aren't already */
3416	iter = &rw->iter;
3417
3418	do {
3419		io_size -= ret;
3420		rw->bytes_done += ret;
3421		/* if we can retry, do so with the callbacks armed */
3422		if (!io_rw_should_retry(req)) {
3423			kiocb->ki_flags &= ~IOCB_WAITQ;
3424			return -EAGAIN;
3425		}
3426
3427		/*
3428		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3429		 * we get -EIOCBQUEUED, then we'll get a notification when the
3430		 * desired page gets unlocked. We can also get a partial read
3431		 * here, and if we do, then just retry at the new offset.
3432		 */
3433		ret = io_iter_do_read(req, iter);
3434		if (ret == -EIOCBQUEUED)
3435			return 0;
3436		/* we got some bytes, but not all. retry. */
3437		kiocb->ki_flags &= ~IOCB_WAITQ;
3438	} while (ret > 0 && ret < io_size);
3439done:
3440	kiocb_done(kiocb, ret, issue_flags);
3441out_free:
3442	/* it's faster to check here than to delegate to kfree */
3443	if (iovec)
3444		kfree(iovec);
3445	return 0;
3446}
3447
3448static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3449{
3450	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3451		return -EBADF;
3452	return io_prep_rw(req, sqe);
3453}
3454
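/*
 * Issue a write. If the attempt can't proceed without blocking, the request
 * is set up for async retry (copy_iov) and -EAGAIN is returned; for regular
 * files, freeze protection is taken here and released by kiocb_end_write()
 * on completion.
 */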
3455static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3456{
3457	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3458	struct kiocb *kiocb = &req->rw.kiocb;
3459	struct iov_iter __iter, *iter = &__iter;
3460	struct io_async_rw *rw = req->async_data;
3461	ssize_t ret, ret2, io_size;
3462	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3463
3464	if (rw) {
3465		iter = &rw->iter;
3466		iovec = NULL;
3467	} else {
3468		ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3469		if (ret < 0)
3470			return ret;
3471	}
3472	io_size = iov_iter_count(iter);
3473	req->result = io_size;
3474
3475	/* Ensure we clear previously set non-block flag */
3476	if (!force_nonblock)
3477		kiocb->ki_flags &= ~IOCB_NOWAIT;
3478	else
3479		kiocb->ki_flags |= IOCB_NOWAIT;
3480
3481	/* If the file doesn't support async, just async punt */
3482	if (force_nonblock && !io_file_supports_nowait(req, WRITE))
3483		goto copy_iov;
3484
3485	/* file path doesn't support NOWAIT for non-direct_IO */
3486	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3487	    (req->flags & REQ_F_ISREG))
3488		goto copy_iov;
3489
3490	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3491	if (unlikely(ret))
3492		goto out_free;
3493
3494	/*
3495	 * Open-code file_start_write here to grab freeze protection,
3496	 * which will be released by another thread in
3497	 * io_complete_rw().  Fool lockdep by telling it the lock got
3498	 * released so that it doesn't complain about the held lock when
3499	 * we return to userspace.
3500	 */
3501	if (req->flags & REQ_F_ISREG) {
3502		sb_start_write(file_inode(req->file)->i_sb);
3503		__sb_writers_release(file_inode(req->file)->i_sb,
3504					SB_FREEZE_WRITE);
3505	}
3506	kiocb->ki_flags |= IOCB_WRITE;
3507
3508	if (req->file->f_op->write_iter)
3509		ret2 = call_write_iter(req->file, kiocb, iter);
3510	else if (req->file->f_op->write)
3511		ret2 = loop_rw_iter(WRITE, req, iter);
3512	else
3513		ret2 = -EINVAL;
3514
3515	if (req->flags & REQ_F_REISSUE) {
3516		req->flags &= ~REQ_F_REISSUE;
3517		ret2 = -EAGAIN;
3518	}
3519
3520	/*
3521	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3522	 * retry them without IOCB_NOWAIT.
3523	 */
3524	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3525		ret2 = -EAGAIN;
3526	/* no retry on NONBLOCK nor RWF_NOWAIT */
3527	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3528		goto done;
3529	if (!force_nonblock || ret2 != -EAGAIN) {
3530		/* IOPOLL retry should happen for io-wq threads */
3531		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3532			goto copy_iov;
3533done:
3534		kiocb_done(kiocb, ret2, issue_flags);
3535	} else {
3536copy_iov:
3537		/* some cases will consume bytes even on error returns */
3538		iov_iter_revert(iter, io_size - iov_iter_count(iter));
3539		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3540		return ret ?: -EAGAIN;
3541	}
3542out_free:
3543	/* it's reportedly faster than delegating the null check to kfree() */
3544	if (iovec)
3545		kfree(iovec);
3546	return ret;
3547}
3548
3549static int io_renameat_prep(struct io_kiocb *req,
3550			    const struct io_uring_sqe *sqe)
3551{
3552	struct io_rename *ren = &req->rename;
3553	const char __user *oldf, *newf;
3554
3555	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3556		return -EINVAL;
3557	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3558		return -EINVAL;
3559	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3560		return -EBADF;
3561
3562	ren->old_dfd = READ_ONCE(sqe->fd);
3563	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3564	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3565	ren->new_dfd = READ_ONCE(sqe->len);
3566	ren->flags = READ_ONCE(sqe->rename_flags);
3567
3568	ren->oldpath = getname(oldf);
3569	if (IS_ERR(ren->oldpath))
3570		return PTR_ERR(ren->oldpath);
3571
3572	ren->newpath = getname(newf);
3573	if (IS_ERR(ren->newpath)) {
3574		putname(ren->oldpath);
3575		return PTR_ERR(ren->newpath);
3576	}
3577
3578	req->flags |= REQ_F_NEED_CLEANUP;
3579	return 0;
3580}
3581
3582static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3583{
3584	struct io_rename *ren = &req->rename;
3585	int ret;
3586
3587	if (issue_flags & IO_URING_F_NONBLOCK)
3588		return -EAGAIN;
3589
3590	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3591				ren->newpath, ren->flags);
3592
3593	req->flags &= ~REQ_F_NEED_CLEANUP;
3594	if (ret < 0)
3595		req_set_fail(req);
3596	io_req_complete(req, ret);
3597	return 0;
3598}
3599
3600static int io_unlinkat_prep(struct io_kiocb *req,
3601			    const struct io_uring_sqe *sqe)
3602{
3603	struct io_unlink *un = &req->unlink;
3604	const char __user *fname;
3605
3606	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3607		return -EINVAL;
3608	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3609	    sqe->splice_fd_in)
3610		return -EINVAL;
3611	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3612		return -EBADF;
3613
3614	un->dfd = READ_ONCE(sqe->fd);
3615
3616	un->flags = READ_ONCE(sqe->unlink_flags);
3617	if (un->flags & ~AT_REMOVEDIR)
3618		return -EINVAL;
3619
3620	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3621	un->filename = getname(fname);
3622	if (IS_ERR(un->filename))
3623		return PTR_ERR(un->filename);
3624
3625	req->flags |= REQ_F_NEED_CLEANUP;
3626	return 0;
3627}
3628
3629static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3630{
3631	struct io_unlink *un = &req->unlink;
3632	int ret;
3633
3634	if (issue_flags & IO_URING_F_NONBLOCK)
3635		return -EAGAIN;
3636
3637	if (un->flags & AT_REMOVEDIR)
3638		ret = do_rmdir(un->dfd, un->filename);
3639	else
3640		ret = do_unlinkat(un->dfd, un->filename);
3641
3642	req->flags &= ~REQ_F_NEED_CLEANUP;
3643	if (ret < 0)
3644		req_set_fail(req);
3645	io_req_complete(req, ret);
3646	return 0;
3647}
3648
3649static int io_shutdown_prep(struct io_kiocb *req,
3650			    const struct io_uring_sqe *sqe)
3651{
3652#if defined(CONFIG_NET)
3653	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3654		return -EINVAL;
3655	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3656		     sqe->buf_index || sqe->splice_fd_in))
3657		return -EINVAL;
3658
3659	req->shutdown.how = READ_ONCE(sqe->len);
3660	return 0;
3661#else
3662	return -EOPNOTSUPP;
3663#endif
3664}
3665
3666static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3667{
3668#if defined(CONFIG_NET)
3669	struct socket *sock;
3670	int ret;
3671
3672	if (issue_flags & IO_URING_F_NONBLOCK)
3673		return -EAGAIN;
3674
3675	sock = sock_from_file(req->file);
3676	if (unlikely(!sock))
3677		return -ENOTSOCK;
3678
3679	ret = __sys_shutdown_sock(sock, req->shutdown.how);
3680	if (ret < 0)
3681		req_set_fail(req);
3682	io_req_complete(req, ret);
3683	return 0;
3684#else
3685	return -EOPNOTSUPP;
3686#endif
3687}
3688
3689static int __io_splice_prep(struct io_kiocb *req,
3690			    const struct io_uring_sqe *sqe)
3691{
3692	struct io_splice *sp = &req->splice;
3693	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3694
3695	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3696		return -EINVAL;
3697
3698	sp->file_in = NULL;
3699	sp->len = READ_ONCE(sqe->len);
3700	sp->flags = READ_ONCE(sqe->splice_flags);
3701
3702	if (unlikely(sp->flags & ~valid_flags))
3703		return -EINVAL;
3704
3705	sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
3706				  (sp->flags & SPLICE_F_FD_IN_FIXED));
3707	if (!sp->file_in)
3708		return -EBADF;
3709	req->flags |= REQ_F_NEED_CLEANUP;
3710	return 0;
3711}
3712
3713static int io_tee_prep(struct io_kiocb *req,
3714		       const struct io_uring_sqe *sqe)
3715{
3716	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3717		return -EINVAL;
3718	return __io_splice_prep(req, sqe);
3719}
3720
3721static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3722{
3723	struct io_splice *sp = &req->splice;
3724	struct file *in = sp->file_in;
3725	struct file *out = sp->file_out;
3726	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3727	long ret = 0;
3728
3729	if (issue_flags & IO_URING_F_NONBLOCK)
3730		return -EAGAIN;
3731	if (sp->len)
3732		ret = do_tee(in, out, sp->len, flags);
3733
3734	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3735		io_put_file(in);
3736	req->flags &= ~REQ_F_NEED_CLEANUP;
3737
3738	if (ret != sp->len)
3739		req_set_fail(req);
3740	io_req_complete(req, ret);
3741	return 0;
3742}
3743
3744static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3745{
3746	struct io_splice *sp = &req->splice;
3747
3748	sp->off_in = READ_ONCE(sqe->splice_off_in);
3749	sp->off_out = READ_ONCE(sqe->off);
3750	return __io_splice_prep(req, sqe);
3751}
3752
3753static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3754{
3755	struct io_splice *sp = &req->splice;
3756	struct file *in = sp->file_in;
3757	struct file *out = sp->file_out;
3758	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3759	loff_t *poff_in, *poff_out;
3760	long ret = 0;
3761
3762	if (issue_flags & IO_URING_F_NONBLOCK)
3763		return -EAGAIN;
3764
3765	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3766	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3767
3768	if (sp->len)
3769		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3770
3771	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3772		io_put_file(in);
3773	req->flags &= ~REQ_F_NEED_CLEANUP;
3774
3775	if (ret != sp->len)
3776		req_set_fail(req);
3777	io_req_complete(req, ret);
3778	return 0;
3779}
3780
3781/*
3782 * IORING_OP_NOP just posts a completion event, nothing else.
3783 */
3784static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3785{
3786	struct io_ring_ctx *ctx = req->ctx;
3787
3788	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3789		return -EINVAL;
3790
3791	__io_req_complete(req, issue_flags, 0, 0);
3792	return 0;
3793}
3794
3795static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3796{
3797	struct io_ring_ctx *ctx = req->ctx;
3798
3799	if (!req->file)
3800		return -EBADF;
3801
3802	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3803		return -EINVAL;
3804	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
3805		     sqe->splice_fd_in))
3806		return -EINVAL;
3807
3808	req->sync.flags = READ_ONCE(sqe->fsync_flags);
3809	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3810		return -EINVAL;
3811
3812	req->sync.off = READ_ONCE(sqe->off);
3813	req->sync.len = READ_ONCE(sqe->len);
3814	return 0;
3815}
3816
3817static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3818{
3819	loff_t end = req->sync.off + req->sync.len;
3820	int ret;
3821
3822	/* fsync always requires a blocking context */
3823	if (issue_flags & IO_URING_F_NONBLOCK)
3824		return -EAGAIN;
3825
3826	ret = vfs_fsync_range(req->file, req->sync.off,
3827				end > 0 ? end : LLONG_MAX,
3828				req->sync.flags & IORING_FSYNC_DATASYNC);
3829	if (ret < 0)
3830		req_set_fail(req);
3831	io_req_complete(req, ret);
3832	return 0;
3833}
3834
3835static int io_fallocate_prep(struct io_kiocb *req,
3836			     const struct io_uring_sqe *sqe)
3837{
3838	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
3839	    sqe->splice_fd_in)
3840		return -EINVAL;
3841	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3842		return -EINVAL;
3843
3844	req->sync.off = READ_ONCE(sqe->off);
3845	req->sync.len = READ_ONCE(sqe->addr);
3846	req->sync.mode = READ_ONCE(sqe->len);
3847	return 0;
3848}
3849
3850static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3851{
3852	int ret;
3853
3854	/* fallocate always requires a blocking context */
3855	if (issue_flags & IO_URING_F_NONBLOCK)
3856		return -EAGAIN;
3857	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3858				req->sync.len);
3859	if (ret < 0)
3860		req_set_fail(req);
3861	io_req_complete(req, ret);
3862	return 0;
3863}
3864
3865static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3866{
3867	const char __user *fname;
3868	int ret;
3869
3870	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3871		return -EINVAL;
3872	if (unlikely(sqe->ioprio || sqe->buf_index))
3873		return -EINVAL;
3874	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3875		return -EBADF;
3876
3877	/* open.how should already be initialised */
3878	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3879		req->open.how.flags |= O_LARGEFILE;
3880
3881	req->open.dfd = READ_ONCE(sqe->fd);
3882	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3883	req->open.filename = getname(fname);
3884	if (IS_ERR(req->open.filename)) {
3885		ret = PTR_ERR(req->open.filename);
3886		req->open.filename = NULL;
3887		return ret;
3888	}
3889
3890	req->open.file_slot = READ_ONCE(sqe->file_index);
3891	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
3892		return -EINVAL;
3893
3894	req->open.nofile = rlimit(RLIMIT_NOFILE);
3895	req->flags |= REQ_F_NEED_CLEANUP;
3896	return 0;
3897}
3898
3899static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3900{
3901	u64 mode = READ_ONCE(sqe->len);
3902	u64 flags = READ_ONCE(sqe->open_flags);
3903
3904	req->open.how = build_open_how(flags, mode);
3905	return __io_openat_prep(req, sqe);
3906}
3907
3908static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3909{
3910	struct open_how __user *how;
3911	size_t len;
3912	int ret;
3913
3914	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3915	len = READ_ONCE(sqe->len);
3916	if (len < OPEN_HOW_SIZE_VER0)
3917		return -EINVAL;
3918
3919	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3920					len);
3921	if (ret)
3922		return ret;
3923
3924	return __io_openat_prep(req, sqe);
3925}
3926
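/*
 * For a nonblocking issue, the open is attempted with O_NONBLOCK and
 * LOOKUP_CACHED so it can only succeed without blocking (e.g. straight
 * from the dcache); on -EAGAIN the request is punted to io-wq for a
 * blocking retry, unless the application itself asked for
 * RESOLVE_CACHED, in which case -EAGAIN is returned as the result.
 */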
3927static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3928{
3929	struct open_flags op;
3930	struct file *file;
3931	bool resolve_nonblock, nonblock_set;
3932	bool fixed = !!req->open.file_slot;
3933	int ret;
3934
3935	ret = build_open_flags(&req->open.how, &op);
3936	if (ret)
3937		goto err;
3938	nonblock_set = op.open_flag & O_NONBLOCK;
3939	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3940	if (issue_flags & IO_URING_F_NONBLOCK) {
3941		/*
3942		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3943		 * it'll always return -EAGAIN
3944		 */
3945		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3946			return -EAGAIN;
3947		op.lookup_flags |= LOOKUP_CACHED;
3948		op.open_flag |= O_NONBLOCK;
3949	}
3950
3951	if (!fixed) {
3952		ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3953		if (ret < 0)
3954			goto err;
3955	}
3956
3957	file = do_filp_open(req->open.dfd, req->open.filename, &op);
3958	if (IS_ERR(file)) {
3959		/*
3960		 * We could hang on to this 'fd' on retrying, but seems like
3961		 * marginal gain for something that is now known to be a slower
3962		 * path. So just put it, and we'll get a new one when we retry.
3963		 */
3964		if (!fixed)
3965			put_unused_fd(ret);
3966
3967		ret = PTR_ERR(file);
3968		/* only retry if RESOLVE_CACHED wasn't already set by application */
3969		if (ret == -EAGAIN &&
3970		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
3971			return -EAGAIN;
3972		goto err;
3973	}
3974
3975	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3976		file->f_flags &= ~O_NONBLOCK;
3977	fsnotify_open(file);
3978
3979	if (!fixed)
3980		fd_install(ret, file);
3981	else
3982		ret = io_install_fixed_file(req, file, issue_flags,
3983					    req->open.file_slot - 1);
3984err:
3985	putname(req->open.filename);
3986	req->flags &= ~REQ_F_NEED_CLEANUP;
3987	if (ret < 0)
3988		req_set_fail(req);
3989	__io_req_complete(req, issue_flags, ret, 0);
3990	return 0;
3991}
3992
3993static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
3994{
3995	return io_openat2(req, issue_flags);
3996}
3997
3998static int io_remove_buffers_prep(struct io_kiocb *req,
3999				  const struct io_uring_sqe *sqe)
4000{
4001	struct io_provide_buf *p = &req->pbuf;
4002	u64 tmp;
4003
4004	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4005	    sqe->splice_fd_in)
4006		return -EINVAL;
4007
4008	tmp = READ_ONCE(sqe->fd);
4009	if (!tmp || tmp > USHRT_MAX)
4010		return -EINVAL;
4011
4012	memset(p, 0, sizeof(*p));
4013	p->nbufs = tmp;
4014	p->bgid = READ_ONCE(sqe->buf_group);
4015	return 0;
4016}
4017
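/*
 * Free up to @nbufs buffers from buffer group @bgid. @buf is the group
 * head and anchors the list of remaining buffers; it is only freed (and
 * the group erased from ->io_buffers) once the rest of the list has
 * been drained. Returns the number of buffers freed.
 */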
4018static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4019			       int bgid, unsigned nbufs)
4020{
4021	unsigned i = 0;
4022
4023	/* shouldn't happen */
4024	if (!nbufs)
4025		return 0;
4026
4027	/* the head kbuf is the list itself */
4028	while (!list_empty(&buf->list)) {
4029		struct io_buffer *nxt;
4030
4031		nxt = list_first_entry(&buf->list, struct io_buffer, list);
4032		list_del(&nxt->list);
4033		kfree(nxt);
4034		if (++i == nbufs)
4035			return i;
4036	}
4037	i++;
4038	kfree(buf);
4039	xa_erase(&ctx->io_buffers, bgid);
4040
4041	return i;
4042}
4043
4044static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4045{
4046	struct io_provide_buf *p = &req->pbuf;
4047	struct io_ring_ctx *ctx = req->ctx;
4048	struct io_buffer *head;
4049	int ret = 0;
4050	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4051
4052	io_ring_submit_lock(ctx, !force_nonblock);
4053
4054	lockdep_assert_held(&ctx->uring_lock);
4055
4056	ret = -ENOENT;
4057	head = xa_load(&ctx->io_buffers, p->bgid);
4058	if (head)
4059		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
4060	if (ret < 0)
4061		req_set_fail(req);
4062
4063	/* complete before unlock, IOPOLL may need the lock */
4064	__io_req_complete(req, issue_flags, ret, 0);
4065	io_ring_submit_unlock(ctx, !force_nonblock);
4066	return 0;
4067}
4068
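/*
 * IORING_OP_PROVIDE_BUFFERS registers a group of equally sized buffers
 * for later buffer-select use. SQE field mapping, as consumed below:
 * fd = number of buffers, addr = base address, len = per-buffer length,
 * buf_group = group ID, off = ID of the first buffer. An illustrative
 * (non-normative) userspace sketch filling a raw SQE:
 *
 *	sqe->opcode	= IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd		= nbufs;
 *	sqe->addr	= (unsigned long) base;
 *	sqe->len	= buf_len;
 *	sqe->buf_group	= bgid;
 *	sqe->off	= first_bid;
 */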
4069static int io_provide_buffers_prep(struct io_kiocb *req,
4070				   const struct io_uring_sqe *sqe)
4071{
4072	unsigned long size, tmp_check;
4073	struct io_provide_buf *p = &req->pbuf;
4074	u64 tmp;
4075
4076	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4077		return -EINVAL;
4078
4079	tmp = READ_ONCE(sqe->fd);
4080	if (!tmp || tmp > USHRT_MAX)
4081		return -E2BIG;
4082	p->nbufs = tmp;
4083	p->addr = READ_ONCE(sqe->addr);
4084	p->len = READ_ONCE(sqe->len);
4085
4086	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4087				&size))
4088		return -EOVERFLOW;
4089	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4090		return -EOVERFLOW;
4091
4092	size = (unsigned long)p->len * p->nbufs;
4093	if (!access_ok(u64_to_user_ptr(p->addr), size))
4094		return -EFAULT;
4095
4096	p->bgid = READ_ONCE(sqe->buf_group);
4097	tmp = READ_ONCE(sqe->off);
4098	if (tmp > USHRT_MAX)
4099		return -E2BIG;
4100	p->bid = tmp;
4101	return 0;
4102}
4103
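/*
 * Carve pbuf->nbufs buffers out of the contiguous user range starting
 * at pbuf->addr, each pbuf->len bytes (capped at MAX_RW_COUNT), with
 * consecutive buffer IDs starting at pbuf->bid. Buffers are appended to
 * *head, or become the new head if the group was empty. Returns how
 * many buffers were added, or -ENOMEM if none could be allocated.
 */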
4104static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4105{
4106	struct io_buffer *buf;
4107	u64 addr = pbuf->addr;
4108	int i, bid = pbuf->bid;
4109
4110	for (i = 0; i < pbuf->nbufs; i++) {
4111		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
4112		if (!buf)
4113			break;
4114
4115		buf->addr = addr;
4116		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4117		buf->bid = bid;
4118		addr += pbuf->len;
4119		bid++;
4120		if (!*head) {
4121			INIT_LIST_HEAD(&buf->list);
4122			*head = buf;
4123		} else {
4124			list_add_tail(&buf->list, &(*head)->list);
4125		}
4126	}
4127
4128	return i ? i : -ENOMEM;
4129}
4130
4131static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4132{
4133	struct io_provide_buf *p = &req->pbuf;
4134	struct io_ring_ctx *ctx = req->ctx;
4135	struct io_buffer *head, *list;
4136	int ret = 0;
4137	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4138
4139	io_ring_submit_lock(ctx, !force_nonblock);
4140
4141	lockdep_assert_held(&ctx->uring_lock);
4142
4143	list = head = xa_load(&ctx->io_buffers, p->bgid);
4144
4145	ret = io_add_buffers(p, &head);
4146	if (ret >= 0 && !list) {
4147		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4148		if (ret < 0)
4149			__io_remove_buffers(ctx, head, p->bgid, -1U);
4150	}
4151	if (ret < 0)
4152		req_set_fail(req);
4153	/* complete before unlock, IOPOLL may need the lock */
4154	__io_req_complete(req, issue_flags, ret, 0);
4155	io_ring_submit_unlock(ctx, !force_nonblock);
4156	return 0;
4157}
4158
4159static int io_epoll_ctl_prep(struct io_kiocb *req,
4160			     const struct io_uring_sqe *sqe)
4161{
4162#if defined(CONFIG_EPOLL)
4163	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4164		return -EINVAL;
4165	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4166		return -EINVAL;
4167
4168	req->epoll.epfd = READ_ONCE(sqe->fd);
4169	req->epoll.op = READ_ONCE(sqe->len);
4170	req->epoll.fd = READ_ONCE(sqe->off);
4171
4172	if (ep_op_has_event(req->epoll.op)) {
4173		struct epoll_event __user *ev;
4174
4175		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4176		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4177			return -EFAULT;
4178	}
4179
4180	return 0;
4181#else
4182	return -EOPNOTSUPP;
4183#endif
4184}
4185
4186static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4187{
4188#if defined(CONFIG_EPOLL)
4189	struct io_epoll *ie = &req->epoll;
4190	int ret;
4191	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4192
4193	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4194	if (force_nonblock && ret == -EAGAIN)
4195		return -EAGAIN;
4196
4197	if (ret < 0)
4198		req_set_fail(req);
4199	__io_req_complete(req, issue_flags, ret, 0);
4200	return 0;
4201#else
4202	return -EOPNOTSUPP;
4203#endif
4204}
4205
4206static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4207{
4208#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4209	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4210		return -EINVAL;
4211	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4212		return -EINVAL;
4213
4214	req->madvise.addr = READ_ONCE(sqe->addr);
4215	req->madvise.len = READ_ONCE(sqe->len);
4216	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4217	return 0;
4218#else
4219	return -EOPNOTSUPP;
4220#endif
4221}
4222
4223static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4224{
4225#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4226	struct io_madvise *ma = &req->madvise;
4227	int ret;
4228
4229	if (issue_flags & IO_URING_F_NONBLOCK)
4230		return -EAGAIN;
4231
4232	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4233	if (ret < 0)
4234		req_set_fail(req);
4235	io_req_complete(req, ret);
4236	return 0;
4237#else
4238	return -EOPNOTSUPP;
4239#endif
4240}
4241
4242static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4243{
4244	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4245		return -EINVAL;
4246	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4247		return -EINVAL;
4248
4249	req->fadvise.offset = READ_ONCE(sqe->off);
4250	req->fadvise.len = READ_ONCE(sqe->len);
4251	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4252	return 0;
4253}
4254
4255static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4256{
4257	struct io_fadvise *fa = &req->fadvise;
4258	int ret;
4259
4260	if (issue_flags & IO_URING_F_NONBLOCK) {
4261		switch (fa->advice) {
4262		case POSIX_FADV_NORMAL:
4263		case POSIX_FADV_RANDOM:
4264		case POSIX_FADV_SEQUENTIAL:
4265			break;
4266		default:
4267			return -EAGAIN;
4268		}
4269	}
4270
4271	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4272	if (ret < 0)
4273		req_set_fail(req);
4274	__io_req_complete(req, issue_flags, ret, 0);
4275	return 0;
4276}
4277
4278static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4279{
4280	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4281		return -EINVAL;
4282	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4283		return -EINVAL;
4284	if (req->flags & REQ_F_FIXED_FILE)
4285		return -EBADF;
4286
4287	req->statx.dfd = READ_ONCE(sqe->fd);
4288	req->statx.mask = READ_ONCE(sqe->len);
4289	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4290	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4291	req->statx.flags = READ_ONCE(sqe->statx_flags);
4292
4293	return 0;
4294}
4295
4296static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4297{
4298	struct io_statx *ctx = &req->statx;
4299	int ret;
4300
4301	if (issue_flags & IO_URING_F_NONBLOCK)
4302		return -EAGAIN;
4303
4304	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4305		       ctx->buffer);
4306
4307	if (ret < 0)
4308		req_set_fail(req);
4309	io_req_complete(req, ret);
4310	return 0;
4311}
4312
4313static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4314{
4315	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4316		return -EINVAL;
4317	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4318	    sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
4319		return -EINVAL;
4320	if (req->flags & REQ_F_FIXED_FILE)
4321		return -EBADF;
4322
4323	req->close.fd = READ_ONCE(sqe->fd);
4324	return 0;
4325}
4326
4327static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4328{
4329	struct files_struct *files = current->files;
4330	struct io_close *close = &req->close;
4331	struct fdtable *fdt;
4332	struct file *file = NULL;
4333	int ret = -EBADF;
4334
4335	spin_lock(&files->file_lock);
4336	fdt = files_fdtable(files);
4337	if (close->fd >= fdt->max_fds) {
4338		spin_unlock(&files->file_lock);
4339		goto err;
4340	}
4341	file = fdt->fd[close->fd];
4342	if (!file || file->f_op == &io_uring_fops) {
4343		spin_unlock(&files->file_lock);
4344		file = NULL;
4345		goto err;
4346	}
4347
4348	/* if the file has a flush method, be safe and punt to async */
4349	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4350		spin_unlock(&files->file_lock);
4351		return -EAGAIN;
4352	}
4353
4354	ret = __close_fd_get_file(close->fd, &file);
4355	spin_unlock(&files->file_lock);
4356	if (ret < 0) {
4357		if (ret == -ENOENT)
4358			ret = -EBADF;
4359		goto err;
4360	}
4361
4362	/* No ->flush() or already async, safely close from here */
4363	ret = filp_close(file, current->files);
4364err:
4365	if (ret < 0)
4366		req_set_fail(req);
4367	if (file)
4368		fput(file);
4369	__io_req_complete(req, issue_flags, ret, 0);
4370	return 0;
4371}
4372
4373static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4374{
4375	struct io_ring_ctx *ctx = req->ctx;
4376
4377	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4378		return -EINVAL;
4379	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4380		     sqe->splice_fd_in))
4381		return -EINVAL;
4382
4383	req->sync.off = READ_ONCE(sqe->off);
4384	req->sync.len = READ_ONCE(sqe->len);
4385	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4386	return 0;
4387}
4388
4389static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4390{
4391	int ret;
4392
4393	/* sync_file_range always requires a blocking context */
4394	if (issue_flags & IO_URING_F_NONBLOCK)
4395		return -EAGAIN;
4396
4397	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4398				req->sync.flags);
4399	if (ret < 0)
4400		req_set_fail(req);
4401	io_req_complete(req, ret);
4402	return 0;
4403}
4404
4405#if defined(CONFIG_NET)
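/*
 * Stash the sendmsg/recvmsg state (msghdr, iovecs) in req->async_data
 * so the operation can be retried from io-wq. Returns -EAGAIN to
 * trigger the async punt, or -ENOMEM if the async data allocation
 * fails.
 */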
4406static int io_setup_async_msg(struct io_kiocb *req,
4407			      struct io_async_msghdr *kmsg)
4408{
4409	struct io_async_msghdr *async_msg = req->async_data;
4410
4411	if (async_msg)
4412		return -EAGAIN;
4413	if (io_alloc_async_data(req)) {
4414		kfree(kmsg->free_iov);
4415		return -ENOMEM;
4416	}
4417	async_msg = req->async_data;
4418	req->flags |= REQ_F_NEED_CLEANUP;
4419	memcpy(async_msg, kmsg, sizeof(*kmsg));
4420	async_msg->msg.msg_name = &async_msg->addr;
4421	/* if we're using fast_iov, set it to the new one */
4422	if (!async_msg->free_iov)
4423		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4424
4425	return -EAGAIN;
4426}
4427
4428static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4429			       struct io_async_msghdr *iomsg)
4430{
4431	iomsg->msg.msg_name = &iomsg->addr;
4432	iomsg->free_iov = iomsg->fast_iov;
4433	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4434				   req->sr_msg.msg_flags, &iomsg->free_iov);
4435}
4436
4437static int io_sendmsg_prep_async(struct io_kiocb *req)
4438{
4439	int ret;
4440
4441	ret = io_sendmsg_copy_hdr(req, req->async_data);
4442	if (!ret)
4443		req->flags |= REQ_F_NEED_CLEANUP;
4444	return ret;
4445}
4446
4447static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4448{
4449	struct io_sr_msg *sr = &req->sr_msg;
4450
4451	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4452		return -EINVAL;
4453
4454	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4455	sr->len = READ_ONCE(sqe->len);
4456	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4457	if (sr->msg_flags & MSG_DONTWAIT)
4458		req->flags |= REQ_F_NOWAIT;
4459
4460#ifdef CONFIG_COMPAT
4461	if (req->ctx->compat)
4462		sr->msg_flags |= MSG_CMSG_COMPAT;
4463#endif
4464	return 0;
4465}
4466
4467static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4468{
4469	struct io_async_msghdr iomsg, *kmsg;
4470	struct socket *sock;
4471	unsigned flags;
4472	int min_ret = 0;
4473	int ret;
4474
4475	sock = sock_from_file(req->file);
4476	if (unlikely(!sock))
4477		return -ENOTSOCK;
4478
4479	kmsg = req->async_data;
4480	if (!kmsg) {
4481		ret = io_sendmsg_copy_hdr(req, &iomsg);
4482		if (ret)
4483			return ret;
4484		kmsg = &iomsg;
4485	}
4486
4487	flags = req->sr_msg.msg_flags;
4488	if (issue_flags & IO_URING_F_NONBLOCK)
4489		flags |= MSG_DONTWAIT;
4490	if (flags & MSG_WAITALL)
4491		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4492
4493	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4494	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4495		return io_setup_async_msg(req, kmsg);
4496	if (ret == -ERESTARTSYS)
4497		ret = -EINTR;
4498
4499	/* fast path, check for non-NULL to avoid function call */
4500	if (kmsg->free_iov)
4501		kfree(kmsg->free_iov);
4502	req->flags &= ~REQ_F_NEED_CLEANUP;
4503	if (ret < min_ret)
4504		req_set_fail(req);
4505	__io_req_complete(req, issue_flags, ret, 0);
4506	return 0;
4507}
4508
4509static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4510{
4511	struct io_sr_msg *sr = &req->sr_msg;
4512	struct msghdr msg;
4513	struct iovec iov;
4514	struct socket *sock;
4515	unsigned flags;
4516	int min_ret = 0;
4517	int ret;
4518
4519	sock = sock_from_file(req->file);
4520	if (unlikely(!sock))
4521		return -ENOTSOCK;
4522
4523	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4524	if (unlikely(ret))
4525		return ret;
4526
4527	msg.msg_name = NULL;
4528	msg.msg_control = NULL;
4529	msg.msg_controllen = 0;
4530	msg.msg_namelen = 0;
4531
4532	flags = req->sr_msg.msg_flags;
4533	if (issue_flags & IO_URING_F_NONBLOCK)
4534		flags |= MSG_DONTWAIT;
4535	if (flags & MSG_WAITALL)
4536		min_ret = iov_iter_count(&msg.msg_iter);
4537
4538	msg.msg_flags = flags;
4539	ret = sock_sendmsg(sock, &msg);
4540	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4541		return -EAGAIN;
4542	if (ret == -ERESTARTSYS)
4543		ret = -EINTR;
4544
4545	if (ret < min_ret)
4546		req_set_fail(req);
4547	__io_req_complete(req, issue_flags, ret, 0);
4548	return 0;
4549}
4550
4551static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4552				 struct io_async_msghdr *iomsg)
4553{
4554	struct io_sr_msg *sr = &req->sr_msg;
4555	struct iovec __user *uiov;
4556	size_t iov_len;
4557	int ret;
4558
4559	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4560					&iomsg->uaddr, &uiov, &iov_len);
4561	if (ret)
4562		return ret;
4563
4564	if (req->flags & REQ_F_BUFFER_SELECT) {
4565		if (iov_len > 1)
4566			return -EINVAL;
4567		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4568			return -EFAULT;
4569		sr->len = iomsg->fast_iov[0].iov_len;
4570		iomsg->free_iov = NULL;
4571	} else {
4572		iomsg->free_iov = iomsg->fast_iov;
4573		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4574				     &iomsg->free_iov, &iomsg->msg.msg_iter,
4575				     false);
4576		if (ret > 0)
4577			ret = 0;
4578	}
4579
4580	return ret;
4581}
4582
4583#ifdef CONFIG_COMPAT
4584static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4585					struct io_async_msghdr *iomsg)
4586{
4587	struct io_sr_msg *sr = &req->sr_msg;
4588	struct compat_iovec __user *uiov;
4589	compat_uptr_t ptr;
4590	compat_size_t len;
4591	int ret;
4592
4593	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4594				  &ptr, &len);
4595	if (ret)
4596		return ret;
4597
4598	uiov = compat_ptr(ptr);
4599	if (req->flags & REQ_F_BUFFER_SELECT) {
4600		compat_ssize_t clen;
4601
4602		if (len > 1)
4603			return -EINVAL;
4604		if (!access_ok(uiov, sizeof(*uiov)))
4605			return -EFAULT;
4606		if (__get_user(clen, &uiov->iov_len))
4607			return -EFAULT;
4608		if (clen < 0)
4609			return -EINVAL;
4610		sr->len = clen;
4611		iomsg->free_iov = NULL;
4612	} else {
4613		iomsg->free_iov = iomsg->fast_iov;
4614		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4615				   UIO_FASTIOV, &iomsg->free_iov,
4616				   &iomsg->msg.msg_iter, true);
4617		if (ret < 0)
4618			return ret;
4619	}
4620
4621	return 0;
4622}
4623#endif
4624
4625static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4626			       struct io_async_msghdr *iomsg)
4627{
4628	iomsg->msg.msg_name = &iomsg->addr;
4629
4630#ifdef CONFIG_COMPAT
4631	if (req->ctx->compat)
4632		return __io_compat_recvmsg_copy_hdr(req, iomsg);
4633#endif
4634
4635	return __io_recvmsg_copy_hdr(req, iomsg);
4636}
4637
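/*
 * Pick a provided buffer from group sr->bgid for a buffer-select
 * receive. On success sr->len is clamped to the selected buffer's size
 * and the request is marked REQ_F_BUFFER_SELECTED so the buffer ID is
 * handed back in the CQE flags on completion.
 */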
4638static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4639					       bool needs_lock)
4640{
4641	struct io_sr_msg *sr = &req->sr_msg;
4642	struct io_buffer *kbuf;
4643
4644	kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4645	if (IS_ERR(kbuf))
4646		return kbuf;
4647
4648	sr->kbuf = kbuf;
4649	req->flags |= REQ_F_BUFFER_SELECTED;
4650	return kbuf;
4651}
4652
4653static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4654{
4655	return io_put_kbuf(req, req->sr_msg.kbuf);
4656}
4657
4658static int io_recvmsg_prep_async(struct io_kiocb *req)
4659{
4660	int ret;
4661
4662	ret = io_recvmsg_copy_hdr(req, req->async_data);
4663	if (!ret)
4664		req->flags |= REQ_F_NEED_CLEANUP;
4665	return ret;
4666}
4667
4668static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4669{
4670	struct io_sr_msg *sr = &req->sr_msg;
4671
4672	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4673		return -EINVAL;
4674
4675	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4676	sr->len = READ_ONCE(sqe->len);
4677	sr->bgid = READ_ONCE(sqe->buf_group);
4678	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4679	if (sr->msg_flags & MSG_DONTWAIT)
4680		req->flags |= REQ_F_NOWAIT;
4681
4682#ifdef CONFIG_COMPAT
4683	if (req->ctx->compat)
4684		sr->msg_flags |= MSG_CMSG_COMPAT;
4685#endif
4686	return 0;
4687}
4688
4689static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4690{
4691	struct io_async_msghdr iomsg, *kmsg;
4692	struct socket *sock;
4693	struct io_buffer *kbuf;
4694	unsigned flags;
4695	int min_ret = 0;
4696	int ret, cflags = 0;
4697	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4698
4699	sock = sock_from_file(req->file);
4700	if (unlikely(!sock))
4701		return -ENOTSOCK;
4702
4703	kmsg = req->async_data;
4704	if (!kmsg) {
4705		ret = io_recvmsg_copy_hdr(req, &iomsg);
4706		if (ret)
4707			return ret;
4708		kmsg = &iomsg;
4709	}
4710
4711	if (req->flags & REQ_F_BUFFER_SELECT) {
4712		kbuf = io_recv_buffer_select(req, !force_nonblock);
4713		if (IS_ERR(kbuf))
4714			return PTR_ERR(kbuf);
4715		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4716		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4717		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4718				1, req->sr_msg.len);
4719	}
4720
4721	flags = req->sr_msg.msg_flags;
4722	if (force_nonblock)
4723		flags |= MSG_DONTWAIT;
4724	if (flags & MSG_WAITALL)
4725		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4726
4727	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4728					kmsg->uaddr, flags);
4729	if (force_nonblock && ret == -EAGAIN)
4730		return io_setup_async_msg(req, kmsg);
4731	if (ret == -ERESTARTSYS)
4732		ret = -EINTR;
4733
4734	if (req->flags & REQ_F_BUFFER_SELECTED)
4735		cflags = io_put_recv_kbuf(req);
4736	/* fast path, check for non-NULL to avoid function call */
4737	if (kmsg->free_iov)
4738		kfree(kmsg->free_iov);
4739	req->flags &= ~REQ_F_NEED_CLEANUP;
4740	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4741		req_set_fail(req);
4742	__io_req_complete(req, issue_flags, ret, cflags);
4743	return 0;
4744}
4745
4746static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4747{
4748	struct io_buffer *kbuf;
4749	struct io_sr_msg *sr = &req->sr_msg;
4750	struct msghdr msg;
4751	void __user *buf = sr->buf;
4752	struct socket *sock;
4753	struct iovec iov;
4754	unsigned flags;
4755	int min_ret = 0;
4756	int ret, cflags = 0;
4757	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4758
4759	sock = sock_from_file(req->file);
4760	if (unlikely(!sock))
4761		return -ENOTSOCK;
4762
4763	if (req->flags & REQ_F_BUFFER_SELECT) {
4764		kbuf = io_recv_buffer_select(req, !force_nonblock);
4765		if (IS_ERR(kbuf))
4766			return PTR_ERR(kbuf);
4767		buf = u64_to_user_ptr(kbuf->addr);
4768	}
4769
4770	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4771	if (unlikely(ret))
4772		goto out_free;
4773
4774	msg.msg_name = NULL;
4775	msg.msg_control = NULL;
4776	msg.msg_controllen = 0;
4777	msg.msg_namelen = 0;
4778	msg.msg_iocb = NULL;
4779	msg.msg_flags = 0;
4780
4781	flags = req->sr_msg.msg_flags;
4782	if (force_nonblock)
4783		flags |= MSG_DONTWAIT;
4784	if (flags & MSG_WAITALL)
4785		min_ret = iov_iter_count(&msg.msg_iter);
4786
4787	ret = sock_recvmsg(sock, &msg, flags);
4788	if (force_nonblock && ret == -EAGAIN)
4789		return -EAGAIN;
4790	if (ret == -ERESTARTSYS)
4791		ret = -EINTR;
4792out_free:
4793	if (req->flags & REQ_F_BUFFER_SELECTED)
4794		cflags = io_put_recv_kbuf(req);
4795	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4796		req_set_fail(req);
4797	__io_req_complete(req, issue_flags, ret, cflags);
4798	return 0;
4799}
4800
4801static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4802{
4803	struct io_accept *accept = &req->accept;
4804
4805	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4806		return -EINVAL;
4807	if (sqe->ioprio || sqe->len || sqe->buf_index)
4808		return -EINVAL;
4809
4810	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4811	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4812	accept->flags = READ_ONCE(sqe->accept_flags);
4813	accept->nofile = rlimit(RLIMIT_NOFILE);
4814
4815	accept->file_slot = READ_ONCE(sqe->file_index);
4816	if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
4817	if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
4819	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
4820		return -EINVAL;
4821	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
4822		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
4823	return 0;
4824}
4825
4826static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4827{
4828	struct io_accept *accept = &req->accept;
4829	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4830	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4831	bool fixed = !!accept->file_slot;
4832	struct file *file;
4833	int ret, fd;
4834
4835	if (req->file->f_flags & O_NONBLOCK)
4836		req->flags |= REQ_F_NOWAIT;
4837
4838	if (!fixed) {
4839		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
4840		if (unlikely(fd < 0))
4841			return fd;
4842	}
4843	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
4844			 accept->flags);
4845	if (IS_ERR(file)) {
4846		if (!fixed)
4847			put_unused_fd(fd);
4848		ret = PTR_ERR(file);
4849		if (ret == -EAGAIN && force_nonblock)
4850			return -EAGAIN;
4851		if (ret == -ERESTARTSYS)
4852			ret = -EINTR;
4853		req_set_fail(req);
4854	} else if (!fixed) {
4855		fd_install(fd, file);
4856		ret = fd;
4857	} else {
4858		ret = io_install_fixed_file(req, file, issue_flags,
4859					    accept->file_slot - 1);
4860	}
4861	__io_req_complete(req, issue_flags, ret, 0);
4862	return 0;
4863}
4864
4865static int io_connect_prep_async(struct io_kiocb *req)
4866{
4867	struct io_async_connect *io = req->async_data;
4868	struct io_connect *conn = &req->connect;
4869
4870	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4871}
4872
4873static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4874{
4875	struct io_connect *conn = &req->connect;
4876
4877	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4878		return -EINVAL;
4879	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
4880	    sqe->splice_fd_in)
4881		return -EINVAL;
4882
4883	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4884	conn->addr_len = READ_ONCE(sqe->addr2);
4885	return 0;
4886}
4887
4888static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4889{
4890	struct io_async_connect __io, *io;
4891	unsigned file_flags;
4892	int ret;
4893	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4894
4895	if (req->async_data) {
4896		io = req->async_data;
4897	} else {
4898		ret = move_addr_to_kernel(req->connect.addr,
4899						req->connect.addr_len,
4900						&__io.address);
4901		if (ret)
4902			goto out;
4903		io = &__io;
4904	}
4905
4906	file_flags = force_nonblock ? O_NONBLOCK : 0;
4907
4908	ret = __sys_connect_file(req->file, &io->address,
4909					req->connect.addr_len, file_flags);
4910	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4911		if (req->async_data)
4912			return -EAGAIN;
4913		if (io_alloc_async_data(req)) {
4914			ret = -ENOMEM;
4915			goto out;
4916		}
4917		memcpy(req->async_data, &__io, sizeof(__io));
4918		return -EAGAIN;
4919	}
4920	if (ret == -ERESTARTSYS)
4921		ret = -EINTR;
4922out:
4923	if (ret < 0)
4924		req_set_fail(req);
4925	__io_req_complete(req, issue_flags, ret, 0);
4926	return 0;
4927}
4928#else /* !CONFIG_NET */
4929#define IO_NETOP_FN(op)							\
4930static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
4931{									\
4932	return -EOPNOTSUPP;						\
4933}
4934
4935#define IO_NETOP_PREP(op)						\
4936IO_NETOP_FN(op)								\
4937static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4938{									\
4939	return -EOPNOTSUPP;						\
4940}									\
4941
4942#define IO_NETOP_PREP_ASYNC(op)						\
4943IO_NETOP_PREP(op)							\
4944static int io_##op##_prep_async(struct io_kiocb *req)			\
4945{									\
4946	return -EOPNOTSUPP;						\
4947}
4948
4949IO_NETOP_PREP_ASYNC(sendmsg);
4950IO_NETOP_PREP_ASYNC(recvmsg);
4951IO_NETOP_PREP_ASYNC(connect);
4952IO_NETOP_PREP(accept);
4953IO_NETOP_FN(send);
4954IO_NETOP_FN(recv);
4955#endif /* CONFIG_NET */
4956
4957struct io_poll_table {
4958	struct poll_table_struct pt;
4959	struct io_kiocb *req;
4960	int nr_entries;
4961	int error;
4962};
4963
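/*
 * Common waitqueue wakeup handling for poll and async (armed) poll: if
 * the wakeup carries an event we care about, detach from the waitqueue,
 * record the mask in req->result and queue task_work to finish the
 * request from task context. Returns 0 if the event didn't match.
 */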
4964static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4965			   __poll_t mask, io_req_tw_func_t func)
4966{
4967	/* for instances that support it, check for an event match first */
4968	if (mask && !(mask & poll->events))
4969		return 0;
4970
4971	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4972
4973	list_del_init(&poll->wait.entry);
4974
4975	req->result = mask;
4976	req->io_task_work.func = func;
4977
4978	/*
4979	 * If this fails, then the task is exiting. When a task exits, the
4980	 * work gets canceled, so just cancel this request as well instead
4981	 * of executing it. We can't safely execute it anyway, as we may no
4982	 * longer have the state needed for it.
4983	 */
4984	io_req_task_work_add(req);
4985	return 1;
4986}
4987
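/*
 * Called from task_work to check whether the poll request still needs
 * to wait: if no event is pending and it hasn't been canceled, re-poll
 * the file and, if still idle, re-add the wait entry and return true.
 * Returns with ->completion_lock held in either case.
 */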
4988static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4989	__acquires(&req->ctx->completion_lock)
4990{
4991	struct io_ring_ctx *ctx = req->ctx;
4992
4993	/* req->task == current here, checking PF_EXITING is safe */
4994	if (unlikely(req->task->flags & PF_EXITING))
4995		WRITE_ONCE(poll->canceled, true);
4996
4997	if (!req->result && !READ_ONCE(poll->canceled)) {
4998		struct poll_table_struct pt = { ._key = poll->events };
4999
5000		req->result = vfs_poll(req->file, &pt) & poll->events;
5001	}
5002
5003	spin_lock(&ctx->completion_lock);
5004	if (!req->result && !READ_ONCE(poll->canceled)) {
5005		add_wait_queue(poll->head, &poll->wait);
5006		return true;
5007	}
5008
5009	return false;
5010}
5011
5012static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5013{
5014	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5015	if (req->opcode == IORING_OP_POLL_ADD)
5016		return req->async_data;
5017	return req->apoll->double_poll;
5018}
5019
5020static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5021{
5022	if (req->opcode == IORING_OP_POLL_ADD)
5023		return &req->poll;
5024	return &req->apoll->poll;
5025}
5026
5027static void io_poll_remove_double(struct io_kiocb *req)
5028	__must_hold(&req->ctx->completion_lock)
5029{
5030	struct io_poll_iocb *poll = io_poll_get_double(req);
5031
5032	lockdep_assert_held(&req->ctx->completion_lock);
5033
5034	if (poll && poll->head) {
5035		struct wait_queue_head *head = poll->head;
5036
5037		spin_lock_irq(&head->lock);
5038		list_del_init(&poll->wait.entry);
5039		if (poll->wait.private)
5040			req_ref_put(req);
5041		poll->head = NULL;
5042		spin_unlock_irq(&head->lock);
5043	}
5044}
5045
5046static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
5047	__must_hold(&req->ctx->completion_lock)
5048{
5049	struct io_ring_ctx *ctx = req->ctx;
5050	unsigned flags = IORING_CQE_F_MORE;
5051	int error;
5052
5053	if (READ_ONCE(req->poll.canceled)) {
5054		error = -ECANCELED;
5055		req->poll.events |= EPOLLONESHOT;
5056	} else {
5057		error = mangle_poll(mask);
5058	}
5059	if (req->poll.events & EPOLLONESHOT)
5060		flags = 0;
5061	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
5062		req->poll.done = true;
5063		flags = 0;
5064	}
5065	if (flags & IORING_CQE_F_MORE)
5066		ctx->cq_extra++;
5067
5068	io_commit_cqring(ctx);
5069	return !(flags & IORING_CQE_F_MORE);
5070}
5071
5072static void io_poll_task_func(struct io_kiocb *req, bool *locked)
5073{
5074	struct io_ring_ctx *ctx = req->ctx;
5075	struct io_kiocb *nxt;
5076
5077	if (io_poll_rewait(req, &req->poll)) {
5078		spin_unlock(&ctx->completion_lock);
5079	} else {
5080		bool done;
5081
5082		done = io_poll_complete(req, req->result);
5083		if (done) {
5084			io_poll_remove_double(req);
5085			hash_del(&req->hash_node);
5086		} else {
5087			req->result = 0;
5088			add_wait_queue(req->poll.head, &req->poll.wait);
5089		}
5090		spin_unlock(&ctx->completion_lock);
5091		io_cqring_ev_posted(ctx);
5092
5093		if (done) {
5094			nxt = io_put_req_find_next(req);
5095			if (nxt)
5096				io_req_task_submit(nxt, locked);
5097		}
5098	}
5099}
5100
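/*
 * Wake handler for the extra poll entry used when a file polls on more
 * than one waitqueue. Multishot polls are forwarded straight to the
 * primary handler; for one-shot polls this entry is detached, the
 * primary entry is removed and woken if still queued, and the reference
 * held by the double entry is dropped.
 */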
5101static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
5102			       int sync, void *key)
5103{
5104	struct io_kiocb *req = wait->private;
5105	struct io_poll_iocb *poll = io_poll_get_single(req);
5106	__poll_t mask = key_to_poll(key);
5107	unsigned long flags;
5108
5109	/* for instances that support it, check for an event match first */
5110	if (mask && !(mask & poll->events))
5111		return 0;
5112	if (!(poll->events & EPOLLONESHOT))
5113		return poll->wait.func(&poll->wait, mode, sync, key);
5114
5115	list_del_init(&wait->entry);
5116
5117	if (poll->head) {
5118		bool done;
5119
5120		spin_lock_irqsave(&poll->head->lock, flags);
5121		done = list_empty(&poll->wait.entry);
5122		if (!done)
5123			list_del_init(&poll->wait.entry);
5124		/* make sure double remove sees this as being gone */
5125		wait->private = NULL;
5126		spin_unlock_irqrestore(&poll->head->lock, flags);
5127		if (!done) {
5128			/* use the wait func handler, so it matches the request type */
5129			poll->wait.func(&poll->wait, mode, sync, key);
5130		}
5131	}
5132	req_ref_put(req);
5133	return 1;
5134}
5135
5136static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5137			      wait_queue_func_t wake_func)
5138{
5139	poll->head = NULL;
5140	poll->done = false;
5141	poll->canceled = false;
5142#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5143	/* mask in events that we always want/need */
5144	poll->events = events | IO_POLL_UNMASK;
5145	INIT_LIST_HEAD(&poll->wait.entry);
5146	init_waitqueue_func_entry(&poll->wait, wake_func);
5147}
5148
5149static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5150			    struct wait_queue_head *head,
5151			    struct io_poll_iocb **poll_ptr)
5152{
5153	struct io_kiocb *req = pt->req;
5154
5155	/*
5156	 * The file being polled uses multiple waitqueues for poll handling
5157	 * (e.g. one for read, one for write). Set up a separate io_poll_iocb
5158	 * if this happens.
5159	 */
5160	if (unlikely(pt->nr_entries)) {
5161		struct io_poll_iocb *poll_one = poll;
5162
5163		/* double add on the same waitqueue head, ignore */
5164		if (poll_one->head == head)
5165			return;
5166		/* already have a 2nd entry, fail a third attempt */
5167		if (*poll_ptr) {
5168			if ((*poll_ptr)->head == head)
5169				return;
5170			pt->error = -EINVAL;
5171			return;
5172		}
5173		/*
5174		 * Can't handle multishot for double wait for now, turn it
5175		 * into one-shot mode.
5176		 */
5177		if (!(poll_one->events & EPOLLONESHOT))
5178			poll_one->events |= EPOLLONESHOT;
5179		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5180		if (!poll) {
5181			pt->error = -ENOMEM;
5182			return;
5183		}
5184		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
5185		req_ref_get(req);
5186		poll->wait.private = req;
5187		*poll_ptr = poll;
5188	}
5189
5190	pt->nr_entries++;
5191	poll->head = head;
5192
5193	if (poll->events & EPOLLEXCLUSIVE)
5194		add_wait_queue_exclusive(head, &poll->wait);
5195	else
5196		add_wait_queue(head, &poll->wait);
5197}
5198
5199static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5200			       struct poll_table_struct *p)
5201{
5202	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5203	struct async_poll *apoll = pt->req->apoll;
5204
5205	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5206}
5207
5208static void io_async_task_func(struct io_kiocb *req, bool *locked)
5209{
5210	struct async_poll *apoll = req->apoll;
5211	struct io_ring_ctx *ctx = req->ctx;
5212
5213	trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
5214
5215	if (io_poll_rewait(req, &apoll->poll)) {
5216		spin_unlock(&ctx->completion_lock);
5217		return;
5218	}
5219
5220	hash_del(&req->hash_node);
5221	io_poll_remove_double(req);
5222	spin_unlock(&ctx->completion_lock);
5223
5224	if (!READ_ONCE(apoll->poll.canceled))
5225		io_req_task_submit(req, locked);
5226	else
5227		io_req_complete_failed(req, -ECANCELED);
5228}
5229
5230static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5231			void *key)
5232{
5233	struct io_kiocb *req = wait->private;
5234	struct io_poll_iocb *poll = &req->apoll->poll;
5235
5236	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5237					key_to_poll(key));
5238
5239	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5240}
5241
5242static void io_poll_req_insert(struct io_kiocb *req)
5243{
5244	struct io_ring_ctx *ctx = req->ctx;
5245	struct hlist_head *list;
5246
5247	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5248	hlist_add_head(&req->hash_node, list);
5249}
5250
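/*
 * Shared arming helper for IORING_OP_POLL_ADD and internal async poll:
 * initialise the io_poll_iocb, register on the file's waitqueue(s) via
 * vfs_poll(), then, under ->completion_lock, either hash the request
 * for cancellation (still waiting) or tear the wait entry back down if
 * it already triggered or errored. Returns the ready mask, 0 if the
 * request is now waiting.
 */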
5251static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5252				      struct io_poll_iocb *poll,
5253				      struct io_poll_table *ipt, __poll_t mask,
5254				      wait_queue_func_t wake_func)
5255	__acquires(&ctx->completion_lock)
5256{
5257	struct io_ring_ctx *ctx = req->ctx;
5258	bool cancel = false;
5259
5260	INIT_HLIST_NODE(&req->hash_node);
5261	io_init_poll_iocb(poll, mask, wake_func);
5262	poll->file = req->file;
5263	poll->wait.private = req;
5264
5265	ipt->pt._key = mask;
5266	ipt->req = req;
5267	ipt->error = 0;
5268	ipt->nr_entries = 0;
5269
5270	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5271	if (unlikely(!ipt->nr_entries) && !ipt->error)
5272		ipt->error = -EINVAL;
5273
5274	spin_lock(&ctx->completion_lock);
5275	if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
5276		io_poll_remove_double(req);
5277	if (likely(poll->head)) {
5278		spin_lock_irq(&poll->head->lock);
5279		if (unlikely(list_empty(&poll->wait.entry))) {
5280			if (ipt->error)
5281				cancel = true;
5282			ipt->error = 0;
5283			mask = 0;
5284		}
5285		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
5286			list_del_init(&poll->wait.entry);
5287		else if (cancel)
5288			WRITE_ONCE(poll->canceled, true);
5289		else if (!poll->done) /* actually waiting for an event */
5290			io_poll_req_insert(req);
5291		spin_unlock_irq(&poll->head->lock);
5292	}
5293
5294	return mask;
5295}
5296
5297enum {
5298	IO_APOLL_OK,
5299	IO_APOLL_ABORTED,
5300	IO_APOLL_READY
5301};
5302
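/*
 * Arm internal poll for a request whose nonblocking attempt couldn't
 * complete, so it is retried when the file signals readiness instead of
 * being punted to io-wq. Returns IO_APOLL_OK if poll was armed,
 * IO_APOLL_READY if the file is already ready (the caller should retry
 * immediately), or IO_APOLL_ABORTED if poll can't be used and the
 * request should go to io-wq after all.
 */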
5303static int io_arm_poll_handler(struct io_kiocb *req)
5304{
5305	const struct io_op_def *def = &io_op_defs[req->opcode];
5306	struct io_ring_ctx *ctx = req->ctx;
5307	struct async_poll *apoll;
5308	struct io_poll_table ipt;
5309	__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
5310	int rw;
5311
5312	if (!req->file || !file_can_poll(req->file))
5313		return IO_APOLL_ABORTED;
5314	if (req->flags & REQ_F_POLLED)
5315		return IO_APOLL_ABORTED;
5316	if (!def->pollin && !def->pollout)
5317		return IO_APOLL_ABORTED;
5318
5319	if (def->pollin) {
5320		rw = READ;
5321		mask |= POLLIN | POLLRDNORM;
5322
5323		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5324		if ((req->opcode == IORING_OP_RECVMSG) &&
5325		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5326			mask &= ~POLLIN;
5327	} else {
5328		rw = WRITE;
5329		mask |= POLLOUT | POLLWRNORM;
5330	}
5331
5332	/* if we can't do a nonblocking try, no point in arming a poll handler */
5333	if (!io_file_supports_nowait(req, rw))
5334		return IO_APOLL_ABORTED;
5335
5336	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5337	if (unlikely(!apoll))
5338		return IO_APOLL_ABORTED;
5339	apoll->double_poll = NULL;
5340	req->apoll = apoll;
5341	req->flags |= REQ_F_POLLED;
5342	ipt.pt._qproc = io_async_queue_proc;
5343	io_req_set_refcount(req);
5344
5345	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5346					io_async_wake);
5347	spin_unlock(&ctx->completion_lock);
5348	if (ret || ipt.error)
5349		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5350
5351	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5352				mask, apoll->poll.events);
5353	return IO_APOLL_OK;
5354}
5355
5356static bool __io_poll_remove_one(struct io_kiocb *req,
5357				 struct io_poll_iocb *poll, bool do_cancel)
5358	__must_hold(&req->ctx->completion_lock)
5359{
5360	bool do_complete = false;
5361
5362	if (!poll->head)
5363		return false;
5364	spin_lock_irq(&poll->head->lock);
5365	if (do_cancel)
5366		WRITE_ONCE(poll->canceled, true);
5367	if (!list_empty(&poll->wait.entry)) {
5368		list_del_init(&poll->wait.entry);
5369		do_complete = true;
5370	}
5371	spin_unlock_irq(&poll->head->lock);
5372	hash_del(&req->hash_node);
5373	return do_complete;
5374}
5375
5376static bool io_poll_remove_one(struct io_kiocb *req)
5377	__must_hold(&req->ctx->completion_lock)
5378{
5379	bool do_complete;
5380
5381	io_poll_remove_double(req);
5382	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
5383
5384	if (do_complete) {
5385		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
5386		io_commit_cqring(req->ctx);
5387		req_set_fail(req);
5388		io_put_req_deferred(req);
5389	}
5390	return do_complete;
5391}
5392
5393/*
5394 * Returns true if we found and killed one or more poll requests
5395 */
5396static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5397			       bool cancel_all)
5398{
5399	struct hlist_node *tmp;
5400	struct io_kiocb *req;
5401	int posted = 0, i;
5402
5403	spin_lock(&ctx->completion_lock);
5404	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5405		struct hlist_head *list;
5406
5407		list = &ctx->cancel_hash[i];
5408		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5409			if (io_match_task(req, tsk, cancel_all))
5410				posted += io_poll_remove_one(req);
5411		}
5412	}
5413	spin_unlock(&ctx->completion_lock);
5414
5415	if (posted)
5416		io_cqring_ev_posted(ctx);
5417
5418	return posted != 0;
5419}
5420
5421static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5422				     bool poll_only)
5423	__must_hold(&ctx->completion_lock)
5424{
5425	struct hlist_head *list;
5426	struct io_kiocb *req;
5427
5428	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5429	hlist_for_each_entry(req, list, hash_node) {
5430		if (sqe_addr != req->user_data)
5431			continue;
5432		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5433			continue;
5434		return req;
5435	}
5436	return NULL;
5437}
5438
5439static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5440			  bool poll_only)
5441	__must_hold(&ctx->completion_lock)
5442{
5443	struct io_kiocb *req;
5444
5445	req = io_poll_find(ctx, sqe_addr, poll_only);
5446	if (!req)
5447		return -ENOENT;
5448	if (io_poll_remove_one(req))
5449		return 0;
5450
5451	return -EALREADY;
5452}
5453
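/*
 * Translate the 32-bit poll mask from the SQE (stored as two halfwords,
 * hence the halfword swap on big-endian) into the internal EPOLL*
 * representation. Unless the multishot flag was given, EPOLLONESHOT is
 * forced so the poll completes after the first event.
 */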
5454static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5455				     unsigned int flags)
5456{
5457	u32 events;
5458
5459	events = READ_ONCE(sqe->poll32_events);
5460#ifdef __BIG_ENDIAN
5461	events = swahw32(events);
5462#endif
5463	if (!(flags & IORING_POLL_ADD_MULTI))
5464		events |= EPOLLONESHOT;
5465	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5466}
5467
5468static int io_poll_update_prep(struct io_kiocb *req,
5469			       const struct io_uring_sqe *sqe)
5470{
5471	struct io_poll_update *upd = &req->poll_update;
5472	u32 flags;
5473
5474	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5475		return -EINVAL;
5476	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5477		return -EINVAL;
5478	flags = READ_ONCE(sqe->len);
5479	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5480		      IORING_POLL_ADD_MULTI))
5481		return -EINVAL;
5482	/* meaningless without update */
5483	if (flags == IORING_POLL_ADD_MULTI)
5484		return -EINVAL;
5485
5486	upd->old_user_data = READ_ONCE(sqe->addr);
5487	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5488	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5489
5490	upd->new_user_data = READ_ONCE(sqe->off);
5491	if (!upd->update_user_data && upd->new_user_data)
5492		return -EINVAL;
5493	if (upd->update_events)
5494		upd->events = io_poll_parse_events(sqe, flags);
5495	else if (sqe->poll32_events)
5496		return -EINVAL;
5497
5498	return 0;
5499}
5500
5501static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5502			void *key)
5503{
5504	struct io_kiocb *req = wait->private;
5505	struct io_poll_iocb *poll = &req->poll;
5506
5507	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5508}
5509
5510static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5511			       struct poll_table_struct *p)
5512{
5513	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5514
5515	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5516}
5517
5518static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5519{
5520	struct io_poll_iocb *poll = &req->poll;
5521	u32 flags;
5522
5523	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5524		return -EINVAL;
5525	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5526		return -EINVAL;
5527	flags = READ_ONCE(sqe->len);
5528	if (flags & ~IORING_POLL_ADD_MULTI)
5529		return -EINVAL;
5530
5531	io_req_set_refcount(req);
5532	poll->events = io_poll_parse_events(sqe, flags);
5533	return 0;
5534}
5535
5536static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5537{
5538	struct io_poll_iocb *poll = &req->poll;
5539	struct io_ring_ctx *ctx = req->ctx;
5540	struct io_poll_table ipt;
5541	__poll_t mask;
5542
5543	ipt.pt._qproc = io_poll_queue_proc;
5544
5545	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5546					io_poll_wake);
5547
5548	if (mask) { /* no async, we'd stolen it */
5549		ipt.error = 0;
5550		io_poll_complete(req, mask);
5551	}
5552	spin_unlock(&ctx->completion_lock);
5553
5554	if (mask) {
5555		io_cqring_ev_posted(ctx);
5556		if (poll->events & EPOLLONESHOT)
5557			io_put_req(req);
5558	}
5559	return ipt.error;
5560}
5561
5562static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
5563{
5564	struct io_ring_ctx *ctx = req->ctx;
5565	struct io_kiocb *preq;
5566	bool completing;
5567	int ret;
5568
5569	spin_lock(&ctx->completion_lock);
5570	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
5571	if (!preq) {
5572		ret = -ENOENT;
5573		goto err;
5574	}
5575
5576	if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5577		completing = true;
5578		ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5579		goto err;
5580	}
5581
5582	/*
5583	 * Don't allow racy completion with singleshot, as we cannot safely
5584	 * update those. For multishot, if we're racing with completion, just
5585	 * let completion re-add it.
5586	 */
5587	completing = !__io_poll_remove_one(preq, &preq->poll, false);
5588	if (completing && (preq->poll.events & EPOLLONESHOT)) {
5589		ret = -EALREADY;
5590		goto err;
5591	}
5592	/* we now have a detached poll request. reissue. */
5593	ret = 0;
5594err:
5595	if (ret < 0) {
5596		spin_unlock(&ctx->completion_lock);
5597		req_set_fail(req);
5598		io_req_complete(req, ret);
5599		return 0;
5600	}
5601	/* only replace the event mask, keep the behavior flags */
5602	if (req->poll_update.update_events) {
5603		preq->poll.events &= ~0xffff;
5604		preq->poll.events |= req->poll_update.events & 0xffff;
5605		preq->poll.events |= IO_POLL_UNMASK;
5606	}
5607	if (req->poll_update.update_user_data)
5608		preq->user_data = req->poll_update.new_user_data;
5609	spin_unlock(&ctx->completion_lock);
5610
5611	/* complete update request, we're done with it */
5612	io_req_complete(req, ret);
5613
5614	if (!completing) {
5615		ret = io_poll_add(preq, issue_flags);
5616		if (ret < 0) {
5617			req_set_fail(preq);
5618			io_req_complete(preq, ret);
5619		}
5620	}
5621	return 0;
5622}
5623
5624static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
5625{
5626	req_set_fail(req);
5627	io_req_complete_post(req, -ETIME, 0);
5628}
5629
5630static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5631{
5632	struct io_timeout_data *data = container_of(timer,
5633						struct io_timeout_data, timer);
5634	struct io_kiocb *req = data->req;
5635	struct io_ring_ctx *ctx = req->ctx;
5636	unsigned long flags;
5637
5638	spin_lock_irqsave(&ctx->timeout_lock, flags);
5639	list_del_init(&req->timeout.list);
5640	atomic_set(&req->ctx->cq_timeouts,
5641		atomic_read(&req->ctx->cq_timeouts) + 1);
5642	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
5643
5644	req->io_task_work.func = io_req_task_timeout;
5645	io_req_task_work_add(req);
5646	return HRTIMER_NORESTART;
5647}
5648
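/*
 * Find the timeout matching @user_data on ->timeout_list and try to
 * cancel its hrtimer. On success the request is unlinked from the list
 * and returned; otherwise an ERR_PTR of -ENOENT (not found) or
 * -EALREADY (timer already firing) is returned.
 */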
5649static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5650					   __u64 user_data)
5651	__must_hold(&ctx->timeout_lock)
5652{
5653	struct io_timeout_data *io;
5654	struct io_kiocb *req;
5655	bool found = false;
5656
5657	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5658		found = user_data == req->user_data;
5659		if (found)
5660			break;
5661	}
5662	if (!found)
5663		return ERR_PTR(-ENOENT);
5664
5665	io = req->async_data;
5666	if (hrtimer_try_to_cancel(&io->timer) == -1)
5667		return ERR_PTR(-EALREADY);
5668	list_del_init(&req->timeout.list);
5669	return req;
5670}
5671
5672static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5673	__must_hold(&ctx->completion_lock)
5674	__must_hold(&ctx->timeout_lock)
5675{
5676	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5677
5678	if (IS_ERR(req))
5679		return PTR_ERR(req);
5680
5681	req_set_fail(req);
5682	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
5683	io_put_req_deferred(req);
5684	return 0;
5685}
5686
5687static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5688			     struct timespec64 *ts, enum hrtimer_mode mode)
5689	__must_hold(&ctx->timeout_lock)
5690{
5691	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5692	struct io_timeout_data *data;
5693
5694	if (IS_ERR(req))
5695		return PTR_ERR(req);
5696
5697	req->timeout.off = 0; /* noseq */
5698	data = req->async_data;
5699	list_add_tail(&req->timeout.list, &ctx->timeout_list);
5700	hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5701	data->timer.function = io_timeout_fn;
5702	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5703	return 0;
5704}
5705
5706static int io_timeout_remove_prep(struct io_kiocb *req,
5707				  const struct io_uring_sqe *sqe)
5708{
5709	struct io_timeout_rem *tr = &req->timeout_rem;
5710
5711	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5712		return -EINVAL;
5713	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5714		return -EINVAL;
5715	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
5716		return -EINVAL;
5717
5718	tr->addr = READ_ONCE(sqe->addr);