// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}
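
/*
 * Example (illustrative only, not kernel code): this path is reached from
 * userspace by passing an eventfd descriptor to io_uring_register(2). A
 * minimal raw-syscall sketch, assuming "ring_fd" refers to an io_uring
 * instance and that <sys/eventfd.h>, <sys/syscall.h> and <linux/io_uring.h>
 * are included:
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_EVENTFD, &efd, 1);
 *	// ret is 0 on success; -1 with errno == EBUSY if an eventfd is
 *	// already registered on this ring.
 *
 * IORING_REGISTER_EVENTFD_ASYNC takes the same argument but sets
 * eventfd_async, so the eventfd is only signalled for requests that
 * completed out of line.
 */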

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
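
/*
 * Example (illustrative only): IORING_REGISTER_PROBE is how userspace asks
 * which opcodes the running kernel supports. A sketch using the liburing
 * helpers (assuming liburing is available; the raw syscall with a
 * struct io_uring_probe of nr_args entries works the same way):
 *
 *	struct io_uring_probe *probe = io_uring_get_probe();
 *
 *	if (probe && io_uring_opcode_supported(probe, IORING_OP_SENDMSG))
 *		printf("IORING_OP_SENDMSG is supported\n");
 *	io_uring_free_probe(probe);
 */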

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
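
/*
 * Example (illustrative only): personalities let a task register its current
 * credentials and have later SQEs issued with them. A sketch with liburing,
 * assuming "ring" is an initialized struct io_uring and "file" is a
 * placeholder path:
 *
 *	int id = io_uring_register_personality(&ring);
 *
 *	if (id >= 0) {
 *		struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *		io_uring_prep_openat(sqe, AT_FDCWD, "file", O_RDONLY, 0);
 *		sqe->personality = id;	// issue with the registered creds
 *	}
 */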

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task was set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
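
/*
 * Example (illustrative only): restrictions are intended for a privileged
 * task that sets up a ring, locks it down, and then hands it to a less
 * trusted task. A sketch with liburing, assuming the ring was created with
 * IORING_SETUP_R_DISABLED:
 *
 *	struct io_uring_restriction res[2] = { };
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READ;			// allow only reads
 *	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[1].register_op = IORING_REGISTER_BUFFERS;	// and buffer registration
 *
 *	io_uring_register_restrictions(&ring, res, 2);
 *	io_uring_enable_rings(&ring);	// clears IORING_SETUP_R_DISABLED
 */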

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
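
/*
 * Example (illustrative only): pinning the io-wq workers (or, with SQPOLL,
 * the poll thread's workers) to a CPU set. A sketch with liburing, assuming
 * an initialized "ring" and _GNU_SOURCE for the cpu_set_t macros:
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);	// allow workers on CPU 0 only
 *	io_uring_register_iowq_aff(&ring, sizeof(mask), &mask);
 *	...
 *	io_uring_unregister_iowq_aff(&ring);	// back to no restriction
 */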

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
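
/*
 * Example (illustrative only): capping the number of bounded and unbounded
 * io-wq workers. The two-element array is both input and output: a zero
 * entry means "leave that limit unchanged", and the previous limits are
 * copied back on return. A sketch with liburing:
 *
 *	unsigned int values[2] = { 8, 4 };	// [0] = bounded, [1] = unbounded
 *
 *	if (!io_uring_register_iowq_max_workers(&ring, values))
 *		printf("old limits: bounded %u, unbounded %u\n",
 *		       values[0], values[1]);
 */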

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We no longer quiesce the ctx refs for register, and the ctx can't
	 * be dying since we hold a file reference here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * The ring fd has been registered via IORING_REGISTER_RING_FDS,
		 * so we only need to dereference our task-private array to
		 * find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}
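
/*
 * Example (illustrative only): IORING_REGISTER_USE_REGISTERED_RING lets a
 * task that has registered its ring fd (IORING_REGISTER_RING_FDS) issue
 * further register calls via the registered index instead of a real fd,
 * skipping the fget()/fput() pair above. A raw-syscall sketch, assuming
 * "index" is the offset at which the ring fd was registered:
 *
 *	ret = syscall(__NR_io_uring_register, index,
 *		      IORING_REGISTER_EVENTFD | IORING_REGISTER_USE_REGISTERED_RING,
 *		      &efd, 1);
 *
 * Recent liburing versions can apply this flag transparently after
 * io_uring_register_ring_fd().
 */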