1// SPDX-License-Identifier: GPL-2.0
2
3#include "io_uring.h"
4#include "napi.h"
5
6#ifdef CONFIG_NET_RX_BUSY_POLL
7
/*
 * Timeout for cleanout of stale entries. An entry's expiry is set to
 * jiffies + NAPI_TIMEOUT each time its NAPI id is added or found again.
 * NOTE(review): SEC_CONVERSION is defined outside this file; confirm that
 * 60 * SEC_CONVERSION really is 60 seconds expressed in jiffies.
 */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
10
/* One NAPI id tracked for busy polling by an io_uring context. */
struct io_napi_entry {
	/* NAPI id taken from the socket's sk_napi_id */
	unsigned int		napi_id;
	/* link in ctx->napi_list, walked by the busy poll loop */
	struct list_head	list;

	/* expiry in jiffies; once passed, the entry is treated as stale */
	unsigned long		timeout;
	/* link in the ctx->napi_ht hash bucket, used for lookup by id */
	struct hlist_node	node;

	/* used by kfree_rcu() when the entry is released */
	struct rcu_head		rcu;
};
20
21static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22					       unsigned int napi_id)
23{
24	struct io_napi_entry *e;
25
26	hlist_for_each_entry_rcu(e, hash_list, node) {
27		if (e->napi_id != napi_id)
28			continue;
29		e->timeout = jiffies + NAPI_TIMEOUT;
30		return e;
31	}
32
33	return NULL;
34}
35
36void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
37{
38	struct hlist_head *hash_list;
39	unsigned int napi_id;
40	struct sock *sk;
41	struct io_napi_entry *e;
42
43	sk = sock->sk;
44	if (!sk)
45		return;
46
47	napi_id = READ_ONCE(sk->sk_napi_id);
48
49	/* Non-NAPI IDs can be rejected. */
50	if (napi_id < MIN_NAPI_ID)
51		return;
52
53	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
54
55	rcu_read_lock();
56	e = io_napi_hash_find(hash_list, napi_id);
57	if (e) {
58		e->timeout = jiffies + NAPI_TIMEOUT;
59		rcu_read_unlock();
60		return;
61	}
62	rcu_read_unlock();
63
64	e = kmalloc(sizeof(*e), GFP_NOWAIT);
65	if (!e)
66		return;
67
68	e->napi_id = napi_id;
69	e->timeout = jiffies + NAPI_TIMEOUT;
70
71	spin_lock(&ctx->napi_lock);
72	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
73		spin_unlock(&ctx->napi_lock);
74		kfree(e);
75		return;
76	}
77
78	hlist_add_tail_rcu(&e->node, hash_list);
79	list_add_tail(&e->list, &ctx->napi_list);
80	spin_unlock(&ctx->napi_lock);
81}
82
83static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
84{
85	struct io_napi_entry *e;
86	unsigned int i;
87
88	spin_lock(&ctx->napi_lock);
89	hash_for_each(ctx->napi_ht, i, e, node) {
90		if (time_after(jiffies, e->timeout)) {
91			list_del(&e->list);
92			hash_del_rcu(&e->node);
93			kfree_rcu(e, rcu);
94		}
95	}
96	spin_unlock(&ctx->napi_lock);
97}
98
99static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
100{
101	if (is_stale)
102		__io_napi_remove_stale(ctx);
103}
104
105static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
106					     unsigned long bp_usec)
107{
108	if (bp_usec) {
109		unsigned long end_time = start_time + bp_usec;
110		unsigned long now = busy_loop_current_time();
111
112		return time_after(now, end_time);
113	}
114
115	return true;
116}
117
118static bool io_napi_busy_loop_should_end(void *data,
119					 unsigned long start_time)
120{
121	struct io_wait_queue *iowq = data;
122
123	if (signal_pending(current))
124		return true;
125	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
126		return true;
127	if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
128		return true;
129
130	return false;
131}
132
133static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
134				   void *loop_end_arg)
135{
136	struct io_napi_entry *e;
137	bool (*loop_end)(void *, unsigned long) = NULL;
138	bool is_stale = false;
139
140	if (loop_end_arg)
141		loop_end = io_napi_busy_loop_should_end;
142
143	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
144		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
145				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
146
147		if (time_after(jiffies, e->timeout))
148			is_stale = true;
149	}
150
151	return is_stale;
152}
153
154static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
155				       struct io_wait_queue *iowq)
156{
157	unsigned long start_time = busy_loop_current_time();
158	void *loop_end_arg = NULL;
159	bool is_stale = false;
160
161	/* Singular lists use a different napi loop end check function and are
162	 * only executed once.
163	 */
164	if (list_is_singular(&ctx->napi_list))
165		loop_end_arg = iowq;
166
167	rcu_read_lock();
168	do {
169		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
170	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
171	rcu_read_unlock();
172
173	io_napi_remove_stale(ctx, is_stale);
174}
175
176/*
177 * io_napi_init() - Init napi settings
178 * @ctx: pointer to io-uring context structure
179 *
180 * Init napi settings in the io-uring context.
181 */
182void io_napi_init(struct io_ring_ctx *ctx)
183{
184	INIT_LIST_HEAD(&ctx->napi_list);
185	spin_lock_init(&ctx->napi_lock);
186	ctx->napi_prefer_busy_poll = false;
187	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
188}
189
190/*
191 * io_napi_free() - Deallocate napi
192 * @ctx: pointer to io-uring context structure
193 *
194 * Free the napi list and the hash table in the io-uring context.
195 */
196void io_napi_free(struct io_ring_ctx *ctx)
197{
198	struct io_napi_entry *e;
199	LIST_HEAD(napi_list);
200	unsigned int i;
201
202	spin_lock(&ctx->napi_lock);
203	hash_for_each(ctx->napi_ht, i, e, node) {
204		hash_del_rcu(&e->node);
205		kfree_rcu(e, rcu);
206	}
207	spin_unlock(&ctx->napi_lock);
208}
209
210/*
211 * io_napi_register() - Register napi with io-uring
212 * @ctx: pointer to io-uring context structure
213 * @arg: pointer to io_uring_napi structure
214 *
215 * Register napi in the io-uring context.
216 */
217int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
218{
219	const struct io_uring_napi curr = {
220		.busy_poll_to 	  = ctx->napi_busy_poll_to,
221		.prefer_busy_poll = ctx->napi_prefer_busy_poll
222	};
223	struct io_uring_napi napi;
224
225	if (copy_from_user(&napi, arg, sizeof(napi)))
226		return -EFAULT;
227	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
228		return -EINVAL;
229
230	if (copy_to_user(arg, &curr, sizeof(curr)))
231		return -EFAULT;
232
233	WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
234	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
235	WRITE_ONCE(ctx->napi_enabled, true);
236	return 0;
237}
238
239/*
240 * io_napi_unregister() - Unregister napi with io-uring
241 * @ctx: pointer to io-uring context structure
242 * @arg: pointer to io_uring_napi structure
243 *
244 * Unregister napi. If arg has been specified copy the busy poll timeout and
245 * prefer busy poll setting to the passed in structure.
246 */
247int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
248{
249	const struct io_uring_napi curr = {
250		.busy_poll_to 	  = ctx->napi_busy_poll_to,
251		.prefer_busy_poll = ctx->napi_prefer_busy_poll
252	};
253
254	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
255		return -EFAULT;
256
257	WRITE_ONCE(ctx->napi_busy_poll_to, 0);
258	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
259	WRITE_ONCE(ctx->napi_enabled, false);
260	return 0;
261}
262
263/*
264 * __io_napi_adjust_timeout() - Add napi id to the busy poll list
265 * @ctx: pointer to io-uring context structure
266 * @iowq: pointer to io wait queue
267 * @ts: pointer to timespec or NULL
268 *
269 * Adjust the busy loop timeout according to timespec and busy poll timeout.
270 */
271void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
272			      struct timespec64 *ts)
273{
274	unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
275
276	if (ts) {
277		struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
278
279		if (timespec64_compare(ts, &poll_to_ts) > 0) {
280			*ts = timespec64_sub(*ts, poll_to_ts);
281		} else {
282			u64 to = timespec64_to_ns(ts);
283
284			do_div(to, 1000);
285			ts->tv_sec = 0;
286			ts->tv_nsec = 0;
287		}
288	}
289
290	iowq->napi_busy_poll_to = poll_to;
291}
292
293/*
294 * __io_napi_busy_loop() - execute busy poll loop
295 * @ctx: pointer to io-uring context structure
296 * @iowq: pointer to io wait queue
297 *
298 * Execute the busy poll loop and merge the spliced off list.
299 */
300void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
301{
302	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
303
304	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
305		io_napi_blocking_busy_loop(ctx, iowq);
306}
307
308/*
309 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
310 * @ctx: pointer to io-uring context structure
311 *
312 * Splice of the napi list and execute the napi busy poll loop.
313 */
314int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
315{
316	LIST_HEAD(napi_list);
317	bool is_stale = false;
318
319	if (!READ_ONCE(ctx->napi_busy_poll_to))
320		return 0;
321	if (list_empty_careful(&ctx->napi_list))
322		return 0;
323
324	rcu_read_lock();
325	is_stale = __io_napi_do_busy_loop(ctx, NULL);
326	rcu_read_unlock();
327
328	io_napi_remove_stale(ctx, is_stale);
329	return 1;
330}
331
332#endif
333