1// SPDX-License-Identifier: GPL-2.0
2/* XDP sockets
3 *
4 * AF_XDP sockets provide a channel between XDP programs and userspace
5 * applications.
6 * Copyright(c) 2018 Intel Corporation.
7 *
8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
9 *	      Magnus Karlsson <magnus.karlsson@intel.com>
10 */
11
12#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13
14#include <linux/if_xdp.h>
15#include <linux/init.h>
16#include <linux/sched/mm.h>
17#include <linux/sched/signal.h>
18#include <linux/sched/task.h>
19#include <linux/socket.h>
20#include <linux/file.h>
21#include <linux/uaccess.h>
22#include <linux/net.h>
23#include <linux/netdevice.h>
24#include <linux/rculist.h>
25#include <linux/vmalloc.h>
26#include <net/xdp_sock_drv.h>
27#include <net/busy_poll.h>
28#include <net/netdev_rx_queue.h>
29#include <net/xdp.h>
30
31#include "xsk_queue.h"
32#include "xdp_umem.h"
33#include "xsk.h"
34
35#define TX_BATCH_SIZE 32
36#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
37
38static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
39
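/* Driver-facing helpers for the need_wakeup feature. Setting
 * XDP_RING_NEED_WAKEUP on the fill ring or on the Tx rings tells user
 * space that it has to wake up the kernel (e.g. via poll() or sendto())
 * for the driver to make further progress. cached_need_wakeup mirrors
 * the ring flags so that the shared rings are not written on every call.
 */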
40void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
41{
42	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
43		return;
44
45	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
46	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
47}
48EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
49
50void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
51{
52	struct xdp_sock *xs;
53
54	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
55		return;
56
57	rcu_read_lock();
58	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
59		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
60	}
61	rcu_read_unlock();
62
63	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
64}
65EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
66
67void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
68{
69	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
70		return;
71
72	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
73	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
74}
75EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
76
77void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
78{
79	struct xdp_sock *xs;
80
81	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
82		return;
83
84	rcu_read_lock();
85	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
86		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
87	}
88	rcu_read_unlock();
89
90	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
91}
92EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
93
94bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
95{
96	return pool->uses_need_wakeup;
97}
98EXPORT_SYMBOL(xsk_uses_need_wakeup);
99
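/* Look up the buffer pool registered for @queue_id on @dev. The pool is
 * stored in both the Rx and Tx queue structs by xsk_reg_pool_at_qid(),
 * so whichever of the two covers this queue id is consulted.
 */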
100struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
101					    u16 queue_id)
102{
103	if (queue_id < dev->real_num_rx_queues)
104		return dev->_rx[queue_id].pool;
105	if (queue_id < dev->real_num_tx_queues)
106		return dev->_tx[queue_id].pool;
107
108	return NULL;
109}
110EXPORT_SYMBOL(xsk_get_pool_from_qid);
111
112void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
113{
114	if (queue_id < dev->num_rx_queues)
115		dev->_rx[queue_id].pool = NULL;
116	if (queue_id < dev->num_tx_queues)
117		dev->_tx[queue_id].pool = NULL;
118}
119
120/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
121 * not know if the device has more tx queues than rx, or the opposite.
122 * This might also change during run time.
123 */
124int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
125			u16 queue_id)
126{
127	if (queue_id >= max_t(unsigned int,
128			      dev->real_num_rx_queues,
129			      dev->real_num_tx_queues))
130		return -EINVAL;
131
132	if (queue_id < dev->real_num_rx_queues)
133		dev->_rx[queue_id].pool = pool;
134	if (queue_id < dev->real_num_tx_queues)
135		dev->_tx[queue_id].pool = pool;
136
137	return 0;
138}
139
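/* Post one zero-copy buffer as a descriptor on the socket's Rx ring. On
 * success, ownership of the buffer passes to user space through the Rx
 * ring; if the ring is full, rx_queue_full is bumped and the error is
 * returned.
 */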
140static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
141			u32 flags)
142{
143	u64 addr;
144	int err;
145
146	addr = xp_get_handle(xskb);
147	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
148	if (err) {
149		xs->rx_queue_full++;
150		return err;
151	}
152
153	xp_release(xskb);
154	return 0;
155}
156
157static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
158{
159	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
160	u32 frags = xdp_buff_has_frags(xdp);
161	struct xdp_buff_xsk *pos, *tmp;
162	struct list_head *xskb_list;
163	u32 contd = 0;
164	int err;
165
166	if (frags)
167		contd = XDP_PKT_CONTD;
168
169	err = __xsk_rcv_zc(xs, xskb, len, contd);
170	if (err)
171		goto err;
172	if (likely(!frags))
173		return 0;
174
175	xskb_list = &xskb->pool->xskb_list;
176	list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
177		if (list_is_singular(xskb_list))
178			contd = 0;
179		len = pos->xdp.data_end - pos->xdp.data;
180		err = __xsk_rcv_zc(xs, pos, len, contd);
181		if (err)
182			goto err;
183		list_del(&pos->xskb_list_node);
184	}
185
186	return 0;
187err:
188	xsk_buff_free(xdp);
189	return err;
190}
191
192static void *xsk_copy_xdp_start(struct xdp_buff *from)
193{
194	if (unlikely(xdp_data_meta_unsupported(from)))
195		return from->data;
196	else
197		return from->data_meta;
198}
199
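/* Copy packet bytes into one destination frame of at most @to_len bytes.
 * @from/@from_len track the current source area (linear part first, then
 * the frags, advanced via @frag) and @rem is the total number of bytes
 * still left to copy for this packet. Returns the number of bytes copied.
 */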
200static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
201			u32 *from_len, skb_frag_t **frag, u32 rem)
202{
203	u32 copied = 0;
204
205	while (1) {
206		u32 copy_len = min_t(u32, *from_len, to_len);
207
208		memcpy(to, *from, copy_len);
209		copied += copy_len;
210		if (rem == copied)
211			return copied;
212
213		if (*from_len == copy_len) {
214			*from = skb_frag_address(*frag);
215			*from_len = skb_frag_size((*frag)++);
216		} else {
217			*from += copy_len;
218			*from_len -= copy_len;
219		}
220		if (to_len == copy_len)
221			return copied;
222
223		to_len -= copy_len;
224		to += copy_len;
225	}
226}
227
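/* Copy-mode receive. A packet that fits in a single pool frame is copied
 * into one freshly allocated buffer; larger or multi-frag packets are
 * spread over several frames, with XDP_PKT_CONTD set on every descriptor
 * except the last one.
 */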
228static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
229{
230	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
231	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
232	u32 from_len, meta_len, rem, num_desc;
233	struct xdp_buff_xsk *xskb;
234	struct xdp_buff *xsk_xdp;
235	skb_frag_t *frag;
236
237	from_len = xdp->data_end - copy_from;
238	meta_len = xdp->data - copy_from;
239	rem = len + meta_len;
240
241	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
242		int err;
243
244		xsk_xdp = xsk_buff_alloc(xs->pool);
245		if (!xsk_xdp) {
246			xs->rx_dropped++;
247			return -ENOMEM;
248		}
249		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
250		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
251		err = __xsk_rcv_zc(xs, xskb, len, 0);
252		if (err) {
253			xsk_buff_free(xsk_xdp);
254			return err;
255		}
256
257		return 0;
258	}
259
260	num_desc = (len - 1) / frame_size + 1;
261
262	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
263		xs->rx_dropped++;
264		return -ENOMEM;
265	}
266	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
267		xs->rx_queue_full++;
268		return -ENOBUFS;
269	}
270
271	if (xdp_buff_has_frags(xdp)) {
272		struct skb_shared_info *sinfo;
273
274		sinfo = xdp_get_shared_info_from_buff(xdp);
275		frag = &sinfo->frags[0];
276	}
277
278	do {
279		u32 to_len = frame_size + meta_len;
280		u32 copied;
281
282		xsk_xdp = xsk_buff_alloc(xs->pool);
283		copy_to = xsk_xdp->data - meta_len;
284
285		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
286		rem -= copied;
287
288		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
289		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
290		meta_len = 0;
291	} while (rem);
292
293	return 0;
294}
295
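/* The Tx ring is reported as writeable (sk_write_space/EPOLLOUT) only
 * while no more than half of its entries are outstanding, so that user
 * space is not woken up until there is ample room to produce into.
 */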
296static bool xsk_tx_writeable(struct xdp_sock *xs)
297{
298	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
299		return false;
300
301	return true;
302}
303
304static bool xsk_is_bound(struct xdp_sock *xs)
305{
306	if (READ_ONCE(xs->state) == XSK_BOUND) {
307		/* Matches smp_wmb() in bind(). */
308		smp_rmb();
309		return true;
310	}
311	return false;
312}
313
314static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
315{
316	struct net_device *dev = xdp->rxq->dev;
317	u32 qid = xdp->rxq->queue_index;
318
319	if (!xsk_is_bound(xs))
320		return -ENXIO;
321
322	if (!dev->_rx[qid].pool || xs->umem != dev->_rx[qid].pool->umem)
323		return -EINVAL;
324
325	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
326		xs->rx_dropped++;
327		return -ENOSPC;
328	}
329
330	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
331	return 0;
332}
333
334static void xsk_flush(struct xdp_sock *xs)
335{
336	xskq_prod_submit(xs->rx);
337	__xskq_cons_release(xs->pool->fq);
338	sock_def_readable(&xs->sk);
339}
340
341int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
342{
343	u32 len = xdp_get_buff_len(xdp);
344	int err;
345
346	spin_lock_bh(&xs->rx_lock);
347	err = xsk_rcv_check(xs, xdp, len);
348	if (!err) {
349		err = __xsk_rcv(xs, xdp, len);
350		xsk_flush(xs);
351	}
352	spin_unlock_bh(&xs->rx_lock);
353	return err;
354}
355
356static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
357{
358	u32 len = xdp_get_buff_len(xdp);
359	int err;
360
361	err = xsk_rcv_check(xs, xdp, len);
362	if (err)
363		return err;
364
365	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
366		len = xdp->data_end - xdp->data;
367		return xsk_rcv_zc(xs, xdp, len);
368	}
369
370	err = __xsk_rcv(xs, xdp, len);
371	if (!err)
372		xdp_return_buff(xdp);
373	return err;
374}
375
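/* Called for XDP_REDIRECT into an xskmap. The buffer is received on the
 * socket and the socket is added to the per-CPU flush list, which
 * __xsk_map_flush() walks at the end of the NAPI poll to publish the
 * results and wake up readers.
 */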
376int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
377{
378	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
379	int err;
380
381	err = xsk_rcv(xs, xdp);
382	if (err)
383		return err;
384
385	if (!xs->flush_node.prev)
386		list_add(&xs->flush_node, flush_list);
387
388	return 0;
389}
390
391void __xsk_map_flush(void)
392{
393	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
394	struct xdp_sock *xs, *tmp;
395
396	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
397		xsk_flush(xs);
398		__list_del_clearprev(&xs->flush_node);
399	}
400}
401
402#ifdef CONFIG_DEBUG_NET
403bool xsk_map_check_flush(void)
404{
405	if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
406		return false;
407	__xsk_map_flush();
408	return true;
409}
410#endif
411
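/* Driver API: mark @nb_entries Tx descriptors as completed by moving the
 * completion ring's producer pointer forward, so that user space can
 * reuse the corresponding umem addresses.
 */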
412void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
413{
414	xskq_prod_submit_n(pool->cq, nb_entries);
415}
416EXPORT_SYMBOL(xsk_tx_completed);
417
418void xsk_tx_release(struct xsk_buff_pool *pool)
419{
420	struct xdp_sock *xs;
421
422	rcu_read_lock();
423	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
424		__xskq_cons_release(xs->tx);
425		if (xsk_tx_writeable(xs))
426			xs->sk.sk_write_space(&xs->sk);
427	}
428	rcu_read_unlock();
429}
430EXPORT_SYMBOL(xsk_tx_release);
431
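/* Driver API: fetch the next Tx descriptor from any socket sharing this
 * pool. A per-socket budget (MAX_PER_SOCKET_BUDGET) keeps one busy
 * socket from starving the others; if the scan ends without a descriptor
 * while some socket was skipped for budget reasons, the budgets are
 * reset and the scan is retried.
 */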
432bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
433{
434	bool budget_exhausted = false;
435	struct xdp_sock *xs;
436
437	rcu_read_lock();
438again:
439	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
440		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
441			budget_exhausted = true;
442			continue;
443		}
444
445		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
446			if (xskq_has_descs(xs->tx))
447				xskq_cons_release(xs->tx);
448			continue;
449		}
450
451		xs->tx_budget_spent++;
452
453		/* This is the backpressure mechanism for the Tx path.
454		 * Reserve space in the completion queue and only proceed
455		 * if there is space in it. This avoids having to implement
456		 * any buffering in the Tx path.
457		 */
458		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
459			goto out;
460
461		xskq_cons_release(xs->tx);
462		rcu_read_unlock();
463		return true;
464	}
465
466	if (budget_exhausted) {
467		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
468			xs->tx_budget_spent = 0;
469
470		budget_exhausted = false;
471		goto again;
472	}
473
474out:
475	rcu_read_unlock();
476	return false;
477}
478EXPORT_SYMBOL(xsk_tx_peek_desc);
479
480static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
481{
482	struct xdp_desc *descs = pool->tx_descs;
483	u32 nb_pkts = 0;
484
485	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
486		nb_pkts++;
487
488	xsk_tx_release(pool);
489	return nb_pkts;
490}
491
492u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
493{
494	struct xdp_sock *xs;
495
496	rcu_read_lock();
497	if (!list_is_singular(&pool->xsk_tx_list)) {
498		/* Fallback to the non-batched version */
499		rcu_read_unlock();
500		return xsk_tx_peek_release_fallback(pool, nb_pkts);
501	}
502
503	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
504	if (!xs) {
505		nb_pkts = 0;
506		goto out;
507	}
508
509	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
510
511	/* This is the backpressure mechanism for the Tx path. Try to
512	 * reserve space in the completion queue for all packets, but
513	 * if there are fewer slots available, just process that many
514	 * packets. This avoids having to implement any buffering in
515	 * the Tx path.
516	 */
517	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
518	if (!nb_pkts)
519		goto out;
520
521	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
522	if (!nb_pkts) {
523		xs->tx->queue_empty_descs++;
524		goto out;
525	}
526
527	__xskq_cons_release(xs->tx);
528	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
529	xs->sk.sk_write_space(&xs->sk);
530
531out:
532	rcu_read_unlock();
533	return nb_pkts;
534}
535EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
536
537static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
538{
539	struct net_device *dev = xs->dev;
540
541	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
542}
543
544static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
545{
546	unsigned long flags;
547	int ret;
548
549	spin_lock_irqsave(&xs->pool->cq_lock, flags);
550	ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
551	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
552
553	return ret;
554}
555
556static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
557{
558	unsigned long flags;
559
560	spin_lock_irqsave(&xs->pool->cq_lock, flags);
561	xskq_prod_submit_n(xs->pool->cq, n);
562	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
563}
564
565static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
566{
567	unsigned long flags;
568
569	spin_lock_irqsave(&xs->pool->cq_lock, flags);
570	xskq_prod_cancel_n(xs->pool->cq, n);
571	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
572}
573
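/* The number of Tx descriptors an skb covers is stashed in
 * skb_shinfo(skb)->destructor_arg while the skb is being built, so that
 * the matching number of completion ring entries can be submitted (on
 * destruction) or cancelled (on error).
 */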
574static u32 xsk_get_num_desc(struct sk_buff *skb)
575{
576	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
577}
578
579static void xsk_destruct_skb(struct sk_buff *skb)
580{
581	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
582
583	if (compl->tx_timestamp) {
584		/* sw completion timestamp, not a real one */
585		*compl->tx_timestamp = ktime_get_tai_fast_ns();
586	}
587
588	xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
589	sock_wfree(skb);
590}
591
592static void xsk_set_destructor_arg(struct sk_buff *skb)
593{
594	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
595
596	skb_shinfo(skb)->destructor_arg = (void *)num;
597}
598
599static void xsk_consume_skb(struct sk_buff *skb)
600{
601	struct xdp_sock *xs = xdp_sk(skb->sk);
602
603	skb->destructor = sock_wfree;
604	xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
605	/* Free skb without triggering the perf drop trace */
606	consume_skb(skb);
607	xs->skb = NULL;
608}
609
610static void xsk_drop_skb(struct sk_buff *skb)
611{
612	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
613	xsk_consume_skb(skb);
614}
615
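/* Build an skb whose payload references the umem pages directly instead
 * of being copied. Only used for devices flagged IFF_TX_SKB_NO_LINEAR,
 * as the resulting skb carries all of its data in page fragments.
 */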
616static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
617					      struct xdp_desc *desc)
618{
619	struct xsk_buff_pool *pool = xs->pool;
620	u32 hr, len, ts, offset, copy, copied;
621	struct sk_buff *skb = xs->skb;
622	struct page *page;
623	void *buffer;
624	int err, i;
625	u64 addr;
626
627	if (!skb) {
628		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
629
630		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
631		if (unlikely(!skb))
632			return ERR_PTR(err);
633
634		skb_reserve(skb, hr);
635	}
636
637	addr = desc->addr;
638	len = desc->len;
639	ts = pool->unaligned ? len : pool->chunk_size;
640
641	buffer = xsk_buff_raw_get_data(pool, addr);
642	offset = offset_in_page(buffer);
643	addr = buffer - pool->addrs;
644
645	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
646		if (unlikely(i >= MAX_SKB_FRAGS))
647			return ERR_PTR(-EOVERFLOW);
648
649		page = pool->umem->pgs[addr >> PAGE_SHIFT];
650		get_page(page);
651
652		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
653		skb_fill_page_desc(skb, i, page, offset, copy);
654
655		copied += copy;
656		addr += copy;
657		offset = 0;
658	}
659
660	skb->len += len;
661	skb->data_len += len;
662	skb->truesize += ts;
663
664	refcount_add(ts, &xs->sk.sk_wmem_alloc);
665
666	return skb;
667}
668
669static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
670				     struct xdp_desc *desc)
671{
672	struct xsk_tx_metadata *meta = NULL;
673	struct net_device *dev = xs->dev;
674	struct sk_buff *skb = xs->skb;
675	bool first_frag = false;
676	int err;
677
678	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
679		skb = xsk_build_skb_zerocopy(xs, desc);
680		if (IS_ERR(skb)) {
681			err = PTR_ERR(skb);
682			goto free_err;
683		}
684	} else {
685		u32 hr, tr, len;
686		void *buffer;
687
688		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
689		len = desc->len;
690
691		if (!skb) {
692			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
693			tr = dev->needed_tailroom;
694			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
695			if (unlikely(!skb))
696				goto free_err;
697
698			skb_reserve(skb, hr);
699			skb_put(skb, len);
700
701			err = skb_store_bits(skb, 0, buffer, len);
702			if (unlikely(err)) {
703				kfree_skb(skb);
704				goto free_err;
705			}
706
707			first_frag = true;
708		} else {
709			int nr_frags = skb_shinfo(skb)->nr_frags;
710			struct page *page;
711			u8 *vaddr;
712
713			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
714				err = -EOVERFLOW;
715				goto free_err;
716			}
717
718			page = alloc_page(xs->sk.sk_allocation);
719			if (unlikely(!page)) {
720				err = -EAGAIN;
721				goto free_err;
722			}
723
724			vaddr = kmap_local_page(page);
725			memcpy(vaddr, buffer, len);
726			kunmap_local(vaddr);
727
728			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
729			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
730		}
731
732		if (first_frag && desc->options & XDP_TX_METADATA) {
733			if (unlikely(xs->pool->tx_metadata_len == 0)) {
734				err = -EINVAL;
735				goto free_err;
736			}
737
738			meta = buffer - xs->pool->tx_metadata_len;
739			if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
740				err = -EINVAL;
741				goto free_err;
742			}
743
744			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
745				if (unlikely(meta->request.csum_start +
746					     meta->request.csum_offset +
747					     sizeof(__sum16) > len)) {
748					err = -EINVAL;
749					goto free_err;
750				}
751
752				skb->csum_start = hr + meta->request.csum_start;
753				skb->csum_offset = meta->request.csum_offset;
754				skb->ip_summed = CHECKSUM_PARTIAL;
755
756				if (unlikely(xs->pool->tx_sw_csum)) {
757					err = skb_checksum_help(skb);
758					if (err)
759						goto free_err;
760				}
761			}
762		}
763	}
764
765	skb->dev = dev;
766	skb->priority = READ_ONCE(xs->sk.sk_priority);
767	skb->mark = READ_ONCE(xs->sk.sk_mark);
768	skb->destructor = xsk_destruct_skb;
769	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
770	xsk_set_destructor_arg(skb);
771
772	return skb;
773
774free_err:
775	if (err == -EOVERFLOW) {
776		/* Drop the packet */
777		xsk_set_destructor_arg(xs->skb);
778		xsk_drop_skb(xs->skb);
779		xskq_cons_release(xs->tx);
780	} else {
781		/* Let application retry */
782		xsk_cq_cancel_locked(xs, 1);
783	}
784
785	return ERR_PTR(err);
786}
787
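/* Copy-mode transmit: dequeue up to TX_BATCH_SIZE descriptors from the
 * Tx ring, build skbs for them and hand the skbs straight to the
 * device's queue via __dev_direct_xmit(), reserving a completion ring
 * slot for each descriptor before it is sent.
 */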
788static int __xsk_generic_xmit(struct sock *sk)
789{
790	struct xdp_sock *xs = xdp_sk(sk);
791	u32 max_batch = TX_BATCH_SIZE;
792	bool sent_frame = false;
793	struct xdp_desc desc;
794	struct sk_buff *skb;
795	int err = 0;
796
797	mutex_lock(&xs->mutex);
798
799	/* Since we dropped the RCU read lock, the socket state might have changed. */
800	if (unlikely(!xsk_is_bound(xs))) {
801		err = -ENXIO;
802		goto out;
803	}
804
805	if (xs->queue_id >= xs->dev->real_num_tx_queues)
806		goto out;
807
808	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
809		if (max_batch-- == 0) {
810			err = -EAGAIN;
811			goto out;
812		}
813
814		/* This is the backpressure mechanism for the Tx path.
815		 * Reserve space in the completion queue and only proceed
816		 * if there is space in it. This avoids having to implement
817		 * any buffering in the Tx path.
818		 */
819		if (xsk_cq_reserve_addr_locked(xs, desc.addr))
820			goto out;
821
822		skb = xsk_build_skb(xs, &desc);
823		if (IS_ERR(skb)) {
824			err = PTR_ERR(skb);
825			if (err != -EOVERFLOW)
826				goto out;
827			err = 0;
828			continue;
829		}
830
831		xskq_cons_release(xs->tx);
832
833		if (xp_mb_desc(&desc)) {
834			xs->skb = skb;
835			continue;
836		}
837
838		err = __dev_direct_xmit(skb, xs->queue_id);
839		if (err == NETDEV_TX_BUSY) {
840			/* Tell user-space to retry the send */
841			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
842			xsk_consume_skb(skb);
843			err = -EAGAIN;
844			goto out;
845		}
846
847		/* Ignore NET_XMIT_CN as packet might have been sent */
848		if (err == NET_XMIT_DROP) {
849			/* SKB completed but not sent */
850			err = -EBUSY;
851			xs->skb = NULL;
852			goto out;
853		}
854
855		sent_frame = true;
856		xs->skb = NULL;
857	}
858
859	if (xskq_has_descs(xs->tx)) {
860		if (xs->skb)
861			xsk_drop_skb(xs->skb);
862		xskq_cons_release(xs->tx);
863	}
864
865out:
866	if (sent_frame)
867		if (xsk_tx_writeable(xs))
868			sk->sk_write_space(sk);
869
870	mutex_unlock(&xs->mutex);
871	return err;
872}
873
874static int xsk_generic_xmit(struct sock *sk)
875{
876	int ret;
877
878	/* Drop the RCU lock since the SKB path might sleep. */
879	rcu_read_unlock();
880	ret = __xsk_generic_xmit(sk);
881	/* Reacquire RCU lock before going into common code. */
882	rcu_read_lock();
883
884	return ret;
885}
886
887static bool xsk_no_wakeup(struct sock *sk)
888{
889#ifdef CONFIG_NET_RX_BUSY_POLL
890	/* Prefer busy-polling, skip the wakeup. */
891	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
892		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
893#else
894	return false;
895#endif
896}
897
898static int xsk_check_common(struct xdp_sock *xs)
899{
900	if (unlikely(!xsk_is_bound(xs)))
901		return -ENXIO;
902	if (unlikely(!(xs->dev->flags & IFF_UP)))
903		return -ENETDOWN;
904
905	return 0;
906}
907
908static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
909{
910	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
911	struct sock *sk = sock->sk;
912	struct xdp_sock *xs = xdp_sk(sk);
913	struct xsk_buff_pool *pool;
914	int err;
915
916	err = xsk_check_common(xs);
917	if (err)
918		return err;
919	if (unlikely(need_wait))
920		return -EOPNOTSUPP;
921	if (unlikely(!xs->tx))
922		return -ENOBUFS;
923
924	if (sk_can_busy_loop(sk)) {
925		if (xs->zc)
926			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
927		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
928	}
929
930	if (xs->zc && xsk_no_wakeup(sk))
931		return 0;
932
933	pool = xs->pool;
934	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
935		if (xs->zc)
936			return xsk_wakeup(xs, XDP_WAKEUP_TX);
937		return xsk_generic_xmit(sk);
938	}
939	return 0;
940}
941
942static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
943{
944	int ret;
945
946	rcu_read_lock();
947	ret = __xsk_sendmsg(sock, m, total_len);
948	rcu_read_unlock();
949
950	return ret;
951}
952
953static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
954{
955	bool need_wait = !(flags & MSG_DONTWAIT);
956	struct sock *sk = sock->sk;
957	struct xdp_sock *xs = xdp_sk(sk);
958	int err;
959
960	err = xsk_check_common(xs);
961	if (err)
962		return err;
963	if (unlikely(!xs->rx))
964		return -ENOBUFS;
965	if (unlikely(need_wait))
966		return -EOPNOTSUPP;
967
968	if (sk_can_busy_loop(sk))
969		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
970
971	if (xsk_no_wakeup(sk))
972		return 0;
973
974	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
975		return xsk_wakeup(xs, XDP_WAKEUP_RX);
976	return 0;
977}
978
979static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
980{
981	int ret;
982
983	rcu_read_lock();
984	ret = __xsk_recvmsg(sock, m, len, flags);
985	rcu_read_unlock();
986
987	return ret;
988}
989
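/* poll() handler. Besides reporting ring state, this also drives Tx: in
 * zero-copy mode the driver is woken up, and in copy mode the generic
 * transmit path is run directly.
 */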
990static __poll_t xsk_poll(struct file *file, struct socket *sock,
991			     struct poll_table_struct *wait)
992{
993	__poll_t mask = 0;
994	struct sock *sk = sock->sk;
995	struct xdp_sock *xs = xdp_sk(sk);
996	struct xsk_buff_pool *pool;
997
998	sock_poll_wait(file, sock, wait);
999
1000	rcu_read_lock();
1001	if (xsk_check_common(xs))
1002		goto out;
1003
1004	pool = xs->pool;
1005
1006	if (pool->cached_need_wakeup) {
1007		if (xs->zc)
1008			xsk_wakeup(xs, pool->cached_need_wakeup);
1009		else if (xs->tx)
1010			/* Poll needs to drive Tx also in copy mode */
1011			xsk_generic_xmit(sk);
1012	}
1013
1014	if (xs->rx && !xskq_prod_is_empty(xs->rx))
1015		mask |= EPOLLIN | EPOLLRDNORM;
1016	if (xs->tx && xsk_tx_writeable(xs))
1017		mask |= EPOLLOUT | EPOLLWRNORM;
1018out:
1019	rcu_read_unlock();
1020	return mask;
1021}
1022
1023static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
1024			  bool umem_queue)
1025{
1026	struct xsk_queue *q;
1027
1028	if (entries == 0 || *queue || !is_power_of_2(entries))
1029		return -EINVAL;
1030
1031	q = xskq_create(entries, umem_queue);
1032	if (!q)
1033		return -ENOMEM;
1034
1035	/* Make sure queue is ready before it can be seen by others */
1036	smp_wmb();
1037	WRITE_ONCE(*queue, q);
1038	return 0;
1039}
1040
1041static void xsk_unbind_dev(struct xdp_sock *xs)
1042{
1043	struct net_device *dev = xs->dev;
1044
1045	if (xs->state != XSK_BOUND)
1046		return;
1047	WRITE_ONCE(xs->state, XSK_UNBOUND);
1048
1049	/* Wait for driver to stop using the xdp socket. */
1050	xp_del_xsk(xs->pool, xs);
1051	synchronize_net();
1052	dev_put(dev);
1053}
1054
1055static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
1056					      struct xdp_sock __rcu ***map_entry)
1057{
1058	struct xsk_map *map = NULL;
1059	struct xsk_map_node *node;
1060
1061	*map_entry = NULL;
1062
1063	spin_lock_bh(&xs->map_list_lock);
1064	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
1065					node);
1066	if (node) {
1067		bpf_map_inc(&node->map->map);
1068		map = node->map;
1069		*map_entry = node->map_entry;
1070	}
1071	spin_unlock_bh(&xs->map_list_lock);
1072	return map;
1073}
1074
1075static void xsk_delete_from_maps(struct xdp_sock *xs)
1076{
1077	/* This function removes the current XDP socket from all the
1078	 * maps it resides in. We need to take extra care here, due to
1079	 * the two locks involved. Each map has a lock synchronizing
1080	 * updates to the entries, and each socket has a lock that
1081	 * synchronizes access to the list of maps (map_list). For
1082	 * deadlock avoidance the locks need to be taken in the order
1083	 * "map lock"->"socket map list lock". We start off by
1084	 * accessing the socket map list, and take a reference to the
1085	 * map to guarantee existence between the
1086	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
1087	 * calls. Then we ask the map to remove the socket, which
1088	 * tries to remove the socket from the map. Note that there
1089	 * might be updates to the map between
1090	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
1091	 */
1092	struct xdp_sock __rcu **map_entry = NULL;
1093	struct xsk_map *map;
1094
1095	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
1096		xsk_map_try_sock_delete(map, xs, map_entry);
1097		bpf_map_put(&map->map);
1098	}
1099}
1100
1101static int xsk_release(struct socket *sock)
1102{
1103	struct sock *sk = sock->sk;
1104	struct xdp_sock *xs = xdp_sk(sk);
1105	struct net *net;
1106
1107	if (!sk)
1108		return 0;
1109
1110	net = sock_net(sk);
1111
1112	if (xs->skb)
1113		xsk_drop_skb(xs->skb);
1114
1115	mutex_lock(&net->xdp.lock);
1116	sk_del_node_init_rcu(sk);
1117	mutex_unlock(&net->xdp.lock);
1118
1119	sock_prot_inuse_add(net, sk->sk_prot, -1);
1120
1121	xsk_delete_from_maps(xs);
1122	mutex_lock(&xs->mutex);
1123	xsk_unbind_dev(xs);
1124	mutex_unlock(&xs->mutex);
1125
1126	xskq_destroy(xs->rx);
1127	xskq_destroy(xs->tx);
1128	xskq_destroy(xs->fq_tmp);
1129	xskq_destroy(xs->cq_tmp);
1130
1131	sock_orphan(sk);
1132	sock->sk = NULL;
1133
1134	sock_put(sk);
1135
1136	return 0;
1137}
1138
1139static struct socket *xsk_lookup_xsk_from_fd(int fd)
1140{
1141	struct socket *sock;
1142	int err;
1143
1144	sock = sockfd_lookup(fd, &err);
1145	if (!sock)
1146		return ERR_PTR(-ENOTSOCK);
1147
1148	if (sock->sk->sk_family != PF_XDP) {
1149		sockfd_put(sock);
1150		return ERR_PTR(-ENOPROTOOPT);
1151	}
1152
1153	return sock;
1154}
1155
1156static bool xsk_validate_queues(struct xdp_sock *xs)
1157{
1158	return xs->fq_tmp && xs->cq_tmp;
1159}
1160
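/* Bind the socket to a device/queue pair. With XDP_SHARED_UMEM the umem,
 * and possibly the whole buffer pool, is borrowed from another already
 * bound AF_XDP socket identified by sxdp_shared_umem_fd.
 */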
1161static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
1162{
1163	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
1164	struct sock *sk = sock->sk;
1165	struct xdp_sock *xs = xdp_sk(sk);
1166	struct net_device *dev;
1167	int bound_dev_if;
1168	u32 flags, qid;
1169	int err = 0;
1170
1171	if (addr_len < sizeof(struct sockaddr_xdp))
1172		return -EINVAL;
1173	if (sxdp->sxdp_family != AF_XDP)
1174		return -EINVAL;
1175
1176	flags = sxdp->sxdp_flags;
1177	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
1178		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
1179		return -EINVAL;
1180
1181	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
1182	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
1183		return -EINVAL;
1184
1185	rtnl_lock();
1186	mutex_lock(&xs->mutex);
1187	if (xs->state != XSK_READY) {
1188		err = -EBUSY;
1189		goto out_release;
1190	}
1191
1192	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
1193	if (!dev) {
1194		err = -ENODEV;
1195		goto out_release;
1196	}
1197
1198	if (!xs->rx && !xs->tx) {
1199		err = -EINVAL;
1200		goto out_unlock;
1201	}
1202
1203	qid = sxdp->sxdp_queue_id;
1204
1205	if (flags & XDP_SHARED_UMEM) {
1206		struct xdp_sock *umem_xs;
1207		struct socket *sock;
1208
1209		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
1210		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
1211			/* Cannot specify flags for shared sockets. */
1212			err = -EINVAL;
1213			goto out_unlock;
1214		}
1215
1216		if (xs->umem) {
1217			/* We already have our own umem. */
1218			err = -EINVAL;
1219			goto out_unlock;
1220		}
1221
1222		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
1223		if (IS_ERR(sock)) {
1224			err = PTR_ERR(sock);
1225			goto out_unlock;
1226		}
1227
1228		umem_xs = xdp_sk(sock->sk);
1229		if (!xsk_is_bound(umem_xs)) {
1230			err = -EBADF;
1231			sockfd_put(sock);
1232			goto out_unlock;
1233		}
1234
1235		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
1236			/* Share the umem with another socket on another qid
1237			 * and/or device.
1238			 */
1239			xs->pool = xp_create_and_assign_umem(xs,
1240							     umem_xs->umem);
1241			if (!xs->pool) {
1242				err = -ENOMEM;
1243				sockfd_put(sock);
1244				goto out_unlock;
1245			}
1246
1247			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
1248						   qid);
1249			if (err) {
1250				xp_destroy(xs->pool);
1251				xs->pool = NULL;
1252				sockfd_put(sock);
1253				goto out_unlock;
1254			}
1255		} else {
1256			/* Share the buffer pool with the other socket. */
1257			if (xs->fq_tmp || xs->cq_tmp) {
1258				/* Do not allow setting your own fq or cq. */
1259				err = -EINVAL;
1260				sockfd_put(sock);
1261				goto out_unlock;
1262			}
1263
1264			xp_get_pool(umem_xs->pool);
1265			xs->pool = umem_xs->pool;
1266
1267			/* If the underlying shared umem was created without a Tx
1268			 * ring, allocate the Tx descs array that the Tx batching
1269			 * API utilizes.
1270			 */
1271			if (xs->tx && !xs->pool->tx_descs) {
1272				err = xp_alloc_tx_descs(xs->pool, xs);
1273				if (err) {
1274					xp_put_pool(xs->pool);
1275					xs->pool = NULL;
1276					sockfd_put(sock);
1277					goto out_unlock;
1278				}
1279			}
1280		}
1281
1282		xdp_get_umem(umem_xs->umem);
1283		WRITE_ONCE(xs->umem, umem_xs->umem);
1284		sockfd_put(sock);
1285	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1286		err = -EINVAL;
1287		goto out_unlock;
1288	} else {
1289		/* This xsk has its own umem. */
1290		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1291		if (!xs->pool) {
1292			err = -ENOMEM;
1293			goto out_unlock;
1294		}
1295
1296		err = xp_assign_dev(xs->pool, dev, qid, flags);
1297		if (err) {
1298			xp_destroy(xs->pool);
1299			xs->pool = NULL;
1300			goto out_unlock;
1301		}
1302	}
1303
1304	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1305	xs->fq_tmp = NULL;
1306	xs->cq_tmp = NULL;
1307
1308	xs->dev = dev;
1309	xs->zc = xs->umem->zc;
1310	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
1311	xs->queue_id = qid;
1312	xp_add_xsk(xs->pool, xs);
1313
1314out_unlock:
1315	if (err) {
1316		dev_put(dev);
1317	} else {
1318		/* Matches smp_rmb() in bind() for shared umem
1319		 * sockets, and xsk_is_bound().
1320		 */
1321		smp_wmb();
1322		WRITE_ONCE(xs->state, XSK_BOUND);
1323	}
1324out_release:
1325	mutex_unlock(&xs->mutex);
1326	rtnl_unlock();
1327	return err;
1328}
1329
1330struct xdp_umem_reg_v1 {
1331	__u64 addr; /* Start of packet data area */
1332	__u64 len; /* Length of packet data area */
1333	__u32 chunk_size;
1334	__u32 headroom;
1335};
1336
1337struct xdp_umem_reg_v2 {
1338	__u64 addr; /* Start of packet data area */
1339	__u64 len; /* Length of packet data area */
1340	__u32 chunk_size;
1341	__u32 headroom;
1342	__u32 flags;
1343};
1344
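/* A typical user-space setup sequence (sketch): register the umem with
 * setsockopt(fd, SOL_XDP, XDP_UMEM_REG, ...), size the rings with
 * XDP_UMEM_FILL_RING, XDP_UMEM_COMPLETION_RING, XDP_RX_RING and/or
 * XDP_TX_RING, mmap() them using the offsets from XDP_MMAP_OFFSETS, and
 * finally bind() to a device/queue pair.
 */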
1345static int xsk_setsockopt(struct socket *sock, int level, int optname,
1346			  sockptr_t optval, unsigned int optlen)
1347{
1348	struct sock *sk = sock->sk;
1349	struct xdp_sock *xs = xdp_sk(sk);
1350	int err;
1351
1352	if (level != SOL_XDP)
1353		return -ENOPROTOOPT;
1354
1355	switch (optname) {
1356	case XDP_RX_RING:
1357	case XDP_TX_RING:
1358	{
1359		struct xsk_queue **q;
1360		int entries;
1361
1362		if (optlen < sizeof(entries))
1363			return -EINVAL;
1364		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1365			return -EFAULT;
1366
1367		mutex_lock(&xs->mutex);
1368		if (xs->state != XSK_READY) {
1369			mutex_unlock(&xs->mutex);
1370			return -EBUSY;
1371		}
1372		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1373		err = xsk_init_queue(entries, q, false);
1374		if (!err && optname == XDP_TX_RING)
1375			/* Tx needs to be explicitly woken up the first time */
1376			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1377		mutex_unlock(&xs->mutex);
1378		return err;
1379	}
1380	case XDP_UMEM_REG:
1381	{
1382		size_t mr_size = sizeof(struct xdp_umem_reg);
1383		struct xdp_umem_reg mr = {};
1384		struct xdp_umem *umem;
1385
1386		if (optlen < sizeof(struct xdp_umem_reg_v1))
1387			return -EINVAL;
1388		else if (optlen < sizeof(struct xdp_umem_reg_v2))
1389			mr_size = sizeof(struct xdp_umem_reg_v1);
1390		else if (optlen < sizeof(mr))
1391			mr_size = sizeof(struct xdp_umem_reg_v2);
1392
1393		if (copy_from_sockptr(&mr, optval, mr_size))
1394			return -EFAULT;
1395
1396		mutex_lock(&xs->mutex);
1397		if (xs->state != XSK_READY || xs->umem) {
1398			mutex_unlock(&xs->mutex);
1399			return -EBUSY;
1400		}
1401
1402		umem = xdp_umem_create(&mr);
1403		if (IS_ERR(umem)) {
1404			mutex_unlock(&xs->mutex);
1405			return PTR_ERR(umem);
1406		}
1407
1408		/* Make sure umem is ready before it can be seen by others */
1409		smp_wmb();
1410		WRITE_ONCE(xs->umem, umem);
1411		mutex_unlock(&xs->mutex);
1412		return 0;
1413	}
1414	case XDP_UMEM_FILL_RING:
1415	case XDP_UMEM_COMPLETION_RING:
1416	{
1417		struct xsk_queue **q;
1418		int entries;
1419
1420		if (optlen < sizeof(entries))
1421			return -EINVAL;
1422		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1423			return -EFAULT;
1424
1425		mutex_lock(&xs->mutex);
1426		if (xs->state != XSK_READY) {
1427			mutex_unlock(&xs->mutex);
1428			return -EBUSY;
1429		}
1430
1431		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1432			&xs->cq_tmp;
1433		err = xsk_init_queue(entries, q, true);
1434		mutex_unlock(&xs->mutex);
1435		return err;
1436	}
1437	default:
1438		break;
1439	}
1440
1441	return -ENOPROTOOPT;
1442}
1443
1444static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1445{
1446	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1447	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1448	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1449}
1450
1451static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1452{
1453	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1454	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1455	ring->desc = offsetof(struct xdp_umem_ring, desc);
1456}
1457
1458struct xdp_statistics_v1 {
1459	__u64 rx_dropped;
1460	__u64 rx_invalid_descs;
1461	__u64 tx_invalid_descs;
1462};
1463
1464static int xsk_getsockopt(struct socket *sock, int level, int optname,
1465			  char __user *optval, int __user *optlen)
1466{
1467	struct sock *sk = sock->sk;
1468	struct xdp_sock *xs = xdp_sk(sk);
1469	int len;
1470
1471	if (level != SOL_XDP)
1472		return -ENOPROTOOPT;
1473
1474	if (get_user(len, optlen))
1475		return -EFAULT;
1476	if (len < 0)
1477		return -EINVAL;
1478
1479	switch (optname) {
1480	case XDP_STATISTICS:
1481	{
1482		struct xdp_statistics stats = {};
1483		bool extra_stats = true;
1484		size_t stats_size;
1485
1486		if (len < sizeof(struct xdp_statistics_v1)) {
1487			return -EINVAL;
1488		} else if (len < sizeof(stats)) {
1489			extra_stats = false;
1490			stats_size = sizeof(struct xdp_statistics_v1);
1491		} else {
1492			stats_size = sizeof(stats);
1493		}
1494
1495		mutex_lock(&xs->mutex);
1496		stats.rx_dropped = xs->rx_dropped;
1497		if (extra_stats) {
1498			stats.rx_ring_full = xs->rx_queue_full;
1499			stats.rx_fill_ring_empty_descs =
1500				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1501			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1502		} else {
1503			stats.rx_dropped += xs->rx_queue_full;
1504		}
1505		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1506		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1507		mutex_unlock(&xs->mutex);
1508
1509		if (copy_to_user(optval, &stats, stats_size))
1510			return -EFAULT;
1511		if (put_user(stats_size, optlen))
1512			return -EFAULT;
1513
1514		return 0;
1515	}
1516	case XDP_MMAP_OFFSETS:
1517	{
1518		struct xdp_mmap_offsets off;
1519		struct xdp_mmap_offsets_v1 off_v1;
1520		bool flags_supported = true;
1521		void *to_copy;
1522
1523		if (len < sizeof(off_v1))
1524			return -EINVAL;
1525		else if (len < sizeof(off))
1526			flags_supported = false;
1527
1528		if (flags_supported) {
1529			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1530			 * except for the flags field added to the end.
1531			 */
1532			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1533					       &off.rx);
1534			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1535					       &off.tx);
1536			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1537					       &off.fr);
1538			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1539					       &off.cr);
1540			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1541						ptrs.flags);
1542			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1543						ptrs.flags);
1544			off.fr.flags = offsetof(struct xdp_umem_ring,
1545						ptrs.flags);
1546			off.cr.flags = offsetof(struct xdp_umem_ring,
1547						ptrs.flags);
1548
1549			len = sizeof(off);
1550			to_copy = &off;
1551		} else {
1552			xsk_enter_rxtx_offsets(&off_v1.rx);
1553			xsk_enter_rxtx_offsets(&off_v1.tx);
1554			xsk_enter_umem_offsets(&off_v1.fr);
1555			xsk_enter_umem_offsets(&off_v1.cr);
1556
1557			len = sizeof(off_v1);
1558			to_copy = &off_v1;
1559		}
1560
1561		if (copy_to_user(optval, to_copy, len))
1562			return -EFAULT;
1563		if (put_user(len, optlen))
1564			return -EFAULT;
1565
1566		return 0;
1567	}
1568	case XDP_OPTIONS:
1569	{
1570		struct xdp_options opts = {};
1571
1572		if (len < sizeof(opts))
1573			return -EINVAL;
1574
1575		mutex_lock(&xs->mutex);
1576		if (xs->zc)
1577			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1578		mutex_unlock(&xs->mutex);
1579
1580		len = sizeof(opts);
1581		if (copy_to_user(optval, &opts, len))
1582			return -EFAULT;
1583		if (put_user(len, optlen))
1584			return -EFAULT;
1585
1586		return 0;
1587	}
1588	default:
1589		break;
1590	}
1591
1592	return -EOPNOTSUPP;
1593}
1594
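/* mmap() handler: map one of the rings into user space. The ring is
 * selected via the page offset (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING,
 * XDP_UMEM_PGOFF_FILL_RING or XDP_UMEM_PGOFF_COMPLETION_RING).
 */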
1595static int xsk_mmap(struct file *file, struct socket *sock,
1596		    struct vm_area_struct *vma)
1597{
1598	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1599	unsigned long size = vma->vm_end - vma->vm_start;
1600	struct xdp_sock *xs = xdp_sk(sock->sk);
1601	int state = READ_ONCE(xs->state);
1602	struct xsk_queue *q = NULL;
1603
1604	if (state != XSK_READY && state != XSK_BOUND)
1605		return -EBUSY;
1606
1607	if (offset == XDP_PGOFF_RX_RING) {
1608		q = READ_ONCE(xs->rx);
1609	} else if (offset == XDP_PGOFF_TX_RING) {
1610		q = READ_ONCE(xs->tx);
1611	} else {
1612		/* Matches the smp_wmb() in XDP_UMEM_REG */
1613		smp_rmb();
1614		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1615			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
1616						 READ_ONCE(xs->pool->fq);
1617		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1618			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
1619						 READ_ONCE(xs->pool->cq);
1620	}
1621
1622	if (!q)
1623		return -EINVAL;
1624
1625	/* Matches the smp_wmb() in xsk_init_queue */
1626	smp_rmb();
1627	if (size > q->ring_vmalloc_size)
1628		return -EINVAL;
1629
1630	return remap_vmalloc_range(vma, q->ring, 0);
1631}
1632
1633static int xsk_notifier(struct notifier_block *this,
1634			unsigned long msg, void *ptr)
1635{
1636	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1637	struct net *net = dev_net(dev);
1638	struct sock *sk;
1639
1640	switch (msg) {
1641	case NETDEV_UNREGISTER:
1642		mutex_lock(&net->xdp.lock);
1643		sk_for_each(sk, &net->xdp.list) {
1644			struct xdp_sock *xs = xdp_sk(sk);
1645
1646			mutex_lock(&xs->mutex);
1647			if (xs->dev == dev) {
1648				sk->sk_err = ENETDOWN;
1649				if (!sock_flag(sk, SOCK_DEAD))
1650					sk_error_report(sk);
1651
1652				xsk_unbind_dev(xs);
1653
1654				/* Clear device references. */
1655				xp_clear_dev(xs->pool);
1656			}
1657			mutex_unlock(&xs->mutex);
1658		}
1659		mutex_unlock(&net->xdp.lock);
1660		break;
1661	}
1662	return NOTIFY_DONE;
1663}
1664
1665static struct proto xsk_proto = {
1666	.name =		"XDP",
1667	.owner =	THIS_MODULE,
1668	.obj_size =	sizeof(struct xdp_sock),
1669};
1670
1671static const struct proto_ops xsk_proto_ops = {
1672	.family		= PF_XDP,
1673	.owner		= THIS_MODULE,
1674	.release	= xsk_release,
1675	.bind		= xsk_bind,
1676	.connect	= sock_no_connect,
1677	.socketpair	= sock_no_socketpair,
1678	.accept		= sock_no_accept,
1679	.getname	= sock_no_getname,
1680	.poll		= xsk_poll,
1681	.ioctl		= sock_no_ioctl,
1682	.listen		= sock_no_listen,
1683	.shutdown	= sock_no_shutdown,
1684	.setsockopt	= xsk_setsockopt,
1685	.getsockopt	= xsk_getsockopt,
1686	.sendmsg	= xsk_sendmsg,
1687	.recvmsg	= xsk_recvmsg,
1688	.mmap		= xsk_mmap,
1689};
1690
1691static void xsk_destruct(struct sock *sk)
1692{
1693	struct xdp_sock *xs = xdp_sk(sk);
1694
1695	if (!sock_flag(sk, SOCK_DEAD))
1696		return;
1697
1698	if (!xp_put_pool(xs->pool))
1699		xdp_put_umem(xs->umem, !xs->pool);
1700}
1701
1702static int xsk_create(struct net *net, struct socket *sock, int protocol,
1703		      int kern)
1704{
1705	struct xdp_sock *xs;
1706	struct sock *sk;
1707
1708	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1709		return -EPERM;
1710	if (sock->type != SOCK_RAW)
1711		return -ESOCKTNOSUPPORT;
1712
1713	if (protocol)
1714		return -EPROTONOSUPPORT;
1715
1716	sock->state = SS_UNCONNECTED;
1717
1718	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1719	if (!sk)
1720		return -ENOBUFS;
1721
1722	sock->ops = &xsk_proto_ops;
1723
1724	sock_init_data(sock, sk);
1725
1726	sk->sk_family = PF_XDP;
1727
1728	sk->sk_destruct = xsk_destruct;
1729
1730	sock_set_flag(sk, SOCK_RCU_FREE);
1731
1732	xs = xdp_sk(sk);
1733	xs->state = XSK_READY;
1734	mutex_init(&xs->mutex);
1735	spin_lock_init(&xs->rx_lock);
1736
1737	INIT_LIST_HEAD(&xs->map_list);
1738	spin_lock_init(&xs->map_list_lock);
1739
1740	mutex_lock(&net->xdp.lock);
1741	sk_add_node_rcu(sk, &net->xdp.list);
1742	mutex_unlock(&net->xdp.lock);
1743
1744	sock_prot_inuse_add(net, &xsk_proto, 1);
1745
1746	return 0;
1747}
1748
1749static const struct net_proto_family xsk_family_ops = {
1750	.family = PF_XDP,
1751	.create = xsk_create,
1752	.owner	= THIS_MODULE,
1753};
1754
1755static struct notifier_block xsk_netdev_notifier = {
1756	.notifier_call	= xsk_notifier,
1757};
1758
1759static int __net_init xsk_net_init(struct net *net)
1760{
1761	mutex_init(&net->xdp.lock);
1762	INIT_HLIST_HEAD(&net->xdp.list);
1763	return 0;
1764}
1765
1766static void __net_exit xsk_net_exit(struct net *net)
1767{
1768	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1769}
1770
1771static struct pernet_operations xsk_net_ops = {
1772	.init = xsk_net_init,
1773	.exit = xsk_net_exit,
1774};
1775
1776static int __init xsk_init(void)
1777{
1778	int err, cpu;
1779
1780	err = proto_register(&xsk_proto, 0 /* no slab */);
1781	if (err)
1782		goto out;
1783
1784	err = sock_register(&xsk_family_ops);
1785	if (err)
1786		goto out_proto;
1787
1788	err = register_pernet_subsys(&xsk_net_ops);
1789	if (err)
1790		goto out_sk;
1791
1792	err = register_netdevice_notifier(&xsk_netdev_notifier);
1793	if (err)
1794		goto out_pernet;
1795
1796	for_each_possible_cpu(cpu)
1797		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1798	return 0;
1799
1800out_pernet:
1801	unregister_pernet_subsys(&xsk_net_ops);
1802out_sk:
1803	sock_unregister(PF_XDP);
1804out_proto:
1805	proto_unregister(&xsk_proto);
1806out:
1807	return err;
1808}
1809
1810fs_initcall(xsk_init);
1811