mlx5_en_tx.c revision 324522
/*-
 * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/11/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c 324522 2017-10-11 10:00:58Z hselasky $
 */

#include "en.h"
#include <machine/atomic.h>

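/*
 * Completion event moderation: request a CQE only for every
 * "cev_factor"th work request. Returns true when the current WQE
 * should set the CQ update flag.
 */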
static inline bool
mlx5e_do_send_cqe(struct mlx5e_sq *sq)
{
	sq->cev_counter++;
	/* interleave the CQEs */
	if (sq->cev_counter >= sq->cev_factor) {
		sq->cev_counter = 0;
		return (1);
	}
	return (0);
}

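/*
 * Post a NOP WQE covering "ds_cnt" data segments. Used, among other
 * things, to pad the send queue so that a real WQE does not wrap
 * around the end of the work queue.
 */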
void
mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt)
{
	u16 pi = sq->pc & sq->wq.sz_m1;
	struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);

	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
	if (mlx5e_do_send_cqe(sq))
		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
	else
		wqe->ctrl.fm_ce_se = 0;

	/* Copy data for doorbell */
	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));

	sq->mbuf[pi].mbuf = NULL;
	sq->mbuf[pi].num_bytes = 0;
	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	sq->pc += sq->mbuf[pi].num_wqebbs;
}

#if (__FreeBSD_version >= 1100000)
static uint32_t mlx5e_hash_value;

static void
mlx5e_hash_init(void *arg)
{
	mlx5e_hash_value = m_ether_tcpip_hash_init();
}

/*
 * Make the kernel call mlx5e_hash_init() after the random subsystem
 * has finished initializing.
 */
SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL);
#endif

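/*
 * Select a send queue for the given mbuf: the VLAN priority, when
 * present, picks the traffic class and the flow ID (or a software
 * hash of the headers) picks the channel. Returns NULL if no usable
 * send queue exists.
 */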
static struct mlx5e_sq *
mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb)
{
	struct mlx5e_priv *priv = ifp->if_softc;
	struct mlx5e_channel * volatile *ppch;
	struct mlx5e_channel *pch;
	u32 ch;
	u32 tc;

	ppch = priv->channel;

	/* check if channels are successfully opened */
	if (unlikely(ppch == NULL))
		return (NULL);

	/* obtain VLAN information if present */
	if (mb->m_flags & M_VLANTAG) {
		tc = (mb->m_pkthdr.ether_vtag >> 13);
		if (tc >= priv->num_tc)
			tc = priv->default_vlan_prio;
	} else {
		tc = priv->default_vlan_prio;
	}

	ch = priv->params.num_channels;

	/* check if flowid is set */
	if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) {
#ifdef RSS
		u32 temp;

		if (rss_hash2bucket(mb->m_pkthdr.flowid,
		    M_HASHTYPE_GET(mb), &temp) == 0)
			ch = temp % ch;
		else
#endif
			ch = (mb->m_pkthdr.flowid % 128) % ch;
	} else {
#if (__FreeBSD_version >= 1100000)
		ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 |
		    MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch;
#else
		/*
		 * m_ether_tcpip_hash not present in stable, so just
		 * throw unhashed mbufs on queue 0
		 */
		ch = 0;
#endif
	}

	/* check if channel is allocated and not stopped */
	pch = ppch[ch];
	if (likely(pch != NULL && pch->sq[tc].stopped == 0))
		return (&pch->sq[tc]);
	return (NULL);
}

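/*
 * Number of header bytes to inline into the WQE for non-TSO frames,
 * limited to MLX5E_MAX_TX_INLINE.
 */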
static inline u16
mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, struct mbuf *mb)
{
	return (MIN(MLX5E_MAX_TX_INLINE, mb->m_len));
}

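/*
 * Compute the combined Ethernet/VLAN, IPv4 or IPv6 and TCP header
 * length of a TSO mbuf. Returns zero if the headers are not
 * contiguous in the first mbuf or the packet is not TCP.
 */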
static int
mlx5e_get_header_size(struct mbuf *mb)
{
	struct ether_vlan_header *eh;
	struct tcphdr *th;
	struct ip *ip;
	int ip_hlen, tcp_hlen;
	struct ip6_hdr *ip6;
	uint16_t eth_type;
	int eth_hdr_len;

	eh = mtod(mb, struct ether_vlan_header *);
	if (mb->m_len < ETHER_HDR_LEN)
		return (0);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		eth_type = ntohs(eh->evl_proto);
		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		eth_type = ntohs(eh->evl_encap_proto);
		eth_hdr_len = ETHER_HDR_LEN;
	}
	if (mb->m_len < eth_hdr_len)
		return (0);
	switch (eth_type) {
	case ETHERTYPE_IP:
		ip = (struct ip *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip))
			return (0);
		if (ip->ip_p != IPPROTO_TCP)
			return (0);
		ip_hlen = ip->ip_hl << 2;
		eth_hdr_len += ip_hlen;
		break;
	case ETHERTYPE_IPV6:
		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
			return (0);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (0);
		eth_hdr_len += sizeof(*ip6);
		break;
	default:
		return (0);
	}
	if (mb->m_len < eth_hdr_len + sizeof(*th))
		return (0);
	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
	tcp_hlen = th->th_off << 2;
	eth_hdr_len += tcp_hlen;
	if (mb->m_len < eth_hdr_len)
		return (0);
	return (eth_hdr_len);
}

/*
 * The return value is not passed back to the network stack,
 * because transmission goes through the drbr.
 */
static int
mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp)
{
	bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS];
	struct mlx5_wqe_data_seg *dseg;
	struct mlx5e_tx_wqe *wqe;
	struct ifnet *ifp;
	int nsegs;
	int err;
	int x;
	struct mbuf *mb = *mbp;
	u16 ds_cnt;
	u16 ihs;
	u16 pi;
	u8 opcode;

	/*
	 * Return ENOBUFS if the queue is full, this may trigger reinsertion
	 * of the mbuf into the drbr (see mlx5e_xmit_locked)
	 */
	if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) {
		return (ENOBUFS);
	}

	/* Align SQ edge with NOPs to avoid WQE wrap around */
	pi = ((~sq->pc) & sq->wq.sz_m1);
	if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
		/* Send one multi NOP message instead of many */
		mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS);
		pi = ((~sq->pc) & sq->wq.sz_m1);
		if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1))
			return (ENOMEM);
	}

	/* Setup local variables */
	pi = sq->pc & sq->wq.sz_m1;
	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
	ifp = sq->ifp;

	memset(wqe, 0, sizeof(*wqe));

	/* Send a copy of the frame to the BPF listener, if any */
	if (ifp != NULL && ifp->if_bpf != NULL)
		ETHER_BPF_MTAP(ifp, mb);

	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) {
		wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM;
	}
	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) {
		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM;
	}
	if (wqe->eth.cs_flags == 0) {
		sq->stats.csum_offload_none++;
	}
	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
		u32 payload_len;
		u32 mss = mb->m_pkthdr.tso_segsz;
		u32 num_pkts;

		wqe->eth.mss = cpu_to_be16(mss);
		opcode = MLX5_OPCODE_LSO;
		ihs = mlx5e_get_header_size(mb);
		payload_len = mb->m_pkthdr.len - ihs;
		if (payload_len == 0)
			num_pkts = 1;
		else
			num_pkts = DIV_ROUND_UP(payload_len, mss);
		sq->mbuf[pi].num_bytes = payload_len + (num_pkts * ihs);

		sq->stats.tso_packets++;
		sq->stats.tso_bytes += payload_len;
	} else {
		opcode = MLX5_OPCODE_SEND;
		ihs = mlx5e_get_inline_hdr_size(sq, mb);
		sq->mbuf[pi].num_bytes = max_t(unsigned int,
		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
	}
	if (mb->m_flags & M_VLANTAG) {
		struct ether_vlan_header *eh =
		    (struct ether_vlan_header *)wqe->eth.inline_hdr_start;

		/* Range checks */
		if (ihs > (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN))
			ihs = (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN);
		else if (ihs < ETHER_HDR_LEN) {
			err = EINVAL;
			goto tx_drop;
		}
		m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh);
		m_adj(mb, ETHER_HDR_LEN);
		/* Insert 4 bytes VLAN tag into data stream */
		eh->evl_proto = eh->evl_encap_proto;
		eh->evl_encap_proto = htons(ETHERTYPE_VLAN);
		eh->evl_tag = htons(mb->m_pkthdr.ether_vtag);
		/* Copy rest of header data, if any */
		m_copydata(mb, 0, ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1));
		m_adj(mb, ihs - ETHER_HDR_LEN);
		/* Extend header by 4 bytes */
		ihs += ETHER_VLAN_ENCAP_LEN;
	} else {
		m_copydata(mb, 0, ihs, wqe->eth.inline_hdr_start);
		m_adj(mb, ihs);
	}

	wqe->eth.inline_hdr_sz = cpu_to_be16(ihs);

	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
	if (likely(ihs > sizeof(wqe->eth.inline_hdr_start))) {
		ds_cnt += DIV_ROUND_UP(ihs - sizeof(wqe->eth.inline_hdr_start),
		    MLX5_SEND_WQE_DS);
	}
	dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt;

	/* Trim off empty mbufs */
	while (mb->m_len == 0) {
		mb = m_free(mb);
		/* Check if all data has been inlined */
		if (mb == NULL)
			goto skip_dma;
	}

	err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/*
		 * Update *mbp before defrag in case it was trimmed in the
		 * loop above
		 */
		*mbp = mb;
		/* Update statistics */
		sq->stats.defragged++;
		/* Too many mbuf fragments */
		mb = m_defrag(*mbp, M_NOWAIT);
		if (mb == NULL) {
			mb = *mbp;
			goto tx_drop;
		}
		/* Try again */
		err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
	}
	/* Catch errors */
	if (err != 0)
		goto tx_drop;

	for (x = 0; x != nsegs; x++) {
		if (segs[x].ds_len == 0)
			continue;
		dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr);
		dseg->lkey = sq->mkey_be;
		dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len);
		dseg++;
	}
skip_dma:
	ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl));

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
	if (mlx5e_do_send_cqe(sq))
		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
	else
		wqe->ctrl.fm_ce_se = 0;

	/* Copy data for doorbell */
	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));

	/* Store pointer to mbuf */
	sq->mbuf[pi].mbuf = mb;
	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	sq->pc += sq->mbuf[pi].num_wqebbs;

	/* Make sure all mbuf data is written to RAM */
	if (mb != NULL)
		bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, BUS_DMASYNC_PREWRITE);

	sq->stats.packets++;
	*mbp = NULL;	/* safety clear */
	return (0);

tx_drop:
	sq->stats.dropped++;
	*mbp = NULL;
	m_freem(mb);
	return (err);
}

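/*
 * Reclaim completed work requests. Because completions are
 * moderated, each CQE accounts for "cev_factor" WQEs; free the
 * corresponding mbufs and advance the consumer counter.
 */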
static void
mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
{
	u16 sqcc;

	/*
	 * sq->cc must be updated only after mlx5_cqwq_update_db_record(),
	 * otherwise a cq overrun may occur
	 */
	sqcc = sq->cc;

	while (budget > 0) {
		struct mlx5_cqe64 *cqe;
		struct mbuf *mb;
		u16 x;
		u16 ci;

		cqe = mlx5e_get_cqe(&sq->cq);
		if (!cqe)
			break;

		mlx5_cqwq_pop(&sq->cq.wq);

		/* update budget according to the event factor */
		budget -= sq->cev_factor;

		for (x = 0; x != sq->cev_factor; x++) {
			ci = sqcc & sq->wq.sz_m1;
			mb = sq->mbuf[ci].mbuf;
			sq->mbuf[ci].mbuf = NULL;	/* Safety clear */

			if (mb == NULL) {
				if (sq->mbuf[ci].num_bytes == 0) {
					/* NOP */
					sq->stats.nop++;
				}
			} else {
				bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
				    BUS_DMASYNC_POSTWRITE);
				bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);

				/* Free transmitted mbuf */
				m_freem(mb);
			}
			sqcc += sq->mbuf[ci].num_wqebbs;
		}
	}

	mlx5_cqwq_update_db_record(&sq->cq.wq);

	/* Ensure cq space is freed before enabling more cqes */
	wmb();

	sq->cc = sqcc;

	if (sq->sq_tq != NULL &&
	    atomic_cmpset_int(&sq->queue_state, MLX5E_SQ_FULL, MLX5E_SQ_READY))
		taskqueue_enqueue(sq->sq_tq, &sq->sq_task);
}

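/*
 * Enqueue the given mbuf, if any, on the drbr and transmit as many
 * queued mbufs as the send queue can take. Called with the SQ lock
 * held.
 */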
static int
mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
{
	struct mbuf *next;
	int err = 0;

	if (likely(mb != NULL)) {
		/*
		 * If we can't insert the mbuf into the drbr, try to xmit
		 * anyway. Keep the error so it can be returned after the
		 * transmit attempt.
		 */
		err = drbr_enqueue(ifp, sq->br, mb);
	}

	/*
	 * Check if the network interface is closed or if the SQ is
	 * being stopped:
	 */
	if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    sq->stopped != 0))
		return (err);

	/* Process the queue */
	while ((next = drbr_peek(ifp, sq->br)) != NULL) {
		if (mlx5e_sq_xmit(sq, &next) != 0) {
			if (next == NULL) {
				drbr_advance(ifp, sq->br);
			} else {
				drbr_putback(ifp, sq->br, next);
				atomic_store_rel_int(&sq->queue_state, MLX5E_SQ_FULL);
			}
			break;
		}
		drbr_advance(ifp, sq->br);
	}
	/* Check if we need to write the doorbell */
	if (likely(sq->doorbell.d64 != 0)) {
		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
		sq->doorbell.d64 = 0;
	}
	/*
	 * Check if we need to start the event timer which flushes the
	 * transmit ring on timeout:
	 */
	if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
	    sq->cev_factor != 1)) {
		/* start the timer */
		mlx5e_sq_cev_timeout(sq);
	} else {
		/* don't send NOPs yet */
		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
	}
	return (err);
}

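/*
 * Transmit path for send queues without a drbr, currently the
 * rate limited ones. Called with the SQ lock held.
 */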
static int
mlx5e_xmit_locked_no_br(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
{
	int err = 0;

	if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    sq->stopped != 0)) {
		m_freem(mb);
		return (ENETDOWN);
	}

	/* Do transmit */
	if (mlx5e_sq_xmit(sq, &mb) != 0) {
		/* NOTE: m_freem() is NULL safe */
		m_freem(mb);
		err = ENOBUFS;
	}

	/* Check if we need to write the doorbell */
	if (likely(sq->doorbell.d64 != 0)) {
		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
		sq->doorbell.d64 = 0;
	}

	/*
	 * Check if we need to start the event timer which flushes the
	 * transmit ring on timeout:
	 */
	if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
	    sq->cev_factor != 1)) {
		/* start the timer */
		mlx5e_sq_cev_timeout(sq);
	} else {
		/* don't send NOPs yet */
		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
	}
	return (err);
}

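/*
 * Transmit entry point: select a send queue and transmit under the
 * SQ lock. If the lock is contended and the queue has a drbr, the
 * mbuf is enqueued and the taskqueue drains it later.
 */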
int
mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb)
{
	struct mlx5e_sq *sq;
	int ret;

	sq = mlx5e_select_queue(ifp, mb);
	if (unlikely(sq == NULL)) {
		/* Invalid send queue */
		m_freem(mb);
		return (ENXIO);
	}

	if (unlikely(sq->br == NULL)) {
		/* rate limited traffic */
		mtx_lock(&sq->lock);
		ret = mlx5e_xmit_locked_no_br(ifp, sq, mb);
		mtx_unlock(&sq->lock);
	} else if (mtx_trylock(&sq->lock)) {
		ret = mlx5e_xmit_locked(ifp, sq, mb);
		mtx_unlock(&sq->lock);
	} else {
		ret = drbr_enqueue(ifp, sq->br, mb);
		taskqueue_enqueue(sq->sq_tq, &sq->sq_task);
	}

	return (ret);
}

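/*
 * Completion event handler for the transmit CQ: poll for completed
 * WQEs and re-arm the CQ.
 */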
void
mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq)
{
	struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq);

	mtx_lock(&sq->comp_lock);
	mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX);
	mlx5e_cq_arm(&sq->cq, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock));
	mtx_unlock(&sq->comp_lock);
}

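/*
 * Deferred transmit taskqueue handler: drain the drbr while the
 * interface is running.
 */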
void
mlx5e_tx_que(void *context, int pending)
{
	struct mlx5e_sq *sq = context;
	struct ifnet *ifp = sq->ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mtx_lock(&sq->lock);
		if (!drbr_empty(ifp, sq->br))
			mlx5e_xmit_locked(ifp, sq, NULL);
		mtx_unlock(&sq->lock);
	}
}