t4_listen.c revision 346934
1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_listen.c 346934 2019-04-29 22:16:33Z np $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/protosw.h>
41#include <sys/refcount.h>
42#include <sys/domain.h>
43#include <sys/fnv_hash.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sysctl.h>
47#include <net/ethernet.h>
48#include <net/if.h>
49#include <net/if_types.h>
50#include <net/if_vlan_var.h>
51#include <net/route.h>
52#include <netinet/in.h>
53#include <netinet/in_fib.h>
54#include <netinet/in_pcb.h>
55#include <netinet/ip.h>
56#include <netinet/ip6.h>
57#include <netinet6/in6_fib.h>
58#include <netinet6/scope6_var.h>
59#include <netinet/tcp_timer.h>
60#define TCPSTATES
61#include <netinet/tcp_fsm.h>
62#include <netinet/tcp_var.h>
63#include <netinet/toecore.h>
64#include <netinet/cc/cc.h>
65
66#include "common/common.h"
67#include "common/t4_msg.h"
68#include "common/t4_regs.h"
69#include "t4_clip.h"
70#include "tom/t4_tom_l2t.h"
71#include "tom/t4_tom.h"
72
73/* stid services */
74static int alloc_stid(struct adapter *, struct listen_ctx *, int);
75static struct listen_ctx *lookup_stid(struct adapter *, int);
76static void free_stid(struct adapter *, struct listen_ctx *);
77
78/* lctx services */
79static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
80    struct vi_info *);
81static int free_lctx(struct adapter *, struct listen_ctx *);
82static void hold_lctx(struct listen_ctx *);
83static void listen_hash_add(struct adapter *, struct listen_ctx *);
84static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
85static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
86static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
87
88static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *,
89    struct offload_settings *);
90static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
91static void send_reset_synqe(struct toedev *, struct synq_entry *);
92
93static int
94alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
95{
96	struct tid_info *t = &sc->tids;
97	u_int stid, n, f, mask;
98	struct stid_region *sr = &lctx->stid_region;
99
100	/*
101	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
102	 * the TCAM.  The start of the stid region is properly aligned (the chip
103	 * requires each region to be 128-cell aligned).
104	 */
105	n = isipv6 ? 2 : 1;
106	mask = n - 1;
107	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
108	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
109	    __func__, t->stid_base, t->nstids, n));
110
111	mtx_lock(&t->stid_lock);
112	if (n > t->nstids - t->stids_in_use) {
113		mtx_unlock(&t->stid_lock);
114		return (-1);
115	}
116
117	if (t->nstids_free_head >= n) {
118		/*
119		 * This allocation will definitely succeed because the region
120		 * starts at a good alignment and we just checked we have enough
121		 * stids free.
122		 */
123		f = t->nstids_free_head & mask;
124		t->nstids_free_head -= n + f;
125		stid = t->nstids_free_head;
126		TAILQ_INSERT_HEAD(&t->stids, sr, link);
127	} else {
128		struct stid_region *s;
129
130		stid = t->nstids_free_head;
131		TAILQ_FOREACH(s, &t->stids, link) {
132			stid += s->used + s->free;
133			f = stid & mask;
134			if (s->free >= n + f) {
135				stid -= n + f;
136				s->free -= n + f;
137				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
138				goto allocated;
139			}
140		}
141
142		if (__predict_false(stid != t->nstids)) {
143			panic("%s: stids TAILQ (%p) corrupt."
144			    "  At %d instead of %d at the end of the queue.",
145			    __func__, &t->stids, stid, t->nstids);
146		}
147
148		mtx_unlock(&t->stid_lock);
149		return (-1);
150	}
151
152allocated:
153	sr->used = n;
154	sr->free = f;
155	t->stids_in_use += n;
156	t->stid_tab[stid] = lctx;
157	mtx_unlock(&t->stid_lock);
158
159	KASSERT(((stid + t->stid_base) & mask) == 0,
160	    ("%s: EDOOFUS.", __func__));
161	return (stid + t->stid_base);
162}
163
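/*
 * Return the listen_ctx that alloc_stid associated with this stid.  The stid
 * passed in is absolute; the table is indexed relative to stid_base.
 */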
164static struct listen_ctx *
165lookup_stid(struct adapter *sc, int stid)
166{
167	struct tid_info *t = &sc->tids;
168
169	return (t->stid_tab[stid - t->stid_base]);
170}
171
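/*
 * Return the stids in lctx's stid_region to the free pool.  The freed range
 * is folded into the previous region's free count (or into nstids_free_head
 * if this was the first region) so contiguous free space stays coalesced.
 */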
172static void
173free_stid(struct adapter *sc, struct listen_ctx *lctx)
174{
175	struct tid_info *t = &sc->tids;
176	struct stid_region *sr = &lctx->stid_region;
177	struct stid_region *s;
178
179	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
180
181	mtx_lock(&t->stid_lock);
182	s = TAILQ_PREV(sr, stid_head, link);
183	if (s != NULL)
184		s->free += sr->used + sr->free;
185	else
186		t->nstids_free_head += sr->used + sr->free;
187	KASSERT(t->stids_in_use >= sr->used,
188	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
189	    t->stids_in_use, sr->used));
190	t->stids_in_use -= sr->used;
191	TAILQ_REMOVE(&t->stids, sr, link);
192	mtx_unlock(&t->stid_lock);
193}
194
195static struct listen_ctx *
196alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
197{
198	struct listen_ctx *lctx;
199
200	INP_WLOCK_ASSERT(inp);
201
202	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
203	if (lctx == NULL)
204		return (NULL);
205
206	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
207	if (lctx->stid < 0) {
208		free(lctx, M_CXGBE);
209		return (NULL);
210	}
211
212	if (inp->inp_vflag & INP_IPV6 &&
213	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
214		lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
215		if (lctx->ce == NULL) {
216			free(lctx, M_CXGBE);
217			return (NULL);
218		}
219	}
220
221	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
222	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
223	refcount_init(&lctx->refcount, 1);
224	TAILQ_INIT(&lctx->synq);
225
226	lctx->inp = inp;
227	lctx->vnet = inp->inp_socket->so_vnet;
228	in_pcbref(inp);
229
230	return (lctx);
231}
232
233/* Don't call this directly, use release_lctx instead */
234static int
235free_lctx(struct adapter *sc, struct listen_ctx *lctx)
236{
237	struct inpcb *inp = lctx->inp;
238
239	INP_WLOCK_ASSERT(inp);
240	KASSERT(lctx->refcount == 0,
241	    ("%s: refcount %d", __func__, lctx->refcount));
242	KASSERT(TAILQ_EMPTY(&lctx->synq),
243	    ("%s: synq not empty.", __func__));
244	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
245
246	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
247	    __func__, lctx->stid, lctx, lctx->inp);
248
249	if (lctx->ce)
250		t4_release_lip(sc, lctx->ce);
251	free_stid(sc, lctx);
252	free(lctx, M_CXGBE);
253
254	return (in_pcbrele_wlocked(inp));
255}
256
257static void
258hold_lctx(struct listen_ctx *lctx)
259{
260
261	refcount_acquire(&lctx->refcount);
262}
263
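/*
 * The listen hash is keyed by the listening socket's inp; hash the pointer
 * value itself.
 */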
264static inline uint32_t
265listen_hashfn(void *key, u_long mask)
266{
267
268	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
269}
270
271/*
272 * Add a listen_ctx entry to the listen hash table.
273 */
274static void
275listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
276{
277	struct tom_data *td = sc->tom_softc;
278	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
279
280	mtx_lock(&td->lctx_hash_lock);
281	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
282	td->lctx_count++;
283	mtx_unlock(&td->lctx_hash_lock);
284}
285
286/*
287 * Look for the listening socket's context entry in the hash and return it.
288 */
289static struct listen_ctx *
290listen_hash_find(struct adapter *sc, struct inpcb *inp)
291{
292	struct tom_data *td = sc->tom_softc;
293	int bucket = listen_hashfn(inp, td->listen_mask);
294	struct listen_ctx *lctx;
295
296	mtx_lock(&td->lctx_hash_lock);
297	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
298		if (lctx->inp == inp)
299			break;
300	}
301	mtx_unlock(&td->lctx_hash_lock);
302
303	return (lctx);
304}
305
306/*
307 * Removes the listen_ctx structure for inp from the hash and returns it.
308 */
309static struct listen_ctx *
310listen_hash_del(struct adapter *sc, struct inpcb *inp)
311{
312	struct tom_data *td = sc->tom_softc;
313	int bucket = listen_hashfn(inp, td->listen_mask);
314	struct listen_ctx *lctx, *l;
315
316	mtx_lock(&td->lctx_hash_lock);
317	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
318		if (lctx->inp == inp) {
319			LIST_REMOVE(lctx, link);
320			td->lctx_count--;
321			break;
322		}
323	}
324	mtx_unlock(&td->lctx_hash_lock);
325
326	return (lctx);
327}
328
329/*
330 * Releases a hold on the lctx.  Must be called with the listening socket's inp
331 * locked.  The inp may be freed by this function and it returns NULL to
332 * indicate this.
333 */
334static struct inpcb *
335release_lctx(struct adapter *sc, struct listen_ctx *lctx)
336{
337	struct inpcb *inp = lctx->inp;
338	int inp_freed = 0;
339
340	INP_WLOCK_ASSERT(inp);
341	if (refcount_release(&lctx->refcount))
342		inp_freed = free_lctx(sc, lctx);
343
344	return (inp_freed ? NULL : inp);
345}
346
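/*
 * Send an ABORT_REQ to reset an embryonic connection that is still on the
 * synq.  The work request carries a flowc (required before any other WR on
 * this tid) followed by the abort request itself.
 */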
347static void
348send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
349{
350	struct adapter *sc = tod->tod_softc;
351	struct mbuf *m = synqe->syn;
352	struct ifnet *ifp = m->m_pkthdr.rcvif;
353	struct vi_info *vi = ifp->if_softc;
354	struct port_info *pi = vi->pi;
355	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
356	struct wrqe *wr;
357	struct fw_flowc_wr *flowc;
358	struct cpl_abort_req *req;
359	int txqid, rxqid, flowclen;
360	struct sge_wrq *ofld_txq;
361	struct sge_ofld_rxq *ofld_rxq;
362	const int nparams = 6;
363	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
364
365	INP_WLOCK_ASSERT(synqe->lctx->inp);
366
367	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
368	    __func__, synqe, synqe->flags, synqe->tid,
369	    synqe->flags & TPF_ABORT_SHUTDOWN ?
370	    " (abort already in progress)" : "");
371	if (synqe->flags & TPF_ABORT_SHUTDOWN)
372		return;	/* abort already in progress */
373	synqe->flags |= TPF_ABORT_SHUTDOWN;
374
375	get_qids_from_mbuf(m, &txqid, &rxqid);
376	ofld_txq = &sc->sge.ofld_txq[txqid];
377	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
378
379	/* The wrqe will have two WRs - a flowc followed by an abort_req */
380	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
381
382	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
383	if (wr == NULL) {
384		/* XXX */
385		panic("%s: allocation failure.", __func__);
386	}
387	flowc = wrtod(wr);
388	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
389
390	/* First the flowc ... */
391	memset(flowc, 0, wr->wr_len);
392	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
393	    V_FW_FLOWC_WR_NPARAMS(nparams));
394	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
395	    V_FW_WR_FLOWID(synqe->tid));
396	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
397	flowc->mnemval[0].val = htobe32(pfvf);
398	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
399	flowc->mnemval[1].val = htobe32(pi->tx_chan);
400	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
401	flowc->mnemval[2].val = htobe32(pi->tx_chan);
402	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
403	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
404	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
405	flowc->mnemval[4].val = htobe32(512);
406	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
407	flowc->mnemval[5].val = htobe32(512);
408	synqe->flags |= TPF_FLOWC_WR_SENT;
409
410	/* ... then ABORT request */
411	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
412	req->rsvd0 = 0;	/* don't have a snd_nxt */
413	req->rsvd1 = 1;	/* no data sent yet */
414	req->cmd = CPL_ABORT_SEND_RST;
415
416	t4_l2t_send(sc, wr, e);
417}
418
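/*
 * Program an IPv4 hardware listener: a CPL_PASS_OPEN_REQ with the listening
 * socket's local address/port and a wildcard peer.  SYNs that match are
 * steered to the lctx's offload rx queue.
 */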
419static int
420create_server(struct adapter *sc, struct listen_ctx *lctx)
421{
422	struct wrqe *wr;
423	struct cpl_pass_open_req *req;
424	struct inpcb *inp = lctx->inp;
425
426	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
427	if (wr == NULL) {
428		log(LOG_ERR, "%s: allocation failure", __func__);
429		return (ENOMEM);
430	}
431	req = wrtod(wr);
432
433	INIT_TP_WR(req, 0);
434	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
435	req->local_port = inp->inp_lport;
436	req->peer_port = 0;
437	req->local_ip = inp->inp_laddr.s_addr;
438	req->peer_ip = 0;
439	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
440	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
441	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
442
443	t4_wrq_tx(sc, wr);
444	return (0);
445}
446
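/* IPv6 counterpart of create_server: CPL_PASS_OPEN_REQ6 for the listener. */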
447static int
448create_server6(struct adapter *sc, struct listen_ctx *lctx)
449{
450	struct wrqe *wr;
451	struct cpl_pass_open_req6 *req;
452	struct inpcb *inp = lctx->inp;
453
454	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
455	if (wr == NULL) {
456		log(LOG_ERR, "%s: allocation failure", __func__);
457		return (ENOMEM);
458	}
459	req = wrtod(wr);
460
461	INIT_TP_WR(req, 0);
462	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
463	req->local_port = inp->inp_lport;
464	req->peer_port = 0;
465	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
466	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
467	req->peer_ip_hi = 0;
468	req->peer_ip_lo = 0;
469	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
470	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
471	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
472
473	t4_wrq_tx(sc, wr);
474	return (0);
475}
476
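/*
 * Tear down a hardware listener with a CPL_CLOSE_LISTSRV_REQ; the reply is
 * handled by do_close_server_rpl.
 */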
477static int
478destroy_server(struct adapter *sc, struct listen_ctx *lctx)
479{
480	struct wrqe *wr;
481	struct cpl_close_listsvr_req *req;
482
483	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
484	if (wr == NULL) {
485		/* XXX */
486		panic("%s: allocation failure.", __func__);
487	}
488	req = wrtod(wr);
489
490	INIT_TP_WR(req, 0);
491	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
492	    lctx->stid));
493	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
494	req->rsvd = htobe16(0);
495
496	t4_wrq_tx(sc, wr);
497	return (0);
498}
499
500/*
501 * Start a listening server by sending a passive open request to HW.
502 *
503 * Can't take the adapter lock here, so accesses to sc->flags,
504 * sc->offload_map, and if_capenable are all race prone.
505 */
506int
507t4_listen_start(struct toedev *tod, struct tcpcb *tp)
508{
509	struct adapter *sc = tod->tod_softc;
510	struct vi_info *vi;
511	struct port_info *pi;
512	struct inpcb *inp = tp->t_inpcb;
513	struct listen_ctx *lctx;
514	int i, rc, v;
515	struct offload_settings settings;
516
517	INP_WLOCK_ASSERT(inp);
518
519	rw_rlock(&sc->policy_lock);
520	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff,
521	    inp);
522	rw_runlock(&sc->policy_lock);
523	if (!settings.offload)
524		return (0);
525
526	/* Don't start a hardware listener for any loopback address. */
527	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
528		return (0);
529	if (!(inp->inp_vflag & INP_IPV6) &&
530	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
531		return (0);
532#if 0
533	ADAPTER_LOCK(sc);
534	if (IS_BUSY(sc)) {
535		log(LOG_ERR, "%s: listen request ignored, %s is busy",
536		    __func__, device_get_nameunit(sc->dev));
537		goto done;
538	}
539
540	KASSERT(uld_active(sc, ULD_TOM),
541	    ("%s: TOM not initialized", __func__));
542#endif
543
544	/*
545	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
546	 * such VI's queues to send the passive open and receive the reply to
547	 * it.
548	 *
549	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
550	 * then reject any attempt to bring down such a port (and maybe reject
551	 * attempts to disable IFCAP_TOE on that port too?).
552	 */
553	for_each_port(sc, i) {
554		pi = sc->port[i];
555		for_each_vi(pi, v, vi) {
556			if (vi->flags & VI_INIT_DONE &&
557			    vi->ifp->if_capenable & IFCAP_TOE)
558				goto found;
559		}
560	}
561	goto done;	/* no port that's UP with IFCAP_TOE enabled */
562found:
563
564	if (listen_hash_find(sc, inp) != NULL)
565		goto done;	/* already setup */
566
567	lctx = alloc_lctx(sc, inp, vi);
568	if (lctx == NULL) {
569		log(LOG_ERR,
570		    "%s: listen request ignored, %s couldn't allocate lctx\n",
571		    __func__, device_get_nameunit(sc->dev));
572		goto done;
573	}
574	listen_hash_add(sc, lctx);
575
576	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
577	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
578	    inp->inp_vflag);
579
580	if (inp->inp_vflag & INP_IPV6)
581		rc = create_server6(sc, lctx);
582	else
583		rc = create_server(sc, lctx);
584	if (rc != 0) {
585		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
586		    __func__, device_get_nameunit(sc->dev), rc);
587		(void) listen_hash_del(sc, inp);
588		inp = release_lctx(sc, lctx);
589		/* can't be freed, host stack has a reference */
590		KASSERT(inp != NULL, ("%s: inp freed", __func__));
591		goto done;
592	}
593	lctx->flags |= LCTX_RPL_PENDING;
594done:
595#if 0
596	ADAPTER_UNLOCK(sc);
597#endif
598	return (0);
599}
600
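/*
 * Stop the hardware listener associated with this inp, if there is one.
 * Embryonic connections still on the synq are reset, and the listener itself
 * is torn down via CPL_CLOSE_LISTSRV_REQ (deferred if the reply to the
 * PASS_OPEN is still outstanding).
 */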
601int
602t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
603{
604	struct listen_ctx *lctx;
605	struct adapter *sc = tod->tod_softc;
606	struct inpcb *inp = tp->t_inpcb;
607	struct synq_entry *synqe;
608
609	INP_WLOCK_ASSERT(inp);
610
611	lctx = listen_hash_del(sc, inp);
612	if (lctx == NULL)
613		return (ENOENT);	/* no hardware listener for this inp */
614
615	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
616	    lctx, lctx->flags);
617
618	/*
619	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
620	 * arrive and clean up when it does.
621	 */
622	if (lctx->flags & LCTX_RPL_PENDING) {
623		KASSERT(TAILQ_EMPTY(&lctx->synq),
624		    ("%s: synq not empty.", __func__));
625		return (EINPROGRESS);
626	}
627
628	/*
629	 * The host stack will abort all the connections on the listening
630	 * socket's so_comp.  It doesn't know about the connections on the synq
631	 * so we need to take care of those.
632	 */
633	TAILQ_FOREACH(synqe, &lctx->synq, link) {
634		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
635			send_reset_synqe(tod, synqe);
636	}
637
638	destroy_server(sc, lctx);
639	return (0);
640}
641
642static inline void
643hold_synqe(struct synq_entry *synqe)
644{
645
646	refcount_acquire(&synqe->refcnt);
647}
648
649static inline void
650release_synqe(struct synq_entry *synqe)
651{
652
653	if (refcount_release(&synqe->refcnt)) {
654		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
655
656		m_freem(synqe->syn);
657		if (needfree)
658			free(synqe, M_CXGBE);
659	}
660}
661
662void
663t4_syncache_added(struct toedev *tod __unused, void *arg)
664{
665	struct synq_entry *synqe = arg;
666
667	hold_synqe(synqe);
668}
669
670void
671t4_syncache_removed(struct toedev *tod __unused, void *arg)
672{
673	struct synq_entry *synqe = arg;
674
675	release_synqe(synqe);
676}
677
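/*
 * Instead of having the host stack send a SYN|ACK, hand the PASS_ACCEPT_RPL
 * work request prepared by do_pass_accept_req (stashed in synqe->wr) to the
 * hardware.  Only the first call per synqe sends anything; later attempts
 * find the wr already consumed and return EALREADY.
 */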
678int
679t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
680{
681	struct adapter *sc = tod->tod_softc;
682	struct synq_entry *synqe = arg;
683	struct wrqe *wr;
684	struct l2t_entry *e;
685	struct tcpopt to;
686	struct ip *ip = mtod(m, struct ip *);
687	struct tcphdr *th;
688
689	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
690	if (wr == NULL) {
691		m_freem(m);
692		return (EALREADY);
693	}
694
695	if (ip->ip_v == IPVERSION)
696		th = (void *)(ip + 1);
697	else
698		th = (void *)((struct ip6_hdr *)ip + 1);
699	bzero(&to, sizeof(to));
700	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
701	    TO_SYN);
702
703	/* save these for later */
704	synqe->iss = be32toh(th->th_seq);
705	synqe->ts = to.to_tsval;
706
707	if (chip_id(sc) >= CHELSIO_T5) {
708		struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);
709
710		rpl5->iss = th->th_seq;
711	}
712
713	e = &sc->l2t->l2tab[synqe->l2e_idx];
714	t4_l2t_send(sc, wr, e);
715
716	m_freem(m);	/* don't need this any more */
717	return (0);
718}
719
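/*
 * The chip's reply to the CPL_PASS_OPEN_REQ/REQ6 sent by create_server or
 * create_server6.  CPL_ERR_NONE means the hardware listener is up; anything
 * else means the passive open failed.
 */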
720static int
721do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
722    struct mbuf *m)
723{
724	struct adapter *sc = iq->adapter;
725	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
726	int stid = GET_TID(cpl);
727	unsigned int status = cpl->status;
728	struct listen_ctx *lctx = lookup_stid(sc, stid);
729	struct inpcb *inp = lctx->inp;
730#ifdef INVARIANTS
731	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
732#endif
733
734	KASSERT(opcode == CPL_PASS_OPEN_RPL,
735	    ("%s: unexpected opcode 0x%x", __func__, opcode));
736	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
737	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
738
739	INP_WLOCK(inp);
740
741	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
742	    __func__, stid, status, lctx->flags);
743
744	lctx->flags &= ~LCTX_RPL_PENDING;
745
746	if (status != CPL_ERR_NONE)
747		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
748
749#ifdef INVARIANTS
750	/*
751	 * If the inp has been dropped (listening socket closed) then
752	 * listen_stop must have run and taken the inp out of the hash.
753	 */
754	if (inp->inp_flags & INP_DROPPED) {
755		KASSERT(listen_hash_del(sc, inp) == NULL,
756		    ("%s: inp %p still in listen hash", __func__, inp));
757	}
758#endif
759
760	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
761		if (release_lctx(sc, lctx) != NULL)
762			INP_WUNLOCK(inp);
763		return (status);
764	}
765
766	/*
767	 * Listening socket stopped listening earlier and now the chip tells us
768	 * it has started the hardware listener.  Stop it; the lctx will be
769	 * released in do_close_server_rpl.
770	 */
771	if (inp->inp_flags & INP_DROPPED) {
772		destroy_server(sc, lctx);
773		INP_WUNLOCK(inp);
774		return (status);
775	}
776
777	/*
778	 * Failed to start hardware listener.  Take inp out of the hash and
779	 * release our reference on it.  An error message has been logged
780	 * already.
781	 */
782	if (status != CPL_ERR_NONE) {
783		listen_hash_del(sc, inp);
784		if (release_lctx(sc, lctx) != NULL)
785			INP_WUNLOCK(inp);
786		return (status);
787	}
788
789	/* hardware listener open for business */
790
791	INP_WUNLOCK(inp);
792	return (status);
793}
794
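/*
 * The chip's reply to the CPL_CLOSE_LISTSRV_REQ sent by destroy_server.  On
 * success, release the lctx reference held by the hardware listener (which
 * may release the listening inp as well).
 */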
795static int
796do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
797    struct mbuf *m)
798{
799	struct adapter *sc = iq->adapter;
800	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
801	int stid = GET_TID(cpl);
802	unsigned int status = cpl->status;
803	struct listen_ctx *lctx = lookup_stid(sc, stid);
804	struct inpcb *inp = lctx->inp;
805#ifdef INVARIANTS
806	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
807#endif
808
809	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
810	    ("%s: unexpected opcode 0x%x", __func__, opcode));
811	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
812	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
813
814	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
815
816	if (status != CPL_ERR_NONE) {
817		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
818		    __func__, status, stid);
819		return (status);
820	}
821
822	INP_WLOCK(inp);
823	inp = release_lctx(sc, lctx);
824	if (inp != NULL)
825		INP_WUNLOCK(inp);
826
827	return (status);
828}
829
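/*
 * Final cleanup for an embryonic connection: take the synqe off the lctx's
 * synq, release the tid and L2T entry, and drop the synq list's hold on the
 * synqe.  Called with the listening inp locked; the lock is released here.
 */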
830static void
831done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
832{
833	struct listen_ctx *lctx = synqe->lctx;
834	struct inpcb *inp = lctx->inp;
835	struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc;
836	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
837	int ntids;
838
839	INP_WLOCK_ASSERT(inp);
840	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
841
842	TAILQ_REMOVE(&lctx->synq, synqe, link);
843	inp = release_lctx(sc, lctx);
844	if (inp)
845		INP_WUNLOCK(inp);
846	remove_tid(sc, synqe->tid, ntids);
847	release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]);
848	t4_l2t_release(e);
849	release_synqe(synqe);	/* removed from synq list */
850}
851
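/*
 * Abort request from the chip/peer for an embryonic connection still on the
 * synq.  We owe the chip a CPL_ABORT_RPL no matter what.
 */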
852int
853do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
854    struct mbuf *m)
855{
856	struct adapter *sc = iq->adapter;
857	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
858	unsigned int tid = GET_TID(cpl);
859	struct synq_entry *synqe = lookup_tid(sc, tid);
860	struct listen_ctx *lctx = synqe->lctx;
861	struct inpcb *inp = lctx->inp;
862	int txqid;
863	struct sge_wrq *ofld_txq;
864#ifdef INVARIANTS
865	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
866#endif
867
868	KASSERT(opcode == CPL_ABORT_REQ_RSS,
869	    ("%s: unexpected opcode 0x%x", __func__, opcode));
870	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
871	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
872
873	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
874	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
875
876	if (negative_advice(cpl->status))
877		return (0);	/* Ignore negative advice */
878
879	INP_WLOCK(inp);
880
881	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
882	ofld_txq = &sc->sge.ofld_txq[txqid];
883
884	/*
885	 * If we'd initiated an abort earlier, the reply to it is responsible for
886	 * cleaning up resources.  Otherwise we tear everything down right here
887	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
888	 */
889	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
890		INP_WUNLOCK(inp);
891		goto done;
892	}
893
894	done_with_synqe(sc, synqe);
895	/* inp lock released by done_with_synqe */
896done:
897	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
898	return (0);
899}
900
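/*
 * Reply to an abort that we initiated (send_reset_synqe); performs the final
 * cleanup of the synq entry.
 */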
901int
902do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
903    struct mbuf *m)
904{
905	struct adapter *sc = iq->adapter;
906	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
907	unsigned int tid = GET_TID(cpl);
908	struct synq_entry *synqe = lookup_tid(sc, tid);
909	struct listen_ctx *lctx = synqe->lctx;
910	struct inpcb *inp = lctx->inp;
911#ifdef INVARIANTS
912	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
913#endif
914
915	KASSERT(opcode == CPL_ABORT_RPL_RSS,
916	    ("%s: unexpected opcode 0x%x", __func__, opcode));
917	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
918	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
919
920	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
921	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
922
923	INP_WLOCK(inp);
924	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
925	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
926	    __func__, synqe, synqe->flags));
927
928	done_with_synqe(sc, synqe);
929	/* inp lock released by done_with_synqe */
930
931	return (0);
932}
933
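/*
 * Install the toepcb on the socket created for an offloaded connection (the
 * syncache has just expanded the entry), mark the connection established in
 * hardware terms, and flag the synqe as expanded.
 */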
934void
935t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
936{
937	struct adapter *sc = tod->tod_softc;
938	struct synq_entry *synqe = arg;
939#ifdef INVARIANTS
940	struct inpcb *inp = sotoinpcb(so);
941#endif
942	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
943	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
944
945	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
946	INP_WLOCK_ASSERT(inp);
947	KASSERT(synqe->flags & TPF_SYNQE,
948	    ("%s: %p not a synq_entry?", __func__, arg));
949
950	offload_socket(so, toep);
951	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
952	toep->flags |= TPF_CPL_PENDING;
953	update_tid(sc, synqe->tid, toep);
954	synqe->flags |= TPF_SYNQE_EXPANDED;
955}
956
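/*
 * Pick the offload tx/rx queues for this connection (from the offload policy
 * if it names valid queues, otherwise at random) and stash them in the mbuf's
 * flowid: txqid in the high 16 bits, rxqid in the low 16 bits.
 * get_qids_from_mbuf undoes this packing.
 */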
957static inline void
958save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi,
959    struct offload_settings *s)
960{
961	uint32_t txqid, rxqid;
962
963	if (s->txq >= 0 && s->txq < vi->nofldtxq)
964		txqid = s->txq;
965	else
966		txqid = arc4random() % vi->nofldtxq;
967	txqid += vi->first_ofld_txq;
968
969	if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
970		rxqid = s->rxq;
971	else
972		rxqid = arc4random() % vi->nofldrxq;
973	rxqid += vi->first_ofld_rxq;
974
975	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
976}
977
978static inline void
979get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
980{
981
982	if (txqid)
983		*txqid = m->m_pkthdr.flowid >> 16;
984	if (rxqid)
985		*rxqid = m->m_pkthdr.flowid & 0xffff;
986}
987
988/*
989 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
990 * store some state temporarily.
991 */
992static struct synq_entry *
993mbuf_to_synqe(struct mbuf *m)
994{
995	int len = roundup2(sizeof (struct synq_entry), 8);
996	int tspace = M_TRAILINGSPACE(m);
997	struct synq_entry *synqe = NULL;
998
999	if (tspace < len) {
1000		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
1001		if (synqe == NULL)
1002			return (NULL);
1003		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
1004	} else {
1005		synqe = (void *)(m->m_data + m->m_len + tspace - len);
1006		synqe->flags = TPF_SYNQE;
1007	}
1008
1009	return (synqe);
1010}
1011
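/*
 * Convert the TCP options reported by the chip in the PASS_ACCEPT_REQ into a
 * struct tcpopt that the kernel's syncache understands.
 */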
1012static void
1013t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
1014{
1015	bzero(to, sizeof(*to));
1016
1017	if (t4opt->mss) {
1018		to->to_flags |= TOF_MSS;
1019		to->to_mss = be16toh(t4opt->mss);
1020	}
1021
1022	if (t4opt->wsf) {
1023		to->to_flags |= TOF_SCALE;
1024		to->to_wscale = t4opt->wsf;
1025	}
1026
1027	if (t4opt->tstamp)
1028		to->to_flags |= TOF_TS;
1029
1030	if (t4opt->sack)
1031		to->to_flags |= TOF_SACKPERM;
1032}
1033
1034/*
1035 * Options2 for passive open.
1036 */
1037static uint32_t
1038calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
1039	const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode,
1040	struct cc_algo *cc, const struct offload_settings *s)
1041{
1042	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
1043	uint32_t opt2 = 0;
1044
1045	/*
1046	 * rx flow control, rx coalesce, congestion control, and tx pace are all
1047	 * explicitly set by the driver.  On T5+ the ISS is also set by the
1048	 * driver to the value picked by the kernel.
1049	 */
1050	if (is_t4(sc)) {
1051		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
1052		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
1053	} else {
1054		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
1055		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
1056	}
1057
1058	if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323)))
1059		opt2 |= F_SACK_EN;
1060
1061	if (tcpopt->tstamp &&
1062	    (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
1063		opt2 |= F_TSTAMPS_EN;
1064
1065	if (tcpopt->wsf < 15 && V_tcp_do_rfc1323)
1066		opt2 |= F_WND_SCALE_EN;
1067
1068	if (th->th_flags & (TH_ECE | TH_CWR) &&
1069	    (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
1070		opt2 |= F_CCTRL_ECN;
1071
1072	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
1073
1074	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
1075
1076	/* These defaults are subject to ULP specific fixups later. */
1077	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
1078
1079	opt2 |= V_PACE(0);
1080
1081	if (s->cong_algo >= 0)
1082		opt2 |= V_CONG_CNTRL(s->cong_algo);
1083	else if (sc->tt.cong_algorithm >= 0)
1084		opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
1085	else {
1086		if (strcasecmp(cc->name, "reno") == 0)
1087			opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
1088		else if (strcasecmp(cc->name, "tahoe") == 0)
1089			opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
1090		else if (strcasecmp(cc->name, "newreno") == 0)
1091			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
1092		else if (strcasecmp(cc->name, "highspeed") == 0)
1093			opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
1094		else {
1095			/*
1096			 * Use newreno in case the algorithm selected by the
1097			 * host stack is not supported by the hardware.
1098			 */
1099			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
1100		}
1101	}
1102
1103	if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
1104		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1105
1106	/* Note that ofld_rxq is already set according to s->rxq. */
1107	opt2 |= F_RSS_QUEUE_VALID;
1108	opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1109
1110#ifdef USE_DDP_RX_FLOW_CONTROL
1111	if (ulp_mode == ULP_MODE_TCPDDP)
1112		opt2 |= F_RX_FC_DDP;
1113#endif
1114
1115	if (ulp_mode == ULP_MODE_TLS) {
1116		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
1117		opt2 |= F_RX_FC_DISABLE;
1118	}
1119
1120	return (htobe32(opt2));
1121}
1122
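/*
 * Parse the Ethernet/IP/TCP headers delivered with the CPL_PASS_ACCEPT_REQ
 * and fill in the connection 4-tuple and TCP header for the syncache.  T6
 * encodes the header lengths differently from T4/T5.
 */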
1123static void
1124pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
1125    struct in_conninfo *inc, struct tcphdr *th)
1126{
1127	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1128	const struct ether_header *eh;
1129	unsigned int hlen = be32toh(cpl->hdr_len);
1130	uintptr_t l3hdr;
1131	const struct tcphdr *tcp;
1132
1133	eh = (const void *)(cpl + 1);
1134	if (chip_id(sc) >= CHELSIO_T6) {
1135		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
1136		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
1137	} else {
1138		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1139		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1140	}
1141
1142	if (inc) {
1143		bzero(inc, sizeof(*inc));
1144		inc->inc_fport = tcp->th_sport;
1145		inc->inc_lport = tcp->th_dport;
1146		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1147			const struct ip *ip = (const void *)l3hdr;
1148
1149			inc->inc_faddr = ip->ip_src;
1150			inc->inc_laddr = ip->ip_dst;
1151		} else {
1152			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1153
1154			inc->inc_flags |= INC_ISIPV6;
1155			inc->inc6_faddr = ip6->ip6_src;
1156			inc->inc6_laddr = ip6->ip6_dst;
1157		}
1158	}
1159
1160	if (th) {
1161		bcopy(tcp, th, sizeof(*th));
1162		tcp_fields_to_host(th);		/* just like tcp_input */
1163	}
1164}
1165
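/*
 * Resolve the L2 next hop towards the peer and get an L2T entry for it.  The
 * route lookup must point back out the ifnet the SYN arrived on, otherwise
 * the connection is not offloaded.
 */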
1166static struct l2t_entry *
1167get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1168    struct in_conninfo *inc)
1169{
1170	struct l2t_entry *e;
1171	struct sockaddr_in6 sin6;
1172	struct sockaddr *dst = (void *)&sin6;
1173
1174	if (inc->inc_flags & INC_ISIPV6) {
1175		struct nhop6_basic nh6;
1176
1177		bzero(dst, sizeof(struct sockaddr_in6));
1178		dst->sa_len = sizeof(struct sockaddr_in6);
1179		dst->sa_family = AF_INET6;
1180
1181		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1182			/* no need for route lookup */
1183			e = t4_l2t_get(pi, ifp, dst);
1184			return (e);
1185		}
1186
1187		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
1188		    0, 0, 0, &nh6) != 0)
1189			return (NULL);
1190		if (nh6.nh_ifp != ifp)
1191			return (NULL);
1192		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
1193	} else {
1194		struct nhop4_basic nh4;
1195
1196		dst->sa_len = sizeof(struct sockaddr_in);
1197		dst->sa_family = AF_INET;
1198
1199		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
1200		    &nh4) != 0)
1201			return (NULL);
1202		if (nh4.nh_ifp != ifp)
1203			return (NULL);
1204		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
1205	}
1206
1207	e = t4_l2t_get(pi, ifp, dst);
1208	return (e);
1209}
1210
1211#define REJECT_PASS_ACCEPT()	do { \
1212	reject_reason = __LINE__; \
1213	goto reject; \
1214} while (0)
1215
1216/*
1217 * The context associated with a tid entry via insert_tid could be a synq_entry
1218 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1219 */
1220CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1221
1222/*
1223 * Incoming SYN on a listening socket.
1224 *
1225 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1226 * etc.
1227 */
1228static int
1229do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1230    struct mbuf *m)
1231{
1232	struct adapter *sc = iq->adapter;
1233	struct toedev *tod;
1234	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1235	struct cpl_pass_accept_rpl *rpl;
1236	struct wrqe *wr;
1237	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1238	unsigned int tid = GET_TID(cpl);
1239	struct listen_ctx *lctx = lookup_stid(sc, stid);
1240	struct inpcb *inp;
1241	struct socket *so;
1242	struct in_conninfo inc;
1243	struct tcphdr th;
1244	struct tcpopt to;
1245	struct port_info *pi;
1246	struct vi_info *vi;
1247	struct ifnet *hw_ifp, *ifp;
1248	struct l2t_entry *e = NULL;
1249	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
1250	struct synq_entry *synqe = NULL;
1251	int reject_reason, v, ntids;
1252	uint16_t vid;
1253#ifdef INVARIANTS
1254	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1255#endif
1256	struct offload_settings settings;
1257
1258	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1259	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1260	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1261
1262	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1263	    lctx);
1264
1265	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
1266	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1267
1268	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
1269
1270	CURVNET_SET(lctx->vnet);
1271
1272	/*
1273	 * Use the MAC index to lookup the associated VI.  If this SYN
1274	 * didn't match a perfect MAC filter, punt.
1275	 */
1276	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
1277		m_freem(m);
1278		m = NULL;
1279		REJECT_PASS_ACCEPT();
1280	}
1281	for_each_vi(pi, v, vi) {
1282		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
1283			goto found;
1284	}
1285	m_freem(m);
1286	m = NULL;
1287	REJECT_PASS_ACCEPT();
1288
1289found:
1290	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
1291	m->m_pkthdr.rcvif = hw_ifp;
1292	tod = TOEDEV(hw_ifp);
1293
1294	/*
1295	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1296	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1297	 * doesn't match anything on this interface.
1298	 *
1299	 * XXX: lagg support, lagg + vlan support.
1300	 */
1301	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1302	if (vid != 0xfff) {
1303		ifp = VLAN_DEVAT(hw_ifp, vid);
1304		if (ifp == NULL)
1305			REJECT_PASS_ACCEPT();
1306	} else
1307		ifp = hw_ifp;
1308
1309	/*
1310	 * Don't offload if the peer requested a TCP option that's not known to
1311	 * the silicon.
1312	 */
1313	if (cpl->tcpopt.unknown)
1314		REJECT_PASS_ACCEPT();
1315
1316	if (inc.inc_flags & INC_ISIPV6) {
1317
1318		/* Don't offload if the ifcap isn't enabled */
1319		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
1320			REJECT_PASS_ACCEPT();
1321
1322		/*
1323		 * SYN must be directed to an IP6 address on this ifnet.  This
1324		 * is more restrictive than in6_localip.
1325		 */
1326		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
1327			REJECT_PASS_ACCEPT();
1328
1329		ntids = 2;
1330	} else {
1331
1332		/* Don't offload if the ifcap isn't enabled */
1333		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1334			REJECT_PASS_ACCEPT();
1335
1336		/*
1337		 * SYN must be directed to an IP address on this ifnet.  This
1338		 * is more restrictive than in_localip.
1339		 */
1340		if (!in_ifhasaddr(ifp, inc.inc_laddr))
1341			REJECT_PASS_ACCEPT();
1342
1343		ntids = 1;
1344	}
1345
1346	/*
1347	 * Don't offload if the ifnet that the SYN came in on is not in the same
1348	 * vnet as the listening socket.
1349	 */
1350	if (lctx->vnet != ifp->if_vnet)
1351		REJECT_PASS_ACCEPT();
1352
1353	e = get_l2te_for_nexthop(pi, ifp, &inc);
1354	if (e == NULL)
1355		REJECT_PASS_ACCEPT();
1356
1357	synqe = mbuf_to_synqe(m);
1358	if (synqe == NULL)
1359		REJECT_PASS_ACCEPT();
1360
1361	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1362	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
1363	if (wr == NULL)
1364		REJECT_PASS_ACCEPT();
1365	rpl = wrtod(wr);
1366
1367	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */
1368
1369	/* Don't offload if the 4-tuple is already in use */
1370	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1371		INP_INFO_RUNLOCK(&V_tcbinfo);
1372		free(wr, M_CXGBE);
1373		REJECT_PASS_ACCEPT();
1374	}
1375	INP_INFO_RUNLOCK(&V_tcbinfo);
1376
1377	inp = lctx->inp;		/* listening socket, not owned by TOE */
1378	INP_WLOCK(inp);
1379
1380	/* Don't offload if the listening socket has closed */
1381	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1382		/*
1383		 * The listening socket has closed.  The reply from the TOE to
1384		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
1385		 * resources tied to this listen context.
1386		 */
1387		INP_WUNLOCK(inp);
1388		free(wr, M_CXGBE);
1389		REJECT_PASS_ACCEPT();
1390	}
1391	so = inp->inp_socket;
1392	rw_rlock(&sc->policy_lock);
1393	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp);
1394	rw_runlock(&sc->policy_lock);
1395	if (!settings.offload) {
1396		INP_WUNLOCK(inp);
1397		free(wr, M_CXGBE);
1398		REJECT_PASS_ACCEPT();
1399	}
1400
1401	mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
1402	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
1403	SOCKBUF_LOCK(&so->so_rcv);
1404	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1405	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
1406	SOCKBUF_UNLOCK(&so->so_rcv);
1407
1408	save_qids_in_mbuf(m, vi, &settings);
1409	get_qids_from_mbuf(m, NULL, &rxqid);
1410
1411	if (is_t4(sc))
1412		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1413	else {
1414		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1415
1416		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1417	}
1418	ulp_mode = select_ulp_mode(so, sc, &settings);
1419	switch (ulp_mode) {
1420	case ULP_MODE_TCPDDP:
1421		synqe->flags |= TPF_SYNQE_TCPDDP;
1422		break;
1423	case ULP_MODE_TLS:
1424		synqe->flags |= TPF_SYNQE_TLS;
1425		break;
1426	}
1427	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode,
1428	    &settings);
1429	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode,
1430	    CC_ALGO(intotcpcb(inp)), &settings);
1431
1432	synqe->tid = tid;
1433	synqe->lctx = lctx;
1434	synqe->syn = m;
1435	m = NULL;
1436	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
1437	synqe->l2e_idx = e->idx;
1438	synqe->rcv_bufsize = rx_credits;
1439	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
1440
1441	insert_tid(sc, tid, synqe, ntids);
1442	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
1443	hold_synqe(synqe);	/* hold for the duration it's in the synq */
1444	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */
1445
1446	/*
1447	 * If all goes well t4_syncache_respond will get called during
1448	 * syncache_add.  Note that syncache_add releases the pcb lock.
1449	 */
1450	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
1451	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
1452
1453	/*
1454	 * If we replied during syncache_add (synqe->wr has been consumed),
1455	 * good.  Otherwise, set it to 0 so that further syncache_respond
1456	 * attempts by the kernel will be ignored.
1457	 */
1458	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
1459
1460		/*
1461		 * syncache may or may not have a hold on the synqe, which may
1462		 * or may not be stashed in the original SYN mbuf passed to us.
1463		 * Just copy it over instead of dealing with all possibilities.
1464		 */
1465		m = m_dup(synqe->syn, M_NOWAIT);
1466		if (m)
1467			m->m_pkthdr.rcvif = hw_ifp;
1468
1469		remove_tid(sc, synqe->tid, ntids);
1470		free(wr, M_CXGBE);
1471
1472		/* Yank the synqe out of the lctx synq. */
1473		INP_WLOCK(inp);
1474		TAILQ_REMOVE(&lctx->synq, synqe, link);
1475		release_synqe(synqe);	/* removed from synq list */
1476		inp = release_lctx(sc, lctx);
1477		if (inp)
1478			INP_WUNLOCK(inp);
1479
1480		release_synqe(synqe);	/* extra hold */
1481		REJECT_PASS_ACCEPT();
1482	}
1483
1484	CTR6(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK mode %d",
1485	    __func__, stid, tid, lctx, synqe, ulp_mode);
1486
1487	INP_WLOCK(inp);
1488	synqe->flags |= TPF_SYNQE_HAS_L2TE;
1489	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1490		/*
1491		 * Listening socket closed but tod_listen_stop did not abort
1492		 * this tid because there was no L2T entry for the tid at that
1493		 * time.  Abort it now.  The reply to the abort will clean up.
1494		 */
1495		CTR6(KTR_CXGBE,
1496		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
1497		    __func__, stid, tid, lctx, synqe, synqe->flags);
1498		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
1499			send_reset_synqe(tod, synqe);
1500		INP_WUNLOCK(inp);
1501		CURVNET_RESTORE();
1502
1503		release_synqe(synqe);	/* extra hold */
1504		return (__LINE__);
1505	}
1506	INP_WUNLOCK(inp);
1507	CURVNET_RESTORE();
1508
1509	release_synqe(synqe);	/* extra hold */
1510	return (0);
1511reject:
1512	CURVNET_RESTORE();
1513	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1514	    reject_reason);
1515
1516	if (e)
1517		t4_l2t_release(e);
1518	release_tid(sc, tid, lctx->ctrlq);
1519
1520	if (__predict_true(m != NULL)) {
1521		m_adj(m, sizeof(*cpl));
1522		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1523		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1524		m->m_pkthdr.csum_data = 0xffff;
1525		hw_ifp->if_input(hw_ifp, m);
1526	}
1527
1528	return (reject_reason);
1529}
1530
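/*
 * Reconstruct protocol headers for syncache_expand: start from the original
 * SYN and rewrite the fields so it looks like the peer's ACK of our SYN|ACK.
 */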
1531static void
1532synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1533    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1534    struct tcphdr *th, struct tcpopt *to)
1535{
1536	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1537
1538	/* start off with the original SYN */
1539	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);
1540
1541	/* modify parts to make it look like the ACK to our SYN|ACK */
1542	th->th_flags = TH_ACK;
1543	th->th_ack = synqe->iss + 1;
1544	th->th_seq = be32toh(cpl->rcv_isn);
1545	bzero(to, sizeof(*to));
1546	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1547		to->to_flags |= TOF_TS;
1548		to->to_tsecr = synqe->ts;
1549	}
1550}
1551
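/*
 * The chip has completed the 3-way handshake for a connection that arrived on
 * one of our hardware listeners.  Allocate a toepcb, expand the syncache
 * entry into a full socket, and hand the connection to the host stack.
 */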
1552static int
1553do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1554    struct mbuf *m)
1555{
1556	struct adapter *sc = iq->adapter;
1557	struct vi_info *vi;
1558	struct ifnet *ifp;
1559	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1560#if defined(KTR) || defined(INVARIANTS)
1561	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1562#endif
1563	unsigned int tid = GET_TID(cpl);
1564	struct synq_entry *synqe = lookup_tid(sc, tid);
1565	struct listen_ctx *lctx = synqe->lctx;
1566	struct inpcb *inp = lctx->inp, *new_inp;
1567	struct socket *so;
1568	struct tcphdr th;
1569	struct tcpopt to;
1570	struct in_conninfo inc;
1571	struct toepcb *toep;
1572	u_int txqid, rxqid;
1573#ifdef INVARIANTS
1574	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1575#endif
1576
1577	KASSERT(opcode == CPL_PASS_ESTABLISH,
1578	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1579	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1580	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1581	KASSERT(synqe->flags & TPF_SYNQE,
1582	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1583
1584	CURVNET_SET(lctx->vnet);
1585	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
1586	INP_WLOCK(inp);
1587
1588	CTR6(KTR_CXGBE,
1589	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1590	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1591
1592	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1593
1594		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
1595			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1596			    ("%s: listen socket closed but tid %u not aborted.",
1597			    __func__, tid));
1598		}
1599
1600		INP_WUNLOCK(inp);
1601		INP_INFO_RUNLOCK(&V_tcbinfo);
1602		CURVNET_RESTORE();
1603		return (0);
1604	}
1605
1606	ifp = synqe->syn->m_pkthdr.rcvif;
1607	vi = ifp->if_softc;
1608	KASSERT(vi->pi->adapter == sc,
1609	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1610
1611	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
1612	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1613	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
1614	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1615
1616	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
1617	if (toep == NULL) {
1618reset:
1619		/*
1620		 * The reply to this abort will perform final cleanup.  There is
1621		 * no need to check for HAS_L2TE here.  We can be here only if
1622		 * we responded to the PASS_ACCEPT_REQ, and our response had the
1623		 * L2T idx.
1624		 */
1625		send_reset_synqe(TOEDEV(ifp), synqe);
1626		INP_WUNLOCK(inp);
1627		INP_INFO_RUNLOCK(&V_tcbinfo);
1628		CURVNET_RESTORE();
1629		return (0);
1630	}
1631	toep->tid = tid;
1632	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
1633	if (synqe->flags & TPF_SYNQE_TCPDDP)
1634		set_ulp_mode(toep, ULP_MODE_TCPDDP);
1635	else if (synqe->flags & TPF_SYNQE_TLS)
1636		set_ulp_mode(toep, ULP_MODE_TLS);
1637	else
1638		set_ulp_mode(toep, ULP_MODE_NONE);
1639	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1640	toep->rx_credits = synqe->rcv_bufsize;
1641
1642	so = inp->inp_socket;
1643	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1644
1645	/* Come up with something that syncache_expand should be ok with. */
1646	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1647
1648	/*
1649	 * No more need for anything in the mbuf that carried the
1650	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
1651	 * there.  XXX: bad form but I don't want to increase the size of synqe.
1652	 */
1653	m = synqe->syn;
1654	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
1655	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
1656	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
1657	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
1658
1659	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
1660		free_toepcb(toep);
1661		goto reset;
1662	}
1663
1664	/* New connection inpcb is already locked by syncache_expand(). */
1665	new_inp = sotoinpcb(so);
1666	INP_WLOCK_ASSERT(new_inp);
1667	MPASS(so->so_vnet == lctx->vnet);
1668	toep->vnet = lctx->vnet;
1669	if (inc.inc_flags & INC_ISIPV6)
1670		toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce);
1671
1672	/*
1673	 * This is for the unlikely case where the syncache entry that we added
1674	 * has been evicted from the syncache, but the syncache_expand above
1675	 * works because of syncookies.
1676	 *
1677	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1678	 * anyone accept'ing a connection before we've installed our hooks, but
1679	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1680	 */
1681	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1682		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1683		t4_offload_socket(TOEDEV(ifp), synqe, so);
1684	}
1685
1686	INP_WUNLOCK(new_inp);
1687
1688	/* Done with the synqe */
1689	TAILQ_REMOVE(&lctx->synq, synqe, link);
1690	inp = release_lctx(sc, lctx);
1691	if (inp != NULL)
1692		INP_WUNLOCK(inp);
1693	INP_INFO_RUNLOCK(&V_tcbinfo);
1694	CURVNET_RESTORE();
1695	release_synqe(synqe);
1696
1697	return (0);
1698}
1699
1700void
1701t4_init_listen_cpl_handlers(void)
1702{
1703
1704	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1705	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1706	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1707	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1708}
1709
1710void
1711t4_uninit_listen_cpl_handlers(void)
1712{
1713
1714	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1715	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1716	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1717	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1718}
1719#endif
1720