t4_listen.c revision 309560
1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_listen.c 309560 2016-12-05 20:43:25Z jhb $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/protosw.h>
41#include <sys/refcount.h>
42#include <sys/domain.h>
43#include <sys/fnv_hash.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <net/ethernet.h>
47#include <net/if.h>
48#include <net/if_types.h>
49#include <net/if_vlan_var.h>
50#include <net/route.h>
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/ip.h>
54#include <netinet/ip6.h>
55#include <netinet6/scope6_var.h>
56#include <netinet/tcp_timer.h>
57#include <netinet/tcp_var.h>
58#define TCPSTATES
59#include <netinet/tcp_fsm.h>
60#include <netinet/toecore.h>
61
62#include "common/common.h"
63#include "common/t4_msg.h"
64#include "common/t4_regs.h"
65#include "tom/t4_tom_l2t.h"
66#include "tom/t4_tom.h"
67
/* Server tid (stid) bookkeeping. */
static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx,
    int isipv6);
static struct listen_ctx *lookup_stid(struct adapter *sc, int stid);
static void free_stid(struct adapter *sc, struct listen_ctx *lctx);

/* Listen context (lctx) lifetime and listen hash table. */
static struct listen_ctx *alloc_lctx(struct adapter *sc, struct inpcb *inp,
    struct vi_info *vi);
static int free_lctx(struct adapter *sc, struct listen_ctx *lctx);
static void hold_lctx(struct listen_ctx *lctx);
static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx);
static struct listen_ctx *listen_hash_find(struct adapter *sc,
    struct inpcb *inp);
static struct listen_ctx *listen_hash_del(struct adapter *sc,
    struct inpcb *inp);
static struct inpcb *release_lctx(struct adapter *sc,
    struct listen_ctx *lctx);

/* Queue id stash in the SYN mbuf, and embryonic connection reset. */
static inline void save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi);
static inline void get_qids_from_mbuf(struct mbuf *m, int *txqid,
    int *rxqid);
static void send_reset_synqe(struct toedev *tod, struct synq_entry *synqe);
86
87static int
88alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
89{
90	struct tid_info *t = &sc->tids;
91	u_int stid, n, f, mask;
92	struct stid_region *sr = &lctx->stid_region;
93
94	/*
95	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
96	 * the TCAM.  The start of the stid region is properly aligned (the chip
97	 * requires each region to be 128-cell aligned).
98	 */
99	n = isipv6 ? 2 : 1;
100	mask = n - 1;
101	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
102	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
103	    __func__, t->stid_base, t->nstids, n));
104
105	mtx_lock(&t->stid_lock);
106	if (n > t->nstids - t->stids_in_use) {
107		mtx_unlock(&t->stid_lock);
108		return (-1);
109	}
110
111	if (t->nstids_free_head >= n) {
112		/*
113		 * This allocation will definitely succeed because the region
114		 * starts at a good alignment and we just checked we have enough
115		 * stids free.
116		 */
117		f = t->nstids_free_head & mask;
118		t->nstids_free_head -= n + f;
119		stid = t->nstids_free_head;
120		TAILQ_INSERT_HEAD(&t->stids, sr, link);
121	} else {
122		struct stid_region *s;
123
124		stid = t->nstids_free_head;
125		TAILQ_FOREACH(s, &t->stids, link) {
126			stid += s->used + s->free;
127			f = stid & mask;
128			if (s->free >= n + f) {
129				stid -= n + f;
130				s->free -= n + f;
131				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
132				goto allocated;
133			}
134		}
135
136		if (__predict_false(stid != t->nstids)) {
137			panic("%s: stids TAILQ (%p) corrupt."
138			    "  At %d instead of %d at the end of the queue.",
139			    __func__, &t->stids, stid, t->nstids);
140		}
141
142		mtx_unlock(&t->stid_lock);
143		return (-1);
144	}
145
146allocated:
147	sr->used = n;
148	sr->free = f;
149	t->stids_in_use += n;
150	t->stid_tab[stid] = lctx;
151	mtx_unlock(&t->stid_lock);
152
153	KASSERT(((stid + t->stid_base) & mask) == 0,
154	    ("%s: EDOOFUS.", __func__));
155	return (stid + t->stid_base);
156}
157
158static struct listen_ctx *
159lookup_stid(struct adapter *sc, int stid)
160{
161	struct tid_info *t = &sc->tids;
162
163	return (t->stid_tab[stid - t->stid_base]);
164}
165
166static void
167free_stid(struct adapter *sc, struct listen_ctx *lctx)
168{
169	struct tid_info *t = &sc->tids;
170	struct stid_region *sr = &lctx->stid_region;
171	struct stid_region *s;
172
173	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
174
175	mtx_lock(&t->stid_lock);
176	s = TAILQ_PREV(sr, stid_head, link);
177	if (s != NULL)
178		s->free += sr->used + sr->free;
179	else
180		t->nstids_free_head += sr->used + sr->free;
181	KASSERT(t->stids_in_use >= sr->used,
182	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
183	    t->stids_in_use, sr->used));
184	t->stids_in_use -= sr->used;
185	TAILQ_REMOVE(&t->stids, sr, link);
186	mtx_unlock(&t->stid_lock);
187}
188
189static struct listen_ctx *
190alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
191{
192	struct listen_ctx *lctx;
193
194	INP_WLOCK_ASSERT(inp);
195
196	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
197	if (lctx == NULL)
198		return (NULL);
199
200	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
201	if (lctx->stid < 0) {
202		free(lctx, M_CXGBE);
203		return (NULL);
204	}
205
206	if (inp->inp_vflag & INP_IPV6 &&
207	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
208		struct tom_data *td = sc->tom_softc;
209
210		lctx->ce = hold_lip(td, &inp->in6p_laddr);
211		if (lctx->ce == NULL) {
212			free(lctx, M_CXGBE);
213			return (NULL);
214		}
215	}
216
217	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
218	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
219	refcount_init(&lctx->refcount, 1);
220	TAILQ_INIT(&lctx->synq);
221
222	lctx->inp = inp;
223	in_pcbref(inp);
224
225	return (lctx);
226}
227
228/* Don't call this directly, use release_lctx instead */
229static int
230free_lctx(struct adapter *sc, struct listen_ctx *lctx)
231{
232	struct inpcb *inp = lctx->inp;
233	struct tom_data *td = sc->tom_softc;
234
235	INP_WLOCK_ASSERT(inp);
236	KASSERT(lctx->refcount == 0,
237	    ("%s: refcount %d", __func__, lctx->refcount));
238	KASSERT(TAILQ_EMPTY(&lctx->synq),
239	    ("%s: synq not empty.", __func__));
240	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
241
242	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
243	    __func__, lctx->stid, lctx, lctx->inp);
244
245	if (lctx->ce)
246		release_lip(td, lctx->ce);
247	free_stid(sc, lctx);
248	free(lctx, M_CXGBE);
249
250	return (in_pcbrele_wlocked(inp));
251}
252
253static void
254hold_lctx(struct listen_ctx *lctx)
255{
256
257	refcount_acquire(&lctx->refcount);
258}
259
260static inline uint32_t
261listen_hashfn(void *key, u_long mask)
262{
263
264	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
265}
266
267/*
268 * Add a listen_ctx entry to the listen hash table.
269 */
270static void
271listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
272{
273	struct tom_data *td = sc->tom_softc;
274	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
275
276	mtx_lock(&td->lctx_hash_lock);
277	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
278	td->lctx_count++;
279	mtx_unlock(&td->lctx_hash_lock);
280}
281
282/*
283 * Look for the listening socket's context entry in the hash and return it.
284 */
285static struct listen_ctx *
286listen_hash_find(struct adapter *sc, struct inpcb *inp)
287{
288	struct tom_data *td = sc->tom_softc;
289	int bucket = listen_hashfn(inp, td->listen_mask);
290	struct listen_ctx *lctx;
291
292	mtx_lock(&td->lctx_hash_lock);
293	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
294		if (lctx->inp == inp)
295			break;
296	}
297	mtx_unlock(&td->lctx_hash_lock);
298
299	return (lctx);
300}
301
302/*
303 * Removes the listen_ctx structure for inp from the hash and returns it.
304 */
305static struct listen_ctx *
306listen_hash_del(struct adapter *sc, struct inpcb *inp)
307{
308	struct tom_data *td = sc->tom_softc;
309	int bucket = listen_hashfn(inp, td->listen_mask);
310	struct listen_ctx *lctx, *l;
311
312	mtx_lock(&td->lctx_hash_lock);
313	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
314		if (lctx->inp == inp) {
315			LIST_REMOVE(lctx, link);
316			td->lctx_count--;
317			break;
318		}
319	}
320	mtx_unlock(&td->lctx_hash_lock);
321
322	return (lctx);
323}
324
325/*
326 * Releases a hold on the lctx.  Must be called with the listening socket's inp
327 * locked.  The inp may be freed by this function and it returns NULL to
328 * indicate this.
329 */
330static struct inpcb *
331release_lctx(struct adapter *sc, struct listen_ctx *lctx)
332{
333	struct inpcb *inp = lctx->inp;
334	int inp_freed = 0;
335
336	INP_WLOCK_ASSERT(inp);
337	if (refcount_release(&lctx->refcount))
338		inp_freed = free_lctx(sc, lctx);
339
340	return (inp_freed ? NULL : inp);
341}
342
343static void
344send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
345{
346	struct adapter *sc = tod->tod_softc;
347	struct mbuf *m = synqe->syn;
348	struct ifnet *ifp = m->m_pkthdr.rcvif;
349	struct vi_info *vi = ifp->if_softc;
350	struct port_info *pi = vi->pi;
351	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
352	struct wrqe *wr;
353	struct fw_flowc_wr *flowc;
354	struct cpl_abort_req *req;
355	int txqid, rxqid, flowclen;
356	struct sge_wrq *ofld_txq;
357	struct sge_ofld_rxq *ofld_rxq;
358	const int nparams = 6;
359	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
360
361	INP_WLOCK_ASSERT(synqe->lctx->inp);
362
363	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
364	    __func__, synqe, synqe->flags, synqe->tid,
365	    synqe->flags & TPF_ABORT_SHUTDOWN ?
366	    " (abort already in progress)" : "");
367	if (synqe->flags & TPF_ABORT_SHUTDOWN)
368		return;	/* abort already in progress */
369	synqe->flags |= TPF_ABORT_SHUTDOWN;
370
371	get_qids_from_mbuf(m, &txqid, &rxqid);
372	ofld_txq = &sc->sge.ofld_txq[txqid];
373	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
374
375	/* The wrqe will have two WRs - a flowc followed by an abort_req */
376	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
377
378	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
379	if (wr == NULL) {
380		/* XXX */
381		panic("%s: allocation failure.", __func__);
382	}
383	flowc = wrtod(wr);
384	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
385
386	/* First the flowc ... */
387	memset(flowc, 0, wr->wr_len);
388	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
389	    V_FW_FLOWC_WR_NPARAMS(nparams));
390	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
391	    V_FW_WR_FLOWID(synqe->tid));
392	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
393	flowc->mnemval[0].val = htobe32(pfvf);
394	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
395	flowc->mnemval[1].val = htobe32(pi->tx_chan);
396	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
397	flowc->mnemval[2].val = htobe32(pi->tx_chan);
398	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
399	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
400 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
401 	flowc->mnemval[4].val = htobe32(512);
402 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
403 	flowc->mnemval[5].val = htobe32(512);
404	synqe->flags |= TPF_FLOWC_WR_SENT;
405
406	/* ... then ABORT request */
407	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
408	req->rsvd0 = 0;	/* don't have a snd_nxt */
409	req->rsvd1 = 1;	/* no data sent yet */
410	req->cmd = CPL_ABORT_SEND_RST;
411
412	t4_l2t_send(sc, wr, e);
413}
414
415static int
416create_server(struct adapter *sc, struct listen_ctx *lctx)
417{
418	struct wrqe *wr;
419	struct cpl_pass_open_req *req;
420	struct inpcb *inp = lctx->inp;
421
422	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
423	if (wr == NULL) {
424		log(LOG_ERR, "%s: allocation failure", __func__);
425		return (ENOMEM);
426	}
427	req = wrtod(wr);
428
429	INIT_TP_WR(req, 0);
430	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
431	req->local_port = inp->inp_lport;
432	req->peer_port = 0;
433	req->local_ip = inp->inp_laddr.s_addr;
434	req->peer_ip = 0;
435	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
436	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
437	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
438
439	t4_wrq_tx(sc, wr);
440	return (0);
441}
442
443static int
444create_server6(struct adapter *sc, struct listen_ctx *lctx)
445{
446	struct wrqe *wr;
447	struct cpl_pass_open_req6 *req;
448	struct inpcb *inp = lctx->inp;
449
450	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
451	if (wr == NULL) {
452		log(LOG_ERR, "%s: allocation failure", __func__);
453		return (ENOMEM);
454	}
455	req = wrtod(wr);
456
457	INIT_TP_WR(req, 0);
458	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
459	req->local_port = inp->inp_lport;
460	req->peer_port = 0;
461	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
462	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
463	req->peer_ip_hi = 0;
464	req->peer_ip_lo = 0;
465	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
466	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
467	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
468
469	t4_wrq_tx(sc, wr);
470	return (0);
471}
472
473static int
474destroy_server(struct adapter *sc, struct listen_ctx *lctx)
475{
476	struct wrqe *wr;
477	struct cpl_close_listsvr_req *req;
478
479	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
480	if (wr == NULL) {
481		/* XXX */
482		panic("%s: allocation failure.", __func__);
483	}
484	req = wrtod(wr);
485
486	INIT_TP_WR(req, 0);
487	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
488	    lctx->stid));
489	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
490	req->rsvd = htobe16(0);
491
492	t4_wrq_tx(sc, wr);
493	return (0);
494}
495
496/*
497 * Start a listening server by sending a passive open request to HW.
498 *
499 * Can't take adapter lock here and access to sc->flags,
500 * sc->offload_map, if_capenable are all race prone.
501 */
502int
503t4_listen_start(struct toedev *tod, struct tcpcb *tp)
504{
505	struct adapter *sc = tod->tod_softc;
506	struct vi_info *vi;
507	struct port_info *pi;
508	struct inpcb *inp = tp->t_inpcb;
509	struct listen_ctx *lctx;
510	int i, rc, v;
511
512	INP_WLOCK_ASSERT(inp);
513
514	/* Don't start a hardware listener for any loopback address. */
515	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
516		return (0);
517	if (!(inp->inp_vflag & INP_IPV6) &&
518	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
519		return (0);
520#if 0
521	ADAPTER_LOCK(sc);
522	if (IS_BUSY(sc)) {
523		log(LOG_ERR, "%s: listen request ignored, %s is busy",
524		    __func__, device_get_nameunit(sc->dev));
525		goto done;
526	}
527
528	KASSERT(uld_active(sc, ULD_TOM),
529	    ("%s: TOM not initialized", __func__));
530#endif
531
532	/*
533	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
534	 * such VI's queues to send the passive open and receive the reply to
535	 * it.
536	 *
537	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
538	 * then reject any attempt to bring down such a port (and maybe reject
539	 * attempts to disable IFCAP_TOE on that port too?).
540	 */
541	for_each_port(sc, i) {
542		pi = sc->port[i];
543		for_each_vi(pi, v, vi) {
544			if (vi->flags & VI_INIT_DONE &&
545			    vi->ifp->if_capenable & IFCAP_TOE)
546				goto found;
547		}
548	}
549	goto done;	/* no port that's UP with IFCAP_TOE enabled */
550found:
551
552	if (listen_hash_find(sc, inp) != NULL)
553		goto done;	/* already setup */
554
555	lctx = alloc_lctx(sc, inp, vi);
556	if (lctx == NULL) {
557		log(LOG_ERR,
558		    "%s: listen request ignored, %s couldn't allocate lctx\n",
559		    __func__, device_get_nameunit(sc->dev));
560		goto done;
561	}
562	listen_hash_add(sc, lctx);
563
564	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
565	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
566	    inp->inp_vflag);
567
568	if (inp->inp_vflag & INP_IPV6)
569		rc = create_server6(sc, lctx);
570	else
571		rc = create_server(sc, lctx);
572	if (rc != 0) {
573		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
574		    __func__, device_get_nameunit(sc->dev), rc);
575		(void) listen_hash_del(sc, inp);
576		inp = release_lctx(sc, lctx);
577		/* can't be freed, host stack has a reference */
578		KASSERT(inp != NULL, ("%s: inp freed", __func__));
579		goto done;
580	}
581	lctx->flags |= LCTX_RPL_PENDING;
582done:
583#if 0
584	ADAPTER_UNLOCK(sc);
585#endif
586	return (0);
587}
588
589int
590t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
591{
592	struct listen_ctx *lctx;
593	struct adapter *sc = tod->tod_softc;
594	struct inpcb *inp = tp->t_inpcb;
595	struct synq_entry *synqe;
596
597	INP_WLOCK_ASSERT(inp);
598
599	lctx = listen_hash_del(sc, inp);
600	if (lctx == NULL)
601		return (ENOENT);	/* no hardware listener for this inp */
602
603	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
604	    lctx, lctx->flags);
605
606	/*
607	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
608	 * arrive and clean up when it does.
609	 */
610	if (lctx->flags & LCTX_RPL_PENDING) {
611		KASSERT(TAILQ_EMPTY(&lctx->synq),
612		    ("%s: synq not empty.", __func__));
613		return (EINPROGRESS);
614	}
615
616	/*
617	 * The host stack will abort all the connections on the listening
618	 * socket's so_comp.  It doesn't know about the connections on the synq
619	 * so we need to take care of those.
620	 */
621	TAILQ_FOREACH(synqe, &lctx->synq, link) {
622		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
623			send_reset_synqe(tod, synqe);
624	}
625
626	destroy_server(sc, lctx);
627	return (0);
628}
629
630static inline void
631hold_synqe(struct synq_entry *synqe)
632{
633
634	refcount_acquire(&synqe->refcnt);
635}
636
637static inline void
638release_synqe(struct synq_entry *synqe)
639{
640
641	if (refcount_release(&synqe->refcnt)) {
642		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
643
644		m_freem(synqe->syn);
645		if (needfree)
646			free(synqe, M_CXGBE);
647	}
648}
649
650void
651t4_syncache_added(struct toedev *tod __unused, void *arg)
652{
653	struct synq_entry *synqe = arg;
654
655	hold_synqe(synqe);
656}
657
658void
659t4_syncache_removed(struct toedev *tod __unused, void *arg)
660{
661	struct synq_entry *synqe = arg;
662
663	release_synqe(synqe);
664}
665
666/* XXX */
667extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
668
669int
670t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
671{
672	struct adapter *sc = tod->tod_softc;
673	struct synq_entry *synqe = arg;
674	struct wrqe *wr;
675	struct l2t_entry *e;
676	struct tcpopt to;
677	struct ip *ip = mtod(m, struct ip *);
678	struct tcphdr *th;
679
680	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
681	if (wr == NULL) {
682		m_freem(m);
683		return (EALREADY);
684	}
685
686	if (ip->ip_v == IPVERSION)
687		th = (void *)(ip + 1);
688	else
689		th = (void *)((struct ip6_hdr *)ip + 1);
690	bzero(&to, sizeof(to));
691	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
692	    TO_SYN);
693
694	/* save these for later */
695	synqe->iss = be32toh(th->th_seq);
696	synqe->ts = to.to_tsval;
697
698	if (chip_id(sc) >= CHELSIO_T5) {
699		struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);
700
701		rpl5->iss = th->th_seq;
702	}
703
704	e = &sc->l2t->l2tab[synqe->l2e_idx];
705	t4_l2t_send(sc, wr, e);
706
707	m_freem(m);	/* don't need this any more */
708	return (0);
709}
710
711static int
712do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
713    struct mbuf *m)
714{
715	struct adapter *sc = iq->adapter;
716	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
717	int stid = GET_TID(cpl);
718	unsigned int status = cpl->status;
719	struct listen_ctx *lctx = lookup_stid(sc, stid);
720	struct inpcb *inp = lctx->inp;
721#ifdef INVARIANTS
722	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
723#endif
724
725	KASSERT(opcode == CPL_PASS_OPEN_RPL,
726	    ("%s: unexpected opcode 0x%x", __func__, opcode));
727	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
728	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
729
730	INP_WLOCK(inp);
731
732	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
733	    __func__, stid, status, lctx->flags);
734
735	lctx->flags &= ~LCTX_RPL_PENDING;
736
737	if (status != CPL_ERR_NONE)
738		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
739
740#ifdef INVARIANTS
741	/*
742	 * If the inp has been dropped (listening socket closed) then
743	 * listen_stop must have run and taken the inp out of the hash.
744	 */
745	if (inp->inp_flags & INP_DROPPED) {
746		KASSERT(listen_hash_del(sc, inp) == NULL,
747		    ("%s: inp %p still in listen hash", __func__, inp));
748	}
749#endif
750
751	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
752		if (release_lctx(sc, lctx) != NULL)
753			INP_WUNLOCK(inp);
754		return (status);
755	}
756
757	/*
758	 * Listening socket stopped listening earlier and now the chip tells us
759	 * it has started the hardware listener.  Stop it; the lctx will be
760	 * released in do_close_server_rpl.
761	 */
762	if (inp->inp_flags & INP_DROPPED) {
763		destroy_server(sc, lctx);
764		INP_WUNLOCK(inp);
765		return (status);
766	}
767
768	/*
769	 * Failed to start hardware listener.  Take inp out of the hash and
770	 * release our reference on it.  An error message has been logged
771	 * already.
772	 */
773	if (status != CPL_ERR_NONE) {
774		listen_hash_del(sc, inp);
775		if (release_lctx(sc, lctx) != NULL)
776			INP_WUNLOCK(inp);
777		return (status);
778	}
779
780	/* hardware listener open for business */
781
782	INP_WUNLOCK(inp);
783	return (status);
784}
785
786static int
787do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
788    struct mbuf *m)
789{
790	struct adapter *sc = iq->adapter;
791	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
792	int stid = GET_TID(cpl);
793	unsigned int status = cpl->status;
794	struct listen_ctx *lctx = lookup_stid(sc, stid);
795	struct inpcb *inp = lctx->inp;
796#ifdef INVARIANTS
797	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
798#endif
799
800	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
801	    ("%s: unexpected opcode 0x%x", __func__, opcode));
802	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
803	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
804
805	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
806
807	if (status != CPL_ERR_NONE) {
808		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
809		    __func__, status, stid);
810		return (status);
811	}
812
813	INP_WLOCK(inp);
814	inp = release_lctx(sc, lctx);
815	if (inp != NULL)
816		INP_WUNLOCK(inp);
817
818	return (status);
819}
820
821static void
822done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
823{
824	struct listen_ctx *lctx = synqe->lctx;
825	struct inpcb *inp = lctx->inp;
826	struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc;
827	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
828
829	INP_WLOCK_ASSERT(inp);
830
831	TAILQ_REMOVE(&lctx->synq, synqe, link);
832	inp = release_lctx(sc, lctx);
833	if (inp)
834		INP_WUNLOCK(inp);
835	remove_tid(sc, synqe->tid);
836	release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]);
837	t4_l2t_release(e);
838	release_synqe(synqe);	/* removed from synq list */
839}
840
841int
842do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
843    struct mbuf *m)
844{
845	struct adapter *sc = iq->adapter;
846	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
847	unsigned int tid = GET_TID(cpl);
848	struct synq_entry *synqe = lookup_tid(sc, tid);
849	struct listen_ctx *lctx = synqe->lctx;
850	struct inpcb *inp = lctx->inp;
851	int txqid;
852	struct sge_wrq *ofld_txq;
853#ifdef INVARIANTS
854	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
855#endif
856
857	KASSERT(opcode == CPL_ABORT_REQ_RSS,
858	    ("%s: unexpected opcode 0x%x", __func__, opcode));
859	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
860	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
861
862	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
863	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
864
865	if (negative_advice(cpl->status))
866		return (0);	/* Ignore negative advice */
867
868	INP_WLOCK(inp);
869
870	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
871	ofld_txq = &sc->sge.ofld_txq[txqid];
872
873	/*
874	 * If we'd initiated an abort earlier the reply to it is responsible for
875	 * cleaning up resources.  Otherwise we tear everything down right here
876	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
877	 */
878	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
879		INP_WUNLOCK(inp);
880		goto done;
881	}
882
883	done_with_synqe(sc, synqe);
884	/* inp lock released by done_with_synqe */
885done:
886	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
887	return (0);
888}
889
890int
891do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
892    struct mbuf *m)
893{
894	struct adapter *sc = iq->adapter;
895	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
896	unsigned int tid = GET_TID(cpl);
897	struct synq_entry *synqe = lookup_tid(sc, tid);
898	struct listen_ctx *lctx = synqe->lctx;
899	struct inpcb *inp = lctx->inp;
900#ifdef INVARIANTS
901	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
902#endif
903
904	KASSERT(opcode == CPL_ABORT_RPL_RSS,
905	    ("%s: unexpected opcode 0x%x", __func__, opcode));
906	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
907	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
908
909	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
910	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
911
912	INP_WLOCK(inp);
913	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
914	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
915	    __func__, synqe, synqe->flags));
916
917	done_with_synqe(sc, synqe);
918	/* inp lock released by done_with_synqe */
919
920	return (0);
921}
922
923void
924t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
925{
926	struct adapter *sc = tod->tod_softc;
927	struct synq_entry *synqe = arg;
928#ifdef INVARIANTS
929	struct inpcb *inp = sotoinpcb(so);
930#endif
931	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
932	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
933
934	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
935	INP_WLOCK_ASSERT(inp);
936	KASSERT(synqe->flags & TPF_SYNQE,
937	    ("%s: %p not a synq_entry?", __func__, arg));
938
939	offload_socket(so, toep);
940	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
941	toep->flags |= TPF_CPL_PENDING;
942	update_tid(sc, synqe->tid, toep);
943	synqe->flags |= TPF_SYNQE_EXPANDED;
944}
945
946static inline void
947save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi)
948{
949	uint32_t txqid, rxqid;
950
951	txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
952	rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
953
954	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
955}
956
957static inline void
958get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
959{
960
961	if (txqid)
962		*txqid = m->m_pkthdr.flowid >> 16;
963	if (rxqid)
964		*rxqid = m->m_pkthdr.flowid & 0xffff;
965}
966
967/*
968 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
969 * store some state temporarily.
970 */
971static struct synq_entry *
972mbuf_to_synqe(struct mbuf *m)
973{
974	int len = roundup2(sizeof (struct synq_entry), 8);
975	int tspace = M_TRAILINGSPACE(m);
976	struct synq_entry *synqe = NULL;
977
978	if (tspace < len) {
979		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
980		if (synqe == NULL)
981			return (NULL);
982		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
983	} else {
984		synqe = (void *)(m->m_data + m->m_len + tspace - len);
985		synqe->flags = TPF_SYNQE;
986	}
987
988	return (synqe);
989}
990
991static void
992t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
993{
994	bzero(to, sizeof(*to));
995
996	if (t4opt->mss) {
997		to->to_flags |= TOF_MSS;
998		to->to_mss = be16toh(t4opt->mss);
999	}
1000
1001	if (t4opt->wsf) {
1002		to->to_flags |= TOF_SCALE;
1003		to->to_wscale = t4opt->wsf;
1004	}
1005
1006	if (t4opt->tstamp)
1007		to->to_flags |= TOF_TS;
1008
1009	if (t4opt->sack)
1010		to->to_flags |= TOF_SACKPERM;
1011}
1012
1013/*
1014 * Options2 for passive open.
1015 */
1016static uint32_t
1017calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
1018    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
1019{
1020	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
1021	uint32_t opt2;
1022
1023	opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
1024	    F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1025
1026	if (V_tcp_do_rfc1323) {
1027		if (tcpopt->tstamp)
1028			opt2 |= F_TSTAMPS_EN;
1029		if (tcpopt->sack)
1030			opt2 |= F_SACK_EN;
1031		if (tcpopt->wsf <= 14)
1032			opt2 |= F_WND_SCALE_EN;
1033	}
1034
1035	if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
1036		opt2 |= F_CCTRL_ECN;
1037
1038	/* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */
1039	if (is_t4(sc))
1040		opt2 |= F_RX_COALESCE_VALID;
1041	else {
1042		opt2 |= F_T5_OPT_2_VALID;
1043		opt2 |= F_CONG_CNTRL_VALID; /* OPT_2_ISS really, for T5 */
1044	}
1045	if (sc->tt.rx_coalesce)
1046		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1047
1048#ifdef USE_DDP_RX_FLOW_CONTROL
1049	if (ulp_mode == ULP_MODE_TCPDDP)
1050		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
1051#endif
1052
1053	return htobe32(opt2);
1054}
1055
1056static void
1057pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
1058    struct in_conninfo *inc, struct tcphdr *th)
1059{
1060	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1061	const struct ether_header *eh;
1062	unsigned int hlen = be32toh(cpl->hdr_len);
1063	uintptr_t l3hdr;
1064	const struct tcphdr *tcp;
1065
1066	eh = (const void *)(cpl + 1);
1067	if (chip_id(sc) >= CHELSIO_T6) {
1068		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
1069		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
1070	} else {
1071		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1072		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1073	}
1074
1075	if (inc) {
1076		bzero(inc, sizeof(*inc));
1077		inc->inc_fport = tcp->th_sport;
1078		inc->inc_lport = tcp->th_dport;
1079		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1080			const struct ip *ip = (const void *)l3hdr;
1081
1082			inc->inc_faddr = ip->ip_src;
1083			inc->inc_laddr = ip->ip_dst;
1084		} else {
1085			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1086
1087			inc->inc_flags |= INC_ISIPV6;
1088			inc->inc6_faddr = ip6->ip6_src;
1089			inc->inc6_laddr = ip6->ip6_dst;
1090		}
1091	}
1092
1093	if (th) {
1094		bcopy(tcp, th, sizeof(*th));
1095		tcp_fields_to_host(th);		/* just like tcp_input */
1096	}
1097}
1098
1099static int
1100ifnet_has_ip6(struct ifnet *ifp, struct in6_addr *ip6)
1101{
1102	struct ifaddr *ifa;
1103	struct sockaddr_in6 *sin6;
1104	int found = 0;
1105	struct in6_addr in6 = *ip6;
1106
1107	/* Just as in ip6_input */
1108	if (in6_clearscope(&in6) || in6_clearscope(&in6))
1109		return (0);
1110	in6_setscope(&in6, ifp, NULL);
1111
1112	if_addr_rlock(ifp);
1113	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1114		sin6 = (void *)ifa->ifa_addr;
1115		if (sin6->sin6_family != AF_INET6)
1116			continue;
1117
1118		if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &in6)) {
1119			found = 1;
1120			break;
1121		}
1122	}
1123	if_addr_runlock(ifp);
1124
1125	return (found);
1126}
1127
1128static struct l2t_entry *
1129get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1130    struct in_conninfo *inc)
1131{
1132	struct rtentry *rt;
1133	struct l2t_entry *e;
1134	struct sockaddr_in6 sin6;
1135	struct sockaddr *dst = (void *)&sin6;
1136
1137	if (inc->inc_flags & INC_ISIPV6) {
1138		dst->sa_len = sizeof(struct sockaddr_in6);
1139		dst->sa_family = AF_INET6;
1140		((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
1141
1142		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1143			/* no need for route lookup */
1144			e = t4_l2t_get(pi, ifp, dst);
1145			return (e);
1146		}
1147	} else {
1148		dst->sa_len = sizeof(struct sockaddr_in);
1149		dst->sa_family = AF_INET;
1150		((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
1151	}
1152
1153	rt = rtalloc1(dst, 0, 0);
1154	if (rt == NULL)
1155		return (NULL);
1156	else {
1157		struct sockaddr *nexthop;
1158
1159		RT_UNLOCK(rt);
1160		if (rt->rt_ifp != ifp)
1161			e = NULL;
1162		else {
1163			if (rt->rt_flags & RTF_GATEWAY)
1164				nexthop = rt->rt_gateway;
1165			else
1166				nexthop = dst;
1167			e = t4_l2t_get(pi, ifp, nexthop);
1168		}
1169		RTFREE(rt);
1170	}
1171
1172	return (e);
1173}
1174
1175static int
1176ifnet_has_ip(struct ifnet *ifp, struct in_addr in)
1177{
1178	struct ifaddr *ifa;
1179	struct sockaddr_in *sin;
1180	int found = 0;
1181
1182	if_addr_rlock(ifp);
1183	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1184		sin = (void *)ifa->ifa_addr;
1185		if (sin->sin_family != AF_INET)
1186			continue;
1187
1188		if (sin->sin_addr.s_addr == in.s_addr) {
1189			found = 1;
1190			break;
1191		}
1192	}
1193	if_addr_runlock(ifp);
1194
1195	return (found);
1196}
1197
/*
 * Give up on offloading the SYN being processed: remember the source line of
 * the invocation in reject_reason and jump to the reject label.  Both
 * reject_reason and the reject label must exist in the function that uses
 * this macro.
 */
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags,
 * so the flags member must sit at the same offset in both structures.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1208
/*
 * Incoming SYN on a listening socket.
 *
 * Handler for CPL_PASS_ACCEPT_REQ.  m carries the CPL followed by the SYN
 * itself.  The SYN is either accepted for offload (a synq entry is created and
 * handed to the syncache via toe_syncache_add) or rejected, in which case the
 * tid is released and, if the mbuf is still around, the SYN is passed to the
 * ifnet's if_input with the CPL stripped off.
 *
 * Returns 0 when the SYN was accepted and replied to.  Every other outcome
 * returns the source line number at which processing was abandoned.
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct vi_info *vi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason, v;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Rebuild the 4-tuple, TCP header and TCP options from the CPL + SYN. */
	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];

	/*
	 * Use the MAC index to lookup the associated VI.  If this SYN
	 * didn't match a perfect MAC filter, punt.
	 *
	 * hw_ifp has not been set yet on these two early exits, so the mbuf
	 * is freed here and m is cleared; the reject path only touches hw_ifp
	 * when m is non-NULL.
	 */
	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
		m_freem(m);
		m = NULL;
		REJECT_PASS_ACCEPT();
	}
	for_each_vi(pi, v, vi) {
		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
			goto found;
	}
	m_freem(m);
	m = NULL;
	REJECT_PASS_ACCEPT();

found:
	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!ifnet_has_ip6(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!ifnet_has_ip(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();
	}

	/* From here on the reject path has an L2T entry to release. */
	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* The reply is larger on anything newer than T4. */
	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	inp = lctx->inp;		/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;
	CURVNET_SET(so->so_vnet);

	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Queue ids are stashed in the mbuf; do_pass_establish reads them. */
	save_qids_in_mbuf(m, vi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	if (is_t4(sc))
		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	else {
		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
	}
	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
		ulp_mode = ULP_MODE_TCPDDP;
		synqe->flags |= TPF_SYNQE_TCPDDP;
	} else
		ulp_mode = ULP_MODE_NONE;
	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);

	/* The synqe takes over the SYN mbuf; m is cleared to reflect that. */
	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	m = NULL;
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Note that syncache_add releases the pcb lock.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	CURVNET_RESTORE();

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 *
		 * If the copy fails m stays NULL and the reject path simply
		 * does not pass the SYN up the stack.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		remove_tid(sc, synqe->tid);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	/* Undo what was acquired: the L2T entry (if any) and the tid. */
	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	/*
	 * If the SYN is still available, trim the CPL off the front, mark the
	 * checksums as already verified, and pass it to the ifnet's input
	 * routine.
	 */
	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}
1488
1489static void
1490synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1491    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1492    struct tcphdr *th, struct tcpopt *to)
1493{
1494	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1495
1496	/* start off with the original SYN */
1497	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);
1498
1499	/* modify parts to make it look like the ACK to our SYN|ACK */
1500	th->th_flags = TH_ACK;
1501	th->th_ack = synqe->iss + 1;
1502	th->th_seq = be32toh(cpl->rcv_isn);
1503	bzero(to, sizeof(*to));
1504	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1505		to->to_flags |= TOF_TS;
1506		to->to_tsecr = synqe->ts;
1507	}
1508}
1509
/*
 * Handler for CPL_PASS_ESTABLISH: the handshake for the tid of a synq entry
 * has completed.  No payload mbuf is expected (m must be NULL on entry).
 *
 * Allocates a toepcb for the connection and asks the syncache to expand the
 * entry into a full socket.  If the toepcb cannot be allocated or the expand
 * fails, the tid is reset via send_reset_synqe.  If the listening socket has
 * already been dropped nothing is done here.
 *
 * Always returns 0.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct vi_info *vi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	/* Listening socket is gone: nothing to establish, just unlock. */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		return (0);
	}

	/* rcvif of the saved SYN was set to the hardware ifnet earlier. */
	ifp = synqe->syn->m_pkthdr.rcvif;
	vi = ifp->if_softc;
	KASSERT(vi->pi->adapter == sc,
	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 *
		 * This label is also the target of the goto taken when
		 * toe_syncache_expand fails below (the toepcb is freed first).
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_tcpddp_ulp_mode(toep);
	else
		toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	/* The toep pointer is stored immediately after the copied CPL. */
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	/* so is in/out: on success it refers to the new connection's socket. */
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	/* release_lctx returns the inp only if it still needs unlocking. */
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}
1647
1648void
1649t4_init_listen_cpl_handlers(void)
1650{
1651
1652	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1653	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1654	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1655	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1656}
1657#endif
1658