1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD$");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/protosw.h>
41#include <sys/refcount.h>
42#include <sys/domain.h>
43#include <sys/fnv_hash.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <net/ethernet.h>
47#include <net/if.h>
48#include <net/if_types.h>
49#include <net/if_vlan_var.h>
50#include <net/route.h>
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/ip.h>
54#include <netinet/ip6.h>
55#include <netinet6/scope6_var.h>
56#include <netinet/tcp_timer.h>
57#include <netinet/tcp_var.h>
58#define TCPSTATES
59#include <netinet/tcp_fsm.h>
60#include <netinet/toecore.h>
61
62#include "common/common.h"
63#include "common/t4_msg.h"
64#include "common/t4_regs.h"
65#include "tom/t4_tom_l2t.h"
66#include "tom/t4_tom.h"
67
/* Server TID (stid) allocation, lookup, and release. */
static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx,
    int isipv6);
static struct listen_ctx *lookup_stid(struct adapter *sc, int stid);
static void free_stid(struct adapter *sc, struct listen_ctx *lctx);

/* Listen context (lctx) lifetime and listen hash table maintenance. */
static struct listen_ctx *alloc_lctx(struct adapter *sc, struct inpcb *inp,
    struct port_info *pi);
static int free_lctx(struct adapter *sc, struct listen_ctx *lctx);
static void hold_lctx(struct listen_ctx *lctx);
static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx);
static struct listen_ctx *listen_hash_find(struct adapter *sc,
    struct inpcb *inp);
static struct listen_ctx *listen_hash_del(struct adapter *sc,
    struct inpcb *inp);
static struct inpcb *release_lctx(struct adapter *sc,
    struct listen_ctx *lctx);

/* Queue id stash in the SYN mbuf, and synq entry abort. */
static inline void save_qids_in_mbuf(struct mbuf *m, struct port_info *pi);
static inline void get_qids_from_mbuf(struct mbuf *m, int *txqid,
    int *rxqid);
static void send_reset_synqe(struct toedev *tod, struct synq_entry *synqe);
86
87static int
88alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
89{
90	struct tid_info *t = &sc->tids;
91	u_int stid, n, f, mask;
92	struct stid_region *sr = &lctx->stid_region;
93
94	/*
95	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
96	 * the TCAM.  The start of the stid region is properly aligned (the chip
97	 * requires each region to be 128-cell aligned).
98	 */
99	n = isipv6 ? 2 : 1;
100	mask = n - 1;
101	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
102	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
103	    __func__, t->stid_base, t->nstids, n));
104
105	mtx_lock(&t->stid_lock);
106	if (n > t->nstids - t->stids_in_use) {
107		mtx_unlock(&t->stid_lock);
108		return (-1);
109	}
110
111	if (t->nstids_free_head >= n) {
112		/*
113		 * This allocation will definitely succeed because the region
114		 * starts at a good alignment and we just checked we have enough
115		 * stids free.
116		 */
117		f = t->nstids_free_head & mask;
118		t->nstids_free_head -= n + f;
119		stid = t->nstids_free_head;
120		TAILQ_INSERT_HEAD(&t->stids, sr, link);
121	} else {
122		struct stid_region *s;
123
124		stid = t->nstids_free_head;
125		TAILQ_FOREACH(s, &t->stids, link) {
126			stid += s->used + s->free;
127			f = stid & mask;
128			if (s->free >= n + f) {
129				stid -= n + f;
130				s->free -= n + f;
131				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
132				goto allocated;
133			}
134		}
135
136		if (__predict_false(stid != t->nstids)) {
137			panic("%s: stids TAILQ (%p) corrupt."
138			    "  At %d instead of %d at the end of the queue.",
139			    __func__, &t->stids, stid, t->nstids);
140		}
141
142		mtx_unlock(&t->stid_lock);
143		return (-1);
144	}
145
146allocated:
147	sr->used = n;
148	sr->free = f;
149	t->stids_in_use += n;
150	t->stid_tab[stid] = lctx;
151	mtx_unlock(&t->stid_lock);
152
153	KASSERT(((stid + t->stid_base) & mask) == 0,
154	    ("%s: EDOOFUS.", __func__));
155	return (stid + t->stid_base);
156}
157
158static struct listen_ctx *
159lookup_stid(struct adapter *sc, int stid)
160{
161	struct tid_info *t = &sc->tids;
162
163	return (t->stid_tab[stid - t->stid_base]);
164}
165
166static void
167free_stid(struct adapter *sc, struct listen_ctx *lctx)
168{
169	struct tid_info *t = &sc->tids;
170	struct stid_region *sr = &lctx->stid_region;
171	struct stid_region *s;
172
173	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
174
175	mtx_lock(&t->stid_lock);
176	s = TAILQ_PREV(sr, stid_head, link);
177	if (s != NULL)
178		s->free += sr->used + sr->free;
179	else
180		t->nstids_free_head += sr->used + sr->free;
181	KASSERT(t->stids_in_use >= sr->used,
182	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
183	    t->stids_in_use, sr->used));
184	t->stids_in_use -= sr->used;
185	TAILQ_REMOVE(&t->stids, sr, link);
186	mtx_unlock(&t->stid_lock);
187}
188
189static struct listen_ctx *
190alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi)
191{
192	struct listen_ctx *lctx;
193
194	INP_WLOCK_ASSERT(inp);
195
196	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
197	if (lctx == NULL)
198		return (NULL);
199
200	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
201	if (lctx->stid < 0) {
202		free(lctx, M_CXGBE);
203		return (NULL);
204	}
205
206	lctx->ctrlq = &sc->sge.ctrlq[pi->port_id];
207	lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq];
208	refcount_init(&lctx->refcount, 1);
209	TAILQ_INIT(&lctx->synq);
210
211	lctx->inp = inp;
212	in_pcbref(inp);
213
214	return (lctx);
215}
216
217/* Don't call this directly, use release_lctx instead */
218static int
219free_lctx(struct adapter *sc, struct listen_ctx *lctx)
220{
221	struct inpcb *inp = lctx->inp;
222
223	INP_WLOCK_ASSERT(inp);
224	KASSERT(lctx->refcount == 0,
225	    ("%s: refcount %d", __func__, lctx->refcount));
226	KASSERT(TAILQ_EMPTY(&lctx->synq),
227	    ("%s: synq not empty.", __func__));
228	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
229
230	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
231	    __func__, lctx->stid, lctx, lctx->inp);
232
233	free_stid(sc, lctx);
234	free(lctx, M_CXGBE);
235
236	return (in_pcbrele_wlocked(inp));
237}
238
239static void
240hold_lctx(struct listen_ctx *lctx)
241{
242
243	refcount_acquire(&lctx->refcount);
244}
245
246static inline uint32_t
247listen_hashfn(void *key, u_long mask)
248{
249
250	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
251}
252
253/*
254 * Add a listen_ctx entry to the listen hash table.
255 */
256static void
257listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
258{
259	struct tom_data *td = sc->tom_softc;
260	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
261
262	mtx_lock(&td->lctx_hash_lock);
263	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
264	td->lctx_count++;
265	mtx_unlock(&td->lctx_hash_lock);
266}
267
268/*
269 * Look for the listening socket's context entry in the hash and return it.
270 */
271static struct listen_ctx *
272listen_hash_find(struct adapter *sc, struct inpcb *inp)
273{
274	struct tom_data *td = sc->tom_softc;
275	int bucket = listen_hashfn(inp, td->listen_mask);
276	struct listen_ctx *lctx;
277
278	mtx_lock(&td->lctx_hash_lock);
279	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
280		if (lctx->inp == inp)
281			break;
282	}
283	mtx_unlock(&td->lctx_hash_lock);
284
285	return (lctx);
286}
287
288/*
289 * Removes the listen_ctx structure for inp from the hash and returns it.
290 */
291static struct listen_ctx *
292listen_hash_del(struct adapter *sc, struct inpcb *inp)
293{
294	struct tom_data *td = sc->tom_softc;
295	int bucket = listen_hashfn(inp, td->listen_mask);
296	struct listen_ctx *lctx, *l;
297
298	mtx_lock(&td->lctx_hash_lock);
299	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
300		if (lctx->inp == inp) {
301			LIST_REMOVE(lctx, link);
302			td->lctx_count--;
303			break;
304		}
305	}
306	mtx_unlock(&td->lctx_hash_lock);
307
308	return (lctx);
309}
310
311/*
312 * Releases a hold on the lctx.  Must be called with the listening socket's inp
313 * locked.  The inp may be freed by this function and it returns NULL to
314 * indicate this.
315 */
316static struct inpcb *
317release_lctx(struct adapter *sc, struct listen_ctx *lctx)
318{
319	struct inpcb *inp = lctx->inp;
320	int inp_freed = 0;
321
322	INP_WLOCK_ASSERT(inp);
323	if (refcount_release(&lctx->refcount))
324		inp_freed = free_lctx(sc, lctx);
325
326	return (inp_freed ? NULL : inp);
327}
328
329static void
330send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
331{
332	struct adapter *sc = tod->tod_softc;
333	struct mbuf *m = synqe->syn;
334	struct ifnet *ifp = m->m_pkthdr.rcvif;
335	struct port_info *pi = ifp->if_softc;
336	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
337	struct wrqe *wr;
338	struct fw_flowc_wr *flowc;
339	struct cpl_abort_req *req;
340	int txqid, rxqid, flowclen;
341	struct sge_wrq *ofld_txq;
342	struct sge_ofld_rxq *ofld_rxq;
343	const int nparams = 6;
344	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
345
346	INP_WLOCK_ASSERT(synqe->lctx->inp);
347
348	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
349	    __func__, synqe, synqe->flags, synqe->tid,
350	    synqe->flags & TPF_ABORT_SHUTDOWN ?
351	    " (abort already in progress)" : "");
352	if (synqe->flags & TPF_ABORT_SHUTDOWN)
353		return;	/* abort already in progress */
354	synqe->flags |= TPF_ABORT_SHUTDOWN;
355
356	get_qids_from_mbuf(m, &txqid, &rxqid);
357	ofld_txq = &sc->sge.ofld_txq[txqid];
358	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
359
360	/* The wrqe will have two WRs - a flowc followed by an abort_req */
361	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
362
363	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
364	if (wr == NULL) {
365		/* XXX */
366		panic("%s: allocation failure.", __func__);
367	}
368	flowc = wrtod(wr);
369	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
370
371	/* First the flowc ... */
372	memset(flowc, 0, wr->wr_len);
373	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
374	    V_FW_FLOWC_WR_NPARAMS(nparams));
375	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
376	    V_FW_WR_FLOWID(synqe->tid));
377	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
378	flowc->mnemval[0].val = htobe32(pfvf);
379	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
380	flowc->mnemval[1].val = htobe32(pi->tx_chan);
381	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
382	flowc->mnemval[2].val = htobe32(pi->tx_chan);
383	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
384	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
385 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
386 	flowc->mnemval[4].val = htobe32(512);
387 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
388 	flowc->mnemval[5].val = htobe32(512);
389	synqe->flags |= TPF_FLOWC_WR_SENT;
390
391	/* ... then ABORT request */
392	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
393	req->rsvd0 = 0;	/* don't have a snd_nxt */
394	req->rsvd1 = 1;	/* no data sent yet */
395	req->cmd = CPL_ABORT_SEND_RST;
396
397	t4_l2t_send(sc, wr, e);
398}
399
400static int
401create_server(struct adapter *sc, struct listen_ctx *lctx)
402{
403	struct wrqe *wr;
404	struct cpl_pass_open_req *req;
405	struct inpcb *inp = lctx->inp;
406
407	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
408	if (wr == NULL) {
409		log(LOG_ERR, "%s: allocation failure", __func__);
410		return (ENOMEM);
411	}
412	req = wrtod(wr);
413
414	INIT_TP_WR(req, 0);
415	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
416	req->local_port = inp->inp_lport;
417	req->peer_port = 0;
418	req->local_ip = inp->inp_laddr.s_addr;
419	req->peer_ip = 0;
420	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
421	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
422	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
423
424	t4_wrq_tx(sc, wr);
425	return (0);
426}
427
428static int
429create_server6(struct adapter *sc, struct listen_ctx *lctx)
430{
431	struct wrqe *wr;
432	struct cpl_pass_open_req6 *req;
433	struct inpcb *inp = lctx->inp;
434
435	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
436	if (wr == NULL) {
437		log(LOG_ERR, "%s: allocation failure", __func__);
438		return (ENOMEM);
439	}
440	req = wrtod(wr);
441
442	INIT_TP_WR(req, 0);
443	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
444	req->local_port = inp->inp_lport;
445	req->peer_port = 0;
446	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
447	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
448	req->peer_ip_hi = 0;
449	req->peer_ip_lo = 0;
450	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
451	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
452	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
453
454	t4_wrq_tx(sc, wr);
455	return (0);
456}
457
458static int
459destroy_server(struct adapter *sc, struct listen_ctx *lctx)
460{
461	struct wrqe *wr;
462	struct cpl_close_listsvr_req *req;
463
464	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
465	if (wr == NULL) {
466		/* XXX */
467		panic("%s: allocation failure.", __func__);
468	}
469	req = wrtod(wr);
470
471	INIT_TP_WR(req, 0);
472	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
473	    lctx->stid));
474	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
475	req->rsvd = htobe16(0);
476
477	t4_wrq_tx(sc, wr);
478	return (0);
479}
480
481/*
482 * Start a listening server by sending a passive open request to HW.
483 *
484 * Can't take adapter lock here and access to sc->flags, sc->open_device_map,
485 * sc->offload_map, if_capenable are all race prone.
486 */
487int
488t4_listen_start(struct toedev *tod, struct tcpcb *tp)
489{
490	struct adapter *sc = tod->tod_softc;
491	struct port_info *pi;
492	struct inpcb *inp = tp->t_inpcb;
493	struct listen_ctx *lctx;
494	int i, rc;
495
496	INP_WLOCK_ASSERT(inp);
497
498#if 0
499	ADAPTER_LOCK(sc);
500	if (IS_BUSY(sc)) {
501		log(LOG_ERR, "%s: listen request ignored, %s is busy",
502		    __func__, device_get_nameunit(sc->dev));
503		goto done;
504	}
505
506	KASSERT(sc->flags & TOM_INIT_DONE,
507	    ("%s: TOM not initialized", __func__));
508#endif
509
510	if ((sc->open_device_map & sc->offload_map) == 0)
511		goto done;	/* no port that's UP with IFCAP_TOE enabled */
512
513	/*
514	 * Find a running port with IFCAP_TOE (4 or 6).  We'll use the first
515	 * such port's queues to send the passive open and receive the reply to
516	 * it.
517	 *
518	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
519	 * then reject any attempt to bring down such a port (and maybe reject
520	 * attempts to disable IFCAP_TOE on that port too?).
521	 */
522	for_each_port(sc, i) {
523		if (isset(&sc->open_device_map, i) &&
524		    sc->port[i]->ifp->if_capenable & IFCAP_TOE)
525				break;
526	}
527	KASSERT(i < sc->params.nports,
528	    ("%s: no running port with TOE capability enabled.", __func__));
529	pi = sc->port[i];
530
531	if (listen_hash_find(sc, inp) != NULL)
532		goto done;	/* already setup */
533
534	lctx = alloc_lctx(sc, inp, pi);
535	if (lctx == NULL) {
536		log(LOG_ERR,
537		    "%s: listen request ignored, %s couldn't allocate lctx\n",
538		    __func__, device_get_nameunit(sc->dev));
539		goto done;
540	}
541	listen_hash_add(sc, lctx);
542
543	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
544	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
545	    inp->inp_vflag);
546
547	if (inp->inp_vflag & INP_IPV6)
548		rc = create_server6(sc, lctx);
549	else
550		rc = create_server(sc, lctx);
551	if (rc != 0) {
552		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
553		    __func__, device_get_nameunit(sc->dev), rc);
554		(void) listen_hash_del(sc, inp);
555		inp = release_lctx(sc, lctx);
556		/* can't be freed, host stack has a reference */
557		KASSERT(inp != NULL, ("%s: inp freed", __func__));
558		goto done;
559	}
560	lctx->flags |= LCTX_RPL_PENDING;
561done:
562#if 0
563	ADAPTER_UNLOCK(sc);
564#endif
565	return (0);
566}
567
568int
569t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
570{
571	struct listen_ctx *lctx;
572	struct adapter *sc = tod->tod_softc;
573	struct inpcb *inp = tp->t_inpcb;
574	struct synq_entry *synqe;
575
576	INP_WLOCK_ASSERT(inp);
577
578	lctx = listen_hash_del(sc, inp);
579	if (lctx == NULL)
580		return (ENOENT);	/* no hardware listener for this inp */
581
582	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
583	    lctx, lctx->flags);
584
585	/*
586	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
587	 * arrive and clean up when it does.
588	 */
589	if (lctx->flags & LCTX_RPL_PENDING) {
590		KASSERT(TAILQ_EMPTY(&lctx->synq),
591		    ("%s: synq not empty.", __func__));
592		return (EINPROGRESS);
593	}
594
595	/*
596	 * The host stack will abort all the connections on the listening
597	 * socket's so_comp.  It doesn't know about the connections on the synq
598	 * so we need to take care of those.
599	 */
600	TAILQ_FOREACH(synqe, &lctx->synq, link) {
601		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
602			send_reset_synqe(tod, synqe);
603	}
604
605	destroy_server(sc, lctx);
606	return (0);
607}
608
609static inline void
610hold_synqe(struct synq_entry *synqe)
611{
612
613	refcount_acquire(&synqe->refcnt);
614}
615
616static inline void
617release_synqe(struct synq_entry *synqe)
618{
619
620	if (refcount_release(&synqe->refcnt)) {
621		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
622
623		m_freem(synqe->syn);
624		if (needfree)
625			free(synqe, M_CXGBE);
626	}
627}
628
629void
630t4_syncache_added(struct toedev *tod __unused, void *arg)
631{
632	struct synq_entry *synqe = arg;
633
634	hold_synqe(synqe);
635}
636
637void
638t4_syncache_removed(struct toedev *tod __unused, void *arg)
639{
640	struct synq_entry *synqe = arg;
641
642	release_synqe(synqe);
643}
644
645/* XXX */
646extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
647
648int
649t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
650{
651	struct adapter *sc = tod->tod_softc;
652	struct synq_entry *synqe = arg;
653	struct wrqe *wr;
654	struct l2t_entry *e;
655	struct tcpopt to;
656	struct ip *ip = mtod(m, struct ip *);
657	struct tcphdr *th;
658
659	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
660	if (wr == NULL) {
661		m_freem(m);
662		return (EALREADY);
663	}
664
665	if (ip->ip_v == IPVERSION)
666		th = (void *)(ip + 1);
667	else
668		th = (void *)((struct ip6_hdr *)ip + 1);
669	bzero(&to, sizeof(to));
670	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
671	    TO_SYN);
672
673	/* save these for later */
674	synqe->iss = be32toh(th->th_seq);
675	synqe->ts = to.to_tsval;
676
677	if (is_t5(sc)) {
678		struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);
679
680		rpl5->iss = th->th_seq;
681	}
682
683	e = &sc->l2t->l2tab[synqe->l2e_idx];
684	t4_l2t_send(sc, wr, e);
685
686	m_freem(m);	/* don't need this any more */
687	return (0);
688}
689
690static int
691do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
692    struct mbuf *m)
693{
694	struct adapter *sc = iq->adapter;
695	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
696	int stid = GET_TID(cpl);
697	unsigned int status = cpl->status;
698	struct listen_ctx *lctx = lookup_stid(sc, stid);
699	struct inpcb *inp = lctx->inp;
700#ifdef INVARIANTS
701	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
702#endif
703
704	KASSERT(opcode == CPL_PASS_OPEN_RPL,
705	    ("%s: unexpected opcode 0x%x", __func__, opcode));
706	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
707	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
708
709	INP_WLOCK(inp);
710
711	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
712	    __func__, stid, status, lctx->flags);
713
714	lctx->flags &= ~LCTX_RPL_PENDING;
715
716	if (status != CPL_ERR_NONE)
717		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
718
719#ifdef INVARIANTS
720	/*
721	 * If the inp has been dropped (listening socket closed) then
722	 * listen_stop must have run and taken the inp out of the hash.
723	 */
724	if (inp->inp_flags & INP_DROPPED) {
725		KASSERT(listen_hash_del(sc, inp) == NULL,
726		    ("%s: inp %p still in listen hash", __func__, inp));
727	}
728#endif
729
730	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
731		if (release_lctx(sc, lctx) != NULL)
732			INP_WUNLOCK(inp);
733		return (status);
734	}
735
736	/*
737	 * Listening socket stopped listening earlier and now the chip tells us
738	 * it has started the hardware listener.  Stop it; the lctx will be
739	 * released in do_close_server_rpl.
740	 */
741	if (inp->inp_flags & INP_DROPPED) {
742		destroy_server(sc, lctx);
743		INP_WUNLOCK(inp);
744		return (status);
745	}
746
747	/*
748	 * Failed to start hardware listener.  Take inp out of the hash and
749	 * release our reference on it.  An error message has been logged
750	 * already.
751	 */
752	if (status != CPL_ERR_NONE) {
753		listen_hash_del(sc, inp);
754		if (release_lctx(sc, lctx) != NULL)
755			INP_WUNLOCK(inp);
756		return (status);
757	}
758
759	/* hardware listener open for business */
760
761	INP_WUNLOCK(inp);
762	return (status);
763}
764
765static int
766do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
767    struct mbuf *m)
768{
769	struct adapter *sc = iq->adapter;
770	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
771	int stid = GET_TID(cpl);
772	unsigned int status = cpl->status;
773	struct listen_ctx *lctx = lookup_stid(sc, stid);
774	struct inpcb *inp = lctx->inp;
775#ifdef INVARIANTS
776	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
777#endif
778
779	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
780	    ("%s: unexpected opcode 0x%x", __func__, opcode));
781	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
782	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
783
784	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
785
786	if (status != CPL_ERR_NONE) {
787		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
788		    __func__, status, stid);
789		return (status);
790	}
791
792	INP_WLOCK(inp);
793	inp = release_lctx(sc, lctx);
794	if (inp != NULL)
795		INP_WUNLOCK(inp);
796
797	return (status);
798}
799
800static void
801done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
802{
803	struct listen_ctx *lctx = synqe->lctx;
804	struct inpcb *inp = lctx->inp;
805	struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc;
806	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
807
808	INP_WLOCK_ASSERT(inp);
809
810	TAILQ_REMOVE(&lctx->synq, synqe, link);
811	inp = release_lctx(sc, lctx);
812	if (inp)
813		INP_WUNLOCK(inp);
814	remove_tid(sc, synqe->tid);
815	release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]);
816	t4_l2t_release(e);
817	release_synqe(synqe);	/* removed from synq list */
818}
819
820int
821do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
822    struct mbuf *m)
823{
824	struct adapter *sc = iq->adapter;
825	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
826	unsigned int tid = GET_TID(cpl);
827	struct synq_entry *synqe = lookup_tid(sc, tid);
828	struct listen_ctx *lctx = synqe->lctx;
829	struct inpcb *inp = lctx->inp;
830	int txqid;
831	struct sge_wrq *ofld_txq;
832#ifdef INVARIANTS
833	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
834#endif
835
836	KASSERT(opcode == CPL_ABORT_REQ_RSS,
837	    ("%s: unexpected opcode 0x%x", __func__, opcode));
838	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
839	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
840
841	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
842	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
843
844	if (negative_advice(cpl->status))
845		return (0);	/* Ignore negative advice */
846
847	INP_WLOCK(inp);
848
849	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
850	ofld_txq = &sc->sge.ofld_txq[txqid];
851
852	/*
853	 * If we'd initiated an abort earlier the reply to it is responsible for
854	 * cleaning up resources.  Otherwise we tear everything down right here
855	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
856	 */
857	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
858		INP_WUNLOCK(inp);
859		goto done;
860	}
861
862	done_with_synqe(sc, synqe);
863	/* inp lock released by done_with_synqe */
864done:
865	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
866	return (0);
867}
868
869int
870do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
871    struct mbuf *m)
872{
873	struct adapter *sc = iq->adapter;
874	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
875	unsigned int tid = GET_TID(cpl);
876	struct synq_entry *synqe = lookup_tid(sc, tid);
877	struct listen_ctx *lctx = synqe->lctx;
878	struct inpcb *inp = lctx->inp;
879#ifdef INVARIANTS
880	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
881#endif
882
883	KASSERT(opcode == CPL_ABORT_RPL_RSS,
884	    ("%s: unexpected opcode 0x%x", __func__, opcode));
885	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
886	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
887
888	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
889	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
890
891	INP_WLOCK(inp);
892	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
893	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
894	    __func__, synqe, synqe->flags));
895
896	done_with_synqe(sc, synqe);
897	/* inp lock released by done_with_synqe */
898
899	return (0);
900}
901
902void
903t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
904{
905	struct adapter *sc = tod->tod_softc;
906	struct synq_entry *synqe = arg;
907#ifdef INVARIANTS
908	struct inpcb *inp = sotoinpcb(so);
909#endif
910	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
911	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
912
913	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
914	INP_WLOCK_ASSERT(inp);
915	KASSERT(synqe->flags & TPF_SYNQE,
916	    ("%s: %p not a synq_entry?", __func__, arg));
917
918	offload_socket(so, toep);
919	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
920	toep->flags |= TPF_CPL_PENDING;
921	update_tid(sc, synqe->tid, toep);
922	synqe->flags |= TPF_SYNQE_EXPANDED;
923}
924
925static inline void
926save_qids_in_mbuf(struct mbuf *m, struct port_info *pi)
927{
928	uint32_t txqid, rxqid;
929
930	txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
931	rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
932
933	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
934}
935
936static inline void
937get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
938{
939
940	if (txqid)
941		*txqid = m->m_pkthdr.flowid >> 16;
942	if (rxqid)
943		*rxqid = m->m_pkthdr.flowid & 0xffff;
944}
945
946/*
947 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
948 * store some state temporarily.
949 */
950static struct synq_entry *
951mbuf_to_synqe(struct mbuf *m)
952{
953	int len = roundup2(sizeof (struct synq_entry), 8);
954	int tspace = M_TRAILINGSPACE(m);
955	struct synq_entry *synqe = NULL;
956
957	if (tspace < len) {
958		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
959		if (synqe == NULL)
960			return (NULL);
961		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
962	} else {
963		synqe = (void *)(m->m_data + m->m_len + tspace - len);
964		synqe->flags = TPF_SYNQE;
965	}
966
967	return (synqe);
968}
969
970static void
971t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
972{
973	bzero(to, sizeof(*to));
974
975	if (t4opt->mss) {
976		to->to_flags |= TOF_MSS;
977		to->to_mss = be16toh(t4opt->mss);
978	}
979
980	if (t4opt->wsf) {
981		to->to_flags |= TOF_SCALE;
982		to->to_wscale = t4opt->wsf;
983	}
984
985	if (t4opt->tstamp)
986		to->to_flags |= TOF_TS;
987
988	if (t4opt->sack)
989		to->to_flags |= TOF_SACKPERM;
990}
991
992/*
993 * Options2 for passive open.
994 */
995static uint32_t
996calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
997    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
998{
999	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
1000	uint32_t opt2;
1001
1002	opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
1003	    F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1004
1005	if (V_tcp_do_rfc1323) {
1006		if (tcpopt->tstamp)
1007			opt2 |= F_TSTAMPS_EN;
1008		if (tcpopt->sack)
1009			opt2 |= F_SACK_EN;
1010		if (tcpopt->wsf <= 14)
1011			opt2 |= F_WND_SCALE_EN;
1012	}
1013
1014	if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
1015		opt2 |= F_CCTRL_ECN;
1016
1017	/* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */
1018	if (is_t4(sc))
1019		opt2 |= F_RX_COALESCE_VALID;
1020	else {
1021		opt2 |= F_T5_OPT_2_VALID;
1022		opt2 |= F_CONG_CNTRL_VALID; /* OPT_2_ISS really, for T5 */
1023	}
1024	if (sc->tt.rx_coalesce)
1025		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1026
1027#ifdef USE_DDP_RX_FLOW_CONTROL
1028	if (ulp_mode == ULP_MODE_TCPDDP)
1029		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
1030#endif
1031
1032	return htobe32(opt2);
1033}
1034
1035/* XXX: duplication. */
1036static inline void
1037tcp_fields_to_host(struct tcphdr *th)
1038{
1039
1040	th->th_seq = ntohl(th->th_seq);
1041	th->th_ack = ntohl(th->th_ack);
1042	th->th_win = ntohs(th->th_win);
1043	th->th_urp = ntohs(th->th_urp);
1044}
1045
1046static void
1047pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
1048    struct tcphdr *th)
1049{
1050	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1051	const struct ether_header *eh;
1052	unsigned int hlen = be32toh(cpl->hdr_len);
1053	uintptr_t l3hdr;
1054	const struct tcphdr *tcp;
1055
1056	eh = (const void *)(cpl + 1);
1057	l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1058	tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1059
1060	if (inc) {
1061		bzero(inc, sizeof(*inc));
1062		inc->inc_fport = tcp->th_sport;
1063		inc->inc_lport = tcp->th_dport;
1064		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1065			const struct ip *ip = (const void *)l3hdr;
1066
1067			inc->inc_faddr = ip->ip_src;
1068			inc->inc_laddr = ip->ip_dst;
1069		} else {
1070			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1071
1072			inc->inc_flags |= INC_ISIPV6;
1073			inc->inc6_faddr = ip6->ip6_src;
1074			inc->inc6_laddr = ip6->ip6_dst;
1075		}
1076	}
1077
1078	if (th) {
1079		bcopy(tcp, th, sizeof(*th));
1080		tcp_fields_to_host(th);		/* just like tcp_input */
1081	}
1082}
1083
1084static int
1085ifnet_has_ip6(struct ifnet *ifp, struct in6_addr *ip6)
1086{
1087	struct ifaddr *ifa;
1088	struct sockaddr_in6 *sin6;
1089	int found = 0;
1090	struct in6_addr in6 = *ip6;
1091
1092	/* Just as in ip6_input */
1093	if (in6_clearscope(&in6) || in6_clearscope(&in6))
1094		return (0);
1095	in6_setscope(&in6, ifp, NULL);
1096
1097	if_addr_rlock(ifp);
1098	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1099		sin6 = (void *)ifa->ifa_addr;
1100		if (sin6->sin6_family != AF_INET6)
1101			continue;
1102
1103		if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &in6)) {
1104			found = 1;
1105			break;
1106		}
1107	}
1108	if_addr_runlock(ifp);
1109
1110	return (found);
1111}
1112
1113static struct l2t_entry *
1114get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1115    struct in_conninfo *inc)
1116{
1117	struct rtentry *rt;
1118	struct l2t_entry *e;
1119	struct sockaddr_in6 sin6;
1120	struct sockaddr *dst = (void *)&sin6;
1121
1122	if (inc->inc_flags & INC_ISIPV6) {
1123		dst->sa_len = sizeof(struct sockaddr_in6);
1124		dst->sa_family = AF_INET6;
1125		((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
1126
1127		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1128			/* no need for route lookup */
1129			e = t4_l2t_get(pi, ifp, dst);
1130			return (e);
1131		}
1132	} else {
1133		dst->sa_len = sizeof(struct sockaddr_in);
1134		dst->sa_family = AF_INET;
1135		((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
1136	}
1137
1138	rt = rtalloc1(dst, 0, 0);
1139	if (rt == NULL)
1140		return (NULL);
1141	else {
1142		struct sockaddr *nexthop;
1143
1144		RT_UNLOCK(rt);
1145		if (rt->rt_ifp != ifp)
1146			e = NULL;
1147		else {
1148			if (rt->rt_flags & RTF_GATEWAY)
1149				nexthop = rt->rt_gateway;
1150			else
1151				nexthop = dst;
1152			e = t4_l2t_get(pi, ifp, nexthop);
1153		}
1154		RTFREE(rt);
1155	}
1156
1157	return (e);
1158}
1159
1160static int
1161ifnet_has_ip(struct ifnet *ifp, struct in_addr in)
1162{
1163	struct ifaddr *ifa;
1164	struct sockaddr_in *sin;
1165	int found = 0;
1166
1167	if_addr_rlock(ifp);
1168	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1169		sin = (void *)ifa->ifa_addr;
1170		if (sin->sin_family != AF_INET)
1171			continue;
1172
1173		if (sin->sin_addr.s_addr == in.s_addr) {
1174			found = 1;
1175			break;
1176		}
1177	}
1178	if_addr_runlock(ifp);
1179
1180	return (found);
1181}
1182
/*
 * Record the source line of the rejection in reject_reason and jump to the
 * reject label.  Usable only in a function that has both a reject_reason
 * variable and a reject label in scope.
 */
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
1187
/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 * That works only if flags is at the same offset in both structures, which is
 * what this compile-time assertion checks.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1193
/*
 * Incoming SYN on a listening socket.
 *
 * Handler for CPL_PASS_ACCEPT_REQ.  m holds the CPL followed by the headers of
 * the SYN.  The SYN is either taken up for offload (a synq entry is set up for
 * the tid and handed to toe_syncache_add) or rejected.  On reject the tid is
 * released and the SYN, with the CPL stripped, is passed to the cxgbeX ifnet's
 * if_input if the mbuf is still available.  rss is not used here.
 *
 * Returns 0 if the SYN was taken up for offload, otherwise the source line
 * number (__LINE__) at which it was rejected or aborted.
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Recover the 4-tuple, TCP header, and TCP options of the SYN. */
	pass_accept_req_to_protohdrs(m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
	hw_ifp = pi->ifp;	/* the cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!ifnet_has_ip6(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!ifnet_has_ip(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();
	}

	/* From here on e must be released if the SYN is rejected. */
	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* The reply is sized for the chip: T5 has its own reply structure. */
	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;		/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Connection parameters that go into opt0/opt2 of the reply. */
	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* The queue ids are stashed in the mbuf; only rxqid is needed here. */
	save_qids_in_mbuf(m, pi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	if (is_t4(sc))
		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	else {
		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
	}
	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
		ulp_mode = ULP_MODE_TCPDDP;
		synqe->flags |= TPF_SYNQE_TCPDDP;
	} else
		ulp_mode = ULP_MODE_NONE;
	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits, ulp_mode);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);

	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	/* The synqe has the mbuf now; keep the reject path away from it. */
	m = NULL;
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
	 * pcb locks.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		/* Undo the insert_tid; the wr was never sent so free it. */
		remove_tid(sc, synqe->tid);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	/* e is non-NULL only if the L2T entry was obtained before the reject. */
	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	/*
	 * m is NULL here only if the SYN had been handed to the synqe and the
	 * m_dup of it failed.  Otherwise strip the CPL off the front, flag the
	 * checksums as verified, and pass the SYN to the cxgbeX ifnet's input
	 * routine.
	 */
	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}
1453
1454static void
1455synqe_to_protohdrs(struct synq_entry *synqe,
1456    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1457    struct tcphdr *th, struct tcpopt *to)
1458{
1459	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1460
1461	/* start off with the original SYN */
1462	pass_accept_req_to_protohdrs(synqe->syn, inc, th);
1463
1464	/* modify parts to make it look like the ACK to our SYN|ACK */
1465	th->th_flags = TH_ACK;
1466	th->th_ack = synqe->iss + 1;
1467	th->th_seq = be32toh(cpl->rcv_isn);
1468	bzero(to, sizeof(*to));
1469	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1470		to->to_flags |= TOF_TS;
1471		to->to_tsecr = synqe->ts;
1472	}
1473}
1474
/*
 * Handler for CPL_PASS_ESTABLISH.  Looks up the synq entry for the tid,
 * allocates a toepcb for it, and calls toe_syncache_expand with protocol
 * headers derived from the saved SYN (see synqe_to_protohdrs).  If the toepcb
 * cannot be allocated or the expand does not produce a socket, a reset is sent
 * for the tid with send_reset_synqe.  Nothing is done if the listening socket
 * has been dropped.
 *
 * rss is used only to locate the CPL, which follows it.  m is expected to be
 * NULL on entry and is reused as a local variable.
 *
 * Always returns 0.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct port_info *pi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	/* Listening socket is gone: drop the locks and do nothing else. */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}

	/* The port is reached via the rcvif recorded in the saved SYN. */
	ifp = synqe->syn->m_pkthdr.rcvif;
	pi = ifp->if_softc;
	KASSERT(pi->adapter == sc,
	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}
	/* Carry over what was recorded in the synqe for this tid. */
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_tcpddp_ulp_mode(toep);
	else
		toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	/* so is in/out: it is checked for NULL after the expand. */
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		struct inpcb *new_inp = sotoinpcb(so);

		INP_WLOCK(new_inp);
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
		INP_WUNLOCK(new_inp);
	}

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	/* Unlock the listening inp only if release_lctx hands it back. */
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}
1610
/*
 * Register the handlers for the listen-related CPL messages
 * (CPL_PASS_OPEN_RPL, CPL_CLOSE_LISTSRV_RPL, CPL_PASS_ACCEPT_REQ, and
 * CPL_PASS_ESTABLISH) with the adapter.
 */
void
t4_init_listen_cpl_handlers(struct adapter *sc)
{

	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
1620#endif
1621