/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_timer.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_toepcb.h"

static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);

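/*
 * Server TIDs (stids) are kept on a free list threaded through the entries
 * of stid_tab itself: t->sfree points at the first free entry and each free
 * entry's next field points at the one after it.  Allocation pops the head
 * of the list; the entry's index within stid_tab plus stid_base is the stid.
 */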
static int
alloc_stid(struct tid_info *t, void *ctx)
{
	int stid = -1;

	mtx_lock(&t->stid_lock);
	if (t->sfree) {
		union listen_entry *p = t->sfree;

		stid = (p - t->stid_tab) + t->stid_base;
		t->sfree = p->next;
		p->ctx = ctx;
		t->stids_in_use++;
	}
	mtx_unlock(&t->stid_lock);
	return (stid);
}

static void
free_stid(struct tid_info *t, int stid)
{
	union listen_entry *p = stid2entry(t, stid);

	mtx_lock(&t->stid_lock);
	p->next = t->sfree;
	t->sfree = p;
	t->stids_in_use--;
	mtx_unlock(&t->stid_lock);
}

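/*
 * Allocate a listen context for the given listening socket's inpcb and
 * reserve a server TID for it.  The context starts out with one reference
 * (the caller's) and holds a reference on the inpcb for its entire lifetime.
 */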
static struct listen_ctx *
alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
{
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
	if (lctx == NULL)
		return (NULL);

	lctx->stid = alloc_stid(&td->tid_maps, lctx);
	if (lctx->stid < 0) {
		free(lctx, M_CXGB);
		return (NULL);
	}

	lctx->inp = inp;
	in_pcbref(inp);

	lctx->qset = qset;
	refcount_init(&lctx->refcnt, 1);
	TAILQ_INIT(&lctx->synq);

	return (lctx);
}

/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcnt == 0,
	    ("%s: refcnt %d", __func__, lctx->refcnt));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	free_stid(&td->tid_maps, lctx->stid);
	free(lctx, M_CXGB);

	return (in_pcbrele_wlocked(inp));
}

static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcnt);
}

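/* Hash on the inpcb pointer itself to pick a bucket. */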
static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
{
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's inp
 * locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcnt))
		inp_freed = free_lctx(td, lctx);

	return (inp_freed ? NULL : inp);
}

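/*
 * Send a CPL_PASS_OPEN_REQ to the chip to start a hardware listener for the
 * lctx's address/port.  The reply is handled by do_pass_open_rpl.
 */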
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	req->peer_port = 0;
	req->peer_ip = 0;
	req->peer_netmask = 0;
	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
	req->opt0l = htonl(V_RCV_BUFSIZ(16));
	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));

	t3_offload_tx(sc, m);

	return (0);
}

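/*
 * Send a CPL_CLOSE_LISTSRV_REQ to the chip to shut the hardware listener
 * down; the stid is released when the reply arrives (do_close_server_rpl).
 */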
static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_close_listserv_req *req;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	req->cpu_idx = 0;

	t3_offload_tx(sc, m);

	return (0);
}

/*
 * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
 * the STID.
 */
static int
do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
	unsigned int stid = GET_TID(rpl);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct inpcb *inp = lctx->inp;

	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, rpl->status, stid);
	} else {
		INP_WLOCK(inp);
		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
	}

	m_freem(m);
	return (0);
}

/*
 * Process a CPL_PASS_OPEN_RPL message.  Remove the lctx from the listen hash
 * table and free it if there was an error; otherwise there's nothing to do.
 */
static int
do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
	int stid = GET_TID(rpl);
	struct listen_ctx *lctx;
	struct inpcb *inp;

	/*
	 * We also get these replies when setting up HW filters.  Just throw
	 * those away.
	 */
	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
		goto done;

	lctx = lookup_stid(&td->tid_maps, stid);
	inp = lctx->inp;

	INP_WLOCK(inp);

	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
	    __func__, stid, rpl->status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
	}

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(td, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	if ((inp->inp_flags & INP_DROPPED) && rpl->status != CPL_ERR_NONE) {
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener.  Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (rpl->status != CPL_ERR_NONE) {
		listen_hash_del(td, inp);
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
done:
	m_freem(m);
	return (0);
}

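/*
 * Convert a CPL_PASS_ACCEPT_REQ and its embedded TCP options into the
 * in_conninfo/tcphdr/tcpopt triple that the kernel syncache works with.
 */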
static void
pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	const struct tcp_options *t3opt = &cpl->tcp_options;

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_flags = TH_SYN;

	bzero(to, sizeof(*to));
	if (t3opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t3opt->mss);
	}
	if (t3opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t3opt->wsf;
	}
	if (t3opt->tstamp)
		to->to_flags |= TOF_TS;
	if (t3opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}

static inline void
release_synqe(struct synq_entry *synqe)
{

	if (refcount_release(&synqe->refcnt))
		m_freem(synqe->m);
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.  There will be enough room in the mbuf's
 * trailing space as the CPL is not that large.
 *
 * XXX: bad hack.
 */
static struct synq_entry *
mbuf_to_synq_entry(struct mbuf *m)
{
	int len = roundup(sizeof(struct synq_entry), 8);

	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
		    M_TRAILINGSPACE(m), len);
	}

	return ((void *)(M_START(m) + M_SIZE(m) - len));
}

#ifdef KTR
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
#else
#define REJECT_PASS_ACCEPT()	do { goto reject; } while (0)
#endif

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags));

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_pass_accept_req *req = mtod(m, void *);
	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	unsigned int tid = GET_TID(req);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct l2t_entry *e = NULL;
	struct nhop4_basic nh4;
	struct sockaddr_in nam;
	struct inpcb *inp;
	struct socket *so;
	struct port_info *pi;
	struct ifnet *ifp;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct synq_entry *synqe = NULL;
	int i;
#ifdef KTR
	int reject_reason;
#endif

	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(req, &inc, &th, &to);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	pi = NULL;
	for_each_port(sc, i) {
		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
			continue;
		pi = &sc->port[i];
		break;
	}
	if (pi == NULL)
		REJECT_PASS_ACCEPT();
	ifp = pi->ifp;
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 */
	bzero(&nam, sizeof(nam));
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, nam.sin_addr, 0, 0, &nh4) != 0)
		REJECT_PASS_ACCEPT();
	else {
		nam.sin_addr = nh4.nh_addr;
		if (nh4.nh_ifp == ifp)
			e = t3_l2t_get(pi, ifp, (struct sockaddr *)&nam);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	INP_INFO_RLOCK(&V_tcbinfo);

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Reuse the mbuf that delivered the CPL to us */
	synqe = mbuf_to_synq_entry(m);
	synqe->flags = TP_IS_A_SYNQ_ENTRY;
	synqe->m = m;
	synqe->lctx = lctx;
	synqe->tid = tid;
	synqe->e = e;
	synqe->opt0h = calc_opt0h(so, 0, 0, e);
	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
	SOCKBUF_LOCK(&so->so_rcv);
	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);
	refcount_init(&synqe->refcnt, 1);
	atomic_store_rel_int(&synqe->reply, RPL_OK);

	insert_tid(td, synqe, tid);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);
	hold_lctx(lctx);

	/* syncache_add releases both pcbinfo and pcb locks */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (reply is RPL_DONE), good.
	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
	 * The mbuf will stick around as long as the entry is in the syncache.
	 * The kernel is free to retry syncache_respond but we'll ignore it due
	 * to RPL_DONT.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {

		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGB,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			release_synqe(synqe);
			return (__LINE__);
		}

		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* about to exit function */
		REJECT_PASS_ACCEPT();
	}

	KASSERT(synqe->reply == RPL_DONE,
	    ("%s: reply %d", __func__, synqe->reply));

	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
	release_synqe(synqe);
	return (0);

reject:
	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (synqe == NULL)
		m_freem(m);
	if (e)
		l2t_release(td->l2t, e);
	queue_tid_release(tod, tid);

	return (0);
}

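/*
 * Convert a CPL_PASS_ESTABLISH into the protocol headers and options that
 * syncache_expand expects.
 */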
static void
pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_flags = TH_ACK;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_ack = be32toh(cpl->snd_isn); /* ditto */

	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt))
		to->to_flags |= TOF_TS;
}

/*
 * Process a CPL_PASS_ESTABLISH message.  The T3 has already established a
 * connection and we need to do the software side setup.
 */
static int
do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_establish *cpl = mtod(m, void *);
	struct toedev *tod = &td->tod;
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct toepcb *toep;
	struct socket *so;
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct tcpopt to;
	struct tcphdr th;
	struct in_conninfo inc;
#ifdef KTR
	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
#endif

	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
	    __func__, stid, tid, lctx, inp->inp_flags);

	KASSERT(qs->idx == synqe->qset,
	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}

	pass_establish_to_protohdrs(cpl, &inc, &th, &to);

	/* Lie in order to pass the checks in syncache_expand */
	to.to_tsecr = synqe->ts;
	th.th_ack = synqe->iss + 1;

	toep = toepcb_alloc(tod);
	if (toep == NULL) {
reset:
		t3_send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_qset = qs->idx;
	toep->tp_l2t = synqe->e;
	toep->tp_tid = tid;
	toep->tp_rx_credits = synqe->rx_credits;

	synqe->toep = toep;
	synqe->cpl = cpl;

	so = inp->inp_socket;
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		toepcb_free(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t3_offload_socket(tod, synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Remove the synq entry and release its reference on the lctx */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(td, lctx);
	if (inp)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	m_freem(m);
	return (0);
}

void
t3_init_listen_cpl_handlers(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * We can't take the adapter lock here, so access to sc->flags,
 * sc->open_device_map, sc->offload_map, and if_capenable is race prone.
 */
int
t3_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (0);

#ifdef notyet
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(sc->flags & TOM_INIT_DONE,
	    ("%s: TOM not initialized", __func__));
#endif

	if ((sc->open_device_map & sc->offload_map) == 0)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	/*
	 * Find a running port with IFCAP_TOE4.  We'll use the first such port's
	 * queues to send the passive open and receive the reply to it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		if (isset(&sc->open_device_map, i) &&
		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
			break;
	}
	KASSERT(i < sc->params.nports,
	    ("%s: no running port with TOE capability enabled.", __func__));
	pi = &sc->port[i];

	if (listen_hash_find(td, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(td, inp, pi->first_qset);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(td, lctx);

	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
	    lctx->stid, tcpstates[tp->t_state], lctx, inp);

	if (create_server(sc, lctx) != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
		    device_get_nameunit(sc->dev));
		(void) listen_hash_del(td, inp);
		inp = release_lctx(td, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	lctx->flags |= LCTX_RPL_PENDING;
done:
#ifdef notyet
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}

/*
 * Stop a listening server by sending a close_listsvr request to HW.
 * The server TID is freed when we get the reply.
 */
int
t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(td, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);
	}

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp.  It doesn't know about the connections on the synq
	 * so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
		t3_send_reset_synqe(tod, synqe);
	}

	destroy_server(sc, lctx);
	return (0);
}

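/*
 * Syncache hook: called when the kernel installs our synq entry in the
 * syncache.  Take a reference that lasts for the entry's syncache lifetime.
 */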
void
t3_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}

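/*
 * Syncache hook: called when the entry is evicted from the syncache.  Drop
 * the reference taken in t3_syncache_added.
 */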
void
t3_syncache_removed(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	release_synqe(synqe);
}

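/*
 * Syncache hook: the kernel wants to respond to the SYN (m carries the IP
 * and TCP headers it would have sent).  Instead of transmitting anything we
 * craft a CPL_PASS_ACCEPT_RPL that tells the chip to complete the handshake.
 */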
int
t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct l2t_entry *e = synqe->e;
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *th = (void *)(ip + 1);
	struct cpl_pass_accept_rpl *rpl;
	struct mbuf *r;
	struct listen_ctx *lctx = synqe->lctx;
	struct tcpopt to;
	int mtu_idx, cpu_idx;

	/*
	 * The first time this runs is during the call to syncache_add, and
	 * that's the only run we care about.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
		goto done;	/* reply to the CPL only if it's ok to do so */

	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
	if (r == NULL)
		goto done;

	/*
	 * Use only the provided mbuf (with ip and tcp headers) and what's in
	 * synqe.  Avoid looking at the listening socket (lctx->inp) here.
	 *
	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
	 * decides it doesn't want to use TCP timestamps we have no way of
	 * relaying this info to the chip on a per-tid basis (all we have is a
	 * global knob).
	 */
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
	    TO_SYN);

	/* stash them for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
	cpu_idx = sc->rrss_map[synqe->qset];

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wrh_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
	rpl->opt2 = calc_opt2(cpu_idx);
	rpl->rsvd = rpl->opt2;		/* workaround for HW bug */
	rpl->peer_ip = ip->ip_dst.s_addr;
	rpl->opt0h = synqe->opt0h |
	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
	    calc_opt0l(NULL, synqe->rx_credits);

	l2t_send(sc, r, e);
done:
	m_freem(m);
	return (0);
}

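/*
 * Process an ABORT_REQ_RSS CPL on a synq entry.  The T3 delivers the abort
 * in two halves; nothing is cleaned up until the second half arrives.
 */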
int
do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
	    ("%s: !SYNQ_ENTRY", __func__));

	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);

	INP_WLOCK(inp);

	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
		synqe->flags |= TP_ABORT_REQ_RCVD;
		synqe->flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}
	synqe->flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this synqe, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		synqe->flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);
		release_tid(tod, tid, qs->idx);
		l2t_release(td->l2t, synqe->e);
		release_synqe(synqe);
	}

	send_abort_rpl(tod, tid, qs->idx);
	m_freem(m);
	return (0);
}

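/*
 * Process an ABORT_RPL_RSS CPL on a synq entry.  The reply also arrives in
 * two halves; the second half releases the tid, L2 entry, and synq entry.
 */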
int
do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	CTR4(KTR_CXGB, "%s: tid %u, synqe %p, status %d", __func__, tid, synqe,
	    rpl->status);

	INP_WLOCK(inp);

	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
			synqe->flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			synqe->flags &= ~TP_ABORT_RPL_RCVD;
			synqe->flags &= ~TP_ABORT_RPL_PENDING;

			TAILQ_REMOVE(&lctx->synq, synqe, link);
			inp = release_lctx(td, lctx);
			if (inp)
				INP_WUNLOCK(inp);
			release_tid(tod, tid, qs->idx);
			l2t_release(td->l2t, synqe->e);
			release_synqe(synqe);
		}
	}

	m_freem(m);
	return (0);
}

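/*
 * Send an ABORT_REQ to the chip for an embryonic connection on the synq.
 * TP_ABORT_SHUTDOWN makes this a one-shot, and TP_ABORT_RPL_PENDING tells
 * the abort handlers to defer cleanup until the chip's reply arrives.
 */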
static void
t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct cpl_abort_req *req;
	unsigned int tid = synqe->tid;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
#ifdef INVARIANTS
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
	    synqe->flags);

	if (synqe->flags & TP_ABORT_SHUTDOWN)
		return;

	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = 0;
	req->rsvd1 = !(synqe->flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	l2t_send(sc, m, synqe->e);
}

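/*
 * Called from the kernel's accept path once syncache_expand has created the
 * new socket: attach the connection to its toepcb, mark it established, and
 * update the tid to point at the toepcb instead of the synq entry.
 */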
void
t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = sc->tom_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct cpl_pass_establish *cpl = synqe->cpl;
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);

	offload_socket(so, toep);
	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	update_tid(td, toep, synqe->tid);
	synqe->flags |= TP_SYNQE_EXPANDED;
}
#endif