/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_listen.c 286227 2015-08-03 12:13:54Z jch $");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_toepcb.h"

static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);
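
/*
 * Server tids (stids) come from a LIFO free list threaded through the
 * unused entries of stid_tab itself: the stid handed out is the entry's
 * index into stid_tab plus stid_base, and -1 means the table is
 * exhausted.
 */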
static int
alloc_stid(struct tid_info *t, void *ctx)
{
	int stid = -1;

	mtx_lock(&t->stid_lock);
	if (t->sfree) {
		union listen_entry *p = t->sfree;

		stid = (p - t->stid_tab) + t->stid_base;
		t->sfree = p->next;
		p->ctx = ctx;
		t->stids_in_use++;
	}
	mtx_unlock(&t->stid_lock);
	return (stid);
}

static void
free_stid(struct tid_info *t, int stid)
{
	union listen_entry *p = stid2entry(t, stid);

	mtx_lock(&t->stid_lock);
	p->next = t->sfree;
	t->sfree = p;
	t->stids_in_use--;
	mtx_unlock(&t->stid_lock);
}

static struct listen_ctx *
alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
{
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
	if (lctx == NULL)
		return (NULL);

	lctx->stid = alloc_stid(&td->tid_maps, lctx);
	if (lctx->stid < 0) {
		free(lctx, M_CXGB);
		return (NULL);
	}

	lctx->inp = inp;
	in_pcbref(inp);

	lctx->qset = qset;
	refcount_init(&lctx->refcnt, 1);
	TAILQ_INIT(&lctx->synq);

	return (lctx);
}

/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcnt == 0,
	    ("%s: refcnt %d", __func__, lctx->refcnt));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	free_stid(&td->tid_maps, lctx->stid);
	free(lctx, M_CXGB);

	return (in_pcbrele_wlocked(inp));
}

static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcnt);
}
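
/*
 * The listen hash is keyed on the inpcb pointer itself: fnv_32_buf is
 * applied to the pointer's bits (note the &key), not to the structure
 * it points to.
 */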
static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
{
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's inp
 * locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcnt))
		inp_freed = free_lctx(td, lctx);

	return (inp_freed ? NULL : inp);
}
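
/*
 * Program a hardware listener.  The CPL_PASS_OPEN_REQ uses a wildcard
 * peer (port, address, and netmask all zero) so it matches any SYN to
 * the local address/port, and CPL_CONN_POLICY_ASK has the chip hand
 * each SYN up as a CPL_PASS_ACCEPT_REQ instead of completing the
 * handshake on its own.
 */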
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	req->peer_port = 0;
	req->peer_ip = 0;
	req->peer_netmask = 0;
	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
	req->opt0l = htonl(V_RCV_BUFSIZ(16));
	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));

	t3_offload_tx(sc, m);

	return (0);
}

static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_close_listserv_req *req;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	req->cpu_idx = 0;

	t3_offload_tx(sc, m);

	return (0);
}

/*
 * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
 * the STID.
 */
static int
do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
	unsigned int stid = GET_TID(rpl);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct inpcb *inp = lctx->inp;

	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, rpl->status, stid);
	} else {
		INP_WLOCK(inp);
		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
	}

	m_freem(m);
	return (0);
}

/*
 * Process a CPL_PASS_OPEN_RPL message.  Remove the lctx from the listen hash
 * table and free it if there was any error, otherwise nothing to do.
 */
static int
do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
	int stid = GET_TID(rpl);
	struct listen_ctx *lctx;
	struct inpcb *inp;

	/*
	 * We get these replies also when setting up HW filters.  Just throw
	 * those away.
	 */
	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
		goto done;

	lctx = lookup_stid(&td->tid_maps, stid);
	inp = lctx->inp;

	INP_WLOCK(inp);

	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
	    __func__, stid, rpl->status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
	}

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(td, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	if ((inp->inp_flags & INP_DROPPED) && rpl->status != CPL_ERR_NONE) {
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener.  Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (rpl->status != CPL_ERR_NONE) {
		listen_hash_del(td, inp);
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
done:
	m_freem(m);
	return (0);
}
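
/*
 * Synthesize the in_conninfo, tcphdr, and tcpopt that the kernel's
 * syncache expects from the fields of a CPL_PASS_ACCEPT_REQ, so the
 * offloaded SYN can be fed to toe_syncache_add as if it had arrived
 * on the wire.
 */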
static void
pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	const struct tcp_options *t3opt = &cpl->tcp_options;

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_flags = TH_SYN;

	bzero(to, sizeof(*to));
	if (t3opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t3opt->mss);
	}
	if (t3opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t3opt->wsf;
	}
	if (t3opt->tstamp)
		to->to_flags |= TOF_TS;
	if (t3opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}

static inline void
release_synqe(struct synq_entry *synqe)
{

	if (refcount_release(&synqe->refcnt))
		m_freem(synqe->m);
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.  There will be enough room in the mbuf's
 * trailing space as the CPL is not that large.
 *
 * XXX: bad hack.
 */
static struct synq_entry *
mbuf_to_synq_entry(struct mbuf *m)
{
	int len = roundup(sizeof(struct synq_entry), 8);

	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
		    M_TRAILINGSPACE(m), len);
	}

	return ((void *)(M_START(m) + M_SIZE(m) - len));
}
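
/*
 * With KTR the reject path records the rejecting line number, so the
 * trace at the end of do_pass_accept_req shows which check failed.
 */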
#ifdef KTR
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
#else
#define REJECT_PASS_ACCEPT()	do { goto reject; } while (0)
#endif

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags));

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_pass_accept_req *req = mtod(m, void *);
	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	unsigned int tid = GET_TID(req);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct l2t_entry *e = NULL;
	struct sockaddr_in nam;
	struct rtentry *rt;
	struct inpcb *inp;
	struct socket *so;
	struct port_info *pi;
	struct ifnet *ifp;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct synq_entry *synqe = NULL;
	int i;
#ifdef KTR
	int reject_reason;
#endif

	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(req, &inc, &th, &to);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	pi = NULL;
	for_each_port(sc, i) {
		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
			continue;
		pi = &sc->port[i];
		break;
	}
	if (pi == NULL)
		REJECT_PASS_ACCEPT();
	ifp = pi->ifp;
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 */
	bzero(&nam, sizeof(nam));
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
	if (rt == NULL)
		REJECT_PASS_ACCEPT();
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
		    (struct sockaddr *)&nam;
		if (rt->rt_ifp == ifp)
			e = t3_l2t_get(pi, rt->rt_ifp, nexthop);
		RTFREE(rt);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	INP_INFO_RLOCK(&V_tcbinfo);

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Reuse the mbuf that delivered the CPL to us */
	synqe = mbuf_to_synq_entry(m);
	synqe->flags = TP_IS_A_SYNQ_ENTRY;
	synqe->m = m;
	synqe->lctx = lctx;
	synqe->tid = tid;
	synqe->e = e;
	synqe->opt0h = calc_opt0h(so, 0, 0, e);
	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
	SOCKBUF_LOCK(&so->so_rcv);
	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);
	refcount_init(&synqe->refcnt, 1);
	atomic_store_rel_int(&synqe->reply, RPL_OK);
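
	/*
	 * From here on the tid resolves to the synqe; CPL handlers rely on
	 * the flags word shared with struct toepcb (see the CTASSERT above)
	 * to tell the two apart.
	 */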
	insert_tid(td, synqe, tid);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);
	hold_lctx(lctx);

	/* syncache_add releases both pcbinfo and pcb locks */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (reply is RPL_DONE), good.
	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
	 * The mbuf will stick around as long as the entry is in the syncache.
	 * The kernel is free to retry syncache_respond but we'll ignore it due
	 * to RPL_DONT.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {
		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGB,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			release_synqe(synqe);
			return (__LINE__);
		}

		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* about to exit function */
		REJECT_PASS_ACCEPT();
	}

	KASSERT(synqe->reply == RPL_DONE,
	    ("%s: reply %d", __func__, synqe->reply));

	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
	release_synqe(synqe);
	return (0);

reject:
	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (synqe == NULL)
		m_freem(m);
	if (e)
		l2t_release(td->l2t, e);
	queue_tid_release(tod, tid);

	return (0);
}

static void
pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_flags = TH_ACK;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_ack = be32toh(cpl->snd_isn); /* ditto */

	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt))
		to->to_flags |= TOF_TS;
}

/*
 * Process a CPL_PASS_ESTABLISH message.  The T3 has already established a
 * connection and we need to do the software side setup.
 */
static int
do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_establish *cpl = mtod(m, void *);
	struct toedev *tod = &td->tod;
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct toepcb *toep;
	struct socket *so;
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct tcpopt to;
	struct tcphdr th;
	struct in_conninfo inc;
#ifdef KTR
	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
#endif

	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
	    __func__, stid, tid, lctx, inp->inp_flags);

	KASSERT(qs->idx == synqe->qset,
	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}

	pass_establish_to_protohdrs(cpl, &inc, &th, &to);

	/* Lie in order to pass the checks in syncache_expand */
	to.to_tsecr = synqe->ts;
	th.th_ack = synqe->iss + 1;

	toep = toepcb_alloc(tod);
	if (toep == NULL) {
reset:
		t3_send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_qset = qs->idx;
	toep->tp_l2t = synqe->e;
	toep->tp_tid = tid;
	toep->tp_rx_credits = synqe->rx_credits;

	synqe->toep = toep;
	synqe->cpl = cpl;

	so = inp->inp_socket;
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		toepcb_free(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t3_offload_socket(tod, synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Remove the synq entry and release its reference on the lctx */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(td, lctx);
	if (inp)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	m_freem(m);
	return (0);
}

void
t3_init_listen_cpl_handlers(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * We can't take the adapter lock here, so accesses to sc->flags,
 * sc->open_device_map, sc->offload_map, and if_capenable are all race prone.
 */
int
t3_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (0);

#ifdef notyet
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(sc->flags & TOM_INIT_DONE,
	    ("%s: TOM not initialized", __func__));
#endif

	if ((sc->open_device_map & sc->offload_map) == 0)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	/*
	 * Find a running port with IFCAP_TOE4.  We'll use the first such port's
	 * queues to send the passive open and receive the reply to it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		if (isset(&sc->open_device_map, i) &&
		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
			break;
	}
	KASSERT(i < sc->params.nports,
	    ("%s: no running port with TOE capability enabled.", __func__));
	pi = &sc->port[i];

	if (listen_hash_find(td, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(td, inp, pi->first_qset);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(td, lctx);

	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
	    lctx->stid, tcpstates[tp->t_state], lctx, inp);

	if (create_server(sc, lctx) != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
		    device_get_nameunit(sc->dev));
		(void) listen_hash_del(td, inp);
		inp = release_lctx(td, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	lctx->flags |= LCTX_RPL_PENDING;
done:
#ifdef notyet
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}

/*
 * Stop a listening server by sending a close_listsrv request to HW.
 * The server TID is freed when we get the reply.
 */
int
t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(td, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);
	}

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp.  It doesn't know about the connections on the synq
	 * so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
		t3_send_reset_synqe(tod, synqe);
	}

	destroy_server(sc, lctx);
	return (0);
}

void
t3_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}

void
t3_syncache_removed(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	release_synqe(synqe);
}

/* XXX */
extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
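
/*
 * Called by the kernel syncache in place of transmitting a SYN|ACK:
 * instead of sending the mbuf we answer the chip with a
 * CPL_PASS_ACCEPT_RPL telling it to complete the handshake for this
 * tid.
 */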
int
t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct l2t_entry *e = synqe->e;
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *th = (void *)(ip + 1);
	struct cpl_pass_accept_rpl *rpl;
	struct mbuf *r;
	struct listen_ctx *lctx = synqe->lctx;
	struct tcpopt to;
	int mtu_idx, cpu_idx;

	/*
	 * The first time we run it's during the call to syncache_add.  That's
	 * the only one we care about.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
		goto done;	/* reply to the CPL only if it's ok to do so */

	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
	if (r == NULL)
		goto done;

	/*
	 * Use only the provided mbuf (with ip and tcp headers) and what's in
	 * synqe.  Avoid looking at the listening socket (lctx->inp) here.
	 *
	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
	 * decides it doesn't want to use TCP timestamps we have no way of
	 * relaying this info to the chip on a per-tid basis (all we have is a
	 * global knob).
	 */
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
	    TO_SYN);

	/* stash them for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
	cpu_idx = sc->rrss_map[synqe->qset];

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wrh_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
	rpl->opt2 = calc_opt2(cpu_idx);
	rpl->rsvd = rpl->opt2;		/* workaround for HW bug */
	rpl->peer_ip = ip->ip_dst.s_addr;
	rpl->opt0h = synqe->opt0h |
	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
	    calc_opt0l(NULL, synqe->rx_credits);

	l2t_send(sc, r, e);
done:
	m_freem(m);
	return (0);
}
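
/*
 * Abort (RST) from the peer for a connection still on a listener's
 * synq.  The request can be delivered twice: the first copy only marks
 * the synqe, the second tears it down, unless an abort of our own is
 * outstanding, in which case cleanup is left to its reply.
 */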
int
do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
	    ("%s: !SYNQ_ENTRY", __func__));

	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);

	INP_WLOCK(inp);

	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
		synqe->flags |= TP_ABORT_REQ_RCVD;
		synqe->flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}
	synqe->flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this synqe, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		synqe->flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);
		release_tid(tod, tid, qs->idx);
		l2t_release(td->l2t, synqe->e);
		release_synqe(synqe);
	}

	send_abort_rpl(tod, tid, qs->idx);
	m_freem(m);
	return (0);
}
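
/*
 * Reply to an abort we sent for a synq entry.  It too may be delivered
 * twice; the entry is only torn down on the second copy, and only if
 * the abort was one we initiated (TP_ABORT_RPL_PENDING).
 */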
int
do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	CTR4(KTR_CXGB, "%s: tid %u, synqe %p, status %d", __func__, tid,
	    synqe, rpl->status);

	INP_WLOCK(inp);

	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
			synqe->flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			synqe->flags &= ~TP_ABORT_RPL_RCVD;
			synqe->flags &= ~TP_ABORT_RPL_PENDING;

			TAILQ_REMOVE(&lctx->synq, synqe, link);
			inp = release_lctx(td, lctx);
			if (inp)
				INP_WUNLOCK(inp);
			release_tid(tod, tid, qs->idx);
			l2t_release(td->l2t, synqe->e);
			release_synqe(synqe);
		}
	}

	m_freem(m);
	return (0);
}
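
/*
 * Send a CPL_ABORT_REQ (RST) for an embryonic connection on a synq.
 * TP_ABORT_SHUTDOWN guarantees at most one abort is sent per entry.
 */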
static void
t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct cpl_abort_req *req;
	unsigned int tid = synqe->tid;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
#ifdef INVARIANTS
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
	    synqe->flags);

	if (synqe->flags & TP_ABORT_SHUTDOWN)
		return;

	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = 0;
	req->rsvd1 = !(synqe->flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	l2t_send(sc, m, synqe->e);
}
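
/*
 * Hand a freshly expanded connection over to the TOE: attach the
 * toepcb to the socket, mark the tcpcb established using the ISNs and
 * options reported by the chip, and repoint the tid from the synqe to
 * the toepcb.
 */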
void
t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = sc->tom_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct cpl_pass_establish *cpl = synqe->cpl;
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);

	offload_socket(so, toep);
	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	update_tid(td, toep, synqe->tid);
	synqe->flags |= TP_SYNQE_EXPANDED;
}
#endif