/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/cxgb/ulp/tom/cxgb_listen.c 309108 2016-11-24 14:48:46Z jch $");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_toepcb.h"

static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);

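/*
 * Allocate a server TID (stid) from the free list in the tid_info.  The ctx
 * pointer is stored in the entry so that CPL replies carrying this stid can
 * be mapped back to their listen context.  Returns -1 if no stid is free.
 */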
static int
alloc_stid(struct tid_info *t, void *ctx)
{
	int stid = -1;

	mtx_lock(&t->stid_lock);
	if (t->sfree) {
		union listen_entry *p = t->sfree;

		stid = (p - t->stid_tab) + t->stid_base;
		t->sfree = p->next;
		p->ctx = ctx;
		t->stids_in_use++;
	}
	mtx_unlock(&t->stid_lock);
	return (stid);
}

static void
free_stid(struct tid_info *t, int stid)
{
	union listen_entry *p = stid2entry(t, stid);

	mtx_lock(&t->stid_lock);
	p->next = t->sfree;
	t->sfree = p;
	t->stids_in_use--;
	mtx_unlock(&t->stid_lock);
}

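/*
 * Allocate and initialize a listen context for the given inpcb.  This
 * reserves an stid for it and takes a reference on the inp, both of which
 * are released in free_lctx.
 */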
static struct listen_ctx *
alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
{
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
	if (lctx == NULL)
		return (NULL);

	lctx->stid = alloc_stid(&td->tid_maps, lctx);
	if (lctx->stid < 0) {
		free(lctx, M_CXGB);
		return (NULL);
	}

	lctx->inp = inp;
	in_pcbref(inp);

	lctx->qset = qset;
	refcount_init(&lctx->refcnt, 1);
	TAILQ_INIT(&lctx->synq);

	return (lctx);
}

/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcnt == 0,
	    ("%s: refcnt %d", __func__, lctx->refcnt));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	free_stid(&td->tid_maps, lctx->stid);
	free(lctx, M_CXGB);

	return (in_pcbrele_wlocked(inp));
}

static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcnt);
}

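/* Hash the inpcb pointer's value to select a bucket in the listen hash. */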
static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
{
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's inp
 * locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcnt))
		inp_freed = free_lctx(td, lctx);

	return (inp_freed ? NULL : inp);
}

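/*
 * Send a CPL_PASS_OPEN_REQ to the hardware to start a listener on the
 * socket's local address and port.  The reply is handled by
 * do_pass_open_rpl.
 */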
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	req->peer_port = 0;
	req->peer_ip = 0;
	req->peer_netmask = 0;
	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
	req->opt0l = htonl(V_RCV_BUFSIZ(16));
	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));

	t3_offload_tx(sc, m);

	return (0);
}

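/*
 * Send a CPL_CLOSE_LISTSRV_REQ to the hardware to shut down the listener
 * with this stid.  The reply is handled by do_close_server_rpl.
 */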
static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_close_listserv_req *req;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	req->cpu_idx = 0;

	t3_offload_tx(sc, m);

	return (0);
}

/*
 * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
 * the STID.
 */
static int
do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
	unsigned int stid = GET_TID(rpl);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct inpcb *inp = lctx->inp;

	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, rpl->status, stid);
	} else {
		INP_WLOCK(inp);
		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
	}

	m_freem(m);
	return (0);
}

/*
 * Process a CPL_PASS_OPEN_RPL message.  Remove the lctx from the listen hash
 * table and free it if there was any error, otherwise nothing to do.
 */
static int
do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
	int stid = GET_TID(rpl);
	struct listen_ctx *lctx;
	struct inpcb *inp;

	/*
	 * We get these replies also when setting up HW filters.  Just throw
	 * those away.
	 */
	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
		goto done;

	lctx = lookup_stid(&td->tid_maps, stid);
	inp = lctx->inp;

	INP_WLOCK(inp);

	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
	    __func__, stid, rpl->status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
	}

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(td, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	if ((inp->inp_flags & INP_DROPPED) && rpl->status != CPL_ERR_NONE) {
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener.  Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (rpl->status != CPL_ERR_NONE) {
		listen_hash_del(td, inp);
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
done:
	m_freem(m);
	return (0);
}

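/*
 * Convert a CPL_PASS_ACCEPT_REQ (an offloaded SYN) into the in_conninfo,
 * tcphdr, and tcpopt structures that the kernel's syncache expects.
 */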
static void
pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	const struct tcp_options *t3opt = &cpl->tcp_options;

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_flags = TH_SYN;

	bzero(to, sizeof(*to));
	if (t3opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t3opt->mss);
	}
	if (t3opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t3opt->wsf;
	}
	if (t3opt->tstamp)
		to->to_flags |= TOF_TS;
	if (t3opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

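/*
 * synq entries live in the trailing space of the mbuf that carried the
 * CPL_PASS_ACCEPT_REQ (see mbuf_to_synq_entry below), so dropping the last
 * reference frees that mbuf as well.
 */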
static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}

static inline void
release_synqe(struct synq_entry *synqe)
{

	if (refcount_release(&synqe->refcnt))
		m_freem(synqe->m);
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.  There will be enough room in the mbuf's
 * trailing space as the CPL is not that large.
 *
 * XXX: bad hack.
 */
static struct synq_entry *
mbuf_to_synq_entry(struct mbuf *m)
{
	int len = roundup(sizeof (struct synq_entry), 8);
	uint8_t *buf;
	int buflen;

	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
		    M_TRAILINGSPACE(m), len);
	}

	if (m->m_flags & M_EXT) {
		buf = m->m_ext.ext_buf;
		buflen = m->m_ext.ext_size;
	} else if (m->m_flags & M_PKTHDR) {
		buf = &m->m_pktdat[0];
		buflen = MHLEN;
	} else {
		buf = &m->m_dat[0];
		buflen = MLEN;
	}

	return ((void *)(buf + buflen - len));
}

#ifdef KTR
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
#else
#define REJECT_PASS_ACCEPT()	do { goto reject; } while (0)
#endif

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags));

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_pass_accept_req *req = mtod(m, void *);
	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	unsigned int tid = GET_TID(req);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct l2t_entry *e = NULL;
	struct sockaddr_in nam;
	struct rtentry *rt;
	struct inpcb *inp;
	struct socket *so;
	struct port_info *pi;
	struct ifnet *ifp;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct synq_entry *synqe = NULL;
	int i;
#ifdef KTR
	int reject_reason;
#endif

	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(req, &inc, &th, &to);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	pi = NULL;
	for_each_port(sc, i) {
		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
			continue;
		pi = &sc->port[i];
		break;
	}
	if (pi == NULL)
		REJECT_PASS_ACCEPT();
	ifp = pi->ifp;
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 */
	bzero(&nam, sizeof(nam));
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
	if (rt == NULL)
		REJECT_PASS_ACCEPT();
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
		    (struct sockaddr *)&nam;
		if (rt->rt_ifp == ifp)
			e = t3_l2t_get(pi, rt->rt_ifp, nexthop);
		RTFREE(rt);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	INP_INFO_RLOCK(&V_tcbinfo);

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Reuse the mbuf that delivered the CPL to us */
	synqe = mbuf_to_synq_entry(m);
	synqe->flags = TP_IS_A_SYNQ_ENTRY;
	synqe->m = m;
	synqe->lctx = lctx;
	synqe->tid = tid;
	synqe->e = e;
	synqe->opt0h = calc_opt0h(so, 0, 0, e);
	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
	SOCKBUF_LOCK(&so->so_rcv);
	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);
	refcount_init(&synqe->refcnt, 1);
	atomic_store_rel_int(&synqe->reply, RPL_OK);

	insert_tid(td, synqe, tid);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);
	hold_lctx(lctx);

	/* syncache_add releases both pcbinfo and pcb locks */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (reply is RPL_DONE), good.
	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
	 * The mbuf will stick around as long as the entry is in the syncache.
	 * The kernel is free to retry syncache_respond but we'll ignore it due
	 * to RPL_DONT.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {

		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGB,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			release_synqe(synqe);
			return (__LINE__);
		}

		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* about to exit function */
		REJECT_PASS_ACCEPT();
	}

	KASSERT(synqe->reply == RPL_DONE,
	    ("%s: reply %d", __func__, synqe->reply));

	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
	release_synqe(synqe);
	return (0);

reject:
	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (synqe == NULL)
		m_freem(m);
	if (e)
		l2t_release(td->l2t, e);
	queue_tid_release(tod, tid);

	return (0);
}

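/*
 * Convert a CPL_PASS_ESTABLISH into the in_conninfo, tcphdr, and tcpopt
 * structures needed to expand the syncache entry into a connection.
 */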
static void
pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_flags = TH_ACK;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_ack = be32toh(cpl->snd_isn); /* ditto */

	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt))
		to->to_flags |= TOF_TS;
}

/*
 * Process a CPL_PASS_ESTABLISH message.  The T3 has already established a
 * connection and we need to do the software side setup.
 */
static int
do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_establish *cpl = mtod(m, void *);
	struct toedev *tod = &td->tod;
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct toepcb *toep;
	struct socket *so;
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct tcpopt to;
	struct tcphdr th;
	struct in_conninfo inc;
#ifdef KTR
	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
#endif

	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
	    __func__, stid, tid, lctx, inp->inp_flags);

	KASSERT(qs->idx == synqe->qset,
	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}

	pass_establish_to_protohdrs(cpl, &inc, &th, &to);

	/* Lie in order to pass the checks in syncache_expand */
	to.to_tsecr = synqe->ts;
	th.th_ack = synqe->iss + 1;

	toep = toepcb_alloc(tod);
	if (toep == NULL) {
reset:
		t3_send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_qset = qs->idx;
	toep->tp_l2t = synqe->e;
	toep->tp_tid = tid;
	toep->tp_rx_credits = synqe->rx_credits;

	synqe->toep = toep;
	synqe->cpl = cpl;

	so = inp->inp_socket;
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		toepcb_free(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t3_offload_socket(tod, synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Remove the synq entry and release its reference on the lctx */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(td, lctx);
	if (inp)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	m_freem(m);
	return (0);
}

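/*
 * Register handlers for the CPL messages used in passive (listen side)
 * connection setup and teardown.
 */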
void
t3_init_listen_cpl_handlers(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * We can't take the adapter lock here, so access to sc->flags,
 * sc->open_device_map, sc->offload_map, and if_capenable is race prone.
 */
int
t3_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (0);

#ifdef notyet
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(sc->flags & TOM_INIT_DONE,
	    ("%s: TOM not initialized", __func__));
#endif

	if ((sc->open_device_map & sc->offload_map) == 0)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	/*
	 * Find a running port with IFCAP_TOE4.  We'll use the first such port's
	 * queues to send the passive open and receive the reply to it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		if (isset(&sc->open_device_map, i) &&
		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
			break;
	}
	KASSERT(i < sc->params.nports,
	    ("%s: no running port with TOE capability enabled.", __func__));
	pi = &sc->port[i];

	if (listen_hash_find(td, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(td, inp, pi->first_qset);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(td, lctx);

	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
	    lctx->stid, tcpstates[tp->t_state], lctx, inp);

	if (create_server(sc, lctx) != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
		    device_get_nameunit(sc->dev));
		(void) listen_hash_del(td, inp);
		inp = release_lctx(td, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	lctx->flags |= LCTX_RPL_PENDING;
done:
#ifdef notyet
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}

/*
 * Stop a listening server by sending a close_listsvr request to HW.
 * The server TID is freed when we get the reply.
 */
int
t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(td, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);
	}

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp.  It doesn't know about the connections on the synq
	 * so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
		t3_send_reset_synqe(tod, synqe);
	}

	destroy_server(sc, lctx);
	return (0);
}

void
t3_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}

void
t3_syncache_removed(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	release_synqe(synqe);
}

/* XXX */
extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);

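/*
 * Called by the kernel to send a SYN/ACK for a syncache entry backed by a
 * synq_entry.  We send a CPL_PASS_ACCEPT_RPL instead, and only on the first
 * call (the one made from within syncache_add); see the comments on
 * RPL_OK/RPL_DONE/RPL_DONT in do_pass_accept_req.
 */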
int
t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct l2t_entry *e = synqe->e;
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *th = (void *)(ip + 1);
	struct cpl_pass_accept_rpl *rpl;
	struct mbuf *r;
	struct listen_ctx *lctx = synqe->lctx;
	struct tcpopt to;
	int mtu_idx, cpu_idx;

	/*
	 * The first time we run it's during the call to syncache_add.  That's
	 * the only one we care about.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
		goto done;	/* reply to the CPL only if it's ok to do so */

	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
	if (r == NULL)
		goto done;

	/*
	 * Use only the provided mbuf (with ip and tcp headers) and what's in
	 * synqe.  Avoid looking at the listening socket (lctx->inp) here.
	 *
	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
	 * decides it doesn't want to use TCP timestamps we have no way of
	 * relaying this info to the chip on a per-tid basis (all we have is a
	 * global knob).
	 */
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
	    TO_SYN);

	/* stash them for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
	cpu_idx = sc->rrss_map[synqe->qset];

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wrh_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
	rpl->opt2 = calc_opt2(cpu_idx);
	rpl->rsvd = rpl->opt2;		/* workaround for HW bug */
	rpl->peer_ip = ip->ip_dst.s_addr;
	rpl->opt0h = synqe->opt0h |
	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
	    calc_opt0l(NULL, synqe->rx_credits);

	l2t_send(sc, r, e);
done:
	m_freem(m);
	return (0);
}

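/*
 * Process a CPL_ABORT_REQ_RSS for a tid that is still a synq entry (the peer
 * reset an embryonic connection).  The first request just gets noted in the
 * synqe's flags; action is taken when it is seen again.  If a reset of our
 * own is in flight we let do_abort_rpl_synqe clean up instead of doing it
 * here.
 */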
int
do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
	    ("%s: !SYNQ_ENTRY", __func__));

	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);

	INP_WLOCK(inp);

	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
		synqe->flags |= TP_ABORT_REQ_RCVD;
		synqe->flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}
	synqe->flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this synqe, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		synqe->flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);
		release_tid(tod, tid, qs->idx);
		l2t_release(td->l2t, synqe->e);
		release_synqe(synqe);
	}

	send_abort_rpl(tod, tid, qs->idx);
	m_freem(m);
	return (0);
}

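/*
 * Process a CPL_ABORT_RPL_RSS for a synq entry: the T3's acknowledgment of
 * the reset we sent in t3_send_reset_synqe.  As with the abort request, the
 * reply is acted on only the second time it is seen; the synqe is then torn
 * down.
 */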
int
do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	CTR4(KTR_CXGB, "%s: tid %u, synqe %p, status %d", __func__, tid, synqe,
	    rpl->status);

	INP_WLOCK(inp);

	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
			synqe->flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			synqe->flags &= ~TP_ABORT_RPL_RCVD;
			synqe->flags &= ~TP_ABORT_RPL_PENDING;

			TAILQ_REMOVE(&lctx->synq, synqe, link);
			inp = release_lctx(td, lctx);
			if (inp)
				INP_WUNLOCK(inp);
			release_tid(tod, tid, qs->idx);
			l2t_release(td->l2t, synqe->e);
			release_synqe(synqe);
		}
	}

	m_freem(m);
	return (0);
}

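/*
 * Send a CPL_ABORT_REQ to reset an embryonic (synq) connection.  This is a
 * no-op if a reset has already been sent or if an abort from the peer is
 * already being processed (TP_ABORT_SHUTDOWN set).
 */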
static void
t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct cpl_abort_req *req;
	unsigned int tid = synqe->tid;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
#ifdef INVARIANTS
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
	    synqe->flags);

	if (synqe->flags & TP_ABORT_SHUTDOWN)
		return;

	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = 0;
	req->rsvd1 = !(synqe->flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	l2t_send(sc, m, synqe->e);
}

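/*
 * Install the toepcb on the socket created by syncache_expand and mark the
 * connection established.  update_tid switches the tid's context from the
 * synq entry to the toepcb so that subsequent CPLs find the right state.
 */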
void
t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = sc->tom_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct cpl_pass_establish *cpl = synqe->cpl;
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);

	offload_socket(so, toep);
	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	update_tid(td, toep, synqe->tid);
	synqe->flags |= TP_SYNQE_EXPANDED;
}
#endif
