/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 294887 2016-01-27 04:59:28Z glebius $");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <netinet/ip.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_toepcb.h"

VNET_DECLARE(int, tcp_do_autosndbuf);
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
VNET_DECLARE(int, tcp_autosndbuf_inc);
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
VNET_DECLARE(int, tcp_autosndbuf_max);
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
extern int always_keepalive;

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
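
/*
 * E.g., submode 3 (both submode bits set) consumes 8 extra bytes of sequence
 * space per packet.  The 0/4/4/8 pattern suggests each submode bit stands
 * for a 4-byte digest (header and data digests for iSCSI), though that
 * mapping is an inference from the table, not from hardware documentation.
 */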

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)
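
/*
 * (1U << 27) - 1 is just under 128MB.  opt0's RCV_BUFSIZ field expresses the
 * initial window in 1KB units (note the >> 10 in t3_connect), which is why
 * only a small part of this range fits there.
 */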

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

static void t3_release_offload_resources(struct toepcb *);
static void send_reset(struct toepcb *toep);

/*
 * Called after the last CPL for the toepcb has been received.
 *
 * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
 * time this function exits.
 */
static int
toepcb_release(struct toepcb *toep)
{
	struct inpcb *inp = toep->tp_inp;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	int rc;

	INP_WLOCK_ASSERT(inp);
	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
	    ("%s: double release?", __func__));

	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);

	toep->tp_flags |= TP_CPL_DONE;
	toep->tp_inp = NULL;

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	if (!(toep->tp_flags & TP_ATTACHED))
		t3_release_offload_resources(toep);

	rc = in_pcbrele_wlocked(inp);
	if (!rc)
		INP_WUNLOCK(inp);
	return (rc);
}

/*
 * One sided detach.  The tcpcb is going away and we need to unhook the toepcb
 * hanging off it.  If the TOE driver is also done with the toepcb we'll release
 * all offload resources.
 */
static void
toepcb_detach(struct inpcb *inp)
{
	struct toepcb *toep;
	struct tcpcb *tp;

	KASSERT(inp, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);

	tp = intotcpcb(inp);
	toep = tp->t_toe;

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));

	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
	    toep, inp, tp);

	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toep->tp_flags &= ~TP_ATTACHED;

	if (toep->tp_flags & TP_CPL_DONE)
		t3_release_offload_resources(toep);
}

void
t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{

	toepcb_detach(tp->t_inpcb);
}

static int
alloc_atid(struct tid_info *t, void *ctx)
{
	int atid = -1;

	mtx_lock(&t->atid_lock);
	if (t->afree) {
		union active_open_entry *p = t->afree;

		atid = (p - t->atid_tab) + t->atid_base;
		t->afree = p->next;
		p->ctx = ctx;
		t->atids_in_use++;
	}
	mtx_unlock(&t->atid_lock);

	return (atid);
}

static void
free_atid(struct tid_info *t, int atid)
{
	union active_open_entry *p = atid2entry(t, atid);

	mtx_lock(&t->atid_lock);
	p->next = t->afree;
	t->afree = p;
	t->atids_in_use--;
	mtx_unlock(&t->atid_lock);
}

void
insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, 1);
}

void
update_tid(struct tom_data *td, void *ctx, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = ctx;
}

void
remove_tid(struct tom_data *td, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = NULL;
	atomic_add_int(&t->tids_in_use, -1);
}

/* use ctx as a next pointer in the tid release list */
void
queue_tid_release(struct toedev *tod, unsigned int tid)
{
	struct tom_data *td = t3_tomdata(tod);
	void **p = &td->tid_maps.tid_tab[tid];
	struct adapter *sc = tod->tod_softc;

	mtx_lock(&td->tid_release_lock);
	*p = td->tid_release_list;
	td->tid_release_list = p;
	if (!*p)
		taskqueue_enqueue(sc->tq, &td->tid_release_task);
	mtx_unlock(&td->tid_release_lock);
}

/*
 * Populate a TID_RELEASE WR.
 */
static inline void
mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
{

	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

void
release_tid(struct toedev *tod, unsigned int tid, int qset)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
	struct cpl_tid_release *cpl;
#ifdef INVARIANTS
	struct tid_info *t = &td->tid_maps;
#endif

	KASSERT(tid >= 0 && tid < t->ntids,
	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m) {
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
	} else
		queue_tid_release(tod, tid);
}

void
t3_process_tid_release_list(void *data, int pending)
{
	struct mbuf *m;
	struct tom_data *td = data;
	struct adapter *sc = td->tod.tod_softc;

	mtx_lock(&td->tid_release_lock);
	while (td->tid_release_list) {
		void **p = td->tid_release_list;
		unsigned int tid = p - td->tid_maps.tid_tab;
		struct cpl_tid_release *cpl;

		td->tid_release_list = (void **)*p;
		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
		if (m == NULL)
			break;	/* XXX: who reschedules the release task? */
		mtx_unlock(&td->tid_release_lock);
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
		mtx_lock(&td->tid_release_lock);
	}
	mtx_unlock(&td->tid_release_lock);
}

static void
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;

	if (toep->tp_flags & TP_FIN_SENT)
		return;

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
	req->rsvd = 0;

	toep->tp_flags |= TP_FIN_SENT;
	t3_offload_tx(sc, m);
}

static inline void
make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
    struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		struct adapter *sc = toep->tp_tod->tod_softc;
		int cpu_idx = sc->rrss_map[toep->tp_qset];

		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(cpu_idx));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && (snd->sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}

/*
 * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
 * TOM_XXX_MOVE to some common header file.
 */
/*
 * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
 * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
 * for the second gen bit flit.  This leaves us with 12 flits.
 *
 * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
 * The first desc has a tx_data_wr (which includes the WR header), the rest have
 * the WR header only.  All descs have the second gen bit flit.
 *
 * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
 * desc has a tx_data_wr (which includes the WR header), the rest have the WR
 * header only.  All descs have the second gen bit flit.
 *
 * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
 */
#define IMM_LEN 96
static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
static int sgllen_to_descs[TX_MAX_SEGS] = {
	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/*  0 -  9 */
	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
	4, 4, 4, 4, 4, 4		/* 30 - 35 */
};
#if 0
static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
};
#endif
#if SGE_NUM_GENBITS != 2
#error "SGE_NUM_GENBITS really must be 2"
#endif
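
/*
 * Worked arithmetic for the tables above (a sketch; it assumes the usual T3
 * SGL layout where a struct sg_ent packs 2 entries into 3 8-byte flits):
 * the first desc has 16 - 3 - 1 = 12 usable flits = 96 bytes, hence IMM_LEN.
 * Those same 12 flits hold 4 sg_ents = 8 SGL entries; every later desc has
 * 16 - 1 - 1 = 14 usable flits = 9 entries (4 full sg_ents plus a 2-flit
 * partial one), which yields the 0, 8, 17, 26, 35 progression.
 */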

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m0, *sndptr, *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	int bytes, ndesc, total_bytes = 0, mlen;
	struct sockbuf *snd;
	struct sglist *sgl;
	struct ofld_hdr *oh;
	caddr_t dst;
	struct tx_data_wr *wr;

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	SOCKBUF_LOCK(snd);

	/*
	 * Autosize the send buffer.
	 */
	if ((snd->sb_flags & SB_AUTOSIZE) && V_tcp_do_autosndbuf) {
		if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) &&
		    sbused(snd) < V_tcp_autosndbuf_max) {
			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
			    V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max),
			    so, curthread))
				snd->sb_flags &= ~SB_AUTOSIZE;
		}
	}

	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
		sndptr = toep->tp_m_last->m_next;
	else
		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	/* Nothing to send or no WRs available for sending data */
	if (toep->tp_wr_avail == 0 || sndptr == NULL)
		goto out;

	/* Something to send and at least 1 WR available */
	while (toep->tp_wr_avail && sndptr != NULL) {

		m0 = m_gethdr(M_NOWAIT, MT_DATA);
		if (m0 == NULL)
			break;
		oh = mtod(m0, struct ofld_hdr *);
		wr = (void *)(oh + 1);
		dst = (void *)(wr + 1);

		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
		    V_HDR_QSET(toep->tp_qset);

		/*
		 * Try to construct an immediate data WR if possible.  Stuff as
		 * much data into it as possible, one whole mbuf at a time.
		 */
		mlen = sndptr->m_len;
		ndesc = bytes = 0;
		while (mlen <= IMM_LEN - bytes) {
			bcopy(sndptr->m_data, dst, mlen);
			bytes += mlen;
			dst += mlen;

			if (!(sndptr = sndptr->m_next))
				break;
			mlen = sndptr->m_len;
		}

		if (bytes) {

			/* Was able to fit 'bytes' bytes in an immediate WR */

			ndesc = 1;
			make_tx_data_wr(so, wr, bytes, sndptr);

			m0->m_len += bytes;
			m0->m_pkthdr.len = m0->m_len;

		} else {
			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);

			/* Need to make an SGL */

			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
			if (sgl == NULL)
				break;

			for (m = sndptr; m != NULL; m = m->m_next) {
				if ((mlen = m->m_len) > 0) {
					if (sglist_append(sgl, m->m_data, mlen))
						break;
				}
				bytes += mlen;
			}
			sndptr = m;
			if (bytes == 0) {
				sglist_free(sgl);
				break;
			}
			ndesc = sgllen_to_descs[sgl->sg_nseg];
			oh->flags |= F_HDR_SGL;
			oh->sgl = sgl;
			make_tx_data_wr(so, wr, bytes, sndptr);
		}

		oh->flags |= V_HDR_NDESC(ndesc);
		oh->plen = bytes;

		snd->sb_sndptr = sndptr;
		snd->sb_sndptroff += bytes;
		if (sndptr == NULL) {
			snd->sb_sndptr = snd->sb_mbtail;
			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
			toep->tp_m_last = snd->sb_mbtail;
		} else
			toep->tp_m_last = NULL;

		total_bytes += bytes;

		toep->tp_wr_avail -= ndesc;
		toep->tp_wr_unacked += ndesc;

		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		enqueue_wr(toep, m0);
		l2t_send(sc, m0, toep->tp_l2t);
	}
out:
	SOCKBUF_UNLOCK(snd);

	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
		close_conn(sc, toep);

	return (total_bytes);
}

static int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (0);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wrh_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	t3_offload_tx(sc, m);
	return (credits);
}

void
t3_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *so_rcv = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int must_send;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(so_rcv);
	KASSERT(toep->tp_enqueued >= sbused(so_rcv),
	    ("%s: sbused(so_rcv) > enqueued", __func__));
	toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv);
	toep->tp_enqueued = sbused(so_rcv);
	SOCKBUF_UNLOCK(so_rcv);

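	/*
	 * Return credits if the window would otherwise shrink to within 16KB
	 * of closing, or once at least 15KB worth have accumulated.
	 */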
	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
	if (must_send || toep->tp_rx_credits >= 15 * 1024) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
		toep->tp_rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
}

static int
do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);

	log(LOG_ERR, "%s: tid %u inp %p\n", __func__, tid, toep->tp_inp);

	m_freem(m);
	return (0);
}

int
t3_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp_inpcbtosocket(inp);
#if defined(KTR)
	unsigned int tid = toep->tp_tid;
#endif

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
	    toep->tp_flags);

	toep->tp_flags |= TP_SEND_FIN;
	t3_push_frames(so, 1);

	return (0);
}

int
t3_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;

	t3_push_frames(so, 1);
	return (0);
}

/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
{
	unsigned short *mtus = &sc->params.mtus[0];
	int i = 0, mss;

	KASSERT(inc != NULL || pmss > 0,
	    ("%s: at least one of inc/pmss must be specified", __func__));

	mss = inc ? tcp_mssopt(inc) : pmss;
	if (pmss > 0 && mss > pmss)
		mss = pmss;

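	/*
	 * Pick the largest MTU that still accommodates the MSS plus 40 bytes
	 * of IP and TCP headers.
	 */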
	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
		++i;

	return (i);
}

static inline void
purge_wr_queue(struct toepcb *toep)
{
	struct mbuf *m;
	struct ofld_hdr *oh;

	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) {
		oh = mtod(m, struct ofld_hdr *);
		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(m);
	}
}

/*
 * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T
 * entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	/*
	 * The TOM explicitly detaches its toepcb from the system's inp before
	 * it releases the offload resources.
	 */
	if (toep->tp_inp) {
		panic("%s: inp %p still attached to toepcb %p",
		    __func__, toep->tp_inp, toep);
	}

	if (toep->tp_wr_avail != toep->tp_wr_max)
		purge_wr_queue(toep);

	if (toep->tp_l2t) {
		l2t_release(td->l2t, toep->tp_l2t);
		toep->tp_l2t = NULL;
	}

	if (toep->tp_tid >= 0)
		release_tid(tod, toep->tp_tid, toep->tp_qset);

	toepcb_free(toep);
}

/*
 * Determine the receive window size for a socket.
 */
unsigned long
select_rcv_wnd(struct socket *so)
{
	unsigned long wnd;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	wnd = sbspace(&so->so_rcv);
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	return (min(wnd, MAX_RCV_WND));
}

int
select_rcv_wscale(void)
{
	int wscale = 0;
	unsigned long space = sb_max;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

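	/*
	 * Find the smallest wscale for which TCP_MAXWIN << wscale covers the
	 * buffer; e.g., the stock 2MB sb_max needs a wscale of 6.
	 */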
	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;

	return (wscale);
}

/*
 * Set up the socket for TCP offload.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	SOCKBUF_LOCK(&so->so_snd);
	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Update TCP PCB */
	tp->tod = toep->tp_tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->tp_inp = inp;
	toep->tp_flags |= TP_ATTACHED;
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	INP_WLOCK_ASSERT(inp);

	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->tp_inp = NULL;
	toep->tp_flags &= ~TP_ATTACHED;
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/*
 * Socket could be a listening socket, and we may not have a toepcb at all at
 * this time.
 */
uint32_t
calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
{
	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
	    V_MSS_IDX(mtu_idx);

	if (so != NULL) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);
		int keepalive = always_keepalive ||
		    so_options_get(so) & SO_KEEPALIVE;

		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
		opt0h |= V_KEEP_ALIVE(keepalive != 0);
	}

	if (e != NULL)
		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);

	return (htobe32(opt0h));
}

uint32_t
calc_opt0l(struct socket *so, int rcv_bufsize)
{
	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);

	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));

	if (so != NULL)		/* optional because no one cares about IP TOS */
		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));

	return (htobe32(opt0l));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (EAGAIN);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EAGAIN);
	default:
		return (EIO);
	}
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return (status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS);
}

/*
 * Active open failed.
 */
static int
do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	struct cpl_act_open_rpl *rpl = mtod(m, void *);
	unsigned int atid = G_TID(ntohl(rpl->atid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	int s = rpl->status, rc;

	CTR3(KTR_CXGB, "%s: atid %u, status %u", __func__, atid, s);

	free_atid(&td->tid_maps, atid);
	toep->tp_tid = -1;

	if (act_open_has_tid(s))
		queue_tid_release(tod, GET_TID(rpl));

	rc = act_open_rpl_status_to_errno(s);
	if (rc != EAGAIN)
		INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	toe_connect_failed(tod, inp, rc);
	toepcb_release(toep);	/* unlocks inp */
	if (rc != EAGAIN)
		INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Send an active open request.
 *
 * State of affairs on entry:
 * soisconnecting (so_state |= SS_ISCONNECTING)
 * tcbinfo not locked (this has changed - used to be WLOCKed)
 * inp WLOCKed
 * tp->t_state = TCPS_SYN_SENT
 * rtalloc1, RT_UNLOCK on rt.
 */
int
t3_connect(struct toedev *tod, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m = NULL;
	struct l2t_entry *e = NULL;
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct cpl_act_open_req *cpl;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep;
	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
	struct sockaddr *gw;
	struct ifnet *ifp = rt->rt_ifp;
	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */

	INP_WLOCK_ASSERT(inp);

	toep = toepcb_alloc(tod);
	if (toep == NULL)
		goto failed;

	atid = alloc_atid(&td->tid_maps, toep);
	if (atid < 0)
		goto failed;

	qset = pi->first_qset + (arc4random() % pi->nqsets);

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m == NULL)
		goto failed;

	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
	e = t3_l2t_get(pi, ifp, gw);
	if (e == NULL)
		goto failed;

	toep->tp_l2t = e;
	toep->tp_tid = atid;	/* used to double check response */
	toep->tp_qset = qset;

	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	offload_socket(so, toep);

	/*
	 * The kernel sets request_r_scale based on sb_max whereas we need to
	 * take hardware's MAX_RCV_WND into account too.  This is normally a
	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
	 */
	if (tp->t_flags & TF_REQ_SCALE)
		rscale = tp->request_r_scale = select_rcv_wscale();
	else
		rscale = 0;
	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
	cpu_idx = sc->rrss_map[qset];

	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
	cpl->wr.wrh_lo = 0;
	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
	    &cpl->peer_port);
	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
	cpl->params = 0;
	cpl->opt2 = calc_opt2(cpu_idx);

	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);

	if (l2t_send(sc, m, e) == 0)
		return (0);

	undo_offload_socket(so);

failed:
	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
	    __func__, atid, toep, e, m);

	if (atid >= 0)
		free_atid(&td->tid_maps, atid);

	if (e)
		l2t_release(td->l2t, e);

	if (toep)
		toepcb_free(toep);

	m_freem(m);

	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
 * send multiple ABORT_REQs for the same connection and also that we do not try
 * to send a message after the connection has closed.
 */
static void
send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	struct inpcb *inp = toep->tp_inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
	    toep->tp_flags);

	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
		return;

	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	/* Purge the send queue */
	sbflush(so_sockbuf_snd(so));
	purge_wr_queue(toep);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = htonl(tp->snd_nxt);
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	if (tp->t_state == TCPS_SYN_SENT)
		(void)mbufq_enqueue(&toep->out_of_order_queue, m); /* defer */
	else
		l2t_send(sc, m, toep->tp_l2t);
}

int
t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
{

	send_reset(tp->t_toe);
	return (0);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_rx_data *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *so_rcv;

	/* Advance over CPL */
	m_adj(m, sizeof(*hdr));

	/* XXX: revisit.  This comes from the T4 TOM */
	if (__predict_false(inp == NULL)) {
		/*
		 * do_pass_establish failed and must be attempting to abort the
		 * connection.  Meanwhile, the T4 has sent us data for such a
		 * connection.
		 */
#ifdef notyet
		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
		    ("%s: inp NULL and tid isn't being aborted", __func__));
#endif
		m_freem(m);
		return (0);
	}

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
		toep->tp_delack_mode = hdr->dack_mode;

	tp = intotcpcb(inp);

#ifdef INVARIANTS
	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
		log(LOG_ERR,
		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
	}
#endif
	tp->rcv_nxt += m->m_pkthdr.len;
	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
	    ("%s: negative window size", __func__));
	tp->rcv_wnd -= m->m_pkthdr.len;
	tp->t_rcvtime = ticks;

	so = inp->inp_socket;
	so_rcv = &so->so_rcv;
	SOCKBUF_LOCK(so_rcv);

	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, m->m_pkthdr.len);
		SOCKBUF_UNLOCK(so_rcv);
		INP_WUNLOCK(inp);

		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);

		m_freem(m);
		return (0);
	}

	/* receive buffer autosize */
	if (so_rcv->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
		unsigned int hiwat = so_rcv->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
			so_rcv->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->tp_rx_credits += newsize - hiwat;
	}

	toep->tp_enqueued += m->m_pkthdr.len;
	sbappendstream_locked(so_rcv, m, 0);
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(so_rcv);

	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_peer_close *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags, inp);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
		goto done;

	so = inp_inpcbtosocket(inp);

	socantrcvmore(so);
	tp->rcv_nxt++;

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		toepcb_release(toep);	/* no more CPLs expected */

		m_freem(m);
		return (0);
	default:
		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    __func__, toep->tp_tid, tp->t_state);
	}

done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.  Peer ACK to our FIN received.
 */
static int
do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
		goto done;

	so = inp_inpcbtosocket(inp);
	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		toepcb_release(toep);	/* no more CPLs expected */

		m_freem(m);
		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tp->t_state = TCPS_FIN_WAIT_2;
		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    __func__, toep->tp_tid, tp->t_state);
	}

done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

static int
do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct cpl_smt_write_rpl *rpl = mtod(m, void *);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR,
		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
		    rpl->status, GET_TID(rpl));
	}

	m_freem(m);
	return (0);
}

static int
do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct cpl_set_tcb_rpl *rpl = mtod(m, void *);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
		    rpl->status, GET_TID(rpl));
	}

	m_freem(m);
	return (0);
}

/*
 * Handle an ABORT_RPL_RSS CPL message.
 */
static int
do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp;

	/*
	 * Ignore replies to post-close aborts indicating that the abort was
	 * requested too late.  These connections are terminated when we get
	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
	 * arrives the TID is either no longer used or it has been recycled.
	 */
	if (rpl->status == CPL_ERR_ABORT_FAILED) {
		m_freem(m);
		return (0);
	}

	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
		return (do_abort_rpl_synqe(qs, r, m));

	CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep,
	    rpl->status);

	inp = toep->tp_inp;
	INP_WLOCK(inp);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) {
			toep->tp_flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			toep->tp_flags &= ~TP_ABORT_RPL_RCVD;
			toep->tp_flags &= ~TP_ABORT_RPL_PENDING;
			toepcb_release(toep);	/* no more CPLs expected */
		}
	}

	m_freem(m);
	return (0);
}

/*
 * Convert the status code of an ABORT_REQ into a FreeBSD error code.
 */
static int
abort_status_to_errno(struct tcpcb *tp, int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * Returns whether an ABORT_REQ_RSS message is a negative advice.
 */
static inline int
is_neg_adv_abort(unsigned int status)
{
	return (status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE);
}

void
send_abort_rpl(struct toedev *tod, int tid, int qset)
{
	struct mbuf *reply;
	struct cpl_abort_rpl *rpl;
	struct adapter *sc = tod->tod_softc;

	reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl);
	if (!reply)
		CXGB_UNIMPLEMENTED();

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
	rpl->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
	rpl->cmd = CPL_ABORT_NO_RST;

	t3_offload_tx(sc, reply);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.  If we're waiting for an ABORT_RPL we
 * ignore this request except that we need to reply to it.
 */
static int
do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	int qset = toep->tp_qset;

	if (is_neg_adv_abort(req->status)) {
		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
		    __func__, req->status, tid, toep->tp_flags);
		m_freem(m);
		return (0);
	}

	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
		return (do_abort_req_synqe(qs, r, m));

	inp = toep->tp_inp;
	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);
	so = inp->inp_socket;

	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
	    req->status);

	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		toep->tp_flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		toep->tp_flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		so_error_set(so, abort_status_to_errno(tp, req->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
		toepcb_release(toep);	/* no more CPLs expected */
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	send_abort_rpl(tod, tid, qset);
	m_freem(m);
	return (0);
}

static void
assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = toep->tp_tod->tod_softc;

	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;

	if (G_TCPOPT_TSTAMP(tcpopt)) {
		tp->t_flags |= TF_RCVD_TSTMP;
		tp->t_flags |= TF_REQ_TSTMP;	/* forcibly set */
		tp->ts_recent = 0;		/* XXX */
		tp->ts_recent_age = tcp_ts_getticks();
	}

	if (G_TCPOPT_SACK(tcpopt))
		tp->t_flags |= TF_SACK_PERMIT;
	else
		tp->t_flags &= ~TF_SACK_PERMIT;

	if (G_TCPOPT_WSCALE_OK(tcpopt))
		tp->t_flags |= TF_RCVD_SCALE;

	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
	}
}

/*
 * The ISS and IRS are from after the exchange of SYNs and are off by 1.
 */
void
make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
    uint16_t cpl_tcpopt)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	long bufsize;
	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(cpl_tcpopt);

	INP_WLOCK_ASSERT(inp);

	tp->t_state = TCPS_ESTABLISHED;
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
	    toep->tp_tid, toep, inp);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
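	/* tp_rx_credits holds the opt0 RCV_BUFSIZ value, in 1KB units. */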
	tp->rcv_wnd = toep->tp_rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->tp_rx_credits = bufsize - tp->rcv_wnd;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	soisconnected(so);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = toep->tp_inp;
	unsigned int tid = toep->tp_tid;

	inp_lock_assert(inp);

	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		struct ofld_hdr *oh = mtod(m, void *);
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = (void *)(oh + 1);

		p->wr.wrh_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		t3_offload_tx(sc, m);
	}
}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_act_establish *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);

	free_atid(&td->tid_maps, atid);

	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	KASSERT(toep->tp_qset == qs->idx,
	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
	KASSERT(toep->tp_tid == atid,
	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));

	toep->tp_tid = tid;
	insert_tid(td, toep, tid);

	if (inp->inp_flags & INP_DROPPED) {
		/* socket closed by the kernel before hw told us it connected */
		send_reset(toep);
		goto done;
	}

	KASSERT(tp->t_state == TCPS_SYN_SENT,
	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));

	so = inp->inp_socket;
	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

done:
	INP_WUNLOCK(inp);
	m_freem(m);
	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	struct socket *so;
	unsigned int credits = ntohs(hdr->credits);
	uint32_t snd_una = ntohl(hdr->snd_una);
	int bytes = 0;
	struct sockbuf *snd;
	struct mbuf *p;
	struct ofld_hdr *oh;

	inp_wlock(inp);
	tp = intotcpcb(inp);
	so = inp->inp_socket;
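	/*
	 * Credits are returned in units of tx descriptors.  Clamp the
	 * unacked count so that unacked + available never exceeds the max.
	 */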
	toep->tp_wr_avail += credits;
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	while (credits) {
		p = peek_wr(toep);

		if (__predict_false(!p)) {
			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
			    "tid %u, state %u, wr_avail %u", __func__, credits,
			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);

			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u wr_avail=%u\n",
			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
			break;
		}

		oh = mtod(p, struct ofld_hdr *);

		KASSERT(credits >= G_HDR_NDESC(oh->flags),
		    ("%s: partial credits?  %d %d", __func__, credits,
		    G_HDR_NDESC(oh->flags)));

		dequeue_wr(toep);
		credits -= G_HDR_NDESC(oh->flags);
		bytes += oh->plen;

		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(p);
	}

	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
		goto out_free;

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = tcp_ts_getticks();
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}

	snd = so_sockbuf_snd(so);
	if (bytes) {
		SOCKBUF_LOCK(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);
	}

	if (snd->sb_sndptroff < sbused(snd))
		t3_push_frames(so, 0);

out_free:
	inp_wunlock(tp->t_inpcb);
	m_freem(m);
}

/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);

	/* XXX bad race */
	if (toep)
		wr_ack(toep, m);

	return (0);
}

void
t3_init_cpl_io(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
}
#endif