/*-
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_cpl_io.c 342583 2018-12-29 00:30:17Z jhb $");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/toecore.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

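/*
 * The TCP stack's send/receive buffer auto-sizing knobs.  The TOE tx/rx
 * paths below consult these so that offloaded connections follow the same
 * auto-sizing policy as the software stack.
 */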
VNET_DECLARE(int, tcp_do_autosndbuf);
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
VNET_DECLARE(int, tcp_autosndbuf_inc);
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
VNET_DECLARE(int, tcp_autosndbuf_max);
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

static inline struct mbuf *
mbufq_dequeue(struct mbufq *q)
{
	struct mbuf *m;

	m = q->head;
	if (m) {
		if (q->tail == m)
			q->tail = NULL;
		q->head = m->m_nextpkt;
		m->m_nextpkt = NULL;
	}
	return (m);
}

static inline void
mbufq_enqueue(struct mbufq *q, struct mbuf *m)
{

	m->m_nextpkt = NULL;
	if (q->tail)
		q->tail->m_nextpkt = m;
	else
		q->head = m;
	q->tail = m;
}

static inline struct mbuf *
mbufq_first(const struct mbufq *q)
{

	return (q->head);
}

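/*
 * Send a FW_FLOWC_WR to the firmware to set up per-tid flow state.  ftxp
 * carries the initial send parameters (snd_nxt, rcv_nxt, send buffer size,
 * MSS) for a fully established connection; callers tearing down a tid that
 * never got that far pass NULL and only the basic channel/queue parameters
 * are programmed.
 */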
void
send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams = ftxp ? 8 : 6, flowclen;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
	if (ftxp) {
		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);

		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
		flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
		flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
		flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
		flowc->mnemval[6].val = htobe32(sndbuf);
		flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
		flowc->mnemval[7].val = htobe32(ftxp->mss);

		CTR6(KTR_CXGBE,
		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
		    ftxp->rcv_nxt);
	} else {
		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
		flowc->mnemval[4].val = htobe32(512);
		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
		flowc->mnemval[5].val = htobe32(512);

		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
	}

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct adapter *sc = td_adapter(toep->td);
	int n;

	INP_LOCK_ASSERT(inp);

	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		n = sizeof(struct ip) + sizeof(struct tcphdr);
	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;

	CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid,
	    G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]);

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from after the exchange of SYNs.  i.e., the true ISN + 1.
 */
void
make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
    uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	long bufsize;
	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(opt);
	struct flowc_tx_params ftxp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p",
	    __func__, toep->tid, toep, inp);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = toep->rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->rx_credits = bufsize - tp->rcv_wnd;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
		bufsize = V_tcp_autosndbuf_max;
	else
		bufsize = sbspace(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);

	ftxp.snd_nxt = tp->snd_nxt;
	ftxp.rcv_nxt = tp->rcv_nxt;
	ftxp.snd_space = bufsize;
	ftxp.mss = tp->t_maxseg;
	send_flowc_wr(toep, &ftxp);

	soisconnected(so);
}

static int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

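/*
 * Called via tod_rcvd after the application has taken data out of the
 * socket's receive buffer.  Returns freed-up receive window to the chip as
 * rx credits once enough has accumulated to be worth a CPL_RX_DATA_ACK.
 */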
void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int credits;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(sb);
	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
	toep->sb_cc = sb->sb_cc;

	if (toep->rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {

		credits = send_rx_credits(sc, toep, toep->rx_credits);
		toep->rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
static int
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
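
/*
 * Tx credits are in units of 16 bytes of work request.  MAX is the largest
 * WR the SGE will accept; MIN is the smallest ofld_tx_data WR that still
 * carries at least one byte of payload.
 */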

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits)
{
	const int n = 2;	/* Use only up to 2 desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
}

/* Maximum number of SGL entries we could stuff in a WR */
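/*
 * The first SGL entry lives in the ulptx_sgl header itself; every additional
 * pair of entries occupies a 24-byte ulptx_sge_pair, and a leftover unpaired
 * entry still consumes 16 bytes.  That is where the 16/24 arithmetic below
 * comes from.
 */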
static inline int
max_dsgl_nsegs(int tx_credits)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

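/*
 * Fill in a fw_ofld_tx_data_wr.  immdlen is the number of bytes of immediate
 * data (0 for DSGL tx), plen is the payload length as seen by TCP sequence
 * space, and credits is the WR's size in 16-byte units.
 */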
static inline void
write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
    unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (txalign > 0) {
		struct tcpcb *tp = intotcpcb(toep->inp);

		if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi))
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(tp->t_flags & TF_NODELAY ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have.  This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
	    toep->ulp_mode == ULP_MODE_TCPDDP ||
	    toep->ulp_mode == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup)
						sowwakeup_locked(so);
					else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sb->sb_cc > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sb->sb_cc >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup)
			sowwakeup_locked(so);
		else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL,
			    ("%s: nothing to send, but m != NULL", __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
					toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
			    sc->tt.tx_align);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
			    sc->tt.tx_align);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		close_conn(sc, toep);
}

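/*
 * Free PDUs from the head of the reclaim queue that account for exactly plen
 * bytes of payload whose transmission the chip has now acknowledged.
 */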
static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}

void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop)
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		M_ASSERTPKTHDR(sndptr);

		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/*
			 * This mbuf would send us _over_ the nsegs limit.
			 * Suspend tx because the PDU can't be sent out.
			 */
			if (plen > max_imm && nsegs > max_nsegs) {
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		/*
		 * We have a PDU to send.  All of it goes out in one WR so 'm'
		 * is NULL.  A PDU's length is always a multiple of 4.
		 */
		MPASS(m == NULL);
		MPASS((plen & 3) == 0);
		MPASS(sndptr->m_pkthdr.len == plen);

		shove = !(tp->t_flags & TF_MORETOCOME);
		ulp_submode = mbuf_ulp_submode(sndptr);
		MPASS(ulp_submode < nitems(ulp_extra_len));

		/*
		 * plen doesn't include header and data digests, which are
		 * generated and inserted in the right places by the TOE, but
		 * they do occupy TCP sequence space and need to be accounted
		 * for.
		 */
		adjusted_plen = plen + ulp_extra_len[ulp_submode];
		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
					toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
			    shove, ulp_submode, sc->tt.tx_align);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */
			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
			    shove, ulp_submode, sc->tt.tx_align);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += adjusted_plen;
		tp->snd_max += adjusted_plen;

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		close_conn(sc, toep);
}

int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	if (toep->ulp_mode == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, 0);
	else
		t4_push_frames(sc, toep, 0);

	return (0);
}

int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED) {
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, 0);
		else
			t4_push_frames(sc, toep, 0);
	}

	return (0);
}

int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so;
	struct sockbuf *sb;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (__predict_false(toep->flags & TPF_SYNQE)) {
#ifdef INVARIANTS
		struct synq_entry *synqe = (void *)toep;

		INP_WLOCK(synqe->lctx->inp);
		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		} else {
			/*
			 * do_pass_accept_req is still running and will
			 * eventually take care of this tid.
			 */
		}
		INP_WUNLOCK(synqe->lctx->inp);
#endif
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	tp->rcv_nxt++;	/* FIN */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (__predict_false(toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) {
		handle_ddp_close(toep, tp, sb, cpl->rcv_nxt);
	}
	socantrcvmore_locked(so);	/* unlocks the sockbuf */

	if (toep->ulp_mode != ULP_MODE_RDMA) {
		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_CLOSE_WAIT);
		break;

	case TCPS_FIN_WAIT_1:
		tcp_state_change(tp, TCPS_CLOSING);
		break;

	case TCPS_FIN_WAIT_2:
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return (0);
}

void
send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_wrq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	int len;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
#ifdef INVARIANTS
		struct synq_entry *synqe = (void *)toep;

		INP_WLOCK(synqe->lctx->inp);
		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		} else {
			/*
			 * do_pass_accept_req is still running and will
			 * eventually take care of this tid.
			 */
		}
		INP_WUNLOCK(synqe->lctx->inp);
#endif
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
				("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->rx_credits += newsize - hiwat;
	}

	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off;

		if (changed) {
			if (toep->ddp_flags & DDP_SC_REQ)
				toep->ddp_flags ^= DDP_ON | DDP_SC_REQ;
			else {
				KASSERT(cpl->ddp_off == 1,
				    ("%s: DDP switched on by itself.",
				    __func__));

				/* Fell out of DDP mode */
				toep->ddp_flags &= ~(DDP_ON | DDP_BUF0_ACTIVE |
				    DDP_BUF1_ACTIVE);

				if (ddp_placed)
					insert_ddp_data(toep, ddp_placed);
			}
		}

		if ((toep->ddp_flags & DDP_OK) == 0 &&
		    time_uptime >= toep->ddp_disabled + DDP_RETRY_WAIT) {
			toep->ddp_score = DDP_LOW_SCORE;
			toep->ddp_flags |= DDP_OK;
			CTR3(KTR_CXGBE, "%s: tid %u DDP_OK @ %u",
			    __func__, tid, time_uptime);
		}

		if (toep->ddp_flags & DDP_ON) {

			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.  Ask
			 * soreceive to post a buffer or disable DDP.  The
			 * payload that arrived in this indicate is appended to
			 * the socket buffer as usual.
			 */

#if 0
			CTR5(KTR_CXGBE,
			    "%s: tid %u (0x%x) DDP indicate (seq 0x%x, len %d)",
			    __func__, tid, toep->flags, be32toh(cpl->seq), len);
#endif
			sb->sb_flags |= SB_DDP_INDICATE;
		} else if ((toep->ddp_flags & (DDP_OK|DDP_SC_REQ)) == DDP_OK &&
		    tp->rcv_wnd > DDP_RSVD_WIN && len >= sc->tt.ddp_thres) {

			/*
			 * DDP allowed but isn't on (and a request to switch it
			 * on isn't pending either), and conditions are ripe for
			 * it to work.  Switch it on.
			 */

			enable_ddp(sc, toep);
		}
	}

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->rx_credits);
		toep->rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}

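/* Field shift/mask definitions for picking apart a CPL_FW4_ACK header. */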
#define S_CPL_FW4_ACK_OPCODE    24
#define M_CPL_FW4_ACK_OPCODE    0xff
#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE)
#define G_CPL_FW4_ACK_OPCODE(x) \
    (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE)

#define S_CPL_FW4_ACK_FLOWID    0
#define M_CPL_FW4_ACK_FLOWID    0xffffff
#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID)
#define G_CPL_FW4_ACK_FLOWID(x) \
    (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID)

#define S_CPL_FW4_ACK_CR        24
#define M_CPL_FW4_ACK_CR        0xff
#define V_CPL_FW4_ACK_CR(x)     ((x) << S_CPL_FW4_ACK_CR)
#define G_CPL_FW4_ACK_CR(x)     (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR)

#define S_CPL_FW4_ACK_SEQVAL    0
#define M_CPL_FW4_ACK_SEQVAL    0x1
#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL)
#define G_CPL_FW4_ACK_SEQVAL(x) \
    (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL)
#define F_CPL_FW4_ACK_SEQVAL    V_CPL_FW4_ACK_SEQVAL(1U)

static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, plen);
		else
			t4_push_frames(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sb->sb_cc;
		if (toep->ulp_mode == ULP_MODE_ISCSI) {

			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP
				 * mode changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
			sbdrop_locked(sb, plen);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

int
do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
#ifdef INVARIANTS
	struct adapter *sc = iq->adapter;
#endif
	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_SET_TCB_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	MPASS(iq != &sc->sge.fwq);

	/*
	 * TOM and/or other ULPs don't request replies for CPL_SET_TCB or
	 * CPL_SET_TCB_FIELD requests.  This can easily change and when it does
	 * the dispatch code will go here.
	 */
#ifdef INVARIANTS
	panic("%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p", __func__,
	    tid, iq);
#else
	log(LOG_ERR, "%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p\n",
	    __func__, tid, iq);
#endif

	return (0);
}

void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, int tid,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie, int iqid)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;

	MPASS((cookie & ~M_COOKIE) == 0);
	MPASS((iqid & ~M_QUEUENO) == 0);

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid);
	req->reply_ctrl = htobe16(V_QUEUENO(iqid));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	t4_wrq_tx(sc, wr);
}

void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, NULL);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_cpl_handler(CPL_FW4_ACK, NULL);
}
#endif
1809