1/*-
2 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c 355249 2019-11-30 20:22:03Z np $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/aio.h>
37#include <sys/file.h>
38#include <sys/kernel.h>
39#include <sys/ktr.h>
40#include <sys/module.h>
41#include <sys/proc.h>
42#include <sys/protosw.h>
43#include <sys/domain.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sglist.h>
47#include <sys/taskqueue.h>
48#include <netinet/in.h>
49#include <netinet/in_pcb.h>
50#include <netinet/ip.h>
51#include <netinet/ip6.h>
52#define TCPSTATES
53#include <netinet/tcp_fsm.h>
54#include <netinet/tcp_seq.h>
55#include <netinet/tcp_var.h>
56#include <netinet/toecore.h>
57
58#include <security/mac/mac_framework.h>
59
60#include <vm/vm.h>
61#include <vm/vm_extern.h>
62#include <vm/pmap.h>
63#include <vm/vm_map.h>
64#include <vm/vm_page.h>
65
66#include "common/common.h"
67#include "common/t4_msg.h"
68#include "common/t4_regs.h"
69#include "common/t4_tcb.h"
70#include "tom/t4_tom_l2t.h"
71#include "tom/t4_tom.h"
72
73VNET_DECLARE(int, tcp_do_autosndbuf);
74#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
75VNET_DECLARE(int, tcp_autosndbuf_inc);
76#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
77VNET_DECLARE(int, tcp_autosndbuf_max);
78#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
79VNET_DECLARE(int, tcp_do_autorcvbuf);
80#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
81VNET_DECLARE(int, tcp_autorcvbuf_inc);
82#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
83VNET_DECLARE(int, tcp_autorcvbuf_max);
84#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
85
86static void	t4_aiotx_cancel(struct kaiocb *job);
87static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
88
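/*
 * An AIO tx mbuf carries its backing aiotx_buffer in ext_arg1 and this
 * mbuf's byte offset into that buffer in ext_arg2.  The two helpers below
 * turn that pair into a starting page and an offset within that page.
 */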
89static size_t
90aiotx_mbuf_pgoff(struct mbuf *m)
91{
92	struct aiotx_buffer *ab;
93
94	MPASS(IS_AIOTX_MBUF(m));
95	ab = m->m_ext.ext_arg1;
96	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
97}
98
99static vm_page_t *
100aiotx_mbuf_pages(struct mbuf *m)
101{
102	struct aiotx_buffer *ab;
103	int npages;
104
105	MPASS(IS_AIOTX_MBUF(m));
106	ab = m->m_ext.ext_arg1;
107	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
108	return (ab->ps.pages + npages);
109}
110
111void
112send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
113{
114	struct wrqe *wr;
115	struct fw_flowc_wr *flowc;
116	unsigned int nparams, flowclen, paramidx;
117	struct vi_info *vi = toep->vi;
118	struct port_info *pi = vi->pi;
119	struct adapter *sc = pi->adapter;
120	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
121	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
122
123	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
124	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
125
126	if (ftxp != NULL)
127		nparams = 8;
128	else
129		nparams = 6;
130	if (toep->ulp_mode == ULP_MODE_TLS)
131		nparams++;
132	if (toep->tls.fcplenmax != 0)
133		nparams++;
134	if (toep->tc_idx != -1) {
135		MPASS(toep->tc_idx >= 0 &&
136		    toep->tc_idx < sc->chip_params->nsched_cls);
137		nparams++;
138	}
139
140	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
141
142	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
143	if (wr == NULL) {
144		/* XXX */
145		panic("%s: allocation failure.", __func__);
146	}
147	flowc = wrtod(wr);
148	memset(flowc, 0, wr->wr_len);
149
150	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
151	    V_FW_FLOWC_WR_NPARAMS(nparams));
152	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
153	    V_FW_WR_FLOWID(toep->tid));
154
155#define FLOWC_PARAM(__m, __v) \
156	do { \
157		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
158		flowc->mnemval[paramidx].val = htobe32(__v); \
159		paramidx++; \
160	} while (0)
161
162	paramidx = 0;
163
164	FLOWC_PARAM(PFNVFN, pfvf);
165	FLOWC_PARAM(CH, pi->tx_chan);
166	FLOWC_PARAM(PORT, pi->tx_chan);
167	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
168	if (ftxp) {
169		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
170
171		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
172		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
173		FLOWC_PARAM(SNDBUF, sndbuf);
174		FLOWC_PARAM(MSS, ftxp->mss);
175
176		CTR6(KTR_CXGBE,
177		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
178		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
179		    ftxp->rcv_nxt);
180	} else {
181		FLOWC_PARAM(SNDBUF, 512);
182		FLOWC_PARAM(MSS, 512);
183
184		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
185	}
186	if (toep->ulp_mode == ULP_MODE_TLS)
187		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
188	if (toep->tls.fcplenmax != 0)
189		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
190	if (toep->tc_idx != -1)
191		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
192#undef FLOWC_PARAM
193
194	KASSERT(paramidx == nparams, ("nparams mismatch"));
195
196	txsd->tx_credits = howmany(flowclen, 16);
197	txsd->plen = 0;
198	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
199	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
200	toep->tx_credits -= txsd->tx_credits;
201	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
202		toep->txsd_pidx = 0;
203	toep->txsd_avail--;
204
205	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
207}
208
209#ifdef RATELIMIT
210/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
212 */
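/*
 * For example, an so_max_pacing_rate of 125,000,000 bytes/sec is programmed
 * as 125000000 * 8 / 1000 = 1,000,000 kbps.
 */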
213static int
214update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
215{
216	int tc_idx, rc;
	const u_int kbps = (u_int) (((uint64_t)Bps * 8ULL) / 1000);
218	const int port_id = toep->vi->pi->port_id;
219
220	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
221
222	if (kbps == 0) {
223		/* unbind */
224		tc_idx = -1;
225	} else {
226		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
227		if (rc != 0)
228			return (rc);
229		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
230	}
231
232	if (toep->tc_idx != tc_idx) {
233		struct wrqe *wr;
234		struct fw_flowc_wr *flowc;
235		int nparams = 1, flowclen, flowclen16;
236		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
237
238		flowclen = sizeof(*flowc) + nparams * sizeof(struct
239		    fw_flowc_mnemval);
240		flowclen16 = howmany(flowclen, 16);
241		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
242		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
243			if (tc_idx >= 0)
244				t4_release_cl_rl(sc, port_id, tc_idx);
245			return (ENOMEM);
246		}
247
248		flowc = wrtod(wr);
249		memset(flowc, 0, wr->wr_len);
250
251		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
252		    V_FW_FLOWC_WR_NPARAMS(nparams));
253		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
254		    V_FW_WR_FLOWID(toep->tid));
255
256		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
257		if (tc_idx == -1)
258			flowc->mnemval[0].val = htobe32(0xff);
259		else
260			flowc->mnemval[0].val = htobe32(tc_idx);
261
262		txsd->tx_credits = flowclen16;
263		txsd->plen = 0;
264		toep->tx_credits -= txsd->tx_credits;
265		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
266			toep->txsd_pidx = 0;
267		toep->txsd_avail--;
268		t4_wrq_tx(sc, wr);
269	}
270
271	if (toep->tc_idx >= 0)
272		t4_release_cl_rl(sc, port_id, toep->tc_idx);
273	toep->tc_idx = tc_idx;
274
275	return (0);
276}
277#endif
278
279void
280send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
281{
282	struct wrqe *wr;
283	struct cpl_abort_req *req;
284	int tid = toep->tid;
285	struct inpcb *inp = toep->inp;
286	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
287
288	INP_WLOCK_ASSERT(inp);
289
290	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
291	    __func__, toep->tid,
292	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
293	    tcpstates[tp->t_state],
294	    toep->flags, inp->inp_flags,
295	    toep->flags & TPF_ABORT_SHUTDOWN ?
296	    " (abort already in progress)" : "");
297
298	if (toep->flags & TPF_ABORT_SHUTDOWN)
299		return;	/* abort already in progress */
300
301	toep->flags |= TPF_ABORT_SHUTDOWN;
302
303	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
304	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
305
306	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
307	if (wr == NULL) {
308		/* XXX */
309		panic("%s: allocation failure.", __func__);
310	}
311	req = wrtod(wr);
312
313	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
314	if (inp->inp_flags & INP_DROPPED)
315		req->rsvd0 = htobe32(snd_nxt);
316	else
317		req->rsvd0 = htobe32(tp->snd_nxt);
318	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
319	req->cmd = CPL_ABORT_SEND_RST;
320
321	/*
322	 * XXX: What's the correct way to tell that the inp hasn't been detached
323	 * from its socket?  Should I even be flushing the snd buffer here?
324	 */
325	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
326		struct socket *so = inp->inp_socket;
327
328		if (so != NULL)	/* because I'm not sure.  See comment above */
329			sbflush(&so->so_snd);
330	}
331
332	t4_l2t_send(sc, wr, toep->l2te);
333}
334
335/*
336 * Called when a connection is established to translate the TCP options
337 * reported by HW to FreeBSD's native format.
338 */
339static void
340assign_rxopt(struct tcpcb *tp, uint16_t opt)
341{
342	struct toepcb *toep = tp->t_toe;
343	struct inpcb *inp = tp->t_inpcb;
344	struct adapter *sc = td_adapter(toep->td);
345
346	INP_LOCK_ASSERT(inp);
347
348	toep->tcp_opt = opt;
349	toep->mtu_idx = G_TCPOPT_MSS(opt);
350	tp->t_maxseg = sc->params.mtus[toep->mtu_idx];
351	if (inp->inp_inc.inc_flags & INC_ISIPV6)
352		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
353	else
354		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
355
356	toep->emss = tp->t_maxseg;
357	if (G_TCPOPT_TSTAMP(opt)) {
358		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
359		tp->ts_recent = 0;		/* hmmm */
360		tp->ts_recent_age = tcp_ts_getticks();
361		toep->emss -= TCPOLEN_TSTAMP_APPA;
362	}
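	/*
	 * For example, a 1500 byte entry in the MTU table gives a t_maxseg of
	 * 1460 (IPv4) or 1440 (IPv6), and timestamps shave another 12 bytes
	 * (TCPOLEN_TSTAMP_APPA) off emss.
	 */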
363
364	CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
365	    __func__, toep->tid, toep->mtu_idx,
366	    sc->params.mtus[G_TCPOPT_MSS(opt)], tp->t_maxseg, toep->emss);
367
368	if (G_TCPOPT_SACK(opt))
369		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
370	else
371		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
372
373	if (G_TCPOPT_WSCALE_OK(opt))
374		tp->t_flags |= TF_RCVD_SCALE;
375
376	/* Doing window scaling? */
377	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
378	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
379		tp->rcv_scale = tp->request_r_scale;
380		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
381	}
382}
383
384/*
385 * Completes some final bits of initialization for just established connections
386 * and changes their state to TCPS_ESTABLISHED.
387 *
388 * The ISNs are from the exchange of SYNs.
389 */
390void
391make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
392{
393	struct inpcb *inp = toep->inp;
394	struct socket *so = inp->inp_socket;
395	struct tcpcb *tp = intotcpcb(inp);
396	long bufsize;
397	uint16_t tcpopt = be16toh(opt);
398	struct flowc_tx_params ftxp;
399
400	INP_WLOCK_ASSERT(inp);
401	KASSERT(tp->t_state == TCPS_SYN_SENT ||
402	    tp->t_state == TCPS_SYN_RECEIVED,
403	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
404
405	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
406	    __func__, toep->tid, so, inp, tp, toep);
407
408	tcp_state_change(tp, TCPS_ESTABLISHED);
409	tp->t_starttime = ticks;
410	TCPSTAT_INC(tcps_connects);
411
412	tp->irs = irs;
413	tcp_rcvseqinit(tp);
414	tp->rcv_wnd = (u_int)toep->opt0_rcv_bufsize << 10;
415	tp->rcv_adv += tp->rcv_wnd;
416	tp->last_ack_sent = tp->rcv_nxt;
417
418	tp->iss = iss;
419	tcp_sendseqinit(tp);
420	tp->snd_una = iss + 1;
421	tp->snd_nxt = iss + 1;
422	tp->snd_max = iss + 1;
423
424	assign_rxopt(tp, tcpopt);
425
426	SOCKBUF_LOCK(&so->so_snd);
427	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
428		bufsize = V_tcp_autosndbuf_max;
429	else
430		bufsize = sbspace(&so->so_snd);
431	SOCKBUF_UNLOCK(&so->so_snd);
432
433	ftxp.snd_nxt = tp->snd_nxt;
434	ftxp.rcv_nxt = tp->rcv_nxt;
435	ftxp.snd_space = bufsize;
436	ftxp.mss = toep->emss;
437	send_flowc_wr(toep, &ftxp);
438
439	soisconnected(so);
440}
441
442int
443send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
444{
445	struct wrqe *wr;
446	struct cpl_rx_data_ack *req;
447	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
448
449	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
450
451	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
452	if (wr == NULL)
453		return (0);
454	req = wrtod(wr);
455
456	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
457	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
458
459	t4_wrq_tx(sc, wr);
460	return (credits);
461}
462
463void
464send_rx_modulate(struct adapter *sc, struct toepcb *toep)
465{
466	struct wrqe *wr;
467	struct cpl_rx_data_ack *req;
468
469	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
470	if (wr == NULL)
471		return;
472	req = wrtod(wr);
473
474	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
475	req->credit_dack = htobe32(F_RX_MODULATE_RX);
476
477	t4_wrq_tx(sc, wr);
478}
479
480void
481t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
482{
483	struct adapter *sc = tod->tod_softc;
484	struct inpcb *inp = tp->t_inpcb;
485	struct socket *so = inp->inp_socket;
486	struct sockbuf *sb = &so->so_rcv;
487	struct toepcb *toep = tp->t_toe;
488	int rx_credits;
489
490	INP_WLOCK_ASSERT(inp);
491	SOCKBUF_LOCK_ASSERT(sb);
492
493	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
494	if (toep->ulp_mode == ULP_MODE_TLS) {
495		if (toep->tls.rcv_over >= rx_credits) {
496			toep->tls.rcv_over -= rx_credits;
497			rx_credits = 0;
498		} else {
499			rx_credits -= toep->tls.rcv_over;
500			toep->tls.rcv_over = 0;
501		}
502	}
503
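	/*
	 * Return rx credits (i.e. reopen the advertised window) only when it
	 * is worthwhile: the window has shrunk to 32KB or less, at least 64KB
	 * can be returned, at least 16KB can be returned while the window is
	 * no larger than 128KB, or even a completely filled window would
	 * leave the socket buffer short of its low-water mark.
	 */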
504	if (rx_credits > 0 &&
505	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
506	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
507	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
508		rx_credits = send_rx_credits(sc, toep, rx_credits);
509		tp->rcv_wnd += rx_credits;
510		tp->rcv_adv += rx_credits;
511	} else if (toep->flags & TPF_FORCE_CREDITS)
512		send_rx_modulate(sc, toep);
513}
514
515void
516t4_rcvd(struct toedev *tod, struct tcpcb *tp)
517{
518	struct inpcb *inp = tp->t_inpcb;
519	struct socket *so = inp->inp_socket;
520	struct sockbuf *sb = &so->so_rcv;
521
522	SOCKBUF_LOCK(sb);
523	t4_rcvd_locked(tod, tp);
524	SOCKBUF_UNLOCK(sb);
525}
526
527/*
528 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
529 */
530int
531t4_close_conn(struct adapter *sc, struct toepcb *toep)
532{
533	struct wrqe *wr;
534	struct cpl_close_con_req *req;
535	unsigned int tid = toep->tid;
536
537	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
538	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
539
540	if (toep->flags & TPF_FIN_SENT)
541		return (0);
542
543	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
544	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
545
546	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
547	if (wr == NULL) {
548		/* XXX */
549		panic("%s: allocation failure.", __func__);
550	}
551	req = wrtod(wr);
552
	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
559	req->rsvd = 0;
560
561	toep->flags |= TPF_FIN_SENT;
562	toep->flags &= ~TPF_SEND_FIN;
563	t4_l2t_send(sc, wr, toep->l2te);
564
565	return (0);
566}
567
568#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
569#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
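/*
 * A tx credit represents 16 bytes of work request.  MIN_OFLD_TX_CREDITS is
 * the smallest WR that can carry any payload at all (the fw_ofld_tx_data_wr
 * header plus at least one immediate byte, rounded up to whole credits);
 * MAX_OFLD_TX_CREDITS corresponds to the largest WR the SGE accepts.
 */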
570
571/* Maximum amount of immediate data we could stuff in a WR */
572static inline int
573max_imm_payload(int tx_credits)
574{
575	const int n = 1;	/* Use no more than one desc for imm. data WR */
576
577	KASSERT(tx_credits >= 0 &&
578		tx_credits <= MAX_OFLD_TX_CREDITS,
579		("%s: %d credits", __func__, tx_credits));
580
581	if (tx_credits < MIN_OFLD_TX_CREDITS)
582		return (0);
583
584	if (tx_credits >= (n * EQ_ESIZE) / 16)
585		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
586	else
587		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
588}
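/*
 * With n = 1 and the usual 64-byte hardware descriptor (EQ_ESIZE) this caps
 * immediate data at 64 - 16 = 48 bytes once the fw_ofld_tx_data_wr header is
 * accounted for; a connection holding fewer than MIN_OFLD_TX_CREDITS can
 * send nothing at all.
 */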
589
590/* Maximum number of SGL entries we could stuff in a WR */
591static inline int
592max_dsgl_nsegs(int tx_credits)
593{
594	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
595	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
596
597	KASSERT(tx_credits >= 0 &&
598		tx_credits <= MAX_OFLD_TX_CREDITS,
599		("%s: %d credits", __func__, tx_credits));
600
601	if (tx_credits < MIN_OFLD_TX_CREDITS)
602		return (0);
603
604	nseg += 2 * (sge_pair_credits * 16 / 24);
605	if ((sge_pair_credits * 16) % 24 == 16)
606		nseg++;
607
608	return (nseg);
609}
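/*
 * The 16-byte ulptx_sgl header holds one segment, each additional pair of
 * segments takes a 24-byte ulptx_sge_pair, and a 16-byte remainder is enough
 * for one more odd segment.  At the 32-credit maximum (a full 512B work
 * request) that works out to 1 + 2 * (30 * 16 / 24) = 41 segments, which is
 * where OFLD_SGL_LEN below comes from.
 */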
610
611static inline void
612write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
613    unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
614{
615	struct fw_ofld_tx_data_wr *txwr = dst;
616
617	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
618	    V_FW_WR_IMMDLEN(immdlen));
619	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
620	    V_FW_WR_LEN16(credits));
621	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
622	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
623	txwr->plen = htobe32(plen);
624
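	/*
	 * tx_align policy: payloads shorter than two full segments are sent
	 * as-is with LSO disabled, while longer ones ask the chip to align
	 * the payload on segment boundaries (ALIGNPLDSHOVE is set as well
	 * unless TCP_NODELAY is on).
	 */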
625	if (txalign > 0) {
626		struct tcpcb *tp = intotcpcb(toep->inp);
627
628		if (plen < 2 * toep->emss)
629			txwr->lsodisable_to_flags |=
630			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
631		else
632			txwr->lsodisable_to_flags |=
633			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
634				(tp->t_flags & TF_NODELAY ? 0 :
635				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
636	}
637}
638
639/*
640 * Generate a DSGL from a starting mbuf.  The total number of segments and the
641 * maximum segments in any one mbuf are provided.
642 */
643static void
644write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
645{
646	struct mbuf *m;
647	struct ulptx_sgl *usgl = dst;
648	int i, j, rc;
649	struct sglist sg;
650	struct sglist_seg segs[n];
651
652	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
653
654	sglist_init(&sg, n, segs);
655	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
656	    V_ULPTX_NSGE(nsegs));
657
658	i = -1;
659	for (m = start; m != stop; m = m->m_next) {
660		if (IS_AIOTX_MBUF(m))
661			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
662			    aiotx_mbuf_pgoff(m), m->m_len);
663		else
664			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
665		if (__predict_false(rc != 0))
666			panic("%s: sglist_append %d", __func__, rc);
667
668		for (j = 0; j < sg.sg_nseg; i++, j++) {
669			if (i < 0) {
670				usgl->len0 = htobe32(segs[j].ss_len);
671				usgl->addr0 = htobe64(segs[j].ss_paddr);
672			} else {
673				usgl->sge[i / 2].len[i & 1] =
674				    htobe32(segs[j].ss_len);
675				usgl->sge[i / 2].addr[i & 1] =
676				    htobe64(segs[j].ss_paddr);
677			}
678#ifdef INVARIANTS
679			nsegs--;
680#endif
681		}
682		sglist_reset(&sg);
683	}
684	if (i & 1)
685		usgl->sge[i / 2].len[1] = htobe32(0);
686	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
687	    __func__, nsegs, start, stop));
688}
689
690/*
691 * Max number of SGL entries an offload tx work request can have.  This is 41
692 * (1 + 40) for a full 512B work request.
693 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
694 */
695#define OFLD_SGL_LEN (41)
696
697/*
698 * Send data and/or a FIN to the peer.
699 *
700 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
701 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
702 * was transmitted.
703 *
704 * drop indicates the number of bytes that should be dropped from the head of
705 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
706 * contention on the send buffer lock (before this change it used to do
707 * sowwakeup and then t4_push_frames right after that when recovering from tx
708 * stalls).  When drop is set this function MUST drop the bytes and wake up any
709 * writers.
710 */
711void
712t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
713{
714	struct mbuf *sndptr, *m, *sb_sndptr;
715	struct fw_ofld_tx_data_wr *txwr;
716	struct wrqe *wr;
717	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
718	struct inpcb *inp = toep->inp;
719	struct tcpcb *tp = intotcpcb(inp);
720	struct socket *so = inp->inp_socket;
721	struct sockbuf *sb = &so->so_snd;
722	int tx_credits, shove, compl, sowwakeup;
723	struct ofld_tx_sdesc *txsd;
724	bool aiotx_mbuf_seen;
725
726	INP_WLOCK_ASSERT(inp);
727	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
728	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
729
730	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
731	    toep->ulp_mode == ULP_MODE_TCPDDP ||
732	    toep->ulp_mode == ULP_MODE_TLS ||
733	    toep->ulp_mode == ULP_MODE_RDMA,
734	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
735
736#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
739#endif
740	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
741		return;
742
743#ifdef RATELIMIT
744	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
745	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
746		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
747	}
748#endif
749
750	/*
751	 * This function doesn't resume by itself.  Someone else must clear the
752	 * flag and call this function.
753	 */
754	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
755		KASSERT(drop == 0,
756		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
757		return;
758	}
759
760	txsd = &toep->txsd[toep->txsd_pidx];
761	do {
762		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
763		max_imm = max_imm_payload(tx_credits);
764		max_nsegs = max_dsgl_nsegs(tx_credits);
765
766		SOCKBUF_LOCK(sb);
767		sowwakeup = drop;
768		if (drop) {
769			sbdrop_locked(sb, drop);
770			drop = 0;
771		}
772		sb_sndptr = sb->sb_sndptr;
773		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
774		plen = 0;
775		nsegs = 0;
776		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
777		aiotx_mbuf_seen = false;
778		for (m = sndptr; m != NULL; m = m->m_next) {
779			int n;
780
781			if (IS_AIOTX_MBUF(m))
782				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
783				    aiotx_mbuf_pgoff(m), m->m_len);
784			else
785				n = sglist_count(mtod(m, void *), m->m_len);
786
787			nsegs += n;
788			plen += m->m_len;
789
790			/* This mbuf sent us _over_ the nsegs limit, back out */
791			if (plen > max_imm && nsegs > max_nsegs) {
792				nsegs -= n;
793				plen -= m->m_len;
794				if (plen == 0) {
795					/* Too few credits */
796					toep->flags |= TPF_TX_SUSPENDED;
797					if (sowwakeup) {
798						if (!TAILQ_EMPTY(
799						    &toep->aiotx_jobq))
800							t4_aiotx_queue_toep(so,
801							    toep);
802						sowwakeup_locked(so);
803					} else
804						SOCKBUF_UNLOCK(sb);
805					SOCKBUF_UNLOCK_ASSERT(sb);
806					return;
807				}
808				break;
809			}
810
811			if (IS_AIOTX_MBUF(m))
812				aiotx_mbuf_seen = true;
813			if (max_nsegs_1mbuf < n)
814				max_nsegs_1mbuf = n;
815			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
816
817			/* This mbuf put us right at the max_nsegs limit */
818			if (plen > max_imm && nsegs == max_nsegs) {
819				m = m->m_next;
820				break;
821			}
822		}
823
824		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
825		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
826			compl = 1;
827		else
828			compl = 0;
829
830		if (sb->sb_flags & SB_AUTOSIZE &&
831		    V_tcp_do_autosndbuf &&
832		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
833		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
834			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
835			    V_tcp_autosndbuf_max);
836
837			if (!sbreserve_locked(sb, newsize, so, NULL))
838				sb->sb_flags &= ~SB_AUTOSIZE;
839			else
840				sowwakeup = 1;	/* room available */
841		}
842		if (sowwakeup) {
843			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
844				t4_aiotx_queue_toep(so, toep);
845			sowwakeup_locked(so);
846		} else
847			SOCKBUF_UNLOCK(sb);
848		SOCKBUF_UNLOCK_ASSERT(sb);
849
850		/* nothing to send */
851		if (plen == 0) {
852			KASSERT(m == NULL,
853			    ("%s: nothing to send, but m != NULL", __func__));
854			break;
855		}
856
857		if (__predict_false(toep->flags & TPF_FIN_SENT))
858			panic("%s: excess tx.", __func__);
859
860		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
861		if (plen <= max_imm && !aiotx_mbuf_seen) {
862
863			/* Immediate data tx */
864
865			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
866					toep->ofld_txq);
867			if (wr == NULL) {
868				/* XXX: how will we recover from this? */
869				toep->flags |= TPF_TX_SUSPENDED;
870				return;
871			}
872			txwr = wrtod(wr);
873			credits = howmany(wr->wr_len, 16);
874			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
875			    sc->tt.tx_align);
876			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
877			nsegs = 0;
878		} else {
879			int wr_len;
880
881			/* DSGL tx */
882
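			/*
			 * 16B WR header + 16B ulptx_sgl covering the first
			 * segment, then 24B for every further pair of
			 * segments; an odd leftover segment rounds up to 16B.
			 */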
883			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
884			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
885			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
886			if (wr == NULL) {
887				/* XXX: how will we recover from this? */
888				toep->flags |= TPF_TX_SUSPENDED;
889				return;
890			}
891			txwr = wrtod(wr);
892			credits = howmany(wr_len, 16);
893			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
894			    sc->tt.tx_align);
895			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
896			    max_nsegs_1mbuf);
897			if (wr_len & 0xf) {
898				uint64_t *pad = (uint64_t *)
899				    ((uintptr_t)txwr + wr_len);
900				*pad = 0;
901			}
902		}
903
904		KASSERT(toep->tx_credits >= credits,
905			("%s: not enough credits", __func__));
906
907		toep->tx_credits -= credits;
908		toep->tx_nocompl += credits;
909		toep->plen_nocompl += plen;
910		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
911		    toep->tx_nocompl >= toep->tx_total / 4)
912			compl = 1;
913
914		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
915			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
916			toep->tx_nocompl = 0;
917			toep->plen_nocompl = 0;
918		}
919
920		tp->snd_nxt += plen;
921		tp->snd_max += plen;
922
923		SOCKBUF_LOCK(sb);
924		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
925		sb->sb_sndptr = sb_sndptr;
926		SOCKBUF_UNLOCK(sb);
927
928		toep->flags |= TPF_TX_DATA_SENT;
929		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
930			toep->flags |= TPF_TX_SUSPENDED;
931
932		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
933		txsd->plen = plen;
934		txsd->tx_credits = credits;
935		txsd++;
936		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
937			toep->txsd_pidx = 0;
938			txsd = &toep->txsd[0];
939		}
940		toep->txsd_avail--;
941
942		t4_l2t_send(sc, wr, toep->l2te);
943	} while (m != NULL);
944
945	/* Send a FIN if requested, but only if there's no more data to send */
946	if (m == NULL && toep->flags & TPF_SEND_FIN)
947		t4_close_conn(sc, toep);
948}
949
950static inline void
951rqdrop_locked(struct mbufq *q, int plen)
952{
953	struct mbuf *m;
954
955	while (plen > 0) {
956		m = mbufq_dequeue(q);
957
958		/* Too many credits. */
959		MPASS(m != NULL);
960		M_ASSERTPKTHDR(m);
961
962		/* Partial credits. */
963		MPASS(plen >= m->m_pkthdr.len);
964
965		plen -= m->m_pkthdr.len;
966		m_freem(m);
967	}
968}
969
970void
971t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
972{
973	struct mbuf *sndptr, *m;
974	struct fw_ofld_tx_data_wr *txwr;
975	struct wrqe *wr;
976	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
977	u_int adjusted_plen, ulp_submode;
978	struct inpcb *inp = toep->inp;
979	struct tcpcb *tp = intotcpcb(inp);
980	int tx_credits, shove;
981	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
982	struct mbufq *pduq = &toep->ulp_pduq;
983	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
984
985	INP_WLOCK_ASSERT(inp);
986	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
987	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
988	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
989	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
990
991	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
992		return;
993
994	/*
995	 * This function doesn't resume by itself.  Someone else must clear the
996	 * flag and call this function.
997	 */
998	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
999		KASSERT(drop == 0,
1000		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
1001		return;
1002	}
1003
1004	if (drop)
1005		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
1006
1007	while ((sndptr = mbufq_first(pduq)) != NULL) {
1008		M_ASSERTPKTHDR(sndptr);
1009
1010		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
1011		max_imm = max_imm_payload(tx_credits);
1012		max_nsegs = max_dsgl_nsegs(tx_credits);
1013
1014		plen = 0;
1015		nsegs = 0;
1016		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
1017		for (m = sndptr; m != NULL; m = m->m_next) {
1018			int n = sglist_count(mtod(m, void *), m->m_len);
1019
1020			nsegs += n;
1021			plen += m->m_len;
1022
1023			/*
1024			 * This mbuf would send us _over_ the nsegs limit.
1025			 * Suspend tx because the PDU can't be sent out.
1026			 */
1027			if (plen > max_imm && nsegs > max_nsegs) {
1028				toep->flags |= TPF_TX_SUSPENDED;
1029				return;
1030			}
1031
1032			if (max_nsegs_1mbuf < n)
1033				max_nsegs_1mbuf = n;
1034		}
1035
1036		if (__predict_false(toep->flags & TPF_FIN_SENT))
1037			panic("%s: excess tx.", __func__);
1038
1039		/*
1040		 * We have a PDU to send.  All of it goes out in one WR so 'm'
1041		 * is NULL.  A PDU's length is always a multiple of 4.
1042		 */
1043		MPASS(m == NULL);
1044		MPASS((plen & 3) == 0);
1045		MPASS(sndptr->m_pkthdr.len == plen);
1046
1047		shove = !(tp->t_flags & TF_MORETOCOME);
1048		ulp_submode = mbuf_ulp_submode(sndptr);
1049		MPASS(ulp_submode < nitems(ulp_extra_len));
1050
1051		/*
1052		 * plen doesn't include header and data digests, which are
1053		 * generated and inserted in the right places by the TOE, but
1054		 * they do occupy TCP sequence space and need to be accounted
1055		 * for.
1056		 */
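		/*
		 * ulp_extra_len[] above supplies those extra bytes: 0, 4, or
		 * 8 depending on whether neither, one, or both of the 4-byte
		 * digests are enabled for this PDU.
		 */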
1057		adjusted_plen = plen + ulp_extra_len[ulp_submode];
1058		if (plen <= max_imm) {
1059
1060			/* Immediate data tx */
1061
1062			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
1063					toep->ofld_txq);
1064			if (wr == NULL) {
1065				/* XXX: how will we recover from this? */
1066				toep->flags |= TPF_TX_SUSPENDED;
1067				return;
1068			}
1069			txwr = wrtod(wr);
1070			credits = howmany(wr->wr_len, 16);
1071			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
1072			    shove, ulp_submode, sc->tt.tx_align);
1073			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
1074			nsegs = 0;
1075		} else {
1076			int wr_len;
1077
1078			/* DSGL tx */
1079			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
1080			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
1081			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
1082			if (wr == NULL) {
1083				/* XXX: how will we recover from this? */
1084				toep->flags |= TPF_TX_SUSPENDED;
1085				return;
1086			}
1087			txwr = wrtod(wr);
1088			credits = howmany(wr_len, 16);
1089			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
1090			    shove, ulp_submode, sc->tt.tx_align);
1091			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
1092			    max_nsegs_1mbuf);
1093			if (wr_len & 0xf) {
1094				uint64_t *pad = (uint64_t *)
1095				    ((uintptr_t)txwr + wr_len);
1096				*pad = 0;
1097			}
1098		}
1099
1100		KASSERT(toep->tx_credits >= credits,
1101			("%s: not enough credits", __func__));
1102
1103		m = mbufq_dequeue(pduq);
1104		MPASS(m == sndptr);
1105		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
1106
1107		toep->tx_credits -= credits;
1108		toep->tx_nocompl += credits;
1109		toep->plen_nocompl += plen;
1110		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
1111		    toep->tx_nocompl >= toep->tx_total / 4) {
1112			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
1113			toep->tx_nocompl = 0;
1114			toep->plen_nocompl = 0;
1115		}
1116
1117		tp->snd_nxt += adjusted_plen;
1118		tp->snd_max += adjusted_plen;
1119
1120		toep->flags |= TPF_TX_DATA_SENT;
1121		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
1122			toep->flags |= TPF_TX_SUSPENDED;
1123
1124		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
1125		txsd->plen = plen;
1126		txsd->tx_credits = credits;
1127		txsd++;
1128		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
1129			toep->txsd_pidx = 0;
1130			txsd = &toep->txsd[0];
1131		}
1132		toep->txsd_avail--;
1133
1134		t4_l2t_send(sc, wr, toep->l2te);
1135	}
1136
1137	/* Send a FIN if requested, but only if there are no more PDUs to send */
1138	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
1139		t4_close_conn(sc, toep);
1140}
1141
1142int
1143t4_tod_output(struct toedev *tod, struct tcpcb *tp)
1144{
1145	struct adapter *sc = tod->tod_softc;
1146#ifdef INVARIANTS
1147	struct inpcb *inp = tp->t_inpcb;
1148#endif
1149	struct toepcb *toep = tp->t_toe;
1150
1151	INP_WLOCK_ASSERT(inp);
1152	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1153	    ("%s: inp %p dropped.", __func__, inp));
1154	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1155
1156	if (toep->ulp_mode == ULP_MODE_ISCSI)
1157		t4_push_pdus(sc, toep, 0);
1158	else if (tls_tx_key(toep))
1159		t4_push_tls_records(sc, toep, 0);
1160	else
1161		t4_push_frames(sc, toep, 0);
1162
1163	return (0);
1164}
1165
1166int
1167t4_send_fin(struct toedev *tod, struct tcpcb *tp)
1168{
1169	struct adapter *sc = tod->tod_softc;
1170#ifdef INVARIANTS
1171	struct inpcb *inp = tp->t_inpcb;
1172#endif
1173	struct toepcb *toep = tp->t_toe;
1174
1175	INP_WLOCK_ASSERT(inp);
1176	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1177	    ("%s: inp %p dropped.", __func__, inp));
1178	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1179
1180	toep->flags |= TPF_SEND_FIN;
1181	if (tp->t_state >= TCPS_ESTABLISHED) {
1182		if (toep->ulp_mode == ULP_MODE_ISCSI)
1183			t4_push_pdus(sc, toep, 0);
1184		else if (tls_tx_key(toep))
1185			t4_push_tls_records(sc, toep, 0);
1186		else
1187			t4_push_frames(sc, toep, 0);
1188	}
1189
1190	return (0);
1191}
1192
1193int
1194t4_send_rst(struct toedev *tod, struct tcpcb *tp)
1195{
1196	struct adapter *sc = tod->tod_softc;
1197#if defined(INVARIANTS)
1198	struct inpcb *inp = tp->t_inpcb;
1199#endif
1200	struct toepcb *toep = tp->t_toe;
1201
1202	INP_WLOCK_ASSERT(inp);
1203	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1204	    ("%s: inp %p dropped.", __func__, inp));
1205	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1206
1207	/* hmmmm */
1208	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
1209	    ("%s: flowc for tid %u [%s] not sent already",
1210	    __func__, toep->tid, tcpstates[tp->t_state]));
1211
1212	send_reset(sc, toep, 0);
1213	return (0);
1214}
1215
1216/*
1217 * Peer has sent us a FIN.
1218 */
1219static int
1220do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1221{
1222	struct adapter *sc = iq->adapter;
1223	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
1224	unsigned int tid = GET_TID(cpl);
1225	struct toepcb *toep = lookup_tid(sc, tid);
1226	struct inpcb *inp = toep->inp;
1227	struct tcpcb *tp = NULL;
1228	struct socket *so;
1229#ifdef INVARIANTS
1230	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1231#endif
1232
1233	KASSERT(opcode == CPL_PEER_CLOSE,
1234	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1235	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1236
1237	if (__predict_false(toep->flags & TPF_SYNQE)) {
1238		/*
1239		 * do_pass_establish must have run before do_peer_close and if
1240		 * this is still a synqe instead of a toepcb then the connection
1241		 * must be getting aborted.
1242		 */
1243		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
1244		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1245		    toep, toep->flags);
1246		return (0);
1247	}
1248
1249	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1250
1251	CURVNET_SET(toep->vnet);
1252	INP_INFO_RLOCK(&V_tcbinfo);
1253	INP_WLOCK(inp);
1254	tp = intotcpcb(inp);
1255
1256	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
1257	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
1258
1259	if (toep->flags & TPF_ABORT_SHUTDOWN)
1260		goto done;
1261
1262	tp->rcv_nxt++;	/* FIN */
1263
1264	so = inp->inp_socket;
1265	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
1266		DDP_LOCK(toep);
1267		if (__predict_false(toep->ddp.flags &
1268		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
1269			handle_ddp_close(toep, tp, cpl->rcv_nxt);
1270		DDP_UNLOCK(toep);
1271	}
1272	socantrcvmore(so);
1273
1274	if (toep->ulp_mode != ULP_MODE_RDMA) {
1275		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
1278	}
1279
1280	switch (tp->t_state) {
1281	case TCPS_SYN_RECEIVED:
1282		tp->t_starttime = ticks;
1283		/* FALLTHROUGH */
1284
1285	case TCPS_ESTABLISHED:
1286		tcp_state_change(tp, TCPS_CLOSE_WAIT);
1287		break;
1288
1289	case TCPS_FIN_WAIT_1:
1290		tcp_state_change(tp, TCPS_CLOSING);
1291		break;
1292
1293	case TCPS_FIN_WAIT_2:
1294		tcp_twstart(tp);
1295		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
1296		INP_INFO_RUNLOCK(&V_tcbinfo);
1297		CURVNET_RESTORE();
1298
1299		INP_WLOCK(inp);
1300		final_cpl_received(toep);
1301		return (0);
1302
1303	default:
1304		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
1305		    __func__, tid, tp->t_state);
1306	}
1307done:
1308	INP_WUNLOCK(inp);
1309	INP_INFO_RUNLOCK(&V_tcbinfo);
1310	CURVNET_RESTORE();
1311	return (0);
1312}
1313
1314/*
1315 * Peer has ACK'd our FIN.
1316 */
1317static int
1318do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
1319    struct mbuf *m)
1320{
1321	struct adapter *sc = iq->adapter;
1322	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
1323	unsigned int tid = GET_TID(cpl);
1324	struct toepcb *toep = lookup_tid(sc, tid);
1325	struct inpcb *inp = toep->inp;
1326	struct tcpcb *tp = NULL;
1327	struct socket *so = NULL;
1328#ifdef INVARIANTS
1329	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1330#endif
1331
1332	KASSERT(opcode == CPL_CLOSE_CON_RPL,
1333	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1334	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1335	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1336
1337	CURVNET_SET(toep->vnet);
1338	INP_INFO_RLOCK(&V_tcbinfo);
1339	INP_WLOCK(inp);
1340	tp = intotcpcb(inp);
1341
1342	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
1343	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
1344
1345	if (toep->flags & TPF_ABORT_SHUTDOWN)
1346		goto done;
1347
1348	so = inp->inp_socket;
1349	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
1350
1351	switch (tp->t_state) {
1352	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
1353		tcp_twstart(tp);
1354release:
1355		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1356		INP_INFO_RUNLOCK(&V_tcbinfo);
1357		CURVNET_RESTORE();
1358
1359		INP_WLOCK(inp);
1360		final_cpl_received(toep);	/* no more CPLs expected */
1361
1362		return (0);
1363	case TCPS_LAST_ACK:
1364		if (tcp_close(tp))
1365			INP_WUNLOCK(inp);
1366		goto release;
1367
1368	case TCPS_FIN_WAIT_1:
1369		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1370			soisdisconnected(so);
1371		tcp_state_change(tp, TCPS_FIN_WAIT_2);
1372		break;
1373
1374	default:
1375		log(LOG_ERR,
1376		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
1377		    __func__, tid, tcpstates[tp->t_state]);
1378	}
1379done:
1380	INP_WUNLOCK(inp);
1381	INP_INFO_RUNLOCK(&V_tcbinfo);
1382	CURVNET_RESTORE();
1383	return (0);
1384}
1385
1386void
1387send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
1388    int rst_status)
1389{
1390	struct wrqe *wr;
1391	struct cpl_abort_rpl *cpl;
1392
1393	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
1394	if (wr == NULL) {
1395		/* XXX */
1396		panic("%s: allocation failure.", __func__);
1397	}
1398	cpl = wrtod(wr);
1399
1400	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
1401	cpl->cmd = rst_status;
1402
1403	t4_wrq_tx(sc, wr);
1404}
1405
1406static int
1407abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
1408{
1409	switch (abort_reason) {
1410	case CPL_ERR_BAD_SYN:
1411	case CPL_ERR_CONN_RESET:
1412		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
1413	case CPL_ERR_XMIT_TIMEDOUT:
1414	case CPL_ERR_PERSIST_TIMEDOUT:
1415	case CPL_ERR_FINWAIT2_TIMEDOUT:
1416	case CPL_ERR_KEEPALIVE_TIMEDOUT:
1417		return (ETIMEDOUT);
1418	default:
1419		return (EIO);
1420	}
1421}
1422
1423/*
1424 * TCP RST from the peer, timeout, or some other such critical error.
1425 */
1426static int
1427do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1428{
1429	struct adapter *sc = iq->adapter;
1430	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
1431	unsigned int tid = GET_TID(cpl);
1432	struct toepcb *toep = lookup_tid(sc, tid);
1433	struct sge_wrq *ofld_txq = toep->ofld_txq;
1434	struct inpcb *inp;
1435	struct tcpcb *tp;
1436#ifdef INVARIANTS
1437	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1438#endif
1439
1440	KASSERT(opcode == CPL_ABORT_REQ_RSS,
1441	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1442	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1443
1444	if (toep->flags & TPF_SYNQE)
1445		return (do_abort_req_synqe(iq, rss, m));
1446
1447	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1448
1449	if (negative_advice(cpl->status)) {
1450		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
1451		    __func__, cpl->status, tid, toep->flags);
1452		return (0);	/* Ignore negative advice */
1453	}
1454
1455	inp = toep->inp;
1456	CURVNET_SET(toep->vnet);
1457	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
1458	INP_WLOCK(inp);
1459
1460	tp = intotcpcb(inp);
1461
1462	CTR6(KTR_CXGBE,
1463	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
1464	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
1465	    inp->inp_flags, cpl->status);
1466
1467	/*
	 * If we'd initiated an abort earlier, the reply to it is responsible for
1469	 * cleaning up resources.  Otherwise we tear everything down right here
1470	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
1471	 */
1472	if (toep->flags & TPF_ABORT_SHUTDOWN) {
1473		INP_WUNLOCK(inp);
1474		goto done;
1475	}
1476	toep->flags |= TPF_ABORT_SHUTDOWN;
1477
1478	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
1479		struct socket *so = inp->inp_socket;
1480
1481		if (so != NULL)
1482			so_error_set(so, abort_status_to_errno(tp,
1483			    cpl->status));
1484		tp = tcp_close(tp);
1485		if (tp == NULL)
1486			INP_WLOCK(inp);	/* re-acquire */
1487	}
1488
1489	final_cpl_received(toep);
1490done:
1491	INP_INFO_RUNLOCK(&V_tcbinfo);
1492	CURVNET_RESTORE();
1493	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
1494	return (0);
1495}
1496
1497/*
1498 * Reply to the CPL_ABORT_REQ (send_reset)
1499 */
1500static int
1501do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1502{
1503	struct adapter *sc = iq->adapter;
1504	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
1505	unsigned int tid = GET_TID(cpl);
1506	struct toepcb *toep = lookup_tid(sc, tid);
1507	struct inpcb *inp = toep->inp;
1508#ifdef INVARIANTS
1509	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1510#endif
1511
1512	KASSERT(opcode == CPL_ABORT_RPL_RSS,
1513	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1514	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1515
1516	if (toep->flags & TPF_SYNQE)
1517		return (do_abort_rpl_synqe(iq, rss, m));
1518
1519	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1520
1521	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
1522	    __func__, tid, toep, inp, cpl->status);
1523
1524	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1525	    ("%s: wasn't expecting abort reply", __func__));
1526
1527	INP_WLOCK(inp);
1528	final_cpl_received(toep);
1529
1530	return (0);
1531}
1532
1533static int
1534do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1535{
1536	struct adapter *sc = iq->adapter;
1537	const struct cpl_rx_data *cpl = mtod(m, const void *);
1538	unsigned int tid = GET_TID(cpl);
1539	struct toepcb *toep = lookup_tid(sc, tid);
1540	struct inpcb *inp = toep->inp;
1541	struct tcpcb *tp;
1542	struct socket *so;
1543	struct sockbuf *sb;
1544	int len, rx_credits;
1545	uint32_t ddp_placed = 0;
1546
1547	if (__predict_false(toep->flags & TPF_SYNQE)) {
1548		/*
1549		 * do_pass_establish must have run before do_rx_data and if this
1550		 * is still a synqe instead of a toepcb then the connection must
1551		 * be getting aborted.
1552		 */
1553		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
1554		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1555		    toep, toep->flags);
1556		m_freem(m);
1557		return (0);
1558	}
1559
1560	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1561
1562	/* strip off CPL header */
1563	m_adj(m, sizeof(*cpl));
1564	len = m->m_pkthdr.len;
1565
1566	INP_WLOCK(inp);
1567	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
1568		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
1569		    __func__, tid, len, inp->inp_flags);
1570		INP_WUNLOCK(inp);
1571		m_freem(m);
1572		return (0);
1573	}
1574
1575	tp = intotcpcb(inp);
1576
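	/*
	 * If the CPL's sequence number is ahead of rcv_nxt, the gap is data
	 * the chip already placed directly into host buffers via DDP; keep
	 * track of it so it can be accounted for if DDP is switched off.
	 */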
1577	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
1578		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
1579
1580	tp->rcv_nxt += len;
1581	if (tp->rcv_wnd < len) {
1582		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
1583				("%s: negative window size", __func__));
1584	}
1585
1586	tp->rcv_wnd -= len;
1587	tp->t_rcvtime = ticks;
1588
1589	if (toep->ulp_mode == ULP_MODE_TCPDDP)
1590		DDP_LOCK(toep);
1591	so = inp_inpcbtosocket(inp);
1592	sb = &so->so_rcv;
1593	SOCKBUF_LOCK(sb);
1594
1595	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
1596		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
1597		    __func__, tid, len);
1598		m_freem(m);
1599		SOCKBUF_UNLOCK(sb);
1600		if (toep->ulp_mode == ULP_MODE_TCPDDP)
1601			DDP_UNLOCK(toep);
1602		INP_WUNLOCK(inp);
1603
1604		CURVNET_SET(toep->vnet);
1605		INP_INFO_RLOCK(&V_tcbinfo);
1606		INP_WLOCK(inp);
1607		tp = tcp_drop(tp, ECONNRESET);
1608		if (tp)
1609			INP_WUNLOCK(inp);
1610		INP_INFO_RUNLOCK(&V_tcbinfo);
1611		CURVNET_RESTORE();
1612
1613		return (0);
1614	}
1615
1616	/* receive buffer autosize */
1617	MPASS(toep->vnet == so->so_vnet);
1618	CURVNET_SET(toep->vnet);
1619	if (sb->sb_flags & SB_AUTOSIZE &&
1620	    V_tcp_do_autorcvbuf &&
1621	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
1622	    len > (sbspace(sb) / 8 * 7)) {
1623		unsigned int hiwat = sb->sb_hiwat;
1624		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
1625		    V_tcp_autorcvbuf_max);
1626
1627		if (!sbreserve_locked(sb, newsize, so, NULL))
1628			sb->sb_flags &= ~SB_AUTOSIZE;
1629	}
1630
1631	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
1632		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
1633
1634		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
1635			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
1636			    __func__, tid, len);
1637
1638		if (changed) {
1639			if (toep->ddp.flags & DDP_SC_REQ)
1640				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
1641			else {
1642				KASSERT(cpl->ddp_off == 1,
1643				    ("%s: DDP switched on by itself.",
1644				    __func__));
1645
1646				/* Fell out of DDP mode */
1647				toep->ddp.flags &= ~DDP_ON;
1648				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
1649				    __func__);
1650
1651				insert_ddp_data(toep, ddp_placed);
1652			}
1653		}
1654
1655		if (toep->ddp.flags & DDP_ON) {
1656			/*
1657			 * CPL_RX_DATA with DDP on can only be an indicate.
1658			 * Start posting queued AIO requests via DDP.  The
1659			 * payload that arrived in this indicate is appended
1660			 * to the socket buffer as usual.
1661			 */
1662			handle_ddp_indicate(toep);
1663		}
1664	}
1665
1666	sbappendstream_locked(sb, m, 0);
1667	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
1668	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
1669		rx_credits = send_rx_credits(sc, toep, rx_credits);
1670		tp->rcv_wnd += rx_credits;
1671		tp->rcv_adv += rx_credits;
1672	}
1673
1674	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
1675	    sbavail(sb) != 0) {
1676		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
1677		    tid);
1678		ddp_queue_toep(toep);
1679	}
1680	sorwakeup_locked(so);
1681	SOCKBUF_UNLOCK_ASSERT(sb);
1682	if (toep->ulp_mode == ULP_MODE_TCPDDP)
1683		DDP_UNLOCK(toep);
1684
1685	INP_WUNLOCK(inp);
1686	CURVNET_RESTORE();
1687	return (0);
1688}
1689
1690static int
1691do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1692{
1693	struct adapter *sc = iq->adapter;
1694	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
1695	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
1696	struct toepcb *toep = lookup_tid(sc, tid);
1697	struct inpcb *inp;
1698	struct tcpcb *tp;
1699	struct socket *so;
1700	uint8_t credits = cpl->credits;
1701	struct ofld_tx_sdesc *txsd;
1702	int plen;
1703#ifdef INVARIANTS
1704	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
1705#endif
1706
1707	/*
1708	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
1709	 * now this comes back carrying the credits for the flowc.
1710	 */
1711	if (__predict_false(toep->flags & TPF_SYNQE)) {
1712		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1713		    ("%s: credits for a synq entry %p", __func__, toep));
1714		return (0);
1715	}
1716
1717	inp = toep->inp;
1718
1719	KASSERT(opcode == CPL_FW4_ACK,
1720	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1721	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1722	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1723
1724	INP_WLOCK(inp);
1725
1726	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
1727		INP_WUNLOCK(inp);
1728		return (0);
1729	}
1730
1731	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
1732	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
1733
1734	tp = intotcpcb(inp);
1735
1736	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
1737		tcp_seq snd_una = be32toh(cpl->snd_una);
1738
1739#ifdef INVARIANTS
1740		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
1741			log(LOG_ERR,
1742			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
1743			    __func__, snd_una, toep->tid, tp->snd_una);
1744		}
1745#endif
1746
1747		if (tp->snd_una != snd_una) {
1748			tp->snd_una = snd_una;
1749			tp->ts_recent_age = tcp_ts_getticks();
1750		}
1751	}
1752
1753#ifdef VERBOSE_TRACES
1754	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
1755#endif
1756	so = inp->inp_socket;
1757	txsd = &toep->txsd[toep->txsd_cidx];
1758	plen = 0;
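	/*
	 * Walk the tx descriptor ring from the consumer index, retiring one
	 * work request per chunk of returned credits and adding up the
	 * payload bytes (plen) that those work requests covered.
	 */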
1759	while (credits) {
1760		KASSERT(credits >= txsd->tx_credits,
1761		    ("%s: too many (or partial) credits", __func__));
1762		credits -= txsd->tx_credits;
1763		toep->tx_credits += txsd->tx_credits;
1764		plen += txsd->plen;
1765		if (txsd->iv_buffer) {
1766			free(txsd->iv_buffer, M_CXGBE);
1767			txsd->iv_buffer = NULL;
1768		}
1769		txsd++;
1770		toep->txsd_avail++;
1771		KASSERT(toep->txsd_avail <= toep->txsd_total,
1772		    ("%s: txsd avail > total", __func__));
1773		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
1774			txsd = &toep->txsd[0];
1775			toep->txsd_cidx = 0;
1776		}
1777	}
1778
1779	if (toep->tx_credits == toep->tx_total) {
1780		toep->tx_nocompl = 0;
1781		toep->plen_nocompl = 0;
1782	}
1783
1784	if (toep->flags & TPF_TX_SUSPENDED &&
1785	    toep->tx_credits >= toep->tx_total / 4) {
1786#ifdef VERBOSE_TRACES
1787		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
1788		    tid);
1789#endif
1790		toep->flags &= ~TPF_TX_SUSPENDED;
1791		CURVNET_SET(toep->vnet);
1792		if (toep->ulp_mode == ULP_MODE_ISCSI)
1793			t4_push_pdus(sc, toep, plen);
1794		else if (tls_tx_key(toep))
1795			t4_push_tls_records(sc, toep, plen);
1796		else
1797			t4_push_frames(sc, toep, plen);
1798		CURVNET_RESTORE();
1799	} else if (plen > 0) {
1800		struct sockbuf *sb = &so->so_snd;
1801		int sbu;
1802
1803		SOCKBUF_LOCK(sb);
1804		sbu = sbused(sb);
1805		if (toep->ulp_mode == ULP_MODE_ISCSI) {
1806
1807			if (__predict_false(sbu > 0)) {
1808				/*
				 * The data transmitted before the tid's ULP mode
1810				 * changed to ISCSI is still in so_snd.
1811				 * Incoming credits should account for so_snd
1812				 * first.
1813				 */
1814				sbdrop_locked(sb, min(sbu, plen));
1815				plen -= min(sbu, plen);
1816			}
1817			sowwakeup_locked(so);	/* unlocks so_snd */
1818			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
1819		} else {
1820#ifdef VERBOSE_TRACES
1821			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
1822			    tid, plen);
1823#endif
1824			sbdrop_locked(sb, plen);
1825			if (tls_tx_key(toep)) {
1826				struct tls_ofld_info *tls_ofld = &toep->tls;
1827
1828				MPASS(tls_ofld->sb_off >= plen);
1829				tls_ofld->sb_off -= plen;
1830			}
1831			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
1832				t4_aiotx_queue_toep(so, toep);
1833			sowwakeup_locked(so);	/* unlocks so_snd */
1834		}
1835		SOCKBUF_UNLOCK_ASSERT(sb);
1836	}
1837
1838	INP_WUNLOCK(inp);
1839
1840	return (0);
1841}
1842
void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

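/*
 * Register (or, on unload, unregister) the CPL handlers used by TOM for
 * connection teardown, aborts, received data, and tx credit returns.
 */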
void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend3' field in AIO jobs to store the amount of data
 * sent by the AIO job so far and the 'backend4' field to hold an
 * error that should be reported when the job is completed.
 */
#define	aio_sent	backend3
#define	aio_error	backend4

#define	jobtotid(job)							\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

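/*
 * Release a reference on an aiotx buffer.  The last reference unwires the
 * backing pages and completes (or cancels) the associated AIO job.
 */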
static void
free_aiotx_buffer(struct aiotx_buffer *ab)
{
	struct kaiocb *job;
	long status;
	int error;

	if (refcount_release(&ab->refcount) == 0)
		return;

	job = ab->job;
	error = job->aio_error;
	status = job->aio_sent;
	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
	free(ab, M_CXGBE);
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error == ECANCELED && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, status, 0);
}

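/*
 * External storage free routine for aiotx mbufs.  Each mbuf built by
 * t4_aiotx_process_job() holds a reference on the aiotx buffer; drop
 * that reference here when the mbuf is freed.
 */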
static void
t4_aiotx_mbuf_free(struct mbuf *m, void *buffer, void *arg)
{
	struct aiotx_buffer *ab = buffer;

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(ab->job));
#endif
	free_aiotx_buffer(ab);
}

/*
 * Wire the user pages backing an AIO request and attach a new aiotx
 * buffer describing them to the job.  Returns 0 on success or EFAULT
 * if the pages could not be held.
 */
static int
hold_aio(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct vmspace *vm;
	vm_map_t map;
	vm_offset_t start, end, pgoff;
	int n;

	MPASS(job->backend1 == NULL);

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf;
	pgoff = start & PAGE_MASK;
	end = round_page(start + job->uaiocb.aio_nbytes);
	start = trunc_page(start);
	n = atop(end - start);

	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
	    M_ZERO);
	refcount_init(&ab->refcount, 1);
	ab->ps.pages = (vm_page_t *)(ab + 1);
	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
	    VM_PROT_WRITE, ab->ps.pages, n);
	if (ab->ps.npages < 0) {
		free(ab, M_CXGBE);
		return (EFAULT);
	}

	KASSERT(ab->ps.npages == n,
	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));

	ab->ps.offset = pgoff;
	ab->ps.len = job->uaiocb.aio_nbytes;
	ab->job = job;
	job->backend1 = ab;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
#endif
	return (0);
}

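/*
 * Transmit as much of one queued AIO write as the socket currently allows,
 * inlining the relevant pieces of sosend_generic() and tcp_usr_send().
 * Called from the aiotx task with so_snd locked; returns with so_snd locked.
 */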
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct adapter *sc;
	struct sockbuf *sb;
	struct file *fp;
	struct aiotx_buffer *ab;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error;
	bool moretocome, sendmore;

	sc = td_adapter(toep->td);
	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	ab = job->backend1;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	if (ab == NULL) {
		error = hold_aio(job);
		if (error != 0)
			goto out;
		ab = job->backend1;
	}

	/* Inline sosend_generic(). */

	job->msgsnd = 1;

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	m = m_get(M_WAITOK, MT_DATA);

	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	m->m_len = sbspace(sb);
	if (m->m_len > ab->ps.len - job->aio_sent) {
		m->m_len = ab->ps.len - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (m->m_len > sc->tt.sndbuf) {
		m->m_len = sc->tt.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(m->m_len != 0);

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

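	/*
	 * Attach the wired user pages to the mbuf as external storage.
	 * ext_arg2 records this chunk's offset within the AIO buffer and
	 * the extra reference taken here is dropped by t4_aiotx_mbuf_free()
	 * when the mbuf is freed.
	 */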
	refcount_acquire(&ab->refcount);
	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
	job->aio_sent += m->m_len;

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop a reference on
	 * the aiotx buffer.  Any mbufs in flight should still
	 * contain a reference, but this drops the reference that the
	 * job owns while it is waiting to queue mbufs to the socket.
	 */
	free_aiotx_buffer(ab);

out:
	if (error) {
		if (ab != NULL) {
			job->aio_error = error;
			free_aiotx_buffer(ab);
		} else {
			MPASS(job->aio_sent == 0);
			aio_complete(job, -1, error);
		}
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}

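/*
 * Taskqueue handler: drain queued AIO tx jobs while the socket stays
 * writable, then drop the socket and toepcb references taken when the
 * task was scheduled.
 */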
static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}

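/*
 * Schedule the aiotx task for this connection if it is not already
 * pending.  The socket and toepcb references taken here are released
 * by t4_aiotx_task().
 */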
static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
#endif
	if (toep->aiotx_so != NULL)
		return;
	soref(so);
	toep->aiotx_so = so;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

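/*
 * AIO cancellation callback: pull the job off the queue if it is still
 * there and release its buffer, or cancel it outright if no buffer was
 * ever attached.
 */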
static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = so_sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	ab = job->backend1;
	if (ab != NULL)
		free_aiotx_buffer(ab);
	else
		aio_cancel(job);
}

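/*
 * Queue an AIO write for zero-copy transmit on an offloaded connection
 * and kick the aiotx task if the socket is already writable.
 */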
int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(so, toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

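/*
 * Initialize the per-connection AIO tx state when a toepcb is set up.
 */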
void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif