/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include <dev/iscsi/iscsi_proto.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"
static void	t4_aiotx_cancel(struct kaiocb *job);
static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	if (tp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (toep->params.tc_idx != -1) {
		MPASS(toep->params.tc_idx >= 0 &&
		    toep->params.tc_idx < sc->params.nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
	if (tp) {
		FLOWC_PARAM(MSS, toep->params.emss);
		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
	} else
		FLOWC_PARAM(MSS, 512);
	CTR6(KTR_CXGBE,
	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

	if (toep->params.tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}
#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	const u_int kbps = (u_int) (((uint64_t)Bps * 8ULL) / 1000);
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
	}

	if (toep->params.tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16),
		    &toep->ofld_txq->wrq)) == NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	if (toep->params.tc_idx >= 0)
		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
	toep->params.tc_idx = tc_idx;

	return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & INP_DROPPED) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tptoinpcb(tp);
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(inp);

	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

	toep->params.emss = tp->t_maxseg;
	if (G_TCPOPT_TSTAMP(opt)) {
		toep->params.tstamp = 1;
		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
	} else
		toep->params.tstamp = 0;

	if (G_TCPOPT_SACK(opt)) {
		toep->params.sack = 1;
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	} else {
		toep->params.sack = 0;
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
	}

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	} else
		toep->params.wscale = 0;

	CTR6(KTR_CXGBE,
	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
	    toep->tid, toep->params.mtu_idx, toep->params.emss,
	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from the exchange of SYNs.
 */
void
make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	uint16_t tcpopt = be16toh(opt);

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	send_flowc_wr(toep, tp);

	soisconnected(so);
}

int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int rx_credits;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

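	/*
	 * Return rx credits (window updates) to the chip only when it is
	 * worth a work request: the window has collapsed to 32KB or less,
	 * a large batch of credits has accumulated, or the socket buffer
	 * is about to dip below its low watermark.
	 */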
	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}
}

void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
#define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
#define MIN_TX_CREDITS(iso)						\
	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
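
/*
 * Note that tx "credits" throughout this file are 16-byte units of work
 * request space: a WR that is len bytes long costs howmany(len, 16)
 * credits, and MAX_OFLD_TX_CREDITS is simply a full-sized WR.
 */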

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits, int iso)
{
	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
	const int n = 1;	/* Use no more than one desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits, int iso)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);

	KASSERT(tx_credits >= 0 &&
		tx_credits <= MAX_OFLD_TX_CREDITS,
		("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

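	/*
	 * Each 24B ulptx_sge_pair holds two segments; 16B left over from
	 * an incomplete pair still has room for one more segment's
	 * length and address.
	 */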
	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

static inline void
write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
    int ulp_submode)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (toep->params.tx_align > 0) {
		if (plen < 2 * toep->params.emss)
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(toep->params.nagle == 0 ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

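	/*
	 * The first segment lands in the ulptx_sgl header (len0/addr0);
	 * every subsequent pair of segments shares one ulptx_sge_pair.
	 */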
	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (m->m_flags & M_EXTPG)
			rc = sglist_append_mbuf_epg(&sg, m,
			    mtod(m, vm_offset_t), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have.  This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool nomap_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_TLS ||
	    ulp_mode(toep) == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits, 0);
		max_nsegs = max_dsgl_nsegs(tx_credits, 0);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		nomap_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if ((m->m_flags & M_NOTAVAIL) != 0)
				break;
			if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
				if (m->m_epg_tls != NULL) {
					toep->flags |= TPF_KTLS;
					if (plen == 0) {
						SOCKBUF_UNLOCK(sb);
						t4_push_ktls(sc, toep, 0);
						return;
					}
					break;
				}
#endif
				n = sglist_count_mbuf_epg(m,
				    mtod(m, vm_offset_t), m->m_len);
			} else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (m->m_flags & M_EXTPG)
				nomap_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(so, SO_SND, newsize, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
			    ("%s: nothing to send, but m != NULL is ready",
			    __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !nomap_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
					&toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
			    credits, shove, 0);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
			    credits, shove, 0);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
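		/*
		 * Ask the firmware for a completion (which returns these
		 * credits via CPL_FW4_ACK) before the credit pool runs dry.
		 */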
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}

/*
 * Not a bit in the TCB, but is a bit in the ulp_submode field of the
 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
 */
#define	ULP_ISO		G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)

static void
write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
    int len, int npdu)
{
	struct cpl_tx_data_iso *cpl;
	unsigned int burst_size;
	unsigned int last;

	/*
	 * The firmware will set the 'F' bit on the last PDU when
	 * either condition is true:
	 *
	 * - this large PDU is marked as the "last" slice
	 *
	 * - the amount of data payload bytes equals the burst_size
	 *
	 * The strategy used here is to always set the burst_size
	 * artificially high (len includes the size of the template
	 * BHS) and only set the "last" flag if the original PDU had
	 * 'F' set.
	 */
	burst_size = len;
	last = !!(flags & CXGBE_ISO_F);

	cpl = (struct cpl_tx_data_iso *)dst;
	cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
	    V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) |
	    V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
	    V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
	    V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
	    V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
	    V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags)));

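	/* mpdu and burst_size are programmed in 4-byte units. */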
	cpl->ahs_len = 0;
	cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
	cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4));
	cpl->len = htonl(len);
	cpl->reserved2_seglen_offset = htonl(0);
	cpl->datasn_offset = htonl(0);
	cpl->buffer_offset = htonl(0);
	cpl->reserved3 = 0;
}

static struct wrqe *
write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
{
	struct mbuf *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct cpl_tx_data_iso *cpl_iso;
	void *p;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, imm_data, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove, npdu, wr_len;
	uint16_t iso_mss;
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
	bool iso, nomap_mbuf_seen;

	M_ASSERTPKTHDR(sndptr);

	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
	if (mbuf_raw_wr(sndptr)) {
		plen = sndptr->m_pkthdr.len;
		KASSERT(plen <= SGE_MAX_WR_LEN,
		    ("raw WR len %u is greater than max WR len", plen));
		if (plen > tx_credits * 16)
			return (NULL);

		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
		if (__predict_false(wr == NULL))
			return (NULL);

		m_copydata(sndptr, 0, plen, wrtod(wr));
		return (wr);
	}

	iso = mbuf_iscsi_iso(sndptr);
	max_imm = max_imm_payload(tx_credits, iso);
	max_nsegs = max_dsgl_nsegs(tx_credits, iso);
	iso_mss = mbuf_iscsi_iso_mss(sndptr);

	plen = 0;
	nsegs = 0;
	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
	nomap_mbuf_seen = false;
	for (m = sndptr; m != NULL; m = m->m_next) {
		int n;

		if (m->m_flags & M_EXTPG)
			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
			    m->m_len);
		else
			n = sglist_count(mtod(m, void *), m->m_len);

		nsegs += n;
		plen += m->m_len;

		/*
		 * This mbuf would send us _over_ the nsegs limit.
		 * Suspend tx because the PDU can't be sent out.
		 */
		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
			return (NULL);

		if (m->m_flags & M_EXTPG)
			nomap_mbuf_seen = true;
		if (max_nsegs_1mbuf < n)
			max_nsegs_1mbuf = n;
	}

	if (__predict_false(toep->flags & TPF_FIN_SENT))
		panic("%s: excess tx.", __func__);

	/*
	 * We have a PDU to send.  All of it goes out in one WR so 'm'
	 * is NULL.  A PDU's length is always a multiple of 4.
	 */
	MPASS(m == NULL);
	MPASS((plen & 3) == 0);
	MPASS(sndptr->m_pkthdr.len == plen);

	shove = !(tp->t_flags & TF_MORETOCOME);

	/*
	 * plen doesn't include header and data digests, which are
	 * generated and inserted in the right places by the TOE, but
	 * they do occupy TCP sequence space and need to be accounted
	 * for.
	 */
	ulp_submode = mbuf_ulp_submode(sndptr);
	MPASS(ulp_submode < nitems(ulp_extra_len));
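	/*
	 * Each PDU slice carries its own digests (ulp_extra_len bytes per
	 * PDU) and, for ISO, a replicated BHS; all of that consumes TCP
	 * sequence space even though it isn't in the mbuf payload.
	 */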
	npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
	if (iso)
		adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
	wr_len = sizeof(*txwr);
	if (iso)
		wr_len += sizeof(struct cpl_tx_data_iso);
	if (plen <= max_imm && !nomap_mbuf_seen) {
		/* Immediate data tx */
		imm_data = plen;
		wr_len += plen;
		nsegs = 0;
	} else {
		/* DSGL tx */
		imm_data = 0;
		wr_len += sizeof(struct ulptx_sgl) +
		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
	}

	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX: how will we recover from this? */
		return (NULL);
	}
	txwr = wrtod(wr);
	credits = howmany(wr->wr_len, 16);

	if (iso) {
		write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
		    imm_data + sizeof(struct cpl_tx_data_iso),
		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
		MPASS(plen == sndptr->m_pkthdr.len);
		write_tx_data_iso(cpl_iso, ulp_submode,
		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
		p = cpl_iso + 1;
	} else {
		write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
		    adjusted_plen, credits, shove, ulp_submode);
		p = txwr + 1;
	}

	if (imm_data != 0) {
		m_copydata(sndptr, 0, plen, p);
	} else {
		write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
		if (wr_len & 0xf) {
			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
			*pad = 0;
		}
	}

	KASSERT(toep->tx_credits >= credits,
	    ("%s: not enough credits: credits %u "
		"toep->tx_credits %u tx_credits %u nsegs %u "
		"max_nsegs %u iso %d", __func__, credits,
		toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));

	tp->snd_nxt += adjusted_plen;
	tp->snd_max += adjusted_plen;

	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
	if (iso)
		counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);

	return (wr);
}

void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_wr_hdr *wrhdr;
	struct wrqe *wr;
	u_int plen, credits;
	struct inpcb *inp = toep->inp;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop) {
		struct socket *so = inp->inp_socket;
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		/*
		 * An unlocked read is ok here as the data should only
		 * transition from a non-zero value to either another
		 * non-zero value or zero.  Once it is zero it should
		 * stay zero.
		 */
		if (__predict_false(sbused(sb) > 0)) {
			SOCKBUF_LOCK(sb);
			sbu = sbused(sb);
			if (sbu > 0) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, drop));
				drop -= min(sbu, drop);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
	}

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		wr = write_iscsi_mbuf_wr(toep, sndptr);
		if (wr == NULL) {
			toep->flags |= TPF_TX_SUSPENDED;
			return;
		}

		plen = sndptr->m_pkthdr.len;
		credits = howmany(wr->wr_len, 16);
		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;

		/*
		 * Ensure there are enough credits for a full-sized WR
		 * as page pod WRs can be full-sized.
		 */
		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			wrhdr = wrtod(wr);
			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
{

	if (ulp_mode(toep) == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, drop);
	else if (toep->flags & TPF_KTLS)
		t4_push_ktls(sc, toep, drop);
	else
		t4_push_frames(sc, toep, drop);
}

int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tptoinpcb(tp);
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_data(sc, toep, 0);

	return (0);
}

int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tptoinpcb(tp);
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED)
		t4_push_data(sc, toep, 0);

	return (0);
}

int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tptoinpcb(tp);
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_peer_close and if
		 * this is still a synqe instead of a toepcb then the connection
		 * must be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    toep->ddp.flags, inp);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		DDP_LOCK(toep);
		if (__predict_false(toep->ddp.flags &
		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
			handle_ddp_close(toep, tp, cpl->rcv_nxt);
		DDP_UNLOCK(toep);
	}
	so = inp->inp_socket;
	socantrcvmore(so);

	if (ulp_mode(toep) == ULP_MODE_RDMA ||
	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
		/*
		 * There might be data received via DDP before the FIN
		 * not reported to the driver.  Just assume the
		 * sequence number in the CPL is correct as the
		 * sequence number of the FIN.
		 */
	} else {
		KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
	}

	tp->rcv_nxt = be32toh(cpl->rcv_nxt);

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_CLOSE_WAIT);
		break;

	case TCPS_FIN_WAIT_1:
		tcp_state_change(tp, TCPS_CLOSING);
		break;

	case TCPS_FIN_WAIT_2:
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

void
send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & INP_DROPPED) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct epoch_tracker et;
	int len;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_rx_data and if this
		 * is still a synqe instead of a toepcb then the connection must
		 * be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
	    toep->flags & TPF_TLS_RECEIVE)) {
		/* Received "raw" data on a TLS socket. */
		CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
		    __func__, tid, len);
		do_rx_data_tls(cpl, toep, m);
		return (0);
	}

	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
		    ("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_LOCK(toep);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
			DDP_UNLOCK(toep);
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
	}

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
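		/*
		 * 'changed' is set when the DDP on/off state reported by
		 * the chip in this CPL differs from the driver's notion.
		 */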

		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
			    __func__, tid, len);

		if (changed) {
			if (toep->ddp.flags & DDP_SC_REQ)
				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
			else if (cpl->ddp_off == 1) {
				/* Fell out of DDP mode */
				toep->ddp.flags &= ~DDP_ON;
				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
				    __func__);

				insert_ddp_data(toep, ddp_placed);
			} else {
				/*
				 * Data was received while still
				 * ULP_MODE_NONE, just fall through.
				 */
			}
		}

		if (toep->ddp.flags & DDP_ON) {
			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.
			 * Start posting queued AIO requests via DDP.  The
			 * payload that arrived in this indicate is appended
			 * to the socket buffer as usual.
			 */
			handle_ddp_indicate(toep);
		}
	}

	sbappendstream_locked(sb, m, 0);
	t4_rcvd_locked(&toep->td->tod, tp);

	if (ulp_mode(toep) == ULP_MODE_TCPDDP &&
	    (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 &&
	    sbavail(sb) != 0) {
		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
		    tid);
		ddp_queue_toep(toep);
	}
	if (toep->flags & TPF_TLS_STARTING)
		tls_received_starting_data(sc, toep, sb, len);
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);
	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_UNLOCK(toep);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}
1803
static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
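	/*
	 * Walk the software tx descriptor ring, retiring one descriptor
	 * per fully-acked work request.  The hardware never returns
	 * credits for part of a work request, hence the KASSERT below.
	 */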
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

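	/*
	 * All credits are back, so no tx work requests remain in flight;
	 * reset the completion-coalescing counters.
	 */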
	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

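	/*
	 * Resume a tid that ran out of tx credits once at least a quarter
	 * of its credits have been returned; otherwise just release the
	 * acked data from the send buffer.
	 */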
	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		t4_push_data(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

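/*
 * Update a field in the hardware TCB for this tid: 'word' selects a 64-bit
 * word of the TCB and the bits in 'mask' are set to 'val'.  If 'reply' is
 * requested, the reply (CPL_SET_TCB_RPL) is tagged with 'cookie' so that it
 * can be routed to the right consumer.  Work requests sent on an offload
 * queue consume tx credits, accounted for below.
 */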
void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if (wrq->eq.type == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

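/*
 * Register the CPL handlers that t4_tom uses for connection teardown, rx
 * data, and tx completions.  ABORT_RPL_RSS and FW4_ACK are shared with
 * other consumers of those opcodes and are dispatched by cookie.
 */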
void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define	aio_error	backend1
#define	aio_sent	backend3
#define	aio_refs	backend4
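
/*
 * Job reference lifecycle: t4_aio_queue_aiotx() initializes the count to 1
 * (the queue's reference), alloc_aiotx_mbuf() takes one reference per
 * unmapped mbuf, and aiotx_free_job() completes the job when the last
 * reference is dropped.
 */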

#ifdef VERBOSE_TRACES
static int
jobtotid(struct kaiocb *job)
{
	struct socket *so;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	return (toep->tid);
}
#endif

static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
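	/*
	 * A partial write is reported as a short success rather than an
	 * error: if any data was sent, discard the error and complete
	 * the job with the byte count sent so far.
	 */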
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
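	/*
	 * Carve the request into chunks of at most MBUF_PEXT_MAX_PGS
	 * pages.  Only the first chunk may start at a non-zero page
	 * offset; every subsequent chunk begins on a page boundary.
	 */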
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	u_int sent;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(job->fd_file->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			SOCK_IO_SEND_UNLOCK(so);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

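	/*
	 * If other jobs are queued behind this one, more data will
	 * follow regardless of how much of this job remains.
	 */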
	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		SOCK_IO_SEND_UNLOCK(so);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		SOCK_IO_SEND_UNLOCK(so);
		error = ECONNRESET;
		goto out;
	}

	sent = m_length(m, NULL);
	job->aio_sent += sent;
	counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);

	sbappendstream(sb, m, 0);
	m = NULL;

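	/*
	 * TF_MORETOCOME hints to tcp_output() that more data is queued
	 * behind this send, letting it hold back a partial final
	 * segment.
	 */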
	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tcp_output(tp);
		if (error < 0) {
			INP_UNLOCK_ASSERT(inp);
			SOCK_IO_SEND_UNLOCK(so);
			error = -error;
			goto out;
		}
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	SOCK_IO_SEND_UNLOCK(so);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);
	counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

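/*
 * Task handler that drains the tid's queue of AIO tx jobs for as long as
 * the socket stays writable.  It runs with references on both the socket
 * and the toepcb, taken by t4_aiotx_queue_toep() and released here.
 */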
static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;
	struct epoch_tracker et;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	NET_EPOCH_EXIT(et);

	free_toepcb(toep);
	sorele(so);
	CURVNET_RESTORE();
}

static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
#endif
	if (toep->aiotx_so != NULL)
		return;
	soref(so);
	toep->aiotx_so = so;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	job->aio_error = (void *)(intptr_t)ECANCELED;
	aiotx_free_job(job);
}

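/*
 * Socket AIO queue hook for zero-copy tx.  Writes are accepted only when
 * tt.tx_zcopy is enabled and the connection is not using a TLS tx key;
 * everything else is rejected with EOPNOTSUPP so the caller can fall back
 * to the default AIO path.
 */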
int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	refcount_init(&job->aio_refs, 1);
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(so, toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif
