1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 330303 2018-03-03 00:54:12Z jhb $");
29
30#include "opt_inet.h"
31
32#ifdef TCP_OFFLOAD
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sockbuf.h>
47#include <sys/sysctl.h>
48#include <sys/syslog.h>
49#include <sys/protosw.h>
50#include <sys/priv.h>
51#include <sys/sglist.h>
52#include <sys/taskqueue.h>
53
54#include <net/if.h>
55#include <net/if_var.h>
56#include <net/ethernet.h>
57#include <net/route.h>
58
59#include <netinet/in.h>
60#include <netinet/in_pcb.h>
61#include <netinet/in_systm.h>
62#include <netinet/in_var.h>
63
64#include <netinet/ip.h>
65#define TCPSTATES
66#include <netinet/tcp_fsm.h>
67#include <netinet/tcp_var.h>
68#include <netinet/toecore.h>
69#include <netinet/tcp_seq.h>
70#include <netinet/tcp_timer.h>
71#include <net/route.h>
72
73#include "cxgb_include.h"
74#include "ulp/tom/cxgb_l2t.h"
75#include "ulp/tom/cxgb_tom.h"
76#include "ulp/tom/cxgb_toepcb.h"
77
78VNET_DECLARE(int, tcp_do_autosndbuf);
79#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
80VNET_DECLARE(int, tcp_autosndbuf_inc);
81#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
82VNET_DECLARE(int, tcp_autosndbuf_max);
83#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
84VNET_DECLARE(int, tcp_do_autorcvbuf);
85#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
86VNET_DECLARE(int, tcp_autorcvbuf_inc);
87#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
88VNET_DECLARE(int, tcp_autorcvbuf_max);
89#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
90
91/*
92 * For ULP connections HW may add headers, e.g., for digests, that aren't part
93 * of the messages sent by the host but that are part of the TCP payload and
94 * therefore consume TCP sequence space.  Tx connection parameters that
95 * operate in TCP sequence space are affected by the HW additions and need to
96 * compensate for them to accurately track TCP sequence numbers. This array
97 * contains the compensating extra lengths for ULP packets.  It is indexed by
98 * a packet's ULP submode.
99 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};	/* indexed by ULP submode */
101
102/*
103 * Max receive window supported by HW in bytes.  Only a small part of it can
104 * be set through option0, the rest needs to be set through RX_DATA_ACK.
105 */
106#define MAX_RCV_WND ((1U << 27) - 1)
107
108/*
109 * Min receive window.  We want it to be large enough to accommodate receive
110 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
111 */
112#define MIN_RCV_WND (24 * 1024U)
113#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
114
115static void t3_release_offload_resources(struct toepcb *);
116static void send_reset(struct toepcb *toep);
117
118/*
119 * Called after the last CPL for the toepcb has been received.
120 *
121 * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
122 * time this function exits.
123 */
static int
toepcb_release(struct toepcb *toep)
{
	struct inpcb *inp = toep->tp_inp;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	int rc;

	INP_WLOCK_ASSERT(inp);
	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
	    ("%s: double release?", __func__));

	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);

	/* Sever the toepcb -> inp link; TP_CPL_DONE guards double release. */
	toep->tp_flags |= TP_CPL_DONE;
	toep->tp_inp = NULL;

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	/* If the kernel side already detached, free everything right here. */
	if (!(toep->tp_flags & TP_ATTACHED))
		t3_release_offload_resources(toep);

	/*
	 * Drop the hold installed by offload_socket()'s in_pcbref().
	 * Returns nonzero (with the inp freed and unlocked) if this was the
	 * last reference, zero otherwise — in which case we unlock it.
	 */
	rc = in_pcbrele_wlocked(inp);
	if (!rc)
		INP_WUNLOCK(inp);
	return (rc);
}
153
154/*
155 * One sided detach.  The tcpcb is going away and we need to unhook the toepcb
156 * hanging off it.  If the TOE driver is also done with the toepcb we'll release
157 * all offload resources.
158 */
static void
toepcb_detach(struct inpcb *inp)
{
	struct toepcb *toep;
	struct tcpcb *tp;

	KASSERT(inp, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);

	tp = intotcpcb(inp);
	toep = tp->t_toe;

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));

	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
	    toep, inp, tp);

	/* Unhook the toepcb from the tcpcb. */
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toep->tp_flags &= ~TP_ATTACHED;

	/*
	 * If the TOE driver has also finished with the toepcb (last CPL seen,
	 * see toepcb_release()) then free the offload resources now.
	 */
	if (toep->tp_flags & TP_CPL_DONE)
		t3_release_offload_resources(toep);
}
185
186void
187t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
188{
189
190	toepcb_detach(tp->t_inpcb);
191}
192
193static int
194alloc_atid(struct tid_info *t, void *ctx)
195{
196	int atid = -1;
197
198	mtx_lock(&t->atid_lock);
199	if (t->afree) {
200		union active_open_entry *p = t->afree;
201
202		atid = (p - t->atid_tab) + t->atid_base;
203		t->afree = p->next;
204		p->ctx = ctx;
205		t->atids_in_use++;
206	}
207	mtx_unlock(&t->atid_lock);
208
209	return (atid);
210}
211
212static void
213free_atid(struct tid_info *t, int atid)
214{
215	union active_open_entry *p = atid2entry(t, atid);
216
217	mtx_lock(&t->atid_lock);
218	p->next = t->afree;
219	t->afree = p;
220	t->atids_in_use--;
221	mtx_unlock(&t->atid_lock);
222}
223
224void
225insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
226{
227	struct tid_info *t = &td->tid_maps;
228
229	t->tid_tab[tid] = ctx;
230	atomic_add_int(&t->tids_in_use, 1);
231}
232
233void
234update_tid(struct tom_data *td, void *ctx, unsigned int tid)
235{
236	struct tid_info *t = &td->tid_maps;
237
238	t->tid_tab[tid] = ctx;
239}
240
241void
242remove_tid(struct tom_data *td, unsigned int tid)
243{
244	struct tid_info *t = &td->tid_maps;
245
246	t->tid_tab[tid] = NULL;
247	atomic_add_int(&t->tids_in_use, -1);
248}
249
/*
 * Defer the release of a tid to the tid_release task.  The tid's own
 * tid_tab slot is reused as the "next" pointer of the pending-release list
 * (its ctx is no longer needed once the tid is being released).
 */
void
queue_tid_release(struct toedev *tod, unsigned int tid)
{
	struct tom_data *td = t3_tomdata(tod);
	void **p = &td->tid_maps.tid_tab[tid];
	struct adapter *sc = tod->tod_softc;

	mtx_lock(&td->tid_release_lock);
	*p = td->tid_release_list;
	td->tid_release_list = p;
	/* Kick the task only if the list was empty; otherwise one is pending. */
	if (!*p)
		taskqueue_enqueue(sc->tq, &td->tid_release_task);
	mtx_unlock(&td->tid_release_lock);
}
265
266/*
267 * Populate a TID_RELEASE WR.
268 */
269static inline void
270mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
271{
272
273	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
274	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
275}
276
/*
 * Free a hardware tid: send a TID_RELEASE CPL right away if an mbuf is
 * available, otherwise hand the tid to the deferred-release task.
 */
void
release_tid(struct toedev *tod, unsigned int tid, int qset)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
	struct cpl_tid_release *cpl;
#ifdef INVARIANTS
	struct tid_info *t = &td->tid_maps;	/* used by the KASSERT only */
#endif

	KASSERT(tid < t->ntids,
	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m) {
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
	} else
		queue_tid_release(tod, tid);

}
300
/*
 * Taskqueue handler: release the tids that queue_tid_release() could not
 * free inline (no mbuf was available at the time).
 */
void
t3_process_tid_release_list(void *data, int pending)
{
	struct mbuf *m;
	struct tom_data *td = data;
	struct adapter *sc = td->tod.tod_softc;

	mtx_lock(&td->tid_release_lock);
	while (td->tid_release_list) {
		/* List links are threaded through the tid_tab slots. */
		void **p = td->tid_release_list;
		unsigned int tid = p - td->tid_maps.tid_tab;
		struct cpl_tid_release *cpl;

		td->tid_release_list = (void **)*p;
		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
		if (m == NULL)
			break;	/* XXX: who reschedules the release task? */
		/* Drop the lock while building and transmitting the CPL. */
		mtx_unlock(&td->tid_release_lock);
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
		mtx_lock(&td->tid_release_lock);
	}
	mtx_unlock(&td->tid_release_lock);
}
326
/*
 * Send a CLOSE_CON_REQ (FIN) to the hardware for this connection.  Sent at
 * most once per connection, guarded by TP_FIN_SENT.
 */
static void
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;

	if (toep->tp_flags & TP_FIN_SENT)
		return;

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();	/* no recovery path for mbuf failure */

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
	req->rsvd = 0;

	/* Set the flag before transmitting so a second FIN is never sent. */
	toep->tp_flags |= TP_FIN_SENT;
	t3_offload_tx(sc, m);
}
348
349static inline void
350make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
351    struct mbuf *tail)
352{
353	struct tcpcb *tp = so_sototcpcb(so);
354	struct toepcb *toep = tp->t_toe;
355	struct sockbuf *snd;
356
357	inp_lock_assert(tp->t_inpcb);
358	snd = so_sockbuf_snd(so);
359
360	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
361	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
362	/* len includes the length of any HW ULP additions */
363	req->len = htonl(len);
364	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
365	/* V_TX_ULP_SUBMODE sets both the mode and submode */
366	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
367	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
368	req->sndseq = htonl(tp->snd_nxt);
369	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
370		struct adapter *sc = toep->tp_tod->tod_softc;
371		int cpu_idx = sc->rrss_map[toep->tp_qset];
372
373		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
374		    V_TX_CPU_IDX(cpu_idx));
375
376		/* Sendbuffer is in units of 32KB. */
377		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
378			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
379		else
380			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
381
382		toep->tp_flags |= TP_DATASENT;
383	}
384}
385
386/*
387 * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
388 * TOM_XXX_MOVE to some common header file.
389 */
390/*
391 * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
392 * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
393 * for the second gen bit flit.  This leaves us with 12 flits.
394 *
395 * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
396 * The first desc has a tx_data_wr (which includes the WR header), the rest have
397 * the WR header only.  All descs have the second gen bit flit.
398 *
399 * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
400 * desc has a tx_data_wr (which includes the WR header), the rest have the WR
401 * header only.  All descs have the second gen bit flit.
402 *
403 * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
404 *
405 */
#define IMM_LEN 96	/* max payload bytes in an immediate-data WR */
/* Max # of SGL entries that fit in 0..TX_MAX_DESC tx descriptors. */
static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
/* # of tx descriptors consumed by an SGL with the given # of entries. */
static int sgllen_to_descs[TX_MAX_SEGS] = {
	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/*  0 -  9 */
	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
	4, 4, 4, 4, 4, 4		/* 30 - 35 */
};
#if 0
static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
};
#endif
#if SGE_NUM_GENBITS != 2
#error "SGE_NUM_GENBITS really must be 2"
#endif
422
/*
 * Push pending send-buffer data to the hardware as TX_DATA work requests,
 * one WR per iteration, until the data or the WR credits run out.  Returns
 * the total number of payload bytes queued.
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m0, *sndptr, *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	int bytes, ndesc, total_bytes = 0, mlen;
	struct sockbuf *snd;
	struct sglist *sgl;
	struct ofld_hdr *oh;
	caddr_t dst;
	struct tx_data_wr *wr;

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	SOCKBUF_LOCK(snd);

	/*
	 * Autosize the send buffer.
	 */
	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
		if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) &&
		    sbused(snd) < VNET(tcp_autosndbuf_max)) {
			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
			    so, curthread))
				snd->sb_flags &= ~SB_AUTOSIZE;
		}
	}

	/*
	 * Resume where the previous call stopped: tp_m_last, when set, is the
	 * last mbuf whose data was fully queued, so start at its successor.
	 */
	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
		sndptr = toep->tp_m_last->m_next;
	else
		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	/* Nothing to send or no WRs available for sending data */
	if (toep->tp_wr_avail == 0 || sndptr == NULL)
		goto out;

	/* Something to send and at least 1 WR available */
	while (toep->tp_wr_avail && sndptr != NULL) {

		m0 = m_gethdr(M_NOWAIT, MT_DATA);
		if (m0 == NULL)
			break;
		/* Layout: ofld_hdr, then the WR, then any immediate data. */
		oh = mtod(m0, struct ofld_hdr *);
		wr = (void *)(oh + 1);
		dst = (void *)(wr + 1);

		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
		    V_HDR_QSET(toep->tp_qset);

		/*
		 * Try to construct an immediate data WR if possible.  Stuff as
		 * much data into it as possible, one whole mbuf at a time.
		 */
		mlen = sndptr->m_len;
		ndesc = bytes = 0;
		while (mlen <= IMM_LEN - bytes) {
			bcopy(sndptr->m_data, dst, mlen);
			bytes += mlen;
			dst += mlen;

			if (!(sndptr = sndptr->m_next))
				break;
			mlen = sndptr->m_len;
		}

		if (bytes) {

			/* Was able to fit 'bytes' bytes in an immediate WR */

			ndesc = 1;
			make_tx_data_wr(so, wr, bytes, sndptr);

			m0->m_len += bytes;
			m0->m_pkthdr.len = m0->m_len;

		} else {
			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);

			/* Need to make an SGL */

			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
			if (sgl == NULL)
				break;

			/* Append whole mbufs until the SGL is full. */
			for (m = sndptr; m != NULL; m = m->m_next) {
				if ((mlen = m->m_len) > 0) {
					if (sglist_append(sgl, m->m_data, mlen))
					    break;
				}
				bytes += mlen;
			}
			sndptr = m;
			if (bytes == 0) {
				sglist_free(sgl);
				break;
			}
			ndesc = sgllen_to_descs[sgl->sg_nseg];
			oh->flags |= F_HDR_SGL;
			oh->sgl = sgl;
			make_tx_data_wr(so, wr, bytes, sndptr);
		}

		oh->flags |= V_HDR_NDESC(ndesc);
		oh->plen = bytes;

		/* Advance the socket's send pointer past the queued bytes. */
		snd->sb_sndptr = sndptr;
		snd->sb_sndptroff += bytes;
		if (sndptr == NULL) {
			/* Everything queued; remember the last mbuf consumed. */
			snd->sb_sndptr = snd->sb_mbtail;
			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
			toep->tp_m_last = snd->sb_mbtail;
		} else
			toep->tp_m_last = NULL;

		total_bytes += bytes;

		toep->tp_wr_avail -= ndesc;
		toep->tp_wr_unacked += ndesc;

		/*
		 * Ask for a completion on this WR if the caller requested one
		 * (applied to the first WR built here) or if half the WR
		 * budget is outstanding without an ack.
		 */
		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		enqueue_wr(toep, m0);
		l2t_send(sc, m0, toep->tp_l2t);
	}
out:
	SOCKBUF_UNLOCK(snd);

	/* All data is out and a FIN is pending: send the FIN now. */
	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
		close_conn(sc, toep);

	return (total_bytes);
}
566
/*
 * Return "credits" bytes of RX credit to the hardware via an RX_DATA_ACK.
 * Returns the number of credits actually sent, or 0 on mbuf allocation
 * failure.
 */
static int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (0);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wrh_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	t3_offload_tx(sc, m);
	return (credits);
}
585
/*
 * TOE hook called after the kernel has removed data from the socket's
 * receive buffer.  Converts the freed space into rx credits and returns
 * them to the hardware once enough accumulate, growing the receive window.
 */
void
t3_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *so_rcv = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int must_send;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(so_rcv);
	KASSERT(toep->tp_enqueued >= sbused(so_rcv),
	    ("%s: sbused(so_rcv) > enqueued", __func__));
	/* Whatever left the sockbuf since the last call becomes rx credit. */
	toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv);
	toep->tp_enqueued = sbused(so_rcv);
	SOCKBUF_UNLOCK(so_rcv);

	/* Flush when the window is nearly exhausted or >= 15KB accumulated. */
	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
	if (must_send || toep->tp_rx_credits >= 15 * 1024) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
		toep->tp_rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
}
615
616static int
617do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
618{
619	struct adapter *sc = qs->adap;
620	struct tom_data *td = sc->tom_softc;
621	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
622	unsigned int tid = GET_TID(hdr);
623	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
624
625	log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp);
626
627	m_freem(m);
628	return (0);
629}
630
/*
 * TOE hook to send a FIN.  Marks the connection TP_SEND_FIN and calls
 * t3_push_frames(), which transmits the CLOSE_CON_REQ once all pending
 * send-buffer data has been queued.
 */
int
t3_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp_inpcbtosocket(inp);
#if defined(KTR)
	unsigned int tid = toep->tp_tid;	/* only used for tracing */
#endif

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
	    toep->tp_flags);

	toep->tp_flags |= TP_SEND_FIN;
	t3_push_frames(so, 1);

	return (0);
}
652
653int
654t3_tod_output(struct toedev *tod, struct tcpcb *tp)
655{
656	struct inpcb *inp = tp->t_inpcb;
657	struct socket *so = inp->inp_socket;
658
659	t3_push_frames(so, 1);
660	return (0);
661}
662
663/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */
664int
665find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
666{
667	unsigned short *mtus = &sc->params.mtus[0];
668	int i = 0, mss;
669
670	KASSERT(inc != NULL || pmss > 0,
671	    ("%s: at least one of inc/pmss must be specified", __func__));
672
673	mss = inc ? tcp_mssopt(inc) : pmss;
674	if (pmss > 0 && mss > pmss)
675		mss = pmss;
676
677	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
678		++i;
679
680	return (i);
681}
682
/*
 * Free every work-request mbuf still queued on the connection, including
 * any SGL attached to a WR's offload header.
 */
static inline void
purge_wr_queue(struct toepcb *toep)
{
	struct mbuf *m;
	struct ofld_hdr *oh;

	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) {
		oh = mtod(m, struct ofld_hdr *);
		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(m);
	}
}
696
697/*
698 * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T
699 * entry, etc.)
700 */
701static void
702t3_release_offload_resources(struct toepcb *toep)
703{
704	struct toedev *tod = toep->tp_tod;
705	struct tom_data *td = t3_tomdata(tod);
706
707	/*
708	 * The TOM explicitly detaches its toepcb from the system's inp before
709	 * it releases the offload resources.
710	 */
711	if (toep->tp_inp) {
712		panic("%s: inp %p still attached to toepcb %p",
713		    __func__, toep->tp_inp, toep);
714	}
715
716	if (toep->tp_wr_avail != toep->tp_wr_max)
717		purge_wr_queue(toep);
718
719	if (toep->tp_l2t) {
720		l2t_release(td->l2t, toep->tp_l2t);
721		toep->tp_l2t = NULL;
722	}
723
724	if (toep->tp_tid >= 0)
725		release_tid(tod, toep->tp_tid, toep->tp_qset);
726
727	toepcb_free(toep);
728}
729
730/*
731 * Determine the receive window size for a socket.
732 */
733unsigned long
734select_rcv_wnd(struct socket *so)
735{
736	unsigned long wnd;
737
738	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
739
740	wnd = sbspace(&so->so_rcv);
741	if (wnd < MIN_RCV_WND)
742		wnd = MIN_RCV_WND;
743
744	return min(wnd, MAX_RCV_WND);
745}
746
747int
748select_rcv_wscale(void)
749{
750	int wscale = 0;
751	unsigned long space = sb_max;
752
753	if (space > MAX_RCV_WND)
754		space = MAX_RCV_WND;
755
756	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
757		wscale++;
758
759	return (wscale);
760}
761
762
763/*
764 * Set up the socket for TCP offload.
765 */
766void
767offload_socket(struct socket *so, struct toepcb *toep)
768{
769	struct toedev *tod = toep->tp_tod;
770	struct tom_data *td = t3_tomdata(tod);
771	struct inpcb *inp = sotoinpcb(so);
772	struct tcpcb *tp = intotcpcb(inp);
773
774	INP_WLOCK_ASSERT(inp);
775
776	/* Update socket */
777	SOCKBUF_LOCK(&so->so_snd);
778	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
779	SOCKBUF_UNLOCK(&so->so_snd);
780	SOCKBUF_LOCK(&so->so_rcv);
781	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
782	SOCKBUF_UNLOCK(&so->so_rcv);
783
784	/* Update TCP PCB */
785	tp->tod = toep->tp_tod;
786	tp->t_toe = toep;
787	tp->t_flags |= TF_TOE;
788
789	/* Install an extra hold on inp */
790	toep->tp_inp = inp;
791	toep->tp_flags |= TP_ATTACHED;
792	in_pcbref(inp);
793
794	/* Add the TOE PCB to the active list */
795	mtx_lock(&td->toep_list_lock);
796	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
797	mtx_unlock(&td->toep_list_lock);
798}
799
/*
 * This is _not_ the normal way to "unoffload" a socket.  It is only used to
 * back out of a failed offload attempt (see t3_connect()) and undoes
 * exactly what offload_socket() did, expecting the extra inp hold to still
 * be the non-final reference.
 */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	INP_WLOCK_ASSERT(inp);

	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->tp_inp = NULL;
	toep->tp_flags &= ~TP_ATTACHED;
	/* Drop the hold from offload_socket(); it must not be the last ref. */
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}
828
829/*
830 * Socket could be a listening socket, and we may not have a toepcb at all at
831 * this time.
832 */
833uint32_t
834calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
835{
836	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
837	    V_MSS_IDX(mtu_idx);
838
839	if (so != NULL) {
840		struct inpcb *inp = sotoinpcb(so);
841		struct tcpcb *tp = intotcpcb(inp);
842		int keepalive = tcp_always_keepalive ||
843		    so_options_get(so) & SO_KEEPALIVE;
844
845		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
846		opt0h |= V_KEEP_ALIVE(keepalive != 0);
847	}
848
849	if (e != NULL)
850		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);
851
852	return (htobe32(opt0h));
853}
854
/*
 * Compute the low word of option 0: ULP mode, initial receive buffer size
 * (in the units expected by V_RCV_BUFSIZ), and IP TOS.
 */
uint32_t
calc_opt0l(struct socket *so, int rcv_bufsize)
{
	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);

	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));

	if (so != NULL)		/* optional because no one cares about IP TOS */
		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));

	return (htobe32(opt0l));
}
868
869/*
870 * Convert an ACT_OPEN_RPL status to an errno.
871 */
872static int
873act_open_rpl_status_to_errno(int status)
874{
875	switch (status) {
876	case CPL_ERR_CONN_RESET:
877		return (ECONNREFUSED);
878	case CPL_ERR_ARP_MISS:
879		return (EHOSTUNREACH);
880	case CPL_ERR_CONN_TIMEDOUT:
881		return (ETIMEDOUT);
882	case CPL_ERR_TCAM_FULL:
883		return (EAGAIN);
884	case CPL_ERR_CONN_EXIST:
885		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
886		return (EAGAIN);
887	default:
888		return (EIO);
889	}
890}
891
892/*
893 * Return whether a failed active open has allocated a TID
894 */
895static inline int
896act_open_has_tid(int status)
897{
898	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
899	       status != CPL_ERR_ARP_MISS;
900}
901
902/*
903 * Active open failed.
904 */
905static int
906do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
907{
908	struct adapter *sc = qs->adap;
909	struct tom_data *td = sc->tom_softc;
910	struct toedev *tod = &td->tod;
911	struct cpl_act_open_rpl *rpl = mtod(m, void *);
912	unsigned int atid = G_TID(ntohl(rpl->atid));
913	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
914	struct inpcb *inp = toep->tp_inp;
915	int s = rpl->status, rc;
916
917	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);
918
919	free_atid(&td->tid_maps, atid);
920	toep->tp_tid = -1;
921
922	if (act_open_has_tid(s))
923		queue_tid_release(tod, GET_TID(rpl));
924
925	rc = act_open_rpl_status_to_errno(s);
926	if (rc != EAGAIN)
927		INP_INFO_RLOCK(&V_tcbinfo);
928	INP_WLOCK(inp);
929	toe_connect_failed(tod, inp, rc);
930	toepcb_release(toep);	/* unlocks inp */
931	if (rc != EAGAIN)
932		INP_INFO_RUNLOCK(&V_tcbinfo);
933
934	m_freem(m);
935	return (0);
936}
937
938/*
939 * Send an active open request.
940 *
941 * State of affairs on entry:
942 * soisconnecting (so_state |= SS_ISCONNECTING)
943 * tcbinfo not locked (this has changed - used to be WLOCKed)
944 * inp WLOCKed
945 * tp->t_state = TCPS_SYN_SENT
946 * rtalloc1, RT_UNLOCK on rt.
947 */
948int
949t3_connect(struct toedev *tod, struct socket *so,
950    struct rtentry *rt, struct sockaddr *nam)
951{
952	struct mbuf *m = NULL;
953	struct l2t_entry *e = NULL;
954	struct tom_data *td = t3_tomdata(tod);
955	struct adapter *sc = tod->tod_softc;
956	struct cpl_act_open_req *cpl;
957	struct inpcb *inp = sotoinpcb(so);
958	struct tcpcb *tp = intotcpcb(inp);
959	struct toepcb *toep;
960	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
961	struct sockaddr *gw;
962	struct ifnet *ifp = rt->rt_ifp;
963	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */
964
965	INP_WLOCK_ASSERT(inp);
966
967	toep = toepcb_alloc(tod);
968	if (toep == NULL)
969		goto failed;
970
971	atid = alloc_atid(&td->tid_maps, toep);
972	if (atid < 0)
973		goto failed;
974
975	qset = pi->first_qset + (arc4random() % pi->nqsets);
976
977	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
978	if (m == NULL)
979		goto failed;
980
981	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
982	e = t3_l2t_get(pi, ifp, gw);
983	if (e == NULL)
984		goto failed;
985
986	toep->tp_l2t = e;
987	toep->tp_tid = atid;	/* used to double check response */
988	toep->tp_qset = qset;
989
990	SOCKBUF_LOCK(&so->so_rcv);
991	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
992	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
993	SOCKBUF_UNLOCK(&so->so_rcv);
994
995	offload_socket(so, toep);
996
997	/*
998	 * The kernel sets request_r_scale based on sb_max whereas we need to
999	 * take hardware's MAX_RCV_WND into account too.  This is normally a
1000	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
1001	 */
1002	if (tp->t_flags & TF_REQ_SCALE)
1003		rscale = tp->request_r_scale = select_rcv_wscale();
1004	else
1005		rscale = 0;
1006	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
1007	cpu_idx = sc->rrss_map[qset];
1008
1009	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
1010	cpl->wr.wrh_lo = 0;
1011	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1012	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
1013	    &cpl->peer_port);
1014	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
1015	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
1016	cpl->params = 0;
1017	cpl->opt2 = calc_opt2(cpu_idx);
1018
1019	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
1020	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);
1021
1022	if (l2t_send(sc, m, e) == 0)
1023		return (0);
1024
1025	undo_offload_socket(so);
1026
1027failed:
1028	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
1029	    __func__, atid, toep, e, m);
1030
1031	if (atid >= 0)
1032		free_atid(&td->tid_maps, atid);
1033
1034	if (e)
1035		l2t_release(td->l2t, e);
1036
1037	if (toep)
1038		toepcb_free(toep);
1039
1040	m_freem(m);
1041
1042	return (ENOMEM);
1043}
1044
1045/*
1046 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
1047 * send multiple ABORT_REQs for the same connection and also that we do not try
1048 * to send a message after the connection has closed.
1049 */
1050static void
1051send_reset(struct toepcb *toep)
1052{
1053
1054	struct cpl_abort_req *req;
1055	unsigned int tid = toep->tp_tid;
1056	struct inpcb *inp = toep->tp_inp;
1057	struct socket *so = inp->inp_socket;
1058	struct tcpcb *tp = intotcpcb(inp);
1059	struct toedev *tod = toep->tp_tod;
1060	struct adapter *sc = tod->tod_softc;
1061	struct mbuf *m;
1062
1063	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1064	INP_WLOCK_ASSERT(inp);
1065
1066	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
1067	    toep->tp_flags);
1068
1069	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
1070		return;
1071
1072	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
1073
1074	/* Purge the send queue */
1075	sbflush(so_sockbuf_snd(so));
1076	purge_wr_queue(toep);
1077
1078	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
1079	if (m == NULL)
1080		CXGB_UNIMPLEMENTED();
1081
1082	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1083	req->wr.wrh_lo = htonl(V_WR_TID(tid));
1084	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1085	req->rsvd0 = htonl(tp->snd_nxt);
1086	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1087	req->cmd = CPL_ABORT_SEND_RST;
1088
1089	if (tp->t_state == TCPS_SYN_SENT)
1090		(void )mbufq_enqueue(&toep->out_of_order_queue, m); /* defer */
1091	else
1092		l2t_send(sc, m, toep->tp_l2t);
1093}
1094
1095int
1096t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
1097{
1098
1099	send_reset(tp->t_toe);
1100	return (0);
1101}
1102
1103/*
1104 * Handler for RX_DATA CPL messages.
1105 */
1106static int
1107do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1108{
1109	struct adapter *sc = qs->adap;
1110	struct tom_data *td = sc->tom_softc;
1111	struct cpl_rx_data *hdr = mtod(m, void *);
1112	unsigned int tid = GET_TID(hdr);
1113	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1114	struct inpcb *inp = toep->tp_inp;
1115	struct tcpcb *tp;
1116	struct socket *so;
1117	struct sockbuf *so_rcv;
1118
1119	/* Advance over CPL */
1120	m_adj(m, sizeof(*hdr));
1121
1122	/* XXX: revisit.  This comes from the T4 TOM */
1123	if (__predict_false(inp == NULL)) {
1124		/*
1125		 * do_pass_establish failed and must be attempting to abort the
1126		 * connection.  Meanwhile, the T4 has sent us data for such a
1127		 * connection.
1128		 */
1129#ifdef notyet
1130		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
1131		    ("%s: inp NULL and tid isn't being aborted", __func__));
1132#endif
1133		m_freem(m);
1134		return (0);
1135	}
1136
1137	INP_WLOCK(inp);
1138	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
1139		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
1140		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
1141		INP_WUNLOCK(inp);
1142		m_freem(m);
1143		return (0);
1144	}
1145
1146	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
1147		toep->tp_delack_mode = hdr->dack_mode;
1148
1149	tp = intotcpcb(inp);
1150
1151#ifdef INVARIANTS
1152	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
1153		log(LOG_ERR,
1154		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
1155		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
1156	}
1157#endif
1158	tp->rcv_nxt += m->m_pkthdr.len;
1159	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
1160	    ("%s: negative window size", __func__));
1161	tp->rcv_wnd -= m->m_pkthdr.len;
1162	tp->t_rcvtime = ticks;
1163
1164	so  = inp->inp_socket;
1165	so_rcv = &so->so_rcv;
1166	SOCKBUF_LOCK(so_rcv);
1167
1168	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
1169		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
1170		    __func__, tid, m->m_pkthdr.len);
1171		SOCKBUF_UNLOCK(so_rcv);
1172		INP_WUNLOCK(inp);
1173
1174		INP_INFO_RLOCK(&V_tcbinfo);
1175		INP_WLOCK(inp);
1176		tp = tcp_drop(tp, ECONNRESET);
1177		if (tp)
1178			INP_WUNLOCK(inp);
1179		INP_INFO_RUNLOCK(&V_tcbinfo);
1180
1181		m_freem(m);
1182		return (0);
1183	}
1184
1185	/* receive buffer autosize */
1186	if (so_rcv->sb_flags & SB_AUTOSIZE &&
1187	    V_tcp_do_autorcvbuf &&
1188	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
1189	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
1190		unsigned int hiwat = so_rcv->sb_hiwat;
1191		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
1192		    V_tcp_autorcvbuf_max);
1193
1194		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
1195			so_rcv->sb_flags &= ~SB_AUTOSIZE;
1196		else
1197			toep->tp_rx_credits += newsize - hiwat;
1198	}
1199
1200	toep->tp_enqueued += m->m_pkthdr.len;
1201	sbappendstream_locked(so_rcv, m, 0);
1202	sorwakeup_locked(so);
1203	SOCKBUF_UNLOCK_ASSERT(so_rcv);
1204
1205	INP_WUNLOCK(inp);
1206	return (0);
1207}
1208
1209/*
1210 * Handler for PEER_CLOSE CPL messages.
1211 */
1212static int
1213do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1214{
1215	struct adapter *sc = qs->adap;
1216	struct tom_data *td = sc->tom_softc;
1217	const struct cpl_peer_close *hdr = mtod(m, void *);
1218	unsigned int tid = GET_TID(hdr);
1219	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1220	struct inpcb *inp = toep->tp_inp;
1221	struct tcpcb *tp;
1222	struct socket *so;
1223
1224	INP_INFO_RLOCK(&V_tcbinfo);
1225	INP_WLOCK(inp);
1226	tp = intotcpcb(inp);
1227
1228	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
1229	    tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp);
1230
1231	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
1232		goto done;
1233
1234	so = inp_inpcbtosocket(inp);
1235
1236	socantrcvmore(so);
1237	tp->rcv_nxt++;
1238
1239	switch (tp->t_state) {
1240	case TCPS_SYN_RECEIVED:
1241		tp->t_starttime = ticks;
1242		/* FALLTHROUGH */
1243	case TCPS_ESTABLISHED:
1244		tp->t_state = TCPS_CLOSE_WAIT;
1245		break;
1246	case TCPS_FIN_WAIT_1:
1247		tp->t_state = TCPS_CLOSING;
1248		break;
1249	case TCPS_FIN_WAIT_2:
1250		tcp_twstart(tp);
1251		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1252		INP_INFO_RUNLOCK(&V_tcbinfo);
1253
1254		INP_WLOCK(inp);
1255		toepcb_release(toep);	/* no more CPLs expected */
1256
1257		m_freem(m);
1258		return (0);
1259	default:
1260		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
1261		    __func__, toep->tp_tid, tp->t_state);
1262	}
1263
1264done:
1265	INP_WUNLOCK(inp);
1266	INP_INFO_RUNLOCK(&V_tcbinfo);
1267
1268	m_freem(m);
1269	return (0);
1270}
1271
1272/*
1273 * Handler for CLOSE_CON_RPL CPL messages.  peer ACK to our FIN received.
1274 */
1275static int
1276do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1277{
1278	struct adapter *sc = qs->adap;
1279	struct tom_data *td = sc->tom_softc;
1280	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
1281	unsigned int tid = GET_TID(rpl);
1282	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1283	struct inpcb *inp = toep->tp_inp;
1284	struct tcpcb *tp;
1285	struct socket *so;
1286
1287	INP_INFO_RLOCK(&V_tcbinfo);
1288	INP_WLOCK(inp);
1289	tp = intotcpcb(inp);
1290
1291	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
1292	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);
1293
1294	if ((toep->tp_flags & TP_ABORT_RPL_PENDING))
1295		goto done;
1296
1297	so = inp_inpcbtosocket(inp);
1298	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
1299
1300	switch (tp->t_state) {
1301	case TCPS_CLOSING:
1302		tcp_twstart(tp);
1303release:
1304		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1305		INP_INFO_RUNLOCK(&V_tcbinfo);
1306
1307		INP_WLOCK(inp);
1308		toepcb_release(toep);	/* no more CPLs expected */
1309
1310		m_freem(m);
1311		return (0);
1312	case TCPS_LAST_ACK:
1313		if (tcp_close(tp))
1314			INP_WUNLOCK(inp);
1315		goto release;
1316
1317	case TCPS_FIN_WAIT_1:
1318		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1319			soisdisconnected(so);
1320		tp->t_state = TCPS_FIN_WAIT_2;
1321		break;
1322	default:
1323		log(LOG_ERR,
1324		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
1325		    __func__, toep->tp_tid, tp->t_state);
1326	}
1327
1328done:
1329	INP_WUNLOCK(inp);
1330	INP_INFO_RUNLOCK(&V_tcbinfo);
1331
1332	m_freem(m);
1333	return (0);
1334}
1335
1336static int
1337do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1338{
1339	struct cpl_smt_write_rpl *rpl = mtod(m, void *);
1340
1341	if (rpl->status != CPL_ERR_NONE) {
1342		log(LOG_ERR,
1343		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
1344		    rpl->status, GET_TID(rpl));
1345	}
1346
1347	m_freem(m);
1348	return (0);
1349}
1350
1351static int
1352do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1353{
1354	struct cpl_set_tcb_rpl *rpl = mtod(m, void *);
1355
1356	if (rpl->status != CPL_ERR_NONE) {
1357		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
1358		    rpl->status, GET_TID(rpl));
1359	}
1360
1361	m_freem(m);
1362	return (0);
1363}
1364
1365/*
1366 * Handle an ABORT_RPL_RSS CPL message.
1367 */
1368static int
1369do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1370{
1371	struct adapter *sc = qs->adap;
1372	struct tom_data *td = sc->tom_softc;
1373	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
1374	unsigned int tid = GET_TID(rpl);
1375	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1376	struct inpcb *inp;
1377
1378	/*
1379	 * Ignore replies to post-close aborts indicating that the abort was
1380	 * requested too late.  These connections are terminated when we get
1381	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
1382	 * arrives the TID is either no longer used or it has been recycled.
1383	 */
1384	if (rpl->status == CPL_ERR_ABORT_FAILED) {
1385		m_freem(m);
1386		return (0);
1387	}
1388
1389	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
1390		return (do_abort_rpl_synqe(qs, r, m));
1391
1392	CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep,
1393	    rpl->status);
1394
1395	inp = toep->tp_inp;
1396	INP_WLOCK(inp);
1397
1398	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
1399		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) {
1400			toep->tp_flags |= TP_ABORT_RPL_RCVD;
1401			INP_WUNLOCK(inp);
1402		} else {
1403			toep->tp_flags &= ~TP_ABORT_RPL_RCVD;
1404			toep->tp_flags &= TP_ABORT_RPL_PENDING;
1405			toepcb_release(toep);	/* no more CPLs expected */
1406		}
1407	}
1408
1409	m_freem(m);
1410	return (0);
1411}
1412
1413/*
1414 * Convert the status code of an ABORT_REQ into a FreeBSD error code.
1415 */
1416static int
1417abort_status_to_errno(struct tcpcb *tp, int abort_reason)
1418{
1419	switch (abort_reason) {
1420	case CPL_ERR_BAD_SYN:
1421	case CPL_ERR_CONN_RESET:
1422		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
1423	case CPL_ERR_XMIT_TIMEDOUT:
1424	case CPL_ERR_PERSIST_TIMEDOUT:
1425	case CPL_ERR_FINWAIT2_TIMEDOUT:
1426	case CPL_ERR_KEEPALIVE_TIMEDOUT:
1427		return (ETIMEDOUT);
1428	default:
1429		return (EIO);
1430	}
1431}
1432
1433/*
1434 * Returns whether an ABORT_REQ_RSS message is a negative advice.
1435 */
1436static inline int
1437is_neg_adv_abort(unsigned int status)
1438{
1439	return status == CPL_ERR_RTX_NEG_ADVICE ||
1440	    status == CPL_ERR_PERSIST_NEG_ADVICE;
1441}
1442
1443void
1444send_abort_rpl(struct toedev *tod, int tid, int qset)
1445{
1446	struct mbuf *reply;
1447	struct cpl_abort_rpl *rpl;
1448	struct adapter *sc = tod->tod_softc;
1449
1450	reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl);
1451	if (!reply)
1452		CXGB_UNIMPLEMENTED();
1453
1454	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
1455	rpl->wr.wrh_lo = htonl(V_WR_TID(tid));
1456	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
1457	rpl->cmd = CPL_ABORT_NO_RST;
1458
1459	t3_offload_tx(sc, reply);
1460}
1461
1462/*
1463 * Handle an ABORT_REQ_RSS CPL message.  If we're waiting for an ABORT_RPL we
1464 * ignore this request except that we need to reply to it.
1465 */
1466static int
1467do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1468{
1469	struct adapter *sc = qs->adap;
1470	struct tom_data *td = sc->tom_softc;
1471	struct toedev *tod = &td->tod;
1472	const struct cpl_abort_req_rss *req = mtod(m, void *);
1473	unsigned int tid = GET_TID(req);
1474	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1475	struct inpcb *inp;
1476	struct tcpcb *tp;
1477	struct socket *so;
1478	int qset = toep->tp_qset;
1479
1480	if (is_neg_adv_abort(req->status)) {
1481		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
1482		    __func__, req->status, tid, toep->tp_flags);
1483		m_freem(m);
1484		return (0);
1485	}
1486
1487	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
1488		return (do_abort_req_synqe(qs, r, m));
1489
1490	inp = toep->tp_inp;
1491	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
1492	INP_WLOCK(inp);
1493
1494	tp = intotcpcb(inp);
1495	so = inp->inp_socket;
1496
1497	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
1498	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
1499	    req->status);
1500
1501	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
1502		toep->tp_flags |= TP_ABORT_REQ_RCVD;
1503		toep->tp_flags |= TP_ABORT_SHUTDOWN;
1504		INP_WUNLOCK(inp);
1505		INP_INFO_RUNLOCK(&V_tcbinfo);
1506		m_freem(m);
1507		return (0);
1508	}
1509	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
1510
1511	/*
1512	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
1513	 * the T3's reply to our reset instead.
1514	 */
1515	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
1516		toep->tp_flags |= TP_ABORT_RPL_SENT;
1517		INP_WUNLOCK(inp);
1518	} else {
1519		so_error_set(so, abort_status_to_errno(tp, req->status));
1520		tp = tcp_close(tp);
1521		if (tp == NULL)
1522			INP_WLOCK(inp);	/* re-acquire */
1523		toepcb_release(toep);	/* no more CPLs expected */
1524	}
1525	INP_INFO_RUNLOCK(&V_tcbinfo);
1526
1527	send_abort_rpl(tod, tid, qset);
1528	m_freem(m);
1529	return (0);
1530}
1531
/*
 * Apply the TCP options negotiated by the hardware (packed into the 16-bit
 * tcp_opt field of the establish CPL) to the tcpcb: MSS, timestamps, SACK,
 * and window scaling.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = toep->tp_tod->tod_softc;

	/* MSS is the negotiated MTU minus 40 (fixed IP + TCP headers). */
	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;

	if (G_TCPOPT_TSTAMP(tcpopt)) {
		tp->t_flags |= TF_RCVD_TSTMP;
		tp->t_flags |= TF_REQ_TSTMP;	/* forcibly set */
		tp->ts_recent = 0;		/* XXX */
		tp->ts_recent_age = tcp_ts_getticks();
	}

	if (G_TCPOPT_SACK(tcpopt))
		tp->t_flags |= TF_SACK_PERMIT;
	else
		tp->t_flags &= ~TF_SACK_PERMIT;

	if (G_TCPOPT_WSCALE_OK(tcpopt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Window scaling takes effect only if both sides requested it. */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
	}

}
1562
1563/*
1564 * The ISS and IRS are from after the exchange of SYNs and are off by 1.
1565 */
1566void
1567make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
1568    uint16_t cpl_tcpopt)
1569{
1570	struct inpcb *inp = sotoinpcb(so);
1571	struct tcpcb *tp = intotcpcb(inp);
1572	struct toepcb *toep = tp->t_toe;
1573	long bufsize;
1574	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
1575	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
1576	uint16_t tcpopt = be16toh(cpl_tcpopt);
1577
1578	INP_WLOCK_ASSERT(inp);
1579
1580	tp->t_state = TCPS_ESTABLISHED;
1581	tp->t_starttime = ticks;
1582	TCPSTAT_INC(tcps_connects);
1583
1584	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
1585	    toep->tp_tid, toep, inp);
1586
1587	tp->irs = irs;
1588	tcp_rcvseqinit(tp);
1589	tp->rcv_wnd = toep->tp_rx_credits << 10;
1590	tp->rcv_adv += tp->rcv_wnd;
1591	tp->last_ack_sent = tp->rcv_nxt;
1592
1593	/*
1594	 * If we were unable to send all rx credits via opt0, save the remainder
1595	 * in rx_credits so that they can be handed over with the next credit
1596	 * update.
1597	 */
1598	SOCKBUF_LOCK(&so->so_rcv);
1599	bufsize = select_rcv_wnd(so);
1600	SOCKBUF_UNLOCK(&so->so_rcv);
1601	toep->tp_rx_credits = bufsize - tp->rcv_wnd;
1602
1603	tp->iss = iss;
1604	tcp_sendseqinit(tp);
1605	tp->snd_una = iss + 1;
1606	tp->snd_nxt = iss + 1;
1607	tp->snd_max = iss + 1;
1608
1609	assign_rxopt(tp, tcpopt);
1610	soisconnected(so);
1611}
1612
1613/*
1614 * Fill in the right TID for CPL messages waiting in the out-of-order queue
1615 * and send them to the TOE.
1616 */
1617static void
1618fixup_and_send_ofo(struct toepcb *toep)
1619{
1620	struct mbuf *m;
1621	struct toedev *tod = toep->tp_tod;
1622	struct adapter *sc = tod->tod_softc;
1623	unsigned int tid = toep->tp_tid;
1624
1625	inp_lock_assert(toep->tp_inp);
1626
1627	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
1628		struct ofld_hdr *oh = mtod(m, void *);
1629		/*
1630		 * A variety of messages can be waiting but the fields we'll
1631		 * be touching are common to all so any message type will do.
1632		 */
1633		struct cpl_close_con_req *p = (void *)(oh + 1);
1634
1635		p->wr.wrh_lo = htonl(V_WR_TID(tid));
1636		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
1637		t3_offload_tx(sc, m);
1638	}
1639}
1640
1641/*
1642 * Process a CPL_ACT_ESTABLISH message.
1643 */
1644static int
1645do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1646{
1647	struct adapter *sc = qs->adap;
1648	struct tom_data *td = sc->tom_softc;
1649	struct cpl_act_establish *req = mtod(m, void *);
1650	unsigned int tid = GET_TID(req);
1651	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
1652	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
1653	struct inpcb *inp = toep->tp_inp;
1654	struct tcpcb *tp;
1655	struct socket *so;
1656
1657	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);
1658
1659	free_atid(&td->tid_maps, atid);
1660
1661	INP_WLOCK(inp);
1662	tp = intotcpcb(inp);
1663
1664	KASSERT(toep->tp_qset == qs->idx,
1665	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
1666	KASSERT(toep->tp_tid == atid,
1667	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));
1668
1669	toep->tp_tid = tid;
1670	insert_tid(td, toep, tid);
1671
1672	if (inp->inp_flags & INP_DROPPED) {
1673		/* socket closed by the kernel before hw told us it connected */
1674		send_reset(toep);
1675		goto done;
1676	}
1677
1678	KASSERT(tp->t_state == TCPS_SYN_SENT,
1679	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));
1680
1681	so = inp->inp_socket;
1682	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);
1683
1684	/*
1685	 * Now that we finally have a TID send any CPL messages that we had to
1686	 * defer for lack of a TID.
1687	 */
1688	if (mbufq_len(&toep->out_of_order_queue))
1689		fixup_and_send_ofo(toep);
1690
1691done:
1692	INP_WUNLOCK(inp);
1693	m_freem(m);
1694	return (0);
1695}
1696
1697/*
1698 * Process an acknowledgment of WR completion.  Advance snd_una and send the
1699 * next batch of work requests from the write queue.
1700 */
1701static void
1702wr_ack(struct toepcb *toep, struct mbuf *m)
1703{
1704	struct inpcb *inp = toep->tp_inp;
1705	struct tcpcb *tp;
1706	struct cpl_wr_ack *hdr = mtod(m, void *);
1707	struct socket *so;
1708	unsigned int credits = ntohs(hdr->credits);
1709	u32 snd_una = ntohl(hdr->snd_una);
1710	int bytes = 0;
1711	struct sockbuf *snd;
1712	struct mbuf *p;
1713	struct ofld_hdr *oh;
1714
1715	inp_wlock(inp);
1716	tp = intotcpcb(inp);
1717	so = inp->inp_socket;
1718	toep->tp_wr_avail += credits;
1719	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
1720		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
1721
1722	while (credits) {
1723		p = peek_wr(toep);
1724
1725		if (__predict_false(!p)) {
1726			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
1727			    "tid %u, state %u, wr_avail %u", __func__, credits,
1728			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);
1729
1730			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
1731			    "nothing pending, state %u wr_avail=%u\n",
1732			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
1733			break;
1734		}
1735
1736		oh = mtod(p, struct ofld_hdr *);
1737
1738		KASSERT(credits >= G_HDR_NDESC(oh->flags),
1739		    ("%s: partial credits?  %d %d", __func__, credits,
1740		    G_HDR_NDESC(oh->flags)));
1741
1742		dequeue_wr(toep);
1743		credits -= G_HDR_NDESC(oh->flags);
1744		bytes += oh->plen;
1745
1746		if (oh->flags & F_HDR_SGL)
1747			sglist_free(oh->sgl);
1748		m_freem(p);
1749	}
1750
1751	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
1752		goto out_free;
1753
1754	if (tp->snd_una != snd_una) {
1755		tp->snd_una = snd_una;
1756		tp->ts_recent_age = tcp_ts_getticks();
1757		if (tp->snd_una == tp->snd_nxt)
1758			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
1759	}
1760
1761	snd = so_sockbuf_snd(so);
1762	if (bytes) {
1763		SOCKBUF_LOCK(snd);
1764		sbdrop_locked(snd, bytes);
1765		so_sowwakeup_locked(so);
1766	}
1767
1768	if (snd->sb_sndptroff < sbused(snd))
1769		t3_push_frames(so, 0);
1770
1771out_free:
1772	inp_wunlock(tp->t_inpcb);
1773	m_freem(m);
1774}
1775
1776/*
1777 * Handler for TX_DATA_ACK CPL messages.
1778 */
1779static int
1780do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1781{
1782	struct adapter *sc = qs->adap;
1783	struct tom_data *td = sc->tom_softc;
1784	struct cpl_wr_ack *hdr = mtod(m, void *);
1785	unsigned int tid = GET_TID(hdr);
1786	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1787
1788	/* XXX bad race */
1789	if (toep)
1790		wr_ack(toep, m);
1791
1792	return (0);
1793}
1794
/*
 * Register the CPL message handlers defined in this file with the adapter.
 * Note that TX_DMA_ACK is serviced by the WR-ack handler.
 */
void
t3_init_cpl_io(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
}
1810#endif
1811