1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 330303 2018-03-03 00:54:12Z jhb $");
29
30#include "opt_inet.h"
31
32#ifdef TCP_OFFLOAD
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sockbuf.h>
47#include <sys/sysctl.h>
48#include <sys/syslog.h>
49#include <sys/protosw.h>
50#include <sys/priv.h>
51#include <sys/sglist.h>
52#include <sys/taskqueue.h>
53
54#include <net/if.h>
55#include <net/ethernet.h>
56#include <net/route.h>
57
58#include <netinet/in.h>
59#include <netinet/in_pcb.h>
60#include <netinet/in_systm.h>
61#include <netinet/in_var.h>
62
63#include <netinet/ip.h>
64#include <netinet/tcp_var.h>
65#define TCPSTATES
66#include <netinet/tcp_fsm.h>
67#include <netinet/toecore.h>
68#include <netinet/tcp_seq.h>
69#include <netinet/tcp_timer.h>
70#include <net/route.h>
71
72#include "cxgb_include.h"
73#include "ulp/tom/cxgb_l2t.h"
74#include "ulp/tom/cxgb_tom.h"
75#include "ulp/tom/cxgb_toepcb.h"
76
77VNET_DECLARE(int, tcp_do_autosndbuf);
78#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
79VNET_DECLARE(int, tcp_autosndbuf_inc);
80#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
81VNET_DECLARE(int, tcp_autosndbuf_max);
82#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
83VNET_DECLARE(int, tcp_do_autorcvbuf);
84#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
85VNET_DECLARE(int, tcp_autorcvbuf_inc);
86#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
87VNET_DECLARE(int, tcp_autorcvbuf_max);
88#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
89
90/*
91 * For ULP connections HW may add headers, e.g., for digests, that aren't part
92 * of the messages sent by the host but that are part of the TCP payload and
93 * therefore consume TCP sequence space.  Tx connection parameters that
94 * operate in TCP sequence space are affected by the HW additions and need to
95 * compensate for them to accurately track TCP sequence numbers. This array
96 * contains the compensating extra lengths for ULP packets.  It is indexed by
97 * a packet's ULP submode.
98 */
99const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
100
101/*
102 * Max receive window supported by HW in bytes.  Only a small part of it can
103 * be set through option0, the rest needs to be set through RX_DATA_ACK.
104 */
105#define MAX_RCV_WND ((1U << 27) - 1)
106
107/*
108 * Min receive window.  We want it to be large enough to accommodate receive
109 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
110 */
111#define MIN_RCV_WND (24 * 1024U)
112#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
113
114static void t3_release_offload_resources(struct toepcb *);
115static void send_reset(struct toepcb *toep);
116
117/*
118 * Called after the last CPL for the toepcb has been received.
119 *
120 * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
121 * time this function exits.
122 */
123static int
124toepcb_release(struct toepcb *toep)
125{
126	struct inpcb *inp = toep->tp_inp;
127	struct toedev *tod = toep->tp_tod;
128	struct tom_data *td = t3_tomdata(tod);
129	int rc;
130
131	INP_WLOCK_ASSERT(inp);
132	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
133	    ("%s: double release?", __func__));
134
135	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);
136
137	toep->tp_flags |= TP_CPL_DONE;
138	toep->tp_inp = NULL;
139
140	mtx_lock(&td->toep_list_lock);
141	TAILQ_REMOVE(&td->toep_list, toep, link);
142	mtx_unlock(&td->toep_list_lock);
143
144	if (!(toep->tp_flags & TP_ATTACHED))
145		t3_release_offload_resources(toep);
146
147	rc = in_pcbrele_wlocked(inp);
148	if (!rc)
149		INP_WUNLOCK(inp);
150	return (rc);
151}
152
153/*
154 * One sided detach.  The tcpcb is going away and we need to unhook the toepcb
155 * hanging off it.  If the TOE driver is also done with the toepcb we'll release
156 * all offload resources.
157 */
158static void
159toepcb_detach(struct inpcb *inp)
160{
161	struct toepcb *toep;
162	struct tcpcb *tp;
163
164	KASSERT(inp, ("%s: inp is NULL", __func__));
165	INP_WLOCK_ASSERT(inp);
166
167	tp = intotcpcb(inp);
168	toep = tp->t_toe;
169
170	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
171	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));
172
173	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
174	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
175	    toep, inp, tp);
176
177	tp->t_toe = NULL;
178	tp->t_flags &= ~TF_TOE;
179	toep->tp_flags &= ~TP_ATTACHED;
180
181	if (toep->tp_flags & TP_CPL_DONE)
182		t3_release_offload_resources(toep);
183}
184
185void
186t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
187{
188
189	toepcb_detach(tp->t_inpcb);
190}
191
192static int
193alloc_atid(struct tid_info *t, void *ctx)
194{
195	int atid = -1;
196
197	mtx_lock(&t->atid_lock);
198	if (t->afree) {
199		union active_open_entry *p = t->afree;
200
201		atid = (p - t->atid_tab) + t->atid_base;
202		t->afree = p->next;
203		p->ctx = ctx;
204		t->atids_in_use++;
205	}
206	mtx_unlock(&t->atid_lock);
207
208	return (atid);
209}
210
211static void
212free_atid(struct tid_info *t, int atid)
213{
214	union active_open_entry *p = atid2entry(t, atid);
215
216	mtx_lock(&t->atid_lock);
217	p->next = t->afree;
218	t->afree = p;
219	t->atids_in_use--;
220	mtx_unlock(&t->atid_lock);
221}
222
223void
224insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
225{
226	struct tid_info *t = &td->tid_maps;
227
228	t->tid_tab[tid] = ctx;
229	atomic_add_int(&t->tids_in_use, 1);
230}
231
232void
233update_tid(struct tom_data *td, void *ctx, unsigned int tid)
234{
235	struct tid_info *t = &td->tid_maps;
236
237	t->tid_tab[tid] = ctx;
238}
239
240void
241remove_tid(struct tom_data *td, unsigned int tid)
242{
243	struct tid_info *t = &td->tid_maps;
244
245	t->tid_tab[tid] = NULL;
246	atomic_add_int(&t->tids_in_use, -1);
247}
248
/*
 * Defer the release of a TID, e.g. because no mbuf was available for the
 * TID_RELEASE CPL.  The tid_tab slot of the TID is reused as the "next"
 * pointer of a singly linked list of pending releases, drained later by
 * t3_process_tid_release_list().
 */
void
queue_tid_release(struct toedev *tod, unsigned int tid)
{
	struct tom_data *td = t3_tomdata(tod);
	void **p = &td->tid_maps.tid_tab[tid];
	struct adapter *sc = tod->tod_softc;

	mtx_lock(&td->tid_release_lock);
	/* Push this TID's slot onto the head of the pending list. */
	*p = td->tid_release_list;
	td->tid_release_list = p;
	/*
	 * *p is the old list head, so it is NULL exactly when the list was
	 * empty: schedule the task only on that transition.
	 */
	if (!*p)
		taskqueue_enqueue(sc->tq, &td->tid_release_task);
	mtx_unlock(&td->tid_release_lock);
}
264
265/*
266 * Populate a TID_RELEASE WR.
267 */
268static inline void
269mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
270{
271
272	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
273	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
274}
275
/*
 * Return a TID to the hardware: send a TID_RELEASE CPL on the given qset and
 * clear the software map entry.  If no mbuf is available the release is
 * queued for the tid-release task instead (which uses qset 0).
 */
void
release_tid(struct toedev *tod, unsigned int tid, int qset)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
	struct cpl_tid_release *cpl;
#ifdef INVARIANTS
	struct tid_info *t = &td->tid_maps;
#endif

	/* NOTE(review): tid is unsigned, so the >= 0 half is vacuous. */
	KASSERT(tid >= 0 && tid < t->ntids,
	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m) {
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
	} else
		queue_tid_release(tod, tid);

}
299
/*
 * Taskqueue handler that drains the deferred TID release list built by
 * queue_tid_release().  Each list node is the TID's own tid_tab slot, so the
 * TID is recovered from the slot's offset within tid_tab.
 */
void
t3_process_tid_release_list(void *data, int pending)
{
	struct mbuf *m;
	struct tom_data *td = data;
	struct adapter *sc = td->tod.tod_softc;

	mtx_lock(&td->tid_release_lock);
	while (td->tid_release_list) {
		void **p = td->tid_release_list;
		unsigned int tid = p - td->tid_maps.tid_tab;
		struct cpl_tid_release *cpl;

		/* Unlink the head before dropping the lock below. */
		td->tid_release_list = (void **)*p;
		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
		if (m == NULL)
			break;	/* XXX: who reschedules the release task? */
		/* NOTE(review): on this break the unlinked TID is never
		 * released and nothing requeues the task -- the TID leaks
		 * under mbuf exhaustion; confirm against newer TOM code. */
		mtx_unlock(&td->tid_release_lock);
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
		mtx_lock(&td->tid_release_lock);
	}
	mtx_unlock(&td->tid_release_lock);
}
325
/*
 * Send a CLOSE_CON_REQ (FIN) for the connection, at most once
 * (TP_FIN_SENT guards against duplicates).
 */
static void
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;

	if (toep->tp_flags & TP_FIN_SENT)
		return;

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();	/* no recovery path for alloc failure */

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
	req->rsvd = 0;

	toep->tp_flags |= TP_FIN_SENT;
	t3_offload_tx(sc, m);
}
347
348static inline void
349make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
350    struct mbuf *tail)
351{
352	struct tcpcb *tp = so_sototcpcb(so);
353	struct toepcb *toep = tp->t_toe;
354	struct sockbuf *snd;
355
356	inp_lock_assert(tp->t_inpcb);
357	snd = so_sockbuf_snd(so);
358
359	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
360	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
361	/* len includes the length of any HW ULP additions */
362	req->len = htonl(len);
363	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
364	/* V_TX_ULP_SUBMODE sets both the mode and submode */
365	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
366	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
367	req->sndseq = htonl(tp->snd_nxt);
368	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
369		struct adapter *sc = toep->tp_tod->tod_softc;
370		int cpu_idx = sc->rrss_map[toep->tp_qset];
371
372		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
373		    V_TX_CPU_IDX(cpu_idx));
374
375		/* Sendbuffer is in units of 32KB. */
376		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
377			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
378		else
379			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
380
381		toep->tp_flags |= TP_DATASENT;
382	}
383}
384
385/*
386 * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
387 * TOM_XXX_MOVE to some common header file.
388 */
389/*
390 * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
391 * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
392 * for the second gen bit flit.  This leaves us with 12 flits.
393 *
394 * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
395 * The first desc has a tx_data_wr (which includes the WR header), the rest have
396 * the WR header only.  All descs have the second gen bit flit.
397 *
398 * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
399 * desc has a tx_data_wr (which includes the WR header), the rest have the WR
400 * header only.  All descs have the second gen bit flit.
401 *
402 * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
403 *
404 */
405#define IMM_LEN 96
406static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
407static int sgllen_to_descs[TX_MAX_SEGS] = {
408	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/*  0 -  9 */
409	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
410	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
411	4, 4, 4, 4, 4, 4		/* 30 - 35 */
412};
413#if 0
414static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
415	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
416};
417#endif
418#if SGE_NUM_GENBITS != 2
419#error "SGE_NUM_GENBITS really must be 2"
420#endif
421
422int
423t3_push_frames(struct socket *so, int req_completion)
424{
425	struct tcpcb *tp = so_sototcpcb(so);
426	struct toepcb *toep = tp->t_toe;
427	struct mbuf *m0, *sndptr, *m;
428	struct toedev *tod = toep->tp_tod;
429	struct adapter *sc = tod->tod_softc;
430	int bytes, ndesc, total_bytes = 0, mlen;
431	struct sockbuf *snd;
432	struct sglist *sgl;
433	struct ofld_hdr *oh;
434	caddr_t dst;
435	struct tx_data_wr *wr;
436
437	inp_lock_assert(tp->t_inpcb);
438
439	snd = so_sockbuf_snd(so);
440	SOCKBUF_LOCK(snd);
441
442	/*
443	 * Autosize the send buffer.
444	 */
445	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
446		if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) &&
447		    snd->sb_cc < VNET(tcp_autosndbuf_max)) {
448			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
449			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
450			    so, curthread))
451				snd->sb_flags &= ~SB_AUTOSIZE;
452		}
453	}
454
455	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
456		sndptr = toep->tp_m_last->m_next;
457	else
458		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
459
460	/* Nothing to send or no WRs available for sending data */
461	if (toep->tp_wr_avail == 0 || sndptr == NULL)
462		goto out;
463
464	/* Something to send and at least 1 WR available */
465	while (toep->tp_wr_avail && sndptr != NULL) {
466
467		m0 = m_gethdr(M_NOWAIT, MT_DATA);
468		if (m0 == NULL)
469			break;
470		oh = mtod(m0, struct ofld_hdr *);
471		wr = (void *)(oh + 1);
472		dst = (void *)(wr + 1);
473
474		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
475		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
476		    V_HDR_QSET(toep->tp_qset);
477
478		/*
479		 * Try to construct an immediate data WR if possible.  Stuff as
480		 * much data into it as possible, one whole mbuf at a time.
481		 */
482		mlen = sndptr->m_len;
483		ndesc = bytes = 0;
484		while (mlen <= IMM_LEN - bytes) {
485			bcopy(sndptr->m_data, dst, mlen);
486			bytes += mlen;
487			dst += mlen;
488
489			if (!(sndptr = sndptr->m_next))
490				break;
491			mlen = sndptr->m_len;
492		}
493
494		if (bytes) {
495
496			/* Was able to fit 'bytes' bytes in an immediate WR */
497
498			ndesc = 1;
499			make_tx_data_wr(so, wr, bytes, sndptr);
500
501			m0->m_len += bytes;
502			m0->m_pkthdr.len = m0->m_len;
503
504		} else {
505			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);
506
507			/* Need to make an SGL */
508
509			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
510			if (sgl == NULL)
511				break;
512
513			for (m = sndptr; m != NULL; m = m->m_next) {
514				if ((mlen = m->m_len) > 0) {
515					if (sglist_append(sgl, m->m_data, mlen))
516					    break;
517				}
518				bytes += mlen;
519			}
520			sndptr = m;
521			if (bytes == 0) {
522				sglist_free(sgl);
523				break;
524			}
525			ndesc = sgllen_to_descs[sgl->sg_nseg];
526			oh->flags |= F_HDR_SGL;
527			oh->sgl = sgl;
528			make_tx_data_wr(so, wr, bytes, sndptr);
529		}
530
531		oh->flags |= V_HDR_NDESC(ndesc);
532		oh->plen = bytes;
533
534		snd->sb_sndptr = sndptr;
535		snd->sb_sndptroff += bytes;
536		if (sndptr == NULL) {
537			snd->sb_sndptr = snd->sb_mbtail;
538			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
539			toep->tp_m_last = snd->sb_mbtail;
540		} else
541			toep->tp_m_last = NULL;
542
543		total_bytes += bytes;
544
545		toep->tp_wr_avail -= ndesc;
546		toep->tp_wr_unacked += ndesc;
547
548		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
549		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
550			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
551			toep->tp_wr_unacked = 0;
552		}
553
554		enqueue_wr(toep, m0);
555		l2t_send(sc, m0, toep->tp_l2t);
556	}
557out:
558	SOCKBUF_UNLOCK(snd);
559
560	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
561		close_conn(sc, toep);
562
563	return (total_bytes);
564}
565
566static int
567send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
568{
569	struct mbuf *m;
570	struct cpl_rx_data_ack *req;
571	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
572
573	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
574	if (m == NULL)
575		return (0);
576
577	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
578	req->wr.wrh_lo = 0;
579	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
580	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
581	t3_offload_tx(sc, m);
582	return (credits);
583}
584
/*
 * TOE driver hook, called when the application has read data out of the
 * socket's receive buffer.  Accumulates the drained bytes as rx credits and
 * returns them to the hardware (opening the rx window) once enough have
 * built up.
 */
void
t3_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *so_rcv = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int must_send;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(so_rcv);
	KASSERT(toep->tp_enqueued >= so_rcv->sb_cc,
	    ("%s: so_rcv->sb_cc > enqueued", __func__));
	/* Bytes drained since the last call become pending rx credits. */
	toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc;
	toep->tp_enqueued = so_rcv->sb_cc;
	SOCKBUF_UNLOCK(so_rcv);

	/* Force an update when the window is nearly exhausted. */
	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
	if (must_send || toep->tp_rx_credits >= 15 * 1024) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
		toep->tp_rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
}
614
615static int
616do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
617{
618	struct adapter *sc = qs->adap;
619	struct tom_data *td = sc->tom_softc;
620	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
621	unsigned int tid = GET_TID(hdr);
622	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
623
624	log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp);
625
626	m_freem(m);
627	return (0);
628}
629
/*
 * TOE driver hook: send a FIN.  Marks the connection TP_SEND_FIN and pushes
 * any remaining send-buffer data; close_conn() issues the actual
 * CLOSE_CON_REQ once the buffer drains (see t3_push_frames()).
 */
int
t3_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp_inpcbtosocket(inp);
#if defined(KTR)
	unsigned int tid = toep->tp_tid;	/* only used by CTR4 below */
#endif

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
	    toep->tp_flags);

	toep->tp_flags |= TP_SEND_FIN;
	t3_push_frames(so, 1);

	return (0);
}
651
652int
653t3_tod_output(struct toedev *tod, struct tcpcb *tp)
654{
655	struct inpcb *inp = tp->t_inpcb;
656	struct socket *so = inp->inp_socket;
657
658	t3_push_frames(so, 1);
659	return (0);
660}
661
662/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */
663int
664find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
665{
666	unsigned short *mtus = &sc->params.mtus[0];
667	int i = 0, mss;
668
669	KASSERT(inc != NULL || pmss > 0,
670	    ("%s: at least one of inc/pmss must be specified", __func__));
671
672	mss = inc ? tcp_mssopt(inc) : pmss;
673	if (pmss > 0 && mss > pmss)
674		mss = pmss;
675
676	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
677		++i;
678
679	return (i);
680}
681
682static inline void
683purge_wr_queue(struct toepcb *toep)
684{
685	struct mbuf *m;
686	struct ofld_hdr *oh;
687
688	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) {
689		oh = mtod(m, struct ofld_hdr *);
690		if (oh->flags & F_HDR_SGL)
691			sglist_free(oh->sgl);
692		m_freem(m);
693	}
694}
695
696/*
697 * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T
698 * entry, etc.)
699 */
700static void
701t3_release_offload_resources(struct toepcb *toep)
702{
703	struct toedev *tod = toep->tp_tod;
704	struct tom_data *td = t3_tomdata(tod);
705
706	/*
707	 * The TOM explicitly detaches its toepcb from the system's inp before
708	 * it releases the offload resources.
709	 */
710	if (toep->tp_inp) {
711		panic("%s: inp %p still attached to toepcb %p",
712		    __func__, toep->tp_inp, toep);
713	}
714
715	if (toep->tp_wr_avail != toep->tp_wr_max)
716		purge_wr_queue(toep);
717
718	if (toep->tp_l2t) {
719		l2t_release(td->l2t, toep->tp_l2t);
720		toep->tp_l2t = NULL;
721	}
722
723	if (toep->tp_tid >= 0)
724		release_tid(tod, toep->tp_tid, toep->tp_qset);
725
726	toepcb_free(toep);
727}
728
729/*
730 * Determine the receive window size for a socket.
731 */
732unsigned long
733select_rcv_wnd(struct socket *so)
734{
735	unsigned long wnd;
736
737	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
738
739	wnd = sbspace(&so->so_rcv);
740	if (wnd < MIN_RCV_WND)
741		wnd = MIN_RCV_WND;
742
743	return min(wnd, MAX_RCV_WND);
744}
745
746int
747select_rcv_wscale(void)
748{
749	int wscale = 0;
750	unsigned long space = sb_max;
751
752	if (space > MAX_RCV_WND)
753		space = MAX_RCV_WND;
754
755	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
756		wscale++;
757
758	return (wscale);
759}
760
761
762/*
763 * Set up the socket for TCP offload.
764 */
765void
766offload_socket(struct socket *so, struct toepcb *toep)
767{
768	struct toedev *tod = toep->tp_tod;
769	struct tom_data *td = t3_tomdata(tod);
770	struct inpcb *inp = sotoinpcb(so);
771	struct tcpcb *tp = intotcpcb(inp);
772
773	INP_WLOCK_ASSERT(inp);
774
775	/* Update socket */
776	SOCKBUF_LOCK(&so->so_snd);
777	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
778	SOCKBUF_UNLOCK(&so->so_snd);
779	SOCKBUF_LOCK(&so->so_rcv);
780	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
781	SOCKBUF_UNLOCK(&so->so_rcv);
782
783	/* Update TCP PCB */
784	tp->tod = toep->tp_tod;
785	tp->t_toe = toep;
786	tp->t_flags |= TF_TOE;
787
788	/* Install an extra hold on inp */
789	toep->tp_inp = inp;
790	toep->tp_flags |= TP_ATTACHED;
791	in_pcbref(inp);
792
793	/* Add the TOE PCB to the active list */
794	mtx_lock(&td->toep_list_lock);
795	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
796	mtx_unlock(&td->toep_list_lock);
797}
798
/*
 * This is _not_ the normal way to "unoffload" a socket.  It reverses
 * offload_socket() exactly, and is only for backing out of a failed
 * t3_connect() before any CPLs are in flight.
 */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	INP_WLOCK_ASSERT(inp);

	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->tp_inp = NULL;
	toep->tp_flags &= ~TP_ATTACHED;
	/* This drop must not be the last reference: the socket still exists. */
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}
827
828/*
829 * Socket could be a listening socket, and we may not have a toepcb at all at
830 * this time.
831 */
832uint32_t
833calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
834{
835	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
836	    V_MSS_IDX(mtu_idx);
837
838	if (so != NULL) {
839		struct inpcb *inp = sotoinpcb(so);
840		struct tcpcb *tp = intotcpcb(inp);
841		int keepalive = tcp_always_keepalive ||
842		    so_options_get(so) & SO_KEEPALIVE;
843
844		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
845		opt0h |= V_KEEP_ALIVE(keepalive != 0);
846	}
847
848	if (e != NULL)
849		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);
850
851	return (htobe32(opt0h));
852}
853
854uint32_t
855calc_opt0l(struct socket *so, int rcv_bufsize)
856{
857	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);
858
859	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
860	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));
861
862	if (so != NULL)		/* optional because noone cares about IP TOS */
863		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));
864
865	return (htobe32(opt0l));
866}
867
868/*
869 * Convert an ACT_OPEN_RPL status to an errno.
870 */
871static int
872act_open_rpl_status_to_errno(int status)
873{
874	switch (status) {
875	case CPL_ERR_CONN_RESET:
876		return (ECONNREFUSED);
877	case CPL_ERR_ARP_MISS:
878		return (EHOSTUNREACH);
879	case CPL_ERR_CONN_TIMEDOUT:
880		return (ETIMEDOUT);
881	case CPL_ERR_TCAM_FULL:
882		return (EAGAIN);
883	case CPL_ERR_CONN_EXIST:
884		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
885		return (EAGAIN);
886	default:
887		return (EIO);
888	}
889}
890
891/*
892 * Return whether a failed active open has allocated a TID
893 */
894static inline int
895act_open_has_tid(int status)
896{
897	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
898	       status != CPL_ERR_ARP_MISS;
899}
900
901/*
902 * Active open failed.
903 */
904static int
905do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
906{
907	struct adapter *sc = qs->adap;
908	struct tom_data *td = sc->tom_softc;
909	struct toedev *tod = &td->tod;
910	struct cpl_act_open_rpl *rpl = mtod(m, void *);
911	unsigned int atid = G_TID(ntohl(rpl->atid));
912	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
913	struct inpcb *inp = toep->tp_inp;
914	int s = rpl->status, rc;
915
916	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);
917
918	free_atid(&td->tid_maps, atid);
919	toep->tp_tid = -1;
920
921	if (act_open_has_tid(s))
922		queue_tid_release(tod, GET_TID(rpl));
923
924	rc = act_open_rpl_status_to_errno(s);
925	if (rc != EAGAIN)
926		INP_INFO_RLOCK(&V_tcbinfo);
927	INP_WLOCK(inp);
928	toe_connect_failed(tod, inp, rc);
929	toepcb_release(toep);	/* unlocks inp */
930	if (rc != EAGAIN)
931		INP_INFO_RUNLOCK(&V_tcbinfo);
932
933	m_freem(m);
934	return (0);
935}
936
937/*
938 * Send an active open request.
939 *
940 * State of affairs on entry:
941 * soisconnecting (so_state |= SS_ISCONNECTING)
942 * tcbinfo not locked (this has changed - used to be WLOCKed)
943 * inp WLOCKed
944 * tp->t_state = TCPS_SYN_SENT
945 * rtalloc1, RT_UNLOCK on rt.
946 */
947int
948t3_connect(struct toedev *tod, struct socket *so,
949    struct rtentry *rt, struct sockaddr *nam)
950{
951	struct mbuf *m = NULL;
952	struct l2t_entry *e = NULL;
953	struct tom_data *td = t3_tomdata(tod);
954	struct adapter *sc = tod->tod_softc;
955	struct cpl_act_open_req *cpl;
956	struct inpcb *inp = sotoinpcb(so);
957	struct tcpcb *tp = intotcpcb(inp);
958	struct toepcb *toep;
959	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
960	struct sockaddr *gw;
961	struct ifnet *ifp = rt->rt_ifp;
962	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */
963
964	INP_WLOCK_ASSERT(inp);
965
966	toep = toepcb_alloc(tod);
967	if (toep == NULL)
968		goto failed;
969
970	atid = alloc_atid(&td->tid_maps, toep);
971	if (atid < 0)
972		goto failed;
973
974	qset = pi->first_qset + (arc4random() % pi->nqsets);
975
976	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
977	if (m == NULL)
978		goto failed;
979
980	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
981	e = t3_l2t_get(pi, ifp, gw);
982	if (e == NULL)
983		goto failed;
984
985	toep->tp_l2t = e;
986	toep->tp_tid = atid;	/* used to double check response */
987	toep->tp_qset = qset;
988
989	SOCKBUF_LOCK(&so->so_rcv);
990	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
991	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
992	SOCKBUF_UNLOCK(&so->so_rcv);
993
994	offload_socket(so, toep);
995
996	/*
997	 * The kernel sets request_r_scale based on sb_max whereas we need to
998	 * take hardware's MAX_RCV_WND into account too.  This is normally a
999	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
1000	 */
1001	if (tp->t_flags & TF_REQ_SCALE)
1002		rscale = tp->request_r_scale = select_rcv_wscale();
1003	else
1004		rscale = 0;
1005	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
1006	cpu_idx = sc->rrss_map[qset];
1007
1008	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
1009	cpl->wr.wrh_lo = 0;
1010	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1011	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
1012	    &cpl->peer_port);
1013	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
1014	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
1015	cpl->params = 0;
1016	cpl->opt2 = calc_opt2(cpu_idx);
1017
1018	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
1019	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);
1020
1021	if (l2t_send(sc, m, e) == 0)
1022		return (0);
1023
1024	undo_offload_socket(so);
1025
1026failed:
1027	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
1028	    __func__, atid, toep, e, m);
1029
1030	if (atid >= 0)
1031		free_atid(&td->tid_maps, atid);
1032
1033	if (e)
1034		l2t_release(td->l2t, e);
1035
1036	if (toep)
1037		toepcb_free(toep);
1038
1039	m_freem(m);
1040
1041	return (ENOMEM);
1042}
1043
1044/*
1045 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
1046 * send multiple ABORT_REQs for the same connection and also that we do not try
1047 * to send a message after the connection has closed.
1048 */
1049static void
1050send_reset(struct toepcb *toep)
1051{
1052
1053	struct cpl_abort_req *req;
1054	unsigned int tid = toep->tp_tid;
1055	struct inpcb *inp = toep->tp_inp;
1056	struct socket *so = inp->inp_socket;
1057	struct tcpcb *tp = intotcpcb(inp);
1058	struct toedev *tod = toep->tp_tod;
1059	struct adapter *sc = tod->tod_softc;
1060	struct mbuf *m;
1061
1062	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1063	INP_WLOCK_ASSERT(inp);
1064
1065	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
1066	    toep->tp_flags);
1067
1068	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
1069		return;
1070
1071	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
1072
1073	/* Purge the send queue */
1074	sbflush(so_sockbuf_snd(so));
1075	purge_wr_queue(toep);
1076
1077	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
1078	if (m == NULL)
1079		CXGB_UNIMPLEMENTED();
1080
1081	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1082	req->wr.wrh_lo = htonl(V_WR_TID(tid));
1083	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1084	req->rsvd0 = htonl(tp->snd_nxt);
1085	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1086	req->cmd = CPL_ABORT_SEND_RST;
1087
1088	if (tp->t_state == TCPS_SYN_SENT)
1089		mbufq_tail(&toep->out_of_order_queue, m); /* defer */
1090	else
1091		l2t_send(sc, m, toep->tp_l2t);
1092}
1093
1094int
1095t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
1096{
1097
1098	send_reset(tp->t_toe);
1099	return (0);
1100}
1101
1102/*
1103 * Handler for RX_DATA CPL messages.
1104 */
1105static int
1106do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1107{
1108	struct adapter *sc = qs->adap;
1109	struct tom_data *td = sc->tom_softc;
1110	struct cpl_rx_data *hdr = mtod(m, void *);
1111	unsigned int tid = GET_TID(hdr);
1112	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1113	struct inpcb *inp = toep->tp_inp;
1114	struct tcpcb *tp;
1115	struct socket *so;
1116	struct sockbuf *so_rcv;
1117
1118	/* Advance over CPL */
1119	m_adj(m, sizeof(*hdr));
1120
1121	/* XXX: revisit.  This comes from the T4 TOM */
1122	if (__predict_false(inp == NULL)) {
1123		/*
1124		 * do_pass_establish failed and must be attempting to abort the
1125		 * connection.  Meanwhile, the T4 has sent us data for such a
1126		 * connection.
1127		 */
1128#ifdef notyet
1129		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
1130		    ("%s: inp NULL and tid isn't being aborted", __func__));
1131#endif
1132		m_freem(m);
1133		return (0);
1134	}
1135
1136	INP_WLOCK(inp);
1137	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
1138		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
1139		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
1140		INP_WUNLOCK(inp);
1141		m_freem(m);
1142		return (0);
1143	}
1144
1145	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
1146		toep->tp_delack_mode = hdr->dack_mode;
1147
1148	tp = intotcpcb(inp);
1149
1150#ifdef INVARIANTS
1151	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
1152		log(LOG_ERR,
1153		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
1154		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
1155	}
1156#endif
1157	tp->rcv_nxt += m->m_pkthdr.len;
1158	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
1159	    ("%s: negative window size", __func__));
1160	tp->rcv_wnd -= m->m_pkthdr.len;
1161	tp->t_rcvtime = ticks;
1162
1163	so  = inp->inp_socket;
1164	so_rcv = &so->so_rcv;
1165	SOCKBUF_LOCK(so_rcv);
1166
1167	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
1168		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
1169		    __func__, tid, m->m_pkthdr.len);
1170		SOCKBUF_UNLOCK(so_rcv);
1171		INP_WUNLOCK(inp);
1172
1173		INP_INFO_RLOCK(&V_tcbinfo);
1174		INP_WLOCK(inp);
1175		tp = tcp_drop(tp, ECONNRESET);
1176		if (tp)
1177			INP_WUNLOCK(inp);
1178		INP_INFO_RUNLOCK(&V_tcbinfo);
1179
1180		m_freem(m);
1181		return (0);
1182	}
1183
1184	/* receive buffer autosize */
1185	if (so_rcv->sb_flags & SB_AUTOSIZE &&
1186	    V_tcp_do_autorcvbuf &&
1187	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
1188	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
1189		unsigned int hiwat = so_rcv->sb_hiwat;
1190		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
1191		    V_tcp_autorcvbuf_max);
1192
1193		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
1194			so_rcv->sb_flags &= ~SB_AUTOSIZE;
1195		else
1196			toep->tp_rx_credits += newsize - hiwat;
1197	}
1198
1199	toep->tp_enqueued += m->m_pkthdr.len;
1200	sbappendstream_locked(so_rcv, m);
1201	sorwakeup_locked(so);
1202	SOCKBUF_UNLOCK_ASSERT(so_rcv);
1203
1204	INP_WUNLOCK(inp);
1205	return (0);
1206}
1207
1208/*
1209 * Handler for PEER_CLOSE CPL messages.
1210 */
1211static int
1212do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1213{
1214	struct adapter *sc = qs->adap;
1215	struct tom_data *td = sc->tom_softc;
1216	const struct cpl_peer_close *hdr = mtod(m, void *);
1217	unsigned int tid = GET_TID(hdr);
1218	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1219	struct inpcb *inp = toep->tp_inp;
1220	struct tcpcb *tp;
1221	struct socket *so;
1222
1223	INP_INFO_RLOCK(&V_tcbinfo);
1224	INP_WLOCK(inp);
1225	tp = intotcpcb(inp);
1226
1227	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
1228	    tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp);
1229
1230	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
1231		goto done;
1232
1233	so = inp_inpcbtosocket(inp);
1234
1235	socantrcvmore(so);
1236	tp->rcv_nxt++;
1237
1238	switch (tp->t_state) {
1239	case TCPS_SYN_RECEIVED:
1240		tp->t_starttime = ticks;
1241		/* FALLTHROUGH */
1242	case TCPS_ESTABLISHED:
1243		tp->t_state = TCPS_CLOSE_WAIT;
1244		break;
1245	case TCPS_FIN_WAIT_1:
1246		tp->t_state = TCPS_CLOSING;
1247		break;
1248	case TCPS_FIN_WAIT_2:
1249		tcp_twstart(tp);
1250		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1251		INP_INFO_RUNLOCK(&V_tcbinfo);
1252
1253		INP_WLOCK(inp);
1254		toepcb_release(toep);	/* no more CPLs expected */
1255
1256		m_freem(m);
1257		return (0);
1258	default:
1259		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
1260		    __func__, toep->tp_tid, tp->t_state);
1261	}
1262
1263done:
1264	INP_WUNLOCK(inp);
1265	INP_INFO_RUNLOCK(&V_tcbinfo);
1266
1267	m_freem(m);
1268	return (0);
1269}
1270
1271/*
1272 * Handler for CLOSE_CON_RPL CPL messages.  peer ACK to our FIN received.
1273 */
1274static int
1275do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1276{
1277	struct adapter *sc = qs->adap;
1278	struct tom_data *td = sc->tom_softc;
1279	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
1280	unsigned int tid = GET_TID(rpl);
1281	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1282	struct inpcb *inp = toep->tp_inp;
1283	struct tcpcb *tp;
1284	struct socket *so;
1285
1286	INP_INFO_RLOCK(&V_tcbinfo);
1287	INP_WLOCK(inp);
1288	tp = intotcpcb(inp);
1289
1290	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
1291	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);
1292
1293	if ((toep->tp_flags & TP_ABORT_RPL_PENDING))
1294		goto done;
1295
1296	so = inp_inpcbtosocket(inp);
1297	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
1298
1299	switch (tp->t_state) {
1300	case TCPS_CLOSING:
1301		tcp_twstart(tp);
1302release:
1303		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1304		INP_INFO_RUNLOCK(&V_tcbinfo);
1305
1306		INP_WLOCK(inp);
1307		toepcb_release(toep);	/* no more CPLs expected */
1308
1309		m_freem(m);
1310		return (0);
1311	case TCPS_LAST_ACK:
1312		if (tcp_close(tp))
1313			INP_WUNLOCK(inp);
1314		goto release;
1315
1316	case TCPS_FIN_WAIT_1:
1317		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1318			soisdisconnected(so);
1319		tp->t_state = TCPS_FIN_WAIT_2;
1320		break;
1321	default:
1322		log(LOG_ERR,
1323		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
1324		    __func__, toep->tp_tid, tp->t_state);
1325	}
1326
1327done:
1328	INP_WUNLOCK(inp);
1329	INP_INFO_RUNLOCK(&V_tcbinfo);
1330
1331	m_freem(m);
1332	return (0);
1333}
1334
1335static int
1336do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1337{
1338	struct cpl_smt_write_rpl *rpl = mtod(m, void *);
1339
1340	if (rpl->status != CPL_ERR_NONE) {
1341		log(LOG_ERR,
1342		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
1343		    rpl->status, GET_TID(rpl));
1344	}
1345
1346	m_freem(m);
1347	return (0);
1348}
1349
1350static int
1351do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1352{
1353	struct cpl_set_tcb_rpl *rpl = mtod(m, void *);
1354
1355	if (rpl->status != CPL_ERR_NONE) {
1356		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
1357		    rpl->status, GET_TID(rpl));
1358	}
1359
1360	m_freem(m);
1361	return (0);
1362}
1363
1364/*
1365 * Handle an ABORT_RPL_RSS CPL message.
1366 */
1367static int
1368do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1369{
1370	struct adapter *sc = qs->adap;
1371	struct tom_data *td = sc->tom_softc;
1372	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
1373	unsigned int tid = GET_TID(rpl);
1374	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1375	struct inpcb *inp;
1376
1377	/*
1378	 * Ignore replies to post-close aborts indicating that the abort was
1379	 * requested too late.  These connections are terminated when we get
1380	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
1381	 * arrives the TID is either no longer used or it has been recycled.
1382	 */
1383	if (rpl->status == CPL_ERR_ABORT_FAILED) {
1384		m_freem(m);
1385		return (0);
1386	}
1387
1388	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
1389		return (do_abort_rpl_synqe(qs, r, m));
1390
1391	CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep,
1392	    rpl->status);
1393
1394	inp = toep->tp_inp;
1395	INP_WLOCK(inp);
1396
1397	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
1398		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) {
1399			toep->tp_flags |= TP_ABORT_RPL_RCVD;
1400			INP_WUNLOCK(inp);
1401		} else {
1402			toep->tp_flags &= ~TP_ABORT_RPL_RCVD;
1403			toep->tp_flags &= TP_ABORT_RPL_PENDING;
1404			toepcb_release(toep);	/* no more CPLs expected */
1405		}
1406	}
1407
1408	m_freem(m);
1409	return (0);
1410}
1411
1412/*
1413 * Convert the status code of an ABORT_REQ into a FreeBSD error code.
1414 */
1415static int
1416abort_status_to_errno(struct tcpcb *tp, int abort_reason)
1417{
1418	switch (abort_reason) {
1419	case CPL_ERR_BAD_SYN:
1420	case CPL_ERR_CONN_RESET:
1421		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
1422	case CPL_ERR_XMIT_TIMEDOUT:
1423	case CPL_ERR_PERSIST_TIMEDOUT:
1424	case CPL_ERR_FINWAIT2_TIMEDOUT:
1425	case CPL_ERR_KEEPALIVE_TIMEDOUT:
1426		return (ETIMEDOUT);
1427	default:
1428		return (EIO);
1429	}
1430}
1431
1432/*
1433 * Returns whether an ABORT_REQ_RSS message is a negative advice.
1434 */
1435static inline int
1436is_neg_adv_abort(unsigned int status)
1437{
1438	return status == CPL_ERR_RTX_NEG_ADVICE ||
1439	    status == CPL_ERR_PERSIST_NEG_ADVICE;
1440}
1441
1442void
1443send_abort_rpl(struct toedev *tod, int tid, int qset)
1444{
1445	struct mbuf *reply;
1446	struct cpl_abort_rpl *rpl;
1447	struct adapter *sc = tod->tod_softc;
1448
1449	reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl);
1450	if (!reply)
1451		CXGB_UNIMPLEMENTED();
1452
1453	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
1454	rpl->wr.wrh_lo = htonl(V_WR_TID(tid));
1455	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
1456	rpl->cmd = CPL_ABORT_NO_RST;
1457
1458	t3_offload_tx(sc, reply);
1459}
1460
1461/*
1462 * Handle an ABORT_REQ_RSS CPL message.  If we're waiting for an ABORT_RPL we
1463 * ignore this request except that we need to reply to it.
1464 */
1465static int
1466do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1467{
1468	struct adapter *sc = qs->adap;
1469	struct tom_data *td = sc->tom_softc;
1470	struct toedev *tod = &td->tod;
1471	const struct cpl_abort_req_rss *req = mtod(m, void *);
1472	unsigned int tid = GET_TID(req);
1473	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1474	struct inpcb *inp;
1475	struct tcpcb *tp;
1476	struct socket *so;
1477	int qset = toep->tp_qset;
1478
1479	if (is_neg_adv_abort(req->status)) {
1480		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
1481		    __func__, req->status, tid, toep->tp_flags);
1482		m_freem(m);
1483		return (0);
1484	}
1485
1486	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
1487		return (do_abort_req_synqe(qs, r, m));
1488
1489	inp = toep->tp_inp;
1490	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
1491	INP_WLOCK(inp);
1492
1493	tp = intotcpcb(inp);
1494	so = inp->inp_socket;
1495
1496	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
1497	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
1498	    req->status);
1499
1500	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
1501		toep->tp_flags |= TP_ABORT_REQ_RCVD;
1502		toep->tp_flags |= TP_ABORT_SHUTDOWN;
1503		INP_WUNLOCK(inp);
1504		INP_INFO_RUNLOCK(&V_tcbinfo);
1505		m_freem(m);
1506		return (0);
1507	}
1508	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
1509
1510	/*
1511	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
1512	 * the T3's reply to our reset instead.
1513	 */
1514	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
1515		toep->tp_flags |= TP_ABORT_RPL_SENT;
1516		INP_WUNLOCK(inp);
1517	} else {
1518		so_error_set(so, abort_status_to_errno(tp, req->status));
1519		tp = tcp_close(tp);
1520		if (tp == NULL)
1521			INP_WLOCK(inp);	/* re-acquire */
1522		toepcb_release(toep);	/* no more CPLs expected */
1523	}
1524	INP_INFO_RUNLOCK(&V_tcbinfo);
1525
1526	send_abort_rpl(tod, tid, qset);
1527	m_freem(m);
1528	return (0);
1529}
1530
1531static void
1532assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
1533{
1534	struct toepcb *toep = tp->t_toe;
1535	struct adapter *sc = toep->tp_tod->tod_softc;
1536
1537	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;
1538
1539	if (G_TCPOPT_TSTAMP(tcpopt)) {
1540		tp->t_flags |= TF_RCVD_TSTMP;
1541		tp->t_flags |= TF_REQ_TSTMP;	/* forcibly set */
1542		tp->ts_recent = 0;		/* XXX */
1543		tp->ts_recent_age = tcp_ts_getticks();
1544		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
1545	}
1546
1547	if (G_TCPOPT_SACK(tcpopt))
1548		tp->t_flags |= TF_SACK_PERMIT;
1549	else
1550		tp->t_flags &= ~TF_SACK_PERMIT;
1551
1552	if (G_TCPOPT_WSCALE_OK(tcpopt))
1553		tp->t_flags |= TF_RCVD_SCALE;
1554
1555	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
1556	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
1557		tp->rcv_scale = tp->request_r_scale;
1558		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
1559	}
1560
1561}
1562
1563/*
1564 * The ISS and IRS are from after the exchange of SYNs and are off by 1.
1565 */
1566void
1567make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
1568    uint16_t cpl_tcpopt)
1569{
1570	struct inpcb *inp = sotoinpcb(so);
1571	struct tcpcb *tp = intotcpcb(inp);
1572	struct toepcb *toep = tp->t_toe;
1573	long bufsize;
1574	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
1575	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
1576	uint16_t tcpopt = be16toh(cpl_tcpopt);
1577
1578	INP_WLOCK_ASSERT(inp);
1579
1580	tp->t_state = TCPS_ESTABLISHED;
1581	tp->t_starttime = ticks;
1582	TCPSTAT_INC(tcps_connects);
1583
1584	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
1585	    toep->tp_tid, toep, inp);
1586
1587	tp->irs = irs;
1588	tcp_rcvseqinit(tp);
1589	tp->rcv_wnd = toep->tp_rx_credits << 10;
1590	tp->rcv_adv += tp->rcv_wnd;
1591	tp->last_ack_sent = tp->rcv_nxt;
1592
1593	/*
1594	 * If we were unable to send all rx credits via opt0, save the remainder
1595	 * in rx_credits so that they can be handed over with the next credit
1596	 * update.
1597	 */
1598	SOCKBUF_LOCK(&so->so_rcv);
1599	bufsize = select_rcv_wnd(so);
1600	SOCKBUF_UNLOCK(&so->so_rcv);
1601	toep->tp_rx_credits = bufsize - tp->rcv_wnd;
1602
1603	tp->iss = iss;
1604	tcp_sendseqinit(tp);
1605	tp->snd_una = iss + 1;
1606	tp->snd_nxt = iss + 1;
1607	tp->snd_max = iss + 1;
1608
1609	assign_rxopt(tp, tcpopt);
1610	soisconnected(so);
1611}
1612
1613/*
1614 * Fill in the right TID for CPL messages waiting in the out-of-order queue
1615 * and send them to the TOE.
1616 */
1617static void
1618fixup_and_send_ofo(struct toepcb *toep)
1619{
1620	struct mbuf *m;
1621	struct toedev *tod = toep->tp_tod;
1622	struct adapter *sc = tod->tod_softc;
1623	unsigned int tid = toep->tp_tid;
1624
1625	inp_lock_assert(toep->tp_inp);
1626
1627	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
1628		struct ofld_hdr *oh = mtod(m, void *);
1629		/*
1630		 * A variety of messages can be waiting but the fields we'll
1631		 * be touching are common to all so any message type will do.
1632		 */
1633		struct cpl_close_con_req *p = (void *)(oh + 1);
1634
1635		p->wr.wrh_lo = htonl(V_WR_TID(tid));
1636		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
1637		t3_offload_tx(sc, m);
1638	}
1639}
1640
1641/*
1642 * Process a CPL_ACT_ESTABLISH message.
1643 */
1644static int
1645do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1646{
1647	struct adapter *sc = qs->adap;
1648	struct tom_data *td = sc->tom_softc;
1649	struct cpl_act_establish *req = mtod(m, void *);
1650	unsigned int tid = GET_TID(req);
1651	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
1652	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
1653	struct inpcb *inp = toep->tp_inp;
1654	struct tcpcb *tp;
1655	struct socket *so;
1656
1657	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);
1658
1659	free_atid(&td->tid_maps, atid);
1660
1661	INP_WLOCK(inp);
1662	tp = intotcpcb(inp);
1663
1664	KASSERT(toep->tp_qset == qs->idx,
1665	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
1666	KASSERT(toep->tp_tid == atid,
1667	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));
1668
1669	toep->tp_tid = tid;
1670	insert_tid(td, toep, tid);
1671
1672	if (inp->inp_flags & INP_DROPPED) {
1673		/* socket closed by the kernel before hw told us it connected */
1674		send_reset(toep);
1675		goto done;
1676	}
1677
1678	KASSERT(tp->t_state == TCPS_SYN_SENT,
1679	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));
1680
1681	so = inp->inp_socket;
1682	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);
1683
1684	/*
1685	 * Now that we finally have a TID send any CPL messages that we had to
1686	 * defer for lack of a TID.
1687	 */
1688	if (mbufq_len(&toep->out_of_order_queue))
1689		fixup_and_send_ofo(toep);
1690
1691done:
1692	INP_WUNLOCK(inp);
1693	m_freem(m);
1694	return (0);
1695}
1696
1697/*
1698 * Process an acknowledgment of WR completion.  Advance snd_una and send the
1699 * next batch of work requests from the write queue.
1700 */
1701static void
1702wr_ack(struct toepcb *toep, struct mbuf *m)
1703{
1704	struct inpcb *inp = toep->tp_inp;
1705	struct tcpcb *tp;
1706	struct cpl_wr_ack *hdr = mtod(m, void *);
1707	struct socket *so;
1708	unsigned int credits = ntohs(hdr->credits);
1709	u32 snd_una = ntohl(hdr->snd_una);
1710	int bytes = 0;
1711	struct sockbuf *snd;
1712	struct mbuf *p;
1713	struct ofld_hdr *oh;
1714
1715	inp_wlock(inp);
1716	tp = intotcpcb(inp);
1717	so = inp->inp_socket;
1718	toep->tp_wr_avail += credits;
1719	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
1720		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
1721
1722	while (credits) {
1723		p = peek_wr(toep);
1724
1725		if (__predict_false(!p)) {
1726			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
1727			    "tid %u, state %u, wr_avail %u", __func__, credits,
1728			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);
1729
1730			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
1731			    "nothing pending, state %u wr_avail=%u\n",
1732			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
1733			break;
1734		}
1735
1736		oh = mtod(p, struct ofld_hdr *);
1737
1738		KASSERT(credits >= G_HDR_NDESC(oh->flags),
1739		    ("%s: partial credits?  %d %d", __func__, credits,
1740		    G_HDR_NDESC(oh->flags)));
1741
1742		dequeue_wr(toep);
1743		credits -= G_HDR_NDESC(oh->flags);
1744		bytes += oh->plen;
1745
1746		if (oh->flags & F_HDR_SGL)
1747			sglist_free(oh->sgl);
1748		m_freem(p);
1749	}
1750
1751	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
1752		goto out_free;
1753
1754	if (tp->snd_una != snd_una) {
1755		tp->snd_una = snd_una;
1756		tp->ts_recent_age = tcp_ts_getticks();
1757		if (tp->snd_una == tp->snd_nxt)
1758			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
1759	}
1760
1761	snd = so_sockbuf_snd(so);
1762	if (bytes) {
1763		SOCKBUF_LOCK(snd);
1764		sbdrop_locked(snd, bytes);
1765		so_sowwakeup_locked(so);
1766	}
1767
1768	if (snd->sb_sndptroff < snd->sb_cc)
1769		t3_push_frames(so, 0);
1770
1771out_free:
1772	inp_wunlock(tp->t_inpcb);
1773	m_freem(m);
1774}
1775
1776/*
1777 * Handler for TX_DATA_ACK CPL messages.
1778 */
1779static int
1780do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
1781{
1782	struct adapter *sc = qs->adap;
1783	struct tom_data *td = sc->tom_softc;
1784	struct cpl_wr_ack *hdr = mtod(m, void *);
1785	unsigned int tid = GET_TID(hdr);
1786	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
1787
1788	/* XXX bad race */
1789	if (toep)
1790		wr_ack(toep, m);
1791
1792	return (0);
1793}
1794
1795void
1796t3_init_cpl_io(struct adapter *sc)
1797{
1798	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
1799	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
1800	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
1801	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
1802	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
1803	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
1804	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
1805	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
1806	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
1807	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
1808	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
1809}
1810#endif
1811