/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183292 2008-09-23 03:16:54Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_tcp_do_autosndbuf tcp_do_autosndbuf
#define V_tcp_autosndbuf_max tcp_autosndbuf_max
#define V_tcp_do_rfc1323 tcp_do_rfc1323
#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
#define V_tcpstat tcpstat
#endif

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <cxgb_osdep.h>
#include <sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <net/route.h>

#include <t3cdev.h>
#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <sys/mvec.h>
#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
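/*
 * Decoding note (an inference from these values, not stated in this file):
 * submode bit 0 enables a header digest and bit 1 a data digest, each a
 * 4-byte CRC added by the HW, hence the lengths {0, 4, 4, 8}.
 */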

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
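/*
 * mbuf_wrs[] is presumably populated during module initialization (outside
 * this section) from the device's WR size; mbuf_wrs[n] gives the number of
 * WRs needed to send a gather list of n segments.
 */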

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)
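/* 2^27 - 1: the TCB apparently tracks the receive window in a 27-bit field. */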

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

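/*
 * Debug wrapper around sbappendstream_locked(): walks the sockbuf chain, the
 * chain being appended, and the result, asserting that every mbuf is either
 * plain or an EXT_EXTREF external buffer and that no m_next pointer has been
 * poisoned.
 */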
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}
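
/*
 * Note: TOE_ID_CHELSIO_T3 appears to identify rev-A silicon; later revisions
 * (T3B, T3C) use larger ids, which the >= comparisons elsewhere in this file
 * rely on.
 */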

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/*
		 * The send buffer size is advertised to the HW in units of
		 * 32KB, hence the >> 15 conversions below.
		 */
		if (V_tcp_do_autosndbuf && (snd->sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

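/*
 * Hand as much of the send buffer as WR credits allow to the HW, starting at
 * sb_sndptr.  Each iteration builds one TX_DATA work request: a payload of at
 * most IMM_LEN bytes is copied inline into the WR, anything larger is
 * described by a gather list of up to TX_MAX_SEGS - 1 segments.  Returns the
 * number of bytes handed to the HW.
 */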
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}

		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

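		/*
		 * Request a completion either when the caller asked for one
		 * and this is the only unacked batch, or once half of the WR
		 * budget is outstanding, so credits keep flowing back.
		 */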
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}


static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};


static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	if (toep == NULL)
		return;
	tp = toep->tp_tp;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
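	/*
	 * The 40 below is the minimum IPv4 + TCP header overhead (20 + 20
	 * bytes, ignoring options) subtracted from the path MTU to get the
	 * MSS.
	 */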
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
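
/*
 * select_rcv_wscale() example: a 256KB target window yields wscale 3, since
 * 65535 << 2 is just short of 256KB while 65535 << 3 covers it.
 */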

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

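	/*
	 * V_RCV_BUFSIZ is expressed in 1KB units, hence rcv_wnd >> 10,
	 * clamped to the field's maximum M_RCV_BUFSIZ.
	 */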
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */
	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);

	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx, in which case no further processing of
	 * this mbuf is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
1855	if (q->cur_buf == 0) {
1856		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1857		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1858	} else {
1859		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1860		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1861	}
1862	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1863	m->m_cur_offset = bsp->cur_offset;
1864	bsp->cur_offset = ddp_offset;
1865	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1866
1867	CTR5(KTR_TOM,
1868	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1869	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1870	KASSERT(ddp_offset >= m->m_cur_offset,
1871	    ("ddp_offset=%u less than cur_offset=%u",
1872		ddp_offset, m->m_cur_offset));
1873
1874#if 0
1875{
1876	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1877
1878	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1879	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1880
1881        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1882        rcv_nxt = t >> S_TCB_RCV_NXT;
1883        rcv_nxt &= M_TCB_RCV_NXT;
1884
1885        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1886        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1887        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1888
1889	T3_TRACE2(TIDTB(sk),
1890		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1891		  ddp_flags, rcv_nxt - rx_hdr_offset);
1892	T3_TRACE4(TB(q),
1893		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1894		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1895	T3_TRACE3(TB(q),
1896		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1897		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1898	T3_TRACE2(TB(q),
1899		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1900		 q->buf_state[0].flags, q->buf_state[1].flags);
1901
1902}
1903#endif
1904	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1905		handle_excess_rx(toep, m);
1906		return;
1907	}
1908
1909#ifdef T3_TRACE
1910	if ((int)m->m_pkthdr.len < 0) {
1911		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1912	}
1913#endif
1914	if (bsp->flags & DDP_BF_NOCOPY) {
1915#ifdef T3_TRACE
1916		T3_TRACE0(TB(q),
1917			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1918
1919		if (!q->cancel_ubuf && !so_no_receive(so)) {
1920			printf("!cancel_ubuf");
1921			t3_ddp_error(so, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1922		}
1923#endif
1924		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1925		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1926		q->cur_buf ^= 1;
1927	} else if (bsp->flags & DDP_BF_NOFLIP) {
1928
1929		m->m_ddp_flags = 1;    /* always a kernel buffer */
1930
1931		/* now HW buffer carries a user buffer */
1932		bsp->flags &= ~DDP_BF_NOFLIP;
1933		bsp->flags |= DDP_BF_NOCOPY;
1934
1935		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1936		 * any new data in which case we're done. If in addition the
1937		 * offset is 0, then there wasn't a completion for the kbuf
1938		 * and we need to decrement the posted count.
1939		 */
1940		if (m->m_pkthdr.len == 0) {
1941			if (ddp_offset == 0) {
1942				q->kbuf_posted--;
1943				bsp->flags |= DDP_BF_NODATA;
1944			}
1945			sockbuf_unlock(rcv);
1946			m_free(m);
1947			return;
1948		}
1949	} else {
1950		sockbuf_unlock(rcv);
1951
1952		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1953		 * but it got here way late and nobody cares anymore.
1954		 */
1955		m_free(m);
1956		return;
1957	}
1958
1959	m->m_ddp_gl = (unsigned char *)bsp->gl;
1960	m->m_flags |= M_DDP;
1961	m->m_seq = tp->rcv_nxt;
1962	tp->rcv_nxt += m->m_pkthdr.len;
1963	tp->t_rcvtime = ticks;
1964	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1965		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1966	if (m->m_pkthdr.len == 0) {
1967		q->user_ddp_pending = 0;
1968		m_free(m);
1969	} else
1970		SBAPPEND(rcv, m);
1971
1972	state = so_state_get(so);
1973	if (__predict_true((state & SS_NOFDREF) == 0))
1974		so_sorwakeup_locked(so);
1975	else
1976		sockbuf_unlock(rcv);
1977}
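
/*
 * Illustrative sketch (not part of the driver): how tcb_rpl_as_ddp_complete()
 * above digs a 32-bit field out of the TCB image returned by CPL_GET_TCB_RPL.
 * The reply carries the 32 32-bit TCB words in reverse order, two per __be64,
 * so word W lives in 64-bit word (31 - W) / 2; a field sitting in the high
 * half of that doubleword needs an extra 32-bit shift, which the code above
 * folds into its S_TCB_ shifts.  'word', 'high_half', 'shift' and 'mask'
 * stand in for a field's W_TCB_, S_TCB_ and M_TCB_ constants.
 */
#if 0
static inline uint32_t
read_tcb_field(const __be64 *tcb, unsigned int word, int high_half,
    unsigned int shift, uint32_t mask)
{
	uint64_t t = be64toh(tcb[(31 - word) / 2]);

	return ((t >> (high_half ? shift + 32 : shift)) & mask);
}
#endif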
1978
1979/*
1980 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1981 * in that case they are similar to DDP completions.
1982 */
1983static int
1984do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1985{
1986	struct toepcb *toep = (struct toepcb *)ctx;
1987
1988	/* OK if socket doesn't exist */
1989	if (toep == NULL) {
1990		printf("null toep in do_get_tcb_rpl\n");
1991		return (CPL_RET_BUF_DONE);
1992	}
1993
1994	inp_wlock(toep->tp_tp->t_inpcb);
1995	tcb_rpl_as_ddp_complete(toep, m);
1996	inp_wunlock(toep->tp_tp->t_inpcb);
1997
1998	return (0);
1999}
2000
2001static void
2002handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2003{
2004	struct tcpcb *tp = toep->tp_tp;
2005	struct socket *so;
2006	struct ddp_state *q;
2007	struct ddp_buf_state *bsp;
2008	struct cpl_rx_data *hdr = cplhdr(m);
2009	unsigned int rcv_nxt = ntohl(hdr->seq);
2010	struct sockbuf *rcv;
2011
2012	if (tp->rcv_nxt == rcv_nxt)
2013		return;
2014
2015	inp_lock_assert(tp->t_inpcb);
2016	so  = inp_inpcbtosocket(tp->t_inpcb);
2017	rcv = so_sockbuf_rcv(so);
2018	sockbuf_lock(rcv);
2019
2020	q = &toep->tp_ddp_state;
2021	bsp = &q->buf_state[q->cur_buf];
2022	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x did not advance past tp->rcv_nxt=0x%08x",
2023		rcv_nxt, tp->rcv_nxt));
2024	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2025	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2026	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2027	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2028
2029#ifdef T3_TRACE
2030	if ((int)m->m_pkthdr.len < 0) {
2031		t3_ddp_error(so, "handle_ddp_data: neg len");
2032	}
2033#endif
2034	m->m_ddp_gl = (unsigned char *)bsp->gl;
2035	m->m_flags |= M_DDP;
2036	m->m_cur_offset = bsp->cur_offset;
2037	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2038	if (bsp->flags & DDP_BF_NOCOPY)
2039		bsp->flags &= ~DDP_BF_NOCOPY;
2040
2041	m->m_seq = tp->rcv_nxt;
2042	tp->rcv_nxt = rcv_nxt;
2043	bsp->cur_offset += m->m_pkthdr.len;
2044	if (!(bsp->flags & DDP_BF_NOFLIP))
2045		q->cur_buf ^= 1;
2046	/*
2047	 * For now, don't re-enable DDP after a connection fell out of DDP
2048	 * mode.
2049	 */
2050	q->ubuf_ddp_ready = 0;
2051	sockbuf_unlock(rcv);
2052}
2053
2054/*
2055 * Process new data received for a connection.
2056 */
2057static void
2058new_rx_data(struct toepcb *toep, struct mbuf *m)
2059{
2060	struct cpl_rx_data *hdr = cplhdr(m);
2061	struct tcpcb *tp = toep->tp_tp;
2062	struct socket *so;
2063	struct sockbuf *rcv;
2064	int state;
2065	int len = be16toh(hdr->len);
2066
2067	inp_wlock(tp->t_inpcb);
2068
2069	so  = inp_inpcbtosocket(tp->t_inpcb);
2070
2071	if (__predict_false(so_no_receive(so))) {
2072		handle_excess_rx(toep, m);
2073		inp_wunlock(tp->t_inpcb);
2075		return;
2076	}
2077
2078	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2079		handle_ddp_data(toep, m);
2080
2081	m->m_seq = ntohl(hdr->seq);
2082	m->m_ulp_mode = 0;                    /* for iSCSI */
2083
2084#if VALIDATE_SEQ
2085	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2086		log(LOG_ERR,
2087		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2088		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2089		       tp->rcv_nxt);
2090		m_freem(m);
2091		inp_wunlock(tp->t_inpcb);
2092		return;
2093	}
2094#endif
2095	m_adj(m, sizeof(*hdr));
2096
2097#ifdef URGENT_DATA_SUPPORTED
2098	/*
2099	 * We don't handle urgent data yet
2100	 */
2101	if (__predict_false(hdr->urg))
2102		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2103	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2104		     tp->urg_seq - tp->rcv_nxt < skb->len))
2105		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2106							 tp->rcv_nxt];
2107#endif
2108	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2109		toep->tp_delack_mode = hdr->dack_mode;
2110		toep->tp_delack_seq = tp->rcv_nxt;
2111	}
2112	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2113	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2114
2115	if (len < m->m_pkthdr.len)
2116		m->m_pkthdr.len = m->m_len = len;
2117
2118	tp->rcv_nxt += m->m_pkthdr.len;
2119	tp->t_rcvtime = ticks;
2120	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2121	CTR2(KTR_TOM,
2122	    "new_rx_data: seq 0x%x len %u",
2123	    m->m_seq, m->m_pkthdr.len);
2124	inp_wunlock(tp->t_inpcb);
2125	rcv = so_sockbuf_rcv(so);
2126	sockbuf_lock(rcv);
2127#if 0
2128	if (sb_notify(rcv))
2129		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2130#endif
2131	SBAPPEND(rcv, m);
2132
2133#ifdef notyet
2134	/*
2135	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2136	 *
2137	 */
2138	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2140	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2141		so, rcv->sb_cc, rcv->sb_mbmax));
2142#endif
2143
2145	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2146	    rcv->sb_cc, rcv->sb_mbcnt);
2147
2148	state = so_state_get(so);
2149	if (__predict_true((state & SS_NOFDREF) == 0))
2150		so_sorwakeup_locked(so);
2151	else
2152		sockbuf_unlock(rcv);
2153}
2154
2155/*
2156 * Handler for RX_DATA CPL messages.
2157 */
2158static int
2159do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2160{
2161	struct toepcb *toep = (struct toepcb *)ctx;
2162
2163	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2164
2165	new_rx_data(toep, m);
2166
2167	return (0);
2168}
2169
2170static void
2171new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2172{
2173	struct tcpcb *tp;
2174	struct ddp_state *q;
2175	struct ddp_buf_state *bsp;
2176	struct cpl_rx_data_ddp *hdr;
2177	struct socket *so;
2178	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2179	int nomoredata = 0;
2180	unsigned int delack_mode;
2181	struct sockbuf *rcv;
2182
2183	tp = toep->tp_tp;
2184	inp_wlock(tp->t_inpcb);
2185	so = inp_inpcbtosocket(tp->t_inpcb);
2186
2187	if (__predict_false(so_no_receive(so))) {
2188
2189		handle_excess_rx(toep, m);
2190		inp_wunlock(tp->t_inpcb);
2191		return;
2192	}
2193
2194	q = &toep->tp_ddp_state;
2195	hdr = cplhdr(m);
2196	ddp_report = ntohl(hdr->u.ddp_report);
2197	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2198	bsp = &q->buf_state[buf_idx];
2199
2200	CTR4(KTR_TOM,
2201	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2202	    "hdr seq 0x%x len %u",
2203	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2204	    ntohs(hdr->len));
2205	CTR3(KTR_TOM,
2206	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2207	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2208
2209	ddp_len = ntohs(hdr->len);
2210	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2211
2212	delack_mode = G_DDP_DACK_MODE(ddp_report);
2213	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2214		toep->tp_delack_mode = delack_mode;
2215		toep->tp_delack_seq = tp->rcv_nxt;
2216	}
2217
2218	m->m_seq = tp->rcv_nxt;
2219	tp->rcv_nxt = rcv_nxt;
2220
2221	tp->t_rcvtime = ticks;
2222	/*
2223	 * Store the length in m->m_len.  We are changing the meaning of
2224	 * m->m_len here, so we need to be very careful that nothing from now
2225	 * on interprets the length fields of this mbuf the usual way.
2226	 */
2227	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2228	inp_wunlock(tp->t_inpcb);
2229	CTR3(KTR_TOM,
2230	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2231	    m->m_len, rcv_nxt, m->m_seq);
2232	/*
2233	 * Figure out where the new data was placed in the buffer and store the
2234	 * start offset in m_cur_offset.  This assumes the buffer offset starts
2235	 * at 0; the consumer needs to account for the page pod's pg_offset.
2236	 */
2237	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2238	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2239
2240	rcv = so_sockbuf_rcv(so);
2241	sockbuf_lock(rcv);
2242
2243	m->m_ddp_gl = (unsigned char *)bsp->gl;
2244	m->m_flags |= M_DDP;
2245	bsp->cur_offset = end_offset;
2246	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2247
2248	/*
2249	 * Length is only meaningful for kbuf
2250	 */
2251	if (!(bsp->flags & DDP_BF_NOCOPY))
2252		KASSERT(m->m_len <= bsp->gl->dgl_length,
2253		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2254			m->m_len, bsp->gl->dgl_length));
2255
2256	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2257	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2258	/*
2259	 * Bit 0 of flags stores whether the DDP buffer is completed.
2260	 * Note that other parts of the code depend on this being in bit 0.
2261	 */
2262	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2263		panic("spurious ddp completion");
2264	} else {
2265		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2266		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2267			q->cur_buf ^= 1;                     /* flip buffers */
2268	}
2269
2270	if (bsp->flags & DDP_BF_NOCOPY) {
2271		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2272		bsp->flags &= ~DDP_BF_NOCOPY;
2273	}
2274
2275	if (ddp_report & F_DDP_PSH)
2276		m->m_ddp_flags |= DDP_BF_PSH;
2277	if (nomoredata)
2278		m->m_ddp_flags |= DDP_BF_NODATA;
2279
2280#ifdef notyet
2281	skb_reset_transport_header(skb);
2282	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2283#endif
2284	SBAPPEND(rcv, m);
2285
2286	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2287	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2288		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2289		so_sorwakeup_locked(so);
2290	else
2291		sockbuf_unlock(rcv);
2292}
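
/*
 * Note on the m_ddp_flags word assembled above: bit 0 records whether the
 * HW buffer completed (F_DDP_BUF_COMPLETE), DDP_BF_NOCOPY means the data
 * landed in a posted user buffer rather than the kernel buffer, DDP_BF_PSH
 * that the segment carried a PSH, and DDP_BF_NODATA that no further data
 * is expected for this buffer.
 */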
2293
2294#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2295		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2296		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2297		 F_DDP_INVALID_PPOD)
2298
2299/*
2300 * Handler for RX_DATA_DDP CPL messages.
2301 */
2302static int
2303do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2304{
2305	struct toepcb *toep = ctx;
2306	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2307
2308	VALIDATE_SOCK(so);
2309
2310	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2311		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2312		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2313		return (CPL_RET_BUF_DONE);
2314	}
2315#if 0
2316	skb->h.th = tcphdr_skb->h.th;
2317#endif
2318	new_rx_data_ddp(toep, m);
2319	return (0);
2320}
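
/*
 * Illustrative sketch only (not wired into do_rx_data_ddp() above):
 * expanding a failed ddpvld_status word into individual F_DDP_ error bits
 * for logging.  The table names only a few of the bits in DDP_ERR; this is
 * an example, not an exhaustive decoder.
 */
#if 0
static void
log_ddp_errors(unsigned int tid, uint32_t status)
{
	static const struct {
		uint32_t bit;
		const char *name;
	} ddp_err_bits[] = {
		{ F_DDP_PPOD_MISMATCH, "ppod mismatch" },
		{ F_DDP_LLIMIT_ERR, "llimit error" },
		{ F_DDP_ULIMIT_ERR, "ulimit error" },
		{ F_DDP_OFFSET_ERR, "offset error" },
	};
	unsigned int i;

	for (i = 0; i < sizeof(ddp_err_bits) / sizeof(ddp_err_bits[0]); i++)
		if (status & ddp_err_bits[i].bit)
			log(LOG_ERR, "TID %u: DDP error: %s\n", tid,
			    ddp_err_bits[i].name);
}
#endif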
2321
2322static void
2323process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2324{
2325	struct tcpcb *tp = toep->tp_tp;
2326	struct socket *so;
2327	struct ddp_state *q;
2328	struct ddp_buf_state *bsp;
2329	struct cpl_rx_ddp_complete *hdr;
2330	unsigned int ddp_report, buf_idx, when, delack_mode;
2331	int nomoredata = 0;
2332	struct sockbuf *rcv;
2333
2334	inp_wlock(tp->t_inpcb);
2335	so = inp_inpcbtosocket(tp->t_inpcb);
2336
2337	if (__predict_false(so_no_receive(so))) {
2338		struct inpcb *inp = so_sotoinpcb(so);
2339
2340		handle_excess_rx(toep, m);
2341		inp_wunlock(inp);
2342		return;
2343	}
2344	q = &toep->tp_ddp_state;
2345	hdr = cplhdr(m);
2346	ddp_report = ntohl(hdr->ddp_report);
2347	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2348	m->m_pkthdr.csum_data = tp->rcv_nxt;
2349
2350	rcv = so_sockbuf_rcv(so);
2351	sockbuf_lock(rcv);
2352
2353	bsp = &q->buf_state[buf_idx];
2354	when = bsp->cur_offset;
2355	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2356	tp->rcv_nxt += m->m_len;
2357	tp->t_rcvtime = ticks;
2358
2359	delack_mode = G_DDP_DACK_MODE(ddp_report);
2360	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2361		toep->tp_delack_mode = delack_mode;
2362		toep->tp_delack_seq = tp->rcv_nxt;
2363	}
2364#ifdef notyet
2365	skb_reset_transport_header(skb);
2366	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2367#endif
2368	inp_wunlock(tp->t_inpcb);
2369
2370	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2371	CTR5(KTR_TOM,
2372		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2373		  "ddp_report 0x%x offset %u, len %u",
2374		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2375		   G_DDP_OFFSET(ddp_report), m->m_len);
2376
2377	m->m_cur_offset = bsp->cur_offset;
2378	bsp->cur_offset += m->m_len;
2379
2380	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2381		q->cur_buf ^= 1;                     /* flip buffers */
2382		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2383			nomoredata = 1;
2384	}
2385
2386	CTR4(KTR_TOM,
2387		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2388		  "ddp_report %u offset %u",
2389		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2390		   G_DDP_OFFSET(ddp_report));
2391
2392	m->m_ddp_gl = (unsigned char *)bsp->gl;
2393	m->m_flags |= M_DDP;
2394	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2395	if (bsp->flags & DDP_BF_NOCOPY)
2396		bsp->flags &= ~DDP_BF_NOCOPY;
2397	if (nomoredata)
2398		m->m_ddp_flags |= DDP_BF_NODATA;
2399
2400	SBAPPEND(rcv, m);
2401	if ((so_state_get(so) & SS_NOFDREF) == 0)
2402		so_sorwakeup_locked(so);
2403	else
2404		sockbuf_unlock(rcv);
2405}
2406
2407/*
2408 * Handler for RX_DDP_COMPLETE CPL messages.
2409 */
2410static int
2411do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2412{
2413	struct toepcb *toep = ctx;
2414
2415	VALIDATE_SOCK(so);
2416#if 0
2417	skb->h.th = tcphdr_skb->h.th;
2418#endif
2419	process_ddp_complete(toep, m);
2420	return (0);
2421}
2422
2423/*
2424 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2425 * socket state before calling tcp_time_wait to comply with its expectations.
2426 */
2427static void
2428enter_timewait(struct tcpcb *tp)
2429{
2430	/*
2431	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2432	 * process peer_close because we don't want to carry the peer FIN in
2433	 * the socket's receive queue and if we increment rcv_nxt without
2434	 * having the FIN in the receive queue we'll confuse facilities such
2435	 * as SIOCINQ.
2436	 */
2437	inp_wlock(tp->t_inpcb);
2438	tp->rcv_nxt++;
2439
2440	tp->ts_recent_age = 0;	     /* defeat recycling */
2441	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2442	inp_wunlock(tp->t_inpcb);
2443	tcp_offload_twstart(tp);
2444}
2445
2446/*
2447 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2448 * function deals with the data that may be reported along with the FIN.
2449 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2450 * perform normal FIN-related processing.  In the latter case 1 indicates that
2451 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed; 0
2452 * means the mbuf can be freed.
2453 */
2454static int
2455handle_peer_close_data(struct socket *so, struct mbuf *m)
2456{
2457	struct tcpcb *tp = so_sototcpcb(so);
2458	struct toepcb *toep = tp->t_toe;
2459	struct ddp_state *q;
2460	struct ddp_buf_state *bsp;
2461	struct cpl_peer_close *req = cplhdr(m);
2462	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2463	struct sockbuf *rcv;
2464
2465	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2466		return (0);
2467
2468	CTR0(KTR_TOM, "handle_peer_close_data");
2469	if (__predict_false(so_no_receive(so))) {
2470		handle_excess_rx(toep, m);
2471
2472		/*
2473		 * Although we discard the data we want to process the FIN so
2474		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2475		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2476		 * may be what will close the connection.  We return 1 because
2477		 * handle_excess_rx() already freed the packet.
2478		 */
2479		return (1);
2480	}
2481
2482	inp_lock_assert(tp->t_inpcb);
2483	q = &toep->tp_ddp_state;
2484	rcv = so_sockbuf_rcv(so);
2485	sockbuf_lock(rcv);
2486
2487	bsp = &q->buf_state[q->cur_buf];
2488	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2489	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2490	m->m_ddp_gl = (unsigned char *)bsp->gl;
2491	m->m_flags |= M_DDP;
2492	m->m_cur_offset = bsp->cur_offset;
2493	m->m_ddp_flags =
2494	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2495	m->m_seq = tp->rcv_nxt;
2496	tp->rcv_nxt = rcv_nxt;
2497	bsp->cur_offset += m->m_pkthdr.len;
2498	if (!(bsp->flags & DDP_BF_NOFLIP))
2499		q->cur_buf ^= 1;
2500#ifdef notyet
2501	skb_reset_transport_header(skb);
2502	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2503#endif
2504	tp->t_rcvtime = ticks;
2505	SBAPPEND(rcv, m);
2506	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2507		so_sorwakeup_locked(so);
2508	else
2509		sockbuf_unlock(rcv);
2510
2511	return (1);
2512}
2513
2514/*
2515 * Handle a peer FIN.
2516 */
2517static void
2518do_peer_fin(struct toepcb *toep, struct mbuf *m)
2519{
2520	struct socket *so;
2521	struct tcpcb *tp = toep->tp_tp;
2522	int keep, action;
2523
2524	action = keep = 0;
2525	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2526	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2527		printf("abort_pending set\n");
2528
2529		goto out;
2530	}
2531	inp_wlock(tp->t_inpcb);
2532	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2533	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2534		keep = handle_peer_close_data(so, m);
2535		if (keep < 0) {
2536			inp_wunlock(tp->t_inpcb);
2537			return;
2538		}
2539	}
2540	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2541		CTR1(KTR_TOM,
2542		    "waking up waiters for cantrcvmore on %p ", so);
2543		socantrcvmore(so);
2544
2545		/*
2546		 * If connection is half-synchronized
2547		 * (ie NEEDSYN flag on) then delay ACK,
2548		 * so it may be piggybacked when SYN is sent.
2549		 * Otherwise, since we received a FIN then no
2550		 * more input can be expected, send ACK now.
2551		 */
2552		if (tp->t_flags & TF_NEEDSYN)
2553			tp->t_flags |= TF_DELACK;
2554		else
2555			tp->t_flags |= TF_ACKNOW;
2556		tp->rcv_nxt++;
2557	}
2558
2559	switch (tp->t_state) {
2560	case TCPS_SYN_RECEIVED:
2561		tp->t_starttime = ticks;
2562		/* FALLTHROUGH */
2563	case TCPS_ESTABLISHED:
2564		tp->t_state = TCPS_CLOSE_WAIT;
2565		break;
2566	case TCPS_FIN_WAIT_1:
2567		tp->t_state = TCPS_CLOSING;
2568		break;
2569	case TCPS_FIN_WAIT_2:
2570		/*
2571		 * If we've sent an abort_req we must have sent it too late,
2572		 * HW will send us a reply telling us so, and this peer_close
2573		 * is really the last message for this connection and needs to
2574		 * be treated as an abort_rpl, i.e., transition the connection
2575		 * to TCP_CLOSE (note that the host stack does this at the
2576		 * time of generating the RST but we must wait for HW).
2577		 * Otherwise we enter TIME_WAIT.
2578		 */
2579		t3_release_offload_resources(toep);
2580		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2581			action = TCP_CLOSE;
2582		} else {
2583			action = TCP_TIMEWAIT;
2584		}
2585		break;
2586	default:
2587		log(LOG_ERR,
2588		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2589		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2590	}
2591	inp_wunlock(tp->t_inpcb);
2592
2593	if (action == TCP_TIMEWAIT) {
2594		enter_timewait(tp);
2595	} else if (action == TCP_DROP) {
2596		tcp_offload_drop(tp, 0);
2597	} else if (action == TCP_CLOSE) {
2598		tcp_offload_close(tp);
2599	}
2600
2601#ifdef notyet
2602	/* Do not send POLL_HUP for half duplex close. */
2603	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2604	    sk->sk_state == TCP_CLOSE)
2605		sk_wake_async(so, 1, POLL_HUP);
2606	else
2607		sk_wake_async(so, 1, POLL_IN);
2608#endif
2609
2610out:
2611	if (!keep)
2612		m_free(m);
2613}
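
/*
 * Summary of the PEER_CLOSE state transitions implemented above:
 *
 *	SYN_RECEIVED -> CLOSE_WAIT	(t_starttime is set first)
 *	ESTABLISHED  -> CLOSE_WAIT
 *	FIN_WAIT_1   -> CLOSING
 *	FIN_WAIT_2   -> TIME_WAIT, or CLOSE if an abort_rpl is still pending
 */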
2614
2615/*
2616 * Handler for PEER_CLOSE CPL messages.
2617 */
2618static int
2619do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2620{
2621	struct toepcb *toep = (struct toepcb *)ctx;
2622
2623	VALIDATE_SOCK(so);
2624
2625	do_peer_fin(toep, m);
2626	return (0);
2627}
2628
2629static void
2630process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2631{
2632	struct cpl_close_con_rpl *rpl = cplhdr(m);
2633	struct tcpcb *tp = toep->tp_tp;
2634	struct socket *so;
2635	int action = 0;
2636	struct sockbuf *rcv;
2637
2638	inp_wlock(tp->t_inpcb);
2639	so = inp_inpcbtosocket(tp->t_inpcb);
2640
2641	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2642
2643	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2644		inp_wunlock(tp->t_inpcb);
2645		goto out;
2646	}
2647
2648	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2649	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2650
2651	switch (tp->t_state) {
2652	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2653		t3_release_offload_resources(toep);
2654		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2655			action = TCP_CLOSE;
2657		} else {
2658			action = TCP_TIMEWAIT;
2659		}
2660		break;
2661	case TCPS_LAST_ACK:
2662		/*
2663		 * In this state we don't care about pending abort_rpl.
2664		 * If we've sent abort_req it was post-close and was sent too
2665		 * late, this close_con_rpl is the actual last message.
2666		 */
2667		t3_release_offload_resources(toep);
2668		action = TCP_CLOSE;
2669		break;
2670	case TCPS_FIN_WAIT_1:
2671		/*
2672		 * If we can't receive any more
2673		 * data, then closing user can proceed.
2674		 * Starting the timer is contrary to the
2675		 * specification, but if we don't get a FIN
2676		 * we'll hang forever.
2677		 *
2678		 * XXXjl:
2679		 * we should release the tp also, and use a
2680		 * compressed state.
2681		 */
2682		if (so)
2683			rcv = so_sockbuf_rcv(so);
2684		else
2685			break;
2686
2687		if (rcv->sb_state & SBS_CANTRCVMORE) {
2688			int timeout;
2689
2690			if (so)
2691				soisdisconnected(so);
2692			timeout = (tcp_fast_finwait2_recycle) ?
2693			    tcp_finwait2_timeout : tcp_maxidle;
2694			tcp_timer_activate(tp, TT_2MSL, timeout);
2695		}
2696		tp->t_state = TCPS_FIN_WAIT_2;
2697		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2698		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2699			action = TCP_DROP;
2700		}
2701
2702		break;
2703	default:
2704		log(LOG_ERR,
2705		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2706		       toep->tp_toedev->tod_name, toep->tp_tid,
2707		       tp->t_state);
2708	}
2709	inp_wunlock(tp->t_inpcb);
2710
2712	if (action == TCP_TIMEWAIT) {
2713		enter_timewait(tp);
2714	} else if (action == TCP_DROP) {
2715		tcp_offload_drop(tp, 0);
2716	} else if (action == TCP_CLOSE) {
2717		tcp_offload_close(tp);
2718	}
2719out:
2720	m_freem(m);
2721}
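
/*
 * Summary of the CLOSE_CON_RPL state transitions implemented above:
 *
 *	CLOSING    -> TIME_WAIT, or CLOSE if an abort_rpl is still pending
 *	LAST_ACK   -> CLOSE
 *	FIN_WAIT_1 -> FIN_WAIT_2, dropping the connection instead when
 *		      SO_LINGER is set with a zero linger time
 */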
2722
2723/*
2724 * Handler for CLOSE_CON_RPL CPL messages.
2725 */
2726static int
2727do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2728			    void *ctx)
2729{
2730	struct toepcb *toep = (struct toepcb *)ctx;
2731
2732	process_close_con_rpl(toep, m);
2733	return (0);
2734}
2735
2736/*
2737 * Process abort replies.  We only process these messages if we anticipate
2738 * them as the coordination between SW and HW in this area is somewhat lacking
2739 * and sometimes we get ABORT_RPLs after we are done with the connection that
2740 * originated the ABORT_REQ.
2741 */
2742static void
2743process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2744{
2745	struct tcpcb *tp = toep->tp_tp;
2746	struct socket *so;
2747	int needclose = 0;
2748
2749#ifdef T3_TRACE
2750	T3_TRACE1(TIDTB(sk),
2751		  "process_abort_rpl: GTS rpl pending %d",
2752		  sock_flag(sk, ABORT_RPL_PENDING));
2753#endif
2754
2755	inp_wlock(tp->t_inpcb);
2756	so = inp_inpcbtosocket(tp->t_inpcb);
2757
2758	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2759		/*
2760		 * XXX panic on tcpdrop
2761		 */
2762		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2763			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2764		else {
2765			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2766			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2767			    !is_t3a(toep->tp_toedev)) {
2768				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2769					panic("TP_ABORT_REQ_RCVD set");
2770				t3_release_offload_resources(toep);
2771				needclose = 1;
2772			}
2773		}
2774	}
2775	inp_wunlock(tp->t_inpcb);
2776
2777	if (needclose)
2778		tcp_offload_close(tp);
2779
2780	m_free(m);
2781}
2782
2783/*
2784 * Handle an ABORT_RPL_RSS CPL message.
2785 */
2786static int
2787do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2788{
2789	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2790	struct toepcb *toep;
2791
2792	/*
2793	 * Ignore replies to post-close aborts indicating that the abort was
2794	 * requested too late.  These connections are terminated when we get
2795	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2796	 * arrives the TID is either no longer used or it has been recycled.
2797	 */
2798	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2799discard:
2800		m_free(m);
2801		return (0);
2802	}
2803
2804	toep = (struct toepcb *)ctx;
2805
2806	/*
2807	 * Sometimes we've already closed the socket, e.g., a post-close
2808	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2809	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2810	 * but FW turns the ABORT_REQ into a regular one and so we get
2811	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2812	 */
2813	if (!toep)
2814		goto discard;
2815
2816	if (toep->tp_tp == NULL) {
2817		log(LOG_NOTICE, "removing tid for abort\n");
2818		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2819		if (toep->tp_l2t)
2820			l2t_release(L2DATA(cdev), toep->tp_l2t);
2821
2822		toepcb_release(toep);
2823		goto discard;
2824	}
2825
2826	log(LOG_NOTICE, "toep=%p\n", toep);
2827	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2828
2829	toepcb_hold(toep);
2830	process_abort_rpl(toep, m);
2831	toepcb_release(toep);
2832	return (0);
2833}
2834
2835/*
2836 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2837 * indicate whether RST should be sent in response.
2838 */
2839static int
2840abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2841{
2842	struct tcpcb *tp = so_sototcpcb(so);
2843
2844	switch (abort_reason) {
2845	case CPL_ERR_BAD_SYN:
2846#if 0
2847		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2848#endif
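		/* FALLTHROUGH */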
2849	case CPL_ERR_CONN_RESET:
2850		// XXX need to handle SYN_RECV due to crossed SYNs
2851		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2852	case CPL_ERR_XMIT_TIMEDOUT:
2853	case CPL_ERR_PERSIST_TIMEDOUT:
2854	case CPL_ERR_FINWAIT2_TIMEDOUT:
2855	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2856#if 0
2857		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2858#endif
2859		return (ETIMEDOUT);
2860	default:
2861		return (EIO);
2862	}
2863}
2864
2865static inline void
2866set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2867{
2868	struct cpl_abort_rpl *rpl = cplhdr(m);
2869
2870	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2871	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2872	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2873
2874	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2875	rpl->cmd = cmd;
2876}
2877
2878static void
2879send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2880{
2881	struct mbuf *reply_mbuf;
2882	struct cpl_abort_req_rss *req = cplhdr(m);
2883
2884	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2885	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2887	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2888	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2889	m_free(m);
2890}
2891
2892/*
2893 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2894 */
2895static inline int
2896is_neg_adv_abort(unsigned int status)
2897{
2898	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2899	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2900}
2901
2902static void
2903send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2904{
2905	struct mbuf  *reply_mbuf;
2906	struct cpl_abort_req_rss *req = cplhdr(m);
2907
2908	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2909
2910	if (!reply_mbuf) {
2911		/* Defer the reply.  Stick rst_status into req->status. */
2912		req->status = rst_status;
2913		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2914		return;
2915	}
2916
2917	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2918	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2919	m_free(m);
2920
2921	/*
2922	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2923	 * these messages while ARP is pending.  For other connection states
2924	 * it's not a problem.
2925	 */
2926	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2927}
2928
2929#ifdef notyet
2930static void
2931cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2932{
2933	CXGB_UNIMPLEMENTED();
2934#ifdef notyet
2935	struct request_sock *req = child->sk_user_data;
2936
2937	inet_csk_reqsk_queue_removed(parent, req);
2938	synq_remove(tcp_sk(child));
2939	__reqsk_free(req);
2940	child->sk_user_data = NULL;
2941#endif
2942}
2943
2944
2945/*
2946 * Performs the actual work to abort a SYN_RECV connection.
2947 */
2948static void
2949do_abort_syn_rcv(struct socket *child, struct socket *parent)
2950{
2951	struct tcpcb *parenttp = so_sototcpcb(parent);
2952	struct tcpcb *childtp = so_sototcpcb(child);
2953
2954	/*
2955	 * If the server is still open we clean up the child connection,
2956	 * otherwise the server already did the clean up as it was purging
2957	 * its SYN queue and the skb was just sitting in its backlog.
2958	 */
2959	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2960		cleanup_syn_rcv_conn(child, parent);
2961		inp_wlock(childtp->t_inpcb);
2962		t3_release_offload_resources(childtp->t_toe);
2963		inp_wunlock(childtp->t_inpcb);
2964		tcp_offload_close(childtp);
2965	}
2966}
2967#endif
2968
2969/*
2970 * Handle abort requests for a SYN_RECV connection.  These need extra work
2971 * because the socket is on its parent's SYN queue.
2972 */
2973static int
2974abort_syn_rcv(struct socket *so, struct mbuf *m)
2975{
2976	CXGB_UNIMPLEMENTED();
2977#ifdef notyet
2978	struct socket *parent;
2979	struct toedev *tdev = toep->tp_toedev;
2980	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2981	struct socket *oreq = so->so_incomp;
2982	struct t3c_tid_entry *t3c_stid;
2983	struct tid_info *t;
2984
2985	if (!oreq)
2986		return -1;        /* somehow we are not on the SYN queue */
2987
2988	t = &(T3C_DATA(cdev))->tid_maps;
2989	t3c_stid = lookup_stid(t, oreq->ts_recent);
2990	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2991
2992	so_lock(parent);
2993	do_abort_syn_rcv(so, parent);
2994	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2995	so_unlock(parent);
2996#endif
2997	return (0);
2998}
2999
3000/*
3001 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3002 * request except that we need to reply to it.
3003 */
3004static void
3005process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3006{
3007	int rst_status = CPL_ABORT_NO_RST;
3008	const struct cpl_abort_req_rss *req = cplhdr(m);
3009	struct tcpcb *tp = toep->tp_tp;
3010	struct socket *so;
3011	int needclose = 0;
3012
3013	inp_wlock(tp->t_inpcb);
3014	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3015	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3016		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3017		m_free(m);
3018		goto skip;
3019	}
3020
3021	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3022	/*
3023	 * Three cases to consider:
3024	 * a) We haven't sent an abort_req; close the connection.
3025	 * b) We have sent a post-close abort_req that will get to TP too late
3026	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3027	 *    be ignored and the connection should be closed now.
3028	 * c) We have sent a regular abort_req that will get to TP too late.
3029	 *    That will generate an abort_rpl with status 0, wait for it.
3030	 */
3031	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3032	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3033		int error;
3034
3035		error = abort_status_to_errno(so, req->status,
3036		    &rst_status);
3037		so_error_set(so, error);
3038
3039		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3040			so_sorwakeup(so);
3041		/*
3042		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3043		 * returns 0 it has taken care of the abort.
3044		 */
3045		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3046			goto skip;
3047
3048		t3_release_offload_resources(toep);
3049		needclose = 1;
3050	}
3051	inp_wunlock(tp->t_inpcb);
3052
3053	if (needclose)
3054		tcp_offload_close(tp);
3055
3056	send_abort_rpl(m, tdev, rst_status);
3057	return;
3058skip:
3059	inp_wunlock(tp->t_inpcb);
3060}
3061
3062/*
3063 * Handle an ABORT_REQ_RSS CPL message.
3064 */
3065static int
3066do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3067{
3068	const struct cpl_abort_req_rss *req = cplhdr(m);
3069	struct toepcb *toep = (struct toepcb *)ctx;
3070
3071	if (is_neg_adv_abort(req->status)) {
3072		m_free(m);
3073		return (0);
3074	}
3075
3076	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3077
3078	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3079		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3080		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3081
3082		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3083		if (toep->tp_l2t)
3084			l2t_release(L2DATA(cdev), toep->tp_l2t);
3085
3086		/*
3087		 *  Unhook
3088		 */
3089		toep->tp_tp->t_toe = NULL;
3090		toep->tp_tp->t_flags &= ~TF_TOE;
3091		toep->tp_tp = NULL;
3092		/*
3093		 * XXX need to call syncache_chkrst - but we don't
3094		 * have a way of doing that yet
3095		 */
3096		toepcb_release(toep);
3097		log(LOG_ERR, "abort for unestablished connection :-(\n");
3098		return (0);
3099	}
3100	if (toep->tp_tp == NULL) {
3101		log(LOG_NOTICE, "disconnected toepcb\n");
3102		/* should be freed momentarily */
3103		return (0);
3104	}
3105
3107	toepcb_hold(toep);
3108	process_abort_req(toep, m, toep->tp_toedev);
3109	toepcb_release(toep);
3110	return (0);
3111}
3112#ifdef notyet
3113static void
3114pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3115{
3116	struct toedev *tdev = TOE_DEV(parent);
3117
3118	do_abort_syn_rcv(child, parent);
3119	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3120		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3121
3122		rpl->opt0h = htonl(F_TCAM_BYPASS);
3123		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3124		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3125	} else
3126		m_free(m);
3127}
3128#endif
3129static void
3130handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3131{
3132	CXGB_UNIMPLEMENTED();
3133
3134#ifdef notyet
3135	struct t3cdev *cdev;
3136	struct socket *parent;
3137	struct socket *oreq;
3138	struct t3c_tid_entry *t3c_stid;
3139	struct tid_info *t;
3140	struct tcpcb *otp, *tp = so_sototcpcb(so);
3141	struct toepcb *toep = tp->t_toe;
3142
3143	/*
3144	 * If the connection is being aborted due to the parent listening
3145	 * socket going away there's nothing to do, the ABORT_REQ will close
3146	 * the connection.
3147	 */
3148	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3149		m_free(m);
3150		return;
3151	}
3152
3153	oreq = so->so_incomp;
3154	otp = so_sototcpcb(oreq);
3155
3156	cdev = T3C_DEV(so);
3157	t = &(T3C_DATA(cdev))->tid_maps;
3158	t3c_stid = lookup_stid(t, otp->ts_recent);
3159	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3160
3161	so_lock(parent);
3162	pass_open_abort(so, parent, m);
3163	so_unlock(parent);
3164#endif
3165}
3166
3167/*
3168 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3169 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3170 * connection.
3171 */
3172static void
3173pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3174{
3175
3176#ifdef notyet
3177	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3178	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3179#endif
3180	handle_pass_open_arp_failure(m_get_socket(m), m);
3181}
3182
3183/*
3184 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3185 */
3186static void
3187mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3188{
3189	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3190	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3191	unsigned int tid = GET_TID(req);
3192
3193	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3194	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3195	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3196	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3197	rpl->opt0h = htonl(F_TCAM_BYPASS);
3198	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3199	rpl->opt2 = 0;
3200	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3201}
3202
3203/*
3204 * Send a deferred reject to an accept request.
3205 */
3206static void
3207reject_pass_request(struct toedev *tdev, struct mbuf *m)
3208{
3209	struct mbuf *reply_mbuf;
3210
3211	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3212	mk_pass_accept_rpl(reply_mbuf, m);
3213	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3214	m_free(m);
3215}
3216
3217static void
3218handle_syncache_event(int event, void *arg)
3219{
3220	struct toepcb *toep = arg;
3221
3222	switch (event) {
3223	case TOE_SC_ENTRY_PRESENT:
3224		/*
3225		 * entry already exists - free toepcb
3226		 * and l2t
3227		 */
3228		printf("syncache entry present\n");
3229		toepcb_release(toep);
3230		break;
3231	case TOE_SC_DROP:
3232		/*
3233		 * The syncache has given up on this entry
3234		 * either it timed out, or it was evicted
3235		 * we need to explicitly release the tid
3236		 */
3237		printf("syncache entry dropped\n");
3238		toepcb_release(toep);
3239		break;
3240	default:
3241		log(LOG_ERR, "unknown syncache event %d\n", event);
3242		break;
3243	}
3244}
3245
3246static void
3247syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3248{
3249	struct in_conninfo inc;
3250	struct tcpopt to;
3251	struct tcphdr th;
3252	struct inpcb *inp;
3253	int mss, wsf, sack, ts;
3254	uint32_t rcv_isn = ntohl(req->rcv_isn);
3255
3256	bzero(&to, sizeof(struct tcpopt));
3257	inp = so_sotoinpcb(lso);
3258
3259	/*
3260	 * Fill out information for entering us into the syncache
3261	 */
3262	bzero(&inc, sizeof(inc));
3263	inc.inc_fport = th.th_sport = req->peer_port;
3264	inc.inc_lport = th.th_dport = req->local_port;
3265	th.th_seq = req->rcv_isn;
3266	th.th_flags = TH_SYN;
3267
3268	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3269
3271	inc.inc_isipv6 = 0;
3272	inc.inc_len = 0;
3273	inc.inc_faddr.s_addr = req->peer_ip;
3274	inc.inc_laddr.s_addr = req->local_ip;
3275
3276	DPRINTF("syncache add of %d:%d %d:%d\n",
3277	    ntohl(req->local_ip), ntohs(req->local_port),
3278	    ntohl(req->peer_ip), ntohs(req->peer_port));
3279
3280	mss = req->tcp_options.mss;
3281	wsf = req->tcp_options.wsf;
3282	ts = req->tcp_options.tstamp;
3283	sack = req->tcp_options.sack;
3284	to.to_mss = mss;
3285	to.to_wscale = wsf;
3286	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3287	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3288}
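
/*
 * Note on the struct tcpopt packing above: each option reported by the HW
 * sets both its value field and the matching TOF_ bit, e.g. a nonzero MSS
 * sets to_mss and TOF_MSS, a window-scale factor sets to_wscale and
 * TOF_SCALE, while timestamps and SACK only need TOF_TS and TOF_SACKPERM.
 */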
3289
3290
3291/*
3292 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3293 * lock held.  Note that the sock here is a listening socket that is not owned
3294 * by the TOE.
3295 */
3296static void
3297process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3298    struct listen_ctx *lctx)
3299{
3300	int rt_flags;
3301	struct l2t_entry *e;
3302	struct iff_mac tim;
3303	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3304	struct cpl_pass_accept_rpl *rpl;
3305	struct cpl_pass_accept_req *req = cplhdr(m);
3306	unsigned int tid = GET_TID(req);
3307	struct tom_data *d = TOM_DATA(tdev);
3308	struct t3cdev *cdev = d->cdev;
3309	struct tcpcb *tp = so_sototcpcb(so);
3310	struct toepcb *newtoep = NULL;
3311	struct rtentry *dst;
3312	struct sockaddr_in nam;
3313	struct t3c_data *td = T3C_DATA(cdev);
3314
3315	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3316	if (__predict_false(reply_mbuf == NULL)) {
3317		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3318			t3_defer_reply(m, tdev, reject_pass_request);
3319		else {
3320			cxgb_queue_tid_release(cdev, tid);
3321			m_free(m);
3322		}
3323		DPRINTF("failed to get reply_mbuf\n");
3324
3325		goto out;
3326	}
3327
3328	if (tp->t_state != TCPS_LISTEN) {
3329		DPRINTF("socket not in listen state\n");
3330
3331		goto reject;
3332	}
3333
3334	tim.mac_addr = req->dst_mac;
3335	tim.vlan_tag = ntohs(req->vlan_tag);
3336	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3337		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3338		goto reject;
3339	}
3340
3341#ifdef notyet
3342	/*
3343	 * XXX do route lookup to confirm that we're still listening on this
3344	 * address
3345	 */
3346	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3347			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3348		goto reject;
3349	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3350		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3351	dst_release(skb->dst);	// done with the input route, release it
3352	skb->dst = NULL;
3353
3354	if ((rt_flags & RTF_LOCAL) == 0)
3355		goto reject;
3356#endif
3357	/*
3358	 * XXX route lookup is not implemented yet (see the disabled block
3359	 * above); assume the destination is local.
3360	 */
3360	rt_flags = RTF_LOCAL;
3361	if ((rt_flags & RTF_LOCAL) == 0)
3362		goto reject;
3363
3364	/*
3365	 * Calculate values and add to syncache
3366	 */
3367
3368	newtoep = toepcb_alloc();
3369	if (newtoep == NULL)
3370		goto reject;
3371
3372	bzero(&nam, sizeof(struct sockaddr_in));
3373
3374	nam.sin_len = sizeof(struct sockaddr_in);
3375	nam.sin_family = AF_INET;
3376	nam.sin_addr.s_addr = req->peer_ip;
3377	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3378
3379	if (dst == NULL) {
3380		printf("failed to find route\n");
3381		goto reject;
3382	}
3383	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3384	    (struct sockaddr *)&nam);
3385	if (e == NULL) {
3386		DPRINTF("failed to get l2t\n");
		goto reject;
3387	}
3388	/*
3389	 * Point to our listen socket until accept
3390	 */
3391	newtoep->tp_tp = tp;
3392	newtoep->tp_flags = TP_SYN_RCVD;
3393	newtoep->tp_tid = tid;
3394	newtoep->tp_toedev = tdev;
3395	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3396
3397	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3398	so_lock(so);
3399	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3400	so_unlock(so);
3401
3402	newtoep->tp_ulp_mode = (TOM_TUNABLE(tdev, ddp) &&
3403	    !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN) ? ULP_MODE_TCPDDP : 0;
3404
3405	if (newtoep->tp_ulp_mode) {
3406		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3407
3408		if (ddp_mbuf == NULL)
3409			newtoep->tp_ulp_mode = 0;
3410	}
3411
3412	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3413	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3414	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3415	/*
3416	 * XXX workaround for lack of syncache drop
3417	 */
3418	toepcb_hold(newtoep);
3419	syncache_add_accept_req(req, so, newtoep);
3420
3421	rpl = cplhdr(reply_mbuf);
3422	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3423	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3424	rpl->wr.wr_lo = 0;
3425	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3426	rpl->opt2 = htonl(calc_opt2(so, tdev));
3427	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3428	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3429
3430	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3431	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3432	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3433				  CPL_PASS_OPEN_ACCEPT);
3434
3435	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3436
3437	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3438
3439	l2t_send(cdev, reply_mbuf, e);
3440	m_free(m);
3441	if (newtoep->tp_ulp_mode) {
3442		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3443				V_TF_DDP_OFF(1) |
3444				TP_DDP_TIMER_WORKAROUND_MASK,
3445				V_TF_DDP_OFF(1) |
3446		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3447	} else
3448		printf("not offloading\n");
3449
3452	return;
3453reject:
3454	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3455		mk_pass_accept_rpl(reply_mbuf, m);
3456	else
3457		mk_tid_release(reply_mbuf, newtoep, tid);
3458	cxgb_ofld_send(cdev, reply_mbuf);
3459	m_free(m);
3460out:
3464	return;
3466}
3467
3468/*
3469 * Handle a CPL_PASS_ACCEPT_REQ message.
3470 */
3471static int
3472do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3473{
3474	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3475	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3476	struct tom_data *d = listen_ctx->tom_data;
3477
3478#if VALIDATE_TID
3479	struct cpl_pass_accept_req *req = cplhdr(m);
3480	unsigned int tid = GET_TID(req);
3481	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3482
3483	if (unlikely(!lsk)) {
3484		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3485		       cdev->name,
3486		       (unsigned long)((union listen_entry *)ctx -
3487					t->stid_tab));
3488		return CPL_RET_BUF_DONE;
3489	}
3490	if (unlikely(tid >= t->ntids)) {
3491		printk(KERN_ERR "%s: passive open TID %u too large\n",
3492		       cdev->name, tid);
3493		return CPL_RET_BUF_DONE;
3494	}
3495	/*
3496	 * For T3A the current user of the TID may have closed but its last
3497	 * message(s) may have been backlogged so the TID appears to be still
3498	 * in use.  Just take the TID away, the connection can close at its
3499	 * own leisure.  For T3B this situation is a bug.
3500	 */
3501	if (!valid_new_tid(t, tid) &&
3502	    cdev->type != T3A) {
3503		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3504		       cdev->name, tid);
3505		return CPL_RET_BUF_DONE;
3506	}
3507#endif
3508
3509	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3510	return (0);
3511}
3512
3513/*
3514 * Called when a connection is established to translate the TCP options
3515 * reported by HW to FreeBSD's native format.
3516 */
3517static void
3518assign_rxopt(struct socket *so, unsigned int opt)
3519{
3520	struct tcpcb *tp = so_sototcpcb(so);
3521	struct toepcb *toep = tp->t_toe;
3522	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3523
3524	inp_lock_assert(tp->t_inpcb);
3525
3526	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3527	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3528	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3529	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3530	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3531	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3532		tp->rcv_scale = tp->request_r_scale;
3533}
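
/*
 * Illustrative sketch only: decoding the 16-bit tcp_opt word the HW reports
 * in PASS/ACT_ESTABLISH, using the same G_TCPOPT_ accessors as assign_rxopt()
 * above.  The "- 40" mirrors assign_rxopt(): the MTU table entry includes the
 * IP and TCP headers, so 40 bytes are subtracted to get the MSS.
 */
#if 0
static void
print_rxopt(const struct t3c_data *td, unsigned int opt)
{
	printf("mss=%d tstamp=%u sack=%u wscale_ok=%u snd_wscale=%u\n",
	    (int)(td->mtus[G_TCPOPT_MSS(opt)] - 40),
	    (unsigned int)G_TCPOPT_TSTAMP(opt),
	    (unsigned int)G_TCPOPT_SACK(opt),
	    (unsigned int)G_TCPOPT_WSCALE_OK(opt),
	    (unsigned int)G_TCPOPT_SND_WSCALE(opt));
}
#endif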
3534
3535/*
3536 * Completes some final bits of initialization for just established connections
3537 * and changes their state to TCP_ESTABLISHED.
3538 *
3539 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3540 */
3541static void
3542make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3543{
3544	struct tcpcb *tp = so_sototcpcb(so);
3545	struct toepcb *toep = tp->t_toe;
3546
3547	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3548	assign_rxopt(so, opt);
3549
3550	/*
3551	 * XXX: eventually switch this socket's pr_ctloutput to t3_ctloutput.
3552	 */
3554#ifdef notyet
3555	so->so_proto->pr_ctloutput = t3_ctloutput;
3556#endif
3557
3558#if 0
3559	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3560#endif
3561	/*
3562	 * XXX not clear what rcv_wup maps to
3563	 */
3564	/*
3565	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3566	 * pass through opt0.
3567	 */
3568	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3569		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
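	/*
	 * Worked example (figures illustrative only): if M_RCV_BUFSIZ were
	 * 256, opt0 could advertise at most 256KB of window; with a 1MB
	 * rcv_wnd, backing tp_rcv_wup off by the 768KB difference makes the
	 * first RX_DATA_ACK return those credits to the HW.
	 */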
3570
3571	dump_toepcb(toep);
3572
3573#ifdef notyet
3574/*
3575 * no clean interface for marking ARP up to date
3576 */
3577	dst_confirm(sk->sk_dst_cache);
3578#endif
3579	tp->t_starttime = ticks;
3580	tp->t_state = TCPS_ESTABLISHED;
3581	soisconnected(so);
3582}
3583
3584static int
3585syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3586{
3587
3588	struct in_conninfo inc;
3589	struct tcpopt to;
3590	struct tcphdr th;
3591	int mss, wsf, sack, ts;
3592	struct mbuf *m = NULL;
3593	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3594	unsigned int opt;
3595
3596#ifdef MAC
3597#error	"no MAC support"
3598#endif
3599
3600	opt = ntohs(req->tcp_opt);
3601
3602	bzero(&to, sizeof(struct tcpopt));
3603
3604	/*
3605	 * Fill out information for entering us into the syncache
3606	 */
3607	bzero(&inc, sizeof(inc));
3608	inc.inc_fport = th.th_sport = req->peer_port;
3609	inc.inc_lport = th.th_dport = req->local_port;
3610	th.th_seq = req->rcv_isn;
3611	th.th_flags = TH_ACK;
3612
3613	inc.inc_isipv6 = 0;
3614	inc.inc_len = 0;
3615	inc.inc_faddr.s_addr = req->peer_ip;
3616	inc.inc_laddr.s_addr = req->local_ip;
3617
3618	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3619	wsf  = G_TCPOPT_WSCALE_OK(opt);
3620	ts   = G_TCPOPT_TSTAMP(opt);
3621	sack = G_TCPOPT_SACK(opt);
3622
3623	to.to_mss = mss;
3624	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3625	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3626
3627	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3628	    ntohl(req->local_ip), ntohs(req->local_port),
3629	    ntohl(req->peer_ip), ntohs(req->peer_port),
3630	    mss, wsf, ts, sack);
3631	return (tcp_offload_syncache_expand(&inc, &to, &th, so, m));
3632}
3633
3634
3635/*
3636 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3637 * if we are in TCP_SYN_RECV due to crossed SYNs
3638 */
3639static int
3640do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3641{
3642	struct cpl_pass_establish *req = cplhdr(m);
3643	struct toepcb *toep = (struct toepcb *)ctx;
3644	struct tcpcb *tp = toep->tp_tp;
3645	struct socket *so, *lso;
3646	struct t3c_data *td = T3C_DATA(cdev);
3647	struct sockbuf *snd, *rcv;
3648	struct toedev *tdev;
3649
3650	/* Complete socket initialization now that we have the SND_ISN. */
3651	tdev = toep->tp_toedev;
3652
3656	inp_wlock(tp->t_inpcb);
3657
3658	/*
3659	 * XXX need to add a reference to the socket while we're manipulating
3660	 * the inpcb here.
3661	 */
3662	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3663
3664	inp_wunlock(tp->t_inpcb);
3665
3666	so_lock(so);
3667	LIST_REMOVE(toep, synq_entry);
3668	so_unlock(so);
3669
3670	if (!syncache_expand_establish_req(req, &so, toep)) {
3671		/*
3672		 * No entry
3673		 */
3674		CXGB_UNIMPLEMENTED();
3675	}
3676	if (so == NULL) {
3677		/*
3678		 * Couldn't create the socket
3679		 */
3680		CXGB_UNIMPLEMENTED();
3681	}
3682
3683	tp = so_sototcpcb(so);
3684	inp_wlock(tp->t_inpcb);
3685
3686	snd = so_sockbuf_snd(so);
3687	rcv = so_sockbuf_rcv(so);
3688
3689	snd->sb_flags |= SB_NOCOALESCE;
3690	rcv->sb_flags |= SB_NOCOALESCE;
3691
3692	toep->tp_tp = tp;
3693	toep->tp_flags = 0;
3694	tp->t_toe = toep;
3695	reset_wr_list(toep);
3696	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3697	tp->rcv_nxt = toep->tp_copied_seq;
3698	install_offload_ops(so);
3699
3700	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3701	toep->tp_wr_unacked = 0;
3702	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3703	toep->tp_qset_idx = 0;
3704	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3705
3706	/*
3707	 * XXX Cancel any keep alive timer
3708	 */
3709
3710	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3711
3712	/*
3713	 * XXX workaround for lack of syncache drop
3714	 */
3715	toepcb_release(toep);
3716	inp_wunlock(tp->t_inpcb);
3717
3718	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3719	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3720#ifdef notyet
3721	/*
3722	 * XXX not sure how these checks map to us
3723	 */
3724	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3725		sk->sk_state_change(sk);
3726		sk_wake_async(so, 0, POLL_OUT);
3727	}
3728	/*
3729	 * The state for the new connection is now up to date.
3730	 * Next check if we should add the connection to the parent's
3731	 * accept queue.  When the parent closes it resets connections
3732	 * on its SYN queue, so check if we are being reset.  If so we
3733	 * don't need to do anything more, the coming ABORT_RPL will
3734	 * destroy this socket.  Otherwise move the connection to the
3735	 * accept queue.
3736	 *
3737	 * Note that we reset the synq before closing the server so if
3738	 * we are not being reset the stid is still open.
3739	 */
3740	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3741		__kfree_skb(skb);
3742		goto unlock;
3743	}
3744#endif
3745	m_free(m);
3746
3747	return (0);
3748}
3749
3750/*
3751 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3752 * and send them to the TOE.
3753 */
3754static void
3755fixup_and_send_ofo(struct toepcb *toep)
3756{
3757	struct mbuf *m;
3758	struct toedev *tdev = toep->tp_toedev;
3759	struct tcpcb *tp = toep->tp_tp;
3760	unsigned int tid = toep->tp_tid;
3761
3762	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3763
3764	inp_lock_assert(tp->t_inpcb);
3765	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3766		/*
		 * A variety of messages can be waiting, but the fields we'll
		 * be touching are common to all, so any message type will do.
3769		 */
3770		struct cpl_close_con_req *p = cplhdr(m);
3771
3772		p->wr.wr_lo = htonl(V_WR_TID(tid));
3773		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3774		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3775	}
3776}
3777
3778/*
3779 * Updates socket state from an active establish CPL message.  Runs with the
3780 * socket lock held.
3781 */
3782static void
3783socket_act_establish(struct socket *so, struct mbuf *m)
3784{
3785	struct cpl_act_establish *req = cplhdr(m);
3786	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3787	struct tcpcb *tp = so_sototcpcb(so);
3788	struct toepcb *toep = tp->t_toe;
3789
3790	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3791		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3792		    toep->tp_tid, tp->t_state);
3793
3794	tp->ts_recent_age = ticks;
3795	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3796	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3797
3798	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3799
3800	/*
3801	 * Now that we finally have a TID send any CPL messages that we had to
3802	 * defer for lack of a TID.
3803	 */
3804	if (mbufq_len(&toep->out_of_order_queue))
3805		fixup_and_send_ofo(toep);
3806
3807	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3808		/*
3809		 * XXX does this even make sense?
3810		 */
3811		so_sorwakeup(so);
3812	}
3813	m_free(m);
3814#ifdef notyet
3815/*
3816 * XXX assume no write requests permitted while socket connection is
3817 * incomplete
3818 */
3819	/*
3820	 * Currently the send queue must be empty at this point because the
3821	 * socket layer does not send anything before a connection is
3822	 * established.  To be future proof though we handle the possibility
3823	 * that there are pending buffers to send (either TX_DATA or
3824	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3825	 * buffers according to the just learned write_seq, and then we send
3826	 * them on their way.
3827	 */
3828	fixup_pending_writeq_buffers(sk);
3829	if (t3_push_frames(so, 1))
3830		sk->sk_write_space(sk);
3831#endif
3832
	toep->tp_state = tp->t_state;
	V_tcpstat.tcps_connects++;
}
3837
3838/*
3839 * Process a CPL_ACT_ESTABLISH message.
3840 */
3841static int
3842do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3843{
3844	struct cpl_act_establish *req = cplhdr(m);
3845	unsigned int tid = GET_TID(req);
3846	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3847	struct toepcb *toep = (struct toepcb *)ctx;
3848	struct tcpcb *tp = toep->tp_tp;
3849	struct socket *so;
3850	struct toedev *tdev;
3851	struct tom_data *d;
3852
3853	if (tp == NULL) {
3854		free_atid(cdev, atid);
3855		return (0);
3856	}
3857	inp_wlock(tp->t_inpcb);
3858
3859	/*
3860	 * XXX
3861	 */
3862	so = inp_inpcbtosocket(tp->t_inpcb);
3863	tdev = toep->tp_toedev; /* blow up here if link was down */
3864	d = TOM_DATA(tdev);
3865
3866	/*
 * It's OK if the TID is currently in use; the owning socket may have
3868	 * backlogged its last CPL message(s).  Just take it away.
3869	 */
3870	toep->tp_tid = tid;
3871	toep->tp_tp = tp;
3872	so_insert_tid(d, toep, tid);
3873	free_atid(cdev, atid);
3874	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3875
3876	socket_act_establish(so, m);
3877	inp_wunlock(tp->t_inpcb);
3878	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3879	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3880
3881	return (0);
3882}
3883
3884/*
3885 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3886 * next batch of work requests from the write queue.
3887 */
3888static void
3889wr_ack(struct toepcb *toep, struct mbuf *m)
3890{
3891	struct tcpcb *tp = toep->tp_tp;
3892	struct cpl_wr_ack *hdr = cplhdr(m);
3893	struct socket *so;
3894	unsigned int credits = ntohs(hdr->credits);
3895	u32 snd_una = ntohl(hdr->snd_una);
3896	int bytes = 0;
3897	struct sockbuf *snd;
3898
3899	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3900
3901	inp_wlock(tp->t_inpcb);
3902	so = inp_inpcbtosocket(tp->t_inpcb);
3903	toep->tp_wr_avail += credits;
3904	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3905		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3906
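	/*
	 * Retire completed work requests from the head of the pending
	 * list.  Each mbuf records its WR credit cost in csum_data; a
	 * request that is only partially acknowledged stays queued with
	 * its remaining credit count.
	 */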
3907	while (credits) {
3908		struct mbuf *p = peek_wr(toep);
3909
3910		if (__predict_false(!p)) {
3911			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3912			    "nothing pending, state %u wr_avail=%u\n",
3913			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3914			break;
3915		}
3916		CTR2(KTR_TOM,
3917			"wr_ack: p->credits=%d p->bytes=%d",
3918		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3919		KASSERT(p->m_pkthdr.csum_data != 0,
3920		    ("empty request still on list"));
3921
3922		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3923
3924#if DEBUG_WR > 1
3925			struct tx_data_wr *w = cplhdr(p);
3926			log(LOG_ERR,
3927			       "TID %u got %u WR credits, need %u, len %u, "
3928			       "main body %u, frags %u, seq # %u, ACK una %u,"
3929			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3930			       toep->tp_tid, credits, p->csum, p->len,
3931			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3932			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3933			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3934#endif
3935			p->m_pkthdr.csum_data -= credits;
3936			break;
3937		} else {
3938			dequeue_wr(toep);
3939			credits -= p->m_pkthdr.csum_data;
3940			bytes += p->m_pkthdr.len;
3941			CTR3(KTR_TOM,
3942			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3943			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3944
3945			m_free(p);
3946		}
3947	}
3948
3949#if DEBUG_WR
3950	check_wr_invariants(tp);
3951#endif
3952
3953	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3954#if VALIDATE_SEQ
3955		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3956
		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3958		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3959		    toep->tp_tid, tp->snd_una);
3960#endif
3961		goto out_free;
3962	}
3963
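	/*
	 * Advance the local snd_una and refresh the timestamp age; once
	 * everything in flight has been acknowledged there is no need to
	 * keep waiting for the TX path to go idle.
	 */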
3964	if (tp->snd_una != snd_una) {
3965		tp->snd_una = snd_una;
3966		tp->ts_recent_age = ticks;
3967#ifdef notyet
3968		/*
3969		 * Keep ARP entry "minty fresh"
3970		 */
3971		dst_confirm(sk->sk_dst_cache);
3972#endif
3973		if (tp->snd_una == tp->snd_nxt)
3974			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3975	}
3976
	snd = so_sockbuf_snd(so);
	if (bytes) {
		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
		sockbuf_lock(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);
	}
3985
3986	if (snd->sb_sndptroff < snd->sb_cc)
3987		t3_push_frames(so, 0);
3988
3989out_free:
3990	inp_wunlock(tp->t_inpcb);
3991	m_free(m);
3992}
3993
3994/*
3995 * Handler for TX_DATA_ACK CPL messages.
3996 */
3997static int
3998do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3999{
4000	struct toepcb *toep = (struct toepcb *)ctx;
4001
4002	VALIDATE_SOCK(so);
4003
4004	wr_ack(toep, m);
	return (0);
4006}
4007
4008/*
4009 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4010 */
4011static int
4012do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4013{
4014	m_freem(m);
	return (0);
4016}
4017
4018/*
4019 * Reset a connection that is on a listener's SYN queue or accept queue,
4020 * i.e., one that has not had a struct socket associated with it.
4021 * Must be called from process context.
4022 *
4023 * Modeled after code in inet_csk_listen_stop().
4024 */
4025static void
4026t3_reset_listen_child(struct socket *child)
4027{
4028	struct tcpcb *tp = so_sototcpcb(child);
4029
4030	t3_send_reset(tp->t_toe);
4031}
4032
4033
4034static void
4035t3_child_disconnect(struct socket *so, void *arg)
4036{
4037	struct tcpcb *tp = so_sototcpcb(so);
4038
4039	if (tp->t_flags & TF_TOE) {
4040		inp_wlock(tp->t_inpcb);
4041		t3_reset_listen_child(so);
4042		inp_wunlock(tp->t_inpcb);
4043	}
4044}
4045
4046/*
4047 * Disconnect offloaded established but not yet accepted connections sitting
4048 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4049 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4050 */
4051void
4052t3_disconnect_acceptq(struct socket *listen_so)
4053{
4054
4055	so_lock(listen_so);
4056	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4057	so_unlock(listen_so);
4058}
4059
4060/*
 * Reset offloaded connections sitting on a server's SYN queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */
4065void
4066t3_reset_synq(struct listen_ctx *lctx)
4067{
4068	struct toepcb *toep;
4069
4070	so_lock(lctx->lso);
4071	while (!LIST_EMPTY(&lctx->synq_head)) {
4072		toep = LIST_FIRST(&lctx->synq_head);
4073		LIST_REMOVE(toep, synq_entry);
4074		toep->tp_tp = NULL;
4075		t3_send_reset(toep);
4076		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4077		toepcb_release(toep);
4078	}
4079	so_unlock(lctx->lso);
4080}
4081
4082
4083int
4084t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4085		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4086		   unsigned int pg_off, unsigned int color)
4087{
4088	unsigned int i, j, pidx;
4089	struct pagepod *p;
4090	struct mbuf *m;
4091	struct ulp_mem_io *req;
4092	unsigned int tid = toep->tp_tid;
4093	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4094	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4095
4096	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4097	    gl, nppods, tag, maxoff, pg_off, color);
4098
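	/*
	 * Write the pagepods to adapter memory one at a time, each as a
	 * ULP_MEM_WRITE of PPOD_SIZE bytes at consecutive addresses in
	 * the DDP page-pod region.
	 */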
4099	for (i = 0; i < nppods; ++i) {
4100		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4101		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4102		req = mtod(m, struct ulp_mem_io *);
4103		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4104		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4105		req->wr.wr_lo = 0;
4106		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4107					   V_ULPTX_CMD(ULP_MEM_WRITE));
4108		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4109				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4110
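		/*
		 * Each pagepod holds five page pointers: four pages of the
		 * gather list plus a fifth entry that overlaps the first
		 * page of the next pod, which is why pidx starts at 4 * i
		 * while five slots are filled.
		 */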
4111		p = (struct pagepod *)(req + 1);
		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
4113			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4114			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4115						  V_PPOD_COLOR(color));
4116			p->pp_max_offset = htonl(maxoff);
4117			p->pp_page_offset = htonl(pg_off);
4118			p->pp_rsvd = 0;
4119			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4120				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4121				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4122		} else
4123			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4124		send_or_defer(toep, m, 0);
4125		ppod_addr += PPOD_SIZE;
4126	}
4127	return (0);
4128}
4129
4130/*
4131 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4132 */
4133static inline void
4134mk_cpl_barrier_ulp(struct cpl_barrier *b)
4135{
4136	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4137
4138	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4139	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4140	b->opcode = CPL_BARRIER;
4141}
4142
4143/*
4144 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4145 */
4146static inline void
4147mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4148{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

4152	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4153	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4154	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4155	req->cpuno = htons(cpuno);
4156}
4157
4158/*
4159 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4160 */
4161static inline void
4162mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4163                     unsigned int word, uint64_t mask, uint64_t val)
4164{
4165	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4166
	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4168	    tid, word, mask, val);
4169
4170	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4171	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4172	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4173	req->reply = V_NO_REPLY(1);
4174	req->cpu_idx = 0;
4175	req->word = htons(word);
4176	req->mask = htobe64(mask);
4177	req->val = htobe64(val);
4178}
4179
4180/*
4181 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4182 */
4183static void
4184mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4185    unsigned int tid, unsigned int credits)
4186{
4187	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4188
4189	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4190	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4191	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
	    V_RX_CREDITS(credits));
4195}
4196
4197void
4198t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4199{
4200	unsigned int wrlen;
4201	struct mbuf *m;
4202	struct work_request_hdr *wr;
4203	struct cpl_barrier *lock;
4204	struct cpl_set_tcb_field *req;
4205	struct cpl_get_tcb *getreq;
4206	struct ddp_state *p = &toep->tp_ddp_state;
4207
4208#if 0
4209	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4210#endif
4211	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4212		sizeof(*getreq);
4213	m = m_gethdr_nofail(wrlen);
4214	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4215	wr = mtod(m, struct work_request_hdr *);
4216	bzero(wr, wrlen);
4217
4218	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4219	m->m_pkthdr.len = m->m_len = wrlen;
4220
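	/*
	 * The compound WR is laid out as a CPL_BARRIER, a CPL_SET_TCB_FIELD
	 * that flips the DDP flags, a CPL_GET_TCB to read back how much
	 * data landed in the buffer, and a closing CPL_BARRIER.
	 */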
4221	lock = (struct cpl_barrier *)(wr + 1);
4222	mk_cpl_barrier_ulp(lock);
4223
4224	req = (struct cpl_set_tcb_field *)(lock + 1);
4225
4226	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4227
	/* Hmmm, not sure if this is actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already.  However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
4233	if (bufidx == 0)
4234		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4235				     V_TF_DDP_ACTIVE_BUF(1) |
4236				     V_TF_DDP_BUF0_VALID(1),
4237				     V_TF_DDP_ACTIVE_BUF(1));
4238	else
4239		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4240				     V_TF_DDP_ACTIVE_BUF(1) |
4241				     V_TF_DDP_BUF1_VALID(1), 0);
4242
4243	getreq = (struct cpl_get_tcb *)(req + 1);
4244	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4245
4246	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4247
	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4250	p->get_tcb_count++;
4251
4252#ifdef T3_TRACE
4253	T3_TRACE1(TIDTB(so),
4254		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4255#endif
4256	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4257}
4258
4259/**
4260 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the toepcb associated with the buffers
4262 * @bufidx: index of HW DDP buffer (0 or 1)
4263 * @tag0: new tag for HW buffer 0
4264 * @tag1: new tag for HW buffer 1
4265 * @len: new length for HW buf @bufidx
4266 *
4267 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4268 * buffer by changing the buffer tag and length and setting the valid and
4269 * active flag accordingly.  The caller must ensure the new buffer is at
4270 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
4274 */
4275void
4276t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4277	 	       unsigned int tag1, unsigned int len)
4278{
4279	unsigned int wrlen;
4280	struct mbuf *m;
4281	struct work_request_hdr *wr;
4282	struct cpl_get_tcb *getreq;
4283	struct cpl_set_tcb_field *req;
4284	struct ddp_state *p = &toep->tp_ddp_state;
4285
	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4287	    bufidx, tag0, tag1, len);
4288#if 0
4289	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4290#endif
4291	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4292	m = m_gethdr_nofail(wrlen);
4293	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4294	wr = mtod(m, struct work_request_hdr *);
4295	m->m_pkthdr.len = m->m_len = wrlen;
	bzero(wr, wrlen);

4299	/* Set the ATOMIC flag to make sure that TP processes the following
4300	 * CPLs in an atomic manner and no wire segments can be interleaved.
4301	 */
4302	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4303	req = (struct cpl_set_tcb_field *)(wr + 1);
4304	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4305			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4306			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4307			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4308			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4309	req++;
4310	if (bufidx == 0) {
4311		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4312			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4313			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4314		req++;
4315		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316			    V_TF_DDP_PUSH_DISABLE_0(1) |
4317			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318			    V_TF_DDP_PUSH_DISABLE_0(0) |
4319			    V_TF_DDP_BUF0_VALID(1));
4320	} else {
4321		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4322			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4323			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4324		req++;
4325		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4326			    V_TF_DDP_PUSH_DISABLE_1(1) |
4327			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4328			    V_TF_DDP_PUSH_DISABLE_1(0) |
4329			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4330	}
4331
4332	getreq = (struct cpl_get_tcb *)(req + 1);
4333	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4334
	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4337	p->get_tcb_count++;
4338
4339#ifdef T3_TRACE
4340	T3_TRACE4(TIDTB(sk),
4341		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4342		  "len %d",
4343		  bufidx, tag0, tag1, len);
4344#endif
4345	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4346}
4347
4348/*
4349 * Sends a compound WR containing all the CPL messages needed to program the
4350 * two HW DDP buffers, namely optionally setting up the length and offset of
4351 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4352 */
4353void
4354t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4355		      unsigned int len1, unsigned int offset1,
4356                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4357{
4358	unsigned int wrlen;
4359	struct mbuf *m;
4360	struct work_request_hdr *wr;
4361	struct cpl_set_tcb_field *req;
4362
	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4364	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4365
4366#if 0
4367	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4368#endif
4369	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4370		(len1 ? sizeof(*req) : 0) +
4371		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4372	m = m_gethdr_nofail(wrlen);
4373	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4374	wr = mtod(m, struct work_request_hdr *);
4375	bzero(wr, wrlen);
4376
4377	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4378	m->m_pkthdr.len = m->m_len = wrlen;
4379
4380	req = (struct cpl_set_tcb_field *)(wr + 1);
4381	if (len0) {                  /* program buffer 0 offset and length */
4382		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4383			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4384			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4385			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4386			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4387		req++;
4388	}
4389	if (len1) {                  /* program buffer 1 offset and length */
4390		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4391			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4392			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4393			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4394			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4395		req++;
4396	}
4397
4398	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4399			     ddp_flags);
4400
4401	if (modulate) {
4402		mk_rx_data_ack_ulp(toep,
4403		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4404		    toep->tp_copied_seq - toep->tp_rcv_wup);
4405		toep->tp_rcv_wup = toep->tp_copied_seq;
4406	}
4407
4408#ifdef T3_TRACE
4409	T3_TRACE5(TIDTB(sk),
4410		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4411		  "modulate %d",
4412		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4413		  modulate);
4414#endif
4415
4416	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4417}
4418
4419void
4420t3_init_wr_tab(unsigned int wr_len)
4421{
4422	int i;
4423
4424	if (mbuf_wrs[1])     /* already initialized */
4425		return;
4426
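	/*
	 * mbuf_wrs[i] is the number of work requests needed to send a
	 * payload with i scatter-gather entries: each pair of SGL entries
	 * occupies three flits, plus three flits for the WR and TX_DATA
	 * headers.
	 */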
4427	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4428		int sgl_len = (3 * i) / 2 + (i & 1);
4429
4430		sgl_len += 3;
4431		mbuf_wrs[i] = sgl_len <= wr_len ?
4432		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4433	}
4434
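	/* Cache the WR size in bytes (wr_len is given in 64-bit flits). */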
4435	wrlen = wr_len * 8;
4436}
4437
4438int
4439t3_init_cpl_io(void)
4440{
4441#ifdef notyet
4442	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4443	if (!tcphdr_skb) {
4444		log(LOG_ERR,
4445		       "Chelsio TCP offload: can't allocate sk_buff\n");
4446		return -1;
4447	}
4448	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4449	tcphdr_skb->h.raw = tcphdr_skb->data;
4450	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4451#endif
4452
4453	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4454	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4455	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4456	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4457	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4458	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4459	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4460	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4461	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4462	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4463	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4464	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4465	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4466	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4467	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4468	return (0);
4469}
4470
4471