cxgb_cpl_io.c revision 183113
/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183113 2008-09-17 15:49:44Z attilio $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>
#include <sys/vimage.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
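/*
 * A sketch of the indexing above, assuming iSCSI-style 4-byte CRC32 digests:
 * submode bit 0 selects a header digest and bit 1 a data digest, so the
 * compensating lengths come out to 0, 4, 4 and 8 bytes respectively.
 */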

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
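/*
 * INP_TOS shifts the two low-order (ECN) bits out of the inpcb's TOS byte,
 * since the TCB's TOS field holds only the upper six bits.
 */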

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

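/*
 * Debug wrapper around sbappendstream_locked(): walks both the existing
 * socket-buffer chain and the chain being appended, asserting that every
 * mbuf is either plain or an EXT_EXTREF external buffer and that no m_next
 * pointer has been poisoned, before and after the append.
 */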
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
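
/*
 * Overview of t3_push_frames() below: starting at the socket buffer's send
 * pointer, it packages unsent data into TX_DATA work requests, either as a
 * single immediate WR (payload <= IMM_LEN copied inline) or as a gather list
 * of up to TX_MAX_SEGS - 1 virtual addresses.  Each WR consumes credits
 * (tracked in tp_wr_avail, with the per-mbuf cost stashed in csum_data), and
 * a completion is requested roughly every half of tp_wr_max credits so the
 * card can return them.
 */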

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}

		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
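/*
 * Note: this macro mirrors the open-coded ESTABLISHED/FIN_WAIT_1/FIN_WAIT_2
 * state test in t3_cleanup_rbuf() below; keep the two in sync.
 */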

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}


	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 *
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}


static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};


static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	/* Check the toepcb before dereferencing it. */
	if (toep == NULL)
		return;
	tp = toep->tp_tp;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))
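/*
 * In other words, the workaround marks DDP buffer 0 valid but inactive with
 * offset 1 and length 2 -- a buffer that looks partially placed -- which is
 * enough to keep TP's push timer armed.
 */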

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);

}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

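/*
 * Program a DDP buffer's offset and length in the TCB.  Note that buffer 1's
 * offset and length share a single 64-bit TCB word, with the length held in
 * the upper 32 bits -- hence the << 32 shifts in the buf_idx != 0 case below.
 */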
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
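	/*
	 * The 40 bytes added/subtracted below are the minimal IPv4 (20) plus
	 * TCP (20) headers: the MTU table stores link MTUs, so MSS and MTU
	 * are converted back and forth by +/- 40.
	 */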
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
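
/*
 * Example: with rfc1323 enabled, a 256KB target window (262144 bytes) yields
 * wscale 3, since 262144 >> 3 = 32768 is the first shift that fits in the
 * 16-bit window field.
 */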

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;



	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		tp = inp_inpcbtotcpcb(inp);

		inp_wlock(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);


		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);

	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
	/* Note that we only account for CPL_GET_TCB messages issued by the
	 * DDP code.  We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx, in which case no further processing of
	 * this mbuf is required.  However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
1832	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1833		int state = so_state_get(so);
1834
1835		m_freem(m);
1836		if (__predict_true((state & SS_NOFDREF) == 0))
1837			so_sorwakeup_locked(so);
1838		else
1839			sockbuf_unlock(rcv);
1840
1841		return;
1842	}
1843
1844	bsp = &q->buf_state[q->cur_buf];
1845	hdr = cplhdr(m);
1846	tcb = (__be64 *)(hdr + 1);
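	/*
	 * The TCB is returned as an array of big-endian 64-bit words in
	 * reverse word order: 32-bit TCB word W lands in 64-bit word
	 * (31 - W) / 2, which is why the BUF0 offset read needs the extra
	 * 32-bit shift (its word sits in the upper half) while the BUF1
	 * offset read does not.
	 */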
1847	if (q->cur_buf == 0) {
1848		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1849		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1850	} else {
1851		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1852		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1853	}
1854	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1855	m->m_cur_offset = bsp->cur_offset;
1856	bsp->cur_offset = ddp_offset;
1857	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1858
1859	CTR5(KTR_TOM,
1860	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1861	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1862	KASSERT(ddp_offset >= m->m_cur_offset,
1863	    ("ddp_offset=%u less than cur_offset=%u",
1864		ddp_offset, m->m_cur_offset));
1865
1866#if 0
1867{
1868	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1869
1870	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1871	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1872
1873        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1874        rcv_nxt = t >> S_TCB_RCV_NXT;
1875        rcv_nxt &= M_TCB_RCV_NXT;
1876
1877        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1878        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1879        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1880
1881	T3_TRACE2(TIDTB(sk),
1882		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1883		  ddp_flags, rcv_nxt - rx_hdr_offset);
1884	T3_TRACE4(TB(q),
1885		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1886		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1887	T3_TRACE3(TB(q),
1888		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1889		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1890	T3_TRACE2(TB(q),
1891		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1892		 q->buf_state[0].flags, q->buf_state[1].flags);
1893
1894}
1895#endif
1896	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1897		handle_excess_rx(toep, m);
1898		return;
1899	}
1900
1901#ifdef T3_TRACE
1902	if ((int)m->m_pkthdr.len < 0) {
1903		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1904	}
1905#endif
1906	if (bsp->flags & DDP_BF_NOCOPY) {
1907#ifdef T3_TRACE
1908		T3_TRACE0(TB(q),
1909			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1910
1911		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1912			printk("!cancel_ubuf");
1913			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1914		}
1915#endif
1916		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1917		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1918		q->cur_buf ^= 1;
1919	} else if (bsp->flags & DDP_BF_NOFLIP) {
1920
1921		m->m_ddp_flags = 1;    /* always a kernel buffer */
1922
1923		/* now HW buffer carries a user buffer */
1924		bsp->flags &= ~DDP_BF_NOFLIP;
1925		bsp->flags |= DDP_BF_NOCOPY;
1926
1927		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1928		 * any new data, in which case we're done.  If in addition the
1929		 * offset is 0, then there wasn't a completion for the kbuf
1930		 * and we need to decrement the posted count.
1931		 */
1932		if (m->m_pkthdr.len == 0) {
1933			if (ddp_offset == 0) {
1934				q->kbuf_posted--;
1935				bsp->flags |= DDP_BF_NODATA;
1936			}
1937			sockbuf_unlock(rcv);
1938			m_free(m);
1939			return;
1940		}
1941	} else {
1942		sockbuf_unlock(rcv);
1943
1944		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1945		 * but it got here way late and nobody cares anymore.
1946		 */
1947		m_free(m);
1948		return;
1949	}
1950
1951	m->m_ddp_gl = (unsigned char *)bsp->gl;
1952	m->m_flags |= M_DDP;
1953	m->m_seq = tp->rcv_nxt;
1954	tp->rcv_nxt += m->m_pkthdr.len;
1955	tp->t_rcvtime = ticks;
1956	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1957		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1958	if (m->m_pkthdr.len == 0) {
1959		q->user_ddp_pending = 0;
1960		m_free(m);
1961	} else
1962		SBAPPEND(rcv, m);
1963
1964	state = so_state_get(so);
1965	if (__predict_true((state & SS_NOFDREF) == 0))
1966		so_sorwakeup_locked(so);
1967	else
1968		sockbuf_unlock(rcv);
1969}
1970
1971/*
1972 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1973 * in that case they are similar to DDP completions.
1974 */
1975static int
1976do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1977{
1978	struct toepcb *toep = (struct toepcb *)ctx;
1979
1980	/* OK if socket doesn't exist */
1981	if (toep == NULL) {
1982		printf("null toep in do_get_tcb_rpl\n");
1983		return (CPL_RET_BUF_DONE);
1984	}
1985
1986	inp_wlock(toep->tp_tp->t_inpcb);
1987	tcb_rpl_as_ddp_complete(toep, m);
1988	inp_wunlock(toep->tp_tp->t_inpcb);
1989
1990	return (0);
1991}
1992
1993static void
1994handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1995{
1996	struct tcpcb *tp = toep->tp_tp;
1997	struct socket *so;
1998	struct ddp_state *q;
1999	struct ddp_buf_state *bsp;
2000	struct cpl_rx_data *hdr = cplhdr(m);
2001	unsigned int rcv_nxt = ntohl(hdr->seq);
2002	struct sockbuf *rcv;
2003
2004	if (tp->rcv_nxt == rcv_nxt)
2005		return;
2006
2007	inp_lock_assert(tp->t_inpcb);
2008	so  = inp_inpcbtosocket(tp->t_inpcb);
2009	rcv = so_sockbuf_rcv(so);
2010	sockbuf_lock(rcv);
2011
2012	q = &toep->tp_ddp_state;
2013	bsp = &q->buf_state[q->cur_buf];
2014	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not greater than tp->rcv_nxt=0x%08x",
2015		rcv_nxt, tp->rcv_nxt));
2016	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2017	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2018	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2019	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2020
2021#ifdef T3_TRACE
2022	if ((int)m->m_pkthdr.len < 0) {
2023		t3_ddp_error(so, "handle_ddp_data: neg len");
2024	}
2025#endif
2026	m->m_ddp_gl = (unsigned char *)bsp->gl;
2027	m->m_flags |= M_DDP;
2028	m->m_cur_offset = bsp->cur_offset;
2029	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2030	if (bsp->flags & DDP_BF_NOCOPY)
2031		bsp->flags &= ~DDP_BF_NOCOPY;
2032
2033	m->m_seq = tp->rcv_nxt;
2034	tp->rcv_nxt = rcv_nxt;
2035	bsp->cur_offset += m->m_pkthdr.len;
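	/* Advance to the other HW buffer unless flipping is suppressed. */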
2036	if (!(bsp->flags & DDP_BF_NOFLIP))
2037		q->cur_buf ^= 1;
2038	/*
2039	 * For now, don't re-enable DDP after a connection fell out of DDP
2040	 * mode.
2041	 */
2042	q->ubuf_ddp_ready = 0;
2043	sockbuf_unlock(rcv);
2044}
2045
2046/*
2047 * Process new data received for a connection.
2048 */
2049static void
2050new_rx_data(struct toepcb *toep, struct mbuf *m)
2051{
2052	struct cpl_rx_data *hdr = cplhdr(m);
2053	struct tcpcb *tp = toep->tp_tp;
2054	struct socket *so;
2055	struct sockbuf *rcv;
2056	int state;
2057	int len = be16toh(hdr->len);
2058
2059	inp_wlock(tp->t_inpcb);
2060
2061	so  = inp_inpcbtosocket(tp->t_inpcb);
2062
2063	if (__predict_false(so_no_receive(so))) {
2064		handle_excess_rx(toep, m);
2065		inp_wunlock(tp->t_inpcb);
2066		TRACE_EXIT;
2067		return;
2068	}
2069
2070	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2071		handle_ddp_data(toep, m);
2072
2073	m->m_seq = ntohl(hdr->seq);
2074	m->m_ulp_mode = 0;                    /* for iSCSI */
2075
2076#if VALIDATE_SEQ
2077	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2078		log(LOG_ERR,
2079		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2080		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2081		       tp->rcv_nxt);
2082		m_freem(m);
2083		inp_wunlock(tp->t_inpcb);
2084		return;
2085	}
2086#endif
2087	m_adj(m, sizeof(*hdr));
2088
2089#ifdef URGENT_DATA_SUPPORTED
2090	/*
2091	 * We don't handle urgent data yet
2092	 */
2093	if (__predict_false(hdr->urg))
2094		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2095	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2096		     tp->urg_seq - tp->rcv_nxt < skb->len))
2097		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2098							 tp->rcv_nxt];
2099#endif
2100	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2101		toep->tp_delack_mode = hdr->dack_mode;
2102		toep->tp_delack_seq = tp->rcv_nxt;
2103	}
2104	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2105	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2106
2107	if (len < m->m_pkthdr.len)
2108		m->m_pkthdr.len = m->m_len = len;
2109
2110	tp->rcv_nxt += m->m_pkthdr.len;
2111	tp->t_rcvtime = ticks;
2112	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2113	CTR2(KTR_TOM,
2114	    "new_rx_data: seq 0x%x len %u",
2115	    m->m_seq, m->m_pkthdr.len);
2116	inp_wunlock(tp->t_inpcb);
2117	rcv = so_sockbuf_rcv(so);
2118	sockbuf_lock(rcv);
2119#if 0
2120	if (sb_notify(rcv))
2121		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2122#endif
2123	SBAPPEND(rcv, m);
2124
2125#ifdef notyet
2126	/*
2127	 * We're giving too many credits to the card - but disable this check
2128	 * so we can keep on moving :-|
2129	 */
2130	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2131	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2132		so, rcv->sb_cc, rcv->sb_mbmax));
2133#endif
2134
2137	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2138	    rcv->sb_cc, rcv->sb_mbcnt);
2139
2140	state = so_state_get(so);
2141	if (__predict_true((state & SS_NOFDREF) == 0))
2142		so_sorwakeup_locked(so);
2143	else
2144		sockbuf_unlock(rcv);
2145}
2146
2147/*
2148 * Handler for RX_DATA CPL messages.
2149 */
2150static int
2151do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2152{
2153	struct toepcb *toep = (struct toepcb *)ctx;
2154
2155	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2156
2157	new_rx_data(toep, m);
2158
2159	return (0);
2160}
2161
2162static void
2163new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2164{
2165	struct tcpcb *tp;
2166	struct ddp_state *q;
2167	struct ddp_buf_state *bsp;
2168	struct cpl_rx_data_ddp *hdr;
2169	struct socket *so;
2170	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2171	int nomoredata = 0;
2172	unsigned int delack_mode;
2173	struct sockbuf *rcv;
2174
2175	tp = toep->tp_tp;
2176	inp_wlock(tp->t_inpcb);
2177	so = inp_inpcbtosocket(tp->t_inpcb);
2178
2179	if (__predict_false(so_no_receive(so))) {
2180
2181		handle_excess_rx(toep, m);
2182		inp_wunlock(tp->t_inpcb);
2183		return;
2184	}
2185
2186	q = &toep->tp_ddp_state;
2187	hdr = cplhdr(m);
2188	ddp_report = ntohl(hdr->u.ddp_report);
2189	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2190	bsp = &q->buf_state[buf_idx];
2191
2192	CTR4(KTR_TOM,
2193	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2194	    "hdr seq 0x%x len %u",
2195	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2196	    ntohs(hdr->len));
2197	CTR3(KTR_TOM,
2198	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2199	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2200
2201	ddp_len = ntohs(hdr->len);
2202	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2203
2204	delack_mode = G_DDP_DACK_MODE(ddp_report);
2205	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2206		toep->tp_delack_mode = delack_mode;
2207		toep->tp_delack_seq = tp->rcv_nxt;
2208	}
2209
2210	m->m_seq = tp->rcv_nxt;
2211	tp->rcv_nxt = rcv_nxt;
2212
2213	tp->t_rcvtime = ticks;
2214	/*
2215	 * Store the length in m->m_len.  We are changing the meaning of
2216	 * m->m_len here, so we need to be very careful that nothing from now
2217	 * on interprets m_len of this packet the usual way.
2218	 */
2219	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2220	inp_wunlock(tp->t_inpcb);
2221	CTR3(KTR_TOM,
2222	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2223	    m->m_len, rcv_nxt, m->m_seq);
2224	/*
2225	 * Figure out where the new data was placed in the buffer and store it
2226	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the
2227	 * consumer needs to account for the page pod's pg_offset.
2228	 */
2229	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2230	m->m_cur_offset = end_offset - m->m_pkthdr.len;
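	/*
	 * E.g. a report with DDP offset 12288 and ddp_len 4096 ends at
	 * 16384; with m_pkthdr.len == 4096 the data then begins at
	 * m_cur_offset 12288.
	 */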
2231
2232	rcv = so_sockbuf_rcv(so);
2233	sockbuf_lock(rcv);
2234
2235	m->m_ddp_gl = (unsigned char *)bsp->gl;
2236	m->m_flags |= M_DDP;
2237	bsp->cur_offset = end_offset;
2238	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2239
2240	/*
2241	 * Length is only meaningful for kbuf
2242	 */
2243	if (!(bsp->flags & DDP_BF_NOCOPY))
2244		KASSERT(m->m_len <= bsp->gl->dgl_length,
2245		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2246			m->m_len, bsp->gl->dgl_length));
2247
2248	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2249	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2250	/*
2251	 * Bit 0 of flags stores whether the DDP buffer is completed.
2252	 * Note that other parts of the code depend on this being in bit 0.
2253	 */
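	/*
	 * The other m_ddp_flags bits reuse the DDP_BF_* values used
	 * throughout this file: PSH mirrors the wire PSH, NOCOPY marks a
	 * user-buffer placement and NODATA an empty kernel-buffer completion.
	 */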
2254	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2255		panic("spurious ddp completion");
2256	} else {
2257		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2258		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2259			q->cur_buf ^= 1;                     /* flip buffers */
2260	}
2261
2262	if (bsp->flags & DDP_BF_NOCOPY) {
2263		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2264		bsp->flags &= ~DDP_BF_NOCOPY;
2265	}
2266
2267	if (ddp_report & F_DDP_PSH)
2268		m->m_ddp_flags |= DDP_BF_PSH;
2269	if (nomoredata)
2270		m->m_ddp_flags |= DDP_BF_NODATA;
2271
2272#ifdef notyet
2273	skb_reset_transport_header(skb);
2274	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2275#endif
2276	SBAPPEND(rcv, m);
2277
2278	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2279	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2280		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2281		so_sorwakeup_locked(so);
2282	else
2283		sockbuf_unlock(rcv);
2284}
2285
2286#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2287		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2288		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2289		 F_DDP_INVALID_PPOD)
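
/*
 * Any of these ddpvld_status bits marks a DDP placement error;
 * do_rx_data_ddp() below logs such errors and drops the message.
 */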
2290
2291/*
2292 * Handler for RX_DATA_DDP CPL messages.
2293 */
2294static int
2295do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2296{
2297	struct toepcb *toep = ctx;
2298	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2299
2300	VALIDATE_SOCK(so);
2301
2302	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2303		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2304		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2305		return (CPL_RET_BUF_DONE);
2306	}
2307#if 0
2308	skb->h.th = tcphdr_skb->h.th;
2309#endif
2310	new_rx_data_ddp(toep, m);
2311	return (0);
2312}
2313
2314static void
2315process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2316{
2317	struct tcpcb *tp = toep->tp_tp;
2318	struct socket *so;
2319	struct ddp_state *q;
2320	struct ddp_buf_state *bsp;
2321	struct cpl_rx_ddp_complete *hdr;
2322	unsigned int ddp_report, buf_idx, when, delack_mode;
2323	int nomoredata = 0;
2324	struct sockbuf *rcv;
2325
2326	inp_wlock(tp->t_inpcb);
2327	so = inp_inpcbtosocket(tp->t_inpcb);
2328
2329	if (__predict_false(so_no_receive(so))) {
2330		struct inpcb *inp = so_sotoinpcb(so);
2331
2332		handle_excess_rx(toep, m);
2333		inp_wunlock(inp);
2334		return;
2335	}
2336	q = &toep->tp_ddp_state;
2337	hdr = cplhdr(m);
2338	ddp_report = ntohl(hdr->ddp_report);
2339	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2340	m->m_pkthdr.csum_data = tp->rcv_nxt;
2341
2342	rcv = so_sockbuf_rcv(so);
2343	sockbuf_lock(rcv);
2344
2345	bsp = &q->buf_state[buf_idx];
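	/*
	 * 'when' is the buffer offset before this completion; the offset in
	 * the DDP report marks the end, so the difference is the newly
	 * placed data.
	 */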
2346	when = bsp->cur_offset;
2347	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2348	tp->rcv_nxt += m->m_len;
2349	tp->t_rcvtime = ticks;
2350
2351	delack_mode = G_DDP_DACK_MODE(ddp_report);
2352	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2353		toep->tp_delack_mode = delack_mode;
2354		toep->tp_delack_seq = tp->rcv_nxt;
2355	}
2356#ifdef notyet
2357	skb_reset_transport_header(skb);
2358	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2359#endif
2360	inp_wunlock(tp->t_inpcb);
2361
2362	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2363	CTR5(KTR_TOM,
2364		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2365		  "ddp_report 0x%x offset %u, len %u",
2366		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2367		   G_DDP_OFFSET(ddp_report), m->m_len);
2368
2369	m->m_cur_offset = bsp->cur_offset;
2370	bsp->cur_offset += m->m_len;
2371
2372	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2373		q->cur_buf ^= 1;                     /* flip buffers */
2374		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2375			nomoredata = 1;
2376	}
2377
2378	CTR4(KTR_TOM,
2379		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2380		  "ddp_report %u offset %u",
2381		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2382		   G_DDP_OFFSET(ddp_report));
2383
2384	m->m_ddp_gl = (unsigned char *)bsp->gl;
2385	m->m_flags |= M_DDP;
2386	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2387	if (bsp->flags & DDP_BF_NOCOPY)
2388		bsp->flags &= ~DDP_BF_NOCOPY;
2389	if (nomoredata)
2390		m->m_ddp_flags |= DDP_BF_NODATA;
2391
2392	SBAPPEND(rcv, m);
2393	if ((so_state_get(so) & SS_NOFDREF) == 0)
2394		so_sorwakeup_locked(so);
2395	else
2396		sockbuf_unlock(rcv);
2397}
2398
2399/*
2400 * Handler for RX_DDP_COMPLETE CPL messages.
2401 */
2402static int
2403do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2404{
2405	struct toepcb *toep = ctx;
2406
2407	VALIDATE_SOCK(so);
2408#if 0
2409	skb->h.th = tcphdr_skb->h.th;
2410#endif
2411	process_ddp_complete(toep, m);
2412	return (0);
2413}
2414
2415/*
2416 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2417 * socket state before calling tcp_offload_twstart() to comply with its expectations.
2418 */
2419static void
2420enter_timewait(struct tcpcb *tp)
2421{
2422	/*
2423	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2424	 * process peer_close because we don't want to carry the peer FIN in
2425	 * the socket's receive queue and if we increment rcv_nxt without
2426	 * having the FIN in the receive queue we'll confuse facilities such
2427	 * as SIOCINQ.
2428	 */
2429	inp_wlock(tp->t_inpcb);
2430	tp->rcv_nxt++;
2431
2432	tp->ts_recent_age = 0;	     /* defeat recycling */
2433	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2434	inp_wunlock(tp->t_inpcb);
2435	tcp_offload_twstart(tp);
2436}
2437
2438/*
2439 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2440 * function deals with the data that may be reported along with the FIN.
2441 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2442 * perform normal FIN-related processing.  In the latter case 1 indicates that
2443 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
2444 * that the mbuf can be freed.
2445 */
2446static int
2447handle_peer_close_data(struct socket *so, struct mbuf *m)
2448{
2449	struct tcpcb *tp = so_sototcpcb(so);
2450	struct toepcb *toep = tp->t_toe;
2451	struct ddp_state *q;
2452	struct ddp_buf_state *bsp;
2453	struct cpl_peer_close *req = cplhdr(m);
2454	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2455	struct sockbuf *rcv;
2456
2457	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2458		return (0);
2459
2460	CTR0(KTR_TOM, "handle_peer_close_data");
2461	if (__predict_false(so_no_receive(so))) {
2462		handle_excess_rx(toep, m);
2463
2464		/*
2465		 * Although we discard the data we want to process the FIN so
2466		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2467		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2468		 * may be what will close the connection.  We return 1 because
2469		 * handle_excess_rx() already freed the packet.
2470		 */
2471		return (1);
2472	}
2473
2474	inp_lock_assert(tp->t_inpcb);
2475	q = &toep->tp_ddp_state;
2476	rcv = so_sockbuf_rcv(so);
2477	sockbuf_lock(rcv);
2478
2479	bsp = &q->buf_state[q->cur_buf];
2480	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2481	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2482	m->m_ddp_gl = (unsigned char *)bsp->gl;
2483	m->m_flags |= M_DDP;
2484	m->m_cur_offset = bsp->cur_offset;
2485	m->m_ddp_flags =
2486	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2487	m->m_seq = tp->rcv_nxt;
2488	tp->rcv_nxt = rcv_nxt;
2489	bsp->cur_offset += m->m_pkthdr.len;
2490	if (!(bsp->flags & DDP_BF_NOFLIP))
2491		q->cur_buf ^= 1;
2492#ifdef notyet
2493	skb_reset_transport_header(skb);
2494	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2495#endif
2496	tp->t_rcvtime = ticks;
2497	SBAPPEND(rcv, m);
2498	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2499		so_sorwakeup_locked(so);
2500	else
2501		sockbuf_unlock(rcv);
2502
2503	return (1);
2504}
2505
2506/*
2507 * Handle a peer FIN.
2508 */
2509static void
2510do_peer_fin(struct toepcb *toep, struct mbuf *m)
2511{
2512	struct socket *so;
2513	struct tcpcb *tp = toep->tp_tp;
2514	int keep, action;
2515
2516	action = keep = 0;
2517	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2518	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2519		printf("abort_pending set\n");
2520
2521		goto out;
2522	}
2523	inp_wlock(tp->t_inpcb);
2524	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2525	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2526		keep = handle_peer_close_data(so, m);
2527		if (keep < 0) {
2528			inp_wunlock(tp->t_inpcb);
2529			return;
2530		}
2531	}
2532	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2533		CTR1(KTR_TOM,
2534		    "waking up waiters for cantrcvmore on %p ", so);
2535		socantrcvmore(so);
2536
2537		/*
2538		 * If connection is half-synchronized
2539		 * (ie NEEDSYN flag on) then delay ACK,
2540		 * so it may be piggybacked when SYN is sent.
2541		 * Otherwise, since we received a FIN then no
2542		 * more input can be expected, send ACK now.
2543		 */
2544		if (tp->t_flags & TF_NEEDSYN)
2545			tp->t_flags |= TF_DELACK;
2546		else
2547			tp->t_flags |= TF_ACKNOW;
2548		tp->rcv_nxt++;
2549	}
2550
2551	switch (tp->t_state) {
2552	case TCPS_SYN_RECEIVED:
2553		tp->t_starttime = ticks;
2554	/* FALLTHROUGH */
2555	case TCPS_ESTABLISHED:
2556		tp->t_state = TCPS_CLOSE_WAIT;
2557		break;
2558	case TCPS_FIN_WAIT_1:
2559		tp->t_state = TCPS_CLOSING;
2560		break;
2561	case TCPS_FIN_WAIT_2:
2562		/*
2563		 * If we've sent an abort_req we must have sent it too late,
2564		 * HW will send us a reply telling us so, and this peer_close
2565		 * is really the last message for this connection and needs to
2566		 * be treated as an abort_rpl, i.e., transition the connection
2567		 * to TCP_CLOSE (note that the host stack does this at the
2568		 * time of generating the RST but we must wait for HW).
2569		 * Otherwise we enter TIME_WAIT.
2570		 */
2571		t3_release_offload_resources(toep);
2572		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2573			action = TCP_CLOSE;
2574		} else {
2575			action = TCP_TIMEWAIT;
2576		}
2577		break;
2578	default:
2579		log(LOG_ERR,
2580		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2581		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2582	}
2583	inp_wunlock(tp->t_inpcb);
2584
2585	if (action == TCP_TIMEWAIT) {
2586		enter_timewait(tp);
2587	} else if (action == TCP_DROP) {
2588		tcp_offload_drop(tp, 0);
2589	} else if (action == TCP_CLOSE) {
2590		tcp_offload_close(tp);
2591	}
2592
2593#ifdef notyet
2594	/* Do not send POLL_HUP for half duplex close. */
2595	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2596	    sk->sk_state == TCP_CLOSE)
2597		sk_wake_async(so, 1, POLL_HUP);
2598	else
2599		sk_wake_async(so, 1, POLL_IN);
2600#endif
2601
2602out:
2603	if (!keep)
2604		m_free(m);
2605}
2606
2607/*
2608 * Handler for PEER_CLOSE CPL messages.
2609 */
2610static int
2611do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2612{
2613	struct toepcb *toep = (struct toepcb *)ctx;
2614
2615	VALIDATE_SOCK(so);
2616
2617	do_peer_fin(toep, m);
2618	return (0);
2619}
2620
2621static void
2622process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2623{
2624	struct cpl_close_con_rpl *rpl = cplhdr(m);
2625	struct tcpcb *tp = toep->tp_tp;
2626	struct socket *so;
2627	int action = 0;
2628	struct sockbuf *rcv;
2629
2630	inp_wlock(tp->t_inpcb);
2631	so = inp_inpcbtosocket(tp->t_inpcb);
2632
2633	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2634
2635	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2636		inp_wunlock(tp->t_inpcb);
2637		goto out;
2638	}
2639
2640	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2641	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2642
2643	switch (tp->t_state) {
2644	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2645		t3_release_offload_resources(toep);
2646		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2647			action = TCP_CLOSE;
2649		} else {
2650			action = TCP_TIMEWAIT;
2651		}
2652		break;
2653	case TCPS_LAST_ACK:
2654		/*
2655		 * In this state we don't care about pending abort_rpl.
2656		 * If we've sent abort_req it was post-close and was sent too
2657		 * late, this close_con_rpl is the actual last message.
2658		 */
2659		t3_release_offload_resources(toep);
2660		action = TCP_CLOSE;
2661		break;
2662	case TCPS_FIN_WAIT_1:
2663		/*
2664		 * If we can't receive any more
2665		 * data, then closing user can proceed.
2666		 * Starting the timer is contrary to the
2667		 * specification, but if we don't get a FIN
2668		 * we'll hang forever.
2669		 *
2670		 * XXXjl:
2671		 * we should release the tp also, and use a
2672		 * compressed state.
2673		 */
2674		if (so)
2675			rcv = so_sockbuf_rcv(so);
2676		else
2677			break;
2678
2679		if (rcv->sb_state & SBS_CANTRCVMORE) {
2680			int timeout;
2681
2682			if (so)
2683				soisdisconnected(so);
2684			timeout = (tcp_fast_finwait2_recycle) ?
2685			    tcp_finwait2_timeout : tcp_maxidle;
2686			tcp_timer_activate(tp, TT_2MSL, timeout);
2687		}
2688		tp->t_state = TCPS_FIN_WAIT_2;
2689		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2690		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2691			action = TCP_DROP;
2692		}
2693
2694		break;
2695	default:
2696		log(LOG_ERR,
2697		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2698		       toep->tp_toedev->tod_name, toep->tp_tid,
2699		       tp->t_state);
2700	}
2701	inp_wunlock(tp->t_inpcb);
2702
2704	if (action == TCP_TIMEWAIT) {
2705		enter_timewait(tp);
2706	} else if (action == TCP_DROP) {
2707		tcp_offload_drop(tp, 0);
2708	} else if (action == TCP_CLOSE) {
2709		tcp_offload_close(tp);
2710	}
2711out:
2712	m_freem(m);
2713}
2714
2715/*
2716 * Handler for CLOSE_CON_RPL CPL messages.
2717 */
2718static int
2719do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2720			    void *ctx)
2721{
2722	struct toepcb *toep = (struct toepcb *)ctx;
2723
2724	process_close_con_rpl(toep, m);
2725	return (0);
2726}
2727
2728/*
2729 * Process abort replies.  We only process these messages if we anticipate
2730 * them as the coordination between SW and HW in this area is somewhat lacking
2731 * and sometimes we get ABORT_RPLs after we are done with the connection that
2732 * originated the ABORT_REQ.
2733 */
2734static void
2735process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2736{
2737	struct tcpcb *tp = toep->tp_tp;
2738	struct socket *so;
2739	int needclose = 0;
2740
2741#ifdef T3_TRACE
2742	T3_TRACE1(TIDTB(sk),
2743		  "process_abort_rpl: GTS rpl pending %d",
2744		  sock_flag(sk, ABORT_RPL_PENDING));
2745#endif
2746
2747	inp_wlock(tp->t_inpcb);
2748	so = inp_inpcbtosocket(tp->t_inpcb);
2749
2750	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2751		/*
2752		 * XXX panic on tcpdrop
2753		 */
2754		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2755			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2756		else {
2757			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2758			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2759			    !is_t3a(toep->tp_toedev)) {
2760				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2761					panic("TP_ABORT_REQ_RCVD set");
2762				t3_release_offload_resources(toep);
2763				needclose = 1;
2764			}
2765		}
2766	}
2767	inp_wunlock(tp->t_inpcb);
2768
2769	if (needclose)
2770		tcp_offload_close(tp);
2771
2772	m_free(m);
2773}
2774
2775/*
2776 * Handle an ABORT_RPL_RSS CPL message.
2777 */
2778static int
2779do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2780{
2781	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2782	struct toepcb *toep;
2783
2784	/*
2785	 * Ignore replies to post-close aborts indicating that the abort was
2786	 * requested too late.  These connections are terminated when we get
2787	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2788	 * arrives the TID is either no longer used or it has been recycled.
2789	 */
2790	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2791discard:
2792		m_free(m);
2793		return (0);
2794	}
2795
2796	toep = (struct toepcb *)ctx;
2797
2798	/*
2799	 * Sometimes we've already closed the socket, e.g., a post-close
2800	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2801	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2802	 * but FW turns the ABORT_REQ into a regular one and so we get
2803	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2804	 */
2805	if (!toep)
2806		goto discard;
2807
2808	if (toep->tp_tp == NULL) {
2809		log(LOG_NOTICE, "removing tid for abort\n");
2810		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2811		if (toep->tp_l2t)
2812			l2t_release(L2DATA(cdev), toep->tp_l2t);
2813
2814		toepcb_release(toep);
2815		goto discard;
2816	}
2817
2818	log(LOG_NOTICE, "toep=%p\n", toep);
2819	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2820
2821	toepcb_hold(toep);
2822	process_abort_rpl(toep, m);
2823	toepcb_release(toep);
2824	return (0);
2825}
2826
2827/*
2828 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2829 * indicate whether RST should be sent in response.
2830 */
2831static int
2832abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2833{
2834	struct tcpcb *tp = so_sototcpcb(so);
2835
2836	switch (abort_reason) {
2837	case CPL_ERR_BAD_SYN:
2838#if 0
2839		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2840#endif
2841	case CPL_ERR_CONN_RESET:
2842		// XXX need to handle SYN_RECV due to crossed SYNs
2843		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2844	case CPL_ERR_XMIT_TIMEDOUT:
2845	case CPL_ERR_PERSIST_TIMEDOUT:
2846	case CPL_ERR_FINWAIT2_TIMEDOUT:
2847	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2848#if 0
2849		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2850#endif
2851		return (ETIMEDOUT);
2852	default:
2853		return (EIO);
2854	}
2855}
2856
2857static inline void
2858set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2859{
2860	struct cpl_abort_rpl *rpl = cplhdr(m);
2861
2862	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2863	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2864	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2865
2866	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2867	rpl->cmd = cmd;
2868}
2869
2870static void
2871send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2872{
2873	struct mbuf *reply_mbuf;
2874	struct cpl_abort_req_rss *req = cplhdr(m);
2875
2876	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2877	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2879	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2880	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2881	m_free(m);
2882}
2883
2884/*
2885 * Returns whether an ABORT_REQ_RSS message is a negative advice.  Negative
 * advice reports transient trouble (retransmit or persist timeouts) on a
 * connection the hardware still considers alive, so it must not trigger a
 * teardown.
2886 */
2887static inline int
2888is_neg_adv_abort(unsigned int status)
2889{
2890	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2891	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2892}
2893
2894static void
2895send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2896{
2897	struct mbuf  *reply_mbuf;
2898	struct cpl_abort_req_rss *req = cplhdr(m);
2899
2900	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2901
2902	if (!reply_mbuf) {
2903		/* Defer the reply.  Stick rst_status into req->status. */
2904		req->status = rst_status;
2905		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2906		return;
2907	}
2908
2909	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2910	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2911	m_free(m);
2912
2913	/*
2914	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2915	 * these messages while ARP is pending.  For other connection states
2916	 * it's not a problem.
2917	 */
2918	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2919}
2920
2921#ifdef notyet
2922static void
2923cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2924{
2925	CXGB_UNIMPLEMENTED();
2926#ifdef notyet
2927	struct request_sock *req = child->sk_user_data;
2928
2929	inet_csk_reqsk_queue_removed(parent, req);
2930	synq_remove(tcp_sk(child));
2931	__reqsk_free(req);
2932	child->sk_user_data = NULL;
2933#endif
2934}
2935
2936
2937/*
2938 * Performs the actual work to abort a SYN_RECV connection.
2939 */
2940static void
2941do_abort_syn_rcv(struct socket *child, struct socket *parent)
2942{
2943	struct tcpcb *parenttp = so_sototcpcb(parent);
2944	struct tcpcb *childtp = so_sototcpcb(child);
2945
2946	/*
2947	 * If the server is still open we clean up the child connection,
2948	 * otherwise the server already did the clean up as it was purging
2949	 * its SYN queue and the skb was just sitting in its backlog.
2950	 */
2951	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2952		cleanup_syn_rcv_conn(child, parent);
2953		inp_wlock(childtp->t_inpcb);
2954		t3_release_offload_resources(childtp->t_toe);
2955		inp_wunlock(childtp->t_inpcb);
2956		tcp_offload_close(childtp);
2957	}
2958}
2959#endif
2960
2961/*
2962 * Handle abort requests for a SYN_RECV connection.  These need extra work
2963 * because the socket is on its parent's SYN queue.
2964 */
2965static int
2966abort_syn_rcv(struct socket *so, struct mbuf *m)
2967{
2968	CXGB_UNIMPLEMENTED();
2969#ifdef notyet
2970	struct socket *parent;
2971	struct toedev *tdev = toep->tp_toedev;
2972	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2973	struct socket *oreq = so->so_incomp;
2974	struct t3c_tid_entry *t3c_stid;
2975	struct tid_info *t;
2976
2977	if (!oreq)
2978		return -1;        /* somehow we are not on the SYN queue */
2979
2980	t = &(T3C_DATA(cdev))->tid_maps;
2981	t3c_stid = lookup_stid(t, oreq->ts_recent);
2982	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2983
2984	so_lock(parent);
2985	do_abort_syn_rcv(so, parent);
2986	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2987	so_unlock(parent);
2988#endif
2989	return (0);
2990}
2991
2992/*
2993 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2994 * request except that we need to reply to it.
2995 */
2996static void
2997process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2998{
2999	int rst_status = CPL_ABORT_NO_RST;
3000	const struct cpl_abort_req_rss *req = cplhdr(m);
3001	struct tcpcb *tp = toep->tp_tp;
3002	struct socket *so;
3003	int needclose = 0;
3004
3005	inp_wlock(tp->t_inpcb);
3006	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3007	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3008		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3009		m_free(m);
3010		goto skip;
3011	}
3012
3013	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3014	/*
3015	 * Three cases to consider:
3016	 * a) We haven't sent an abort_req; close the connection.
3017	 * b) We have sent a post-close abort_req that will get to TP too late
3018	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3019	 *    be ignored and the connection should be closed now.
3020	 * c) We have sent a regular abort_req that will get to TP too late.
3021	 *    That will generate an abort_rpl with status 0, wait for it.
3022	 */
3023	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3024	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3025		int error;
3026
3027		error = abort_status_to_errno(so, req->status,
3028		    &rst_status);
3029		so_error_set(so, error);
3030
3031		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3032			so_sorwakeup(so);
3033		/*
3034		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3035		 * returns 0 it has taken care of the abort.
3036		 */
3037		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3038			goto skip;
3039
3040		t3_release_offload_resources(toep);
3041		needclose = 1;
3042	}
3043	inp_wunlock(tp->t_inpcb);
3044
3045	if (needclose)
3046		tcp_offload_close(tp);
3047
3048	send_abort_rpl(m, tdev, rst_status);
3049	return;
3050skip:
3051	inp_wunlock(tp->t_inpcb);
3052}
3053
3054/*
3055 * Handle an ABORT_REQ_RSS CPL message.
3056 */
3057static int
3058do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3059{
3060	const struct cpl_abort_req_rss *req = cplhdr(m);
3061	struct toepcb *toep = (struct toepcb *)ctx;
3062
3063	if (is_neg_adv_abort(req->status)) {
3064		m_free(m);
3065		return (0);
3066	}
3067
3068	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3069
3070	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3071		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3072		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3073
3074		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3075		if (toep->tp_l2t)
3076			l2t_release(L2DATA(cdev), toep->tp_l2t);
3077
3078		/*
3079		 *  Unhook
3080		 */
3081		toep->tp_tp->t_toe = NULL;
3082		toep->tp_tp->t_flags &= ~TF_TOE;
3083		toep->tp_tp = NULL;
3084		/*
3085		 * XXX need to call syncache_chkrst - but we don't
3086		 * have a way of doing that yet
3087		 */
3088		toepcb_release(toep);
3089		log(LOG_ERR, "abort for unestablished connection :-(\n");
3090		return (0);
3091	}
3092	if (toep->tp_tp == NULL) {
3093		log(LOG_NOTICE, "disconnected toepcb\n");
3094		/* should be freed momentarily */
3095		return (0);
3096	}
3097
3099	toepcb_hold(toep);
3100	process_abort_req(toep, m, toep->tp_toedev);
3101	toepcb_release(toep);
3102	return (0);
3103}
3104#ifdef notyet
3105static void
3106pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3107{
3108	struct toedev *tdev = TOE_DEV(parent);
3109
3110	do_abort_syn_rcv(child, parent);
3111	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3112		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3113
3114		rpl->opt0h = htonl(F_TCAM_BYPASS);
3115		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3116		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3117	} else
3118		m_free(m);
3119}
3120#endif
3121static void
3122handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3123{
3124	CXGB_UNIMPLEMENTED();
3125
3126#ifdef notyet
3127	struct t3cdev *cdev;
3128	struct socket *parent;
3129	struct socket *oreq;
3130	struct t3c_tid_entry *t3c_stid;
3131	struct tid_info *t;
3132	struct tcpcb *otp, *tp = so_sototcpcb(so);
3133	struct toepcb *toep = tp->t_toe;
3134
3135	/*
3136	 * If the connection is being aborted due to the parent listening
3137	 * socket going away there's nothing to do, the ABORT_REQ will close
3138	 * the connection.
3139	 */
3140	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3141		m_free(m);
3142		return;
3143	}
3144
3145	oreq = so->so_incomp;
3146	otp = so_sototcpcb(oreq);
3147
3148	cdev = T3C_DEV(so);
3149	t = &(T3C_DATA(cdev))->tid_maps;
3150	t3c_stid = lookup_stid(t, otp->ts_recent);
3151	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3152
3153	so_lock(parent);
3154	pass_open_abort(so, parent, m);
3155	so_unlock(parent);
3156#endif
3157}
3158
3159/*
3160 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3161 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3162 * connection.
3163 */
3164static void
3165pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3166{
3167
3168#ifdef notyet
3169	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3170	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3171#endif
3172	handle_pass_open_arp_failure(m_get_socket(m), m);
3173}
3174
3175/*
3176 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3177 */
3178static void
3179mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3180{
3181	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3182	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3183	unsigned int tid = GET_TID(req);
3184
3185	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3186	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3187	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3188	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3189	rpl->opt0h = htonl(F_TCAM_BYPASS);
3190	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3191	rpl->opt2 = 0;
3192	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3193}
3194
3195/*
3196 * Send a deferred reject to an accept request.
3197 */
3198static void
3199reject_pass_request(struct toedev *tdev, struct mbuf *m)
3200{
3201	struct mbuf *reply_mbuf;
3202
3203	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3204	mk_pass_accept_rpl(reply_mbuf, m);
3205	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3206	m_free(m);
3207}
3208
3209static void
3210handle_syncache_event(int event, void *arg)
3211{
3212	struct toepcb *toep = arg;
3213
3214	switch (event) {
3215	case TOE_SC_ENTRY_PRESENT:
3216		/*
3217		 * entry already exists - free toepcb
3218		 * and l2t
3219		 */
3220		printf("syncache entry present\n");
3221		toepcb_release(toep);
3222		break;
3223	case TOE_SC_DROP:
3224		/*
3225		 * The syncache has given up on this entry:
3226		 * either it timed out or it was evicted.
3227		 * We need to explicitly release the tid.
3228		 */
3229		printf("syncache entry dropped\n");
3230		toepcb_release(toep);
3231		break;
3232	default:
3233		log(LOG_ERR, "unknown syncache event %d\n", event);
3234		break;
3235	}
3236}
3237
3238static void
3239syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3240{
3241	struct in_conninfo inc;
3242	struct tcpopt to;
3243	struct tcphdr th;
3244	struct inpcb *inp;
3245	int mss, wsf, sack, ts;
3246	uint32_t rcv_isn = ntohl(req->rcv_isn);
3247
3248	bzero(&to, sizeof(struct tcpopt));
3249	inp = so_sotoinpcb(lso);
3250
3251	/*
3252	 * Fill out information for entering us into the syncache
3253	 */
3254	inc.inc_fport = th.th_sport = req->peer_port;
3255	inc.inc_lport = th.th_dport = req->local_port;
3256	th.th_seq = req->rcv_isn;
3257	th.th_flags = TH_SYN;
3258
3259	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3260
3262	inc.inc_isipv6 = 0;
3263	inc.inc_len = 0;
3264	inc.inc_faddr.s_addr = req->peer_ip;
3265	inc.inc_laddr.s_addr = req->local_ip;
3266
3267	DPRINTF("syncache add of %d:%d %d:%d\n",
3268	    ntohl(req->local_ip), ntohs(req->local_port),
3269	    ntohl(req->peer_ip), ntohs(req->peer_port));
3270
3271	mss = req->tcp_options.mss;
3272	wsf = req->tcp_options.wsf;
3273	ts = req->tcp_options.tstamp;
3274	sack = req->tcp_options.sack;
3275	to.to_mss = mss;
3276	to.to_wscale = wsf;
3277	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3278	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3279}
3280
3282/*
3283 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3284 * lock held.  Note that the sock here is a listening socket that is not owned
3285 * by the TOE.
3286 */
3287static void
3288process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3289    struct listen_ctx *lctx)
3290{
3291	int rt_flags;
3292	struct l2t_entry *e;
3293	struct iff_mac tim;
3294	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3295	struct cpl_pass_accept_rpl *rpl;
3296	struct cpl_pass_accept_req *req = cplhdr(m);
3297	unsigned int tid = GET_TID(req);
3298	struct tom_data *d = TOM_DATA(tdev);
3299	struct t3cdev *cdev = d->cdev;
3300	struct tcpcb *tp = so_sototcpcb(so);
3301	struct toepcb *newtoep;
3302	struct rtentry *dst;
3303	struct sockaddr_in nam;
3304	struct t3c_data *td = T3C_DATA(cdev);
3305
3306	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3307	if (__predict_false(reply_mbuf == NULL)) {
3308		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3309			t3_defer_reply(m, tdev, reject_pass_request);
3310		else {
3311			cxgb_queue_tid_release(cdev, tid);
3312			m_free(m);
3313		}
3314		DPRINTF("failed to get reply_mbuf\n");
3315
3316		goto out;
3317	}
3318
3319	if (tp->t_state != TCPS_LISTEN) {
3320		DPRINTF("socket not in listen state\n");
3321
3322		goto reject;
3323	}
3324
3325	tim.mac_addr = req->dst_mac;
3326	tim.vlan_tag = ntohs(req->vlan_tag);
3327	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3328		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3329		goto reject;
3330	}
3331
3332#ifdef notyet
3333	/*
3334	 * XXX do route lookup to confirm that we're still listening on this
3335	 * address
3336	 */
3337	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3338			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3339		goto reject;
3340	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3341		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3342	dst_release(skb->dst);	// done with the input route, release it
3343	skb->dst = NULL;
3344
3345	if ((rt_flags & RTF_LOCAL) == 0)
3346		goto reject;
3347#endif
3348	/*
3349	 * XXX
3350	 */
3351	rt_flags = RTF_LOCAL;
3352	if ((rt_flags & RTF_LOCAL) == 0)
3353		goto reject;
3354
3355	/*
3356	 * Calculate values and add to syncache
3357	 */
3358
3359	newtoep = toepcb_alloc();
3360	if (newtoep == NULL)
3361		goto reject;
3362
3363	bzero(&nam, sizeof(struct sockaddr_in));
3364
3365	nam.sin_len = sizeof(struct sockaddr_in);
3366	nam.sin_family = AF_INET;
3367	nam.sin_addr.s_addr = req->peer_ip;
3368	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3369
3370	if (dst == NULL) {
3371		printf("failed to find route\n");
3372		goto reject;
3373	}
3374	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3375	    (struct sockaddr *)&nam);
3376	if (e == NULL) {
3377		DPRINTF("failed to get l2t\n");
		goto reject;
3378	}
3379	/*
3380	 * Point to our listen socket until accept
3381	 */
3382	newtoep->tp_tp = tp;
3383	newtoep->tp_flags = TP_SYN_RCVD;
3384	newtoep->tp_tid = tid;
3385	newtoep->tp_toedev = tdev;
3386	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3387
3388	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3389	so_lock(so);
3390	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3391	so_unlock(so);
3392
3393	newtoep->tp_ulp_mode = (TOM_TUNABLE(tdev, ddp) &&
3394	    !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN) ? ULP_MODE_TCPDDP : 0;
3395
3396	if (newtoep->tp_ulp_mode) {
3397		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3398
3399		if (ddp_mbuf == NULL)
3400			newtoep->tp_ulp_mode = 0;
3401	}
3402
3403	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3404	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3405	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3406	/*
3407	 * XXX workaround for lack of syncache drop
3408	 */
3409	toepcb_hold(newtoep);
3410	syncache_add_accept_req(req, so, newtoep);
3411
3412	rpl = cplhdr(reply_mbuf);
3413	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3414	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3415	rpl->wr.wr_lo = 0;
3416	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3417	rpl->opt2 = htonl(calc_opt2(so, tdev));
3418	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3419	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3420
3421	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3422	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3423	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3424				  CPL_PASS_OPEN_ACCEPT);
3425
3426	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3427
3428	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3429
3430	l2t_send(cdev, reply_mbuf, e);
3431	m_free(m);
3432	if (newtoep->tp_ulp_mode) {
3433		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3434				V_TF_DDP_OFF(1) |
3435				TP_DDP_TIMER_WORKAROUND_MASK,
3436				V_TF_DDP_OFF(1) |
3437		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3438	} else
3439		printf("not offloading\n");
3440
3443	return;
3444reject:
3445	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3446		mk_pass_accept_rpl(reply_mbuf, m);
3447	else
3448		mk_tid_release(reply_mbuf, newtoep, tid);
3449	cxgb_ofld_send(cdev, reply_mbuf);
3450	m_free(m);
3451out:
3452#if 0
3453	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3454#else
3455	return;
3456#endif
3457}
3458
3459/*
3460 * Handle a CPL_PASS_ACCEPT_REQ message.
3461 */
3462static int
3463do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3464{
3465	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3466	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3467	struct tom_data *d = listen_ctx->tom_data;
3468
3469#if VALIDATE_TID
3470	struct cpl_pass_accept_req *req = cplhdr(m);
3471	unsigned int tid = GET_TID(req);
3472	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3473
3474	if (unlikely(!lsk)) {
3475		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3476		       cdev->name,
3477		       (unsigned long)((union listen_entry *)ctx -
3478					t->stid_tab));
3479		return CPL_RET_BUF_DONE;
3480	}
3481	if (unlikely(tid >= t->ntids)) {
3482		printk(KERN_ERR "%s: passive open TID %u too large\n",
3483		       cdev->name, tid);
3484		return CPL_RET_BUF_DONE;
3485	}
3486	/*
3487	 * For T3A the current user of the TID may have closed but its last
3488	 * message(s) may have been backlogged so the TID appears to be still
3489	 * in use.  Just take the TID away, the connection can close at its
3490	 * own leisure.  For T3B this situation is a bug.
3491	 */
3492	if (!valid_new_tid(t, tid) &&
3493	    cdev->type != T3A) {
3494		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3495		       cdev->name, tid);
3496		return CPL_RET_BUF_DONE;
3497	}
3498#endif
3499
3500	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3501	return (0);
3502}
3503
3504/*
3505 * Called when a connection is established to translate the TCP options
3506 * reported by HW to FreeBSD's native format.
3507 */
3508static void
3509assign_rxopt(struct socket *so, unsigned int opt)
3510{
3511	struct tcpcb *tp = so_sototcpcb(so);
3512	struct toepcb *toep = tp->t_toe;
3513	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3514
3515	inp_lock_assert(tp->t_inpcb);
3516
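	/* MTU table entry minus 40 bytes of IP and TCP headers gives the MSS. */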
3517	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3518	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3519	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3520	tp->t_flags         |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3521	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3522	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3523		tp->rcv_scale = tp->request_r_scale;
3524}
3525
3526/*
3527 * Completes some final bits of initialization for just established connections
3528 * and changes their state to TCP_ESTABLISHED.
3529 *
3530 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3531 */
3532static void
3533make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3534{
3535	struct tcpcb *tp = so_sototcpcb(so);
3536	struct toepcb *toep = tp->t_toe;
3537
3538	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3539	assign_rxopt(so, opt);
3540
3541	/*
3542	 *XXXXXXXXXXX
3543	 *
3544	 */
3545#ifdef notyet
3546	so->so_proto->pr_ctloutput = t3_ctloutput;
3547#endif
3548
3549#if 0
3550	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3551#endif
3552	/*
3553	 * XXX not clear what rcv_wup maps to
3554	 */
3555	/*
3556	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3557	 * pass through opt0, whose receive buffer size field is capped at
	 * M_RCV_BUFSIZ KB; backing up tp_rcv_wup by the excess causes it to
	 * be returned as credit.
3558	 */
3559	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3560		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3561
3562	dump_toepcb(toep);
3563
3564#ifdef notyet
3565/*
3566 * no clean interface for marking ARP up to date
3567 */
3568	dst_confirm(sk->sk_dst_cache);
3569#endif
3570	tp->t_starttime = ticks;
3571	tp->t_state = TCPS_ESTABLISHED;
3572	soisconnected(so);
3573}
3574
3575static int
3576syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3577{
3578
3579	struct in_conninfo inc;
3580	struct tcpopt to;
3581	struct tcphdr th;
3582	int mss, wsf, sack, ts;
3583	struct mbuf *m = NULL;
3584	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3585	unsigned int opt;
3586
3587#ifdef MAC
3588#error	"no MAC support"
3589#endif
3590
3591	opt = ntohs(req->tcp_opt);
3592
3593	bzero(&to, sizeof(struct tcpopt));
3594
3595	/*
3596	 * Fill out information for entering us into the syncache
3597	 */
3598	inc.inc_fport = th.th_sport = req->peer_port;
3599	inc.inc_lport = th.th_dport = req->local_port;
3600	th.th_seq = req->rcv_isn;
3601	th.th_flags = TH_ACK;
3602
3603	inc.inc_isipv6 = 0;
3604	inc.inc_len = 0;
3605	inc.inc_faddr.s_addr = req->peer_ip;
3606	inc.inc_laddr.s_addr = req->local_ip;
3607
3608	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3609	wsf  = G_TCPOPT_WSCALE_OK(opt);
3610	ts   = G_TCPOPT_TSTAMP(opt);
3611	sack = G_TCPOPT_SACK(opt);
3612
3613	to.to_mss = mss;
3614	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3615	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3616
3617	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3618	    ntohl(req->local_ip), ntohs(req->local_port),
3619	    ntohl(req->peer_ip), ntohs(req->peer_port),
3620	    mss, wsf, ts, sack);
3621	return (tcp_offload_syncache_expand(&inc, &to, &th, so, m));
3622}
3623
3625/*
3626 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3627 * if we are in TCP_SYN_RECV due to crossed SYNs
3628 */
3629static int
3630do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3631{
3632	struct cpl_pass_establish *req = cplhdr(m);
3633	struct toepcb *toep = (struct toepcb *)ctx;
3634	struct tcpcb *tp = toep->tp_tp;
3635	struct socket *so, *lso;
3636	struct t3c_data *td = T3C_DATA(cdev);
3637	struct sockbuf *snd, *rcv;
3638
3639	/* Complete socket initialization now that we have the SND_ISN. */
3640	struct toedev *tdev;
3641
3642	tdev = toep->tp_toedev;
3645
3646	inp_wlock(tp->t_inpcb);
3647
3648	/*
3649	 *
3650	 * XXX need to add reference while we're manipulating
3651	 */
3652	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3653
3654	inp_wunlock(tp->t_inpcb);
3655
3656	so_lock(so);
3657	LIST_REMOVE(toep, synq_entry);
3658	so_unlock(so);
3659
3660	if (!syncache_expand_establish_req(req, &so, toep)) {
3661		/*
3662		 * No entry
3663		 */
3664		CXGB_UNIMPLEMENTED();
3665	}
3666	if (so == NULL) {
3667		/*
3668		 * Couldn't create the socket
3669		 */
3670		CXGB_UNIMPLEMENTED();
3671	}
3672
3673	tp = so_sototcpcb(so);
3674	inp_wlock(tp->t_inpcb);
3675
3676	snd = so_sockbuf_snd(so);
3677	rcv = so_sockbuf_rcv(so);
3678
3679	snd->sb_flags |= SB_NOCOALESCE;
3680	rcv->sb_flags |= SB_NOCOALESCE;
3681
3682	toep->tp_tp = tp;
3683	toep->tp_flags = 0;
3684	tp->t_toe = toep;
3685	reset_wr_list(toep);
3686	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3687	tp->rcv_nxt = toep->tp_copied_seq;
3688	install_offload_ops(so);
3689
3690	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3691	toep->tp_wr_unacked = 0;
3692	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3693	toep->tp_qset_idx = 0;
3694	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3695
3696	/*
3697	 * XXX Cancel any keep alive timer
3698	 */
3699
3700	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3701
3702	/*
3703	 * XXX workaround for lack of syncache drop
3704	 */
3705	toepcb_release(toep);
3706	inp_wunlock(tp->t_inpcb);
3707
3708	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3709	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3710#ifdef notyet
3711	/*
3712	 * XXX not sure how these checks map to us
3713	 */
3714	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3715		sk->sk_state_change(sk);
3716		sk_wake_async(so, 0, POLL_OUT);
3717	}
3718	/*
3719	 * The state for the new connection is now up to date.
3720	 * Next check if we should add the connection to the parent's
3721	 * accept queue.  When the parent closes it resets connections
3722	 * on its SYN queue, so check if we are being reset.  If so we
3723	 * don't need to do anything more, the coming ABORT_RPL will
3724	 * destroy this socket.  Otherwise move the connection to the
3725	 * accept queue.
3726	 *
3727	 * Note that we reset the synq before closing the server so if
3728	 * we are not being reset the stid is still open.
3729	 */
3730	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3731		__kfree_skb(skb);
3732		goto unlock;
3733	}
3734#endif
3735	m_free(m);
3736
3737	return (0);
3738}
3739
3740/*
3741 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3742 * and send them to the TOE.
3743 */
3744static void
3745fixup_and_send_ofo(struct toepcb *toep)
3746{
3747	struct mbuf *m;
3748	struct toedev *tdev = toep->tp_toedev;
3749	struct tcpcb *tp = toep->tp_tp;
3750	unsigned int tid = toep->tp_tid;
3751
3752	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3753
3754	inp_lock_assert(tp->t_inpcb);
3755	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3756		/*
3757		 * A variety of messages can be waiting but the fields we'll
3758		 * be touching are common to all so any message type will do.
3759		 */
3760		struct cpl_close_con_req *p = cplhdr(m);
3761
3762		p->wr.wr_lo = htonl(V_WR_TID(tid));
3763		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3764		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3765	}
3766}
3767
3768/*
3769 * Updates socket state from an active establish CPL message.  Runs with the
3770 * socket lock held.
3771 */
3772static void
3773socket_act_establish(struct socket *so, struct mbuf *m)
3774{
3775	struct cpl_act_establish *req = cplhdr(m);
3776	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3777	struct tcpcb *tp = so_sototcpcb(so);
3778	struct toepcb *toep = tp->t_toe;
3779
3780	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3781		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3782		    toep->tp_tid, tp->t_state);
3783
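	/*
	 * Seed the receive-side sequence state from the chip-supplied ISN;
	 * per the note above, rcv_isn from the CPL is the real RCV_ISN + 1.
	 */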
3784	tp->ts_recent_age = ticks;
3785	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3786	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3787
3788	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3789
3790	/*
3791	 * Now that we finally have a TID send any CPL messages that we had to
3792	 * defer for lack of a TID.
3793	 */
3794	if (mbufq_len(&toep->out_of_order_queue))
3795		fixup_and_send_ofo(toep);
3796
3797	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3798		/*
3799		 * XXX does this even make sense?
3800		 */
3801		so_sorwakeup(so);
3802	}
3803	m_free(m);
3804#ifdef notyet
3805/*
3806 * XXX assume no write requests permitted while socket connection is
3807 * incomplete
3808 */
3809	/*
3810	 * Currently the send queue must be empty at this point because the
3811	 * socket layer does not send anything before a connection is
3812	 * established.  To be future proof though we handle the possibility
3813	 * that there are pending buffers to send (either TX_DATA or
3814	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3815	 * buffers according to the just learned write_seq, and then we send
3816	 * them on their way.
3817	 */
3818	fixup_pending_writeq_buffers(sk);
3819	if (t3_push_frames(so, 1))
3820		sk->sk_write_space(sk);
3821#endif
3822
3823	toep->tp_state = tp->t_state;
3824	V_tcpstat.tcps_connects++;
3826}
3827
3828/*
3829 * Process a CPL_ACT_ESTABLISH message.
3830 */
3831static int
3832do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3833{
3834	struct cpl_act_establish *req = cplhdr(m);
3835	unsigned int tid = GET_TID(req);
3836	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3837	struct toepcb *toep = (struct toepcb *)ctx;
3838	struct tcpcb *tp = toep->tp_tp;
3839	struct socket *so;
3840	struct toedev *tdev;
3841	struct tom_data *d;
3842
3843	if (tp == NULL) {
3844		free_atid(cdev, atid);
3845		return (0);
3846	}
3847	inp_wlock(tp->t_inpcb);
3848
3849	/*
3850	 * XXX
3851	 */
3852	so = inp_inpcbtosocket(tp->t_inpcb);
3853	tdev = toep->tp_toedev; /* blow up here if link was down */
3854	d = TOM_DATA(tdev);
3855
3856	/*
3857	 * It's OK if the TID is currently in use, the owning socket may have
3858	 * backlogged its last CPL message(s).  Just take it away.
3859	 */
3860	toep->tp_tid = tid;
3861	toep->tp_tp = tp;
3862	so_insert_tid(d, toep, tid);
3863	free_atid(cdev, atid);
3864	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3865
3866	socket_act_establish(so, m);
3867	inp_wunlock(tp->t_inpcb);
3868	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3869	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3870
3871	return (0);
3872}
3873
3874/*
3875 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3876 * next batch of work requests from the write queue.
3877 */
3878static void
3879wr_ack(struct toepcb *toep, struct mbuf *m)
3880{
3881	struct tcpcb *tp = toep->tp_tp;
3882	struct cpl_wr_ack *hdr = cplhdr(m);
3883	struct socket *so;
3884	unsigned int credits = ntohs(hdr->credits);
3885	u32 snd_una = ntohl(hdr->snd_una);
3886	int bytes = 0;
3887	struct sockbuf *snd;
3888
3889	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3890
3891	inp_wlock(tp->t_inpcb);
3892	so = inp_inpcbtosocket(tp->t_inpcb);
3893	toep->tp_wr_avail += credits;
3894	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3895		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3896
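	/*
	 * Retire completed work requests.  Each queued WR mbuf records its
	 * credit cost in m_pkthdr.csum_data: fully acked WRs are dequeued
	 * and freed, while a partially acked WR at the head of the queue
	 * just has its remaining cost decremented.
	 */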
3897	while (credits) {
3898		struct mbuf *p = peek_wr(toep);
3899
3900		if (__predict_false(!p)) {
3901			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3902			    "nothing pending, state %u wr_avail=%u\n",
3903			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3904			break;
3905		}
3906		CTR2(KTR_TOM,
3907			"wr_ack: p->credits=%d p->bytes=%d",
3908		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3909		KASSERT(p->m_pkthdr.csum_data != 0,
3910		    ("empty request still on list"));
3911
3912		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3913
3914#if DEBUG_WR > 1
3915			struct tx_data_wr *w = cplhdr(p);
3916			log(LOG_ERR,
3917			       "TID %u got %u WR credits, need %u, len %u, "
3918			       "main body %u, frags %u, seq # %u, ACK una %u,"
3919			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3920			       toep->tp_tid, credits, p->csum, p->len,
3921			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3922			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3923			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3924#endif
3925			p->m_pkthdr.csum_data -= credits;
3926			break;
3927		} else {
3928			dequeue_wr(toep);
3929			credits -= p->m_pkthdr.csum_data;
3930			bytes += p->m_pkthdr.len;
3931			CTR3(KTR_TOM,
3932			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3933			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3934
3935			m_free(p);
3936		}
3937	}
3938
3939#if DEBUG_WR
3940	check_wr_invariants(tp);
3941#endif
3942
3943	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3944#if VALIDATE_SEQ
3945		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3946
3947		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3948		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3949		    toep->tp_tid, tp->snd_una);
3950#endif
3951		goto out_free;
3952	}
3953
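	/*
	 * Advance snd_una from the ACK and clear TP_TX_WAIT_IDLE once
	 * everything in flight has been acked.
	 */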
3954	if (tp->snd_una != snd_una) {
3955		tp->snd_una = snd_una;
3956		tp->ts_recent_age = ticks;
3957#ifdef notyet
3958		/*
3959		 * Keep ARP entry "minty fresh"
3960		 */
3961		dst_confirm(sk->sk_dst_cache);
3962#endif
3963		if (tp->snd_una == tp->snd_nxt)
3964			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3965	}
3966
3967	snd = so_sockbuf_snd(so);
3968	if (bytes) {
3969		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3971		sockbuf_lock(snd);
3972		sbdrop_locked(snd, bytes);
3973		so_sowwakeup_locked(so);
3974	}
3975
3976	if (snd->sb_sndptroff < snd->sb_cc)
3977		t3_push_frames(so, 0);
3978
3979out_free:
3980	inp_wunlock(tp->t_inpcb);
3981	m_free(m);
3982}
3983
3984/*
3985 * Handler for TX_DATA_ACK CPL messages.
3986 */
3987static int
3988do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3989{
3990	struct toepcb *toep = (struct toepcb *)ctx;
3991
3992	VALIDATE_SOCK(so);
3993
3994	wr_ack(toep, m);
3995	return (0);
3996}
3997
3998/*
3999 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4000 */
4001static int
4002do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4003{
4004	m_freem(m);
4005	return (0);
4006}
4007
4008/*
4009 * Reset a connection that is on a listener's SYN queue or accept queue,
4010 * i.e., one that has not had a struct socket associated with it.
4011 * Must be called from process context.
4012 *
4013 * Modeled after code in inet_csk_listen_stop().
4014 */
4015static void
4016t3_reset_listen_child(struct socket *child)
4017{
4018	struct tcpcb *tp = so_sototcpcb(child);
4019
4020	t3_send_reset(tp->t_toe);
4021}
4022
4024static void
4025t3_child_disconnect(struct socket *so, void *arg)
4026{
4027	struct tcpcb *tp = so_sototcpcb(so);
4028
4029	if (tp->t_flags & TF_TOE) {
4030		inp_wlock(tp->t_inpcb);
4031		t3_reset_listen_child(so);
4032		inp_wunlock(tp->t_inpcb);
4033	}
4034}
4035
4036/*
4037 * Disconnect offloaded established but not yet accepted connections sitting
4038 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4039 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4040 */
4041void
4042t3_disconnect_acceptq(struct socket *listen_so)
4043{
4044
4045	so_lock(listen_so);
4046	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4047	so_unlock(listen_so);
4048}
4049
4050/*
4051 * Reset offloaded connections sitting on a server's syn queue.  As above
4052 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4053 */
4055void
4056t3_reset_synq(struct listen_ctx *lctx)
4057{
4058	struct toepcb *toep;
4059
4060	so_lock(lctx->lso);
4061	while (!LIST_EMPTY(&lctx->synq_head)) {
4062		toep = LIST_FIRST(&lctx->synq_head);
4063		LIST_REMOVE(toep, synq_entry);
4064		toep->tp_tp = NULL;
4065		t3_send_reset(toep);
4066		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4067		toepcb_release(toep);
4068	}
4069	so_unlock(lctx->lso);
4070}
4071
4073int
4074t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4075		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4076		   unsigned int pg_off, unsigned int color)
4077{
4078	unsigned int i, j, pidx;
4079	struct pagepod *p;
4080	struct mbuf *m;
4081	struct ulp_mem_io *req;
4082	unsigned int tid = toep->tp_tid;
4083	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4084	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4085
4086	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4087	    gl, nppods, tag, maxoff, pg_off, color);
4088
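	/*
	 * Write one page pod per iteration using a ULP_MEM_WRITE bypass
	 * WR.  Each pod carries four page addresses plus a fifth that
	 * duplicates the first address of the following pod; the final
	 * NUM_SENTINEL_PPODS pods are marked invalid to terminate the
	 * list.
	 */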
4089	for (i = 0; i < nppods; ++i) {
4090		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4091		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4092		req = mtod(m, struct ulp_mem_io *);
4093		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4094		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4095		req->wr.wr_lo = 0;
4096		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4097					   V_ULPTX_CMD(ULP_MEM_WRITE));
4098		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4099				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4100
4101		p = (struct pagepod *)(req + 1);
4102		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
4103			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4104			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4105						  V_PPOD_COLOR(color));
4106			p->pp_max_offset = htonl(maxoff);
4107			p->pp_page_offset = htonl(pg_off);
4108			p->pp_rsvd = 0;
4109			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4110				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4111				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4112		} else
4113			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4114		send_or_defer(toep, m, 0);
4115		ppod_addr += PPOD_SIZE;
4116	}
4117	return (0);
4118}
4119
4120/*
4121 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4122 */
4123static inline void
4124mk_cpl_barrier_ulp(struct cpl_barrier *b)
4125{
4126	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4127
4128	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4129	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4130	b->opcode = CPL_BARRIER;
4131}
4132
4133/*
4134 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4135 */
4136static inline void
4137mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4138{
4139	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4140
4142	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4143	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4144	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4145	req->cpuno = htons(cpuno);
4146}
4147
4148/*
4149 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4150 */
4151static inline void
4152mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4153                     unsigned int word, uint64_t mask, uint64_t val)
4154{
4155	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4156
4157	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4158	    tid, word, mask, val);
4159
4160	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4161	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4162	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4163	req->reply = V_NO_REPLY(1);
4164	req->cpu_idx = 0;
4165	req->word = htons(word);
4166	req->mask = htobe64(mask);
4167	req->val = htobe64(val);
4168}
4169
4170/*
4171 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4172 */
4173static void
4174mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4175    unsigned int tid, unsigned int credits)
4176{
4177	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4178
4179	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4180	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4181	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4182	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4183	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4184				 V_RX_CREDITS(credits));
4185}
4186
4187void
4188t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4189{
4190	unsigned int wrlen;
4191	struct mbuf *m;
4192	struct work_request_hdr *wr;
4193	struct cpl_barrier *lock;
4194	struct cpl_set_tcb_field *req;
4195	struct cpl_get_tcb *getreq;
4196	struct ddp_state *p = &toep->tp_ddp_state;
4197
4198#if 0
4199	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4200#endif
4201	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4202		sizeof(*getreq);
4203	m = m_gethdr_nofail(wrlen);
4204	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4205	wr = mtod(m, struct work_request_hdr *);
4206	bzero(wr, wrlen);
4207
4208	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4209	m->m_pkthdr.len = m->m_len = wrlen;
4210
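	/*
	 * Compound WR layout: BARRIER, SET_TCB_FIELD, GET_TCB, BARRIER.
	 * The barriers presumably keep the TCB update and read-back from
	 * being interleaved with other traffic for this connection.
	 */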
4211	lock = (struct cpl_barrier *)(wr + 1);
4212	mk_cpl_barrier_ulp(lock);
4213
4214	req = (struct cpl_set_tcb_field *)(lock + 1);
4215
4216	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4217
4218	/* Hmm, not sure if this is actually a good thing: reactivating
4219	 * the other buffer might be an issue if it has been completed
4220	 * already.  However, that is unlikely, since the fact that the UBUF
4221	 * is not completed indicates that there is no outstanding data.
4222	 */
4223	if (bufidx == 0)
4224		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4225				     V_TF_DDP_ACTIVE_BUF(1) |
4226				     V_TF_DDP_BUF0_VALID(1),
4227				     V_TF_DDP_ACTIVE_BUF(1));
4228	else
4229		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4230				     V_TF_DDP_ACTIVE_BUF(1) |
4231				     V_TF_DDP_BUF1_VALID(1), 0);
4232
4233	getreq = (struct cpl_get_tcb *)(req + 1);
4234	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4235
4236	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4237
4238	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4240	p->get_tcb_count++;
4241
4242#ifdef T3_TRACE
4243	T3_TRACE1(TIDTB(so),
4244		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4245#endif
4246	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4247}
4248
4249/**
4250 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4251 * @toep: the toepcb for the connection the buffers belong to
4252 * @bufidx: index of HW DDP buffer (0 or 1)
4253 * @tag0: new tag for HW buffer 0
4254 * @tag1: new tag for HW buffer 1
4255 * @len: new length for HW buf @bufidx
4256 *
4257 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4258 * buffer by changing the buffer tag and length and setting the valid and
4259 * active flag accordingly.  The caller must ensure the new buffer is at
4260 * least as big as the existing one.  Since we typically reprogram both HW
4261 * buffers, this function sets both tags for convenience.  Read the TCB to
4262 * determine how much data was written into the buffer before the overlay
4263 * took place.
4264 */
4265void
4266t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4267	 	       unsigned int tag1, unsigned int len)
4268{
4269	unsigned int wrlen;
4270	struct mbuf *m;
4271	struct work_request_hdr *wr;
4272	struct cpl_get_tcb *getreq;
4273	struct cpl_set_tcb_field *req;
4274	struct ddp_state *p = &toep->tp_ddp_state;
4275
4276	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4277	    bufidx, tag0, tag1, len);
4278#if 0
4279	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4280#endif
4281	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4282	m = m_gethdr_nofail(wrlen);
4283	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4284	wr = mtod(m, struct work_request_hdr *);
4285	m->m_pkthdr.len = m->m_len = wrlen;
4286	bzero(wr, wrlen);
4287
4289	/* Set the ATOMIC flag to make sure that TP processes the following
4290	 * CPLs in an atomic manner and no wire segments can be interleaved.
4291	 */
4292	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4293	req = (struct cpl_set_tcb_field *)(wr + 1);
4294	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4295			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4296			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)M_TCB_RX_DDP_BUF1_TAG) << 32,
4297			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4298			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4299	req++;
4300	if (bufidx == 0) {
4301		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4302			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4303			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4304		req++;
4305		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4306			    V_TF_DDP_PUSH_DISABLE_0(1) |
4307			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4308			    V_TF_DDP_PUSH_DISABLE_0(0) |
4309			    V_TF_DDP_BUF0_VALID(1));
4310	} else {
4311		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4312			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4313			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4314		req++;
4315		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316			    V_TF_DDP_PUSH_DISABLE_1(1) |
4317			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318			    V_TF_DDP_PUSH_DISABLE_1(0) |
4319			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4320	}
4321
4322	getreq = (struct cpl_get_tcb *)(req + 1);
4323	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4324
4325	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4327	p->get_tcb_count++;
4328
4329#ifdef T3_TRACE
4330	T3_TRACE4(TIDTB(sk),
4331		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4332		  "len %d",
4333		  bufidx, tag0, tag1, len);
4334#endif
4335	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4336}
4337
4338/*
4339 * Sends a compound WR containing all the CPL messages needed to program the
4340 * two HW DDP buffers, namely optionally setting up the length and offset of
4341 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4342 */
4343void
4344t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4345		      unsigned int len1, unsigned int offset1,
4346                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4347{
4348	unsigned int wrlen;
4349	struct mbuf *m;
4350	struct work_request_hdr *wr;
4351	struct cpl_set_tcb_field *req;
4352
4353	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4354	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4355
4356#if 0
4357	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4358#endif
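	/*
	 * Size the compound WR: one SET_TCB_FIELD for the DDP flags, an
	 * optional SET_TCB_FIELD per buffer being programmed, and an
	 * optional trailing RX_DATA_ACK when modulating.
	 */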
4359	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4360		(len1 ? sizeof(*req) : 0) +
4361		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4362	m = m_gethdr_nofail(wrlen);
4363	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4364	wr = mtod(m, struct work_request_hdr *);
4365	bzero(wr, wrlen);
4366
4367	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4368	m->m_pkthdr.len = m->m_len = wrlen;
4369
4370	req = (struct cpl_set_tcb_field *)(wr + 1);
4371	if (len0) {                  /* program buffer 0 offset and length */
4372		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4373			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4374			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4375			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4376			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4377		req++;
4378	}
4379	if (len1) {                  /* program buffer 1 offset and length */
4380		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4381			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4382			V_TCB_RX_DDP_BUF1_LEN((uint64_t)M_TCB_RX_DDP_BUF1_LEN) << 32,
4383			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4384			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4385		req++;
4386	}
4387
4388	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4389			     ddp_flags);
4390
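	/*
	 * Piggy-back an RX_DATA_ACK returning the rx credits accumulated
	 * since the last window update.
	 */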
4391	if (modulate) {
4392		mk_rx_data_ack_ulp(toep,
4393		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4394		    toep->tp_copied_seq - toep->tp_rcv_wup);
4395		toep->tp_rcv_wup = toep->tp_copied_seq;
4396	}
4397
4398#ifdef T3_TRACE
4399	T3_TRACE5(TIDTB(sk),
4400		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4401		  "modulate %d",
4402		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4403		  modulate);
4404#endif
4405
4406	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4407}
4408
4409void
4410t3_init_wr_tab(unsigned int wr_len)
4411{
4412	int i;
4413
4414	if (mbuf_wrs[1])     /* already initialized */
4415		return;
4416
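	/*
	 * mbuf_wrs[i] is the number of WRs needed for an mbuf chain of i
	 * buffers: each pair of SGL entries occupies 3 flits, hence
	 * (3 * i) / 2 rounded up by (i & 1), plus 3 flits for the
	 * tx_data_wr header, packed into WRs of wr_len flits.  wrlen
	 * caches the WR size in bytes (a flit is 8 bytes).
	 */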
4417	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4418		int sgl_len = (3 * i) / 2 + (i & 1);
4419
4420		sgl_len += 3;
4421		mbuf_wrs[i] = sgl_len <= wr_len ?
4422		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4423	}
4424
4425	wrlen = wr_len * 8;
4426}
4427
4428int
4429t3_init_cpl_io(void)
4430{
4431#ifdef notyet
4432	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4433	if (!tcphdr_skb) {
4434		log(LOG_ERR,
4435		       "Chelsio TCP offload: can't allocate sk_buff\n");
4436		return -1;
4437	}
4438	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4439	tcphdr_skb->h.raw = tcphdr_skb->data;
4440	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4441#endif
4442
4443	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4444	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4445	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4446	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4447	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4448	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4449	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4450	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4451	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4452	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4453	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4454	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4455	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4456	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4457	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4458	return (0);
4459}
4460