cxgb_cpl_io.c revision 177340
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 177340 2008-03-18 03:55:12Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
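
/*
 * Illustrative sketch (not part of the driver): the ULP submode is a small
 * bitmask and the table above gives the extra TCP payload bytes the HW
 * appends for that submode, so a sender tracking sequence space would do
 * something like (names hypothetical):
 *
 *	extra = t3_ulp_extra_len[submode & 3];
 *	tp->snd_nxt += m->m_pkthdr.len + extra;
 *
 * i.e., submodes 1 and 2 each cost 4 extra bytes and submode 3 costs 8,
 * consistent with one or both iSCSI-style digests being enabled.
 */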

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

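/*
 * Debug wrapper around sbappend_locked(): walk the chain already in the
 * socket buffer and the chain being appended, asserting that every mbuf is
 * either a plain mbuf or an external one of type EXT_EXTREF and that no
 * m_next pointer has been scribbled over.  The KASSERTs are only active in
 * kernels built with INVARIANTS.
 */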
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	sbappend_locked(sb, n);
	m = sb->sb_mb;
	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
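
/*
 * A minimal caller sketch for the helper above, following the
 * allocate-then-send idiom used elsewhere in this file (hypothetical
 * usage, not an actual code path in this driver):
 *
 *	struct mbuf *m = m_gethdr_nofail(sizeof(struct cpl_tid_release));
 *
 *	mk_tid_release(m, toep, toep->tp_tid);
 *	cxgb_ofld_send(cdev, m);
 */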

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}
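
/*
 * Worked example for the V_TX_SNDBUF units above: the field is in 32KB
 * units, hence the ">> 15".  A 256KB send buffer programs
 * V_TX_SNDBUF(262144 >> 15) == V_TX_SNDBUF(8).
 */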

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int i, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");
		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_snd);
	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}

		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	return (total_bytes);
}
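
/*
 * Note on the credit accounting in t3_push_frames(): each WR mbuf records
 * the WR credits it consumed (1..4, from mbuf_wrs[]) in m_pkthdr.csum_data.
 * Assuming, for illustration, that mbuf_wrs[3] == 2, a 3-segment gather
 * list costs
 *
 *	toep->tp_wr_avail   -= 2;	(debited when the WR is built)
 *	toep->tp_wr_unacked += 2;	(returned when the HW acks the WR)
 *
 * and F_WR_COMPL is requested when a completion is explicitly asked for or
 * once tp_wr_unacked reaches tp_wr_max / 2, so the card returns credits
 * before transmission stalls.
 */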

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}
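
/*
 * Note: ULP_MODE_TCPDDP is non-zero, so the first clause of the expression
 * above already subsumes the second; as written the function returns true
 * for any connection with a non-zero ULP mode.
 */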

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	so = tp->t_inpcb->inp_socket;
	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			SOCKBUF_LOCK(&so->so_rcv);
			toep->tp_copied_seq += copied;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		return;
	}

	INP_LOCK_ASSERT(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = so->so_rcv.sb_cc;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax) {
		printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = so->so_rcv.sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
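
/*
 * Worked example for the return logic above: with a 64KB receive window,
 * must_send fires once credits + 16384 >= 65536, i.e. once 48KB or more of
 * window is being withheld from the sender; below that, credits are only
 * returned once they reach the rx_credit_thres tunable.
 */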

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = tp->t_inpcb->inp_socket;
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = tp->t_inpcb->inp_socket;
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
	INP_LOCK_ASSERT(tp->t_inpcb);
	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	INP_INFO_WLOCK(&tcbinfo);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
	INP_INFO_WUNLOCK(&tcbinfo);
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist).  (Note that the peer should
 * set the PSH bit in the last segment, which would trigger delivery.)
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on) {
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return (-EINVAL);
#endif
	return (0);
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
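
/*
 * Worked example with a hypothetical MTU table: given mtus[] =
 * {576, 1500, 4140, 9000}, a target of 1500 yields index 1 while 1499
 * yields index 0; the selected entry never exceeds the target.
 */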

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
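
/*
 * The constant 40 above is the fixed IPv4 + TCP header size (20 + 20
 * bytes), so MSS and MTU convert as mss = mtu - 40.  For example a path
 * MTU of 1500 gives t_maxseg = 1460 before being snapped to the closest
 * MTU-table entry by find_best_mtu().
 */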

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
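
/*
 * Worked example: for a 256KB window the loop above shifts until the
 * window fits in 16 bits, giving wscale = 3 (65535 << 2 == 262140 still
 * falls short of 262144, so a third shift is needed).
 */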

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = so->so_rcv.sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;
	so->so_rcv.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return (V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx));
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}
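
/*
 * The V_RCV_BUFSIZ field above is in 1KB units (the ">> 10") and is
 * clamped to M_RCV_BUFSIZ, so only part of a large window fits in option
 * 0; the remainder has to be opened up with RX_DATA_ACK credits, as noted
 * at the MAX_RCV_WND definition near the top of this file.
 */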

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

	/*
	 * Don't handle connection retry for now
	 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return (status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS);
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif

/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		INP_LOCK(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(so);
	}

	return (0);
}

static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{
	if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;

	uint64_t t;
	__be64 *tcb;

	so = toeptoso(toep);
	tp = toep->tp_tp;

	INP_LOCK_ASSERT(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required.  However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		m_freem(m);
		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
			sorwakeup_locked(so);
		else
			SOCKBUF_UNLOCK(&so->so_rcv);
		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#ifdef T3_TRACE
	T3_TRACE3(TIDTB(so),
		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
		  tp->rcv_nxt, q->cur_buf, ddp_offset);
#endif

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		 q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {
		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done.  If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			SOCKBUF_UNLOCK(&so->so_rcv);

			m_free(m);
			return;
		}
	} else {
		SOCKBUF_UNLOCK(&so->so_rcv);
		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
#ifdef T3_TRACE
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
#endif
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0)
		q->user_ddp_pending = 0;
	else
		SBAPPEND(&so->so_rcv, m);
	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}
1874
1875/*
1876 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1877 * in that case they are similar to DDP completions.
1878 */
1879static int
1880do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1881{
1882	struct toepcb *toep = (struct toepcb *)ctx;
1883
1884	/* OK if socket doesn't exist */
1885	if (toep == NULL) {
1886		printf("null toep in do_get_tcb_rpl\n");
1887		return (CPL_RET_BUF_DONE);
1888	}
1889
1890	INP_LOCK(toep->tp_tp->t_inpcb);
1891	tcb_rpl_as_ddp_complete(toep, m);
1892	INP_UNLOCK(toep->tp_tp->t_inpcb);
1893
1894	return (0);
1895}
1896
1897static void
1898handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1899{
1900	struct tcpcb *tp = toep->tp_tp;
1901	struct socket *so = toeptoso(toep);
1902	struct ddp_state *q;
1903	struct ddp_buf_state *bsp;
1904	struct cpl_rx_data *hdr = cplhdr(m);
1905	unsigned int rcv_nxt = ntohl(hdr->seq);
1906
1907	if (tp->rcv_nxt == rcv_nxt)
1908		return;
1909
1910	INP_LOCK_ASSERT(tp->t_inpcb);
1911	SOCKBUF_LOCK(&so->so_rcv);
1912	q = &toep->tp_ddp_state;
1913	bsp = &q->buf_state[q->cur_buf];
1914	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
1915		rcv_nxt, tp->rcv_nxt));
1916	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1917	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1918	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1919	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1920
1921#ifdef T3_TRACE
1922	if ((int)m->m_pkthdr.len < 0) {
1923		t3_ddp_error(so, "handle_ddp_data: neg len");
1924	}
1925#endif
1926
1927	m->m_ddp_gl = (unsigned char *)bsp->gl;
1928	m->m_flags |= M_DDP;
1929	m->m_cur_offset = bsp->cur_offset;
1930	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
1931	if (bsp->flags & DDP_BF_NOCOPY)
1932		bsp->flags &= ~DDP_BF_NOCOPY;
1933
1934	m->m_seq = tp->rcv_nxt;
1935	tp->rcv_nxt = rcv_nxt;
1936	bsp->cur_offset += m->m_pkthdr.len;
1937	if (!(bsp->flags & DDP_BF_NOFLIP))
1938		q->cur_buf ^= 1;
1939	/*
1940	 * For now, don't re-enable DDP after a connection fell out of  DDP
1941	 * mode.
1942	 */
1943	q->ubuf_ddp_ready = 0;
1944	SOCKBUF_UNLOCK(&so->so_rcv);
1945}
1946
1947/*
1948 * Process new data received for a connection.
1949 */
1950static void
1951new_rx_data(struct toepcb *toep, struct mbuf *m)
1952{
1953	struct cpl_rx_data *hdr = cplhdr(m);
1954	struct tcpcb *tp = toep->tp_tp;
1955	struct socket *so = toeptoso(toep);
1956	int len = be16toh(hdr->len);
1957
1958	INP_LOCK(tp->t_inpcb);
1959
1960	if (__predict_false(so_no_receive(so))) {
1961		handle_excess_rx(toep, m);
1962		INP_UNLOCK(tp->t_inpcb);
1963		TRACE_EXIT;
1964		return;
1965	}
1966
1967	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
1968		handle_ddp_data(toep, m);
1969
1970	m->m_seq = ntohl(hdr->seq);
1971	m->m_ulp_mode = 0;                    /* for iSCSI */
1972
1973#if VALIDATE_SEQ
1974	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
1975		log(LOG_ERR,
1976		       "%s: TID %u: Bad sequence number %u, expected %u\n",
1977		    TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
1978		       tp->rcv_nxt);
1979		m_freem(m);
1980		INP_UNLOCK(tp->t_inpcb);
1981		return;
1982	}
1983#endif
1984	m_adj(m, sizeof(*hdr));
1985
1986#ifdef URGENT_DATA_SUPPORTED
1987	/*
1988	 * We don't handle urgent data yet
1989	 */
1990	if (__predict_false(hdr->urg))
1991		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
1992	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
1993		     tp->urg_seq - tp->rcv_nxt < skb->len))
1994		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
1995							 tp->rcv_nxt];
1996#endif
1997	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
1998		toep->tp_delack_mode = hdr->dack_mode;
1999		toep->tp_delack_seq = tp->rcv_nxt;
2000	}
2001	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2002	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2003
2004	if (len < m->m_pkthdr.len)
2005		m->m_pkthdr.len = m->m_len = len;
2006
2007	tp->rcv_nxt += m->m_pkthdr.len;
2008	tp->t_rcvtime = ticks;
2009	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2010#ifdef T3_TRACE
2011	T3_TRACE2(TIDTB(sk),
2012	    "new_rx_data: seq 0x%x len %u",
2013	    m->m_seq, m->m_pkthdr.len);
2014#endif
2015	INP_UNLOCK(tp->t_inpcb);
2016	SOCKBUF_LOCK(&so->so_rcv);
2017	if (sb_notify(&so->so_rcv))
2018		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
2019
2020	SBAPPEND(&so->so_rcv, m);
2021
2022#ifdef notyet
2023	/*
2024	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2025	 *
2026	 */
2027	KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
2029	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2030		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
2031#endif
2032
2033
2034	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2035	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
2036
2037	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2038		sorwakeup_locked(so);
2039	else
2040		SOCKBUF_UNLOCK(&so->so_rcv);
2041}
2042
2043/*
2044 * Handler for RX_DATA CPL messages.
2045 */
2046static int
2047do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2048{
2049	struct toepcb *toep = (struct toepcb *)ctx;
2050
2051	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2052
2053	new_rx_data(toep, m);
2054
2055	return (0);
2056}
2057
2058static void
2059new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2060{
2061	struct tcpcb *tp;
2062	struct ddp_state *q;
2063	struct ddp_buf_state *bsp;
2064	struct cpl_rx_data_ddp *hdr;
2065	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2066	struct socket *so = toeptoso(toep);
2067	int nomoredata = 0;
2068	unsigned int delack_mode;
2069
2070	tp = sototcpcb(so);
2071
2072	INP_LOCK(tp->t_inpcb);
2073	if (__predict_false(so_no_receive(so))) {
2074
2075		handle_excess_rx(toep, m);
2076		INP_UNLOCK(tp->t_inpcb);
2077		return;
2078	}
2079
2080	q = &toep->tp_ddp_state;
2081	hdr = cplhdr(m);
2082	ddp_report = ntohl(hdr->u.ddp_report);
2083	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2084	bsp = &q->buf_state[buf_idx];
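	/*
	 * DDP runs two HW buffer slots in a ping-pong arrangement: the
	 * DDP_BUF_IDX bit of the report selects the slot this completion
	 * refers to, and q->cur_buf is flipped below once a slot fills,
	 * unless a user buffer is pinned in place (DDP_BF_NOFLIP).
	 */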
2085
2086#ifdef T3_TRACE
2087	T3_TRACE5(TIDTB(sk),
2088		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2089		  "hdr seq 0x%x len %u offset %u",
2090		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2091		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
2092	T3_TRACE1(TIDTB(sk),
2093		  "new_rx_data_ddp: ddp_report 0x%x",
2094		  ddp_report);
2095#endif
2096	CTR4(KTR_TOM,
2097	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2098	    "hdr seq 0x%x len %u",
2099	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2100	    ntohs(hdr->len));
2101	CTR3(KTR_TOM,
2102	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2103	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2104
2105	ddp_len = ntohs(hdr->len);
2106	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2107
2108	delack_mode = G_DDP_DACK_MODE(ddp_report);
2109	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2110		toep->tp_delack_mode = delack_mode;
2111		toep->tp_delack_seq = tp->rcv_nxt;
2112	}
2113
2114	m->m_seq = tp->rcv_nxt;
2115	tp->rcv_nxt = rcv_nxt;
2116
2117	tp->t_rcvtime = ticks;
2118	/*
2119	 * Store the length in m->m_len.  We are changing the meaning of
2120	 * m->m_len here, we need to be very careful that nothing from now on
2121	 * interprets ->len of this packet the usual way.
2122	 */
2123	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
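	/*
	 * Note (illustrative): from this point m_len counts bytes that were
	 * placed directly into the DDP-mapped pages, not bytes resident in
	 * the mbuf itself, so a consumer must read through m_ddp_gl at
	 * m_cur_offset (set below) rather than through m_data.
	 */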
2124	INP_UNLOCK(tp->t_inpcb);
2125	CTR3(KTR_TOM,
2126	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2127	    m->m_len, rcv_nxt, m->m_seq);
2128	/*
2129	 * Figure out where the new data was placed in the buffer and store it
2130	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the consumer
2131	 * needs to account for the page pod's pg_offset.
2132	 */
2133	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2134	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2135
2136	SOCKBUF_LOCK(&so->so_rcv);
2137	m->m_ddp_gl = (unsigned char *)bsp->gl;
2138	m->m_flags |= M_DDP;
2139	bsp->cur_offset = end_offset;
2140	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2141
2142	/*
2143	 * Length is only meaningful for kbuf
2144	 */
2145	if (!(bsp->flags & DDP_BF_NOCOPY))
2146		KASSERT(m->m_len <= bsp->gl->dgl_length,
2147		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2148			m->m_len, bsp->gl->dgl_length));
2149
2150	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2151	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2152
2153
2154        /*
2155	 * Bit 0 of flags stores whether the DDP buffer is completed.
2156	 * Note that other parts of the code depend on this being in bit 0.
2157	 */
2158	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2159		panic("spurious ddp completion");
2160	} else {
2161		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2162		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2163			q->cur_buf ^= 1;                     /* flip buffers */
2164	}
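	/*
	 * Sketch (not compiled) of the bit-0 convention noted above: code
	 * that drains the receive queue is expected to test completion with
	 * the low bit, e.g.
	 *
	 *	if (m->m_ddp_flags & 1)
	 *		... buffer complete, safe to release/repost ...
	 */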
2165
2166	if (bsp->flags & DDP_BF_NOCOPY) {
2167		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2168		bsp->flags &= ~DDP_BF_NOCOPY;
2169	}
2170
2171	if (ddp_report & F_DDP_PSH)
2172		m->m_ddp_flags |= DDP_BF_PSH;
2173	if (nomoredata)
2174		m->m_ddp_flags |= DDP_BF_NODATA;
2175
2176#ifdef notyet
2177	skb_reset_transport_header(skb);
2178	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2179#endif
2180	SBAPPEND(&so->so_rcv, m);
2181
2182	if ((so->so_state & SS_NOFDREF) == 0)
2183		sorwakeup_locked(so);
2184	else
2185		SOCKBUF_UNLOCK(&so->so_rcv);
2186}
2187
2188#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2189		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2190		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2191		 F_DDP_INVALID_PPOD)
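/*
 * DDP_ERR collects every fatal bit of the ddpvld_status word so that
 * do_rx_data_ddp() below can test for any HW-reported DDP fault with a
 * single mask before handing the payload up.
 */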
2192
2193/*
2194 * Handler for RX_DATA_DDP CPL messages.
2195 */
2196static int
2197do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2198{
2199	struct toepcb *toep = ctx;
2200	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2201
2202	VALIDATE_SOCK(so);
2203
2204	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2205		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2206		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2207		return (CPL_RET_BUF_DONE);
2208	}
2209#if 0
2210	skb->h.th = tcphdr_skb->h.th;
2211#endif
2212	new_rx_data_ddp(toep, m);
2213	return (0);
2214}
2215
2216static void
2217process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2218{
2219	struct tcpcb *tp = toep->tp_tp;
2220	struct socket *so = toeptoso(toep);
2221	struct ddp_state *q;
2222	struct ddp_buf_state *bsp;
2223	struct cpl_rx_ddp_complete *hdr;
2224	unsigned int ddp_report, buf_idx, when, delack_mode;
2225	int nomoredata = 0;
2226
2227	INP_LOCK(tp->t_inpcb);
2228	if (__predict_false(so_no_receive(so))) {
2229		struct inpcb *inp = sotoinpcb(so);
2230
2231		handle_excess_rx(toep, m);
2232		INP_UNLOCK(inp);
2233		return;
2234	}
2235	q = &toep->tp_ddp_state;
2236	hdr = cplhdr(m);
2237	ddp_report = ntohl(hdr->ddp_report);
2238	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2239	m->m_pkthdr.csum_data = tp->rcv_nxt;
2240
2241
2242	SOCKBUF_LOCK(&so->so_rcv);
2243	bsp = &q->buf_state[buf_idx];
2244	when = bsp->cur_offset;
2245	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	m->m_cur_offset = when;		/* record placement, as the other DDP paths do */
2246	tp->rcv_nxt += m->m_len;
2247	tp->t_rcvtime = ticks;
2248
2249	delack_mode = G_DDP_DACK_MODE(ddp_report);
2250	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2251		toep->tp_delack_mode = delack_mode;
2252		toep->tp_delack_seq = tp->rcv_nxt;
2253	}
2254#ifdef notyet
2255	skb_reset_transport_header(skb);
2256	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2257#endif
2258	INP_UNLOCK(tp->t_inpcb);
2259
2260	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2261	CTR5(KTR_TOM,
2262		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2263		  "ddp_report 0x%x offset %u, len %u",
2264		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2265		   G_DDP_OFFSET(ddp_report), m->m_len);
2266
2267	bsp->cur_offset += m->m_len;
2268
2269	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2270		q->cur_buf ^= 1;                     /* flip buffers */
2271		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2272			nomoredata = 1;
2273	}
2274
2275	CTR4(KTR_TOM,
2276		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2277		  "ddp_report %u offset %u",
2278		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2279		   G_DDP_OFFSET(ddp_report));
2280
2281	m->m_ddp_gl = (unsigned char *)bsp->gl;
2282	m->m_flags |= M_DDP;
2283	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2284	if (bsp->flags & DDP_BF_NOCOPY)
2285		bsp->flags &= ~DDP_BF_NOCOPY;
2286	if (nomoredata)
2287		m->m_ddp_flags |= DDP_BF_NODATA;
2288
2289
2290	SBAPPEND(&so->so_rcv, m);
2291
2292	if ((so->so_state & SS_NOFDREF) == 0)
2293		sorwakeup_locked(so);
2294	else
2295		SOCKBUF_UNLOCK(&so->so_rcv);
2296}
2297
2298/*
2299 * Handler for RX_DDP_COMPLETE CPL messages.
2300 */
2301static int
2302do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2303{
2304	struct toepcb *toep = ctx;
2305
2306	VALIDATE_SOCK(so);
2307#if 0
2308	skb->h.th = tcphdr_skb->h.th;
2309#endif
2310	process_ddp_complete(toep, m);
2311	return (0);
2312}
2313
2314/*
2315 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2316 * socket state before calling tcp_time_wait to comply with its expectations.
2317 */
2318static void
2319enter_timewait(struct socket *so)
2320{
2321	struct tcpcb *tp = sototcpcb(so);
2322
2323	INP_LOCK_ASSERT(tp->t_inpcb);
2324	/*
2325	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2326	 * process peer_close because we don't want to carry the peer FIN in
2327	 * the socket's receive queue and if we increment rcv_nxt without
2328	 * having the FIN in the receive queue we'll confuse facilities such
2329	 * as SIOCINQ.
2330	 */
2331	tp->rcv_nxt++;
2332
2333	tp->ts_recent_age = 0;	     /* defeat recycling */
2334	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2335	tcp_twstart(tp);
2336}
2337
2338/*
2339 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2340 * function deals with the data that may be reported along with the FIN.
2341 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2342 * perform normal FIN-related processing.  In the latter case 1 indicates that
2343 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0 that
2344 * the mbuf can be freed.
2345 */
2346static int
2347handle_peer_close_data(struct socket *so, struct mbuf *m)
2348{
2349	struct tcpcb *tp = sototcpcb(so);
2350	struct toepcb *toep = tp->t_toe;
2351	struct ddp_state *q;
2352	struct ddp_buf_state *bsp;
2353	struct cpl_peer_close *req = cplhdr(m);
2354	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2355
2356	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2357		return (0);
2358
2359	if (__predict_false(so_no_receive(so))) {
2360		handle_excess_rx(toep, m);
2361
2362		/*
2363		 * Although we discard the data we want to process the FIN so
2364		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2365		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2366		 * may be what will close the connection.  We return 1 because
2367		 * handle_excess_rx() already freed the packet.
2368		 */
2369		return (1);
2370	}
2371
2372	INP_LOCK_ASSERT(tp->t_inpcb);
2373	q = &toep->tp_ddp_state;
2374	SOCKBUF_LOCK(&so->so_rcv);
2375	bsp = &q->buf_state[q->cur_buf];
2376	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2377	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2378	m->m_ddp_gl = (unsigned char *)bsp->gl;
2379	m->m_flags |= M_DDP;
2380	m->m_cur_offset = bsp->cur_offset;
2381	m->m_ddp_flags =
2382	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2383	m->m_seq = tp->rcv_nxt;
2384	tp->rcv_nxt = rcv_nxt;
2385	bsp->cur_offset += m->m_pkthdr.len;
2386	if (!(bsp->flags & DDP_BF_NOFLIP))
2387		q->cur_buf ^= 1;
2388#ifdef notyet
2389	skb_reset_transport_header(skb);
2390	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2391#endif
2392	tp->t_rcvtime = ticks;
2393	SBAPPEND(&so->so_rcv, m);
2394	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2395		sorwakeup_locked(so);
2396	else
2397		SOCKBUF_UNLOCK(&so->so_rcv);
2398	return (1);
2399}
2400
2401/*
2402 * Handle a peer FIN.
2403 */
2404static void
2405do_peer_fin(struct socket *so, struct mbuf *m)
2406{
2407	struct tcpcb *tp = sototcpcb(so);
2408	struct toepcb *toep = tp->t_toe;
2409	int keep = 0;
2410	DPRINTF("do_peer_fin state=%d\n", tp->t_state);
2411
2412#ifdef T3_TRACE
2413	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
2414#endif
2415
2416	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2417		printf("abort_pending set\n");
2418
2419		goto out;
2420	}
2421	INP_INFO_WLOCK(&tcbinfo);
2422	INP_LOCK(tp->t_inpcb);
2423	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2424		keep = handle_peer_close_data(so, m);
2425		if (keep < 0) {
2426			INP_INFO_WUNLOCK(&tcbinfo);
2427			INP_UNLOCK(tp->t_inpcb);
2428			return;
2429		}
2430	}
2431	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2432		socantrcvmore(so);
2433		/*
2434		 * If connection is half-synchronized
2435		 * (ie NEEDSYN flag on) then delay ACK,
2436		 * so it may be piggybacked when SYN is sent.
2437		 * Otherwise, since we received a FIN then no
2438		 * more input can be expected, send ACK now.
2439		 */
2440		if (tp->t_flags & TF_NEEDSYN)
2441			tp->t_flags |= TF_DELACK;
2442		else
2443			tp->t_flags |= TF_ACKNOW;
2444		tp->rcv_nxt++;
2445	}
2446
2447	switch (tp->t_state) {
2448	case TCPS_SYN_RECEIVED:
2449		tp->t_starttime = ticks;
2450	/* FALLTHROUGH */
2451	case TCPS_ESTABLISHED:
2452		tp->t_state = TCPS_CLOSE_WAIT;
2453		break;
2454	case TCPS_FIN_WAIT_1:
2455		tp->t_state = TCPS_CLOSING;
2456		break;
2457	case TCPS_FIN_WAIT_2:
2458		/*
2459		 * If we've sent an abort_req we must have sent it too late,
2460		 * HW will send us a reply telling us so, and this peer_close
2461		 * is really the last message for this connection and needs to
2462		 * be treated as an abort_rpl, i.e., transition the connection
2463		 * to TCP_CLOSE (note that the host stack does this at the
2464		 * time of generating the RST but we must wait for HW).
2465		 * Otherwise we enter TIME_WAIT.
2466		 */
2467		t3_release_offload_resources(toep);
2468		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2469			tp = tcp_close(tp);
2470		} else {
2471			enter_timewait(so);
2472		}
2473		break;
2474	default:
2475		log(LOG_ERR,
2476		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2477		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
2478	}
2479	INP_INFO_WUNLOCK(&tcbinfo);
2480	if (tp)
2481		INP_UNLOCK(tp->t_inpcb);
2482
2483	DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
2484
2485#ifdef notyet
2486	/* Do not send POLL_HUP for half duplex close. */
2487	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2488	    sk->sk_state == TCP_CLOSE)
2489		sk_wake_async(so, 1, POLL_HUP);
2490	else
2491		sk_wake_async(so, 1, POLL_IN);
2492#endif
2493
2494out:
2495	if (!keep)
2496		m_free(m);
2497}
2498
2499/*
2500 * Handler for PEER_CLOSE CPL messages.
2501 */
2502static int
2503do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2504{
2505	struct toepcb *toep = (struct toepcb *)ctx;
2506	struct socket *so = toeptoso(toep);
2507
2508	VALIDATE_SOCK(so);
2509
2510	do_peer_fin(so, m);
2511	return (0);
2512}
2513
2514static void
2515process_close_con_rpl(struct socket *so, struct mbuf *m)
2516{
2517	struct tcpcb *tp = sototcpcb(so);
2518	struct cpl_close_con_rpl *rpl = cplhdr(m);
2519	struct toepcb *toep = tp->t_toe;
2520
2521	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2522
2523	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
2524	    !!(so->so_state & SS_NOFDREF));
2525	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
2526		goto out;
2527
2528	INP_INFO_WLOCK(&tcbinfo);
2529	INP_LOCK(tp->t_inpcb);
2530	switch (tp->t_state) {
2531	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2532		t3_release_offload_resources(toep);
2533		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2534			tp = tcp_close(tp);
2535
2536		} else {
2537			enter_timewait(so);
2538			soisdisconnected(so);
2539		}
2540		break;
2541	case TCPS_LAST_ACK:
2542		/*
2543		 * In this state we don't care about pending abort_rpl.
2544		 * If we've sent abort_req it was post-close and was sent too
2545		 * late, this close_con_rpl is the actual last message.
2546		 */
2547		t3_release_offload_resources(toep);
2548		tp = tcp_close(tp);
2549		break;
2550	case TCPS_FIN_WAIT_1:
2551		/*
2552		 * If we can't receive any more
2553		 * data, then closing user can proceed.
2554		 * Starting the timer is contrary to the
2555		 * specification, but if we don't get a FIN
2556		 * we'll hang forever.
2557		 *
2558		 * XXXjl:
2559		 * we should release the tp also, and use a
2560		 * compressed state.
2561		 */
2562		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2563			int timeout;
2564
2565			soisdisconnected(so);
2566			timeout = (tcp_fast_finwait2_recycle) ?
2567			    tcp_finwait2_timeout : tcp_maxidle;
2568			tcp_timer_activate(tp, TT_2MSL, timeout);
2569		}
2570		tp->t_state = TCPS_FIN_WAIT_2;
2571		if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
2572		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2573			tp = tcp_drop(tp, 0);
2574		}
2575
2576		break;
2577	default:
2578		log(LOG_ERR,
2579		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2580		       TOE_DEV(so)->tod_name, toep->tp_tid,
2581		       tp->t_state);
2582	}
2583	INP_INFO_WUNLOCK(&tcbinfo);
2584	if (tp)
2585		INP_UNLOCK(tp->t_inpcb);
2586out:
2587	m_freem(m);
2588}
2589
2590/*
2591 * Handler for CLOSE_CON_RPL CPL messages.
2592 */
2593static int
2594do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2595			    void *ctx)
2596{
2597	struct toepcb *toep = (struct toepcb *)ctx;
2598	struct socket *so = toeptoso(toep);
2599
2600	VALIDATE_SOCK(so);
2601
2602	process_close_con_rpl(so, m);
2603	return (0);
2604}
2605
2606/*
2607 * Process abort replies.  We only process these messages if we anticipate
2608 * them as the coordination between SW and HW in this area is somewhat lacking
2609 * and sometimes we get ABORT_RPLs after we are done with the connection that
2610 * originated the ABORT_REQ.
2611 */
2612static void
2613process_abort_rpl(struct socket *so, struct mbuf *m)
2614{
2615	struct tcpcb *tp = sototcpcb(so);
2616	struct toepcb *toep = tp->t_toe;
2617
2618#ifdef T3_TRACE
2619	T3_TRACE1(TIDTB(sk),
2620		  "process_abort_rpl: GTS rpl pending %d",
2621		  sock_flag(sk, ABORT_RPL_PENDING));
2622#endif
2623
2624	INP_INFO_WLOCK(&tcbinfo);
2625	INP_LOCK(tp->t_inpcb);
2626
2627	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2628		/*
2629		 * XXX panic on tcpdrop
2630		 */
2631		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2632			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2633		else {
2634			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2635			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2636			    !is_t3a(TOE_DEV(so))) {
2637				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2638					panic("TP_ABORT_REQ_RCVD set");
2639				t3_release_offload_resources(toep);
2640				tp = tcp_close(tp);
2641			}
2642		}
2643	}
2644	if (tp)
2645		INP_UNLOCK(tp->t_inpcb);
2646	INP_INFO_WUNLOCK(&tcbinfo);
2647
2648	m_free(m);
2649}
2650
2651/*
2652 * Handle an ABORT_RPL_RSS CPL message.
2653 */
2654static int
2655do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2656{
2657	struct socket *so;
2658	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2659	struct toepcb *toep;
2660
2661	/*
2662	 * Ignore replies to post-close aborts indicating that the abort was
2663	 * requested too late.  These connections are terminated when we get
2664	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2665	 * arrives the TID is either no longer used or it has been recycled.
2666	 */
2667	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2668discard:
2669		m_free(m);
2670		return (0);
2671	}
2672
2673	toep = (struct toepcb *)ctx;
2674
2675        /*
2676	 * Sometimes we've already closed the socket, e.g., a post-close
2677	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2678	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2679	 * but FW turns the ABORT_REQ into a regular one and so we get
2680	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2681	 */
2682	if (!toep)
2683		goto discard;
2684
2685	if (toep->tp_tp == NULL) {
2686		printf("removing tid for abort\n");
2687		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2688		if (toep->tp_l2t)
2689			l2t_release(L2DATA(cdev), toep->tp_l2t);
2690
2691		toepcb_release(toep);
2692		goto discard;
2693	}
2694
2695	printf("toep=%p\n", toep);
2696	printf("tp=%p\n", toep->tp_tp);
2697
2698	so = toeptoso(toep); /* <- XXX panic */
2699	toepcb_hold(toep);
2700	process_abort_rpl(so, m);
2701	toepcb_release(toep);
2702	return (0);
2703}
2704
2705/*
2706 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2707 * indicate whether RST should be sent in response.
2708 */
2709static int
2710abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2711{
2712	struct tcpcb *tp = sototcpcb(so);
2713
2714	switch (abort_reason) {
2715	case CPL_ERR_BAD_SYN:
2716#if 0
2717		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2718#endif
2719	case CPL_ERR_CONN_RESET:
2720		// XXX need to handle SYN_RECV due to crossed SYNs
2721		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2722	case CPL_ERR_XMIT_TIMEDOUT:
2723	case CPL_ERR_PERSIST_TIMEDOUT:
2724	case CPL_ERR_FINWAIT2_TIMEDOUT:
2725	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2726#if 0
2727		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2728#endif
2729		return (ETIMEDOUT);
2730	default:
2731		return (EIO);
2732	}
2733}
2734
2735static inline void
2736set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2737{
2738	struct cpl_abort_rpl *rpl = cplhdr(m);
2739
2740	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2741	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2742	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2743
2744	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2745	rpl->cmd = cmd;
2746}
2747
2748static void
2749send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2750{
2751	struct mbuf *reply_mbuf;
2752	struct cpl_abort_req_rss *req = cplhdr(m);
2753
2754	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2755	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2756	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2757	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2758	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2759	m_free(m);
2760}
2761
2762/*
2763 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2764 */
2765static inline int
2766is_neg_adv_abort(unsigned int status)
2767{
2768	return status == CPL_ERR_RTX_NEG_ADVICE ||
2769	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2770}
2771
2772static void
2773send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2774{
2775	struct mbuf  *reply_mbuf;
2776	struct cpl_abort_req_rss *req = cplhdr(m);
2777
2778	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2779
2780	if (!reply_mbuf) {
2781		/* Defer the reply.  Stick rst_status into req->status. */
2782		req->status = rst_status;
2783		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2784		return;
2785	}
2786
2787	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2788	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2789	m_free(m);
2790
2791	/*
2792	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2793	 * these messages while ARP is pending.  For other connection states
2794	 * it's not a problem.
2795	 */
2796	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2797}
2798
2799#ifdef notyet
2800static void
2801cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2802{
2803	CXGB_UNIMPLEMENTED();
2804#ifdef notyet
2805	struct request_sock *req = child->sk_user_data;
2806
2807	inet_csk_reqsk_queue_removed(parent, req);
2808	synq_remove(tcp_sk(child));
2809	__reqsk_free(req);
2810	child->sk_user_data = NULL;
2811#endif
2812}
2813
2814
2815/*
2816 * Performs the actual work to abort a SYN_RECV connection.
2817 */
2818static void
2819do_abort_syn_rcv(struct socket *child, struct socket *parent)
2820{
2821	struct tcpcb *parenttp = sototcpcb(parent);
2822	struct tcpcb *childtp = sototcpcb(child);
2823
2824	/*
2825	 * If the server is still open we clean up the child connection,
2826	 * otherwise the server already did the clean up as it was purging
2827	 * its SYN queue and the skb was just sitting in its backlog.
2828	 */
2829	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2830		cleanup_syn_rcv_conn(child, parent);
2831		INP_INFO_WLOCK(&tcbinfo);
2832		INP_LOCK(childtp->t_inpcb);
2833		t3_release_offload_resources(childtp->t_toe);
2834		childtp = tcp_close(childtp);
2835		INP_INFO_WUNLOCK(&tcbinfo);
2836		if (childtp)
2837			INP_UNLOCK(childtp->t_inpcb);
2838	}
2839}
2840#endif
2841
2842/*
2843 * Handle abort requests for a SYN_RECV connection.  These need extra work
2844 * because the socket is on its parent's SYN queue.
2845 */
2846static int
2847abort_syn_rcv(struct socket *so, struct mbuf *m)
2848{
2849	CXGB_UNIMPLEMENTED();
2850#ifdef notyet
2851	struct socket *parent;
2852	struct toedev *tdev = TOE_DEV(so);
2853	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2854	struct socket *oreq = so->so_incomp;
2855	struct t3c_tid_entry *t3c_stid;
2856	struct tid_info *t;
2857
2858	if (!oreq)
2859		return -1;        /* somehow we are not on the SYN queue */
2860
2861	t = &(T3C_DATA(cdev))->tid_maps;
2862	t3c_stid = lookup_stid(t, oreq->ts_recent);
2863	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2864
2865	SOCK_LOCK(parent);
2866	do_abort_syn_rcv(so, parent);
2867	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2868	SOCK_UNLOCK(parent);
2869#endif
2870	return (0);
2871}
2872
2873/*
2874 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2875 * request except that we need to reply to it.
2876 */
2877static void
2878process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2879{
2880	int rst_status = CPL_ABORT_NO_RST;
2881	const struct cpl_abort_req_rss *req = cplhdr(m);
2882	struct tcpcb *tp = sototcpcb(so);
2883	struct toepcb *toep = tp->t_toe;
2884
2885	INP_LOCK(tp->t_inpcb);
2886	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2887		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2888		m_free(m);
2889		goto skip;
2890	}
2891
2892	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2893	/*
2894	 * Three cases to consider:
2895	 * a) We haven't sent an abort_req; close the connection.
2896	 * b) We have sent a post-close abort_req that will get to TP too late
2897	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2898	 *    be ignored and the connection should be closed now.
2899	 * c) We have sent a regular abort_req that will get to TP too late.
2900	 *    That will generate an abort_rpl with status 0, wait for it.
2901	 */
2902	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2903	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2904		so->so_error = abort_status_to_errno(so, req->status,
2905		    &rst_status);
2906		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2907			sorwakeup(so);
2908		/*
2909		 * SYN_RECV needs special processing.  If abort_syn_rcv()
2910		 * returns 0 it has taken care of the abort.
2911		 */
2912		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2913			goto skip;
2914
2915		t3_release_offload_resources(toep);
2916		tp = tcp_close(tp);
2917	}
2918	if (tp)
2919		INP_UNLOCK(tp->t_inpcb);
2920	send_abort_rpl(m, tdev, rst_status);
2921	return;
2922
2923skip:
2924	INP_UNLOCK(tp->t_inpcb);
2925}
2926
2927/*
2928 * Handle an ABORT_REQ_RSS CPL message.
2929 */
2930static int
2931do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2932{
2933	const struct cpl_abort_req_rss *req = cplhdr(m);
2934	struct toepcb *toep = (struct toepcb *)ctx;
2935	struct socket *so;
2936	struct inpcb *inp;
2937
2938	if (is_neg_adv_abort(req->status)) {
2939		m_free(m);
2940		return (0);
2941	}
2942
2943	printf("aborting tid=%d\n", toep->tp_tid);
2944
2945	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2946		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2947		toep->tp_flags |= TP_ABORT_REQ_RCVD;
2948		printf("sending abort rpl\n");
2949
2950		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2951		printf("sent\n");
2952		if (toep->tp_l2t)
2953			l2t_release(L2DATA(cdev), toep->tp_l2t);
2954
2955		/*
2956		 *  Unhook
2957		 */
2958		toep->tp_tp->t_toe = NULL;
2959		toep->tp_tp->t_flags &= ~TF_TOE;
2960		toep->tp_tp = NULL;
2961		/*
2962		 * XXX need to call syncache_chkrst - but we don't
2963		 * have a way of doing that yet
2964		 */
2965		toepcb_release(toep);
2966		printf("abort for unestablished connection :-(\n");
2967		return (0);
2968	}
2969	if (toep->tp_tp == NULL) {
2970		printf("disconnected toepcb\n");
2971		/* should be freed momentarily */
2972		return (0);
2973	}
2974
2975	so = toeptoso(toep);
2976	inp = sotoinpcb(so);
2977
2978	VALIDATE_SOCK(so);
2979	toepcb_hold(toep);
2980	INP_INFO_WLOCK(&tcbinfo);
2981	process_abort_req(so, m, TOE_DEV(so));
2982	INP_INFO_WUNLOCK(&tcbinfo);
2983	toepcb_release(toep);
2984	return (0);
2985}
2986#ifdef notyet
2987static void
2988pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2989{
2990	struct toedev *tdev = TOE_DEV(parent);
2991
2992	do_abort_syn_rcv(child, parent);
2993	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2994		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2995
2996		rpl->opt0h = htonl(F_TCAM_BYPASS);
2997		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2998		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2999	} else
3000		m_free(m);
3001}
3002#endif
3003static void
3004handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3005{
3006	CXGB_UNIMPLEMENTED();
3007
3008#ifdef notyet
3009	struct t3cdev *cdev;
3010	struct socket *parent;
3011	struct socket *oreq;
3012	struct t3c_tid_entry *t3c_stid;
3013	struct tid_info *t;
3014	struct tcpcb *otp, *tp = sototcpcb(so);
3015	struct toepcb *toep = tp->t_toe;
3016
3017	/*
3018	 * If the connection is being aborted due to the parent listening
3019	 * socket going away there's nothing to do, the ABORT_REQ will close
3020	 * the connection.
3021	 */
3022	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3023		m_free(m);
3024		return;
3025	}
3026
3027	oreq = so->so_incomp;
3028	otp = sototcpcb(oreq);
3029
3030	cdev = T3C_DEV(so);
3031	t = &(T3C_DATA(cdev))->tid_maps;
3032	t3c_stid = lookup_stid(t, otp->ts_recent);
3033	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3034
3035	SOCK_LOCK(parent);
3036	pass_open_abort(so, parent, m);
3037	SOCK_UNLOCK(parent);
3038#endif
3039}
3040
3041/*
3042 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3043 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3044 * connection.
3045 */
3046static void
3047pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3048{
3049
3050#ifdef notyet
3051	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3052	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3053#endif
3054	handle_pass_open_arp_failure(m_get_socket(m), m);
3055}
3056
3057/*
3058 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3059 */
3060static void
3061mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3062{
3063	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3064	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3065	unsigned int tid = GET_TID(req);
3066
3067	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3068	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3069	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3070	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3071	rpl->opt0h = htonl(F_TCAM_BYPASS);
3072	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3073	rpl->opt2 = 0;
3074	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3075}
3076
3077/*
3078 * Send a deferred reject to an accept request.
3079 */
3080static void
3081reject_pass_request(struct toedev *tdev, struct mbuf *m)
3082{
3083	struct mbuf *reply_mbuf;
3084
3085	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3086	mk_pass_accept_rpl(reply_mbuf, m);
3087	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3088	m_free(m);
3089}
3090
3091static void
3092handle_syncache_event(int event, void *arg)
3093{
3094	struct toepcb *toep = arg;
3095
3096	switch (event) {
3097	case TOE_SC_ENTRY_PRESENT:
3098		/*
3099		 * entry already exists - free toepcb
3100		 * and l2t
3101		 */
3102		printf("syncache entry present\n");
3103		toepcb_release(toep);
3104		break;
3105	case TOE_SC_DROP:
3106		/*
3107		 * The syncache has given up on this entry
3108		 * either it timed out, or it was evicted
3109		 * we need to explicitly release the tid
3110		 */
3111		printf("syncache entry dropped\n");
3112		toepcb_release(toep);
3113		break;
3114	default:
3115		log(LOG_ERR, "unknown syncache event %d\n", event);
3116		break;
3117	}
3118}
3119
3120static void
3121syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3122{
3123	struct in_conninfo inc;
3124	struct tcpopt to;
3125	struct tcphdr th;
3126	struct inpcb *inp;
3127	int mss, wsf, sack, ts;
3128	uint32_t rcv_isn = ntohl(req->rcv_isn);
3129
3130	bzero(&to, sizeof(struct tcpopt));
3131	inp = sotoinpcb(lso);
3132
3133	/*
3134	 * Fill out information for entering us into the syncache
3135	 */
3136	inc.inc_fport = th.th_sport = req->peer_port;
3137	inc.inc_lport = th.th_dport = req->local_port;
3138	th.th_seq = req->rcv_isn;
3139	th.th_flags = TH_SYN;
3140
3141	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3142
3143
3144	inc.inc_isipv6 = 0;
3145	inc.inc_len = 0;
3146	inc.inc_faddr.s_addr = req->peer_ip;
3147	inc.inc_laddr.s_addr = req->local_ip;
3148
3149	DPRINTF("syncache add of %d:%d %d:%d\n",
3150	    ntohl(req->local_ip), ntohs(req->local_port),
3151	    ntohl(req->peer_ip), ntohs(req->peer_port));
3152
3153	mss = req->tcp_options.mss;
3154	wsf = req->tcp_options.wsf;
3155	ts = req->tcp_options.tstamp;
3156	sack = req->tcp_options.sack;
3157	to.to_mss = mss;
3158	to.to_wscale = wsf;
3159	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3160	INP_INFO_WLOCK(&tcbinfo);
3161	INP_LOCK(inp);
3162	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3163}
3164
3165
3166/*
3167 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3168 * lock held.  Note that the sock here is a listening socket that is not owned
3169 * by the TOE.
3170 */
3171static void
3172process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3173    struct listen_ctx *lctx)
3174{
3175	int rt_flags;
3176	struct l2t_entry *e;
3177	struct iff_mac tim;
3178	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3179	struct cpl_pass_accept_rpl *rpl;
3180	struct cpl_pass_accept_req *req = cplhdr(m);
3181	unsigned int tid = GET_TID(req);
3182	struct tom_data *d = TOM_DATA(tdev);
3183	struct t3cdev *cdev = d->cdev;
3184	struct tcpcb *tp = sototcpcb(so);
3185	struct toepcb *newtoep = NULL;
3186	struct rtentry *dst;
3187	struct sockaddr_in nam;
3188	struct t3c_data *td = T3C_DATA(cdev);
3189
3190	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3191	if (__predict_false(reply_mbuf == NULL)) {
3192		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3193			t3_defer_reply(m, tdev, reject_pass_request);
3194		else {
3195			cxgb_queue_tid_release(cdev, tid);
3196			m_free(m);
3197		}
3198		DPRINTF("failed to get reply_mbuf\n");
3199
3200		goto out;
3201	}
3202
3203	if (tp->t_state != TCPS_LISTEN) {
3204		DPRINTF("socket not in listen state\n");
3205
3206		goto reject;
3207	}
3208
3209	tim.mac_addr = req->dst_mac;
3210	tim.vlan_tag = ntohs(req->vlan_tag);
3211	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3212		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3213		goto reject;
3214	}
3215
3216#ifdef notyet
3217	/*
3218	 * XXX do route lookup to confirm that we're still listening on this
3219	 * address
3220	 */
3221	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3222			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3223		goto reject;
3224	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3225		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3226	dst_release(skb->dst);	// done with the input route, release it
3227	skb->dst = NULL;
3228
3229	if ((rt_flags & RTF_LOCAL) == 0)
3230		goto reject;
3231#endif
3232	/*
3233	 * XXX
3234	 */
3235	rt_flags = RTF_LOCAL;
3236	if ((rt_flags & RTF_LOCAL) == 0)
3237		goto reject;
3238
3239	/*
3240	 * Calculate values and add to syncache
3241	 */
3242
3243	newtoep = toepcb_alloc();
3244	if (newtoep == NULL)
3245		goto reject;
3246
3247	bzero(&nam, sizeof(struct sockaddr_in));
3248
3249	nam.sin_len = sizeof(struct sockaddr_in);
3250	nam.sin_family = AF_INET;
3251	nam.sin_addr.s_addr = req->peer_ip;
3252	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3253
3254	if (dst == NULL) {
3255		printf("failed to find route\n");
3256		goto reject;
3257	}
3258	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3259	    (struct sockaddr *)&nam);
3260	if (e == NULL) {
3261		DPRINTF("failed to get l2t\n");
		goto reject;	/* cannot build the accept reply without an L2 entry */
3262	}
3263	/*
3264	 * Point to our listen socket until accept
3265	 */
3266	newtoep->tp_tp = tp;
3267	newtoep->tp_flags = TP_SYN_RCVD;
3268	newtoep->tp_tid = tid;
3269	newtoep->tp_toedev = tdev;
3270	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3271
3272	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3273	SOCK_LOCK(so);
3274	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3275	SOCK_UNLOCK(so);
3276
3277	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
3278		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3279
3280	if (newtoep->tp_ulp_mode) {
3281		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3282
3283		if (ddp_mbuf == NULL)
3284			newtoep->tp_ulp_mode = 0;
3285	}
3286
3287	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3288	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3289	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3290	/*
3291	 * XXX workaround for lack of syncache drop
3292	 */
3293	toepcb_hold(newtoep);
3294	syncache_add_accept_req(req, so, newtoep);
3295
3296	rpl = cplhdr(reply_mbuf);
3297	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3298	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3299	rpl->wr.wr_lo = 0;
3300	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3301	rpl->opt2 = htonl(calc_opt2(so, tdev));
3302	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3303	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3304
3305	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3306	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3307	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3308				  CPL_PASS_OPEN_ACCEPT);
3309
3310	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3311
3312	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3313
3314	l2t_send(cdev, reply_mbuf, e);
3315	m_free(m);
3316	if (newtoep->tp_ulp_mode) {
3317		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3318				V_TF_DDP_OFF(1) |
3319				TP_DDP_TIMER_WORKAROUND_MASK,
3320				V_TF_DDP_OFF(1) |
3321		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3322	} else
3323		printf("not offloading\n");
3324
3327	return;
3328reject:
3329	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3330		mk_pass_accept_rpl(reply_mbuf, m);
3331	else
3332		mk_tid_release(reply_mbuf, newtoep, tid);
3333	cxgb_ofld_send(cdev, reply_mbuf);
3334	m_free(m);
3335out:
3336#if 0
3337	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3338#else
3339	return;
3340#endif
3341}
3342
3343/*
3344 * Handle a CPL_PASS_ACCEPT_REQ message.
3345 */
3346static int
3347do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3348{
3349	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3350	struct socket *lso = listen_ctx->lso;
3351	struct tom_data *d = listen_ctx->tom_data;
3352
3353#if VALIDATE_TID
3354	struct cpl_pass_accept_req *req = cplhdr(m);
3355	unsigned int tid = GET_TID(req);
3356	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3357
3358	if (unlikely(!lsk)) {
3359		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3360		       cdev->name,
3361		       (unsigned long)((union listen_entry *)ctx -
3362					t->stid_tab));
3363		return CPL_RET_BUF_DONE;
3364	}
3365	if (unlikely(tid >= t->ntids)) {
3366		printk(KERN_ERR "%s: passive open TID %u too large\n",
3367		       cdev->name, tid);
3368		return CPL_RET_BUF_DONE;
3369	}
3370	/*
3371	 * For T3A the current user of the TID may have closed but its last
3372	 * message(s) may have been backlogged so the TID appears to be still
3373	 * in use.  Just take the TID away, the connection can close at its
3374	 * own leisure.  For T3B this situation is a bug.
3375	 */
3376	if (!valid_new_tid(t, tid) &&
3377	    cdev->type != T3A) {
3378		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3379		       cdev->name, tid);
3380		return CPL_RET_BUF_DONE;
3381	}
3382#endif
3383
3384	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3385	return (0);
3386}
3387
3388/*
3389 * Called when a connection is established to translate the TCP options
3390 * reported by HW to FreeBSD's native format.
3391 */
3392static void
3393assign_rxopt(struct socket *so, unsigned int opt)
3394{
3395	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
3396	struct tcpcb *tp = sototcpcb(so);
3397	struct toepcb *toep = tp->t_toe;
3398
3399	INP_LOCK_ASSERT(tp->t_inpcb);
3400
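	/* The 40 subtracted from the MTU is the fixed IPv4 (20) + TCP (20) header size. */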
3401	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3402	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3403	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3404	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3405	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3406	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3407		tp->rcv_scale = tp->request_r_scale;
3408}
3409
3410/*
3411 * Completes some final bits of initialization for just established connections
3412 * and changes their state to TCP_ESTABLISHED.
3413 *
3414 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3415 */
3416static void
3417make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3418{
3419	struct tcpcb *tp = sototcpcb(so);
3420	struct toepcb *toep = tp->t_toe;
3421
3422	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3423	assign_rxopt(so, opt);
3424	so->so_proto->pr_ctloutput = t3_ctloutput;
3425
3426#if 0
3427	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3428#endif
3429	/*
3430	 * XXX not clear what rcv_wup maps to
3431	 */
3432	/*
3433	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3434	 * pass through opt0.
3435	 */
3436	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3437		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
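	/*
	 * Worked example (sizes illustrative): opt0 can advertise at most
	 * M_RCV_BUFSIZ KB of receive window.  If rcv_wnd exceeds that cap by,
	 * say, 64KB, rcv_wup is pulled back by those 64KB here, so the first
	 * RX_DATA_ACK immediately returns the excess as Rx credits that opt0
	 * could not carry.
	 */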
3438
3439	dump_toepcb(toep);
3440
3441#ifdef notyet
3442/*
3443 * no clean interface for marking ARP up to date
3444 */
3445	dst_confirm(sk->sk_dst_cache);
3446#endif
3447	tp->t_starttime = ticks;
3448	tp->t_state = TCPS_ESTABLISHED;
3449	soisconnected(so);
3450}
3451
3452static int
3453syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3454{
3455
3456	struct in_conninfo inc;
3457	struct tcpopt to;
3458	struct tcphdr th;
3459	int mss, wsf, sack, ts;
3460	struct mbuf *m = NULL;
3461	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3462	unsigned int opt;
3463
3464#ifdef MAC
3465#error	"no MAC support"
3466#endif
3467
3468	opt = ntohs(req->tcp_opt);
3469
3470	bzero(&to, sizeof(struct tcpopt));
3471
3472	/*
3473	 * Fill out information for entering us into the syncache
3474	 */
3475	inc.inc_fport = th.th_sport = req->peer_port;
3476	inc.inc_lport = th.th_dport = req->local_port;
3477	th.th_seq = req->rcv_isn;
3478	th.th_flags = TH_ACK;
3479
3480	inc.inc_isipv6 = 0;
3481	inc.inc_len = 0;
3482	inc.inc_faddr.s_addr = req->peer_ip;
3483	inc.inc_laddr.s_addr = req->local_ip;
3484
3485	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3486	wsf  = G_TCPOPT_WSCALE_OK(opt);
3487	ts   = G_TCPOPT_TSTAMP(opt);
3488	sack = G_TCPOPT_SACK(opt);
3489
3490	to.to_mss = mss;
3491	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3492	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3493
3494	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3495	    ntohl(req->local_ip), ntohs(req->local_port),
3496	    ntohl(req->peer_ip), ntohs(req->peer_port),
3497	    mss, wsf, ts, sack);
3498	return syncache_expand(&inc, &to, &th, so, m);
3499}
3500
3501
3502/*
3503 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3504 * if we are in TCP_SYN_RECV due to crossed SYNs
3505 */
3506static int
3507do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3508{
3509	struct cpl_pass_establish *req = cplhdr(m);
3510	struct toepcb *toep = (struct toepcb *)ctx;
3511	struct tcpcb *tp;
3512	struct socket *so, *lso;
3513	struct t3c_data *td = T3C_DATA(cdev);
3514	// Complete socket initialization now that we have the SND_ISN
3515
3516	struct toedev *tdev;
3517
3518	so = lso = toeptoso(toep);
3519	tdev = toep->tp_toedev;
3520
3521	SOCK_LOCK(so);
3522	LIST_REMOVE(toep, synq_entry);
3523	SOCK_UNLOCK(so);
3524
3525	INP_INFO_WLOCK(&tcbinfo);
3526	if (!syncache_expand_establish_req(req, &so, toep)) {
3527		/*
3528		 * No entry
3529		 */
3530		CXGB_UNIMPLEMENTED();
3531	}
3532	if (so == NULL) {
3533		/*
3534		 * Couldn't create the socket
3535		 */
3536		CXGB_UNIMPLEMENTED();
3537	}
3538
3539	/*
3540	 * XXX workaround for lack of syncache drop
3541	 */
3542	toepcb_release(toep);
3543
3544	tp = sototcpcb(so);
3545	INP_LOCK(tp->t_inpcb);
3546
3547	so->so_snd.sb_flags |= SB_NOCOALESCE;
3548	so->so_rcv.sb_flags |= SB_NOCOALESCE;
3549
3550	toep->tp_tp = tp;
3551	toep->tp_flags = 0;
3552	tp->t_toe = toep;
3553	reset_wr_list(toep);
3554	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3555	tp->rcv_nxt = toep->tp_copied_seq;
3556	install_offload_ops(so);
3557
3558	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3559	toep->tp_wr_unacked = 0;
3560	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3561	toep->tp_qset_idx = 0;
3562	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3563
3564	/*
3565	 * XXX Cancel any keep alive timer
3566	 */
3567
3568	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3569	INP_INFO_WUNLOCK(&tcbinfo);
3570	INP_UNLOCK(tp->t_inpcb);
3571
3572	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3573	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3574#ifdef notyet
3575	/*
3576	 * XXX not sure how these checks map to us
3577	 */
3578	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3579		sk->sk_state_change(sk);
3580		sk_wake_async(so, 0, POLL_OUT);
3581	}
3582	/*
3583	 * The state for the new connection is now up to date.
3584	 * Next check if we should add the connection to the parent's
3585	 * accept queue.  When the parent closes it resets connections
3586	 * on its SYN queue, so check if we are being reset.  If so we
3587	 * don't need to do anything more, the coming ABORT_RPL will
3588	 * destroy this socket.  Otherwise move the connection to the
3589	 * accept queue.
3590	 *
3591	 * Note that we reset the synq before closing the server so if
3592	 * we are not being reset the stid is still open.
3593	 */
3594	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3595		__kfree_skb(skb);
3596		goto unlock;
3597	}
3598#endif
3599	m_free(m);
3600
3601	return (0);
3602}
3603
3604/*
3605 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3606 * and send them to the TOE.
3607 */
3608static void
3609fixup_and_send_ofo(struct socket *so)
3610{
3611	struct mbuf *m;
3612	struct toedev *tdev = TOE_DEV(so);
3613	struct tcpcb *tp = sototcpcb(so);
3614	struct toepcb *toep = tp->t_toe;
3615	unsigned int tid = toep->tp_tid;
3616
3617	printf("fixup_and_send_ofo\n");
3618
3619	INP_LOCK_ASSERT(tp->t_inpcb);
3620	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3621		/*
3622		 * A variety of messages can be waiting but the fields we'll
3623		 * be touching are common to all so any message type will do.
3624		 */
3625		struct cpl_close_con_req *p = cplhdr(m);
3626
3627		p->wr.wr_lo = htonl(V_WR_TID(tid));
3628		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3629		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3630	}
3631}
3632
3633/*
3634 * Updates socket state from an active establish CPL message.  Runs with the
3635 * socket lock held.
3636 */
3637static void
3638socket_act_establish(struct socket *so, struct mbuf *m)
3639{
3640	struct cpl_act_establish *req = cplhdr(m);
3641	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3642	struct tcpcb *tp = sototcpcb(so);
3643	struct toepcb *toep = tp->t_toe;
3644
3645	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3646		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3647		    toep->tp_tid, tp->t_state);
3648
3649	tp->ts_recent_age = ticks;
3650	tp->irs = tp->rcv_nxt = rcv_isn;
3651	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3652
3653	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3654
3655	/*
3656	 * Now that we finally have a TID send any CPL messages that we had to
3657	 * defer for lack of a TID.
3658	 */
3659	if (mbufq_len(&toep->out_of_order_queue))
3660		fixup_and_send_ofo(so);
3661
3662	if (__predict_false(so->so_state & SS_NOFDREF)) {
3663		/*
3664		 * XXX does this even make sense?
3665		 */
3666		sorwakeup(so);
3667	}
3668	m_free(m);
3669#ifdef notyet
3670/*
3671 * XXX assume no write requests permitted while socket connection is
3672 * incomplete
3673 */
3674	/*
3675	 * Currently the send queue must be empty at this point because the
3676	 * socket layer does not send anything before a connection is
3677	 * established.  To be future proof though we handle the possibility
3678	 * that there are pending buffers to send (either TX_DATA or
3679	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3680	 * buffers according to the just learned write_seq, and then we send
3681	 * them on their way.
3682	 */
3683	fixup_pending_writeq_buffers(sk);
3684	if (t3_push_frames(so, 1))
3685		sk->sk_write_space(sk);
3686#endif
3687
3688	toep->tp_state = tp->t_state;
3689	tcpstat.tcps_connects++;
3690
3691}
3692
3693/*
3694 * Process a CPL_ACT_ESTABLISH message.
3695 */
3696static int
3697do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3698{
3699	struct cpl_act_establish *req = cplhdr(m);
3700	unsigned int tid = GET_TID(req);
3701	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3702	struct toepcb *toep = (struct toepcb *)ctx;
3703	struct tcpcb *tp = toep->tp_tp;
3704	struct socket *so;
3705	struct toedev *tdev;
3706	struct tom_data *d;
3707
3708	if (tp == NULL) {
3709		free_atid(cdev, atid);
3710		return (0);
3711	}
3712
3713	so = toeptoso(toep);
3714	tdev = TOE_DEV(so); /* blow up here if link was down */
3715	d = TOM_DATA(tdev);
3716
3717	INP_LOCK(tp->t_inpcb);
3718
3719	/*
3720	 * It's OK if the TID is currently in use, the owning socket may have
3721	 * backlogged its last CPL message(s).  Just take it away.
3722	 */
3723	toep->tp_tid = tid;
3724	toep->tp_tp = tp;
3725	so_insert_tid(d, so, tid);
3726	free_atid(cdev, atid);
3727	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3728
3729	socket_act_establish(so, m);
3730	INP_UNLOCK(tp->t_inpcb);
3731	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3732	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3733
3734	return (0);
3735}
3736
3737/*
3738 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3739 * next batch of work requests from the write queue.
3740 */
3741static void
3742wr_ack(struct toepcb *toep, struct mbuf *m)
3743{
3744	struct tcpcb *tp = toep->tp_tp;
3745	struct cpl_wr_ack *hdr = cplhdr(m);
3746	struct socket *so = toeptoso(toep);
3747	unsigned int credits = ntohs(hdr->credits);
3748	u32 snd_una = ntohl(hdr->snd_una);
3749	int bytes = 0;
3750
3751	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3752
3753	INP_LOCK(tp->t_inpcb);
3754
3755	toep->tp_wr_avail += credits;
3756	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3757		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3758
3759	while (credits) {
3760		struct mbuf *p = peek_wr(toep);
3761
3762		if (__predict_false(!p)) {
3763			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3764			    "nothing pending, state %u wr_avail=%u\n",
3765			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3766			break;
3767		}
3768		CTR2(KTR_TOM,
3769			"wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
3770
3771		KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
3772		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3773
3774#if DEBUG_WR > 1
3775			struct tx_data_wr *w = cplhdr(p);
3776			log(LOG_ERR,
3777			       "TID %u got %u WR credits, need %u, len %u, "
3778			       "main body %u, frags %u, seq # %u, ACK una %u,"
3779			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3780			       toep->tp_tid, credits, p->csum, p->len,
3781			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3782			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3783			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3784#endif
3785			p->m_pkthdr.csum_data -= credits;
3786			break;
3787		} else {
3788			dequeue_wr(toep);
3789			credits -= p->m_pkthdr.csum_data;
3790			bytes += p->m_pkthdr.len;
3791			CTR3(KTR_TOM,
3792			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3793			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3794
3795			m_free(p);
3796		}
3797	}
3798
3799#if DEBUG_WR
3800	check_wr_invariants(tp);
3801#endif
3802
	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
#if VALIDATE_SEQ
		struct tom_data *d = TOM_DATA(TOE_DEV(so));

		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", d->tdev.name, snd_una,
		    toep->tp_tid, tp->snd_una);
#endif
		goto out_free;
	}

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = ticks;
#ifdef notyet
		/*
		 * Keep ARP entry "minty fresh"
		 */
		dst_confirm(sk->sk_dst_cache);
#endif
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}
	if (bytes) {
		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
		SOCKBUF_LOCK(&so->so_snd);
		sbdrop_locked(&so->so_snd, bytes);
		sowwakeup_locked(so);
	}

	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
		t3_push_frames(so, 0);

out_free:
	INP_UNLOCK(tp->t_inpcb);
	m_free(m);
}

/*
 * Handler for TX_DMA_ACK CPL messages.
 */
static int
do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	wr_ack(toep, m);
	return (0);
}

/*
 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	m_freem(m);
	return (0);
}

/*
 * Reset a connection that is on a listener's SYN queue or accept queue,
 * i.e., one that has not had a struct socket associated with it.
 * Must be called from process context.
 *
 * Modeled after code in inet_csk_listen_stop().
 */
static void
t3_reset_listen_child(struct socket *child)
{
	struct tcpcb *tp = sototcpcb(child);

	t3_send_reset(tp->t_toe);
}

/*
 * Disconnect offloaded established but not yet accepted connections sitting
 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
 */
void
t3_disconnect_acceptq(struct socket *listen_so)
{
	struct socket *so;
	struct tcpcb *tp;

	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
		tp = sototcpcb(so);

		if (tp->t_flags & TF_TOE) {
			INP_LOCK(tp->t_inpcb);
			t3_reset_listen_child(so);
			INP_UNLOCK(tp->t_inpcb);
		}
	}
}

/*
 * Reset offloaded connections sitting on a server's syn queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */
void
t3_reset_synq(struct listen_ctx *lctx)
{
	struct toepcb *toep;

	SOCK_LOCK(lctx->lso);
	while (!LIST_EMPTY(&lctx->synq_head)) {
		toep = LIST_FIRST(&lctx->synq_head);
		LIST_REMOVE(toep, synq_entry);
		toep->tp_tp = NULL;
		t3_send_reset(toep);
		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
		toepcb_release(toep);
	}
	SOCK_UNLOCK(lctx->lso);
}

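/*
 * Program the page pods for a DDP gather list.  The description below is a
 * sketch inferred from the body of the function rather than from hardware
 * documentation: each non-sentinel pagepod carries the TID, the tag/color
 * identifying the buffer, the buffer's maximum offset, the starting page
 * offset, and up to five page addresses (pp_addr[0..4]).  Pods advance four
 * pages at a time (pidx = 4 * i), so the fifth address of pod i duplicates
 * the first address of pod i + 1; e.g. pod 0 maps pages 0-4 and pod 1 maps
 * pages 4-8 of the gather list.  The final NUM_SENTINEL_PPODS pods are
 * written invalid as end markers.  Each pod is written to adapter memory at
 * ppod_addr with its own ULP_MEM_WRITE work request.
 */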
int
t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
	       unsigned int nppods, unsigned int tag, unsigned int maxoff,
	       unsigned int pg_off, unsigned int color)
{
	unsigned int i, j, pidx;
	struct pagepod *p;
	struct mbuf *m;
	struct ulp_mem_io *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	unsigned int tid = toep->tp_tid;
	const struct tom_data *td = TOM_DATA(TOE_DEV(so));
	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;

	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
	    gl, nppods, tag, maxoff, pg_off, color);

	for (i = 0; i < nppods; ++i) {
		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
		req = mtod(m, struct ulp_mem_io *);
		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
		req->wr.wr_lo = 0;
		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
					   V_ULPTX_CMD(ULP_MEM_WRITE));
		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));

		p = (struct pagepod *)(req + 1);
		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
						  V_PPOD_COLOR(color));
			p->pp_max_offset = htonl(maxoff);
			p->pp_page_offset = htonl(pg_off);
			p->pp_rsvd = 0;
			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
				p->pp_addr[j] = pidx < gl->dgl_nelem ?
				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
		} else
			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
		send_or_defer(toep, m, 0);
		ppod_addr += PPOD_SIZE;
	}
	return (0);
}

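/*
 * The mk_*_ulp() helpers below build individual CPL messages as ULP_TX_PKT
 * payloads so that several of them can be packed into one BYPASS work
 * request.  An illustrative layout of such a compound WR (a sketch based on
 * how t3_cancel_ddpbuf() assembles one below, not a normative format):
 *
 *	struct work_request_hdr  wr;      FW_WROPCODE_BYPASS
 *	struct cpl_barrier       b;       each CPL is prefixed by a
 *	struct cpl_set_tcb_field req;     ulp_txpkt header giving its
 *	struct cpl_get_tcb       getreq;  length in 8-byte flits
 *	struct cpl_barrier       b2;
 */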
/*
 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
 */
static inline void
mk_cpl_barrier_ulp(struct cpl_barrier *b)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
	b->opcode = CPL_BARRIER;
}

/*
 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
 */
static inline void
mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
	req->cpuno = htons(cpuno);
}

/*
 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
 */
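/*
 * Semantics sketch (inferred from how callers pair mask and val, not from
 * TCB documentation): the hardware applies a masked update to the 64-bit
 * TCB word, roughly tcb[word] = (tcb[word] & ~mask) | (val & mask), so a
 * caller can flip one flag without disturbing its neighbors, e.g.
 *
 *	mk_set_tcb_field_ulp(req, tid, W_TCB_RX_DDP_FLAGS,
 *	    V_TF_DDP_BUF0_VALID(1),	mask: touch only this bit
 *	    V_TF_DDP_BUF0_VALID(1));	val: set it to 1
 */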
static inline void
mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
                     unsigned int word, uint64_t mask, uint64_t val)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
	    tid, word, mask, val);

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
	req->reply = V_NO_REPLY(1);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);
}

/*
 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
 */
static void
mk_rx_data_ack_ulp(struct socket *so, struct cpl_rx_data_ack *ack,
    unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(TOM_TUNABLE(TOE_DEV(so), delack)) |
	    V_RX_CREDITS(credits));
}

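/*
 * Cancel HW DDP buffer bufidx for a connection.  A sketch of what the
 * compound WR built here does, inferred from the flag manipulation below:
 * bracketed by CPL_BARRIERs, it atomically marks the cancelled buffer
 * invalid, steers the active-buffer bit toward the surviving buffer, and
 * then issues a CPL_GET_TCB so the reply can report how much data had
 * already been placed before the cancel took effect.
 */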
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this is actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already.  However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	if (bufidx == 0)
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the toepcb for the connection that owns the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
 */
void
t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
		  unsigned int tag1, unsigned int len)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_get_tcb *getreq;
	struct cpl_set_tcb_field *req;
	struct ddp_state *p = &toep->tp_ddp_state;

	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
	    bufidx, tag0, tag1, len);
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	m->m_pkthdr.len = m->m_len = wrlen;
	bzero(wr, wrlen);

	/* Set the ATOMIC flag to make sure that TP processes the following
	 * CPLs in an atomic manner and no wire segments can be interleaved.
	 */
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
	req = (struct cpl_set_tcb_field *)(wr + 1);
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)M_TCB_RX_DDP_BUF1_TAG) << 32,
			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
	req++;
	if (bufidx == 0) {
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
		req++;
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			    V_TF_DDP_PUSH_DISABLE_0(1) |
			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			    V_TF_DDP_PUSH_DISABLE_0(0) |
			    V_TF_DDP_BUF0_VALID(1));
	} else {
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
		req++;
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			    V_TF_DDP_PUSH_DISABLE_1(1) |
			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			    V_TF_DDP_PUSH_DISABLE_1(0) |
			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
	}

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
		  "len %d",
		  bufidx, tag0, tag1, len);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 */
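/*
 * A note on sizing and credit return (a reading of the code below, not a
 * spec): the WR is sized exactly, with one SET_TCB_FIELD for the flags,
 * one more per nonzero buffer length, and a trailing RX_DATA_ACK only when
 * modulate is set.  In that case the RX_DATA_ACK returns the receive-window
 * credits accumulated since the last update,
 * tp_copied_seq - tp_rcv_wup, and tp_rcv_wup is advanced to match.
 */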
void
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
		 unsigned int len1, unsigned int offset1,
		 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
	    len0, offset0, len1, offset1, (unsigned int)(ddp_flags >> 32),
	    (unsigned int)ddp_flags);

	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
		(len1 ? sizeof(*req) : 0) +
		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {                  /* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
		req++;
	}
	if (len1) {                  /* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			V_TCB_RX_DDP_BUF1_LEN((uint64_t)M_TCB_RX_DDP_BUF1_LEN) << 32,
			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
		req++;
	}

	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
			     ddp_flags);

	if (modulate) {
		mk_rx_data_ack_ulp(toeptoso(toep),
		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
		    toep->tp_copied_seq - toep->tp_rcv_wup);
		toep->tp_rcv_wup = toep->tp_copied_seq;
	}

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
		  "modulate %d",
		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
		  modulate);
#endif

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

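/*
 * Precompute the number of work requests needed to send an mbuf chain with
 * i fragments.  A sketch of the arithmetic below (my reading of the code,
 * with wr_len in 8-byte flits): scatter/gather entries pack two per three
 * flits, so i fragments need (3 * i) / 2 + (i & 1) flits, plus 3 flits of
 * WR and TX_DATA_WR header.  If that fits within one WR's wr_len flits a
 * single WR suffices; otherwise continuation WRs, each losing one flit to
 * its own header, are added.  E.g. with wr_len = 9, a 4-fragment chain
 * needs 6 + 3 = 9 flits and exactly one WR.
 */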
void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])     /* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;
		mbuf_wrs[i] = sgl_len <= wr_len ?
			1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	wrlen = wr_len * 8;
}

int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		       "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif

	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
	return (0);
}