/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 177540 2008-03-24 05:21:10Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
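
/*
 * Illustrative sketch (not part of the driver): how the table above is
 * meant to be consumed.  The helper name and the "submode" argument are
 * hypothetical; the real Tx path tracks sequence numbers through
 * toep->tp_write_seq.
 */
#if 0
static inline unsigned int
ulp_extra_len(unsigned int submode)
{
	return (t3_ulp_extra_len[submode & 3]);
}
/* Sending "len" payload bytes with a given ULP submode would then do: */
/*	toep->tp_write_seq += len + ulp_extra_len(submode);           */
#endif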

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
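
/*
 * Illustrative sketch (not the driver's actual initialization, which lives
 * elsewhere in the TOM code): a table like mbuf_wrs[] could be built by
 * computing how many WRs of wrlen bytes it takes to carry a TX_DATA WR
 * header plus i gather-list entries.  The per-entry size used here is an
 * assumption for the example only.
 */
#if 0
static void
init_mbuf_wrs_sketch(void)
{
	unsigned int i;

	for (i = 1; i <= TX_MAX_SEGS; i++)
		mbuf_wrs[i] = howmany(sizeof(struct tx_data_wr) +
		    i * sizeof(struct sg_ent), wrlen);
}
#endif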

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	sbappend_locked(sb, n);
	m = sb->sb_mb;
	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);	/* send through L2T */
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);		/* send directly */
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must be already properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	inp_wlock_assert(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
			   V_TX_URG(/* skb_urgent(skb) */ 0) |
			   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Send buffer is in units of 32KB. */
		if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
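		/*
		 * Worked example: a 256KB send buffer encodes as
		 * V_TX_SNDBUF(262144 >> 15) == V_TX_SNDBUF(8).
		 */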
		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int i, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_snd);
	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/*
		 * If the data in tail fits in-line, then
		 * make an immediate-data WR.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}

		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	so = tp->t_inpcb->inp_socket;
	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			SOCKBUF_LOCK(&so->so_rcv);
			toep->tp_copied_seq += copied;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		return;
	}

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = so->so_rcv.sb_cc;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax) {
		printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = so->so_rcv.sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;
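
	/*
	 * Worked example: with rcv_wnd = 64KB, 48KB of pending credits
	 * forces an immediate return (48KB + 16KB >= 64KB), while 32KB
	 * does not and instead waits for the rx_credit_thres tunable.
	 */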
	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = tp->t_inpcb->inp_socket;
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = tp->t_inpcb->inp_socket;
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_wlock_assert(tp->t_inpcb);
	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_wlock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on) {
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return (0);
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_wlock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);
	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
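
/*
 * Worked example (hypothetical table): with MTUs {576, 1500, 9000}, a
 * target of 1400 returns index 0 (576) and a target of 1500 returns
 * index 1, i.e. the largest entry that does not exceed the target.
 */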

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
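
/*
 * Worked example: with pmtu = 1500, t_maxseg starts at 1460 (pmtu minus
 * 40 bytes of IP + TCP headers); the table is searched with 1500, and
 * t_maxseg is then re-derived from the chosen entry, staying at 1460 if
 * the table contains 1500.
 */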

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_wlock_assert(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					/* we have TID */
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
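
/*
 * Worked example: for a 256KB target window, space is halved until it
 * fits in 16 bits: 256KB -> 128KB -> 64KB -> 32KB, giving wscale = 3.
 */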

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = so->so_rcv.sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}
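
/*
 * Worked example: on a pre-T3C part with a 4KB RX page size (a value
 * assumed here for illustration), the cap works out to 4096 * 23 =
 * 94208 bytes; T3C and later use MAX_RCV_WND.
 */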

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;
	so->so_rcv.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wlock_assert(tp->t_inpcb);
		tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	inp_wlock(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	inp_wunlock(inp);
done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	inp_wunlock(inp);
}
#endif

/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_wlock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		inp_wlock_assert(tp->t_inpcb);
		so = toeptoso(toep);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		inp_wlock(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(so);
	}

	return (0);
}

static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;

	uint64_t t;
	__be64 *tcb;

	so = toeptoso(toep);
	tp = toep->tp_tp;

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.  We
	 * really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * mbuf is required.  However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		struct socket *so = toeptoso(toep);

		m_freem(m);
		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
			sorwakeup_locked(so);
		else
			SOCKBUF_UNLOCK(&so->so_rcv);
		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#ifdef T3_TRACE
	T3_TRACE3(TIDTB(so),
		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
		  tp->rcv_nxt, q->cur_buf, ddp_offset);
#endif

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done.  If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			SOCKBUF_UNLOCK(&so->so_rcv);

			m_free(m);
			return;
		}
	} else {
		SOCKBUF_UNLOCK(&so->so_rcv);
		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
#ifdef T3_TRACE
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
#endif
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0)
		q->user_ddp_pending = 0;
	else
		SBAPPEND(&so->so_rcv, m);
	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);

	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);
	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not after tp->rcv_nxt=0x%08x",
		rcv_nxt, tp->rcv_nxt));
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	SOCKBUF_UNLOCK(&so->so_rcv);
}
1943
1944/*
1945 * Process new data received for a connection.
1946 */
1947static void
1948new_rx_data(struct toepcb *toep, struct mbuf *m)
1949{
1950	struct cpl_rx_data *hdr = cplhdr(m);
1951	struct tcpcb *tp = toep->tp_tp;
1952	struct socket *so = toeptoso(toep);
1953	int len = be16toh(hdr->len);
1954
1955	inp_wlock(tp->t_inpcb);
1956
1957	if (__predict_false(so_no_receive(so))) {
1958		handle_excess_rx(toep, m);
1959		inp_wunlock(tp->t_inpcb);
1960		TRACE_EXIT;
1961		return;
1962	}
1963
1964	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
1965		handle_ddp_data(toep, m);
1966
1967	m->m_seq = ntohl(hdr->seq);
1968	m->m_ulp_mode = 0;                    /* for iSCSI */
1969
1970#if VALIDATE_SEQ
1971	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
1972		log(LOG_ERR,
1973		       "%s: TID %u: Bad sequence number %u, expected %u\n",
1974		    TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
1975		       tp->rcv_nxt);
1976		m_freem(m);
1977		inp_wunlock(tp->t_inpcb);
1978		return;
1979	}
1980#endif
1981	m_adj(m, sizeof(*hdr));
1982
1983#ifdef URGENT_DATA_SUPPORTED
1984	/*
1985	 * We don't handle urgent data yet
1986	 */
1987	if (__predict_false(hdr->urg))
1988		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
1989	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
1990		     tp->urg_seq - tp->rcv_nxt < skb->len))
1991		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
1992							 tp->rcv_nxt];
1993#endif
1994	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
1995		toep->tp_delack_mode = hdr->dack_mode;
1996		toep->tp_delack_seq = tp->rcv_nxt;
1997	}
1998	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
1999	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2000
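	/*
	 * The length reported in the CPL header is authoritative; trim the
	 * mbuf if it carries more than that.
	 */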
2001	if (len < m->m_pkthdr.len)
2002		m->m_pkthdr.len = m->m_len = len;
2003
2004	tp->rcv_nxt += m->m_pkthdr.len;
2005	tp->t_rcvtime = ticks;
2006	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2007#ifdef T3_TRACE
2008	T3_TRACE2(TIDTB(sk),
2009	    "new_rx_data: seq 0x%x len %u",
2010	    m->m_seq, m->m_pkthdr.len);
2011#endif
2012	inp_wunlock(tp->t_inpcb);
2013	SOCKBUF_LOCK(&so->so_rcv);
2014	if (sb_notify(&so->so_rcv))
2015		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
2016
2017	SBAPPEND(&so->so_rcv, m);
2018
#ifdef notyet
	/*
	 * We're giving too many credits to the card, but disable this
	 * check for now so we can keep moving.
	 */
	KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
	    so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
#endif
2031	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2032	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
2033
2034	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2035		sorwakeup_locked(so);
2036	else
2037		SOCKBUF_UNLOCK(&so->so_rcv);
2038}
2039
2040/*
2041 * Handler for RX_DATA CPL messages.
2042 */
2043static int
2044do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2045{
2046	struct toepcb *toep = (struct toepcb *)ctx;
2047
2048	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2049
2050	new_rx_data(toep, m);
2051
2052	return (0);
2053}
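
/*
 * Handlers such as do_rx_data() all share the t3cdev CPL-handler signature
 * (struct t3cdev *, struct mbuf *, void *ctx) and are bound to their CPL
 * opcodes when the module initializes.  A minimal sketch of that wiring;
 * the registration helper named here is an assumption, not shown in this
 * file:
 *
 *	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 *	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 */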
2054
2055static void
2056new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2057{
2058	struct tcpcb *tp;
2059	struct ddp_state *q;
2060	struct ddp_buf_state *bsp;
2061	struct cpl_rx_data_ddp *hdr;
2062	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2063	struct socket *so = toeptoso(toep);
2064	int nomoredata = 0;
2065	unsigned int delack_mode;
2066
2067	tp = sototcpcb(so);
2068
2069	inp_wlock(tp->t_inpcb);
2070	if (__predict_false(so_no_receive(so))) {
2071
2072		handle_excess_rx(toep, m);
2073		inp_wunlock(tp->t_inpcb);
2074		return;
2075	}
2076
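	/*
	 * The ddp_report word encodes which of the two HW buffers the
	 * payload landed in (S_DDP_BUF_IDX), the ending offset within that
	 * buffer (G_DDP_OFFSET), the delayed-ACK mode in effect, and
	 * completion/push bits that are folded into m_ddp_flags below.
	 */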
2077	q = &toep->tp_ddp_state;
2078	hdr = cplhdr(m);
2079	ddp_report = ntohl(hdr->u.ddp_report);
2080	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2081	bsp = &q->buf_state[buf_idx];
2082
2083#ifdef T3_TRACE
2084	T3_TRACE5(TIDTB(sk),
2085		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2086		  "hdr seq 0x%x len %u offset %u",
2087		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2088		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
2089	T3_TRACE1(TIDTB(sk),
2090		  "new_rx_data_ddp: ddp_report 0x%x",
2091		  ddp_report);
2092#endif
2093	CTR4(KTR_TOM,
2094	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2095	    "hdr seq 0x%x len %u",
2096	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2097	    ntohs(hdr->len));
2098	CTR3(KTR_TOM,
2099	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2100	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2101
2102	ddp_len = ntohs(hdr->len);
2103	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2104
2105	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2107		toep->tp_delack_mode = delack_mode;
2108		toep->tp_delack_seq = tp->rcv_nxt;
2109	}
2110
2111	m->m_seq = tp->rcv_nxt;
2112	tp->rcv_nxt = rcv_nxt;
2113
2114	tp->t_rcvtime = ticks;
	/*
	 * Store the length in m_len.  We are changing the meaning of m_len
	 * here, so we need to be very careful that nothing from now on
	 * interprets the m_len of this mbuf the usual way.
	 */
2120	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2121	inp_wunlock(tp->t_inpcb);
2122	CTR3(KTR_TOM,
2123	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2124	    m->m_len, rcv_nxt, m->m_seq);
	/*
	 * Figure out where the new data was placed in the buffer and store
	 * it in m_cur_offset.  This assumes the buffer offset starts at 0;
	 * the consumer needs to account for the page pod's pg_offset.
	 */
2130	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2131	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2132
2133	SOCKBUF_LOCK(&so->so_rcv);
2134	m->m_ddp_gl = (unsigned char *)bsp->gl;
2135	m->m_flags |= M_DDP;
2136	bsp->cur_offset = end_offset;
2137	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2138
2139	/*
2140	 * Length is only meaningful for kbuf
2141	 */
2142	if (!(bsp->flags & DDP_BF_NOCOPY))
2143		KASSERT(m->m_len <= bsp->gl->dgl_length,
2144		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2145			m->m_len, bsp->gl->dgl_length));
2146
2147	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
2155	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2156		panic("spurious ddp completion");
2157	} else {
2158		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2159		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2160			q->cur_buf ^= 1;                     /* flip buffers */
2161	}
2162
2163	if (bsp->flags & DDP_BF_NOCOPY) {
2164		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2165		bsp->flags &= ~DDP_BF_NOCOPY;
2166	}
2167
2168	if (ddp_report & F_DDP_PSH)
2169		m->m_ddp_flags |= DDP_BF_PSH;
2170	if (nomoredata)
2171		m->m_ddp_flags |= DDP_BF_NODATA;
2172
2173#ifdef notyet
2174	skb_reset_transport_header(skb);
2175	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2176#endif
2177	SBAPPEND(&so->so_rcv, m);
2178
2179	if ((so->so_state & SS_NOFDREF) == 0)
2180		sorwakeup_locked(so);
2181	else
2182		SOCKBUF_UNLOCK(&so->so_rcv);
2183}
2184
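/*
 * Union of the fatal DDP error bits that can be reported in the
 * ddpvld_status word of an RX_DATA_DDP CPL; do_rx_data_ddp() drops the
 * message and logs an error if any of them are set.
 */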
2185#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2186		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2187		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2188		 F_DDP_INVALID_PPOD)
2189
2190/*
2191 * Handler for RX_DATA_DDP CPL messages.
2192 */
2193static int
2194do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2195{
2196	struct toepcb *toep = ctx;
2197	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2198
2199	VALIDATE_SOCK(so);
2200
2201	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2202		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2203		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2204		return (CPL_RET_BUF_DONE);
2205	}
2206#if 0
2207	skb->h.th = tcphdr_skb->h.th;
2208#endif
2209	new_rx_data_ddp(toep, m);
2210	return (0);
2211}
2212
2213static void
2214process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2215{
2216	struct tcpcb *tp = toep->tp_tp;
2217	struct socket *so = toeptoso(toep);
2218	struct ddp_state *q;
2219	struct ddp_buf_state *bsp;
2220	struct cpl_rx_ddp_complete *hdr;
2221	unsigned int ddp_report, buf_idx, when, delack_mode;
2222	int nomoredata = 0;
2223
2224	inp_wlock(tp->t_inpcb);
2225	if (__predict_false(so_no_receive(so))) {
2226		struct inpcb *inp = sotoinpcb(so);
2227
2228		handle_excess_rx(toep, m);
2229		inp_wunlock(inp);
2230		return;
2231	}
2232	q = &toep->tp_ddp_state;
2233	hdr = cplhdr(m);
2234	ddp_report = ntohl(hdr->ddp_report);
2235	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2236	m->m_pkthdr.csum_data = tp->rcv_nxt;
2239	SOCKBUF_LOCK(&so->so_rcv);
2240	bsp = &q->buf_state[buf_idx];
2241	when = bsp->cur_offset;
2242	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2243	tp->rcv_nxt += m->m_len;
2244	tp->t_rcvtime = ticks;
2245
2246	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2248		toep->tp_delack_mode = delack_mode;
2249		toep->tp_delack_seq = tp->rcv_nxt;
2250	}
2251#ifdef notyet
2252	skb_reset_transport_header(skb);
2253	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2254#endif
2255	inp_wunlock(tp->t_inpcb);
2256
2257	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2258	CTR5(KTR_TOM,
2259		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2260		  "ddp_report 0x%x offset %u, len %u",
2261		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2262		   G_DDP_OFFSET(ddp_report), m->m_len);
2263
2264	bsp->cur_offset += m->m_len;
2265
2266	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2267		q->cur_buf ^= 1;                     /* flip buffers */
2268		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata = 1;
2270	}
2271
2272	CTR4(KTR_TOM,
2273		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2274		  "ddp_report %u offset %u",
2275		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2276		   G_DDP_OFFSET(ddp_report));
2277
2278	m->m_ddp_gl = (unsigned char *)bsp->gl;
2279	m->m_flags |= M_DDP;
2280	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2281	if (bsp->flags & DDP_BF_NOCOPY)
2282		bsp->flags &= ~DDP_BF_NOCOPY;
2283	if (nomoredata)
2284		m->m_ddp_flags |= DDP_BF_NODATA;
2287	SBAPPEND(&so->so_rcv, m);
2288
2289	if ((so->so_state & SS_NOFDREF) == 0)
2290		sorwakeup_locked(so);
2291	else
2292		SOCKBUF_UNLOCK(&so->so_rcv);
2293}
2294
2295/*
2296 * Handler for RX_DDP_COMPLETE CPL messages.
2297 */
2298static int
2299do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2300{
2301	struct toepcb *toep = ctx;
2302
2303	VALIDATE_SOCK(so);
2304#if 0
2305	skb->h.th = tcphdr_skb->h.th;
2306#endif
2307	process_ddp_complete(toep, m);
2308	return (0);
2309}
2310
2311/*
2312 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2313 * socket state before calling tcp_time_wait to comply with its expectations.
2314 */
2315static void
2316enter_timewait(struct socket *so)
2317{
2318	struct tcpcb *tp = sototcpcb(so);
2319
2320	inp_wlock_assert(tp->t_inpcb);
2321	/*
2322	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2323	 * process peer_close because we don't want to carry the peer FIN in
2324	 * the socket's receive queue and if we increment rcv_nxt without
2325	 * having the FIN in the receive queue we'll confuse facilities such
2326	 * as SIOCINQ.
2327	 */
2328	tp->rcv_nxt++;
2329
2330	tp->ts_recent_age = 0;	     /* defeat recycling */
2331	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2332	tcp_twstart(tp);
2333}
2334
2335/*
2336 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2337 * function deals with the data that may be reported along with the FIN.
2338 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2339 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed,
 * 0 that the mbuf can be freed.
2342 */
2343static int
2344handle_peer_close_data(struct socket *so, struct mbuf *m)
2345{
2346	struct tcpcb *tp = sototcpcb(so);
2347	struct toepcb *toep = tp->t_toe;
2348	struct ddp_state *q;
2349	struct ddp_buf_state *bsp;
2350	struct cpl_peer_close *req = cplhdr(m);
2351	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2352
2353	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2354		return (0);
2355
2356	if (__predict_false(so_no_receive(so))) {
2357		handle_excess_rx(toep, m);
2358
2359		/*
2360		 * Although we discard the data we want to process the FIN so
2361		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2362		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2363		 * may be what will close the connection.  We return 1 because
2364		 * handle_excess_rx() already freed the packet.
2365		 */
2366		return (1);
2367	}
2368
2369	inp_wlock_assert(tp->t_inpcb);
2370	q = &toep->tp_ddp_state;
2371	SOCKBUF_LOCK(&so->so_rcv);
2372	bsp = &q->buf_state[q->cur_buf];
2373	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2374	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2375	m->m_ddp_gl = (unsigned char *)bsp->gl;
2376	m->m_flags |= M_DDP;
2377	m->m_cur_offset = bsp->cur_offset;
2378	m->m_ddp_flags =
2379	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2380	m->m_seq = tp->rcv_nxt;
2381	tp->rcv_nxt = rcv_nxt;
2382	bsp->cur_offset += m->m_pkthdr.len;
2383	if (!(bsp->flags & DDP_BF_NOFLIP))
2384		q->cur_buf ^= 1;
2385#ifdef notyet
2386	skb_reset_transport_header(skb);
2387	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2388#endif
2389	tp->t_rcvtime = ticks;
2390	SBAPPEND(&so->so_rcv, m);
2391	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2392		sorwakeup_locked(so);
2393	else
2394		SOCKBUF_UNLOCK(&so->so_rcv);
2395	return (1);
2396}
2397
2398/*
2399 * Handle a peer FIN.
2400 */
2401static void
2402do_peer_fin(struct socket *so, struct mbuf *m)
2403{
2404	struct tcpcb *tp = sototcpcb(so);
2405	struct toepcb *toep = tp->t_toe;
2406	int keep = 0;
2407	DPRINTF("do_peer_fin state=%d\n", tp->t_state);
2408
2409#ifdef T3_TRACE
2410	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
2411#endif
2412
2413	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2414		printf("abort_pending set\n");
2415
2416		goto out;
2417	}
2418	INP_INFO_WLOCK(&tcbinfo);
2419	inp_wlock(tp->t_inpcb);
2420	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2421		keep = handle_peer_close_data(so, m);
2422		if (keep < 0) {
2423			INP_INFO_WUNLOCK(&tcbinfo);
2424			inp_wunlock(tp->t_inpcb);
2425			return;
2426		}
2427	}
2428	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2429		socantrcvmore(so);
2430		/*
2431		 * If connection is half-synchronized
2432		 * (ie NEEDSYN flag on) then delay ACK,
2433		 * so it may be piggybacked when SYN is sent.
2434		 * Otherwise, since we received a FIN then no
2435		 * more input can be expected, send ACK now.
2436		 */
2437		if (tp->t_flags & TF_NEEDSYN)
2438			tp->t_flags |= TF_DELACK;
2439		else
2440			tp->t_flags |= TF_ACKNOW;
2441		tp->rcv_nxt++;
2442	}
2443
2444	switch (tp->t_state) {
2445	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
2448	case TCPS_ESTABLISHED:
2449		tp->t_state = TCPS_CLOSE_WAIT;
2450		break;
2451	case TCPS_FIN_WAIT_1:
2452		tp->t_state = TCPS_CLOSING;
2453		break;
2454	case TCPS_FIN_WAIT_2:
2455		/*
		 * If we've sent an abort_req we must have sent it too late;
2457		 * HW will send us a reply telling us so, and this peer_close
2458		 * is really the last message for this connection and needs to
2459		 * be treated as an abort_rpl, i.e., transition the connection
2460		 * to TCP_CLOSE (note that the host stack does this at the
2461		 * time of generating the RST but we must wait for HW).
2462		 * Otherwise we enter TIME_WAIT.
2463		 */
2464		t3_release_offload_resources(toep);
2465		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2466			tp = tcp_close(tp);
2467		} else {
2468			enter_timewait(so);
2469			tp = NULL;
2470		}
2471		break;
2472	default:
2473		log(LOG_ERR,
2474		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2475		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
2476	}
2477	INP_INFO_WUNLOCK(&tcbinfo);
2478	if (tp)
2479		inp_wunlock(tp->t_inpcb);
2480
2481	DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
2482
2483#ifdef notyet
2484	/* Do not send POLL_HUP for half duplex close. */
2485	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2486	    sk->sk_state == TCP_CLOSE)
2487		sk_wake_async(so, 1, POLL_HUP);
2488	else
2489		sk_wake_async(so, 1, POLL_IN);
2490#endif
2491
2492out:
2493	if (!keep)
2494		m_free(m);
2495}
2496
2497/*
2498 * Handler for PEER_CLOSE CPL messages.
2499 */
2500static int
2501do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2502{
2503	struct toepcb *toep = (struct toepcb *)ctx;
2504	struct socket *so = toeptoso(toep);
2505
2506	VALIDATE_SOCK(so);
2507
2508	do_peer_fin(so, m);
2509	return (0);
2510}
2511
2512static void
2513process_close_con_rpl(struct socket *so, struct mbuf *m)
2514{
2515	struct tcpcb *tp = sototcpcb(so);
2516	struct cpl_close_con_rpl *rpl = cplhdr(m);
2517	struct toepcb *toep = tp->t_toe;
2518
2519	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2520
2521	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
2522	    !!(so->so_state & SS_NOFDREF));
2523	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
2524		goto out;
2525
2526	INP_INFO_WLOCK(&tcbinfo);
2527	inp_wlock(tp->t_inpcb);
2528	switch (tp->t_state) {
2529	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2530		t3_release_offload_resources(toep);
2531		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2532			tp = tcp_close(tp);
2533
2534		} else {
2535			enter_timewait(so);
2536			tp = NULL;
2537			soisdisconnected(so);
2538		}
2539		break;
2540	case TCPS_LAST_ACK:
2541		/*
2542		 * In this state we don't care about pending abort_rpl.
2543		 * If we've sent abort_req it was post-close and was sent too
2544		 * late, this close_con_rpl is the actual last message.
2545		 */
2546		t3_release_offload_resources(toep);
2547		tp = tcp_close(tp);
2548		break;
2549	case TCPS_FIN_WAIT_1:
2550		/*
2551		 * If we can't receive any more
2552		 * data, then closing user can proceed.
2553		 * Starting the timer is contrary to the
2554		 * specification, but if we don't get a FIN
2555		 * we'll hang forever.
2556		 *
2557		 * XXXjl:
2558		 * we should release the tp also, and use a
2559		 * compressed state.
2560		 */
2561		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2562			int timeout;
2563
2564			soisdisconnected(so);
2565			timeout = (tcp_fast_finwait2_recycle) ?
2566			    tcp_finwait2_timeout : tcp_maxidle;
2567			tcp_timer_activate(tp, TT_2MSL, timeout);
2568		}
2569		tp->t_state = TCPS_FIN_WAIT_2;
2570		if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
2571		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2572			tp = tcp_drop(tp, 0);
2573		}
2574
2575		break;
2576	default:
2577		log(LOG_ERR,
2578		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2579		       TOE_DEV(so)->tod_name, toep->tp_tid,
2580		       tp->t_state);
2581	}
2582	INP_INFO_WUNLOCK(&tcbinfo);
2583	if (tp)
2584		inp_wunlock(tp->t_inpcb);
2585out:
2586	m_freem(m);
2587}
2588
2589/*
2590 * Handler for CLOSE_CON_RPL CPL messages.
2591 */
2592static int
2593do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2594			    void *ctx)
2595{
2596	struct toepcb *toep = (struct toepcb *)ctx;
2597	struct socket *so = toeptoso(toep);
2598
2599	VALIDATE_SOCK(so);
2600
2601	process_close_con_rpl(so, m);
2602	return (0);
2603}
2604
2605/*
2606 * Process abort replies.  We only process these messages if we anticipate
2607 * them as the coordination between SW and HW in this area is somewhat lacking
2608 * and sometimes we get ABORT_RPLs after we are done with the connection that
2609 * originated the ABORT_REQ.
2610 */
2611static void
2612process_abort_rpl(struct socket *so, struct mbuf *m)
2613{
2614	struct tcpcb *tp = sototcpcb(so);
2615	struct toepcb *toep = tp->t_toe;
2616
2617#ifdef T3_TRACE
2618	T3_TRACE1(TIDTB(sk),
2619		  "process_abort_rpl: GTS rpl pending %d",
2620		  sock_flag(sk, ABORT_RPL_PENDING));
2621#endif
2622
2623	INP_INFO_WLOCK(&tcbinfo);
2624	inp_wlock(tp->t_inpcb);
2625
2626	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2627		/*
2628		 * XXX panic on tcpdrop
2629		 */
2630		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2631			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2632		else {
2633			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2634			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2635			    !is_t3a(TOE_DEV(so))) {
2636				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2637					panic("TP_ABORT_REQ_RCVD set");
2638				t3_release_offload_resources(toep);
2639				tp = tcp_close(tp);
2640			}
2641		}
2642	}
2643	if (tp)
2644		inp_wunlock(tp->t_inpcb);
2645	INP_INFO_WUNLOCK(&tcbinfo);
2646
2647	m_free(m);
2648}
2649
2650/*
2651 * Handle an ABORT_RPL_RSS CPL message.
2652 */
2653static int
2654do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2655{
2656	struct socket *so;
2657	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2658	struct toepcb *toep;
2659
2660	/*
2661	 * Ignore replies to post-close aborts indicating that the abort was
2662	 * requested too late.  These connections are terminated when we get
2663	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2664	 * arrives the TID is either no longer used or it has been recycled.
2665	 */
2666	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2667discard:
2668		m_free(m);
2669		return (0);
2670	}
2671
2672	toep = (struct toepcb *)ctx;
2673
	/*
2675	 * Sometimes we've already closed the socket, e.g., a post-close
2676	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2677	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2678	 * but FW turns the ABORT_REQ into a regular one and so we get
2679	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2680	 */
2681	if (!toep)
2682		goto discard;
2683
2684	if (toep->tp_tp == NULL) {
2685		printf("removing tid for abort\n");
2686		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2687		if (toep->tp_l2t)
2688			l2t_release(L2DATA(cdev), toep->tp_l2t);
2689
2690		toepcb_release(toep);
2691		goto discard;
2692	}
2693
2694	printf("toep=%p\n", toep);
2695	printf("tp=%p\n", toep->tp_tp);
2696
2697	so = toeptoso(toep); /* <- XXX panic */
2698	toepcb_hold(toep);
2699	process_abort_rpl(so, m);
2700	toepcb_release(toep);
2701	return (0);
2702}
2703
2704/*
2705 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2706 * indicate whether RST should be sent in response.
2707 */
2708static int
2709abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2710{
2711	struct tcpcb *tp = sototcpcb(so);
2712
2713	switch (abort_reason) {
2714	case CPL_ERR_BAD_SYN:
2715#if 0
2716		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2717#endif
2718	case CPL_ERR_CONN_RESET:
2719		// XXX need to handle SYN_RECV due to crossed SYNs
2720		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2721	case CPL_ERR_XMIT_TIMEDOUT:
2722	case CPL_ERR_PERSIST_TIMEDOUT:
2723	case CPL_ERR_FINWAIT2_TIMEDOUT:
2724	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2725#if 0
2726		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2727#endif
2728		return (ETIMEDOUT);
2729	default:
2730		return (EIO);
2731	}
2732}
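
/*
 * A typical consumer pairs the errno with the RST hint, as
 * process_abort_req() below does:
 *
 *	so->so_error = abort_status_to_errno(so, req->status, &rst_status);
 */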
2733
2734static inline void
2735set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2736{
2737	struct cpl_abort_rpl *rpl = cplhdr(m);
2738
2739	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2740	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2741	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2742
2743	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2744	rpl->cmd = cmd;
2745}
2746
2747static void
2748send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2749{
2750	struct mbuf *reply_mbuf;
2751	struct cpl_abort_req_rss *req = cplhdr(m);
2752
2753	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2756	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2757	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2758	m_free(m);
2759}
2760
2761/*
2762 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2763 */
2764static inline int
2765is_neg_adv_abort(unsigned int status)
2766{
	return (status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2769}
2770
2771static void
2772send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2773{
2774	struct mbuf  *reply_mbuf;
2775	struct cpl_abort_req_rss *req = cplhdr(m);
2776
2777	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2778
2779	if (!reply_mbuf) {
		/* Defer the reply.  Stick rst_status into req->status. */
2781		req->status = rst_status;
2782		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2783		return;
2784	}
2785
2786	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2787	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2788	m_free(m);
2789
2790	/*
2791	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2792	 * these messages while ARP is pending.  For other connection states
2793	 * it's not a problem.
2794	 */
2795	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2796}
2797
2798#ifdef notyet
2799static void
2800cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2801{
2802	CXGB_UNIMPLEMENTED();
2803#ifdef notyet
2804	struct request_sock *req = child->sk_user_data;
2805
2806	inet_csk_reqsk_queue_removed(parent, req);
2807	synq_remove(tcp_sk(child));
2808	__reqsk_free(req);
2809	child->sk_user_data = NULL;
2810#endif
2811}
2812
2813
2814/*
2815 * Performs the actual work to abort a SYN_RECV connection.
2816 */
2817static void
2818do_abort_syn_rcv(struct socket *child, struct socket *parent)
2819{
2820	struct tcpcb *parenttp = sototcpcb(parent);
2821	struct tcpcb *childtp = sototcpcb(child);
2822
2823	/*
2824	 * If the server is still open we clean up the child connection,
2825	 * otherwise the server already did the clean up as it was purging
2826	 * its SYN queue and the skb was just sitting in its backlog.
2827	 */
2828	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2829		cleanup_syn_rcv_conn(child, parent);
2830		INP_INFO_WLOCK(&tcbinfo);
2831		inp_wlock(childtp->t_inpcb);
2832		t3_release_offload_resources(childtp->t_toe);
2833		childtp = tcp_close(childtp);
2834		INP_INFO_WUNLOCK(&tcbinfo);
2835		if (childtp)
2836			inp_wunlock(childtp->t_inpcb);
2837	}
2838}
2839#endif
2840
2841/*
2842 * Handle abort requests for a SYN_RECV connection.  These need extra work
2843 * because the socket is on its parent's SYN queue.
2844 */
2845static int
2846abort_syn_rcv(struct socket *so, struct mbuf *m)
2847{
2848	CXGB_UNIMPLEMENTED();
2849#ifdef notyet
2850	struct socket *parent;
2851	struct toedev *tdev = TOE_DEV(so);
2852	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2853	struct socket *oreq = so->so_incomp;
2854	struct t3c_tid_entry *t3c_stid;
2855	struct tid_info *t;
2856
2857	if (!oreq)
2858		return -1;        /* somehow we are not on the SYN queue */
2859
2860	t = &(T3C_DATA(cdev))->tid_maps;
2861	t3c_stid = lookup_stid(t, oreq->ts_recent);
2862	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2863
2864	SOCK_LOCK(parent);
2865	do_abort_syn_rcv(so, parent);
2866	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2867	SOCK_UNLOCK(parent);
2868#endif
2869	return (0);
2870}
2871
2872/*
2873 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2874 * request except that we need to reply to it.
2875 */
2876static void
2877process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2878{
2879	int rst_status = CPL_ABORT_NO_RST;
2880	const struct cpl_abort_req_rss *req = cplhdr(m);
2881	struct tcpcb *tp = sototcpcb(so);
2882	struct toepcb *toep = tp->t_toe;
2883
2884	inp_wlock(tp->t_inpcb);
2885	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2886		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2887		m_free(m);
2888		goto skip;
2889	}
2890
2891	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2892	/*
2893	 * Three cases to consider:
2894	 * a) We haven't sent an abort_req; close the connection.
2895	 * b) We have sent a post-close abort_req that will get to TP too late
2896	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2897	 *    be ignored and the connection should be closed now.
2898	 * c) We have sent a regular abort_req that will get to TP too late.
2899	 *    That will generate an abort_rpl with status 0, wait for it.
2900	 */
2901	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2902	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2903		so->so_error = abort_status_to_errno(so, req->status,
2904		    &rst_status);
2905		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2906			sorwakeup(so);
2907		/*
2908		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
2910		 */
2911		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2912			goto skip;
2913
2914		t3_release_offload_resources(toep);
2915		tp = tcp_close(tp);
2916	}
2917	if (tp)
2918		inp_wunlock(tp->t_inpcb);
2919	send_abort_rpl(m, tdev, rst_status);
2920	return;
2921
2922skip:
2923	inp_wunlock(tp->t_inpcb);
2924}
2925
2926/*
2927 * Handle an ABORT_REQ_RSS CPL message.
2928 */
2929static int
2930do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2931{
2932	const struct cpl_abort_req_rss *req = cplhdr(m);
2933	struct toepcb *toep = (struct toepcb *)ctx;
2934	struct socket *so;
2935	struct inpcb *inp;
2936
2937	if (is_neg_adv_abort(req->status)) {
2938		m_free(m);
2939		return (0);
2940	}
2941
2942	printf("aborting tid=%d\n", toep->tp_tid);
2943
2944	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2945		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2946		toep->tp_flags |= TP_ABORT_REQ_RCVD;
2947		printf("sending abort rpl\n");
2948
2949		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2950		printf("sent\n");
2951		if (toep->tp_l2t)
2952			l2t_release(L2DATA(cdev), toep->tp_l2t);
2953
2954		/*
2955		 *  Unhook
2956		 */
2957		toep->tp_tp->t_toe = NULL;
2958		toep->tp_tp->t_flags &= ~TF_TOE;
2959		toep->tp_tp = NULL;
2960		/*
2961		 * XXX need to call syncache_chkrst - but we don't
2962		 * have a way of doing that yet
2963		 */
2964		toepcb_release(toep);
2965		printf("abort for unestablished connection :-(\n");
2966		return (0);
2967	}
2968	if (toep->tp_tp == NULL) {
2969		printf("disconnected toepcb\n");
2970		/* should be freed momentarily */
2971		return (0);
2972	}
2973
2974	so = toeptoso(toep);
2975	inp = sotoinpcb(so);
2976
2977	VALIDATE_SOCK(so);
2978	toepcb_hold(toep);
2979	INP_INFO_WLOCK(&tcbinfo);
2980	process_abort_req(so, m, TOE_DEV(so));
2981	INP_INFO_WUNLOCK(&tcbinfo);
2982	toepcb_release(toep);
2983	return (0);
2984}
2985#ifdef notyet
2986static void
2987pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2988{
2989	struct toedev *tdev = TOE_DEV(parent);
2990
2991	do_abort_syn_rcv(child, parent);
2992	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2993		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2994
2995		rpl->opt0h = htonl(F_TCAM_BYPASS);
2996		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2997		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2998	} else
2999		m_free(m);
3000}
3001#endif
3002static void
3003handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3004{
3005	CXGB_UNIMPLEMENTED();
3006
3007#ifdef notyet
3008	struct t3cdev *cdev;
3009	struct socket *parent;
3010	struct socket *oreq;
3011	struct t3c_tid_entry *t3c_stid;
3012	struct tid_info *t;
3013	struct tcpcb *otp, *tp = sototcpcb(so);
3014	struct toepcb *toep = tp->t_toe;
3015
3016	/*
3017	 * If the connection is being aborted due to the parent listening
3018	 * socket going away there's nothing to do, the ABORT_REQ will close
3019	 * the connection.
3020	 */
3021	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3022		m_free(m);
3023		return;
3024	}
3025
3026	oreq = so->so_incomp;
3027	otp = sototcpcb(oreq);
3028
3029	cdev = T3C_DEV(so);
3030	t = &(T3C_DATA(cdev))->tid_maps;
3031	t3c_stid = lookup_stid(t, otp->ts_recent);
3032	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3033
3034	SOCK_LOCK(parent);
3035	pass_open_abort(so, parent, m);
3036	SOCK_UNLOCK(parent);
3037#endif
3038}
3039
3040/*
3041 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3042 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3043 * connection.
3044 */
3045static void
3046pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3047{
3048
3049#ifdef notyet
3050	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3051	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3052#endif
3053	handle_pass_open_arp_failure(m_get_socket(m), m);
3054}
3055
3056/*
3057 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3058 */
3059static void
3060mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3061{
3062	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3063	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3064	unsigned int tid = GET_TID(req);
3065
3066	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3067	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3068	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3069	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3070	rpl->opt0h = htonl(F_TCAM_BYPASS);
3071	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3072	rpl->opt2 = 0;
3073	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3074}
3075
3076/*
3077 * Send a deferred reject to an accept request.
3078 */
3079static void
3080reject_pass_request(struct toedev *tdev, struct mbuf *m)
3081{
3082	struct mbuf *reply_mbuf;
3083
3084	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3085	mk_pass_accept_rpl(reply_mbuf, m);
3086	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3087	m_free(m);
3088}
3089
3090static void
3091handle_syncache_event(int event, void *arg)
3092{
3093	struct toepcb *toep = arg;
3094
3095	switch (event) {
3096	case TOE_SC_ENTRY_PRESENT:
3097		/*
3098		 * entry already exists - free toepcb
3099		 * and l2t
3100		 */
3101		printf("syncache entry present\n");
3102		toepcb_release(toep);
3103		break;
3104	case TOE_SC_DROP:
3105		/*
		 * The syncache has given up on this entry: either it timed
		 * out or it was evicted.  We need to explicitly release
		 * the tid.
3109		 */
3110		printf("syncache entry dropped\n");
3111		toepcb_release(toep);
3112		break;
3113	default:
3114		log(LOG_ERR, "unknown syncache event %d\n", event);
3115		break;
3116	}
3117}
3118
3119static void
3120syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3121{
3122	struct in_conninfo inc;
3123	struct tcpopt to;
3124	struct tcphdr th;
3125	struct inpcb *inp;
3126	int mss, wsf, sack, ts;
3127	uint32_t rcv_isn = ntohl(req->rcv_isn);
3128
3129	bzero(&to, sizeof(struct tcpopt));
3130	inp = sotoinpcb(lso);
3131
3132	/*
3133	 * Fill out information for entering us into the syncache
3134	 */
3135	inc.inc_fport = th.th_sport = req->peer_port;
3136	inc.inc_lport = th.th_dport = req->local_port;
3137	th.th_seq = req->rcv_isn;
3138	th.th_flags = TH_SYN;
3139
3140	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3143	inc.inc_isipv6 = 0;
3144	inc.inc_len = 0;
3145	inc.inc_faddr.s_addr = req->peer_ip;
3146	inc.inc_laddr.s_addr = req->local_ip;
3147
3148	DPRINTF("syncache add of %d:%d %d:%d\n",
3149	    ntohl(req->local_ip), ntohs(req->local_port),
3150	    ntohl(req->peer_ip), ntohs(req->peer_port));
3151
3152	mss = req->tcp_options.mss;
3153	wsf = req->tcp_options.wsf;
3154	ts = req->tcp_options.tstamp;
3155	sack = req->tcp_options.sack;
3156	to.to_mss = mss;
3157	to.to_wscale = wsf;
3158	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3159	INP_INFO_WLOCK(&tcbinfo);
3160	inp_wlock(inp);
3161	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3162}
3163
3164
3165/*
3166 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3167 * lock held.  Note that the sock here is a listening socket that is not owned
3168 * by the TOE.
3169 */
3170static void
3171process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3172    struct listen_ctx *lctx)
3173{
3174	int rt_flags;
3175	struct l2t_entry *e;
3176	struct iff_mac tim;
3177	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3178	struct cpl_pass_accept_rpl *rpl;
3179	struct cpl_pass_accept_req *req = cplhdr(m);
3180	unsigned int tid = GET_TID(req);
3181	struct tom_data *d = TOM_DATA(tdev);
3182	struct t3cdev *cdev = d->cdev;
3183	struct tcpcb *tp = sototcpcb(so);
3184	struct toepcb *newtoep;
3185	struct rtentry *dst;
3186	struct sockaddr_in nam;
3187	struct t3c_data *td = T3C_DATA(cdev);
3188
3189	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3190	if (__predict_false(reply_mbuf == NULL)) {
3191		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3192			t3_defer_reply(m, tdev, reject_pass_request);
3193		else {
3194			cxgb_queue_tid_release(cdev, tid);
3195			m_free(m);
3196		}
3197		DPRINTF("failed to get reply_mbuf\n");
3198
3199		goto out;
3200	}
3201
3202	if (tp->t_state != TCPS_LISTEN) {
3203		DPRINTF("socket not in listen state\n");
3204
3205		goto reject;
3206	}
3207
3208	tim.mac_addr = req->dst_mac;
3209	tim.vlan_tag = ntohs(req->vlan_tag);
3210	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3211		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3212		goto reject;
3213	}
3214
3215#ifdef notyet
3216	/*
3217	 * XXX do route lookup to confirm that we're still listening on this
3218	 * address
3219	 */
3220	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3221			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3222		goto reject;
3223	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3224		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3225	dst_release(skb->dst);	// done with the input route, release it
3226	skb->dst = NULL;
3227
3228	if ((rt_flags & RTF_LOCAL) == 0)
3229		goto reject;
3230#endif
	/*
	 * XXX route lookup is not implemented yet (see the disabled block
	 * above), so assume the destination is local for now.
	 */
3234	rt_flags = RTF_LOCAL;
3235	if ((rt_flags & RTF_LOCAL) == 0)
3236		goto reject;
3237
3238	/*
3239	 * Calculate values and add to syncache
3240	 */
3241
3242	newtoep = toepcb_alloc();
3243	if (newtoep == NULL)
3244		goto reject;
3245
3246	bzero(&nam, sizeof(struct sockaddr_in));
3247
3248	nam.sin_len = sizeof(struct sockaddr_in);
3249	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr = req->peer_ip;
3251	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3252
3253	if (dst == NULL) {
3254		printf("failed to find route\n");
3255		goto reject;
3256	}
3257	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3258	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
		/* Without an L2 entry we can't build the accept reply. */
		goto reject;
	}
3262	/*
3263	 * Point to our listen socket until accept
3264	 */
3265	newtoep->tp_tp = tp;
3266	newtoep->tp_flags = TP_SYN_RCVD;
3267	newtoep->tp_tid = tid;
3268	newtoep->tp_toedev = tdev;
3269	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3270
3271	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3272	SOCK_LOCK(so);
3273	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3274	SOCK_UNLOCK(so);
3275
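	/*
	 * Use DDP only when the global tunable allows it, the socket hasn't
	 * opted out via SO_NO_DDP, and the receive window is at least
	 * MIN_DDP_RCV_WIN, below which direct placement isn't worthwhile.
	 */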
	newtoep->tp_ulp_mode = (TOM_TUNABLE(tdev, ddp) &&
	    !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN) ? ULP_MODE_TCPDDP : 0;
3278
3279	if (newtoep->tp_ulp_mode) {
3280		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3281
3282		if (ddp_mbuf == NULL)
3283			newtoep->tp_ulp_mode = 0;
3284	}
3285
3286	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3287	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3288	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3289	/*
3290	 * XXX workaround for lack of syncache drop
3291	 */
3292	toepcb_hold(newtoep);
3293	syncache_add_accept_req(req, so, newtoep);
3294
3295	rpl = cplhdr(reply_mbuf);
3296	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3297	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3298	rpl->wr.wr_lo = 0;
3299	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3300	rpl->opt2 = htonl(calc_opt2(so, tdev));
3301	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3302	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3303
3304	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3305	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3306	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3307				  CPL_PASS_OPEN_ACCEPT);
3308
3309	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3310
3311	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3312
3313	l2t_send(cdev, reply_mbuf, e);
3314	m_free(m);
3315	if (newtoep->tp_ulp_mode) {
3316		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3317				V_TF_DDP_OFF(1) |
3318				TP_DDP_TIMER_WORKAROUND_MASK,
3319				V_TF_DDP_OFF(1) |
3320		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3321	} else
3322		printf("not offloading\n");
3326	return;
3327reject:
3328	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3329		mk_pass_accept_rpl(reply_mbuf, m);
3330	else
3331		mk_tid_release(reply_mbuf, newtoep, tid);
3332	cxgb_ofld_send(cdev, reply_mbuf);
3333	m_free(m);
3334out:
3335#if 0
3336	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3337#else
3338	return;
3339#endif
3340}
3341
3342/*
3343 * Handle a CPL_PASS_ACCEPT_REQ message.
3344 */
3345static int
3346do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3347{
3348	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3349	struct socket *lso = listen_ctx->lso;
3350	struct tom_data *d = listen_ctx->tom_data;
3351
3352#if VALIDATE_TID
3353	struct cpl_pass_accept_req *req = cplhdr(m);
3354	unsigned int tid = GET_TID(req);
3355	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3356
3357	if (unlikely(!lsk)) {
3358		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3359		       cdev->name,
3360		       (unsigned long)((union listen_entry *)ctx -
3361					t->stid_tab));
3362		return CPL_RET_BUF_DONE;
3363	}
3364	if (unlikely(tid >= t->ntids)) {
3365		printk(KERN_ERR "%s: passive open TID %u too large\n",
3366		       cdev->name, tid);
3367		return CPL_RET_BUF_DONE;
3368	}
3369	/*
3370	 * For T3A the current user of the TID may have closed but its last
3371	 * message(s) may have been backlogged so the TID appears to be still
3372	 * in use.  Just take the TID away, the connection can close at its
3373	 * own leisure.  For T3B this situation is a bug.
3374	 */
3375	if (!valid_new_tid(t, tid) &&
3376	    cdev->type != T3A) {
3377		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3378		       cdev->name, tid);
3379		return CPL_RET_BUF_DONE;
3380	}
3381#endif
3382
3383	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3384	return (0);
3385}
3386
3387/*
3388 * Called when a connection is established to translate the TCP options
3389 * reported by HW to FreeBSD's native format.
3390 */
3391static void
3392assign_rxopt(struct socket *so, unsigned int opt)
3393{
3394	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
3395	struct tcpcb *tp = sototcpcb(so);
3396	struct toepcb *toep = tp->t_toe;
3397
3398	inp_wlock_assert(tp->t_inpcb);
3399
3400	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3401	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3402	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3403	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3404	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3405	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3406		tp->rcv_scale = tp->request_r_scale;
3407}
3408
3409/*
3410 * Completes some final bits of initialization for just established connections
3411 * and changes their state to TCP_ESTABLISHED.
3412 *
3413 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3414 */
3415static void
3416make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3417{
3418	struct tcpcb *tp = sototcpcb(so);
3419	struct toepcb *toep = tp->t_toe;
3420
3421	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3422	assign_rxopt(so, opt);
3423	so->so_proto->pr_ctloutput = t3_ctloutput;
3424
3425#if 0
3426	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3427#endif
3428	/*
3429	 * XXX not clear what rcv_wup maps to
3430	 */
3431	/*
3432	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3433	 * pass through opt0.
3434	 */
3435	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3436		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3437
3438	dump_toepcb(toep);
3439
3440#ifdef notyet
3441/*
3442 * no clean interface for marking ARP up to date
3443 */
3444	dst_confirm(sk->sk_dst_cache);
3445#endif
3446	tp->t_starttime = ticks;
3447	tp->t_state = TCPS_ESTABLISHED;
3448	soisconnected(so);
3449}
3450
3451static int
3452syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3453{
3454
3455	struct in_conninfo inc;
3456	struct tcpopt to;
3457	struct tcphdr th;
3458	int mss, wsf, sack, ts;
3459	struct mbuf *m = NULL;
3460	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3461	unsigned int opt;
3462
3463#ifdef MAC
3464#error	"no MAC support"
3465#endif
3466
3467	opt = ntohs(req->tcp_opt);
3468
3469	bzero(&to, sizeof(struct tcpopt));
3470
3471	/*
3472	 * Fill out information for entering us into the syncache
3473	 */
3474	inc.inc_fport = th.th_sport = req->peer_port;
3475	inc.inc_lport = th.th_dport = req->local_port;
3476	th.th_seq = req->rcv_isn;
3477	th.th_flags = TH_ACK;
3478
3479	inc.inc_isipv6 = 0;
3480	inc.inc_len = 0;
3481	inc.inc_faddr.s_addr = req->peer_ip;
3482	inc.inc_laddr.s_addr = req->local_ip;
3483
3484	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3485	wsf  = G_TCPOPT_WSCALE_OK(opt);
3486	ts   = G_TCPOPT_TSTAMP(opt);
3487	sack = G_TCPOPT_SACK(opt);
3488
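	/*
	 * Translate the option values the HW parsed from the peer's segment
	 * (carried in the 16-bit tcp_opt word) into the stack's struct
	 * tcpopt so that syncache_expand() can complete the connection.
	 */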
3489	to.to_mss = mss;
3490	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3491	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3492
3493	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3494	    ntohl(req->local_ip), ntohs(req->local_port),
3495	    ntohl(req->peer_ip), ntohs(req->peer_port),
3496	    mss, wsf, ts, sack);
3497	return syncache_expand(&inc, &to, &th, so, m);
3498}
3499
3500
3501/*
3502 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3503 * if we are in TCP_SYN_RECV due to crossed SYNs
3504 */
3505static int
3506do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3507{
3508	struct cpl_pass_establish *req = cplhdr(m);
3509	struct toepcb *toep = (struct toepcb *)ctx;
3510	struct tcpcb *tp;
3511	struct socket *so, *lso;
3512	struct t3c_data *td = T3C_DATA(cdev);
	struct toedev *tdev;

	/* Complete socket initialization now that we have the SND_ISN. */
3517	so = lso = toeptoso(toep);
3518	tdev = toep->tp_toedev;
3519
3520	SOCK_LOCK(so);
3521	LIST_REMOVE(toep, synq_entry);
3522	SOCK_UNLOCK(so);
3523
3524	INP_INFO_WLOCK(&tcbinfo);
3525	if (!syncache_expand_establish_req(req, &so, toep)) {
3526		/*
3527		 * No entry
3528		 */
3529		CXGB_UNIMPLEMENTED();
3530	}
3531	if (so == NULL) {
3532		/*
3533		 * Couldn't create the socket
3534		 */
3535		CXGB_UNIMPLEMENTED();
3536	}
3537
3538	/*
3539	 * XXX workaround for lack of syncache drop
3540	 */
3541	toepcb_release(toep);
3542
3543	tp = sototcpcb(so);
3544	inp_wlock(tp->t_inpcb);
3545
3546	so->so_snd.sb_flags |= SB_NOCOALESCE;
3547	so->so_rcv.sb_flags |= SB_NOCOALESCE;
3548
3549	toep->tp_tp = tp;
3550	toep->tp_flags = 0;
3551	tp->t_toe = toep;
3552	reset_wr_list(toep);
3553	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3554	tp->rcv_nxt = toep->tp_copied_seq;
3555	install_offload_ops(so);
3556
3557	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3558	toep->tp_wr_unacked = 0;
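	/*
	 * The receive path apparently stashes the RSS queue number in
	 * m_pkthdr.csum_data (decoded with G_QNUM below); remember it so
	 * the connection keeps using the queue set its CPLs arrive on.
	 */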
3559	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3560	toep->tp_qset_idx = 0;
3561	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3562
3563	/*
3564	 * XXX Cancel any keep alive timer
3565	 */
3566
3567	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3568	INP_INFO_WUNLOCK(&tcbinfo);
3569	inp_wunlock(tp->t_inpcb);
3570
3571	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3572	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3573#ifdef notyet
3574	/*
3575	 * XXX not sure how these checks map to us
3576	 */
3577	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3578		sk->sk_state_change(sk);
3579		sk_wake_async(so, 0, POLL_OUT);
3580	}
3581	/*
3582	 * The state for the new connection is now up to date.
3583	 * Next check if we should add the connection to the parent's
3584	 * accept queue.  When the parent closes it resets connections
3585	 * on its SYN queue, so check if we are being reset.  If so we
3586	 * don't need to do anything more, the coming ABORT_RPL will
3587	 * destroy this socket.  Otherwise move the connection to the
3588	 * accept queue.
3589	 *
3590	 * Note that we reset the synq before closing the server so if
3591	 * we are not being reset the stid is still open.
3592	 */
3593	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3594		__kfree_skb(skb);
3595		goto unlock;
3596	}
3597#endif
3598	m_free(m);
3599
3600	return (0);
3601}
3602
3603/*
3604 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3605 * and send them to the TOE.
3606 */
3607static void
3608fixup_and_send_ofo(struct socket *so)
3609{
3610	struct mbuf *m;
3611	struct toedev *tdev = TOE_DEV(so);
3612	struct tcpcb *tp = sototcpcb(so);
3613	struct toepcb *toep = tp->t_toe;
3614	unsigned int tid = toep->tp_tid;
3615
3616	printf("fixup_and_send_ofo\n");
3617
3618	inp_wlock_assert(tp->t_inpcb);
3619	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3620		/*
3621		 * A variety of messages can be waiting but the fields we'll
3622		 * be touching are common to all so any message type will do.
3623		 */
3624		struct cpl_close_con_req *p = cplhdr(m);
3625
3626		p->wr.wr_lo = htonl(V_WR_TID(tid));
3627		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3628		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3629	}
3630}
3631
3632/*
3633 * Updates socket state from an active establish CPL message.  Runs with the
3634 * socket lock held.
3635 */
3636static void
3637socket_act_establish(struct socket *so, struct mbuf *m)
3638{
3639	struct cpl_act_establish *req = cplhdr(m);
3640	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3641	struct tcpcb *tp = sototcpcb(so);
3642	struct toepcb *toep = tp->t_toe;
3643
3644	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3645		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3646		    toep->tp_tid, tp->t_state);
3647
3648	tp->ts_recent_age = ticks;
3649	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3650	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3651
3652	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3653
3654	/*
3655	 * Now that we finally have a TID send any CPL messages that we had to
3656	 * defer for lack of a TID.
3657	 */
3658	if (mbufq_len(&toep->out_of_order_queue))
3659		fixup_and_send_ofo(so);
3660
3661	if (__predict_false(so->so_state & SS_NOFDREF)) {
3662		/*
3663		 * XXX does this even make sense?
3664		 */
3665		sorwakeup(so);
3666	}
3667	m_free(m);
3668#ifdef notyet
3669/*
3670 * XXX assume no write requests permitted while socket connection is
3671 * incomplete
3672 */
3673	/*
3674	 * Currently the send queue must be empty at this point because the
3675	 * socket layer does not send anything before a connection is
3676	 * established.  To be future proof though we handle the possibility
3677	 * that there are pending buffers to send (either TX_DATA or
3678	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3679	 * buffers according to the just learned write_seq, and then we send
3680	 * them on their way.
3681	 */
3682	fixup_pending_writeq_buffers(sk);
3683	if (t3_push_frames(so, 1))
3684		sk->sk_write_space(sk);
3685#endif
3686
3687	toep->tp_state = tp->t_state;
3688	tcpstat.tcps_connects++;
3689
3690}
3691
3692/*
3693 * Process a CPL_ACT_ESTABLISH message.
3694 */
3695static int
3696do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3697{
3698	struct cpl_act_establish *req = cplhdr(m);
3699	unsigned int tid = GET_TID(req);
3700	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3701	struct toepcb *toep = (struct toepcb *)ctx;
3702	struct tcpcb *tp = toep->tp_tp;
3703	struct socket *so;
3704	struct toedev *tdev;
3705	struct tom_data *d;
3706
3707	if (tp == NULL) {
3708		free_atid(cdev, atid);
3709		return (0);
3710	}
3711
3712	so = toeptoso(toep);
3713	tdev = TOE_DEV(so); /* blow up here if link was down */
3714	d = TOM_DATA(tdev);
3715
3716	inp_wlock(tp->t_inpcb);
3717
3718	/*
3719	 * It's OK if the TID is currently in use, the owning socket may have
3720	 * backlogged its last CPL message(s).  Just take it away.
3721	 */
3722	toep->tp_tid = tid;
3723	toep->tp_tp = tp;
3724	so_insert_tid(d, so, tid);
3725	free_atid(cdev, atid);
3726	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3727
3728	socket_act_establish(so, m);
3729	inp_wunlock(tp->t_inpcb);
3730	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3731	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3732
3733	return (0);
3734}
3735
3736/*
3737 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3738 * next batch of work requests from the write queue.
3739 */
3740static void
3741wr_ack(struct toepcb *toep, struct mbuf *m)
3742{
3743	struct tcpcb *tp = toep->tp_tp;
3744	struct cpl_wr_ack *hdr = cplhdr(m);
3745	struct socket *so = toeptoso(toep);
3746	unsigned int credits = ntohs(hdr->credits);
3747	u32 snd_una = ntohl(hdr->snd_una);
3748	int bytes = 0;
3749
3750	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3751
3752	inp_wlock(tp->t_inpcb);
3753
3754	toep->tp_wr_avail += credits;
3755	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3756		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3757
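	/*
	 * Each mbuf on the pending-WR list records the WR credits it
	 * consumed in m_pkthdr.csum_data and its payload size in
	 * m_pkthdr.len.  Retire whole WRs while credits last; a partially
	 * acked WR just has its remaining credit count reduced.
	 */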
3758	while (credits) {
3759		struct mbuf *p = peek_wr(toep);
3760
3761		if (__predict_false(!p)) {
3762			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3763			    "nothing pending, state %u wr_avail=%u\n",
3764			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3765			break;
3766		}
3767		CTR2(KTR_TOM,
3768			"wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
3769
3770		KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
3771		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3772
3773#if DEBUG_WR > 1
3774			struct tx_data_wr *w = cplhdr(p);
3775			log(LOG_ERR,
3776			       "TID %u got %u WR credits, need %u, len %u, "
3777			       "main body %u, frags %u, seq # %u, ACK una %u,"
3778			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3779			       toep->tp_tid, credits, p->csum, p->len,
3780			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3781			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3782			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3783#endif
3784			p->m_pkthdr.csum_data -= credits;
3785			break;
3786		} else {
3787			dequeue_wr(toep);
3788			credits -= p->m_pkthdr.csum_data;
3789			bytes += p->m_pkthdr.len;
3790			CTR3(KTR_TOM,
3791			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3792			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3793
3794			m_free(p);
3795		}
3796	}
3797
3798#if DEBUG_WR
3799	check_wr_invariants(tp);
3800#endif
3801
3802	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3803#if VALIDATE_SEQ
3804		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3805
3806		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3807		    "for TID %u, snd_una %u\n", d->tdev.name, snd_una,
3808		    toep->tp_tid, tp->snd_una);
3809#endif
3810		goto out_free;
3811	}
3812
3813	if (tp->snd_una != snd_una) {
3814		tp->snd_una = snd_una;
3815		tp->ts_recent_age = ticks;
3816#ifdef notyet
3817		/*
3818		 * Keep ARP entry "minty fresh"
3819		 */
3820		dst_confirm(sk->sk_dst_cache);
3821#endif
3822		if (tp->snd_una == tp->snd_nxt)
3823			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3824	}
3825	if (bytes) {
3826		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3827		SOCKBUF_LOCK(&so->so_snd);
3828		sbdrop_locked(&so->so_snd, bytes);
3829		sowwakeup_locked(so);
3830	}
3831
3832	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3833		t3_push_frames(so, 0);
3834
3835out_free:
3836	inp_wunlock(tp->t_inpcb);
3837	m_free(m);
3838}
3839
3840/*
3841 * Handler for TX_DMA_ACK CPL messages.
3842 */
3843static int
3844do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3845{
3846	struct toepcb *toep = (struct toepcb *)ctx;
3847
3848	VALIDATE_SOCK(toeptoso(toep));
3849
3850	wr_ack(toep, m);
3851	return (0);
3852}
3853
3854/*
3855 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
3856 */
3857static int
3858do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
3859{
3860	m_freem(m);
3861	return (0);
3862}
3863
3864/*
3865 * Reset a connection that is on a listener's SYN queue or accept queue,
3866 * i.e., one that has not had a struct socket associated with it.
3867 * Must be called from process context.
3868 *
3869 * Modeled after code in inet_csk_listen_stop().
3870 */
3871static void
3872t3_reset_listen_child(struct socket *child)
3873{
3874	struct tcpcb *tp = sototcpcb(child);
3875
3876	t3_send_reset(tp->t_toe);
3877}
3878
3879/*
3880 * Disconnect offloaded established but not yet accepted connections sitting
3881 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
3882 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3883 */
3884void
3885t3_disconnect_acceptq(struct socket *listen_so)
3886{
3887	struct socket *so;
3888	struct tcpcb *tp;
3889
3890	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3891		tp = sototcpcb(so);
3892
3893		if (tp->t_flags & TF_TOE) {
3894			inp_wlock(tp->t_inpcb);
3895			t3_reset_listen_child(so);
3896			inp_wunlock(tp->t_inpcb);
3897		}
3898	}
3899}
3900
3901/*
3902 * Reset offloaded connections sitting on a server's syn queue.  As above
3903 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3904 */
3906void
3907t3_reset_synq(struct listen_ctx *lctx)
3908{
3909	struct toepcb *toep;
3910
3911	SOCK_LOCK(lctx->lso);
3912	while (!LIST_EMPTY(&lctx->synq_head)) {
3913		toep = LIST_FIRST(&lctx->synq_head);
3914		LIST_REMOVE(toep, synq_entry);
3915		toep->tp_tp = NULL;
3916		t3_send_reset(toep);
3917		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3918		toepcb_release(toep);
3919	}
3920	SOCK_UNLOCK(lctx->lso);
3921}
3922
3923
3924int
3925t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
3926		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
3927		   unsigned int pg_off, unsigned int color)
3928{
3929	unsigned int i, j, pidx;
3930	struct pagepod *p;
3931	struct mbuf *m;
3932	struct ulp_mem_io *req;
3933	struct tcpcb *tp = sototcpcb(so);
3934	struct toepcb *toep = tp->t_toe;
3935	unsigned int tid = toep->tp_tid;
3936	const struct tom_data *td = TOM_DATA(TOE_DEV(so));
3937	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
3938
3939	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
3940	    gl, nppods, tag, maxoff, pg_off, color);
3941
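	/*
	 * Each iteration writes one page pod of PPOD_SIZE bytes into adapter
	 * memory at ppod_addr using a ULP_MEM_WRITE carried by a BYPASS work
	 * request.  Pod i holds 5 page pointers starting at page 4 * i, so
	 * consecutive pods overlap by one page; the last NUM_SENTINEL_PPODS
	 * pods are written with pp_vld_tid = 0 as invalid sentinels.
	 */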
3942	for (i = 0; i < nppods; ++i) {
3943		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
3944		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
3945		req = mtod(m, struct ulp_mem_io *);
3946		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
3947		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
3948		req->wr.wr_lo = 0;
3949		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
3950					   V_ULPTX_CMD(ULP_MEM_WRITE));
3951		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
3952				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
3953
3954		p = (struct pagepod *)(req + 1);
3955		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
3956			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
3957			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
3958						  V_PPOD_COLOR(color));
3959			p->pp_max_offset = htonl(maxoff);
3960			p->pp_page_offset = htonl(pg_off);
3961			p->pp_rsvd = 0;
3962			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
3963				p->pp_addr[j] = pidx < gl->dgl_nelem ?
3964				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
3965		} else
3966			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
3967		send_or_defer(toep, m, 0);
3968		ppod_addr += PPOD_SIZE;
3969	}
3970	return (0);
3971}
3972
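/*
 * The mk_*_ulp() helpers below each build a single CPL message framed by a
 * ULP_TXPKT header, so that several CPLs can be packed back to back into one
 * BYPASS work request.  t3_cancel_ddpbuf(), t3_overlay_ddpbuf(), and
 * t3_setup_ddpbufs() combine them that way.
 */
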
3973/*
3974 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
3975 */
3976static inline void
3977mk_cpl_barrier_ulp(struct cpl_barrier *b)
3978{
3979	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
3980
3981	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3982	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
3983	b->opcode = CPL_BARRIER;
3984}
3985
3986/*
3987 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
3988 */
3989static inline void
3990mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
3991{
3992	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3993
3995	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3996	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3997	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
3998	req->cpuno = htons(cpuno);
3999}
4000
4001/*
4002 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4003 */
4004static inline void
4005mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4006                     unsigned int word, uint64_t mask, uint64_t val)
4007{
4008	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4009
4010	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4011	    tid, word, mask, val);
4012
4013	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4014	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4015	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4016	req->reply = V_NO_REPLY(1);
4017	req->cpu_idx = 0;
4018	req->word = htons(word);
4019	req->mask = htobe64(mask);
4020	req->val = htobe64(val);
4021}
4022
4023/*
4024 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4025 */
4026static void
4027mk_rx_data_ack_ulp(struct socket *so, struct cpl_rx_data_ack *ack,
4028    unsigned int tid, unsigned int credits)
4029{
4030	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4031
4032	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4033	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4034	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4035	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4036	    V_RX_DACK_MODE(TOM_TUNABLE(TOE_DEV(so), delack)) |
4037	    V_RX_CREDITS(credits));
4038}
4039
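/*
 * Cancel a DDP buffer.  The compound work request built here matches the
 * wrlen computation below:
 *
 *	work_request_hdr | CPL_BARRIER | CPL_SET_TCB_FIELD | CPL_GET_TCB |
 *	CPL_BARRIER
 *
 * The leading and trailing CPL_BARRIERs bracket the TCB update and the
 * readback that follows it.
 */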
4040void
4041t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4042{
4043	unsigned int wrlen;
4044	struct mbuf *m;
4045	struct work_request_hdr *wr;
4046	struct cpl_barrier *lock;
4047	struct cpl_set_tcb_field *req;
4048	struct cpl_get_tcb *getreq;
4049	struct ddp_state *p = &toep->tp_ddp_state;
4050
4051	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4052	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4053		sizeof(*getreq);
4054	m = m_gethdr_nofail(wrlen);
4055	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4056	wr = mtod(m, struct work_request_hdr *);
4057	bzero(wr, wrlen);
4058
4059	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4060	m->m_pkthdr.len = m->m_len = wrlen;
4061
4062	lock = (struct cpl_barrier *)(wr + 1);
4063	mk_cpl_barrier_ulp(lock);
4064
4065	req = (struct cpl_set_tcb_field *)(lock + 1);
4066
4067	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4068
4069	/* Hmmm, not sure if this is actually a good thing: reactivating
4070	 * the other buffer might be an issue if it has been completed
4071	 * already.  However, that is unlikely, since the fact that the UBUF
4072	 * is not completed indicates that there is no outstanding data.
4073	 */
4074	if (bufidx == 0)
4075		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4076				     V_TF_DDP_ACTIVE_BUF(1) |
4077				     V_TF_DDP_BUF0_VALID(1),
4078				     V_TF_DDP_ACTIVE_BUF(1));
4079	else
4080		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4081				     V_TF_DDP_ACTIVE_BUF(1) |
4082				     V_TF_DDP_BUF1_VALID(1), 0);
4083
4084	getreq = (struct cpl_get_tcb *)(req + 1);
4085	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4086
4087	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4088
4089	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4091	p->get_tcb_count++;
4092
4093#ifdef T3_TRACE
4094	T3_TRACE1(TIDTB(so),
4095		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4096#endif
4097	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4098}
4099
4100/**
4101 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4102 * @toep: the TOE PCB associated with the buffers
4103 * @bufidx: index of HW DDP buffer (0 or 1)
4104 * @tag0: new tag for HW buffer 0
4105 * @tag1: new tag for HW buffer 1
4106 * @len: new length for HW buf @bufidx
4107 *
4108 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4109 * buffer by changing the buffer tag and length and setting the valid and
4110 * active flag accordingly.  The caller must ensure the new buffer is at
4111 * least as big as the existing one.  Since we typically reprogram both HW
4112 * buffers, this function sets both tags for convenience.  Read the TCB to
4113 * determine how much data was written into the buffer before the overlay
4114 * took place.
4115 */
4116void
4117t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4118	 	       unsigned int tag1, unsigned int len)
4119{
4120	unsigned int wrlen;
4121	struct mbuf *m;
4122	struct work_request_hdr *wr;
4123	struct cpl_get_tcb *getreq;
4124	struct cpl_set_tcb_field *req;
4125	struct ddp_state *p = &toep->tp_ddp_state;
4126
4127	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4128	    bufidx, tag0, tag1, len);
4129	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4130	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4131	m = m_gethdr_nofail(wrlen);
4132	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4133	wr = mtod(m, struct work_request_hdr *);
4134	m->m_pkthdr.len = m->m_len = wrlen;
4135	bzero(wr, wrlen);
4136
4138	/* Set the ATOMIC flag to make sure that TP processes the following
4139	 * CPLs in an atomic manner and no wire segments can be interleaved.
4140	 */
4141	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
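
	/*
	 * BUF0's tag occupies the low 32 bits and BUF1's tag the high 32
	 * bits of the 64-bit TCB field at W_TCB_RX_DDP_BUF0_TAG, so the
	 * single CPL_SET_TCB_FIELD below updates both tags at once.
	 */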
4142	req = (struct cpl_set_tcb_field *)(wr + 1);
4143	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4144			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4145			     (uint64_t)V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4146			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4147			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4148	req++;
4149	if (bufidx == 0) {
4150		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4151			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4152			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4153		req++;
4154		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4155			    V_TF_DDP_PUSH_DISABLE_0(1) |
4156			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4157			    V_TF_DDP_PUSH_DISABLE_0(0) |
4158			    V_TF_DDP_BUF0_VALID(1));
4159	} else {
4160		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4161			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4162			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4163		req++;
4164		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4165			    V_TF_DDP_PUSH_DISABLE_1(1) |
4166			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4167			    V_TF_DDP_PUSH_DISABLE_1(0) |
4168			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4169	}
4170
4171	getreq = (struct cpl_get_tcb *)(req + 1);
4172	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4173
4174	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4176	p->get_tcb_count++;
4177
4178#ifdef T3_TRACE
4179	T3_TRACE4(TIDTB(sk),
4180		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4181		  "len %d",
4182		  bufidx, tag0, tag1, len);
4183#endif
4184	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4185}
4186
4187/*
4188 * Sends a compound WR containing all the CPL messages needed to program the
4189 * two HW DDP buffers, namely optionally setting up the length and offset of
4190 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4191 */
4192void
4193t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4194		      unsigned int len1, unsigned int offset1,
4195                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4196{
4197	unsigned int wrlen;
4198	struct mbuf *m;
4199	struct work_request_hdr *wr;
4200	struct cpl_set_tcb_field *req;
4201
4202	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4203	    len0, offset0, len1, offset1, (unsigned int)(ddp_flags >> 32), (unsigned int)(ddp_flags & 0xffffffff));
4204
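	/*
	 * Size the compound WR to its optional parts: a buffer 0 or buffer 1
	 * offset/length update is included only when len0 or len1 is
	 * non-zero, and a trailing CPL_RX_DATA_ACK only when modulating.
	 * E.g., len0 != 0, len1 == 0, modulate != 0 gives
	 * wr + two set_tcb_field requests + rx_data_ack.
	 */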
4205	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4206	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4207		(len1 ? sizeof(*req) : 0) +
4208		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4209	m = m_gethdr_nofail(wrlen);
4210	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4211	wr = mtod(m, struct work_request_hdr *);
4212	bzero(wr, wrlen);
4213
4214	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4215	m->m_pkthdr.len = m->m_len = wrlen;
4216
4217	req = (struct cpl_set_tcb_field *)(wr + 1);
4218	if (len0) {                  /* program buffer 0 offset and length */
4219		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4220			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4221			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4222			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4223			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4224		req++;
4225	}
4226	if (len1) {                  /* program buffer 1 offset and length */
4227		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4228			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4229			(uint64_t)V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4230			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4231			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4232		req++;
4233	}
4234
4235	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4236			     ddp_flags);
4237
4238	if (modulate) {
4239		mk_rx_data_ack_ulp(toeptoso(toep),
4240		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4241		    toep->tp_copied_seq - toep->tp_rcv_wup);
4242		toep->tp_rcv_wup = toep->tp_copied_seq;
4243	}
4244
4245#ifdef T3_TRACE
4246	T3_TRACE5(TIDTB(sk),
4247		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4248		  "modulate %d",
4249		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4250		  modulate);
4251#endif
4252
4253	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4254}
4255
4256void
4257t3_init_wr_tab(unsigned int wr_len)
4258{
4259	int i;
4260
4261	if (mbuf_wrs[1])     /* already initialized */
4262		return;
4263
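	/*
	 * A flit is 8 bytes.  A gather-list entry pair (two 8-byte
	 * addresses plus two 4-byte lengths) packs into 3 flits, giving
	 * (3 * i) / 2 + (i & 1) SGL flits for i mbuf segments, plus the
	 * 3 flits of work request header added below.  E.g., with
	 * wr_len = 10, 5 segments need 8 + 3 = 11 flits and thus 2 WRs.
	 */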
4264	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4265		int sgl_len = (3 * i) / 2 + (i & 1);
4266
4267		sgl_len += 3;
4268		mbuf_wrs[i] = sgl_len <= wr_len ?
4269		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4270	}
4271
4272	wrlen = wr_len * 8;
4273}
4274
4275int
4276t3_init_cpl_io(void)
4277{
4278#ifdef notyet
4279	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4280	if (!tcphdr_skb) {
4281		log(LOG_ERR,
4282		       "Chelsio TCP offload: can't allocate sk_buff\n");
4283		return -1;
4284	}
4285	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4286	tcphdr_skb->h.raw = tcphdr_skb->data;
4287	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4288#endif
4289
4290	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4291	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4292	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4293	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4294	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4295	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4296	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4297	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4298	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4299	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4300	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4301	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4302	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4303	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4304	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4305	return (0);
4306}
4307
4308