cxgb_cpl_io.c revision 177530
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 177530 2008-03-23 22:34:16Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
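
/* TOS as carried in the TCB: the inpcb's TOS byte with the low two (ECN) bits dropped. */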
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

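/*
 * Debug wrapper around sbappend_locked().  Before and after the append it
 * walks both the sockbuf chain and the incoming chain, asserting that any
 * external mbufs are of type EXT_EXTREF (the only external type expected
 * here) and that no m_next pointer has been scribbled over.
 */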
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	sbappend_locked(sb, n);
	m = sb->sb_mb;
	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

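/*
 * Map a CPL priority onto the value carried in the mbuf's priority field.
 * Currently a pass-through; the toepcb argument is kept so per-connection
 * queue selection could be added later.
 */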
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	inp_wlock_assert(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && (so->so_snd.sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

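/*
 * Push pending data from the socket's send buffer to the card as TX_DATA
 * work requests.  An mbuf small enough to fit in-line (IMM_LEN) is copied
 * into the WR itself; otherwise up to TX_MAX_SEGS - 1 mbufs are described
 * by a gather list.  The WR credits each request consumes are stashed in
 * m_pkthdr.csum_data so they can be returned when the WR is acknowledged.
 */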
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int i, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_snd);
	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}

		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  (The allocation below is currently no-fail, so nofail is unused.)
 * Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

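/*
 * Returns nonzero when the delayed-ACK mode configured via the delack
 * tunable may be applied to this connection; otherwise t3_cleanup_rbuf()
 * forces the default DACK mode.
 */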
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	so = tp->t_inpcb->inp_socket;
	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			SOCKBUF_LOCK(&so->so_rcv);
			toep->tp_copied_seq += copied;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		return;
	}

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = so->so_rcv.sb_cc;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax) {
		printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = so->so_rcv.sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

734
735static int
736cxgb_toe_disconnect(struct tcpcb *tp)
737{
738	struct socket *so;
739
740	DPRINTF("cxgb_toe_disconnect\n");
741
742	so = tp->t_inpcb->inp_socket;
743	close_conn(so);
744	return (0);
745}
746
747static int
748cxgb_toe_reset(struct tcpcb *tp)
749{
750	struct toepcb *toep = tp->t_toe;
751
752	t3_send_reset(toep);
753
754	/*
755	 * unhook from socket
756	 */
757	tp->t_flags &= ~TF_TOE;
758	toep->tp_tp = NULL;
759	tp->t_toe = NULL;
760	return (0);
761}
762
763static int
764cxgb_toe_send(struct tcpcb *tp)
765{
766	struct socket *so;
767
768	DPRINTF("cxgb_toe_send\n");
769	dump_toepcb(tp->t_toe);
770
771	so = tp->t_inpcb->inp_socket;
772	t3_push_frames(so, 1);
773	return (0);
774}
775
776static int
777cxgb_toe_rcvd(struct tcpcb *tp)
778{
779
780	inp_wlock_assert(tp->t_inpcb);
781	t3_cleanup_rbuf(tp, 0);
782
783	return (0);
784}
785
786static void
787cxgb_toe_detach(struct tcpcb *tp)
788{
789	struct toepcb *toep;
790
791        /*
792	 * XXX how do we handle teardown in the SYN_SENT state?
793	 *
794	 */
795	INP_INFO_WLOCK(&tcbinfo);
796	inp_wlock_assert(tp->t_inpcb);
797	toep = tp->t_toe;
798	toep->tp_tp = NULL;
799
800	/*
801	 * unhook from socket
802	 */
803	tp->t_flags &= ~TF_TOE;
804	tp->t_toe = NULL;
805	INP_INFO_WUNLOCK(&tcbinfo);
806}
807
808
809static struct toe_usrreqs cxgb_toe_usrreqs = {
810	.tu_disconnect = cxgb_toe_disconnect,
811	.tu_reset = cxgb_toe_reset,
812	.tu_send = cxgb_toe_send,
813	.tu_rcvd = cxgb_toe_rcvd,
814	.tu_detach = cxgb_toe_detach,
815	.tu_detach = cxgb_toe_detach,
816	.tu_syncache_event = handle_syncache_event,
817};
818
819
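/*
 * Populate and send a SET_TCB_FIELD CPL.  The mbuf is expected to be sized
 * for a struct cpl_set_tcb_field; with no_reply set the firmware is told
 * not to generate a SET_TCB_RPL.
 */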
static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on) {
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return (0);
}

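/*
 * Issue a CPL_GET_TCB for this connection so the TCB can be inspected.
 * The reply is handled by do_get_tcb_rpl(), which treats it as a DDP
 * completion.
 */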
int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_wlock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);

	toepcb_hold(toep);
	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

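/*
 * Pick the HW MTU table index to use for a connection, clamping t_maxseg
 * into the range the table covers.  The constant 40 accounts for the fixed
 * IP and TCP header bytes excluded from the MSS.
 */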
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_wlock_assert(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = so->so_rcv.sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;
	so->so_rcv.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return (V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx));
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

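/*
 * Calculate the option 2 value for a connection: only the congestion
 * control flavor is encoded, and only when the cong_alg tunable selects one.
 */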
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

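/*
 * Fill in a CPL_ACT_OPEN_REQ.  The 4-tuple comes straight from the inpcb,
 * the L2T entry supplies the L2 index and Tx channel, and opt0/opt2 carry
 * the connection parameters computed by the helpers above.
 */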
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wlock_assert(tp->t_inpcb);
		tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	inp_wlock(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	inp_wunlock(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return (status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS);
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	inp_wunlock(inp);
}
#endif

/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_wlock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		inp_wlock_assert(tp->t_inpcb);
		so = toeptoso(toep);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

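/*
 * IP-level socket option handler for offloaded connections.  Only IP_TOS
 * is handled here, since it must be mirrored into the TCB; IP_OPTIONS is
 * refused and everything else falls back to the stack via EOPNOTSUPP.
 */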
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);

	return (0);
}

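/*
 * TCP-level socket option handler.  TCP_NODELAY is applied to the tcpcb
 * and propagated to the TCB's Nagle flag; TCP_CONGESTION is validated but
 * is a no-op unless congestion control selection is compiled in.
 */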
static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		inp_wlock(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(so);
	}

	return (0);
}

static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{
	if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;

	uint64_t t;
	__be64 *tcb;

	so = toeptoso(toep);
	tp = toep->tp_tp;

	inp_wlock_assert(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * mbuf is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		m_freem(m);
		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
			sorwakeup_locked(so);
		else
			SOCKBUF_UNLOCK(&so->so_rcv);
		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#ifdef T3_TRACE
	T3_TRACE3(TIDTB(so),
		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
		  tp->rcv_nxt, q->cur_buf, ddp_offset);
#endif

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		 q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			SOCKBUF_UNLOCK(&so->so_rcv);

			m_free(m);
			return;
		}
	} else {
		SOCKBUF_UNLOCK(&so->so_rcv);
		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
#ifdef T3_TRACE
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
#endif
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0)
		q->user_ddp_pending = 0;
	else
		SBAPPEND(&so->so_rcv, m);
	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

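/*
 * A CPL_RX_DATA arrived on a DDP connection with a sequence number ahead of
 * tp->rcv_nxt: the intervening bytes were placed into the current DDP buffer
 * by the HW.  Dress the mbuf up to describe that placed region before the
 * in-line payload is processed.
 */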
1898static void
1899handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1900{
1901	struct tcpcb *tp = toep->tp_tp;
1902	struct socket *so = toeptoso(toep);
1903	struct ddp_state *q;
1904	struct ddp_buf_state *bsp;
1905	struct cpl_rx_data *hdr = cplhdr(m);
1906	unsigned int rcv_nxt = ntohl(hdr->seq);
1907
1908	if (tp->rcv_nxt == rcv_nxt)
1909		return;
1910
1911	inp_wlock_assert(tp->t_inpcb);
1912	SOCKBUF_LOCK(&so->so_rcv);
1913	q = &toep->tp_ddp_state;
1914	bsp = &q->buf_state[q->cur_buf];
1915	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
1916		rcv_nxt, tp->rcv_nxt));
1917	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1918	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1919	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1920	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1921
1922#ifdef T3_TRACE
1923	if ((int)m->m_pkthdr.len < 0) {
1924		t3_ddp_error(so, "handle_ddp_data: neg len");
1925	}
1926#endif
1927
1928	m->m_ddp_gl = (unsigned char *)bsp->gl;
1929	m->m_flags |= M_DDP;
1930	m->m_cur_offset = bsp->cur_offset;
1931	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
1932	if (bsp->flags & DDP_BF_NOCOPY)
1933		bsp->flags &= ~DDP_BF_NOCOPY;
1934
1935	m->m_seq = tp->rcv_nxt;
1936	tp->rcv_nxt = rcv_nxt;
1937	bsp->cur_offset += m->m_pkthdr.len;
1938	if (!(bsp->flags & DDP_BF_NOFLIP))
1939		q->cur_buf ^= 1;
1940	/*
1941	 * For now, don't re-enable DDP after a connection fell out of DDP
1942	 * mode.
1943	 */
1944	q->ubuf_ddp_ready = 0;
1945	SOCKBUF_UNLOCK(&so->so_rcv);
1946}
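
/*
 * Editorial summary of the ddp_buf_state flags as used above and below
 * (derived from this file's usage, not from hardware documentation):
 *
 *	DDP_BF_NOCOPY	the HW buffer currently maps a user buffer, so the
 *			payload lands in place and must not be copied
 *	DDP_BF_NOFLIP	do not switch to the other HW buffer when this one
 *			completes (cur_buf is not toggled)
 *	DDP_BF_NODATA	the completion carried no new payload
 *
 * Bit 0 of m_ddp_flags records whether the HW buffer completed; other parts
 * of the code depend on that being bit 0.
 */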
1947
1948/*
1949 * Process new data received for a connection.
1950 */
1951static void
1952new_rx_data(struct toepcb *toep, struct mbuf *m)
1953{
1954	struct cpl_rx_data *hdr = cplhdr(m);
1955	struct tcpcb *tp = toep->tp_tp;
1956	struct socket *so = toeptoso(toep);
1957	int len = be16toh(hdr->len);
1958
1959	inp_wlock(tp->t_inpcb);
1960
1961	if (__predict_false(so_no_receive(so))) {
1962		handle_excess_rx(toep, m);
1963		inp_wunlock(tp->t_inpcb);
1964		TRACE_EXIT;
1965		return;
1966	}
1967
1968	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
1969		handle_ddp_data(toep, m);
1970
1971	m->m_seq = ntohl(hdr->seq);
1972	m->m_ulp_mode = 0;                    /* for iSCSI */
1973
1974#if VALIDATE_SEQ
1975	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
1976		log(LOG_ERR,
1977		       "%s: TID %u: Bad sequence number %u, expected %u\n",
1978		    TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
1979		       tp->rcv_nxt);
1980		m_freem(m);
1981		inp_wunlock(tp->t_inpcb);
1982		return;
1983	}
1984#endif
1985	m_adj(m, sizeof(*hdr));
1986
1987#ifdef URGENT_DATA_SUPPORTED
1988	/*
1989	 * We don't handle urgent data yet
1990	 */
1991	if (__predict_false(hdr->urg))
1992		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
1993	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
1994		     tp->urg_seq - tp->rcv_nxt < skb->len))
1995		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
1996							 tp->rcv_nxt];
1997#endif
1998	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
1999		toep->tp_delack_mode = hdr->dack_mode;
2000		toep->tp_delack_seq = tp->rcv_nxt;
2001	}
2002	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2003	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2004
2005	if (len < m->m_pkthdr.len)
2006		m->m_pkthdr.len = m->m_len = len;
2007
2008	tp->rcv_nxt += m->m_pkthdr.len;
2009	tp->t_rcvtime = ticks;
2010	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2011#ifdef T3_TRACE
2012	T3_TRACE2(TIDTB(sk),
2013	    "new_rx_data: seq 0x%x len %u",
2014	    m->m_seq, m->m_pkthdr.len);
2015#endif
2016	inp_wunlock(tp->t_inpcb);
2017	SOCKBUF_LOCK(&so->so_rcv);
2018	if (sb_notify(&so->so_rcv))
2019		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
2020
2021	SBAPPEND(&so->so_rcv, m);
2022
2023#ifdef notyet
2024	/*
2025	 * We're giving the card too many credits, so this check is disabled for now to keep things moving.
2026	 *
2027	 */
2028	KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
2029
2030	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2031		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
2032#endif
2033
2034
2035	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2036	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
2037
2038	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2039		sorwakeup_locked(so);
2040	else
2041		SOCKBUF_UNLOCK(&so->so_rcv);
2042}
2043
2044/*
2045 * Handler for RX_DATA CPL messages.
2046 */
2047static int
2048do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2049{
2050	struct toepcb *toep = (struct toepcb *)ctx;
2051
2052	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2053
2054	new_rx_data(toep, m);
2055
2056	return (0);
2057}
2058
2059static void
2060new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2061{
2062	struct tcpcb *tp;
2063	struct ddp_state *q;
2064	struct ddp_buf_state *bsp;
2065	struct cpl_rx_data_ddp *hdr;
2066	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2067	struct socket *so = toeptoso(toep);
2068	int nomoredata = 0;
2069	unsigned int delack_mode;
2070
2071	tp = sototcpcb(so);
2072
2073	inp_wlock(tp->t_inpcb);
2074	if (__predict_false(so_no_receive(so))) {
2075
2076		handle_excess_rx(toep, m);
2077		inp_wunlock(tp->t_inpcb);
2078		return;
2079	}
2080
2081	q = &toep->tp_ddp_state;
2082	hdr = cplhdr(m);
2083	ddp_report = ntohl(hdr->u.ddp_report);
2084	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2085	bsp = &q->buf_state[buf_idx];
2086
2087#ifdef T3_TRACE
2088	T3_TRACE5(TIDTB(sk),
2089		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2090		  "hdr seq 0x%x len %u offset %u",
2091		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2092		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
2093	T3_TRACE1(TIDTB(sk),
2094		  "new_rx_data_ddp: ddp_report 0x%x",
2095		  ddp_report);
2096#endif
2097	CTR4(KTR_TOM,
2098	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2099	    "hdr seq 0x%x len %u",
2100	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2101	    ntohs(hdr->len));
2102	CTR3(KTR_TOM,
2103	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2104	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2105
2106	ddp_len = ntohs(hdr->len);
2107	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2108
2109	delack_mode = G_DDP_DACK_MODE(ddp_report);
2110	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2111		toep->tp_delack_mode = delack_mode;
2112		toep->tp_delack_seq = tp->rcv_nxt;
2113	}
2114
2115	m->m_seq = tp->rcv_nxt;
2116	tp->rcv_nxt = rcv_nxt;
2117
2118	tp->t_rcvtime = ticks;
2119	/*
2120	 * Store the length in m->m_len.  We are changing the meaning of
2121	 * m->m_len here, so we need to be very careful that nothing from now
2122	 * on interprets the length of this mbuf the usual way.
2123	 */
2124	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2125	inp_wunlock(tp->t_inpcb);
2126	CTR3(KTR_TOM,
2127	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2128	    m->m_len, rcv_nxt, m->m_seq);
2129	/*
2130	 * Figure out where the new data was placed in the buffer and store it
2131	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the consumer
2132	 * needs to account for the page pod's pg_offset.
2133	 */
2134	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2135	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2136
2137	SOCKBUF_LOCK(&so->so_rcv);
2138	m->m_ddp_gl = (unsigned char *)bsp->gl;
2139	m->m_flags |= M_DDP;
2140	bsp->cur_offset = end_offset;
2141	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2142
2143	/*
2144	 * Length is only meaningful for kbuf
2145	 */
2146	if (!(bsp->flags & DDP_BF_NOCOPY))
2147		KASSERT(m->m_len <= bsp->gl->dgl_length,
2148		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2149			m->m_len, bsp->gl->dgl_length));
2150
2151	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2152	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2153
2154
2155        /*
2156	 * Bit 0 of flags stores whether the DDP buffer is completed.
2157	 * Note that other parts of the code depend on this being in bit 0.
2158	 */
2159	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2160		panic("spurious ddp completion");
2161	} else {
2162		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2163		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2164			q->cur_buf ^= 1;                     /* flip buffers */
2165	}
2166
2167	if (bsp->flags & DDP_BF_NOCOPY) {
2168		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2169		bsp->flags &= ~DDP_BF_NOCOPY;
2170	}
2171
2172	if (ddp_report & F_DDP_PSH)
2173		m->m_ddp_flags |= DDP_BF_PSH;
2174	if (nomoredata)
2175		m->m_ddp_flags |= DDP_BF_NODATA;
2176
2177#ifdef notyet
2178	skb_reset_transport_header(skb);
2179	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2180#endif
2181	SBAPPEND(&so->so_rcv, m);
2182
2183	if ((so->so_state & SS_NOFDREF) == 0)
2184		sorwakeup_locked(so);
2185	else
2186		SOCKBUF_UNLOCK(&so->so_rcv);
2187}
2188
2189#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2190		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2191		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2192		 F_DDP_INVALID_PPOD)
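
#if 0
/*
 * Illustrative sketch (not compiled): how the ddp_report word carried by an
 * RX_DATA_DDP CPL is picked apart by the handlers above, using the same
 * field accessors this file already relies on.
 */
static void
decode_ddp_report_example(uint32_t ddp_report)
{
	unsigned int buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	unsigned int offset = G_DDP_OFFSET(ddp_report);
	unsigned int dack_mode = G_DDP_DACK_MODE(ddp_report);
	int complete = !!(ddp_report & F_DDP_BUF_COMPLETE);
	int push = !!(ddp_report & F_DDP_PSH);

	printf("hwbuf %u offset %u dack %u complete %d psh %d\n",
	    buf_idx, offset, dack_mode, complete, push);
}
#endif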
2193
2194/*
2195 * Handler for RX_DATA_DDP CPL messages.
2196 */
2197static int
2198do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2199{
2200	struct toepcb *toep = ctx;
2201	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2202
2203	VALIDATE_SOCK(so);
2204
2205	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2206		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2207		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2208		return (CPL_RET_BUF_DONE);
2209	}
2210#if 0
2211	skb->h.th = tcphdr_skb->h.th;
2212#endif
2213	new_rx_data_ddp(toep, m);
2214	return (0);
2215}
2216
2217static void
2218process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2219{
2220	struct tcpcb *tp = toep->tp_tp;
2221	struct socket *so = toeptoso(toep);
2222	struct ddp_state *q;
2223	struct ddp_buf_state *bsp;
2224	struct cpl_rx_ddp_complete *hdr;
2225	unsigned int ddp_report, buf_idx, when, delack_mode;
2226	int nomoredata = 0;
2227
2228	inp_wlock(tp->t_inpcb);
2229	if (__predict_false(so_no_receive(so))) {
2230		struct inpcb *inp = sotoinpcb(so);
2231
2232		handle_excess_rx(toep, m);
2233		inp_wunlock(inp);
2234		return;
2235	}
2236	q = &toep->tp_ddp_state;
2237	hdr = cplhdr(m);
2238	ddp_report = ntohl(hdr->ddp_report);
2239	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2240	m->m_pkthdr.csum_data = tp->rcv_nxt;
2241
2242
2243	SOCKBUF_LOCK(&so->so_rcv);
2244	bsp = &q->buf_state[buf_idx];
2245	when = bsp->cur_offset;
2246	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2247	tp->rcv_nxt += m->m_len;
2248	tp->t_rcvtime = ticks;
2249
2250	delack_mode = G_DDP_DACK_MODE(ddp_report);
2251	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2252		toep->tp_delack_mode = delack_mode;
2253		toep->tp_delack_seq = tp->rcv_nxt;
2254	}
2255#ifdef notyet
2256	skb_reset_transport_header(skb);
2257	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2258#endif
2259	inp_wunlock(tp->t_inpcb);
2260
2261	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2262	CTR5(KTR_TOM,
2263		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2264		  "ddp_report 0x%x offset %u, len %u",
2265		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2266		   G_DDP_OFFSET(ddp_report), m->m_len);
2267
2268	bsp->cur_offset += m->m_len;
2269
2270	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2271		q->cur_buf ^= 1;                     /* flip buffers */
2272		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2273			nomoredata = 1;
2274	}
2275
2276	CTR4(KTR_TOM,
2277		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2278		  "ddp_report %u offset %u",
2279		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2280		   G_DDP_OFFSET(ddp_report));
2281
2282	m->m_ddp_gl = (unsigned char *)bsp->gl;
2283	m->m_flags |= M_DDP;
2284	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2285	if (bsp->flags & DDP_BF_NOCOPY)
2286		bsp->flags &= ~DDP_BF_NOCOPY;
2287	if (nomoredata)
2288		m->m_ddp_flags |= DDP_BF_NODATA;
2289
2290
2291	SBAPPEND(&so->so_rcv, m);
2292
2293	if ((so->so_state & SS_NOFDREF) == 0)
2294		sorwakeup_locked(so);
2295	else
2296		SOCKBUF_UNLOCK(&so->so_rcv);
2297}
2298
2299/*
2300 * Handler for RX_DDP_COMPLETE CPL messages.
2301 */
2302static int
2303do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2304{
2305	struct toepcb *toep = ctx;
2306
2307	VALIDATE_SOCK(so);
2308#if 0
2309	skb->h.th = tcphdr_skb->h.th;
2310#endif
2311	process_ddp_complete(toep, m);
2312	return (0);
2313}
2314
2315/*
2316 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2317 * socket state before calling tcp_time_wait to comply with its expectations.
2318 */
2319static void
2320enter_timewait(struct socket *so)
2321{
2322	struct tcpcb *tp = sototcpcb(so);
2323
2324	inp_wlock_assert(tp->t_inpcb);
2325	/*
2326	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2327	 * process peer_close because we don't want to carry the peer FIN in
2328	 * the socket's receive queue and if we increment rcv_nxt without
2329	 * having the FIN in the receive queue we'll confuse facilities such
2330	 * as FIONREAD.
2331	 */
2332	tp->rcv_nxt++;
2333
2334	tp->ts_recent_age = 0;	     /* defeat recycling */
2335	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2336	tcp_twstart(tp);
2337}
2338
2339/*
2340 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2341 * function deals with the data that may be reported along with the FIN.
2342 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2343 * perform normal FIN-related processing.  In the latter case a return of 1
2344 * indicates that there was an implicit RX_DDP_COMPLETE and the mbuf must not
2345 * be freed, while 0 means the mbuf may be freed.
2346 */
2347static int
2348handle_peer_close_data(struct socket *so, struct mbuf *m)
2349{
2350	struct tcpcb *tp = sototcpcb(so);
2351	struct toepcb *toep = tp->t_toe;
2352	struct ddp_state *q;
2353	struct ddp_buf_state *bsp;
2354	struct cpl_peer_close *req = cplhdr(m);
2355	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2356
2357	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2358		return (0);
2359
2360	if (__predict_false(so_no_receive(so))) {
2361		handle_excess_rx(toep, m);
2362
2363		/*
2364		 * Although we discard the data we want to process the FIN so
2365		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2366		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2367		 * may be what will close the connection.  We return 1 because
2368		 * handle_excess_rx() already freed the packet.
2369		 */
2370		return (1);
2371	}
2372
2373	inp_wlock_assert(tp->t_inpcb);
2374	q = &toep->tp_ddp_state;
2375	SOCKBUF_LOCK(&so->so_rcv);
2376	bsp = &q->buf_state[q->cur_buf];
2377	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2378	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2379	m->m_ddp_gl = (unsigned char *)bsp->gl;
2380	m->m_flags |= M_DDP;
2381	m->m_cur_offset = bsp->cur_offset;
2382	m->m_ddp_flags =
2383	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2384	m->m_seq = tp->rcv_nxt;
2385	tp->rcv_nxt = rcv_nxt;
2386	bsp->cur_offset += m->m_pkthdr.len;
2387	if (!(bsp->flags & DDP_BF_NOFLIP))
2388		q->cur_buf ^= 1;
2389#ifdef notyet
2390	skb_reset_transport_header(skb);
2391	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2392#endif
2393	tp->t_rcvtime = ticks;
2394	SBAPPEND(&so->so_rcv, m);
2395	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2396		sorwakeup_locked(so);
2397	else
2398		SOCKBUF_UNLOCK(&so->so_rcv);
2399	return (1);
2400}
2401
2402/*
2403 * Handle a peer FIN.
2404 */
2405static void
2406do_peer_fin(struct socket *so, struct mbuf *m)
2407{
2408	struct tcpcb *tp = sototcpcb(so);
2409	struct toepcb *toep = tp->t_toe;
2410	int keep = 0;
2411	DPRINTF("do_peer_fin state=%d\n", tp->t_state);
2412
2413#ifdef T3_TRACE
2414	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
2415#endif
2416
2417	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2418		printf("abort_pending set\n");
2419
2420		goto out;
2421	}
2422	INP_INFO_WLOCK(&tcbinfo);
2423	inp_wlock(tp->t_inpcb);
2424	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2425		keep = handle_peer_close_data(so, m);
2426		if (keep < 0) {
2427			INP_INFO_WUNLOCK(&tcbinfo);
2428			inp_wunlock(tp->t_inpcb);
2429			return;
2430		}
2431	}
2432	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2433		socantrcvmore(so);
2434		/*
2435		 * If connection is half-synchronized
2436		 * (i.e., NEEDSYN flag on) then delay ACK,
2437		 * so it may be piggybacked when SYN is sent.
2438		 * Otherwise, since we received a FIN then no
2439		 * more input can be expected, send ACK now.
2440		 */
2441		if (tp->t_flags & TF_NEEDSYN)
2442			tp->t_flags |= TF_DELACK;
2443		else
2444			tp->t_flags |= TF_ACKNOW;
2445		tp->rcv_nxt++;
2446	}
2447
2448	switch (tp->t_state) {
2449	case TCPS_SYN_RECEIVED:
2450		tp->t_starttime = ticks;
2451	/* FALLTHROUGH */
2452	case TCPS_ESTABLISHED:
2453		tp->t_state = TCPS_CLOSE_WAIT;
2454		break;
2455	case TCPS_FIN_WAIT_1:
2456		tp->t_state = TCPS_CLOSING;
2457		break;
2458	case TCPS_FIN_WAIT_2:
2459		/*
2460		 * If we've sent an abort_req we must have sent it too late,
2461		 * HW will send us a reply telling us so, and this peer_close
2462		 * is really the last message for this connection and needs to
2463		 * be treated as an abort_rpl, i.e., transition the connection
2464		 * to TCP_CLOSE (note that the host stack does this at the
2465		 * time of generating the RST but we must wait for HW).
2466		 * Otherwise we enter TIME_WAIT.
2467		 */
2468		t3_release_offload_resources(toep);
2469		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2470			tp = tcp_close(tp);
2471		} else {
2472			enter_timewait(so);
2473		}
2474		break;
2475	default:
2476		log(LOG_ERR,
2477		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2478		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
2479	}
2480	INP_INFO_WUNLOCK(&tcbinfo);
2481	if (tp)
2482		inp_wunlock(tp->t_inpcb);
2483
2484	DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
2485
2486#ifdef notyet
2487	/* Do not send POLL_HUP for half duplex close. */
2488	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2489	    sk->sk_state == TCP_CLOSE)
2490		sk_wake_async(so, 1, POLL_HUP);
2491	else
2492		sk_wake_async(so, 1, POLL_IN);
2493#endif
2494
2495out:
2496	if (!keep)
2497		m_free(m);
2498}
2499
2500/*
2501 * Handler for PEER_CLOSE CPL messages.
2502 */
2503static int
2504do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2505{
2506	struct toepcb *toep = (struct toepcb *)ctx;
2507	struct socket *so = toeptoso(toep);
2508
2509	VALIDATE_SOCK(so);
2510
2511	do_peer_fin(so, m);
2512	return (0);
2513}
2514
2515static void
2516process_close_con_rpl(struct socket *so, struct mbuf *m)
2517{
2518	struct tcpcb *tp = sototcpcb(so);
2519	struct cpl_close_con_rpl *rpl = cplhdr(m);
2520	struct toepcb *toep = tp->t_toe;
2521
2522	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2523
2524	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
2525	    !!(so->so_state & SS_NOFDREF));
2526	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
2527		goto out;
2528
2529	INP_INFO_WLOCK(&tcbinfo);
2530	inp_wlock(tp->t_inpcb);
2531	switch (tp->t_state) {
2532	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2533		t3_release_offload_resources(toep);
2534		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2535			tp = tcp_close(tp);
2536
2537		} else {
2538			enter_timewait(so);
2539			soisdisconnected(so);
2540		}
2541		break;
2542	case TCPS_LAST_ACK:
2543		/*
2544		 * In this state we don't care about pending abort_rpl.
2545		 * If we've sent abort_req it was post-close and was sent too
2546		 * late, this close_con_rpl is the actual last message.
2547		 */
2548		t3_release_offload_resources(toep);
2549		tp = tcp_close(tp);
2550		break;
2551	case TCPS_FIN_WAIT_1:
2552		/*
2553		 * If we can't receive any more
2554		 * data, then closing user can proceed.
2555		 * Starting the timer is contrary to the
2556		 * specification, but if we don't get a FIN
2557		 * we'll hang forever.
2558		 *
2559		 * XXXjl:
2560		 * we should release the tp also, and use a
2561		 * compressed state.
2562		 */
2563		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2564			int timeout;
2565
2566			soisdisconnected(so);
2567			timeout = (tcp_fast_finwait2_recycle) ?
2568			    tcp_finwait2_timeout : tcp_maxidle;
2569			tcp_timer_activate(tp, TT_2MSL, timeout);
2570		}
2571		tp->t_state = TCPS_FIN_WAIT_2;
2572		if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
2573		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2574			tp = tcp_drop(tp, 0);
2575		}
2576
2577		break;
2578	default:
2579		log(LOG_ERR,
2580		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2581		       TOE_DEV(so)->tod_name, toep->tp_tid,
2582		       tp->t_state);
2583	}
2584	INP_INFO_WUNLOCK(&tcbinfo);
2585	if (tp)
2586		inp_wunlock(tp->t_inpcb);
2587out:
2588	m_freem(m);
2589}
2590
2591/*
2592 * Handler for CLOSE_CON_RPL CPL messages.
2593 */
2594static int
2595do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2596			    void *ctx)
2597{
2598	struct toepcb *toep = (struct toepcb *)ctx;
2599	struct socket *so = toeptoso(toep);
2600
2601	VALIDATE_SOCK(so);
2602
2603	process_close_con_rpl(so, m);
2604	return (0);
2605}
2606
2607/*
2608 * Process abort replies.  We only process these messages if we anticipate
2609 * them as the coordination between SW and HW in this area is somewhat lacking
2610 * and sometimes we get ABORT_RPLs after we are done with the connection that
2611 * originated the ABORT_REQ.
2612 */
2613static void
2614process_abort_rpl(struct socket *so, struct mbuf *m)
2615{
2616	struct tcpcb *tp = sototcpcb(so);
2617	struct toepcb *toep = tp->t_toe;
2618
2619#ifdef T3_TRACE
2620	T3_TRACE1(TIDTB(sk),
2621		  "process_abort_rpl: GTS rpl pending %d",
2622		  sock_flag(sk, ABORT_RPL_PENDING));
2623#endif
2624
2625	INP_INFO_WLOCK(&tcbinfo);
2626	inp_wlock(tp->t_inpcb);
2627
2628	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2629		/*
2630		 * XXX panic on tcpdrop
2631		 */
2632		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2633			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2634		else {
2635			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2636			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2637			    !is_t3a(TOE_DEV(so))) {
2638				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2639					panic("TP_ABORT_REQ_RCVD set");
2640				t3_release_offload_resources(toep);
2641				tp = tcp_close(tp);
2642			}
2643		}
2644	}
2645	if (tp)
2646		inp_wunlock(tp->t_inpcb);
2647	INP_INFO_WUNLOCK(&tcbinfo);
2648
2649	m_free(m);
2650}
2651
2652/*
2653 * Handle an ABORT_RPL_RSS CPL message.
2654 */
2655static int
2656do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2657{
2658	struct socket *so;
2659	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2660	struct toepcb *toep;
2661
2662	/*
2663	 * Ignore replies to post-close aborts indicating that the abort was
2664	 * requested too late.  These connections are terminated when we get
2665	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2666	 * arrives the TID is either no longer used or it has been recycled.
2667	 */
2668	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2669discard:
2670		m_free(m);
2671		return (0);
2672	}
2673
2674	toep = (struct toepcb *)ctx;
2675
2676        /*
2677	 * Sometimes we've already closed the socket, e.g., a post-close
2678	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2679	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2680	 * but FW turns the ABORT_REQ into a regular one and so we get
2681	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2682	 */
2683	if (!toep)
2684		goto discard;
2685
2686	if (toep->tp_tp == NULL) {
2687		printf("removing tid for abort\n");
2688		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2689		if (toep->tp_l2t)
2690			l2t_release(L2DATA(cdev), toep->tp_l2t);
2691
2692		toepcb_release(toep);
2693		goto discard;
2694	}
2695
2696	printf("toep=%p\n", toep);
2697	printf("tp=%p\n", toep->tp_tp);
2698
2699	so = toeptoso(toep); /* <- XXX panic */
2700	toepcb_hold(toep);
2701	process_abort_rpl(so, m);
2702	toepcb_release(toep);
2703	return (0);
2704}
2705
2706/*
2707 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2708 * indicate whether RST should be sent in response.
2709 */
2710static int
2711abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2712{
2713	struct tcpcb *tp = sototcpcb(so);
2714
2715	switch (abort_reason) {
2716	case CPL_ERR_BAD_SYN:
2717#if 0
2718		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2719#endif
2720	case CPL_ERR_CONN_RESET:
2721		// XXX need to handle SYN_RECV due to crossed SYNs
2722		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2723	case CPL_ERR_XMIT_TIMEDOUT:
2724	case CPL_ERR_PERSIST_TIMEDOUT:
2725	case CPL_ERR_FINWAIT2_TIMEDOUT:
2726	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2727#if 0
2728		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2729#endif
2730		return (ETIMEDOUT);
2731	default:
2732		return (EIO);
2733	}
2734}
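
/*
 * Summary of the mapping above:
 *
 *	CPL_ERR_BAD_SYN, CPL_ERR_CONN_RESET	-> ECONNRESET
 *						   (EPIPE if in CLOSE_WAIT)
 *	CPL_ERR_*_TIMEDOUT			-> ETIMEDOUT
 *	anything else				-> EIO
 */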
2735
2736static inline void
2737set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2738{
2739	struct cpl_abort_rpl *rpl = cplhdr(m);
2740
2741	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2742	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2743	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2744
2745	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2746	rpl->cmd = cmd;
2747}
2748
2749static void
2750send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2751{
2752	struct mbuf *reply_mbuf;
2753	struct cpl_abort_req_rss *req = cplhdr(m);
2754
2755	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2756	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2757	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2758	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2759	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2760	m_free(m);
2761}
2762
2763/*
2764 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2765 */
2766static inline int
2767is_neg_adv_abort(unsigned int status)
2768{
2769	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2770	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2771}
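
/*
 * Editorial note: the negative-advice status codes indicate that HW is
 * reporting excessive retransmission or persist-timer activity rather than
 * requesting a teardown, so do_abort_req() below simply drops such
 * messages instead of aborting the connection.
 */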
2772
2773static void
2774send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2775{
2776	struct mbuf  *reply_mbuf;
2777	struct cpl_abort_req_rss *req = cplhdr(m);
2778
2779	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2780
2781	if (!reply_mbuf) {
2782		/* Defer the reply.  Stick rst_status into req->status. */
2783		req->status = rst_status;
2784		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2785		return;
2786	}
2787
2788	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2789	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2790	m_free(m);
2791
2792	/*
2793	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2794	 * these messages while ARP is pending.  For other connection states
2795	 * it's not a problem.
2796	 */
2797	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2798}
2799
2800#ifdef notyet
2801static void
2802cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2803{
2804	CXGB_UNIMPLEMENTED();
2805#ifdef notyet
2806	struct request_sock *req = child->sk_user_data;
2807
2808	inet_csk_reqsk_queue_removed(parent, req);
2809	synq_remove(tcp_sk(child));
2810	__reqsk_free(req);
2811	child->sk_user_data = NULL;
2812#endif
2813}
2814
2815
2816/*
2817 * Performs the actual work to abort a SYN_RECV connection.
2818 */
2819static void
2820do_abort_syn_rcv(struct socket *child, struct socket *parent)
2821{
2822	struct tcpcb *parenttp = sototcpcb(parent);
2823	struct tcpcb *childtp = sototcpcb(child);
2824
2825	/*
2826	 * If the server is still open we clean up the child connection,
2827	 * otherwise the server already did the clean up as it was purging
2828	 * its SYN queue and the skb was just sitting in its backlog.
2829	 */
2830	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2831		cleanup_syn_rcv_conn(child, parent);
2832		INP_INFO_WLOCK(&tcbinfo);
2833		inp_wlock(childtp->t_inpcb);
2834		t3_release_offload_resources(childtp->t_toe);
2835		childtp = tcp_close(childtp);
2836		INP_INFO_WUNLOCK(&tcbinfo);
2837		if (childtp)
2838			inp_wunlock(childtp->t_inpcb);
2839	}
2840}
2841#endif
2842
2843/*
2844 * Handle abort requests for a SYN_RECV connection.  These need extra work
2845 * because the socket is on its parent's SYN queue.
2846 */
2847static int
2848abort_syn_rcv(struct socket *so, struct mbuf *m)
2849{
2850	CXGB_UNIMPLEMENTED();
2851#ifdef notyet
2852	struct socket *parent;
2853	struct toedev *tdev = TOE_DEV(so);
2854	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2855	struct socket *oreq = so->so_incomp;
2856	struct t3c_tid_entry *t3c_stid;
2857	struct tid_info *t;
2858
2859	if (!oreq)
2860		return -1;        /* somehow we are not on the SYN queue */
2861
2862	t = &(T3C_DATA(cdev))->tid_maps;
2863	t3c_stid = lookup_stid(t, oreq->ts_recent);
2864	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2865
2866	SOCK_LOCK(parent);
2867	do_abort_syn_rcv(so, parent);
2868	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2869	SOCK_UNLOCK(parent);
2870#endif
2871	return (0);
2872}
2873
2874/*
2875 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2876 * request except that we need to reply to it.
2877 */
2878static void
2879process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2880{
2881	int rst_status = CPL_ABORT_NO_RST;
2882	const struct cpl_abort_req_rss *req = cplhdr(m);
2883	struct tcpcb *tp = sototcpcb(so);
2884	struct toepcb *toep = tp->t_toe;
2885
2886	inp_wlock(tp->t_inpcb);
2887	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2888		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2889		m_free(m);
2890		goto skip;
2891	}
2892
2893	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2894	/*
2895	 * Three cases to consider:
2896	 * a) We haven't sent an abort_req; close the connection.
2897	 * b) We have sent a post-close abort_req that will get to TP too late
2898	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2899	 *    be ignored and the connection should be closed now.
2900	 * c) We have sent a regular abort_req that will get to TP too late.
2901	 *    That will generate an abort_rpl with status 0, wait for it.
2902	 */
2903	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2904	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2905		so->so_error = abort_status_to_errno(so, req->status,
2906		    &rst_status);
2907		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2908			sorwakeup(so);
2909		/*
2910		 * SYN_RECV needs special processing.  If abort_syn_rcv()
2911		 * returns 0 it has taken care of the abort.
2912		 */
2913		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2914			goto skip;
2915
2916		t3_release_offload_resources(toep);
2917		tp = tcp_close(tp);
2918	}
2919	if (tp)
2920		inp_wunlock(tp->t_inpcb);
2921	send_abort_rpl(m, tdev, rst_status);
2922	return;
2923
2924skip:
2925	inp_wunlock(tp->t_inpcb);
2926}
2927
2928/*
2929 * Handle an ABORT_REQ_RSS CPL message.
2930 */
2931static int
2932do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2933{
2934	const struct cpl_abort_req_rss *req = cplhdr(m);
2935	struct toepcb *toep = (struct toepcb *)ctx;
2936	struct socket *so;
2937	struct inpcb *inp;
2938
2939	if (is_neg_adv_abort(req->status)) {
2940		m_free(m);
2941		return (0);
2942	}
2943
2944	printf("aborting tid=%d\n", toep->tp_tid);
2945
2946	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2947		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2948		toep->tp_flags |= TP_ABORT_REQ_RCVD;
2949		printf("sending abort rpl\n");
2950
2951		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2952		printf("sent\n");
2953		if (toep->tp_l2t)
2954			l2t_release(L2DATA(cdev), toep->tp_l2t);
2955
2956		/*
2957		 *  Unhook
2958		 */
2959		toep->tp_tp->t_toe = NULL;
2960		toep->tp_tp->t_flags &= ~TF_TOE;
2961		toep->tp_tp = NULL;
2962		/*
2963		 * XXX need to call syncache_chkrst - but we don't
2964		 * have a way of doing that yet
2965		 */
2966		toepcb_release(toep);
2967		printf("abort for unestablished connection :-(\n");
2968		return (0);
2969	}
2970	if (toep->tp_tp == NULL) {
2971		printf("disconnected toepcb\n");
2972		/* should be freed momentarily */
2973		return (0);
2974	}
2975
2976	so = toeptoso(toep);
2977	inp = sotoinpcb(so);
2978
2979	VALIDATE_SOCK(so);
2980	toepcb_hold(toep);
2981	INP_INFO_WLOCK(&tcbinfo);
2982	process_abort_req(so, m, TOE_DEV(so));
2983	INP_INFO_WUNLOCK(&tcbinfo);
2984	toepcb_release(toep);
2985	return (0);
2986}
2987#ifdef notyet
2988static void
2989pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2990{
2991	struct toedev *tdev = TOE_DEV(parent);
2992
2993	do_abort_syn_rcv(child, parent);
2994	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2995		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2996
2997		rpl->opt0h = htonl(F_TCAM_BYPASS);
2998		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2999		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3000	} else
3001		m_free(m);
3002}
3003#endif
3004static void
3005handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3006{
3007	CXGB_UNIMPLEMENTED();
3008
3009#ifdef notyet
3010	struct t3cdev *cdev;
3011	struct socket *parent;
3012	struct socket *oreq;
3013	struct t3c_tid_entry *t3c_stid;
3014	struct tid_info *t;
3015	struct tcpcb *otp, *tp = sototcpcb(so);
3016	struct toepcb *toep = tp->t_toe;
3017
3018	/*
3019	 * If the connection is being aborted due to the parent listening
3020	 * socket going away there's nothing to do, the ABORT_REQ will close
3021	 * the connection.
3022	 */
3023	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3024		m_free(m);
3025		return;
3026	}
3027
3028	oreq = so->so_incomp;
3029	otp = sototcpcb(oreq);
3030
3031	cdev = T3C_DEV(so);
3032	t = &(T3C_DATA(cdev))->tid_maps;
3033	t3c_stid = lookup_stid(t, otp->ts_recent);
3034	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3035
3036	SOCK_LOCK(parent);
3037	pass_open_abort(so, parent, m);
3038	SOCK_UNLOCK(parent);
3039#endif
3040}
3041
3042/*
3043 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3044 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3045 * connection.
3046 */
3047static void
3048pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3049{
3050
3051#ifdef notyet
3052	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3053	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3054#endif
3055	handle_pass_open_arp_failure(m_get_socket(m), m);
3056}
3057
3058/*
3059 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3060 */
3061static void
3062mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3063{
3064	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3065	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3066	unsigned int tid = GET_TID(req);
3067
3068	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3069	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3070	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3071	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3072	rpl->opt0h = htonl(F_TCAM_BYPASS);
3073	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3074	rpl->opt2 = 0;
3075	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3076}
3077
3078/*
3079 * Send a deferred reject to an accept request.
3080 */
3081static void
3082reject_pass_request(struct toedev *tdev, struct mbuf *m)
3083{
3084	struct mbuf *reply_mbuf;
3085
3086	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3087	mk_pass_accept_rpl(reply_mbuf, m);
3088	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3089	m_free(m);
3090}
3091
3092static void
3093handle_syncache_event(int event, void *arg)
3094{
3095	struct toepcb *toep = arg;
3096
3097	switch (event) {
3098	case TOE_SC_ENTRY_PRESENT:
3099		/*
3100		 * entry already exists - free toepcb
3101		 * and l2t
3102		 */
3103		printf("syncache entry present\n");
3104		toepcb_release(toep);
3105		break;
3106	case TOE_SC_DROP:
3107		/*
3108		 * The syncache has given up on this entry:
3109		 * either it timed out or it was evicted.
3110		 * We need to explicitly release the tid.
3111		 */
3112		printf("syncache entry dropped\n");
3113		toepcb_release(toep);
3114		break;
3115	default:
3116		log(LOG_ERR, "unknown syncache event %d\n", event);
3117		break;
3118	}
3119}
3120
3121static void
3122syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3123{
3124	struct in_conninfo inc;
3125	struct tcpopt to;
3126	struct tcphdr th;
3127	struct inpcb *inp;
3128	int mss, wsf, sack, ts;
3129	uint32_t rcv_isn = ntohl(req->rcv_isn);
3130
3131	bzero(&to, sizeof(struct tcpopt));
3132	inp = sotoinpcb(lso);
3133
3134	/*
3135	 * Fill out information for entering us into the syncache
3136	 */
3137	inc.inc_fport = th.th_sport = req->peer_port;
3138	inc.inc_lport = th.th_dport = req->local_port;
3139	th.th_seq = req->rcv_isn;
3140	th.th_flags = TH_SYN;
3141
3142	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3143
3144
3145	inc.inc_isipv6 = 0;
3146	inc.inc_len = 0;
3147	inc.inc_faddr.s_addr = req->peer_ip;
3148	inc.inc_laddr.s_addr = req->local_ip;
3149
3150	DPRINTF("syncache add of %d:%d %d:%d\n",
3151	    ntohl(req->local_ip), ntohs(req->local_port),
3152	    ntohl(req->peer_ip), ntohs(req->peer_port));
3153
3154	mss = req->tcp_options.mss;
3155	wsf = req->tcp_options.wsf;
3156	ts = req->tcp_options.tstamp;
3157	sack = req->tcp_options.sack;
3158	to.to_mss = mss;
3159	to.to_wscale = wsf;
3160	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3161	INP_INFO_WLOCK(&tcbinfo);
3162	inp_wlock(inp);
3163	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3164}
3165
3166
3167/*
3168 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3169 * lock held.  Note that the sock here is a listening socket that is not owned
3170 * by the TOE.
3171 */
3172static void
3173process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3174    struct listen_ctx *lctx)
3175{
3176	int rt_flags;
3177	struct l2t_entry *e;
3178	struct iff_mac tim;
3179	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3180	struct cpl_pass_accept_rpl *rpl;
3181	struct cpl_pass_accept_req *req = cplhdr(m);
3182	unsigned int tid = GET_TID(req);
3183	struct tom_data *d = TOM_DATA(tdev);
3184	struct t3cdev *cdev = d->cdev;
3185	struct tcpcb *tp = sototcpcb(so);
3186	struct toepcb *newtoep;
3187	struct rtentry *dst;
3188	struct sockaddr_in nam;
3189	struct t3c_data *td = T3C_DATA(cdev);
3190
3191	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3192	if (__predict_false(reply_mbuf == NULL)) {
3193		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3194			t3_defer_reply(m, tdev, reject_pass_request);
3195		else {
3196			cxgb_queue_tid_release(cdev, tid);
3197			m_free(m);
3198		}
3199		DPRINTF("failed to get reply_mbuf\n");
3200
3201		goto out;
3202	}
3203
3204	if (tp->t_state != TCPS_LISTEN) {
3205		DPRINTF("socket not in listen state\n");
3206
3207		goto reject;
3208	}
3209
3210	tim.mac_addr = req->dst_mac;
3211	tim.vlan_tag = ntohs(req->vlan_tag);
3212	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3213		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3214		goto reject;
3215	}
3216
3217#ifdef notyet
3218	/*
3219	 * XXX do route lookup to confirm that we're still listening on this
3220	 * address
3221	 */
3222	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3223			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3224		goto reject;
3225	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3226		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3227	dst_release(skb->dst);	// done with the input route, release it
3228	skb->dst = NULL;
3229
3230	if ((rt_flags & RTF_LOCAL) == 0)
3231		goto reject;
3232#endif
3233	/*
3234	 * XXX
3235	 */
3236	rt_flags = RTF_LOCAL;
3237	if ((rt_flags & RTF_LOCAL) == 0)
3238		goto reject;
3239
3240	/*
3241	 * Calculate values and add to syncache
3242	 */
3243
3244	newtoep = toepcb_alloc();
3245	if (newtoep == NULL)
3246		goto reject;
3247
3248	bzero(&nam, sizeof(struct sockaddr_in));
3249
3250	nam.sin_len = sizeof(struct sockaddr_in);
3251	nam.sin_family = AF_INET;
3252	nam.sin_addr.s_addr = req->peer_ip;
3253	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3254
3255	if (dst == NULL) {
3256		printf("failed to find route\n");
3257		goto reject;
3258	}
3259	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3260	    (struct sockaddr *)&nam);
3261	if (e == NULL) {
3262		DPRINTF("failed to get l2t\n");
		goto reject;	/* rpl->opt0h below dereferences e */
3263	}
3264	/*
3265	 * Point to our listen socket until accept
3266	 */
3267	newtoep->tp_tp = tp;
3268	newtoep->tp_flags = TP_SYN_RCVD;
3269	newtoep->tp_tid = tid;
3270	newtoep->tp_toedev = tdev;
3271	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3272
3273	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3274	SOCK_LOCK(so);
3275	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3276	SOCK_UNLOCK(so);
3277
3278	newtoep->tp_ulp_mode = (TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
3279	    tp->rcv_wnd >= MIN_DDP_RCV_WIN) ? ULP_MODE_TCPDDP : 0;
3280
3281	if (newtoep->tp_ulp_mode) {
3282		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3283
3284		if (ddp_mbuf == NULL)
3285			newtoep->tp_ulp_mode = 0;
3286	}
3287
3288	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3289	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3290	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3291	/*
3292	 * XXX workaround for lack of syncache drop
3293	 */
3294	toepcb_hold(newtoep);
3295	syncache_add_accept_req(req, so, newtoep);
3296
3297	rpl = cplhdr(reply_mbuf);
3298	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3299	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3300	rpl->wr.wr_lo = 0;
3301	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3302	rpl->opt2 = htonl(calc_opt2(so, tdev));
3303	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3304	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3305
3306	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3307	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3308	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3309				  CPL_PASS_OPEN_ACCEPT);
3310
3311	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3312
3313	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3314
3315	l2t_send(cdev, reply_mbuf, e);
3316	m_free(m);
3317	if (newtoep->tp_ulp_mode) {
3318		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3319				V_TF_DDP_OFF(1) |
3320				TP_DDP_TIMER_WORKAROUND_MASK,
3321				V_TF_DDP_OFF(1) |
3322		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3323	} else
3324		printf("not offloading\n");
3325
3326
3327
3328	return;
3329reject:
3330	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3331		mk_pass_accept_rpl(reply_mbuf, m);
3332	else
3333		mk_tid_release(reply_mbuf, newtoep, tid);
3334	cxgb_ofld_send(cdev, reply_mbuf);
3335	m_free(m);
3336out:
3337#if 0
3338	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3339#else
3340	return;
3341#endif
3342}
3343
3344/*
3345 * Handle a CPL_PASS_ACCEPT_REQ message.
3346 */
3347static int
3348do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3349{
3350	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3351	struct socket *lso = listen_ctx->lso;
3352	struct tom_data *d = listen_ctx->tom_data;
3353
3354#if VALIDATE_TID
3355	struct cpl_pass_accept_req *req = cplhdr(m);
3356	unsigned int tid = GET_TID(req);
3357	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3358
3359	if (unlikely(!lsk)) {
3360		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3361		       cdev->name,
3362		       (unsigned long)((union listen_entry *)ctx -
3363					t->stid_tab));
3364		return CPL_RET_BUF_DONE;
3365	}
3366	if (unlikely(tid >= t->ntids)) {
3367		printk(KERN_ERR "%s: passive open TID %u too large\n",
3368		       cdev->name, tid);
3369		return CPL_RET_BUF_DONE;
3370	}
3371	/*
3372	 * For T3A the current user of the TID may have closed but its last
3373	 * message(s) may have been backlogged so the TID appears to be still
3374	 * in use.  Just take the TID away, the connection can close at its
3375	 * own leisure.  For T3B this situation is a bug.
3376	 */
3377	if (!valid_new_tid(t, tid) &&
3378	    cdev->type != T3A) {
3379		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3380		       cdev->name, tid);
3381		return CPL_RET_BUF_DONE;
3382	}
3383#endif
3384
3385	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3386	return (0);
3387}
3388
3389/*
3390 * Called when a connection is established to translate the TCP options
3391 * reported by HW to FreeBSD's native format.
3392 */
3393static void
3394assign_rxopt(struct socket *so, unsigned int opt)
3395{
3396	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
3397	struct tcpcb *tp = sototcpcb(so);
3398	struct toepcb *toep = tp->t_toe;
3399
3400	inp_wlock_assert(tp->t_inpcb);
3401
3402	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3403	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3404	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3405	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3406	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3407	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3408		tp->rcv_scale = tp->request_r_scale;
3409}
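
#if 0
/*
 * Illustrative sketch (not compiled): the 16-bit tcp_opt word reported by
 * HW in the *_ESTABLISH CPLs packs the negotiated options; assign_rxopt()
 * above unpacks it with these accessors.  The MSS is reported as an index
 * into the per-adapter MTU table, hence the td->mtus[] lookup and the -40
 * adjustment for the fixed IP and TCP header overhead.
 */
static void
decode_tcp_opt_example(const struct t3c_data *td, unsigned int opt)
{
	int mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	int wscale_ok = G_TCPOPT_WSCALE_OK(opt);
	int tstamp = G_TCPOPT_TSTAMP(opt);
	int sack = G_TCPOPT_SACK(opt);

	printf("mss %d wscale_ok %d tstamp %d sack %d\n",
	    mss, wscale_ok, tstamp, sack);
}
#endif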
3410
3411/*
3412 * Completes some final bits of initialization for just established connections
3413 * and changes their state to TCPS_ESTABLISHED.
3414 *
3415 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3416 */
3417static void
3418make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3419{
3420	struct tcpcb *tp = sototcpcb(so);
3421	struct toepcb *toep = tp->t_toe;
3422
3423	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3424	assign_rxopt(so, opt);
3425	so->so_proto->pr_ctloutput = t3_ctloutput;
3426
3427#if 0
3428	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3429#endif
3430	/*
3431	 * XXX not clear what rcv_wup maps to
3432	 */
3433	/*
3434	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3435	 * pass through opt0.
3436	 */
3437	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3438		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3439
3440	dump_toepcb(toep);
3441
3442#ifdef notyet
3443/*
3444 * no clean interface for marking ARP up to date
3445 */
3446	dst_confirm(sk->sk_dst_cache);
3447#endif
3448	tp->t_starttime = ticks;
3449	tp->t_state = TCPS_ESTABLISHED;
3450	soisconnected(so);
3451}
3452
3453static int
3454syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3455{
3456
3457	struct in_conninfo inc;
3458	struct tcpopt to;
3459	struct tcphdr th;
3460	int mss, wsf, sack, ts;
3461	struct mbuf *m = NULL;
3462	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3463	unsigned int opt;
3464
3465#ifdef MAC
3466#error	"no MAC support"
3467#endif
3468
3469	opt = ntohs(req->tcp_opt);
3470
3471	bzero(&to, sizeof(struct tcpopt));
3472
3473	/*
3474	 * Fill out information for entering us into the syncache
3475	 */
3476	inc.inc_fport = th.th_sport = req->peer_port;
3477	inc.inc_lport = th.th_dport = req->local_port;
3478	th.th_seq = req->rcv_isn;
3479	th.th_flags = TH_ACK;
3480
3481	inc.inc_isipv6 = 0;
3482	inc.inc_len = 0;
3483	inc.inc_faddr.s_addr = req->peer_ip;
3484	inc.inc_laddr.s_addr = req->local_ip;
3485
3486	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3487	wsf  = G_TCPOPT_WSCALE_OK(opt);
3488	ts   = G_TCPOPT_TSTAMP(opt);
3489	sack = G_TCPOPT_SACK(opt);
3490
3491	to.to_mss = mss;
3492	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3493	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3494
3495	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3496	    ntohl(req->local_ip), ntohs(req->local_port),
3497	    ntohl(req->peer_ip), ntohs(req->peer_port),
3498	    mss, wsf, ts, sack);
3499	return (syncache_expand(&inc, &to, &th, so, m));
3500}
3501
3502
3503/*
3504 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3505 * if we are in TCPS_SYN_RECEIVED due to crossed SYNs.
3506 */
3507static int
3508do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3509{
3510	struct cpl_pass_establish *req = cplhdr(m);
3511	struct toepcb *toep = (struct toepcb *)ctx;
3512	struct tcpcb *tp;
3513	struct socket *so, *lso;
3514	struct t3c_data *td = T3C_DATA(cdev);
3515	/* Complete socket initialization now that we have the SND_ISN */
3516
3517	struct toedev *tdev;
3518
3519	so = lso = toeptoso(toep);
3520	tdev = toep->tp_toedev;
3521
3522	SOCK_LOCK(so);
3523	LIST_REMOVE(toep, synq_entry);
3524	SOCK_UNLOCK(so);
3525
3526	INP_INFO_WLOCK(&tcbinfo);
3527	if (!syncache_expand_establish_req(req, &so, toep)) {
3528		/*
3529		 * No entry
3530		 */
3531		CXGB_UNIMPLEMENTED();
3532	}
3533	if (so == NULL) {
3534		/*
3535		 * Couldn't create the socket
3536		 */
3537		CXGB_UNIMPLEMENTED();
3538	}
3539
3540	/*
3541	 * XXX workaround for lack of syncache drop
3542	 */
3543	toepcb_release(toep);
3544
3545	tp = sototcpcb(so);
3546	inp_wlock(tp->t_inpcb);
3547
3548	so->so_snd.sb_flags |= SB_NOCOALESCE;
3549	so->so_rcv.sb_flags |= SB_NOCOALESCE;
3550
3551	toep->tp_tp = tp;
3552	toep->tp_flags = 0;
3553	tp->t_toe = toep;
3554	reset_wr_list(toep);
3555	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3556	tp->rcv_nxt = toep->tp_copied_seq;
3557	install_offload_ops(so);
3558
3559	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3560	toep->tp_wr_unacked = 0;
3561	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3562	toep->tp_qset_idx = 0;
3563	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3564
3565	/*
3566	 * XXX Cancel any keep alive timer
3567	 */
3568
3569	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3570	INP_INFO_WUNLOCK(&tcbinfo);
3571	inp_wunlock(tp->t_inpcb);
3572
3573	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3574	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3575#ifdef notyet
3576	/*
3577	 * XXX not sure how these checks map to us
3578	 */
3579	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3580		sk->sk_state_change(sk);
3581		sk_wake_async(so, 0, POLL_OUT);
3582	}
3583	/*
3584	 * The state for the new connection is now up to date.
3585	 * Next check if we should add the connection to the parent's
3586	 * accept queue.  When the parent closes it resets connections
3587	 * on its SYN queue, so check if we are being reset.  If so we
3588	 * don't need to do anything more, the coming ABORT_RPL will
3589	 * destroy this socket.  Otherwise move the connection to the
3590	 * accept queue.
3591	 *
3592	 * Note that we reset the synq before closing the server so if
3593	 * we are not being reset the stid is still open.
3594	 */
3595	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3596		__kfree_skb(skb);
3597		goto unlock;
3598	}
3599#endif
3600	m_free(m);
3601
3602	return (0);
3603}
3604
3605/*
3606 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3607 * and send them to the TOE.
3608 */
3609static void
3610fixup_and_send_ofo(struct socket *so)
3611{
3612	struct mbuf *m;
3613	struct toedev *tdev = TOE_DEV(so);
3614	struct tcpcb *tp = sototcpcb(so);
3615	struct toepcb *toep = tp->t_toe;
3616	unsigned int tid = toep->tp_tid;
3617
3618	printf("fixup_and_send_ofo\n");
3619
3620	inp_wlock_assert(tp->t_inpcb);
3621	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3622		/*
3623		 * A variety of messages can be waiting but the fields we'll
3624		 * be touching are common to all so any message type will do.
3625		 */
3626		struct cpl_close_con_req *p = cplhdr(m);
3627
3628		p->wr.wr_lo = htonl(V_WR_TID(tid));
3629		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3630		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3631	}
3632}
3633
3634/*
3635 * Updates socket state from an active establish CPL message.  Runs with the
3636 * socket lock held.
3637 */
3638static void
3639socket_act_establish(struct socket *so, struct mbuf *m)
3640{
3641	struct cpl_act_establish *req = cplhdr(m);
3642	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3643	struct tcpcb *tp = sototcpcb(so);
3644	struct toepcb *toep = tp->t_toe;
3645
3646	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3647		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3648		    toep->tp_tid, tp->t_state);
3649
3650	tp->ts_recent_age = ticks;
3651	tp->irs = tp->rcv_nxt = rcv_isn;	/* XXX do not clobber rcv_wnd with the ISN */
3652	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3653
3654	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3655
3656	/*
3657	 * Now that we finally have a TID send any CPL messages that we had to
3658	 * defer for lack of a TID.
3659	 */
3660	if (mbufq_len(&toep->out_of_order_queue))
3661		fixup_and_send_ofo(so);
3662
3663	if (__predict_false(so->so_state & SS_NOFDREF)) {
3664		/*
3665		 * XXX does this even make sense?
3666		 */
3667		sorwakeup(so);
3668	}
3669	m_free(m);
3670#ifdef notyet
3671/*
3672 * XXX assume no write requests permitted while socket connection is
3673 * incomplete
3674 */
3675	/*
3676	 * Currently the send queue must be empty at this point because the
3677	 * socket layer does not send anything before a connection is
3678	 * established.  To be future proof though we handle the possibility
3679	 * that there are pending buffers to send (either TX_DATA or
3680	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3681	 * buffers according to the just learned write_seq, and then we send
3682	 * them on their way.
3683	 */
3684	fixup_pending_writeq_buffers(sk);
3685	if (t3_push_frames(so, 1))
3686		sk->sk_write_space(sk);
3687#endif
3688
3689	toep->tp_state = tp->t_state;
3690	tcpstat.tcps_connects++;
3691
3692}
3693
3694/*
3695 * Process a CPL_ACT_ESTABLISH message.
3696 */
3697static int
3698do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3699{
3700	struct cpl_act_establish *req = cplhdr(m);
3701	unsigned int tid = GET_TID(req);
3702	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3703	struct toepcb *toep = (struct toepcb *)ctx;
3704	struct tcpcb *tp = toep->tp_tp;
3705	struct socket *so;
3706	struct toedev *tdev;
3707	struct tom_data *d;
3708
3709	if (tp == NULL) {
3710		free_atid(cdev, atid);
3711		return (0);
3712	}
3713
3714	so = toeptoso(toep);
3715	tdev = TOE_DEV(so); /* blow up here if link was down */
3716	d = TOM_DATA(tdev);
3717
3718	inp_wlock(tp->t_inpcb);
3719
3720	/*
3721	 * It's OK if the TID is currently in use, the owning socket may have
3722	 * backlogged its last CPL message(s).  Just take it away.
3723	 */
3724	toep->tp_tid = tid;
3725	toep->tp_tp = tp;
3726	so_insert_tid(d, so, tid);
3727	free_atid(cdev, atid);
3728	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3729
3730	socket_act_establish(so, m);
3731	inp_wunlock(tp->t_inpcb);
3732	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3733	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3734
3735	return (0);
3736}
3737
3738/*
3739 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3740 * next batch of work requests from the write queue.
3741 */
3742static void
3743wr_ack(struct toepcb *toep, struct mbuf *m)
3744{
3745	struct tcpcb *tp = toep->tp_tp;
3746	struct cpl_wr_ack *hdr = cplhdr(m);
3747	struct socket *so = toeptoso(toep);
3748	unsigned int credits = ntohs(hdr->credits);
3749	u32 snd_una = ntohl(hdr->snd_una);
3750	int bytes = 0;
3751
3752	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3753
3754	inp_wlock(tp->t_inpcb);
3755
3756	toep->tp_wr_avail += credits;
3757	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3758		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3759
3760	while (credits) {
3761		struct mbuf *p = peek_wr(toep);
3762
3763		if (__predict_false(!p)) {
3764			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3765			    "nothing pending, state %u wr_avail=%u\n",
3766			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3767			break;
3768		}
3769		CTR2(KTR_TOM,
3770			"wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
3771
3772		KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
3773		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3774
3775#if DEBUG_WR > 1
3776			struct tx_data_wr *w = cplhdr(p);
3777			log(LOG_ERR,
3778			       "TID %u got %u WR credits, need %u, len %u, "
3779			       "main body %u, frags %u, seq # %u, ACK una %u,"
3780			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3781			       toep->tp_tid, credits, p->csum, p->len,
3782			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3783			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3784			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3785#endif
3786			p->m_pkthdr.csum_data -= credits;
3787			break;
3788		} else {
3789			dequeue_wr(toep);
3790			credits -= p->m_pkthdr.csum_data;
3791			bytes += p->m_pkthdr.len;
3792			CTR3(KTR_TOM,
3793			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3794			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3795
3796			m_free(p);
3797		}
3798	}
3799
3800#if DEBUG_WR
3801	check_wr_invariants(tp);
3802#endif
3803
3804	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3805#if VALIDATE_SEQ
3806		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3807
3808		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3809		    "for TID %u, snd_una %u\n", d->tdev.name, snd_una,
3810		    toep->tp_tid, tp->snd_una);
3811#endif
3812		goto out_free;
3813	}
3814
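	/*
	 * Advance the local snd_una.  Once everything written has been
	 * acknowledged the connection is idle again, so transmission no
	 * longer needs to wait and TP_TX_WAIT_IDLE is cleared.
	 */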
3815	if (tp->snd_una != snd_una) {
3816		tp->snd_una = snd_una;
3817		tp->ts_recent_age = ticks;
3818#ifdef notyet
3819		/*
3820		 * Keep ARP entry "minty fresh"
3821		 */
3822		dst_confirm(sk->sk_dst_cache);
3823#endif
3824		if (tp->snd_una == tp->snd_nxt)
3825			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3826	}
3827	if (bytes) {
3828		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3829		SOCKBUF_LOCK(&so->so_snd);
3830		sbdrop_locked(&so->so_snd, bytes);
3831		sowwakeup_locked(so);
3832	}
3833
3834	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3835		t3_push_frames(so, 0);
3836
3837out_free:
3838	inp_wunlock(tp->t_inpcb);
3839	m_free(m);
3840}
3841
3842/*
3843 * Handler for TX_DMA_ACK CPL messages.
3844 */
3845static int
3846do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3847{
3848	struct toepcb *toep = (struct toepcb *)ctx;
3849
3850	VALIDATE_SOCK(toeptoso(toep));
3851
3852	wr_ack(toep, m);
3853	return (0);
3854}
3855
3856/*
3857 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
3858 */
3859static int
3860do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
3861{
3862	m_freem(m);
3863	return (0);
3864}
3865
3866/*
3867 * Reset a connection that is on a listener's SYN queue or accept queue,
3868 * i.e., one that has not had a struct socket associated with it.
3869 * Must be called from process context.
3870 *
3871 * Modeled after code in inet_csk_listen_stop().
3872 */
3873static void
3874t3_reset_listen_child(struct socket *child)
3875{
3876	struct tcpcb *tp = sototcpcb(child);
3877
3878	t3_send_reset(tp->t_toe);
3879}
3880
3881/*
3882 * Disconnect offloaded established but not yet accepted connections sitting
3883 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
3884 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3885 */
3886void
3887t3_disconnect_acceptq(struct socket *listen_so)
3888{
3889	struct socket *so;
3890	struct tcpcb *tp;
3891
3892	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3893		tp = sototcpcb(so);
3894
3895		if (tp->t_flags & TF_TOE) {
3896			inp_wlock(tp->t_inpcb);
3897			t3_reset_listen_child(so);
3898			inp_wunlock(tp->t_inpcb);
3899		}
3900	}
3901}
3902
3903/*
3904 * Reset offloaded connections sitting on a server's syn queue.  As above
3905 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3906 */
3908void
3909t3_reset_synq(struct listen_ctx *lctx)
3910{
3911	struct toepcb *toep;
3912
3913	SOCK_LOCK(lctx->lso);
3914	while (!LIST_EMPTY(&lctx->synq_head)) {
3915		toep = LIST_FIRST(&lctx->synq_head);
3916		LIST_REMOVE(toep, synq_entry);
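		/*
		 * Detach the tcpcb first; these embryonic connections have
		 * no socket to clean up, so the reset and the eventual
		 * ABORT_RPL are handled on the toepcb alone.
		 */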
3917		toep->tp_tp = NULL;
3918		t3_send_reset(toep);
3919		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3920		toepcb_release(toep);
3921	}
3922	SOCK_UNLOCK(lctx->lso);
3923}
3924
3925
3926int
3927t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
3928		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
3929		   unsigned int pg_off, unsigned int color)
3930{
3931	unsigned int i, j, pidx;
3932	struct pagepod *p;
3933	struct mbuf *m;
3934	struct ulp_mem_io *req;
3935	struct tcpcb *tp = sototcpcb(so);
3936	struct toepcb *toep = tp->t_toe;
3937	unsigned int tid = toep->tp_tid;
3938	const struct tom_data *td = TOM_DATA(TOE_DEV(so));
3939	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
3940
3941	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
3942	    gl, nppods, tag, maxoff, pg_off, color);
3943
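	/*
	 * Each pagepod is written with its own ULP_MEM_WRITE work request.
	 * A pod carries 5 page pointers: its own 4 pages plus the first
	 * page of the next pod, hence the stride-4 start index and the
	 * 5-entry walk below.
	 */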
3944	for (i = 0; i < nppods; ++i) {
3945		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
3946		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
3947		req = mtod(m, struct ulp_mem_io *);
3948		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
3949		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
3950		req->wr.wr_lo = 0;
3951		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
3952					   V_ULPTX_CMD(ULP_MEM_WRITE));
3953		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
3954				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
3955
3956		p = (struct pagepod *)(req + 1);
3957		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
3958			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
3959			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
3960						  V_PPOD_COLOR(color));
3961			p->pp_max_offset = htonl(maxoff);
3962			p->pp_page_offset = htonl(pg_off);
3963			p->pp_rsvd = 0;
3964			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
3965				p->pp_addr[j] = pidx < gl->dgl_nelem ?
3966				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
3967		} else
3968			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
3969		send_or_defer(toep, m, 0);
3970		ppod_addr += PPOD_SIZE;
3971	}
3972	return (0);
3973}
3974
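/*
 * The mk_*_ulp() helpers below build CPL messages wrapped in ULP_TX_PKT
 * headers so that several CPLs can travel in a single compound work
 * request.  Each wrapper records the payload length in 8-byte flits,
 * hence the sizeof(*x) / 8 calculations.
 */
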
3975/*
3976 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
3977 */
3978static inline void
3979mk_cpl_barrier_ulp(struct cpl_barrier *b)
3980{
3981	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
3982
3983	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3984	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
3985	b->opcode = CPL_BARRIER;
3986}
3987
3988/*
3989 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
3990 */
3991static inline void
3992mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
3993{
3994	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3995
3997	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3998	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3999	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4000	req->cpuno = htons(cpuno);
4001}
4002
4003/*
4004 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4005 */
4006static inline void
4007mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4008                     unsigned int word, uint64_t mask, uint64_t val)
4009{
4010	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4011
4012	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4013	    tid, word, mask, val);
4014
4015	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4016	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4017	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4018	req->reply = V_NO_REPLY(1);
4019	req->cpu_idx = 0;
4020	req->word = htons(word);
4021	req->mask = htobe64(mask);
4022	req->val = htobe64(val);
4023}
4024
4025/*
4026 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4027 */
4028static void
4029mk_rx_data_ack_ulp(struct socket *so, struct cpl_rx_data_ack *ack,
4030    unsigned int tid, unsigned int credits)
4031{
4032	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4033
4034	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4035	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4036	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4037	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4038	    V_RX_DACK_MODE(TOM_TUNABLE(TOE_DEV(so), delack)) |
4039	    V_RX_CREDITS(credits));
4040}
4041
4042void
4043t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4044{
4045	unsigned int wrlen;
4046	struct mbuf *m;
4047	struct work_request_hdr *wr;
4048	struct cpl_barrier *lock;
4049	struct cpl_set_tcb_field *req;
4050	struct cpl_get_tcb *getreq;
4051	struct ddp_state *p = &toep->tp_ddp_state;
4052
4053	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4054	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4055		sizeof(*getreq);
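	/*
	 * Compound WR layout: BARRIER, SET_TCB_FIELD, GET_TCB, BARRIER,
	 * matching the wrlen computed above.  The barriers bracket the
	 * sequence so TP executes it without interleaving other work.
	 */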
4056	m = m_gethdr_nofail(wrlen);
4057	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4058	wr = mtod(m, struct work_request_hdr *);
4059	bzero(wr, wrlen);
4060
4061	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4062	m->m_pkthdr.len = m->m_len = wrlen;
4063
4064	lock = (struct cpl_barrier *)(wr + 1);
4065	mk_cpl_barrier_ulp(lock);
4066
4067	req = (struct cpl_set_tcb_field *)(lock + 1);
4068
4069	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4070
4071	/* Hmmm, not sure if this is actually a good thing: reactivating
4072	 * the other buffer might be an issue if it has already been
4073	 * completed.  However, that is unlikely, since the fact that the
4074	 * UBUF is not completed indicates that there is no outstanding data.
4075	 */
4076	if (bufidx == 0)
4077		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4078				     V_TF_DDP_ACTIVE_BUF(1) |
4079				     V_TF_DDP_BUF0_VALID(1),
4080				     V_TF_DDP_ACTIVE_BUF(1));
4081	else
4082		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4083				     V_TF_DDP_ACTIVE_BUF(1) |
4084				     V_TF_DDP_BUF1_VALID(1), 0);
4085
4086	getreq = (struct cpl_get_tcb *)(req + 1);
4087	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4088
4089	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4090
4091	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4093	p->get_tcb_count++;
4094
4095#ifdef T3_TRACE
4096	T3_TRACE1(TIDTB(toeptoso(toep)),
4097		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4098#endif
4099	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4100}
4101
4102/**
4103 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4104 * @toep: the toepcb of the connection owning the buffers
4105 * @bufidx: index of HW DDP buffer (0 or 1)
4106 * @tag0: new tag for HW buffer 0
4107 * @tag1: new tag for HW buffer 1
4108 * @len: new length for HW buf @bufidx
4109 *
4110 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4111 * buffer by changing the buffer tag and length and setting the valid and
4112 * active flag accordingly.  The caller must ensure the new buffer is at
4113 * least as big as the existing one.  Since we typically reprogram both HW
4114 * buffers, this function sets both tags for convenience.  Read the TCB to
4115 * determine how much data was written into the buffer before the overlay
4116 * took place.
4117 */
4118void
4119t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4120	 	       unsigned int tag1, unsigned int len)
4121{
4122	unsigned int wrlen;
4123	struct mbuf *m;
4124	struct work_request_hdr *wr;
4125	struct cpl_get_tcb *getreq;
4126	struct cpl_set_tcb_field *req;
4127	struct ddp_state *p = &toep->tp_ddp_state;
4128
4129	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4130	    bufidx, tag0, tag1, len);
4131	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4132	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4133	m = m_gethdr_nofail(wrlen);
4134	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4135	wr = mtod(m, struct work_request_hdr *);
4136	m->m_pkthdr.len = m->m_len = wrlen;
4137	bzero(wr, wrlen);
4138
4140	/* Set the ATOMIC flag to make sure that TP processes the following
4141	 * CPLs in an atomic manner and no wire segments can be interleaved.
4142	 */
4143	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4144	req = (struct cpl_set_tcb_field *)(wr + 1);
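	/*
	 * The BUF0 and BUF1 tags share a single 64-bit TCB word, so one
	 * SET_TCB_FIELD rewrites both; tag1 occupies the high 32 bits,
	 * hence the shifts below.
	 */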
4145	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4146			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4147			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4148			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4149			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4150	req++;
4151	if (bufidx == 0) {
4152		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4153			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4154			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4155		req++;
4156		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4157			    V_TF_DDP_PUSH_DISABLE_0(1) |
4158			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4159			    V_TF_DDP_PUSH_DISABLE_0(0) |
4160			    V_TF_DDP_BUF0_VALID(1));
4161	} else {
4162		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4163			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4164			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4165		req++;
4166		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4167			    V_TF_DDP_PUSH_DISABLE_1(1) |
4168			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4169			    V_TF_DDP_PUSH_DISABLE_1(0) |
4170			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4171	}
4172
4173	getreq = (struct cpl_get_tcb *)(req + 1);
4174	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4175
4176	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4178	p->get_tcb_count++;
4179
4180#ifdef T3_TRACE
4181	T3_TRACE4(TIDTB(toeptoso(toep)),
4182		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4183		  "len %d",
4184		  bufidx, tag0, tag1, len);
4185#endif
4186	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4187}
4188
4189/*
4190 * Sends a compound WR containing all the CPL messages needed to program the
4191 * two HW DDP buffers, namely optionally setting up the length and offset of
4192 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4193 */
4194void
4195t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4196		      unsigned int len1, unsigned int offset1,
4197                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4198{
4199	unsigned int wrlen;
4200	struct mbuf *m;
4201	struct work_request_hdr *wr;
4202	struct cpl_set_tcb_field *req;
4203
4204	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4205	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4206
4207	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4208	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4209		(len1 ? sizeof(*req) : 0) +
4210		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
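	/*
	 * Only the pieces actually needed are included: the DDP flag
	 * update is always sent, while the per-buffer offset/length
	 * updates and the trailing RX_DATA_ACK are optional, as
	 * reflected in wrlen above.
	 */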
4211	m = m_gethdr_nofail(wrlen);
4212	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4213	wr = mtod(m, struct work_request_hdr *);
4214	bzero(wr, wrlen);
4215
4216	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4217	m->m_pkthdr.len = m->m_len = wrlen;
4218
4219	req = (struct cpl_set_tcb_field *)(wr + 1);
4220	if (len0) {                  /* program buffer 0 offset and length */
4221		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4222			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4223			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4224			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4225			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4226		req++;
4227	}
4228	if (len1) {                  /* program buffer 1 offset and length */
4229		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4230			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4231			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4232			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4233			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4234		req++;
4235	}
4236
4237	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4238			     ddp_flags);
4239
4240	if (modulate) {
4241		mk_rx_data_ack_ulp(toeptoso(toep),
4242		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4243		    toep->tp_copied_seq - toep->tp_rcv_wup);
4244		toep->tp_rcv_wup = toep->tp_copied_seq;
4245	}
4246
4247#ifdef T3_TRACE
4248	T3_TRACE5(TIDTB(toeptoso(toep)),
4249		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4250		  "modulate %d",
4251		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4252		  modulate);
4253#endif
4254
4255	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4256}
4257
4258void
4259t3_init_wr_tab(unsigned int wr_len)
4260{
4261	int i;
4262
4263	if (mbuf_wrs[1])     /* already initialized */
4264		return;
4265
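	/*
	 * mbuf_wrs[i] is the number of work requests needed to send an
	 * mbuf chain of i buffers: i SGL entries occupy 3i/2 flits
	 * (rounded up) plus 3 flits of WR/CPL header, split across WRs
	 * of wr_len flits each.
	 */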
4266	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4267		int sgl_len = (3 * i) / 2 + (i & 1);
4268
4269		sgl_len += 3;
4270		mbuf_wrs[i] = sgl_len <= wr_len ?
4271		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4272	}
4273
4274	wrlen = wr_len * 8;
4275}
4276
4277int
4278t3_init_cpl_io(void)
4279{
4280#ifdef notyet
4281	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4282	if (!tcphdr_skb) {
4283		log(LOG_ERR,
4284		       "Chelsio TCP offload: can't allocate sk_buff\n");
4285		return -1;
4286	}
4287	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4288	tcphdr_skb->h.raw = tcphdr_skb->data;
4289	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4290#endif
4291
4292	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4293	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4294	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4295	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4296	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4297	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4298	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4299	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4300	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4301	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4302	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4303	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4304	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4305	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4306	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4307	return (0);
4308}
4309
4310