/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 176472 2008-02-23 01:06:17Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
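
/*
 * Illustrative sketch only: how the table above would be consulted when
 * advancing tx sequence-space counters.  The helper below is hypothetical,
 * not part of this driver; it exists purely to show the indexing by ULP
 * submode (e.g. submode 3, header plus data digest, adds 8 bytes).
 */
#if 0
static inline unsigned int
ulp_tx_seq_len(unsigned int submode, unsigned int payload_len)
{
	/* total TCP sequence space consumed, HW additions included */
	return (payload_len + t3_ulp_extra_len[submode & 3]);
}
#endif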

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * number of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
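
/*
 * Sketch (an assumption, not the driver's actual initialization): if one
 * WR can describe up to "sgl_per_wr" gather-list entries, the table would
 * be populated along these lines, mapping a gather-list length to the
 * number of WRs needed to carry it.
 */
#if 0
static void
example_init_mbuf_wrs(unsigned int sgl_per_wr)
{
	unsigned int i;

	mbuf_wrs[0] = 1;
	for (i = 1; i <= TX_MAX_SEGS; i++)
		/* each WR carries at most sgl_per_wr gather entries */
		mbuf_wrs[i] = (i + sgl_per_wr - 1) / sgl_per_wr;
}
#endif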

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
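
/*
 * Worked example: an application asking for a 16KB receive buffer still
 * gets a 24KB (MIN_RCV_WND) window so a coalesced jumbo segment fits,
 * while anything above MAX_RCV_WND (128MB - 1) is clamped; only part of
 * the window goes out in option0, the remainder is opened up later
 * through RX_DATA_ACK credit returns.
 */
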
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	sbappend_locked(sb, n);
	m = sb->sb_mb;
	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
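
/*
 * Usage sketch (illustrative, not called in this file): releasing a TID
 * by hand would pair mk_tid_release() with the fixed-size allocator and
 * send path already used for CPLs here.
 */
#if 0
static void
example_release_tid(struct toepcb *toep, unsigned int tid)
{
	struct mbuf *m = m_gethdr_nofail(sizeof(struct cpl_tid_release));

	mk_tid_release(m, toep, tid);
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
#endif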

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && (so->so_snd.sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int i, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_snd);
	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}

		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	return (total_bytes);
}
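
/*
 * Credit bookkeeping in t3_push_frames(), by example: a WR covering a
 * 3-entry gather list consumes mbuf_wrs[3] credits, which move from
 * tp_wr_avail to tp_wr_unacked; once tp_wr_unacked reaches half of
 * tp_wr_max the WR is stamped F_WR_COMPL, and the completion from the
 * card eventually hands those credits back.  The per-WR credit count is
 * stashed in m_pkthdr.csum_data of the queued mbuf for that purpose.
 */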

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	so = tp->t_inpcb->inp_socket;
	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			SOCKBUF_LOCK(&so->so_rcv);
			toep->tp_copied_seq += copied;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		return;
	}

	INP_LOCK_ASSERT(tp->t_inpcb);
	SOCKBUF_LOCK(&so->so_rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = so->so_rcv.sb_cc;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax) {
		printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = so->so_rcv.sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (toep->tp_ulp_mode)
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
	else {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	}

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
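
/*
 * Numeric example for the return policy above: with rcv_wnd = 64KB,
 * credits are forced out (must_send) as soon as more than 48KB of the
 * window has been consumed, since less than 16KB would remain for
 * coalescing; below that point credits accumulate until they reach the
 * rx_credit_thres tunable.
 */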

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = tp->t_inpcb->inp_socket;
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = tp->t_inpcb->inp_socket;
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
	INP_LOCK_ASSERT(tp->t_inpcb);
	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;
	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	INP_INFO_WLOCK(&tcbinfo);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
	INP_INFO_WUNLOCK(&tcbinfo);
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to the freelist).  [Note that the peer
 * should set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))
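
/*
 * Reading the two macros above: the mask covers BUF0_VALID, ACTIVE_BUF,
 * the whole BUF0 offset field and the low bits of the BUF0 length (all of
 * which live in the upper 32 bits of the TCB word); the value then marks
 * buffer 0 valid but inactive, with offset 1 into a length-2 buffer, i.e.
 * exactly the partially placed state that forces TP to arm its push timer.
 */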

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on) {
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN((uint64_t)M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return (0);
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);
	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
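
/*
 * Example: with an MTU table of {576, 1492, 1500, 9000}, a target MTU of
 * 1500 yields index 2, and a target of 8000 also yields index 2, since
 * 9000 would exceed the target.
 */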

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
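
/*
 * Example: a 512KB target window yields wscale 4 (512K >> 4 = 32K, which
 * fits in the 16-bit window field); with RFC 1323 disabled the function
 * returns 0 and no scaling is negotiated.
 */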

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = so->so_rcv.sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;
	so->so_rcv.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return (V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx));
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}
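
/*
 * Note on units for calc_opt0l(): RCV_BUFSIZ is expressed in 1KB units
 * (rcv_wnd >> 10), so a 256KB window is encoded as 256, clamped to
 * M_RCV_BUFSIZ; a window larger than the field allows is opened up later
 * via RX_DATA_ACK credits (see MAX_RCV_WND above).
 */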
1249
1250static inline unsigned int
1251calc_opt2(const struct socket *so, struct toedev *dev)
1252{
1253	int flv_valid;
1254
1255	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1256
1257	return (V_FLAVORS_VALID(flv_valid) |
1258	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1259}
1260
1261#if DEBUG_WR > 1
1262static int
1263count_pending_wrs(const struct toepcb *toep)
1264{
1265	const struct mbuf *m;
1266	int n = 0;
1267
1268	wr_queue_walk(toep, m)
1269		n += m->m_pkthdr.csum_data;
1270	return (n);
1271}
1272#endif
1273
1274#if 0
1275(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1276#endif
1277
1278static void
1279mk_act_open_req(struct socket *so, struct mbuf *m,
1280    unsigned int atid, const struct l2t_entry *e)
1281{
1282	struct cpl_act_open_req *req;
1283	struct inpcb *inp = sotoinpcb(so);
1284	struct tcpcb *tp = intotcpcb(inp);
1285	struct toepcb *toep = tp->t_toe;
1286	struct toedev *tdev = TOE_DEV(so);
1287
1288	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1289
1290	req = mtod(m, struct cpl_act_open_req *);
1291	m->m_pkthdr.len = m->m_len = sizeof(*req);
1292
1293	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1294	req->wr.wr_lo = 0;
1295	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1296	req->local_port = inp->inp_lport;
1297	req->peer_port = inp->inp_fport;
1298	memcpy(&req->local_ip, &inp->inp_laddr, 4);
1299	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1300	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1301			   V_TX_CHANNEL(e->smt_idx));
1302	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1303	req->params = 0;
1304	req->opt2 = htonl(calc_opt2(so, tdev));
1305}
1306
1307
1308/*
1309 * Convert an ACT_OPEN_RPL status to an errno.
1310 */
1311static int
1312act_open_rpl_status_to_errno(int status)
1313{
1314	switch (status) {
1315	case CPL_ERR_CONN_RESET:
1316		return (ECONNREFUSED);
1317	case CPL_ERR_ARP_MISS:
1318		return (EHOSTUNREACH);
1319	case CPL_ERR_CONN_TIMEDOUT:
1320		return (ETIMEDOUT);
1321	case CPL_ERR_TCAM_FULL:
1322		return (ENOMEM);
1323	case CPL_ERR_CONN_EXIST:
1324		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1325		return (EADDRINUSE);
1326	default:
1327		return (EIO);
1328	}
1329}
1330
1331static void
1332fail_act_open(struct toepcb *toep, int errno)
1333{
1334	struct tcpcb *tp = toep->tp_tp;
1335
1336	t3_release_offload_resources(toep);
1337	if (tp) {
1338		INP_LOCK_ASSERT(tp->t_inpcb);
1339		tcp_drop(tp, errno);
1340	}
1341
1342#ifdef notyet
1343	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1344#endif
1345}
1346
1347/*
1348 * Handle active open failures.
1349 */
1350static void
1351active_open_failed(struct toepcb *toep, struct mbuf *m)
1352{
1353	struct cpl_act_open_rpl *rpl = cplhdr(m);
1354	struct inpcb *inp;
1355
1356	INP_INFO_WLOCK(&tcbinfo);
1357	if (toep->tp_tp == NULL)
1358		goto done;
1359
1360	inp = toep->tp_tp->t_inpcb;
1361	INP_LOCK(inp);
1362
1363/*
1364 * Don't handle connection retry for now
1365 */
1366#ifdef notyet
1367	struct inet_connection_sock *icsk = inet_csk(sk);
1368
1369	if (rpl->status == CPL_ERR_CONN_EXIST &&
1370	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1371		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1372		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1373			       jiffies + HZ / 2);
1374	} else
1375#endif
1376		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1377	INP_UNLOCK(inp);
1378done:
1379	INP_INFO_WUNLOCK(&tcbinfo);
1380
1381	m_free(m);
1382}
1383
1384/*
1385 * Return whether a failed active open has allocated a TID
1386 */
1387static inline int
1388act_open_has_tid(int status)
1389{
1390	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1391	       status != CPL_ERR_ARP_MISS;
1392}
1393
1394/*
1395 * Process an ACT_OPEN_RPL CPL message.
1396 */
1397static int
1398do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1399{
1400	struct toepcb *toep = (struct toepcb *)ctx;
1401	struct cpl_act_open_rpl *rpl = cplhdr(m);
1402
1403	if (cdev->type != T3A && act_open_has_tid(rpl->status))
1404		cxgb_queue_tid_release(cdev, GET_TID(rpl));
1405
1406	active_open_failed(toep, m);
1407	return (0);
1408}
1409
1410/*
1411 * Handle an ARP failure for an active open.   XXX purge ofo queue
1412 *
1413 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1414 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1415 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
1416 * free the atid.  Hmm.
1417 */
1418#ifdef notyet
1419static void
1420act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1421{
1422	struct toepcb *toep = m_get_toep(m);
1423	struct tcpcb *tp = toep->tp_tp;
1424	struct inpcb *inp = tp->t_inpcb;
1425	struct socket *so = toeptoso(toep);
1426
1427	INP_LOCK(inp);
1428	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1429		fail_act_open(so, EHOSTUNREACH);
1430		printf("freeing %p\n", m);
1431
1432		m_free(m);
1433	}
1434	INP_UNLOCK(inp);
1435}
1436#endif
1437/*
1438 * Send an active open request.
1439 */
1440int
1441t3_connect(struct toedev *tdev, struct socket *so,
1442    struct rtentry *rt, struct sockaddr *nam)
1443{
1444	struct mbuf *m;
1445	struct l2t_entry *e;
1446	struct tom_data *d = TOM_DATA(tdev);
1447	struct inpcb *inp = sotoinpcb(so);
1448	struct tcpcb *tp = intotcpcb(inp);
1449	struct toepcb *toep; /* allocated by init_offload_socket */
1450
1451	int atid;
1452
1453	toep = toepcb_alloc();
1454	if (toep == NULL)
1455		goto out_err;
1456
1457	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1458		goto out_err;
1459
1460	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1461	if (!e)
1462		goto free_tid;
1463
1464	INP_LOCK_ASSERT(inp);
1465	m = m_gethdr(MT_DATA, M_WAITOK);
1466
1467#if 0
1468	m->m_toe.mt_toepcb = tp->t_toe;
1469	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1470#endif
1471	SOCK_LOCK(so);
1472
1473	init_offload_socket(so, tdev, atid, e, rt, toep);
1474
1475	install_offload_ops(so);
1476
1477	mk_act_open_req(so, m, atid, e);
1478	SOCK_UNLOCK(so);
1479
1480	soisconnecting(so);
1481	toep = tp->t_toe;
1482	m_set_toep(m, tp->t_toe);
1483
1484	toep->tp_state = TCPS_SYN_SENT;
1485	l2t_send(d->cdev, (struct mbuf *)m, e);
1486
1487	if (toep->tp_ulp_mode)
1488		t3_enable_ddp(so, 0);
1489	return 	(0);
1490
1491free_tid:
1492	printf("failing connect - free atid\n");
1493
1494	free_atid(d->cdev, atid);
1495out_err:
1496	printf("return ENOMEM\n");
1497       return (ENOMEM);
1498}
1499
1500/*
1501 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1502 * not send multiple ABORT_REQs for the same connection and also that we do
1503 * not try to send a message after the connection has closed.  Returns 1 if
1504 * an ABORT_REQ wasn't generated after all, 0 otherwise.
1505 */
1506static void
1507t3_send_reset(struct toepcb *toep)
1508{
1509
1510	struct cpl_abort_req *req;
1511	unsigned int tid = toep->tp_tid;
1512	int mode = CPL_ABORT_SEND_RST;
1513	struct tcpcb *tp = toep->tp_tp;
1514	struct toedev *tdev = toep->tp_toedev;
1515	struct socket *so = NULL;
1516	struct mbuf *m;
1517
1518	if (tp) {
1519		INP_LOCK_ASSERT(tp->t_inpcb);
1520		so = toeptoso(toep);
1521	}
1522
1523	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1524		tdev == NULL))
1525		return;
1526	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1527
1528	/* Purge the send queue so we don't send anything after an abort. */
1529	if (so)
1530		sbflush(&so->so_snd);
1531	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1532		mode |= CPL_ABORT_POST_CLOSE_REQ;
1533
1534	m = m_gethdr_nofail(sizeof(*req));
1535	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1536	set_arp_failure_handler(m, abort_arp_failure);
1537
1538	req = mtod(m, struct cpl_abort_req *);
1539	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1540	req->wr.wr_lo = htonl(V_WR_TID(tid));
1541	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1542	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1543	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1544	req->cmd = mode;
1545	if (tp && (tp->t_state == TCPS_SYN_SENT))
1546		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1547	else
1548		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1549}
1550
1551static int
1552t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1553{
1554	struct inpcb *inp;
1555	int error, optval;
1556
1557	if (sopt->sopt_name == IP_OPTIONS)
1558		return (ENOPROTOOPT);
1559
1560	if (sopt->sopt_name != IP_TOS)
1561		return (EOPNOTSUPP);
1562
1563	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1564
1565	if (error)
1566		return (error);
1567
1568	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1569		return (EPERM);
1570
1571	inp = sotoinpcb(so);
1572	inp->inp_ip_tos = optval;
1573
1574	t3_set_tos(so);
1575
1576	return (0);
1577}
1578
1579static int
1580t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1581{
1582	int err = 0;
1583	size_t copied;
1584
1585	if (sopt->sopt_name != TCP_CONGESTION &&
1586	    sopt->sopt_name != TCP_NODELAY)
1587		return (EOPNOTSUPP);
1588
1589	if (sopt->sopt_name == TCP_CONGESTION) {
1590		char name[TCP_CA_NAME_MAX];
1591		int optlen = sopt->sopt_valsize;
1592		struct tcpcb *tp;
1593
1594		if (optlen < 1)
1595			return (EINVAL);
1596
1597		err = copyinstr(sopt->sopt_val, name,
1598		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1599		if (err)
1600			return (err);
1601		if (copied < 1)
1602			return (EINVAL);
1603
1604		tp = sototcpcb(so);
1605		/*
1606		 * XXX I need to revisit this
1607		 */
1608		if ((err = t3_set_cong_control(so, name)) == 0) {
1609#ifdef CONGESTION_CONTROL_SUPPORTED
1610			tp->t_cong_control = strdup(name, M_CXGB);
1611#endif
1612		} else
1613			return (err);
1614	} else {
1615		int optval, oldval;
1616		struct inpcb *inp;
1617		struct tcpcb *tp;
1618
1619		err = sooptcopyin(sopt, &optval, sizeof optval,
1620		    sizeof optval);
1621
1622		if (err)
1623			return (err);
1624
1625		inp = sotoinpcb(so);
1626		tp = intotcpcb(inp);
1627
1628		INP_LOCK(inp);
1629
1630		oldval = tp->t_flags;
1631		if (optval)
1632			tp->t_flags |= TF_NODELAY;
1633		else
1634			tp->t_flags &= ~TF_NODELAY;
1635		INP_UNLOCK(inp);
1636
1637		if (oldval != tp->t_flags)
1638			t3_set_nagle(so);
1639
1640	}
1641
1642	return (0);
1643}
1644
1645static int
1646t3_ctloutput(struct socket *so, struct sockopt *sopt)
1647{
1648	int err;
1649
1650	if (sopt->sopt_level != IPPROTO_TCP)
1651		err =  t3_ip_ctloutput(so, sopt);
1652	else
1653		err = t3_tcp_ctloutput(so, sopt);
1654
1655	if (err != EOPNOTSUPP)
1656		return (err);
1657
1658	return (tcp_ctloutput(so, sopt));
1659}
1660
1661/*
1662 * Returns true if we need to explicitly request RST when we receive new data
1663 * on an RX-closed connection.
1664 */
1665static inline int
1666need_rst_on_excess_rx(const struct toepcb *toep)
1667{
1668	return (1);
1669}
1670
1671/*
1672 * Handles Rx data that arrives in a state where the socket isn't accepting
1673 * new data.
1674 */
1675static void
1676handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1677{
1678
1679	if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1680		t3_send_reset(toep);
1681	m_freem(m);
1682}
1683
1684/*
1685 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1686 * by getting the DDP offset from the TCB.
1687 */
1688static void
1689tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1690{
1691	struct ddp_state *q = &toep->tp_ddp_state;
1692	struct ddp_buf_state *bsp;
1693	struct cpl_get_tcb_rpl *hdr;
1694	unsigned int ddp_offset;
1695	struct socket *so;
1696	struct tcpcb *tp;
1697
1698	uint64_t t;
1699	__be64 *tcb;
1700
1701	so = toeptoso(toep);
1702	tp = toep->tp_tp;
1703
1704	INP_LOCK_ASSERT(tp->t_inpcb);
1705	SOCKBUF_LOCK(&so->so_rcv);
1706
1707	/* Note that we only accout for CPL_GET_TCB issued by the DDP code. We
1708	 * really need a cookie in order to dispatch the RPLs.
1709	 */
1710	q->get_tcb_count--;
1711
1712	/* It is a possible that a previous CPL already invalidated UBUF DDP
1713	 * and moved the cur_buf idx and hence no further processing of this
1714	 * skb is required. However, the app might be sleeping on
1715	 * !q->get_tcb_count and we need to wake it up.
1716	 */
1717	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1718		struct socket *so = toeptoso(toep);
1719
1720		m_freem(m);
1721		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
1722			sorwakeup_locked(so);
1723		else
1724			SOCKBUF_UNLOCK(&so->so_rcv);
1725		return;
1726	}
1727
1728	bsp = &q->buf_state[q->cur_buf];
1729	hdr = cplhdr(m);
1730	tcb = (__be64 *)(hdr + 1);
1731	if (q->cur_buf == 0) {
1732		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1733		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1734	} else {
1735		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1736		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1737	}
1738	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1739	m->m_cur_offset = bsp->cur_offset;
1740	bsp->cur_offset = ddp_offset;
1741	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1742
1743	CTR5(KTR_TOM,
1744	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1745	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1746	KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
1747		ddp_offset, m->m_cur_offset));
1748
1749#ifdef T3_TRACE
1750	T3_TRACE3(TIDTB(so),
1751		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
1752		  tp->rcv_nxt, q->cur_buf, ddp_offset);
1753#endif
1754
1755#if 0
1756{
1757	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1758
1759	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1760	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1761
1762        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1763        rcv_nxt = t >> S_TCB_RCV_NXT;
1764        rcv_nxt &= M_TCB_RCV_NXT;
1765
1766        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1767        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1768        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1769
1770	T3_TRACE2(TIDTB(sk),
1771		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1772		  ddp_flags, rcv_nxt - rx_hdr_offset);
1773	T3_TRACE4(TB(q),
1774		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1775		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1776	T3_TRACE3(TB(q),
1777		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1778		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1779	T3_TRACE2(TB(q),
1780		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1781		 q->buf_state[0].flags, q->buf_state[1].flags);
1782
1783}
1784#endif
1785	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1786		handle_excess_rx(toep, m);
1787		return;
1788	}
1789
1790#ifdef T3_TRACE
1791	if ((int)m->m_pkthdr.len < 0) {
1792		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1793	}
1794#endif
1795	if (bsp->flags & DDP_BF_NOCOPY) {
1796#ifdef T3_TRACE
1797		T3_TRACE0(TB(q),
1798			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1799
1800		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1801			printk("!cancel_ubuf");
1802			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1803		}
1804#endif
1805		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1806		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1807		q->cur_buf ^= 1;
1808	} else if (bsp->flags & DDP_BF_NOFLIP) {
1809
1810		m->m_ddp_flags = 1;    /* always a kernel buffer */
1811
1812		/* now HW buffer carries a user buffer */
1813		bsp->flags &= ~DDP_BF_NOFLIP;
1814		bsp->flags |= DDP_BF_NOCOPY;
1815
1816		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1817		 * any new data in which case we're done. If in addition the
1818		 * offset is 0, then there wasn't a completion for the kbuf
1819		 * and we need to decrement the posted count.
1820		 */
1821		if (m->m_pkthdr.len == 0) {
1822			if (ddp_offset == 0) {
1823				q->kbuf_posted--;
1824				bsp->flags |= DDP_BF_NODATA;
1825			}
1826			SOCKBUF_UNLOCK(&so->so_rcv);
1827
1828			m_free(m);
1829			return;
1830		}
1831	} else {
1832		SOCKBUF_UNLOCK(&so->so_rcv);
1833		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1834		 * but it got here way late and nobody cares anymore.
1835		 */
1836		m_free(m);
1837		return;
1838	}
1839
1840	m->m_ddp_gl = (unsigned char *)bsp->gl;
1841	m->m_flags |= M_DDP;
1842	m->m_seq = tp->rcv_nxt;
1843	tp->rcv_nxt += m->m_pkthdr.len;
1844	tp->t_rcvtime = ticks;
1845#ifdef T3_TRACE
1846	T3_TRACE3(TB(q),
1847		  "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
1848		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1849#endif
1850	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1851		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1852	if (m->m_pkthdr.len == 0)
1853		q->user_ddp_pending = 0;
1854	else
1855		SBAPPEND(&so->so_rcv, m);
1856	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
1857		sorwakeup_locked(so);
1858	else
1859		SOCKBUF_UNLOCK(&so->so_rcv);
1860}
1861
1862/*
1863 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1864 * in that case they are similar to DDP completions.
1865 */
1866static int
1867do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1868{
1869	struct toepcb *toep = (struct toepcb *)ctx;
1870
1871	/* OK if socket doesn't exist */
1872	if (toep == NULL) {
1873		printf("null toep in do_get_tcb_rpl\n");
1874		return (CPL_RET_BUF_DONE);
1875	}
1876
1877	INP_LOCK(toep->tp_tp->t_inpcb);
1878	tcb_rpl_as_ddp_complete(toep, m);
1879	INP_UNLOCK(toep->tp_tp->t_inpcb);
1880
1881	return (0);
1882}
1883
1884static void
1885handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1886{
1887	struct tcpcb *tp = toep->tp_tp;
1888	struct socket *so = toeptoso(toep);
1889	struct ddp_state *q;
1890	struct ddp_buf_state *bsp;
1891	struct cpl_rx_data *hdr = cplhdr(m);
1892	unsigned int rcv_nxt = ntohl(hdr->seq);
1893
1894	if (tp->rcv_nxt == rcv_nxt)
1895		return;
1896
1897	INP_LOCK_ASSERT(tp->t_inpcb);
1898	SOCKBUF_LOCK(&so->so_rcv);
1899	q = &toep->tp_ddp_state;
1900	bsp = &q->buf_state[q->cur_buf];
1901	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
1902		rcv_nxt, tp->rcv_nxt));
1903	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1904	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1905	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1906	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1907
1908#ifdef T3_TRACE
1909	if ((int)m->m_pkthdr.len < 0) {
1910		t3_ddp_error(so, "handle_ddp_data: neg len");
1911	}
1912#endif
1913
1914	m->m_ddp_gl = (unsigned char *)bsp->gl;
1915	m->m_flags |= M_DDP;
1916	m->m_cur_offset = bsp->cur_offset;
1917	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
1918	if (bsp->flags & DDP_BF_NOCOPY)
1919		bsp->flags &= ~DDP_BF_NOCOPY;
1920
1921	m->m_seq = tp->rcv_nxt;
1922	tp->rcv_nxt = rcv_nxt;
1923	bsp->cur_offset += m->m_pkthdr.len;
1924	if (!(bsp->flags & DDP_BF_NOFLIP))
1925		q->cur_buf ^= 1;
1926	/*
1927	 * For now, don't re-enable DDP after a connection fell out of  DDP
1928	 * mode.
1929	 */
1930	q->ubuf_ddp_ready = 0;
1931	SOCKBUF_UNLOCK(&so->so_rcv);
1932}
1933
1934/*
1935 * Process new data received for a connection.
1936 */
1937static void
1938new_rx_data(struct toepcb *toep, struct mbuf *m)
1939{
1940	struct cpl_rx_data *hdr = cplhdr(m);
1941	struct tcpcb *tp = toep->tp_tp;
1942	struct socket *so = toeptoso(toep);
1943	int len = be16toh(hdr->len);
1944
1945	INP_LOCK(tp->t_inpcb);
1946
1947	if (__predict_false(so_no_receive(so))) {
1948		handle_excess_rx(toep, m);
1949		INP_UNLOCK(tp->t_inpcb);
1950		TRACE_EXIT;
1951		return;
1952	}
1953
1954	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
1955		handle_ddp_data(toep, m);
1956
1957	m->m_seq = ntohl(hdr->seq);
1958	m->m_ulp_mode = 0;                    /* for iSCSI */
1959
1960#if VALIDATE_SEQ
1961	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
1962		log(LOG_ERR,
1963		       "%s: TID %u: Bad sequence number %u, expected %u\n",
1964		    TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
1965		       tp->rcv_nxt);
1966		m_freem(m);
1967		INP_UNLOCK(tp->t_inpcb);
1968		return;
1969	}
1970#endif
1971	m_adj(m, sizeof(*hdr));
1972
1973#ifdef URGENT_DATA_SUPPORTED
1974	/*
1975	 * We don't handle urgent data yet
1976	 */
1977	if (__predict_false(hdr->urg))
1978		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
1979	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
1980		     tp->urg_seq - tp->rcv_nxt < skb->len))
1981		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
1982							 tp->rcv_nxt];
1983#endif
1984	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
1985		toep->tp_delack_mode = hdr->dack_mode;
1986		toep->tp_delack_seq = tp->rcv_nxt;
1987	}
1988	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
1989	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
1990
1991	if (len < m->m_pkthdr.len)
1992		m->m_pkthdr.len = m->m_len = len;
1993
1994	tp->rcv_nxt += m->m_pkthdr.len;
1995	tp->t_rcvtime = ticks;
1996	toep->tp_enqueued_bytes += m->m_pkthdr.len;
1997#ifdef T3_TRACE
1998	T3_TRACE2(TIDTB(sk),
1999	    "new_rx_data: seq 0x%x len %u",
2000	    m->m_seq, m->m_pkthdr.len);
2001#endif
2002	INP_UNLOCK(tp->t_inpcb);
2003	SOCKBUF_LOCK(&so->so_rcv);
2004	if (sb_notify(&so->so_rcv))
2005		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
2006
2007	SBAPPEND(&so->so_rcv, m);
2008
2009#ifdef notyet
2010	/*
2011	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2012	 *
2013	 */
2014	KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
2016	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2017		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
2018#endif
2019
2020
2021	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2022	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
2023
2024	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2025		sorwakeup_locked(so);
2026	else
2027		SOCKBUF_UNLOCK(&so->so_rcv);
2028}
2029
2030/*
2031 * Handler for RX_DATA CPL messages.
2032 */
2033static int
2034do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2035{
2036	struct toepcb *toep = (struct toepcb *)ctx;
2037
2038	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2039
2040	new_rx_data(toep, m);
2041
2042	return (0);
2043}
2044
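/*
 * Process new data received via DDP, i.e., a CPL_RX_DATA_DDP message.  The
 * DDP report tells us how much payload the HW placed and where; tag the mbuf
 * with the gather list and offset and append it to the receive buffer.
 */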
2045static void
2046new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2047{
2048	struct tcpcb *tp;
2049	struct ddp_state *q;
2050	struct ddp_buf_state *bsp;
2051	struct cpl_rx_data_ddp *hdr;
2052	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2053	struct socket *so = toeptoso(toep);
2054	int nomoredata = 0;
2055
2056	tp = sototcpcb(so);
2057
2058	INP_LOCK(tp->t_inpcb);
2059	if (__predict_false(so_no_receive(so))) {
2060
2061		handle_excess_rx(toep, m);
2062		INP_UNLOCK(tp->t_inpcb);
2063		return;
2064	}
2065
2066	q = &toep->tp_ddp_state;
2067	hdr = cplhdr(m);
2068	ddp_report = ntohl(hdr->u.ddp_report);
2069	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2070	bsp = &q->buf_state[buf_idx];
2071
2072#ifdef T3_TRACE
2073	T3_TRACE5(TIDTB(sk),
2074		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2075		  "hdr seq 0x%x len %u offset %u",
2076		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2077		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
2078	T3_TRACE1(TIDTB(sk),
2079		  "new_rx_data_ddp: ddp_report 0x%x",
2080		  ddp_report);
2081#endif
2082	CTR4(KTR_TOM,
2083	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2084	    "hdr seq 0x%x len %u",
2085	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2086	    ntohs(hdr->len));
2087	CTR3(KTR_TOM,
2088	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2089	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2090
2091	ddp_len = ntohs(hdr->len);
2092	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2093
2094	m->m_seq = tp->rcv_nxt;
2095	tp->rcv_nxt = rcv_nxt;
2096
2097	tp->t_rcvtime = ticks;
2098	/*
2099	 * Store the length in m->m_len.  We are changing the meaning of
2100	 * m->m_len here, we need to be very careful that nothing from now on
2101	 * interprets ->len of this packet the usual way.
2102	 */
2103	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2104	INP_UNLOCK(tp->t_inpcb);
2105	CTR3(KTR_TOM,
2106	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2107	    m->m_len, rcv_nxt, m->m_seq);
2108	/*
2109	 * Figure out where the new data was placed in the buffer and store it
2110	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the
2111	 * consumer needs to account for the page pod's pg_offset.
2112	 */
2113	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2114	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2115
2116	SOCKBUF_LOCK(&so->so_rcv);
2117	m->m_ddp_gl = (unsigned char *)bsp->gl;
2118	m->m_flags |= M_DDP;
2119	bsp->cur_offset = end_offset;
2120	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2121
2122	/*
2123	 * Length is only meaningful for kbuf
2124	 */
2125	if (!(bsp->flags & DDP_BF_NOCOPY))
2126		KASSERT(m->m_len <= bsp->gl->dgl_length,
2127		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2128			m->m_len, bsp->gl->dgl_length));
2129
2130	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2131	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2132
2133
2134        /*
2135	 * Bit 0 of flags stores whether the DDP buffer is completed.
2136	 * Note that other parts of the code depend on this being in bit 0.
2137	 */
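	/* Consumers can therefore test completion with (m->m_ddp_flags & 1). */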
2138	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2139		panic("spurious ddp completion");
2140	} else {
2141		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2142		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2143			q->cur_buf ^= 1;                     /* flip buffers */
2144	}
2145
2146	if (bsp->flags & DDP_BF_NOCOPY) {
2147		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2148		bsp->flags &= ~DDP_BF_NOCOPY;
2149	}
2150
2151	if (ddp_report & F_DDP_PSH)
2152		m->m_ddp_flags |= DDP_BF_PSH;
2153	if (nomoredata)
2154		m->m_ddp_flags |= DDP_BF_NODATA;
2155
2156	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2157		toep->tp_delack_mode = G_DDP_DACK_MODE(ddp_report);
2158		toep->tp_delack_seq = tp->rcv_nxt;
2159	}
2160
2161	SBAPPEND(&so->so_rcv, m);
2162
2163	if ((so->so_state & SS_NOFDREF) == 0)
2164		sorwakeup_locked(so);
2165	else
2166		SOCKBUF_UNLOCK(&so->so_rcv);
2167}
2168
2169#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2170		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2171		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2172		 F_DDP_INVALID_PPOD)
2173
2174/*
2175 * Handler for RX_DATA_DDP CPL messages.
2176 */
2177static int
2178do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2179{
2180	struct toepcb *toep = ctx;
2181	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2182
2183	VALIDATE_SOCK(so);
2184
2185	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2186		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2187		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2188		return (CPL_RET_BUF_DONE);
2189	}
2190#if 0
2191	skb->h.th = tcphdr_skb->h.th;
2192#endif
2193	new_rx_data_ddp(toep, m);
2194	return (0);
2195}
2196
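/*
 * Handle the payload-placement part of a CPL_RX_DDP_COMPLETE: the HW is done
 * with a DDP buffer.  Compute the bytes placed since the last report from the
 * buffer offset, advance rcv_nxt, and queue a DDP mbuf for the data.
 */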
2197static void
2198process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2199{
2200	struct tcpcb *tp = toep->tp_tp;
2201	struct socket *so = toeptoso(toep);
2202	struct ddp_state *q;
2203	struct ddp_buf_state *bsp;
2204	struct cpl_rx_ddp_complete *hdr;
2205	unsigned int ddp_report, buf_idx, when;
2206	int nomoredata = 0;
2207
2208	INP_LOCK(tp->t_inpcb);
2209	if (__predict_false(so_no_receive(so))) {
2210		struct inpcb *inp = sotoinpcb(so);
2211
2212		handle_excess_rx(toep, m);
2213		INP_UNLOCK(inp);
2214		return;
2215	}
2216	q = &toep->tp_ddp_state;
2217	hdr = cplhdr(m);
2218	ddp_report = ntohl(hdr->ddp_report);
2219	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2220	m->m_pkthdr.csum_data = tp->rcv_nxt;
2221
2222
2223	SOCKBUF_LOCK(&so->so_rcv);
2224	bsp = &q->buf_state[buf_idx];
2225	when = bsp->cur_offset;
2226	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2227	tp->rcv_nxt += m->m_len;
2228	tp->t_rcvtime = ticks;
2229	INP_UNLOCK(tp->t_inpcb);
2230
2231	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2232#ifdef T3_TRACE
2233	T3_TRACE5(TIDTB(sk),
2234		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2235		  "ddp_report 0x%x offset %u, len %u",
2236		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2237		   G_DDP_OFFSET(ddp_report), m->m_len);
2238#endif
2239	CTR5(KTR_TOM,
2240		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2241		  "ddp_report 0x%x offset %u, len %u",
2242		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2243		   G_DDP_OFFSET(ddp_report), m->m_len);
2244
2245	bsp->cur_offset += m->m_len;
2246
2247	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2248		q->cur_buf ^= 1;                     /* flip buffers */
2249		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2250			nomoredata=1;
2251	}
2252
2253#ifdef T3_TRACE
2254	T3_TRACE4(TIDTB(sk),
2255		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2256		  "ddp_report %u offset %u",
2257		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2258		   G_DDP_OFFSET(ddp_report));
2259#endif
2260	CTR4(KTR_TOM,
2261		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2262		  "ddp_report %u offset %u",
2263		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2264		   G_DDP_OFFSET(ddp_report));
2265
2266	m->m_ddp_gl = (unsigned char *)bsp->gl;
2267	m->m_flags |= M_DDP;
2268	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2269	if (bsp->flags & DDP_BF_NOCOPY)
2270		bsp->flags &= ~DDP_BF_NOCOPY;
2271	if (nomoredata)
2272		m->m_ddp_flags |= DDP_BF_NODATA;
2273
2274	SBAPPEND(&so->so_rcv, m);
2275
2276	if ((so->so_state & SS_NOFDREF) == 0)
2277		sorwakeup_locked(so);
2278	else
2279		SOCKBUF_UNLOCK(&so->so_rcv);
2280}
2281
2282/*
2283 * Handler for RX_DDP_COMPLETE CPL messages.
2284 */
2285static int
2286do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2287{
2288	struct toepcb *toep = ctx;
2289
2290	VALIDATE_SOCK(so);
2291#if 0
2292	skb->h.th = tcphdr_skb->h.th;
2293#endif
2294	process_ddp_complete(toep, m);
2295	return (0);
2296}
2297
2298/*
2299 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2300 * socket state before calling tcp_time_wait to comply with its expectations.
2301 */
2302static void
2303enter_timewait(struct socket *so)
2304{
2305	struct tcpcb *tp = sototcpcb(so);
2306
2307	INP_LOCK_ASSERT(tp->t_inpcb);
2308	/*
2309	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2310	 * process peer_close because we don't want to carry the peer FIN in
2311	 * the socket's receive queue and if we increment rcv_nxt without
2312	 * having the FIN in the receive queue we'll confuse facilities such
2313	 * as SIOCINQ.
2314	 */
2315	tp->rcv_nxt++;
2316
2317	tp->ts_recent_age = 0;	     /* defeat recycling */
2318	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2319	tcp_twstart(tp);
2320}
2321
2322/*
2323 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2324 * function deals with the data that may be reported along with the FIN.
2325 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2326 * perform normal FIN-related processing.  In the latter case 1 indicates that
2327 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed,
2328 * 0 that the mbuf can be freed.
2329 */
2330static int
2331handle_peer_close_data(struct socket *so, struct mbuf *m)
2332{
2333	struct tcpcb *tp = sototcpcb(so);
2334	struct toepcb *toep = tp->t_toe;
2335	struct ddp_state *q;
2336	struct ddp_buf_state *bsp;
2337	struct cpl_peer_close *req = cplhdr(m);
2338	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2339
2340	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2341		return (0);
2342
2343	if (__predict_false(so_no_receive(so))) {
2344		handle_excess_rx(toep, m);
2345
2346		/*
2347		 * Although we discard the data we want to process the FIN so
2348		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2349		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2350		 * may be what will close the connection.  We return 1 because
2351		 * handle_excess_rx() already freed the packet.
2352		 */
2353		return (1);
2354	}
2355
2356	INP_LOCK_ASSERT(tp->t_inpcb);
2357	q = &toep->tp_ddp_state;
2358	SOCKBUF_LOCK(&so->so_rcv);
2359	bsp = &q->buf_state[q->cur_buf];
2360	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2361	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2362	m->m_ddp_gl = (unsigned char *)bsp->gl;
2363	m->m_flags |= M_DDP;
2364	m->m_cur_offset = bsp->cur_offset;
2365	m->m_ddp_flags =
2366	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2367	m->m_seq = tp->rcv_nxt;
2368	tp->rcv_nxt = rcv_nxt;
2369	bsp->cur_offset += m->m_pkthdr.len;
2370	if (!(bsp->flags & DDP_BF_NOFLIP))
2371		q->cur_buf ^= 1;
2372	tp->t_rcvtime = ticks;
2373	SBAPPEND(&so->so_rcv, m);
2374	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2375		sorwakeup_locked(so);
2376	else
2377		SOCKBUF_UNLOCK(&so->so_rcv);
2378	return (1);
2379}
2380
2381/*
2382 * Handle a peer FIN.
2383 */
2384static void
2385do_peer_fin(struct socket *so, struct mbuf *m)
2386{
2387	struct tcpcb *tp = sototcpcb(so);
2388	struct toepcb *toep = tp->t_toe;
2389	int keep = 0;
2390	DPRINTF("do_peer_fin state=%d\n", tp->t_state);
2391
2392#ifdef T3_TRACE
2393	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
2394#endif
2395
2396	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2397		printf("abort_pending set\n");
2398
2399		goto out;
2400	}
2401	INP_INFO_WLOCK(&tcbinfo);
2402	INP_LOCK(tp->t_inpcb);
2403	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2404		keep = handle_peer_close_data(so, m);
2405		if (keep < 0) {
2406			INP_INFO_WUNLOCK(&tcbinfo);
2407			INP_UNLOCK(tp->t_inpcb);
2408			return;
2409		}
2410	}
2411	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2412		socantrcvmore(so);
2413		/*
2414		 * If connection is half-synchronized
2415		 * (i.e., TF_NEEDSYN flag on) then delay ACK,
2416		 * so it may be piggybacked when SYN is sent.
2417		 * Otherwise, since we received a FIN then no
2418		 * more input can be expected, send ACK now.
2419		 */
2420		if (tp->t_flags & TF_NEEDSYN)
2421			tp->t_flags |= TF_DELACK;
2422		else
2423			tp->t_flags |= TF_ACKNOW;
2424		tp->rcv_nxt++;
2425	}
2426
2427	switch (tp->t_state) {
2428	case TCPS_SYN_RECEIVED:
2429		tp->t_starttime = ticks;
2430	/* FALLTHROUGH */
2431	case TCPS_ESTABLISHED:
2432		tp->t_state = TCPS_CLOSE_WAIT;
2433		break;
2434	case TCPS_FIN_WAIT_1:
2435		tp->t_state = TCPS_CLOSING;
2436		break;
2437	case TCPS_FIN_WAIT_2:
2438		/*
2439		 * If we've sent an abort_req we must have sent it too late,
2440		 * HW will send us a reply telling us so, and this peer_close
2441		 * is really the last message for this connection and needs to
2442		 * be treated as an abort_rpl, i.e., transition the connection
2443		 * to TCPS_CLOSED (note that the host stack does this at the
2444		 * time of generating the RST but we must wait for HW).
2445		 * Otherwise we enter TIME_WAIT.
2446		 */
2447		t3_release_offload_resources(toep);
2448		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2449			tp = tcp_close(tp);
2450		} else {
2451			enter_timewait(so);
2452		}
2453		break;
2454	default:
2455		log(LOG_ERR,
2456		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2457		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
2458	}
2459	INP_INFO_WUNLOCK(&tcbinfo);
2460	if (tp)
2461		INP_UNLOCK(tp->t_inpcb);
2462
2463	DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
2464
2465#ifdef notyet
2466	/* Do not send POLL_HUP for half duplex close. */
2467	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2468	    sk->sk_state == TCP_CLOSE)
2469		sk_wake_async(so, 1, POLL_HUP);
2470	else
2471		sk_wake_async(so, 1, POLL_IN);
2472#endif
2473
2474out:
2475	if (!keep)
2476		m_free(m);
2477}
2478
2479/*
2480 * Handler for PEER_CLOSE CPL messages.
2481 */
2482static int
2483do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2484{
2485	struct toepcb *toep = (struct toepcb *)ctx;
2486	struct socket *so = toeptoso(toep);
2487
2488	VALIDATE_SOCK(so);
2489
2490	do_peer_fin(so, m);
2491	return (0);
2492}
2493
2494static void
2495process_close_con_rpl(struct socket *so, struct mbuf *m)
2496{
2497	struct tcpcb *tp = sototcpcb(so);
2498	struct cpl_close_con_rpl *rpl = cplhdr(m);
2499	struct toepcb *toep = tp->t_toe;
2500
2501	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2502
2503	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
2504	    !!(so->so_state & SS_NOFDREF));
2505	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
2506		goto out;
2507
2508	INP_INFO_WLOCK(&tcbinfo);
2509	INP_LOCK(tp->t_inpcb);
2510	switch (tp->t_state) {
2511	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2512		t3_release_offload_resources(toep);
2513		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2514			tp = tcp_close(tp);
2515
2516		} else {
2517			enter_timewait(so);
2518			soisdisconnected(so);
2519		}
2520		break;
2521	case TCPS_LAST_ACK:
2522		/*
2523		 * In this state we don't care about pending abort_rpl.
2524		 * If we've sent abort_req it was post-close and was sent too
2525		 * late, this close_con_rpl is the actual last message.
2526		 */
2527		t3_release_offload_resources(toep);
2528		tp = tcp_close(tp);
2529		break;
2530	case TCPS_FIN_WAIT_1:
2531		/*
2532		 * If we can't receive any more
2533		 * data, then closing user can proceed.
2534		 * Starting the timer is contrary to the
2535		 * specification, but if we don't get a FIN
2536		 * we'll hang forever.
2537		 *
2538		 * XXXjl:
2539		 * we should release the tp also, and use a
2540		 * compressed state.
2541		 */
2542		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2543			int timeout;
2544
2545			soisdisconnected(so);
2546			timeout = (tcp_fast_finwait2_recycle) ?
2547			    tcp_finwait2_timeout : tcp_maxidle;
2548			tcp_timer_activate(tp, TT_2MSL, timeout);
2549		}
2550		tp->t_state = TCPS_FIN_WAIT_2;
2551		if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
2552		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2553			tp = tcp_drop(tp, 0);
2554		}
2555
2556		break;
2557	default:
2558		log(LOG_ERR,
2559		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2560		       TOE_DEV(so)->tod_name, toep->tp_tid,
2561		       tp->t_state);
2562	}
2563	INP_INFO_WUNLOCK(&tcbinfo);
2564	if (tp)
2565		INP_UNLOCK(tp->t_inpcb);
2566out:
2567	m_freem(m);
2568}
2569
2570/*
2571 * Handler for CLOSE_CON_RPL CPL messages.
2572 */
2573static int
2574do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2575			    void *ctx)
2576{
2577	struct toepcb *toep = (struct toepcb *)ctx;
2578	struct socket *so = toeptoso(toep);
2579
2580	VALIDATE_SOCK(so);
2581
2582	process_close_con_rpl(so, m);
2583	return (0);
2584}
2585
2586/*
2587 * Process abort replies.  We only process these messages if we anticipate
2588 * them as the coordination between SW and HW in this area is somewhat lacking
2589 * and sometimes we get ABORT_RPLs after we are done with the connection that
2590 * originated the ABORT_REQ.
2591 */
2592static void
2593process_abort_rpl(struct socket *so, struct mbuf *m)
2594{
2595	struct tcpcb *tp = sototcpcb(so);
2596	struct toepcb *toep = tp->t_toe;
2597
2598#ifdef T3_TRACE
2599	T3_TRACE1(TIDTB(sk),
2600		  "process_abort_rpl: GTS rpl pending %d",
2601		  sock_flag(sk, ABORT_RPL_PENDING));
2602#endif
2603
2604	INP_INFO_WLOCK(&tcbinfo);
2605	INP_LOCK(tp->t_inpcb);
2606
2607	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2608		/*
2609		 * XXX panic on tcpdrop
2610		 */
2611		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2612			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2613		else {
2614			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2615			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2616			    !is_t3a(TOE_DEV(so))) {
2617				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2618					panic("TP_ABORT_REQ_RCVD set");
2619				t3_release_offload_resources(toep);
2620				tp = tcp_close(tp);
2621			}
2622		}
2623	}
2624	if (tp)
2625		INP_UNLOCK(tp->t_inpcb);
2626	INP_INFO_WUNLOCK(&tcbinfo);
2627
2628	m_free(m);
2629}
2630
2631/*
2632 * Handle an ABORT_RPL_RSS CPL message.
2633 */
2634static int
2635do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2636{
2637	struct socket *so;
2638	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2639	struct toepcb *toep;
2640
2641	/*
2642	 * Ignore replies to post-close aborts indicating that the abort was
2643	 * requested too late.  These connections are terminated when we get
2644	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2645	 * arrives the TID is either no longer used or it has been recycled.
2646	 */
2647	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2648discard:
2649		m_free(m);
2650		return (0);
2651	}
2652
2653	toep = (struct toepcb *)ctx;
2654
2655        /*
2656	 * Sometimes we've already closed the socket, e.g., a post-close
2657	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2658	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2659	 * but FW turns the ABORT_REQ into a regular one and so we get
2660	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2661	 */
2662	if (!toep)
2663		goto discard;
2664
2665	if (toep->tp_tp == NULL) {
2666		printf("removing tid for abort\n");
2667		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2668		if (toep->tp_l2t)
2669			l2t_release(L2DATA(cdev), toep->tp_l2t);
2670
2671		toepcb_release(toep);
2672		goto discard;
2673	}
2674
2675	printf("toep=%p\n", toep);
2676	printf("tp=%p\n", toep->tp_tp);
2677
2678	so = toeptoso(toep); /* <- XXX panic */
2679	toepcb_hold(toep);
2680	process_abort_rpl(so, m);
2681	toepcb_release(toep);
2682	return (0);
2683}
2684
2685/*
2686 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2687 * indicate whether RST should be sent in response.
2688 */
2689static int
2690abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2691{
2692	struct tcpcb *tp = sototcpcb(so);
2693
2694	switch (abort_reason) {
2695	case CPL_ERR_BAD_SYN:
2696#if 0
2697		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2698#endif
2699	case CPL_ERR_CONN_RESET:
2700		// XXX need to handle SYN_RECV due to crossed SYNs
2701		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2702	case CPL_ERR_XMIT_TIMEDOUT:
2703	case CPL_ERR_PERSIST_TIMEDOUT:
2704	case CPL_ERR_FINWAIT2_TIMEDOUT:
2705	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2706#if 0
2707		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2708#endif
2709		return (ETIMEDOUT);
2710	default:
2711		return (EIO);
2712	}
2713}
2714
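/*
 * Fill in an ABORT_RPL work request for the given TID, carrying cmd as the
 * RST disposition.
 */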
2715static inline void
2716set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2717{
2718	struct cpl_abort_rpl *rpl = cplhdr(m);
2719
2720	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2721	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2722	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2723
2724	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2725	rpl->cmd = cmd;
2726}
2727
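/*
 * Deferred-reply handler: sends the ABORT_RPL that send_abort_rpl() could not
 * allocate an mbuf for.  The RST disposition was stashed in req->status.
 */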
2728static void
2729send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2730{
2731	struct mbuf *reply_mbuf;
2732	struct cpl_abort_req_rss *req = cplhdr(m);
2733
2734	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2735	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2736	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2737	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2738	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2739	m_free(m);
2740}
2741
2742/*
2743 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2744 */
2745static inline int
2746is_neg_adv_abort(unsigned int status)
2747{
2748	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2749	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2750}
2751
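/*
 * Reply to an ABORT_REQ_RSS with an ABORT_RPL carrying rst_status.  If no
 * mbuf is available right now the reply is deferred until one is.
 */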
2752static void
2753send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2754{
2755	struct mbuf  *reply_mbuf;
2756	struct cpl_abort_req_rss *req = cplhdr(m);
2757
2758	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2759
2760	if (!reply_mbuf) {
2761		/* Defer the reply.  Stick rst_status into req->status. */
2762		req->status = rst_status;
2763		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2764		return;
2765	}
2766
2767	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2768	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2769	m_free(m);
2770
2771	/*
2772	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2773	 * these messages while ARP is pending.  For other connection states
2774	 * it's not a problem.
2775	 */
2776	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2777}
2778
2779#ifdef notyet
2780static void
2781cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2782{
2783	UNIMPLEMENTED();
2784#ifdef notyet
2785	struct request_sock *req = child->sk_user_data;
2786
2787	inet_csk_reqsk_queue_removed(parent, req);
2788	synq_remove(tcp_sk(child));
2789	__reqsk_free(req);
2790	child->sk_user_data = NULL;
2791#endif
2792}
2793
2794
2795/*
2796 * Performs the actual work to abort a SYN_RECV connection.
2797 */
2798static void
2799do_abort_syn_rcv(struct socket *child, struct socket *parent)
2800{
2801	struct tcpcb *parenttp = sototcpcb(parent);
2802	struct tcpcb *childtp = sototcpcb(child);
2803
2804	/*
2805	 * If the server is still open we clean up the child connection,
2806	 * otherwise the server already did the clean up as it was purging
2807	 * its SYN queue and the skb was just sitting in its backlog.
2808	 */
2809	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2810		cleanup_syn_rcv_conn(child, parent);
2811		INP_INFO_WLOCK(&tcbinfo);
2812		INP_LOCK(childtp->t_inpcb);
2813		t3_release_offload_resources(childtp->t_toe);
2814		childtp = tcp_close(childtp);
2815		INP_INFO_WUNLOCK(&tcbinfo);
2816		if (childtp)
2817			INP_UNLOCK(childtp->t_inpcb);
2818	}
2819}
2820#endif
2821
2822/*
2823 * Handle abort requests for a SYN_RECV connection.  These need extra work
2824 * because the socket is on its parent's SYN queue.
2825 */
2826static int
2827abort_syn_rcv(struct socket *so, struct mbuf *m)
2828{
2829	UNIMPLEMENTED();
2830#ifdef notyet
2831	struct socket *parent;
2832	struct toedev *tdev = TOE_DEV(so);
2833	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2834	struct socket *oreq = so->so_incomp;
2835	struct t3c_tid_entry *t3c_stid;
2836	struct tid_info *t;
2837
2838	if (!oreq)
2839		return -1;        /* somehow we are not on the SYN queue */
2840
2841	t = &(T3C_DATA(cdev))->tid_maps;
2842	t3c_stid = lookup_stid(t, oreq->ts_recent);
2843	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2844
2845	SOCK_LOCK(parent);
2846	do_abort_syn_rcv(so, parent);
2847	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2848	SOCK_UNLOCK(parent);
2849#endif
2850	return (0);
2851}
2852
2853/*
2854 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2855 * request except that we need to reply to it.
2856 */
2857static void
2858process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2859{
2860	int rst_status = CPL_ABORT_NO_RST;
2861	const struct cpl_abort_req_rss *req = cplhdr(m);
2862	struct tcpcb *tp = sototcpcb(so);
2863	struct toepcb *toep = tp->t_toe;
2864
2865	INP_LOCK(tp->t_inpcb);
2866	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2867		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2868		m_free(m);
2869		goto skip;
2870	}
2871
2872	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2873	/*
2874	 * Three cases to consider:
2875	 * a) We haven't sent an abort_req; close the connection.
2876	 * b) We have sent a post-close abort_req that will get to TP too late
2877	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2878	 *    be ignored and the connection should be closed now.
2879	 * c) We have sent a regular abort_req that will get to TP too late.
2880	 *    That will generate an abort_rpl with status 0, wait for it.
2881	 */
2882	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2883	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2884		so->so_error = abort_status_to_errno(so, req->status,
2885		    &rst_status);
2886		if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2887			sorwakeup(so);
2888		/*
2889		 * SYN_RECV needs special processing.  If abort_syn_rcv()
2890		 * returns 0 it has taken care of the abort.
2891		 */
2892		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2893			goto skip;
2894
2895		t3_release_offload_resources(toep);
2896		tp = tcp_close(tp);
2897	}
2898	if (tp)
2899		INP_UNLOCK(tp->t_inpcb);
2900	send_abort_rpl(m, tdev, rst_status);
2901	return;
2902
2903skip:
2904	INP_UNLOCK(tp->t_inpcb);
2905}
2906
2907/*
2908 * Handle an ABORT_REQ_RSS CPL message.
2909 */
2910static int
2911do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2912{
2913	const struct cpl_abort_req_rss *req = cplhdr(m);
2914	struct toepcb *toep = (struct toepcb *)ctx;
2915	struct socket *so;
2916	struct inpcb *inp;
2917
2918	if (is_neg_adv_abort(req->status)) {
2919		m_free(m);
2920		return (0);
2921	}
2922
2923	printf("aborting tid=%d\n", toep->tp_tid);
2924
2925	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2926		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2927		toep->tp_flags |= TP_ABORT_REQ_RCVD;
2928		printf("sending abort rpl\n");
2929
2930		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2931		printf("sent\n");
2932		if (toep->tp_l2t)
2933			l2t_release(L2DATA(cdev), toep->tp_l2t);
2934
2935		/*
2936		 *  Unhook
2937		 */
2938		toep->tp_tp->t_toe = NULL;
2939		toep->tp_tp->t_flags &= ~TF_TOE;
2940		toep->tp_tp = NULL;
2941		/*
2942		 * XXX need to call syncache_chkrst - but we don't
2943		 * have a way of doing that yet
2944		 */
2945		toepcb_release(toep);
2946		printf("abort for unestablished connection :-(\n");
2947		return (0);
2948	}
2949	if (toep->tp_tp == NULL) {
2950		printf("disconnected toepcb\n");
2951		/* should be freed momentarily */
2952		return (0);
2953	}
2954
2955	so = toeptoso(toep);
2956	inp = sotoinpcb(so);
2957
2958	VALIDATE_SOCK(so);
2959	toepcb_hold(toep);
2960	INP_INFO_WLOCK(&tcbinfo);
2961	process_abort_req(so, m, TOE_DEV(so));
2962	INP_INFO_WUNLOCK(&tcbinfo);
2963	toepcb_release(toep);
2964	return (0);
2965}
2966#ifdef notyet
2967static void
2968pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2969{
2970	struct toedev *tdev = TOE_DEV(parent);
2971
2972	do_abort_syn_rcv(child, parent);
2973	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2974		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2975
2976		rpl->opt0h = htonl(F_TCAM_BYPASS);
2977		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2978		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2979	} else
2980		m_free(m);
2981}
2982#endif
2983static void
2984handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
2985{
2986	UNIMPLEMENTED();
2987
2988#ifdef notyet
2989	struct t3cdev *cdev;
2990	struct socket *parent;
2991	struct socket *oreq;
2992	struct t3c_tid_entry *t3c_stid;
2993	struct tid_info *t;
2994	struct tcpcb *otp, *tp = sototcpcb(so);
2995	struct toepcb *toep = tp->t_toe;
2996
2997	/*
2998	 * If the connection is being aborted due to the parent listening
2999	 * socket going away there's nothing to do, the ABORT_REQ will close
3000	 * the connection.
3001	 */
3002	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3003		m_free(m);
3004		return;
3005	}
3006
3007	oreq = so->so_incomp;
3008	otp = sototcpcb(oreq);
3009
3010	cdev = T3C_DEV(so);
3011	t = &(T3C_DATA(cdev))->tid_maps;
3012	t3c_stid = lookup_stid(t, otp->ts_recent);
3013	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3014
3015	SOCK_LOCK(parent);
3016	pass_open_abort(so, parent, m);
3017	SOCK_UNLOCK(parent);
3018#endif
3019}
3020
3021/*
3022 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3023 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3024 * connection.
3025 */
3026static void
3027pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3028{
3029
3030#ifdef notyet
3031	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3032	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3033#endif
3034	handle_pass_open_arp_failure(m_get_socket(m), m);
3035}
3036
3037/*
3038 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3039 */
3040static void
3041mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3042{
3043	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3044	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3045	unsigned int tid = GET_TID(req);
3046
3047	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3048	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3049	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3050	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3051	rpl->opt0h = htonl(F_TCAM_BYPASS);
3052	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3053	rpl->opt2 = 0;
3054	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3055}
3056
3057/*
3058 * Send a deferred reject to an accept request.
3059 */
3060static void
3061reject_pass_request(struct toedev *tdev, struct mbuf *m)
3062{
3063	struct mbuf *reply_mbuf;
3064
3065	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3066	mk_pass_accept_rpl(reply_mbuf, m);
3067	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3068	m_free(m);
3069}
3070
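/*
 * Callback from the syncache for events concerning an embryonic offloaded
 * connection.  In both cases the syncache is done with the entry, so drop
 * the reference held on the toepcb.
 */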
3071static void
3072handle_syncache_event(int event, void *arg)
3073{
3074	struct toepcb *toep = arg;
3075
3076	switch (event) {
3077	case TOE_SC_ENTRY_PRESENT:
3078		/*
3079		 * entry already exists - free toepcb
3080		 * and l2t
3081		 */
3082		printf("syncache entry present\n");
3083		toepcb_release(toep);
3084		break;
3085	case TOE_SC_DROP:
3086		/*
3087		 * The syncache has given up on this entry
3088		 * either it timed out, or it was evicted
3089		 * we need to explicitly release the tid
3090		 */
3091		printf("syncache entry dropped\n");
3092		toepcb_release(toep);
3093		break;
3094	default:
3095		log(LOG_ERR, "unknown syncache event %d\n", event);
3096		break;
3097	}
3098}
3099
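/*
 * Construct the in_conninfo, tcpopt, and tcphdr that describe a
 * CPL_PASS_ACCEPT_REQ and enter the embryonic connection into the host
 * stack's syncache on behalf of the listening socket.
 */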
3100static void
3101syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3102{
3103	struct in_conninfo inc;
3104	struct tcpopt to;
3105	struct tcphdr th;
3106	struct inpcb *inp;
3107	int mss, wsf, sack, ts;
3108	uint32_t rcv_isn = ntohl(req->rcv_isn);
3109
3110	bzero(&to, sizeof(struct tcpopt));
3111	inp = sotoinpcb(lso);
3112
3113	/*
3114	 * Fill out information for entering us into the syncache
3115	 */
3116	inc.inc_fport = th.th_sport = req->peer_port;
3117	inc.inc_lport = th.th_dport = req->local_port;
3118	th.th_seq = req->rcv_isn;
3119	th.th_flags = TH_SYN;
3120
3121	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3122
3123
3124	inc.inc_isipv6 = 0;
3125	inc.inc_len = 0;
3126	inc.inc_faddr.s_addr = req->peer_ip;
3127	inc.inc_laddr.s_addr = req->local_ip;
3128
3129	DPRINTF("syncache add of %d:%d %d:%d\n",
3130	    ntohl(req->local_ip), ntohs(req->local_port),
3131	    ntohl(req->peer_ip), ntohs(req->peer_port));
3132
3133	mss = req->tcp_options.mss;
3134	wsf = req->tcp_options.wsf;
3135	ts = req->tcp_options.tstamp;
3136	sack = req->tcp_options.sack;
3137	to.to_mss = mss;
3138	to.to_wscale = wsf;
3139	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3140	INP_INFO_WLOCK(&tcbinfo);
3141	INP_LOCK(inp);
3142	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3143}
3144
3145
3146/*
3147 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3148 * lock held.  Note that the sock here is a listening socket that is not owned
3149 * by the TOE.
3150 */
3151static void
3152process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3153    struct listen_ctx *lctx)
3154{
3155	int rt_flags;
3156	struct l2t_entry *e;
3157	struct iff_mac tim;
3158	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3159	struct cpl_pass_accept_rpl *rpl;
3160	struct cpl_pass_accept_req *req = cplhdr(m);
3161	unsigned int tid = GET_TID(req);
3162	struct tom_data *d = TOM_DATA(tdev);
3163	struct t3cdev *cdev = d->cdev;
3164	struct tcpcb *tp = sototcpcb(so);
3165	struct toepcb *newtoep;
3166	struct rtentry *dst;
3167	struct sockaddr_in nam;
3168	struct t3c_data *td = T3C_DATA(cdev);
3169
3170	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3171	if (__predict_false(reply_mbuf == NULL)) {
3172		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3173			t3_defer_reply(m, tdev, reject_pass_request);
3174		else {
3175			cxgb_queue_tid_release(cdev, tid);
3176			m_free(m);
3177		}
3178		DPRINTF("failed to get reply_mbuf\n");
3179
3180		goto out;
3181	}
3182
3183	if (tp->t_state != TCPS_LISTEN) {
3184		DPRINTF("socket not in listen state\n");
3185
3186		goto reject;
3187	}
3188
3189	tim.mac_addr = req->dst_mac;
3190	tim.vlan_tag = ntohs(req->vlan_tag);
3191	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3192		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3193		goto reject;
3194	}
3195
3196#ifdef notyet
3197	/*
3198	 * XXX do route lookup to confirm that we're still listening on this
3199	 * address
3200	 */
3201	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3202			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3203		goto reject;
3204	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3205		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3206	dst_release(skb->dst);	// done with the input route, release it
3207	skb->dst = NULL;
3208
3209	if ((rt_flags & RTF_LOCAL) == 0)
3210		goto reject;
3211#endif
3212	/*
3213	 * XXX
3214	 */
3215	rt_flags = RTF_LOCAL;
3216	if ((rt_flags & RTF_LOCAL) == 0)
3217		goto reject;
3218
3219	/*
3220	 * Calculate values and add to syncache
3221	 */
3222
3223	newtoep = toepcb_alloc();
3224	if (newtoep == NULL)
3225		goto reject;
3226
3227	bzero(&nam, sizeof(struct sockaddr_in));
3228
3229	nam.sin_len = sizeof(struct sockaddr_in);
3230	nam.sin_family = AF_INET;
3231	nam.sin_addr.s_addr = req->peer_ip;
3232	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3233
3234	if (dst == NULL) {
3235		printf("failed to find route\n");
3236		goto reject;
3237	}
3238	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3239	    (struct sockaddr *)&nam);
3240	if (e == NULL) {
3241		DPRINTF("failed to get l2t\n");
		goto reject;
3242	}
3243	/*
3244	 * Point to our listen socket until accept
3245	 */
3246	newtoep->tp_tp = tp;
3247	newtoep->tp_flags = TP_SYN_RCVD;
3248	newtoep->tp_tid = tid;
3249	newtoep->tp_toedev = tdev;
3250	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3251
3252	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3253	SOCK_LOCK(so);
3254	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3255	SOCK_UNLOCK(so);
3256
3257	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
3258		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3259
3260	if (newtoep->tp_ulp_mode) {
3261		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3262
3263		if (ddp_mbuf == NULL)
3264			newtoep->tp_ulp_mode = 0;
3265	}
3266
3267	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3268	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3269	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3270	/*
3271	 * XXX workaround for lack of syncache drop
3272	 */
3273	toepcb_hold(newtoep);
3274	syncache_add_accept_req(req, so, newtoep);
3275
3276	rpl = cplhdr(reply_mbuf);
3277	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3278	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3279	rpl->wr.wr_lo = 0;
3280	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3281	rpl->opt2 = htonl(calc_opt2(so, tdev));
3282	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3283	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3284
3285	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3286	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3287	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3288				  CPL_PASS_OPEN_ACCEPT);
3289
3290	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3291
3292	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3293
3294	l2t_send(cdev, reply_mbuf, e);
3295	m_free(m);
3296	if (newtoep->tp_ulp_mode) {
3297		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3298				V_TF_DDP_OFF(1) |
3299				TP_DDP_TIMER_WORKAROUND_MASK,
3300				V_TF_DDP_OFF(1) |
3301		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3302	} else
3303		printf("not offloading\n");
3304
3307	return;
3308reject:
3309	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3310		mk_pass_accept_rpl(reply_mbuf, m);
3311	else
3312		mk_tid_release(reply_mbuf, newtoep, tid);
3313	cxgb_ofld_send(cdev, reply_mbuf);
3314	m_free(m);
3315out:
3316#if 0
3317	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3318#else
3319	return;
3320#endif
3321}
3322
3323/*
3324 * Handle a CPL_PASS_ACCEPT_REQ message.
3325 */
3326static int
3327do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3328{
3329	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3330	struct socket *lso = listen_ctx->lso;
3331	struct tom_data *d = listen_ctx->tom_data;
3332
3333#if VALIDATE_TID
3334	struct cpl_pass_accept_req *req = cplhdr(m);
3335	unsigned int tid = GET_TID(req);
3336	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3337
3338	if (unlikely(!lsk)) {
3339		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3340		       cdev->name,
3341		       (unsigned long)((union listen_entry *)ctx -
3342					t->stid_tab));
3343		return CPL_RET_BUF_DONE;
3344	}
3345	if (unlikely(tid >= t->ntids)) {
3346		printk(KERN_ERR "%s: passive open TID %u too large\n",
3347		       cdev->name, tid);
3348		return CPL_RET_BUF_DONE;
3349	}
3350	/*
3351	 * For T3A the current user of the TID may have closed but its last
3352	 * message(s) may have been backlogged so the TID appears to be still
3353	 * in use.  Just take the TID away, the connection can close at its
3354	 * own leisure.  For T3B this situation is a bug.
3355	 */
3356	if (!valid_new_tid(t, tid) &&
3357	    cdev->type != T3A) {
3358		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3359		       cdev->name, tid);
3360		return CPL_RET_BUF_DONE;
3361	}
3362#endif
3363
3364	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3365	return (0);
3366}
3367
3368/*
3369 * Called when a connection is established to translate the TCP options
3370 * reported by HW to FreeBSD's native format.
3371 */
3372static void
3373assign_rxopt(struct socket *so, unsigned int opt)
3374{
3375	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
3376	struct tcpcb *tp = sototcpcb(so);
3377	struct toepcb *toep = tp->t_toe;
3378
3379	INP_LOCK_ASSERT(tp->t_inpcb);
3380
3381	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3382	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3383	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3384	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3385	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3386	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3387		tp->rcv_scale = tp->request_r_scale;
3388}
3389
3390/*
3391 * Completes some final bits of initialization for just established connections
3392 * and changes their state to TCP_ESTABLISHED.
3393 *
3394 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3395 */
3396static void
3397make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3398{
3399	struct tcpcb *tp = sototcpcb(so);
3400	struct toepcb *toep = tp->t_toe;
3401
3402	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3403	assign_rxopt(so, opt);
3404	so->so_proto->pr_ctloutput = t3_ctloutput;
3405
3406#if 0
3407	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3408#endif
3409	/*
3410	 * XXX not clear what rcv_wup maps to
3411	 */
3412	/*
3413	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3414	 * pass through opt0.
3415	 */
3416	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3417		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3418
3419	dump_toepcb(toep);
3420
3421#ifdef notyet
3422/*
3423 * no clean interface for marking ARP up to date
3424 */
3425	dst_confirm(sk->sk_dst_cache);
3426#endif
3427	tp->t_starttime = ticks;
3428	tp->t_state = TCPS_ESTABLISHED;
3429	soisconnected(so);
3430}
3431
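/*
 * Rebuild the in_conninfo/tcpopt/tcphdr triple for a CPL_PASS_ESTABLISH and
 * ask the syncache to expand the embryonic connection into a full socket.
 */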
3432static int
3433syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3434{
3435
3436	struct in_conninfo inc;
3437	struct tcpopt to;
3438	struct tcphdr th;
3439	int mss, wsf, sack, ts;
3440	struct mbuf *m = NULL;
3441	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3442	unsigned int opt;
3443
3444#ifdef MAC
3445#error	"no MAC support"
3446#endif
3447
3448	opt = ntohs(req->tcp_opt);
3449
3450	bzero(&to, sizeof(struct tcpopt));
3451
3452	/*
3453	 * Fill out information for entering us into the syncache
3454	 */
3455	inc.inc_fport = th.th_sport = req->peer_port;
3456	inc.inc_lport = th.th_dport = req->local_port;
3457	th.th_seq = req->rcv_isn;
3458	th.th_flags = TH_ACK;
3459
3460	inc.inc_isipv6 = 0;
3461	inc.inc_len = 0;
3462	inc.inc_faddr.s_addr = req->peer_ip;
3463	inc.inc_laddr.s_addr = req->local_ip;
3464
3465	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3466	wsf  = G_TCPOPT_WSCALE_OK(opt);
3467	ts   = G_TCPOPT_TSTAMP(opt);
3468	sack = G_TCPOPT_SACK(opt);
3469
3470	to.to_mss = mss;
3471	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3472	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3473
3474	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3475	    ntohl(req->local_ip), ntohs(req->local_port),
3476	    ntohl(req->peer_ip), ntohs(req->peer_port),
3477	    mss, wsf, ts, sack);
3478	return syncache_expand(&inc, &to, &th, so, m);
3479}
3480
3481
3482/*
3483 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3484 * if we are in TCP_SYN_RECV due to crossed SYNs
3485 */
3486static int
3487do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3488{
3489	struct cpl_pass_establish *req = cplhdr(m);
3490	struct toepcb *toep = (struct toepcb *)ctx;
3491	struct tcpcb *tp;
3492	struct socket *so, *lso;
3493	struct t3c_data *td = T3C_DATA(cdev);
3494	// Complete socket initialization now that we have the SND_ISN
3495
3496	struct toedev *tdev;
3497
3498	so = lso = toeptoso(toep);
3499	tdev = toep->tp_toedev;
3500
3501	SOCK_LOCK(so);
3502	LIST_REMOVE(toep, synq_entry);
3503	SOCK_UNLOCK(so);
3504
3505	INP_INFO_WLOCK(&tcbinfo);
3506	if (!syncache_expand_establish_req(req, &so, toep)) {
3507		/*
3508		 * No entry
3509		 */
3510		UNIMPLEMENTED();
3511	}
3512	if (so == NULL) {
3513		/*
3514		 * Couldn't create the socket
3515		 */
3516		UNIMPLEMENTED();
3517	}
3518
3519	/*
3520	 * XXX workaround for lack of syncache drop
3521	 */
3522	toepcb_release(toep);
3523
3524	tp = sototcpcb(so);
3525	INP_LOCK(tp->t_inpcb);
3526
3527	so->so_snd.sb_flags |= SB_NOCOALESCE;
3528	so->so_rcv.sb_flags |= SB_NOCOALESCE;
3529
3530	toep->tp_tp = tp;
3531	toep->tp_flags = 0;
3532	tp->t_toe = toep;
3533	reset_wr_list(toep);
3534	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3535	tp->rcv_nxt = toep->tp_copied_seq;
3536	install_offload_ops(so);
3537
3538	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3539	toep->tp_wr_unacked = 0;
3540	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3541	toep->tp_qset_idx = 0;
3542	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3543
3544	/*
3545	 * XXX Cancel any keep alive timer
3546	 */
3547
3548	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3549	INP_INFO_WUNLOCK(&tcbinfo);
3550	INP_UNLOCK(tp->t_inpcb);
3551
3552	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3553	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3554#ifdef notyet
3555	/*
3556	 * XXX not sure how these checks map to us
3557	 */
3558	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3559		sk->sk_state_change(sk);
3560		sk_wake_async(so, 0, POLL_OUT);
3561	}
3562	/*
3563	 * The state for the new connection is now up to date.
3564	 * Next check if we should add the connection to the parent's
3565	 * accept queue.  When the parent closes it resets connections
3566	 * on its SYN queue, so check if we are being reset.  If so we
3567	 * don't need to do anything more, the coming ABORT_RPL will
3568	 * destroy this socket.  Otherwise move the connection to the
3569	 * accept queue.
3570	 *
3571	 * Note that we reset the synq before closing the server so if
3572	 * we are not being reset the stid is still open.
3573	 */
3574	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3575		__kfree_skb(skb);
3576		goto unlock;
3577	}
3578#endif
3579	m_free(m);
3580
3581	return (0);
3582}
3583
3584/*
3585 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3586 * and send them to the TOE.
3587 */
3588static void
3589fixup_and_send_ofo(struct socket *so)
3590{
3591	struct mbuf *m;
3592	struct toedev *tdev = TOE_DEV(so);
3593	struct tcpcb *tp = sototcpcb(so);
3594	struct toepcb *toep = tp->t_toe;
3595	unsigned int tid = toep->tp_tid;
3596
3597	printf("fixup_and_send_ofo\n");
3598
3599	INP_LOCK_ASSERT(tp->t_inpcb);
3600	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3601		/*
3602		 * A variety of messages can be waiting but the fields we'll
3603		 * be touching are common to all so any message type will do.
3604		 */
3605		struct cpl_close_con_req *p = cplhdr(m);
3606
3607		p->wr.wr_lo = htonl(V_WR_TID(tid));
3608		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3609		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3610	}
3611}
3612
3613/*
3614 * Updates socket state from an active establish CPL message.  Runs with the
3615 * socket lock held.
3616 */
3617static void
3618socket_act_establish(struct socket *so, struct mbuf *m)
3619{
3620	struct cpl_act_establish *req = cplhdr(m);
3621	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3622	struct tcpcb *tp = sototcpcb(so);
3623	struct toepcb *toep = tp->t_toe;
3624
3625	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3626		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3627		    toep->tp_tid, tp->t_state);
3628
3629	tp->ts_recent_age = ticks;
3630	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3631	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3632
3633	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3634
3635	/*
3636	 * Now that we finally have a TID send any CPL messages that we had to
3637	 * defer for lack of a TID.
3638	 */
3639	if (mbufq_len(&toep->out_of_order_queue))
3640		fixup_and_send_ofo(so);
3641
3642	if (__predict_false(so->so_state & SS_NOFDREF)) {
3643		/*
3644		 * XXX does this even make sense?
3645		 */
3646		sorwakeup(so);
3647	}
3648	m_free(m);
3649#ifdef notyet
3650/*
3651 * XXX assume no write requests permitted while socket connection is
3652 * incomplete
3653 */
3654	/*
3655	 * Currently the send queue must be empty at this point because the
3656	 * socket layer does not send anything before a connection is
3657	 * established.  To be future proof though we handle the possibility
3658	 * that there are pending buffers to send (either TX_DATA or
3659	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3660	 * buffers according to the just learned write_seq, and then we send
3661	 * them on their way.
3662	 */
3663	fixup_pending_writeq_buffers(sk);
3664	if (t3_push_frames(so, 1))
3665		sk->sk_write_space(sk);
3666#endif
3667
3668	toep->tp_state = tp->t_state;
3669	tcpstat.tcps_connects++;
3670
3671}
3672
3673/*
3674 * Process a CPL_ACT_ESTABLISH message.
3675 */
3676static int
3677do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3678{
3679	struct cpl_act_establish *req = cplhdr(m);
3680	unsigned int tid = GET_TID(req);
3681	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3682	struct toepcb *toep = (struct toepcb *)ctx;
3683	struct tcpcb *tp = toep->tp_tp;
3684	struct socket *so;
3685	struct toedev *tdev;
3686	struct tom_data *d;
3687
3688	if (tp == NULL) {
3689		free_atid(cdev, atid);
3690		return (0);
3691	}
3692
3693	so = toeptoso(toep);
3694	tdev = TOE_DEV(so); /* blow up here if link was down */
3695	d = TOM_DATA(tdev);
3696
3697	INP_LOCK(tp->t_inpcb);
3698
3699	/*
3700	 * It's OK if the TID is currently in use, the owning socket may have
3701	 * backlogged its last CPL message(s).  Just take it away.
3702	 */
3703	toep->tp_tid = tid;
3704	toep->tp_tp = tp;
3705	so_insert_tid(d, so, tid);
3706	free_atid(cdev, atid);
3707	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3708
3709	socket_act_establish(so, m);
3710	INP_UNLOCK(tp->t_inpcb);
3711	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3712	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3713
3714	return (0);
3715}
3716
3717/*
3718 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3719 * next batch of work requests from the write queue.
3720 */
3721static void
3722wr_ack(struct toepcb *toep, struct mbuf *m)
3723{
3724	struct tcpcb *tp = toep->tp_tp;
3725	struct cpl_wr_ack *hdr = cplhdr(m);
3726	struct socket *so = toeptoso(toep);
3727	unsigned int credits = ntohs(hdr->credits);
3728	u32 snd_una = ntohl(hdr->snd_una);
3729	int bytes = 0;
3730
3731	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3732
3733	INP_LOCK(tp->t_inpcb);
3734
3735	toep->tp_wr_avail += credits;
3736	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3737		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3738
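	/*
	 * Each queued WR mbuf stores its credit cost in m_pkthdr.csum_data
	 * and its payload size in m_pkthdr.len.  Retire fully-acked WRs and
	 * tally their bytes; a partially-acked WR just has its cost reduced.
	 */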
3739	while (credits) {
3740		struct mbuf *p = peek_wr(toep);
3741
3742		if (__predict_false(!p)) {
3743			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3744			    "nothing pending, state %u wr_avail=%u\n",
3745			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3746			break;
3747		}
3748		CTR2(KTR_TOM,
3749			"wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
3750
3751		KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
3752		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3753
3754#if DEBUG_WR > 1
3755			struct tx_data_wr *w = cplhdr(p);
3756			log(LOG_ERR,
3757			       "TID %u got %u WR credits, need %u, len %u, "
3758			       "main body %u, frags %u, seq # %u, ACK una %u,"
3759			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3760			       toep->tp_tid, credits, p->csum, p->len,
3761			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3762			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3763			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3764#endif
3765			p->m_pkthdr.csum_data -= credits;
3766			break;
3767		} else {
3768			dequeue_wr(toep);
3769			credits -= p->m_pkthdr.csum_data;
3770			bytes += p->m_pkthdr.len;
3771			CTR3(KTR_TOM,
3772			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3773			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3774
3775			m_free(p);
3776		}
3777	}
3778
3779#if DEBUG_WR
3780	check_wr_invariants(tp);
3781#endif
3782
3783	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3784#if VALIDATE_SEQ
3785		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3786
3787		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3788		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3789		    toep->tp_tid, tp->snd_una);
3790#endif
3791		goto out_free;
3792	}
3793
3794	if (tp->snd_una != snd_una) {
3795		tp->snd_una = snd_una;
3796		tp->ts_recent_age = ticks;
3797#ifdef notyet
3798		/*
3799		 * Keep ARP entry "minty fresh"
3800		 */
3801		dst_confirm(sk->sk_dst_cache);
3802#endif
3803		if (tp->snd_una == tp->snd_nxt)
3804			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3805	}
3806	if (bytes) {
3807		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3808		SOCKBUF_LOCK(&so->so_snd);
3809		sbdrop_locked(&so->so_snd, bytes);
3810		sowwakeup_locked(so);
3811	}
3812
3813	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3814		t3_push_frames(so, 0);
3815
3816out_free:
3817	INP_UNLOCK(tp->t_inpcb);
3818	m_free(m);
3819}
3820
3821/*
3822 * Handler for TX_DATA_ACK CPL messages.
3823 */
3824static int
3825do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3826{
3827	struct toepcb *toep = (struct toepcb *)ctx;
3828
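	/* XXX: no "so" is in scope here; VALIDATE_SOCK() evidently expands
	 * to nothing in normal builds, otherwise this would not compile. */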
3829	VALIDATE_SOCK(so);
3830
3831	wr_ack(toep, m);
3832	return (0);
3833}
3834
3835/*
3836 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
3837 */
3838static int
3839do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
3840{
3841	m_freem(m);
3842	return (0);
3843}
3844
3845/*
3846 * Reset a connection that is on a listener's SYN queue or accept queue,
3847 * i.e., one that has not had a struct socket associated with it.
3848 * Must be called from process context.
3849 *
3850 * Modeled after code in inet_csk_listen_stop().
3851 */
3852static void
3853t3_reset_listen_child(struct socket *child)
3854{
3855	struct tcpcb *tp = sototcpcb(child);
3856
3857	t3_send_reset(tp->t_toe);
3858}
3859
3860/*
3861 * Disconnect offloaded established but not yet accepted connections sitting
3862 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
3863 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3864 */
3865void
3866t3_disconnect_acceptq(struct socket *listen_so)
3867{
3868	struct socket *so;
3869	struct tcpcb *tp;
3870
3871	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3872		tp = sototcpcb(so);
3873
3874		if (tp->t_flags & TF_TOE) {
3875			INP_LOCK(tp->t_inpcb);
3876			t3_reset_listen_child(so);
3877			INP_UNLOCK(tp->t_inpcb);
3878		}
3880	}
3881}
3882
3883/*
3884 * Reset offloaded connections sitting on a server's syn queue.  As above
3885 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3886 */
3888void
3889t3_reset_synq(struct listen_ctx *lctx)
3890{
3891	struct toepcb *toep;
3892
3893	SOCK_LOCK(lctx->lso);
3894	while (!LIST_EMPTY(&lctx->synq_head)) {
3895		toep = LIST_FIRST(&lctx->synq_head);
3896		LIST_REMOVE(toep, synq_entry);
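		/* These embryonic connections never got a socket/tcpcb of
		 * their own; clear tp_tp before the reset, presumably so the
		 * reset path does not touch a tcpcb it does not own. */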
3897		toep->tp_tp = NULL;
3898		t3_send_reset(toep);
3899		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3900		toepcb_release(toep);
3901	}
3902	SOCK_UNLOCK(lctx->lso);
3903}
3904
3905
3906int
3907t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
3908		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
3909		   unsigned int pg_off, unsigned int color)
3910{
3911	unsigned int i, j, pidx;
3912	struct pagepod *p;
3913	struct mbuf *m;
3914	struct ulp_mem_io *req;
3915	struct tcpcb *tp = sototcpcb(so);
3916	struct toepcb *toep = tp->t_toe;
3917	unsigned int tid = toep->tp_tid;
3918	const struct tom_data *td = TOM_DATA(TOE_DEV(so));
3919	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
3920
3921	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
3922	    gl, nppods, tag, maxoff, pg_off, color);
3923
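	/*
	 * Write the pagepods to adapter memory, one ULP_MEM_WRITE work
	 * request per pod.  Each pod carries 5 page pointers but pidx only
	 * advances by 4 per pod, so consecutive pods overlap by one page;
	 * the trailing NUM_SENTINEL_PPODS pods are written as invalid.
	 */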
3924	for (i = 0; i < nppods; ++i) {
3925		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
3926		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
3927		req = mtod(m, struct ulp_mem_io *);
3928		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
3929		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
3930		req->wr.wr_lo = 0;
3931		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
3932					   V_ULPTX_CMD(ULP_MEM_WRITE));
3933		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
3934				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
3935
3936		p = (struct pagepod *)(req + 1);
3937		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
3938			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
3939			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
3940						  V_PPOD_COLOR(color));
3941			p->pp_max_offset = htonl(maxoff);
3942			p->pp_page_offset = htonl(pg_off);
3943			p->pp_rsvd = 0;
3944			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
3945				p->pp_addr[j] = pidx < gl->dgl_nelem ?
3946				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
3947		} else
3948			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
3949		send_or_defer(toep, m, 0);
3950		ppod_addr += PPOD_SIZE;
3951	}
3952	return (0);
3953}
3954
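/*
 * The mk_*_ulp() helpers below each frame a CPL message as a ULP_TXPKT so
 * that several CPLs can be packed back to back into one compound work
 * request.  A sketch of the pattern (as used by t3_cancel_ddpbuf() and
 * t3_overlay_ddpbuf() below):
 *
 *	wr = mtod(m, struct work_request_hdr *);
 *	req = (struct cpl_set_tcb_field *)(wr + 1);
 *	mk_set_tcb_field_ulp(req, tid, word, mask, val);
 *	mk_get_tcb_ulp((struct cpl_get_tcb *)(req + 1), tid, qset);
 */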
3955/*
3956 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
3957 */
3958static inline void
3959mk_cpl_barrier_ulp(struct cpl_barrier *b)
3960{
3961	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
3962
3963	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3964	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
3965	b->opcode = CPL_BARRIER;
3966}
3967
3968/*
3969 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
3970 */
3971static inline void
3972mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
3973{
3974	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3975
3977	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3978	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3979	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
3980	req->cpuno = htons(cpuno);
3981}
3982
3983/*
3984 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
3985 */
3986static inline void
3987mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
3988                     unsigned int word, uint64_t mask, uint64_t val)
3989{
3990	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3991
3992	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
3993	    tid, word, mask, val);
3994
3995	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3996	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3997	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
3998	req->reply = V_NO_REPLY(1);
3999	req->cpu_idx = 0;
4000	req->word = htons(word);
4001	req->mask = htobe64(mask);
4002	req->val = htobe64(val);
4003}
4004
4005/*
4006 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4007 */
4008static void
4009mk_rx_data_ack_ulp(struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits)
4010{
4011	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4012
4013	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4014	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4015	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4016	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4017				 V_RX_DACK_MODE(1) | V_RX_CREDITS(credits));
4018}
4019
4020void
4021t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4022{
4023	unsigned int wrlen;
4024	struct mbuf *m;
4025	struct work_request_hdr *wr;
4026	struct cpl_barrier *lock;
4027	struct cpl_set_tcb_field *req;
4028	struct cpl_get_tcb *getreq;
4029	struct ddp_state *p = &toep->tp_ddp_state;
4030
4031	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4032	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4033		sizeof(*getreq);
4034	m = m_gethdr_nofail(wrlen);
4035	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4036	wr = mtod(m, struct work_request_hdr *);
4037	bzero(wr, wrlen);
4038
4039	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4040	m->m_pkthdr.len = m->m_len = wrlen;
4041
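	/*
	 * Compound WR layout: BARRIER, SET_TCB_FIELD (turn the buffer off),
	 * GET_TCB (read back how much data landed in it), BARRIER.
	 */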
4042	lock = (struct cpl_barrier *)(wr + 1);
4043	mk_cpl_barrier_ulp(lock);
4044
4045	req = (struct cpl_set_tcb_field *)(lock + 1);
4046
4047	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4048
4049	/* Hmmm, not sure if this is actually a good thing: reactivating
4050	 * the other buffer might be an issue if it has already been
4051	 * completed.  However, that is unlikely, since the fact that the
4052	 * UBUF is not completed indicates that there is no outstanding data.
4053	 */
4054	if (bufidx == 0)
4055		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4056				     V_TF_DDP_ACTIVE_BUF(1) |
4057				     V_TF_DDP_BUF0_VALID(1),
4058				     V_TF_DDP_ACTIVE_BUF(1));
4059	else
4060		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4061				     V_TF_DDP_ACTIVE_BUF(1) |
4062				     V_TF_DDP_BUF1_VALID(1), 0);
4063
4064	getreq = (struct cpl_get_tcb *)(req + 1);
4065	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4066
4067	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4068
4069	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4071	p->get_tcb_count++;
4072
4073#ifdef T3_TRACE
4074	T3_TRACE1(TIDTB(toeptoso(toep)),
4075		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4076#endif
4077	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4078}
4079
4080/**
4081 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4082 * @toep: the offloaded connection (toepcb) associated with the buffers
4083 * @bufidx: index of HW DDP buffer (0 or 1)
4084 * @tag0: new tag for HW buffer 0
4085 * @tag1: new tag for HW buffer 1
4086 * @len: new length for HW buf @bufidx
4087 *
4088 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4089 * buffer by changing the buffer tag and length and setting the valid and
4090 * active flag accordingly.  The caller must ensure the new buffer is at
4091 * least as big as the existing one.  Since we typically reprogram both HW
4092 * buffers, this function sets both tags for convenience.  Read the TCB to
4093 * determine how much data was written into the buffer before the overlay
4094 * took place.
4095 */
4096void
4097t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4098	 	       unsigned int tag1, unsigned int len)
4099{
4100	unsigned int wrlen;
4101	struct mbuf *m;
4102	struct work_request_hdr *wr;
4103	struct cpl_get_tcb *getreq;
4104	struct cpl_set_tcb_field *req;
4105	struct ddp_state *p = &toep->tp_ddp_state;
4106
4107	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4108	    bufidx, tag0, tag1, len);
4109	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4110	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4111	m = m_gethdr_nofail(wrlen);
4112	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4113	wr = mtod(m, struct work_request_hdr *);
4114	m->m_pkthdr.len = m->m_len = wrlen;
4115	bzero(wr, wrlen);
4116
4118	/* Set the ATOMIC flag to make sure that TP processes the following
4119	 * CPLs in an atomic manner and no wire segments can be interleaved.
4120	 */
4121	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4122	req = (struct cpl_set_tcb_field *)(wr + 1);
4123	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4124			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4125			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)M_TCB_RX_DDP_BUF1_TAG) << 32,
4126			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4127			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
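	/* The SET_TCB_FIELD CPLs are packed back to back in the WR;
	 * req++ advances to the slot for the next one. */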
4128	req++;
4129	if (bufidx == 0) {
4130		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4131			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4132			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4133		req++;
4134		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4135			    V_TF_DDP_PUSH_DISABLE_0(1) |
4136			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4137			    V_TF_DDP_PUSH_DISABLE_0(0) |
4138			    V_TF_DDP_BUF0_VALID(1));
4139	} else {
4140		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4141			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4142			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4143		req++;
4144		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4145			    V_TF_DDP_PUSH_DISABLE_1(1) |
4146			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4147			    V_TF_DDP_PUSH_DISABLE_1(0) |
4148			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4149	}
4150
4151	getreq = (struct cpl_get_tcb *)(req + 1);
4152	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4153
4154	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4156	p->get_tcb_count++;
4157
4158#ifdef T3_TRACE
4159	T3_TRACE4(TIDTB(toeptoso(toep)),
4160		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4161		  "len %d",
4162		  bufidx, tag0, tag1, len);
4163#endif
4164	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4165}
4166
4167/*
4168 * Sends a compound WR containing all the CPL messages needed to program the
4169 * two HW DDP buffers, namely optionally setting up the length and offset of
4170 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4171 */
4172void
4173t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4174		      unsigned int len1, unsigned int offset1,
4175                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4176{
4177	unsigned int wrlen;
4178	struct mbuf *m;
4179	struct work_request_hdr *wr;
4180	struct cpl_set_tcb_field *req;
4181
4182	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4183	    len0, offset0, len1, offset1, (unsigned int)(ddp_flags >> 32), (unsigned int)(ddp_flags & 0xffffffff));
4184
4185	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4186	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4187		(len1 ? sizeof(*req) : 0) +
4188		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4189	m = m_gethdr_nofail(wrlen);
4190	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4191	wr = mtod(m, struct work_request_hdr *);
4192	bzero(wr, wrlen);
4193
4194	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4195	m->m_pkthdr.len = m->m_len = wrlen;
4196
4197	req = (struct cpl_set_tcb_field *)(wr + 1);
4198	if (len0) {                  /* program buffer 0 offset and length */
4199		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4200			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4201			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4202			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4203			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4204		req++;
4205	}
4206	if (len1) {                  /* program buffer 1 offset and length */
4207		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4208			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4209			V_TCB_RX_DDP_BUF1_LEN((uint64_t)M_TCB_RX_DDP_BUF1_LEN) << 32,
4210			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4211			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4212		req++;
4213	}
4214
4215	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4216			     ddp_flags);
4217
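	/*
	 * Optionally piggy-back an RX_DATA_ACK that returns the receive
	 * credits consumed since the last window update and moves the
	 * update mark (tp_rcv_wup) forward.
	 */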
4218	if (modulate) {
4219		mk_rx_data_ack_ulp((struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4220				   toep->tp_copied_seq - toep->tp_rcv_wup);
4221		toep->tp_rcv_wup = toep->tp_copied_seq;
4222	}
4223
4224#ifdef T3_TRACE
4225	T3_TRACE5(TIDTB(toeptoso(toep)),
4226		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4227		  "modulate %d",
4228		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4229		  modulate);
4230#endif
4231
4232	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4233}
4234
4235void
4236t3_init_wr_tab(unsigned int wr_len)
4237{
4238	int i;
4239
4240	if (mbuf_wrs[1])     /* already initialized */
4241		return;
4242
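	/*
	 * mbuf_wrs[i] is the number of work requests needed for a frame
	 * spread over i mbufs: each SGL entry costs 1.5 flits (hence the
	 * (3 * i) / 2 + (i & 1) round-up) plus 3 flits of WR/CPL header,
	 * and SGLs that exceed wr_len flits spill into continuation WRs.
	 */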
4243	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4244		int sgl_len = (3 * i) / 2 + (i & 1);
4245
4246		sgl_len += 3;
4247		mbuf_wrs[i] = sgl_len <= wr_len ?
4248		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4249	}
4250
4251	wrlen = wr_len * 8;
4252}
4253
4254int
4255t3_init_cpl_io(void)
4256{
4257#ifdef notyet
4258	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4259	if (!tcphdr_skb) {
4260		log(LOG_ERR,
4261		       "Chelsio TCP offload: can't allocate sk_buff\n");
4262		return -1;
4263	}
4264	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4265	tcphdr_skb->h.raw = tcphdr_skb->data;
4266	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4267#endif
4268
4269	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4270	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4271	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4272	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4273	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4274	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4275	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4276	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4277	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4278	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4279	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4280	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4281	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4282	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4283	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4284	return (0);
4285}
4286
4287