/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 185088 2008-11-19 09:39:34Z zec $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_tcp_do_autosndbuf tcp_do_autosndbuf
#define V_tcp_autosndbuf_max tcp_autosndbuf_max
#define V_tcp_do_rfc1323 tcp_do_rfc1323
#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
#define V_tcpstat tcpstat
#endif

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <cxgb_osdep.h>
#include <sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <t3cdev.h>
#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <sys/mvec.h>
#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_l2t.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

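/*
 * Debug wrapper around sbappendstream_locked().  Walks both the existing
 * sockbuf chain and the chain being appended, before and after the append,
 * asserting that every mbuf is either plain or EXT_EXTREF-backed and that
 * no m_next pointer has been poisoned.  The caller must hold the sockbuf
 * lock and the sockbuf must have SB_NOCOALESCE set.
 */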
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

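/*
 * Map a CPL priority class to the value stored in the mbuf priority field.
 * Currently a pass-through, kept as a hook so per-connection scheduling
 * could be layered in without touching the call sites.
 */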
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must be already properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	INIT_VNET_INET(so->so_vnet);
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

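/*
 * Push as much pending send-buffer data to the HW as the available WR
 * credits allow.  Data that fits within IMM_LEN is sent as an immediate-
 * data WR, larger chains as a gather list.  Returns the number of bytes
 * handed to the hardware; if req_completion is set the WR requests a
 * completion from the HW.
 */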
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}

		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

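/*
 * Returns whether the HW delayed-ACK mode may be changed for this
 * connection: intended to cover connections with a ULP mode set, and DDP
 * connections on T3B and later parts.
 */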
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}


	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 *
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}


static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};


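/*
 * Build a SET_TCB_FIELD CPL in the supplied mbuf: the bits of TCB word
 * 'word' selected by 'mask' are overwritten with 'val' for this
 * connection's TCB.  The message is deferred by send_or_defer() if the
 * connection is still in SYN_SENT; no_reply suppresses the HW reply.
 */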
static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	if (toep == NULL)
		return;
	tp = toep->tp_tp;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to the freelist).  [Note that the peer
 * should set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);

}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

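/*
 * Insert the toepcb into the TID table.  Takes an extra reference on the
 * toepcb; the reference is dropped when the TID is removed again.
 */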
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

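/*
 * Pick the HW MTU table index for a path MTU, clamping the connection's
 * t_maxseg to the selected entry (the 40 bytes allow for the IP and TCP
 * headers).
 */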
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

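/*
 * Hook a socket up to the offload usrreqs so subsequent TCP operations
 * (send, rcvd, disconnect, ...) are routed to the TOE.
 */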
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	INIT_VNET_INET(so->so_vnet);
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	INIT_VNET_INET(so->so_vnet);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;



	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

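/*
 * Handle the subset of IP-level socket options relevant to an offloaded
 * connection (currently only IP_TOS); the new TOS is pushed to the TCB so
 * HW-generated segments carry it.
 */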
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);


		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);

	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * mbuf is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

2000static void
2001handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2002{
2003	struct tcpcb *tp = toep->tp_tp;
2004	struct socket *so;
2005	struct ddp_state *q;
2006	struct ddp_buf_state *bsp;
2007	struct cpl_rx_data *hdr = cplhdr(m);
2008	unsigned int rcv_nxt = ntohl(hdr->seq);
2009	struct sockbuf *rcv;
2010
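	/* No new data was placed by the hardware; nothing to do. */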
2011	if (tp->rcv_nxt == rcv_nxt)
2012		return;
2013
2014	inp_lock_assert(tp->t_inpcb);
2015	so  = inp_inpcbtosocket(tp->t_inpcb);
2016	rcv = so_sockbuf_rcv(so);
2017	sockbuf_lock(rcv);
2018
2019	q = &toep->tp_ddp_state;
2020	bsp = &q->buf_state[q->cur_buf];
2021	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x did not advance past tp->rcv_nxt=0x%08x",
2022		rcv_nxt, tp->rcv_nxt));
2023	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2024	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2025	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2026	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2027
2028#ifdef T3_TRACE
2029	if ((int)m->m_pkthdr.len < 0) {
2030		t3_ddp_error(so, "handle_ddp_data: neg len");
2031	}
2032#endif
2033	m->m_ddp_gl = (unsigned char *)bsp->gl;
2034	m->m_flags |= M_DDP;
2035	m->m_cur_offset = bsp->cur_offset;
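	/* Bit 0 of m_ddp_flags marks the DDP buffer complete; see new_rx_data_ddp(). */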
2036	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2037	if (bsp->flags & DDP_BF_NOCOPY)
2038		bsp->flags &= ~DDP_BF_NOCOPY;
2039
2040	m->m_seq = tp->rcv_nxt;
2041	tp->rcv_nxt = rcv_nxt;
2042	bsp->cur_offset += m->m_pkthdr.len;
2043	if (!(bsp->flags & DDP_BF_NOFLIP))
2044		q->cur_buf ^= 1;
2045	/*
2046	 * For now, don't re-enable DDP after a connection fell out of DDP
2047	 * mode.
2048	 */
2049	q->ubuf_ddp_ready = 0;
2050	sockbuf_unlock(rcv);
2051}
2052
2053/*
2054 * Process new data received for a connection.
2055 */
2056static void
2057new_rx_data(struct toepcb *toep, struct mbuf *m)
2058{
2059	struct cpl_rx_data *hdr = cplhdr(m);
2060	struct tcpcb *tp = toep->tp_tp;
2061	struct socket *so;
2062	struct sockbuf *rcv;
2063	int state;
2064	int len = be16toh(hdr->len);
2065
2066	inp_wlock(tp->t_inpcb);
2067
2068	so  = inp_inpcbtosocket(tp->t_inpcb);
2069
2070	if (__predict_false(so_no_receive(so))) {
2071		handle_excess_rx(toep, m);
2072		inp_wunlock(tp->t_inpcb);
2073		TRACE_EXIT;
2074		return;
2075	}
2076
2077	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2078		handle_ddp_data(toep, m);
2079
2080	m->m_seq = ntohl(hdr->seq);
2081	m->m_ulp_mode = 0;                    /* for iSCSI */
2082
2083#if VALIDATE_SEQ
2084	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2085		log(LOG_ERR,
2086		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2087		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2088		       tp->rcv_nxt);
2089		m_freem(m);
2090		inp_wunlock(tp->t_inpcb);
2091		return;
2092	}
2093#endif
2094	m_adj(m, sizeof(*hdr));
2095
2096#ifdef URGENT_DATA_SUPPORTED
2097	/*
2098	 * We don't handle urgent data yet
2099	 */
2100	if (__predict_false(hdr->urg))
2101		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2102	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2103		     tp->urg_seq - tp->rcv_nxt < skb->len))
2104		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2105							 tp->rcv_nxt];
2106#endif
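	/* Track the delayed-ACK mode last reported by the hardware for this tid. */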
2107	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2108		toep->tp_delack_mode = hdr->dack_mode;
2109		toep->tp_delack_seq = tp->rcv_nxt;
2110	}
2111	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2112	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2113
2114	if (len < m->m_pkthdr.len)
2115		m->m_pkthdr.len = m->m_len = len;
2116
2117	tp->rcv_nxt += m->m_pkthdr.len;
2118	tp->t_rcvtime = ticks;
2119	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2120	CTR2(KTR_TOM,
2121	    "new_rx_data: seq 0x%x len %u",
2122	    m->m_seq, m->m_pkthdr.len);
2123	inp_wunlock(tp->t_inpcb);
2124	rcv = so_sockbuf_rcv(so);
2125	sockbuf_lock(rcv);
2126#if 0
2127	if (sb_notify(rcv))
2128		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2129#endif
2130	SBAPPEND(rcv, m);
2131
2132#ifdef notyet
2133	/*
2134	 * We appear to be giving the card too many credits, but the check is
2135	 * disabled for now so we can keep making progress.
2136	 */
2137	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2138
2139	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2140		so, rcv->sb_cc, rcv->sb_mbmax));
2141#endif
2142
2143
2144	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2145	    rcv->sb_cc, rcv->sb_mbcnt);
2146
2147	state = so_state_get(so);
2148	if (__predict_true((state & SS_NOFDREF) == 0))
2149		so_sorwakeup_locked(so);
2150	else
2151		sockbuf_unlock(rcv);
2152}
2153
2154/*
2155 * Handler for RX_DATA CPL messages.
2156 */
2157static int
2158do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2159{
2160	struct toepcb *toep = (struct toepcb *)ctx;
2161
2162	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2163
2164	new_rx_data(toep, m);
2165
2166	return (0);
2167}
2168
2169static void
2170new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2171{
2172	struct tcpcb *tp;
2173	struct ddp_state *q;
2174	struct ddp_buf_state *bsp;
2175	struct cpl_rx_data_ddp *hdr;
2176	struct socket *so;
2177	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2178	int nomoredata = 0;
2179	unsigned int delack_mode;
2180	struct sockbuf *rcv;
2181
2182	tp = toep->tp_tp;
2183	inp_wlock(tp->t_inpcb);
2184	so = inp_inpcbtosocket(tp->t_inpcb);
2185
2186	if (__predict_false(so_no_receive(so))) {
2187
2188		handle_excess_rx(toep, m);
2189		inp_wunlock(tp->t_inpcb);
2190		return;
2191	}
2192
2193	q = &toep->tp_ddp_state;
2194	hdr = cplhdr(m);
2195	ddp_report = ntohl(hdr->u.ddp_report);
2196	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2197	bsp = &q->buf_state[buf_idx];
2198
2199	CTR4(KTR_TOM,
2200	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2201	    "hdr seq 0x%x len %u",
2202	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2203	    ntohs(hdr->len));
2204	CTR3(KTR_TOM,
2205	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2206	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2207
2208	ddp_len = ntohs(hdr->len);
2209	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2210
2211	delack_mode = G_DDP_DACK_MODE(ddp_report);
2212	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2213		toep->tp_delack_mode = delack_mode;
2214		toep->tp_delack_seq = tp->rcv_nxt;
2215	}
2216
2217	m->m_seq = tp->rcv_nxt;
2218	tp->rcv_nxt = rcv_nxt;
2219
2220	tp->t_rcvtime = ticks;
2221	/*
2222	 * Store the length in m->m_len.  We are changing the meaning of
2223	 * m->m_len here, we need to be very careful that nothing from now on
2224	 * interprets ->len of this packet the usual way.
2225	 */
2226	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2227	inp_wunlock(tp->t_inpcb);
2228	CTR3(KTR_TOM,
2229	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2230	    m->m_len, rcv_nxt, m->m_seq);
2231	/*
2232	 * Figure out where the new data was placed in the buffer and store it
2233	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the consumer
2234	 * needs to account for the page pod's pg_offset.
2235	 */
2236	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2237	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2238
2239	rcv = so_sockbuf_rcv(so);
2240	sockbuf_lock(rcv);
2241
2242	m->m_ddp_gl = (unsigned char *)bsp->gl;
2243	m->m_flags |= M_DDP;
2244	bsp->cur_offset = end_offset;
2245	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2246
2247	/*
2248	 * Length is only meaningful for kbuf
2249	 */
2250	if (!(bsp->flags & DDP_BF_NOCOPY))
2251		KASSERT(m->m_len <= bsp->gl->dgl_length,
2252		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2253			m->m_len, bsp->gl->dgl_length));
2254
2255	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2256	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2257	/*
2258	 * Bit 0 of flags stores whether the DDP buffer is completed.
2259	 * Note that other parts of the code depend on this being in bit 0.
2260	 */
2261	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2262		panic("spurious ddp completion");
2263	} else {
2264		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2265		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2266			q->cur_buf ^= 1;                     /* flip buffers */
2267	}
2268
2269	if (bsp->flags & DDP_BF_NOCOPY) {
2270		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2271		bsp->flags &= ~DDP_BF_NOCOPY;
2272	}
2273
2274	if (ddp_report & F_DDP_PSH)
2275		m->m_ddp_flags |= DDP_BF_PSH;
2276	if (nomoredata)
2277		m->m_ddp_flags |= DDP_BF_NODATA;
2278
2279#ifdef notyet
2280	skb_reset_transport_header(skb);
2281	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2282#endif
2283	SBAPPEND(rcv, m);
2284
2285	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2286	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2287		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2288		so_sorwakeup_locked(so);
2289	else
2290		sockbuf_unlock(rcv);
2291}
2292
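/* All DDP error conditions that may be reported in ddpvld_status. */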
2293#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2294		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2295		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2296		 F_DDP_INVALID_PPOD)
2297
2298/*
2299 * Handler for RX_DATA_DDP CPL messages.
2300 */
2301static int
2302do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2303{
2304	struct toepcb *toep = ctx;
2305	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2306
2307	VALIDATE_SOCK(so);
2308
2309	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2310		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2311		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2312		return (CPL_RET_BUF_DONE);
2313	}
2314#if 0
2315	skb->h.th = tcphdr_skb->h.th;
2316#endif
2317	new_rx_data_ddp(toep, m);
2318	return (0);
2319}
2320
2321static void
2322process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2323{
2324	struct tcpcb *tp = toep->tp_tp;
2325	struct socket *so;
2326	struct ddp_state *q;
2327	struct ddp_buf_state *bsp;
2328	struct cpl_rx_ddp_complete *hdr;
2329	unsigned int ddp_report, buf_idx, when, delack_mode;
2330	int nomoredata = 0;
2331	struct sockbuf *rcv;
2332
2333	inp_wlock(tp->t_inpcb);
2334	so = inp_inpcbtosocket(tp->t_inpcb);
2335
2336	if (__predict_false(so_no_receive(so))) {
2337		struct inpcb *inp = so_sotoinpcb(so);
2338
2339		handle_excess_rx(toep, m);
2340		inp_wunlock(inp);
2341		return;
2342	}
2343	q = &toep->tp_ddp_state;
2344	hdr = cplhdr(m);
2345	ddp_report = ntohl(hdr->ddp_report);
2346	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2347	m->m_pkthdr.csum_data = tp->rcv_nxt;
2348
2349	rcv = so_sockbuf_rcv(so);
2350	sockbuf_lock(rcv);
2351
2352	bsp = &q->buf_state[buf_idx];
2353	when = bsp->cur_offset;
2354	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2355	tp->rcv_nxt += m->m_len;
2356	tp->t_rcvtime = ticks;
2357
2358	delack_mode = G_DDP_DACK_MODE(ddp_report);
2359	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2360		toep->tp_delack_mode = delack_mode;
2361		toep->tp_delack_seq = tp->rcv_nxt;
2362	}
2363#ifdef notyet
2364	skb_reset_transport_header(skb);
2365	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2366#endif
2367	inp_wunlock(tp->t_inpcb);
2368
2369	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2370	CTR5(KTR_TOM,
2371		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2372		  "ddp_report 0x%x offset %u, len %u",
2373		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2374		   G_DDP_OFFSET(ddp_report), m->m_len);
2375
2376	m->m_cur_offset = bsp->cur_offset;
2377	bsp->cur_offset += m->m_len;
2378
2379	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2380		q->cur_buf ^= 1;                     /* flip buffers */
2381		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2382			nomoredata = 1;
2383	}
2384
2385	CTR4(KTR_TOM,
2386		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2387		  "ddp_report %u offset %u",
2388		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2389		   G_DDP_OFFSET(ddp_report));
2390
2391	m->m_ddp_gl = (unsigned char *)bsp->gl;
2392	m->m_flags |= M_DDP;
2393	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2394	if (bsp->flags & DDP_BF_NOCOPY)
2395		bsp->flags &= ~DDP_BF_NOCOPY;
2396	if (nomoredata)
2397		m->m_ddp_flags |= DDP_BF_NODATA;
2398
2399	SBAPPEND(rcv, m);
2400	if ((so_state_get(so) & SS_NOFDREF) == 0)
2401		so_sorwakeup_locked(so);
2402	else
2403		sockbuf_unlock(rcv);
2404}
2405
2406/*
2407 * Handler for RX_DDP_COMPLETE CPL messages.
2408 */
2409static int
2410do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2411{
2412	struct toepcb *toep = ctx;
2413
2414	VALIDATE_SOCK(so);
2415#if 0
2416	skb->h.th = tcphdr_skb->h.th;
2417#endif
2418	process_ddp_complete(toep, m);
2419	return (0);
2420}
2421
2422/*
2423 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2424 * socket state before calling tcp_time_wait to comply with its expectations.
2425 */
2426static void
2427enter_timewait(struct tcpcb *tp)
2428{
2429	/*
2430	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2431	 * process peer_close because we don't want to carry the peer FIN in
2432	 * the socket's receive queue and if we increment rcv_nxt without
2433	 * having the FIN in the receive queue we'll confuse facilities such
2434	 * as SIOCINQ.
2435	 */
2436	inp_wlock(tp->t_inpcb);
2437	tp->rcv_nxt++;
2438
2439	tp->ts_recent_age = 0;	     /* defeat recycling */
2440	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2441	inp_wunlock(tp->t_inpcb);
2442	tcp_offload_twstart(tp);
2443}
2444
2445/*
2446 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2447 * function deals with the data that may be reported along with the FIN.
2448 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2449 * perform normal FIN-related processing.  In the latter case 1 indicates that
2450 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2451 * skb can be freed.
2452 */
2453static int
2454handle_peer_close_data(struct socket *so, struct mbuf *m)
2455{
2456	struct tcpcb *tp = so_sototcpcb(so);
2457	struct toepcb *toep = tp->t_toe;
2458	struct ddp_state *q;
2459	struct ddp_buf_state *bsp;
2460	struct cpl_peer_close *req = cplhdr(m);
2461	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2462	struct sockbuf *rcv;
2463
2464	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2465		return (0);
2466
2467	CTR0(KTR_TOM, "handle_peer_close_data");
2468	if (__predict_false(so_no_receive(so))) {
2469		handle_excess_rx(toep, m);
2470
2471		/*
2472		 * Although we discard the data we want to process the FIN so
2473		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2474		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2475		 * may be what will close the connection.  We return 1 because
2476		 * handle_excess_rx() already freed the packet.
2477		 */
2478		return (1);
2479	}
2480
2481	inp_lock_assert(tp->t_inpcb);
2482	q = &toep->tp_ddp_state;
2483	rcv = so_sockbuf_rcv(so);
2484	sockbuf_lock(rcv);
2485
2486	bsp = &q->buf_state[q->cur_buf];
2487	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2488	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2489	m->m_ddp_gl = (unsigned char *)bsp->gl;
2490	m->m_flags |= M_DDP;
2491	m->m_cur_offset = bsp->cur_offset;
2492	m->m_ddp_flags =
2493	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2494	m->m_seq = tp->rcv_nxt;
2495	tp->rcv_nxt = rcv_nxt;
2496	bsp->cur_offset += m->m_pkthdr.len;
2497	if (!(bsp->flags & DDP_BF_NOFLIP))
2498		q->cur_buf ^= 1;
2499#ifdef notyet
2500	skb_reset_transport_header(skb);
2501	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2502#endif
2503	tp->t_rcvtime = ticks;
2504	SBAPPEND(rcv, m);
2505	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2506		so_sorwakeup_locked(so);
2507	else
2508		sockbuf_unlock(rcv);
2509
2510	return (1);
2511}
2512
2513/*
2514 * Handle a peer FIN.
2515 */
2516static void
2517do_peer_fin(struct toepcb *toep, struct mbuf *m)
2518{
2519	struct socket *so;
2520	struct tcpcb *tp = toep->tp_tp;
2521	int keep, action;
2522
2523	action = keep = 0;
2524	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2525	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2526		printf("abort_pending set\n");
2527
2528		goto out;
2529	}
2530	inp_wlock(tp->t_inpcb);
2531	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2532	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2533		keep = handle_peer_close_data(so, m);
2534		if (keep < 0) {
2535			inp_wunlock(tp->t_inpcb);
2536			return;
2537		}
2538	}
2539	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2540		CTR1(KTR_TOM,
2541		    "waking up waiters for cantrcvmore on %p ", so);
2542		socantrcvmore(so);
2543
2544		/*
2545		 * If connection is half-synchronized
2546		 * (ie NEEDSYN flag on) then delay ACK,
2547		 * so it may be piggybacked when SYN is sent.
2548		 * Otherwise, since we received a FIN then no
2549		 * more input can be expected, send ACK now.
2550		 */
2551		if (tp->t_flags & TF_NEEDSYN)
2552			tp->t_flags |= TF_DELACK;
2553		else
2554			tp->t_flags |= TF_ACKNOW;
2555		tp->rcv_nxt++;
2556	}
2557
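	/* Advance the connection state for the received FIN, as the host stack would. */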
2558	switch (tp->t_state) {
2559	case TCPS_SYN_RECEIVED:
2560		tp->t_starttime = ticks;
2561	/* FALLTHROUGH */
2562	case TCPS_ESTABLISHED:
2563		tp->t_state = TCPS_CLOSE_WAIT;
2564		break;
2565	case TCPS_FIN_WAIT_1:
2566		tp->t_state = TCPS_CLOSING;
2567		break;
2568	case TCPS_FIN_WAIT_2:
2569		/*
2570		 * If we've sent an abort_req we must have sent it too late,
2571		 * HW will send us a reply telling us so, and this peer_close
2572		 * is really the last message for this connection and needs to
2573		 * be treated as an abort_rpl, i.e., transition the connection
2574		 * to TCP_CLOSE (note that the host stack does this at the
2575		 * time of generating the RST but we must wait for HW).
2576		 * Otherwise we enter TIME_WAIT.
2577		 */
2578		t3_release_offload_resources(toep);
2579		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2580			action = TCP_CLOSE;
2581		} else {
2582			action = TCP_TIMEWAIT;
2583		}
2584		break;
2585	default:
2586		log(LOG_ERR,
2587		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2588		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2589	}
2590	inp_wunlock(tp->t_inpcb);
2591
2592	if (action == TCP_TIMEWAIT) {
2593		enter_timewait(tp);
2594	} else if (action == TCP_DROP) {
2595		tcp_offload_drop(tp, 0);
2596	} else if (action == TCP_CLOSE) {
2597		tcp_offload_close(tp);
2598	}
2599
2600#ifdef notyet
2601	/* Do not send POLL_HUP for half duplex close. */
2602	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2603	    sk->sk_state == TCP_CLOSE)
2604		sk_wake_async(so, 1, POLL_HUP);
2605	else
2606		sk_wake_async(so, 1, POLL_IN);
2607#endif
2608
2609out:
2610	if (!keep)
2611		m_free(m);
2612}
2613
2614/*
2615 * Handler for PEER_CLOSE CPL messages.
2616 */
2617static int
2618do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2619{
2620	struct toepcb *toep = (struct toepcb *)ctx;
2621
2622	VALIDATE_SOCK(so);
2623
2624	do_peer_fin(toep, m);
2625	return (0);
2626}
2627
2628static void
2629process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2630{
2631	struct cpl_close_con_rpl *rpl = cplhdr(m);
2632	struct tcpcb *tp = toep->tp_tp;
2633	struct socket *so;
2634	int action = 0;
2635	struct sockbuf *rcv;
2636
2637	inp_wlock(tp->t_inpcb);
2638	so = inp_inpcbtosocket(tp->t_inpcb);
2639
2640	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2641
2642	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2643		inp_wunlock(tp->t_inpcb);
2644		goto out;
2645	}
2646
2647	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2648	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2649
2650	switch (tp->t_state) {
2651	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2652		t3_release_offload_resources(toep);
2653		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2654			action = TCP_CLOSE;
2655
2656		} else {
2657			action = TCP_TIMEWAIT;
2658		}
2659		break;
2660	case TCPS_LAST_ACK:
2661		/*
2662		 * In this state we don't care about pending abort_rpl.
2663		 * If we've sent abort_req it was post-close and was sent too
2664		 * late, this close_con_rpl is the actual last message.
2665		 */
2666		t3_release_offload_resources(toep);
2667		action = TCP_CLOSE;
2668		break;
2669	case TCPS_FIN_WAIT_1:
2670		/*
2671		 * If we can't receive any more
2672		 * data, then closing user can proceed.
2673		 * Starting the timer is contrary to the
2674		 * specification, but if we don't get a FIN
2675		 * we'll hang forever.
2676		 *
2677		 * XXXjl:
2678		 * we should release the tp also, and use a
2679		 * compressed state.
2680		 */
2681		if (so)
2682			rcv = so_sockbuf_rcv(so);
2683		else
2684			break;
2685
2686		if (rcv->sb_state & SBS_CANTRCVMORE) {
2687			int timeout;
2688
2689			if (so)
2690				soisdisconnected(so);
2691			timeout = (tcp_fast_finwait2_recycle) ?
2692			    tcp_finwait2_timeout : tcp_maxidle;
2693			tcp_timer_activate(tp, TT_2MSL, timeout);
2694		}
2695		tp->t_state = TCPS_FIN_WAIT_2;
2696		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2697		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2698			action = TCP_DROP;
2699		}
2700
2701		break;
2702	default:
2703		log(LOG_ERR,
2704		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2705		       toep->tp_toedev->tod_name, toep->tp_tid,
2706		       tp->t_state);
2707	}
2708	inp_wunlock(tp->t_inpcb);
2709
2710
2711	if (action == TCP_TIMEWAIT) {
2712		enter_timewait(tp);
2713	} else if (action == TCP_DROP) {
2714		tcp_offload_drop(tp, 0);
2715	} else if (action == TCP_CLOSE) {
2716		tcp_offload_close(tp);
2717	}
2718out:
2719	m_freem(m);
2720}
2721
2722/*
2723 * Handler for CLOSE_CON_RPL CPL messages.
2724 */
2725static int
2726do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2727			    void *ctx)
2728{
2729	struct toepcb *toep = (struct toepcb *)ctx;
2730
2731	process_close_con_rpl(toep, m);
2732	return (0);
2733}
2734
2735/*
2736 * Process abort replies.  We only process these messages if we anticipate
2737 * them as the coordination between SW and HW in this area is somewhat lacking
2738 * and sometimes we get ABORT_RPLs after we are done with the connection that
2739 * originated the ABORT_REQ.
2740 */
2741static void
2742process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2743{
2744	struct tcpcb *tp = toep->tp_tp;
2745	struct socket *so;
2746	int needclose = 0;
2747
2748#ifdef T3_TRACE
2749	T3_TRACE1(TIDTB(sk),
2750		  "process_abort_rpl: GTS rpl pending %d",
2751		  sock_flag(sk, ABORT_RPL_PENDING));
2752#endif
2753
2754	inp_wlock(tp->t_inpcb);
2755	so = inp_inpcbtosocket(tp->t_inpcb);
2756
2757	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2758		/*
2759		 * XXX panic on tcpdrop
2760		 */
2761		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2762			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2763		else {
2764			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2765			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2766			    !is_t3a(toep->tp_toedev)) {
2767				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2768					panic("TP_ABORT_REQ_RCVD set");
2769				t3_release_offload_resources(toep);
2770				needclose = 1;
2771			}
2772		}
2773	}
2774	inp_wunlock(tp->t_inpcb);
2775
2776	if (needclose)
2777		tcp_offload_close(tp);
2778
2779	m_free(m);
2780}
2781
2782/*
2783 * Handle an ABORT_RPL_RSS CPL message.
2784 */
2785static int
2786do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2787{
2788	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2789	struct toepcb *toep;
2790
2791	/*
2792	 * Ignore replies to post-close aborts indicating that the abort was
2793	 * requested too late.  These connections are terminated when we get
2794	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2795	 * arrives the TID is either no longer used or it has been recycled.
2796	 */
2797	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2798discard:
2799		m_free(m);
2800		return (0);
2801	}
2802
2803	toep = (struct toepcb *)ctx;
2804
2805	/*
2806	 * Sometimes we've already closed the socket, e.g., a post-close
2807	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2808	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2809	 * but FW turns the ABORT_REQ into a regular one and so we get
2810	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2811	 */
2812	if (!toep)
2813		goto discard;
2814
2815	if (toep->tp_tp == NULL) {
2816		log(LOG_NOTICE, "removing tid for abort\n");
2817		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2818		if (toep->tp_l2t)
2819			l2t_release(L2DATA(cdev), toep->tp_l2t);
2820
2821		toepcb_release(toep);
2822		goto discard;
2823	}
2824
2825	log(LOG_NOTICE, "toep=%p\n", toep);
2826	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2827
2828	toepcb_hold(toep);
2829	process_abort_rpl(toep, m);
2830	toepcb_release(toep);
2831	return (0);
2832}
2833
2834/*
2835 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2836 * indicate whether RST should be sent in response.
2837 */
2838static int
2839abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2840{
2841	struct tcpcb *tp = so_sototcpcb(so);
2842
2843	switch (abort_reason) {
2844	case CPL_ERR_BAD_SYN:
2845#if 0
2846		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2847#endif
2848	case CPL_ERR_CONN_RESET:
2849		// XXX need to handle SYN_RECV due to crossed SYNs
2850		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2851	case CPL_ERR_XMIT_TIMEDOUT:
2852	case CPL_ERR_PERSIST_TIMEDOUT:
2853	case CPL_ERR_FINWAIT2_TIMEDOUT:
2854	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2855#if 0
2856		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2857#endif
2858		return (ETIMEDOUT);
2859	default:
2860		return (EIO);
2861	}
2862}
2863
2864static inline void
2865set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2866{
2867	struct cpl_abort_rpl *rpl = cplhdr(m);
2868
2869	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2870	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2871	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2872
2873	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2874	rpl->cmd = cmd;
2875}
2876
2877static void
2878send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2879{
2880	struct mbuf *reply_mbuf;
2881	struct cpl_abort_req_rss *req = cplhdr(m);
2882
2883	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2884	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2885	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2886	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2887	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2888	m_free(m);
2889}
2890
2891/*
2892 * Returns whether an ABORT_REQ_RSS message is negative advice.
2893 */
2894static inline int
2895is_neg_adv_abort(unsigned int status)
2896{
2897	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2898	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2899}
2900
2901static void
2902send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2903{
2904	struct mbuf  *reply_mbuf;
2905	struct cpl_abort_req_rss *req = cplhdr(m);
2906
2907	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2908
2909	if (!reply_mbuf) {
2910		/* Defer the reply.  Stick rst_status into req->status. */
2911		req->status = rst_status;
2912		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2913		return;
2914	}
2915
2916	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2917	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2918	m_free(m);
2919
2920	/*
2921	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2922	 * these messages while ARP is pending.  For other connection states
2923	 * it's not a problem.
2924	 */
2925	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2926}
2927
2928#ifdef notyet
2929static void
2930cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2931{
2932	CXGB_UNIMPLEMENTED();
2933#ifdef notyet
2934	struct request_sock *req = child->sk_user_data;
2935
2936	inet_csk_reqsk_queue_removed(parent, req);
2937	synq_remove(tcp_sk(child));
2938	__reqsk_free(req);
2939	child->sk_user_data = NULL;
2940#endif
2941}
2942
2943
2944/*
2945 * Performs the actual work to abort a SYN_RECV connection.
2946 */
2947static void
2948do_abort_syn_rcv(struct socket *child, struct socket *parent)
2949{
2950	struct tcpcb *parenttp = so_sototcpcb(parent);
2951	struct tcpcb *childtp = so_sototcpcb(child);
2952
2953	/*
2954	 * If the server is still open we clean up the child connection,
2955	 * otherwise the server already did the clean up as it was purging
2956	 * its SYN queue and the skb was just sitting in its backlog.
2957	 */
2958	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2959		cleanup_syn_rcv_conn(child, parent);
2960		inp_wlock(childtp->t_inpcb);
2961		t3_release_offload_resources(childtp->t_toe);
2962		inp_wunlock(childtp->t_inpcb);
2963		tcp_offload_close(childtp);
2964	}
2965}
2966#endif
2967
2968/*
2969 * Handle abort requests for a SYN_RECV connection.  These need extra work
2970 * because the socket is on its parent's SYN queue.
2971 */
2972static int
2973abort_syn_rcv(struct socket *so, struct mbuf *m)
2974{
2975	CXGB_UNIMPLEMENTED();
2976#ifdef notyet
2977	struct socket *parent;
2978	struct toedev *tdev = toep->tp_toedev;
2979	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2980	struct socket *oreq = so->so_incomp;
2981	struct t3c_tid_entry *t3c_stid;
2982	struct tid_info *t;
2983
2984	if (!oreq)
2985		return -1;        /* somehow we are not on the SYN queue */
2986
2987	t = &(T3C_DATA(cdev))->tid_maps;
2988	t3c_stid = lookup_stid(t, oreq->ts_recent);
2989	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2990
2991	so_lock(parent);
2992	do_abort_syn_rcv(so, parent);
2993	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2994	so_unlock(parent);
2995#endif
2996	return (0);
2997}
2998
2999/*
3000 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3001 * request except that we need to reply to it.
3002 */
3003static void
3004process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3005{
3006	int rst_status = CPL_ABORT_NO_RST;
3007	const struct cpl_abort_req_rss *req = cplhdr(m);
3008	struct tcpcb *tp = toep->tp_tp;
3009	struct socket *so;
3010	int needclose = 0;
3011
3012	inp_wlock(tp->t_inpcb);
3013	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3014	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3015		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3016		m_free(m);
3017		goto skip;
3018	}
3019
3020	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3021	/*
3022	 * Three cases to consider:
3023	 * a) We haven't sent an abort_req; close the connection.
3024	 * b) We have sent a post-close abort_req that will get to TP too late
3025	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3026	 *    be ignored and the connection should be closed now.
3027	 * c) We have sent a regular abort_req that will get to TP too late.
3028	 *    That will generate an abort_rpl with status 0, wait for it.
3029	 */
3030	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3031	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3032		int error;
3033
3034		error = abort_status_to_errno(so, req->status,
3035		    &rst_status);
3036		so_error_set(so, error);
3037
3038		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3039			so_sorwakeup(so);
3040		/*
3041		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3042		 * returns 0 it has taken care of the abort.
3043		 */
3044		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3045			goto skip;
3046
3047		t3_release_offload_resources(toep);
3048		needclose = 1;
3049	}
3050	inp_wunlock(tp->t_inpcb);
3051
3052	if (needclose)
3053		tcp_offload_close(tp);
3054
3055	send_abort_rpl(m, tdev, rst_status);
3056	return;
3057skip:
3058	inp_wunlock(tp->t_inpcb);
3059}
3060
3061/*
3062 * Handle an ABORT_REQ_RSS CPL message.
3063 */
3064static int
3065do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3066{
3067	const struct cpl_abort_req_rss *req = cplhdr(m);
3068	struct toepcb *toep = (struct toepcb *)ctx;
3069
3070	if (is_neg_adv_abort(req->status)) {
3071		m_free(m);
3072		return (0);
3073	}
3074
3075	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3076
3077	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3078		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3079		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3080
3081		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3082		if (toep->tp_l2t)
3083			l2t_release(L2DATA(cdev), toep->tp_l2t);
3084
3085		/*
3086		 *  Unhook
3087		 */
3088		toep->tp_tp->t_toe = NULL;
3089		toep->tp_tp->t_flags &= ~TF_TOE;
3090		toep->tp_tp = NULL;
3091		/*
3092		 * XXX need to call syncache_chkrst - but we don't
3093		 * have a way of doing that yet
3094		 */
3095		toepcb_release(toep);
3096		log(LOG_ERR, "abort for unestablished connection :-(\n");
3097		return (0);
3098	}
3099	if (toep->tp_tp == NULL) {
3100		log(LOG_NOTICE, "disconnected toepcb\n");
3101		/* should be freed momentarily */
3102		return (0);
3103	}
3104
3105
3106	toepcb_hold(toep);
3107	process_abort_req(toep, m, toep->tp_toedev);
3108	toepcb_release(toep);
3109	return (0);
3110}
3111#ifdef notyet
3112static void
3113pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3114{
3115	struct toedev *tdev = TOE_DEV(parent);
3116
3117	do_abort_syn_rcv(child, parent);
3118	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3119		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3120
3121		rpl->opt0h = htonl(F_TCAM_BYPASS);
3122		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3123		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3124	} else
3125		m_free(m);
3126}
3127#endif
3128static void
3129handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3130{
3131	CXGB_UNIMPLEMENTED();
3132
3133#ifdef notyet
3134	struct t3cdev *cdev;
3135	struct socket *parent;
3136	struct socket *oreq;
3137	struct t3c_tid_entry *t3c_stid;
3138	struct tid_info *t;
3139	struct tcpcb *otp, *tp = so_sototcpcb(so);
3140	struct toepcb *toep = tp->t_toe;
3141
3142	/*
3143	 * If the connection is being aborted due to the parent listening
3144	 * socket going away there's nothing to do, the ABORT_REQ will close
3145	 * the connection.
3146	 */
3147	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3148		m_free(m);
3149		return;
3150	}
3151
3152	oreq = so->so_incomp;
3153	otp = so_sototcpcb(oreq);
3154
3155	cdev = T3C_DEV(so);
3156	t = &(T3C_DATA(cdev))->tid_maps;
3157	t3c_stid = lookup_stid(t, otp->ts_recent);
3158	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3159
3160	so_lock(parent);
3161	pass_open_abort(so, parent, m);
3162	so_unlock(parent);
3163#endif
3164}
3165
3166/*
3167 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3168 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3169 * connection.
3170 */
3171static void
3172pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3173{
3174
3175#ifdef notyet
3176	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3177	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3178#endif
3179	handle_pass_open_arp_failure(m_get_socket(m), m);
3180}
3181
3182/*
3183 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3184 */
3185static void
3186mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3187{
3188	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3189	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3190	unsigned int tid = GET_TID(req);
3191
3192	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3193	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3194	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3195	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3196	rpl->opt0h = htonl(F_TCAM_BYPASS);
3197	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3198	rpl->opt2 = 0;
3199	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3200}
3201
3202/*
3203 * Send a deferred reject to an accept request.
3204 */
3205static void
3206reject_pass_request(struct toedev *tdev, struct mbuf *m)
3207{
3208	struct mbuf *reply_mbuf;
3209
3210	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3211	mk_pass_accept_rpl(reply_mbuf, m);
3212	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3213	m_free(m);
3214}
3215
3216static void
3217handle_syncache_event(int event, void *arg)
3218{
3219	struct toepcb *toep = arg;
3220
3221	switch (event) {
3222	case TOE_SC_ENTRY_PRESENT:
3223		/*
3224		 * entry already exists - free toepcb
3225		 * and l2t
3226		 */
3227		printf("syncache entry present\n");
3228		toepcb_release(toep);
3229		break;
3230	case TOE_SC_DROP:
3231		/*
3232		 * The syncache has given up on this entry
3233		 * either it timed out, or it was evicted
3234		 * we need to explicitly release the tid
3235		 */
3236		printf("syncache entry dropped\n");
3237		toepcb_release(toep);
3238		break;
3239	default:
3240		log(LOG_ERR, "unknown syncache event %d\n", event);
3241		break;
3242	}
3243}
3244
3245static void
3246syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3247{
3248	struct in_conninfo inc;
3249	struct tcpopt to;
3250	struct tcphdr th;
3251	struct inpcb *inp;
3252	int mss, wsf, sack, ts;
3253	uint32_t rcv_isn = ntohl(req->rcv_isn);
3254
3255	bzero(&to, sizeof(struct tcpopt));
3256	inp = so_sotoinpcb(lso);
3257
3258	/*
3259	 * Fill out information for entering us into the syncache
3260	 */
3261	bzero(&inc, sizeof(inc));
3262	inc.inc_fport = th.th_sport = req->peer_port;
3263	inc.inc_lport = th.th_dport = req->local_port;
3264	th.th_seq = req->rcv_isn;
3265	th.th_flags = TH_SYN;
3266
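	/* Initialize sequence tracking to just past the peer's ISN. */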
3267	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3268
3269
3270	inc.inc_isipv6 = 0;
3271	inc.inc_len = 0;
3272	inc.inc_faddr.s_addr = req->peer_ip;
3273	inc.inc_laddr.s_addr = req->local_ip;
3274
3275	DPRINTF("syncache add of %d:%d %d:%d\n",
3276	    ntohl(req->local_ip), ntohs(req->local_port),
3277	    ntohl(req->peer_ip), ntohs(req->peer_port));
3278
3279	mss = req->tcp_options.mss;
3280	wsf = req->tcp_options.wsf;
3281	ts = req->tcp_options.tstamp;
3282	sack = req->tcp_options.sack;
3283	to.to_mss = mss;
3284	to.to_wscale = wsf;
3285	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3286	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3287}
3288
3289
3290/*
3291 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3292 * lock held.  Note that the sock here is a listening socket that is not owned
3293 * by the TOE.
3294 */
3295static void
3296process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3297    struct listen_ctx *lctx)
3298{
3299	int rt_flags;
3300	struct l2t_entry *e;
3301	struct iff_mac tim;
3302	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3303	struct cpl_pass_accept_rpl *rpl;
3304	struct cpl_pass_accept_req *req = cplhdr(m);
3305	unsigned int tid = GET_TID(req);
3306	struct tom_data *d = TOM_DATA(tdev);
3307	struct t3cdev *cdev = d->cdev;
3308	struct tcpcb *tp = so_sototcpcb(so);
3309	struct toepcb *newtoep;
3310	struct rtentry *dst;
3311	struct sockaddr_in nam;
3312	struct t3c_data *td = T3C_DATA(cdev);
3313
3314	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3315	if (__predict_false(reply_mbuf == NULL)) {
3316		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3317			t3_defer_reply(m, tdev, reject_pass_request);
3318		else {
3319			cxgb_queue_tid_release(cdev, tid);
3320			m_free(m);
3321		}
3322		DPRINTF("failed to get reply_mbuf\n");
3323
3324		goto out;
3325	}
3326
3327	if (tp->t_state != TCPS_LISTEN) {
3328		DPRINTF("socket not in listen state\n");
3329
3330		goto reject;
3331	}
3332
3333	tim.mac_addr = req->dst_mac;
3334	tim.vlan_tag = ntohs(req->vlan_tag);
3335	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3336		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3337		goto reject;
3338	}
3339
3340#ifdef notyet
3341	/*
3342	 * XXX do route lookup to confirm that we're still listening on this
3343	 * address
3344	 */
3345	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3346			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3347		goto reject;
3348	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3349		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3350	dst_release(skb->dst);	// done with the input route, release it
3351	skb->dst = NULL;
3352
3353	if ((rt_flags & RTF_LOCAL) == 0)
3354		goto reject;
3355#endif
3356	/*
3357	 * XXX assume local until the route lookup above is implemented
3358	 */
3359	rt_flags = RTF_LOCAL;
3360	if ((rt_flags & RTF_LOCAL) == 0)
3361		goto reject;
3362
3363	/*
3364	 * Calculate values and add to syncache
3365	 */
3366
3367	newtoep = toepcb_alloc();
3368	if (newtoep == NULL)
3369		goto reject;
3370
3371	bzero(&nam, sizeof(struct sockaddr_in));
3372
3373	nam.sin_len = sizeof(struct sockaddr_in);
3374	nam.sin_family = AF_INET;
3375	nam.sin_addr.s_addr = req->peer_ip;
3376	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3377
3378	if (dst == NULL) {
3379		printf("failed to find route\n");
3380		goto reject;
3381	}
3382	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3383	    (struct sockaddr *)&nam);
3384	if (e == NULL) {
3385		DPRINTF("failed to get l2t\n");
		goto reject;	/* e is dereferenced below when building the reply */
3386	}
3387	/*
3388	 * Point to our listen socket until accept
3389	 */
3390	newtoep->tp_tp = tp;
3391	newtoep->tp_flags = TP_SYN_RCVD;
3392	newtoep->tp_tid = tid;
3393	newtoep->tp_toedev = tdev;
3394	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3395
3396	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3397	so_lock(so);
3398	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3399	so_unlock(so);
3400
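	/*
	 * Enable DDP only if the tunable allows it, the socket has not
	 * opted out, and the receive window is large enough to be useful.
	 */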
3401	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3402		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3403
3404	if (newtoep->tp_ulp_mode) {
3405		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3406
3407		if (ddp_mbuf == NULL)
3408			newtoep->tp_ulp_mode = 0;
3409	}
3410
3411	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3412	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3413	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3414	/*
3415	 * XXX workaround for lack of syncache drop
3416	 */
3417	toepcb_hold(newtoep);
3418	syncache_add_accept_req(req, so, newtoep);
3419
3420	rpl = cplhdr(reply_mbuf);
3421	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3422	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3423	rpl->wr.wr_lo = 0;
3424	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3425	rpl->opt2 = htonl(calc_opt2(so, tdev));
3426	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3427	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3428
3429	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3430	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3431	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3432				  CPL_PASS_OPEN_ACCEPT);
3433
3434	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3435
3436	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3437
3438	l2t_send(cdev, reply_mbuf, e);
3439	m_free(m);
3440	if (newtoep->tp_ulp_mode) {
3441		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3442				V_TF_DDP_OFF(1) |
3443				TP_DDP_TIMER_WORKAROUND_MASK,
3444				V_TF_DDP_OFF(1) |
3445		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3446	} else
3447		DPRINTF("no DDP\n");
3448
3449	return;
3450reject:
3451	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3452		mk_pass_accept_rpl(reply_mbuf, m);
3453	else
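		/* XXX newtoep is uninitialized if we rejected before toepcb_alloc() */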
3454		mk_tid_release(reply_mbuf, newtoep, tid);
3455	cxgb_ofld_send(cdev, reply_mbuf);
3456	m_free(m);
3457out:
3458#if 0
3459	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3460#else
3461	return;
3462#endif
3463}
3464
3465/*
3466 * Handle a CPL_PASS_ACCEPT_REQ message.
3467 */
3468static int
3469do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3470{
3471	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3472	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3473	struct tom_data *d = listen_ctx->tom_data;
3474
3475#if VALIDATE_TID
3476	struct cpl_pass_accept_req *req = cplhdr(m);
3477	unsigned int tid = GET_TID(req);
3478	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3479
3480	if (unlikely(!lsk)) {
3481		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3482		       cdev->name,
3483		       (unsigned long)((union listen_entry *)ctx -
3484					t->stid_tab));
3485		return CPL_RET_BUF_DONE;
3486	}
3487	if (unlikely(tid >= t->ntids)) {
3488		printk(KERN_ERR "%s: passive open TID %u too large\n",
3489		       cdev->name, tid);
3490		return CPL_RET_BUF_DONE;
3491	}
3492	/*
3493	 * For T3A the current user of the TID may have closed but its last
3494	 * message(s) may have been backlogged so the TID appears to be still
3495	 * in use.  Just take the TID away, the connection can close at its
3496	 * own leisure.  For T3B this situation is a bug.
3497	 */
3498	if (!valid_new_tid(t, tid) &&
3499	    cdev->type != T3A) {
3500		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3501		       cdev->name, tid);
3502		return CPL_RET_BUF_DONE;
3503	}
3504#endif
3505
3506	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3507	return (0);
3508}
3509
3510/*
3511 * Called when a connection is established to translate the TCP options
3512 * reported by HW to FreeBSD's native format.
3513 */
3514static void
3515assign_rxopt(struct socket *so, unsigned int opt)
3516{
3517	struct tcpcb *tp = so_sototcpcb(so);
3518	struct toepcb *toep = tp->t_toe;
3519	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3520
3521	inp_lock_assert(tp->t_inpcb);
3522
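	/* Negotiated MSS is the path MTU less 40 bytes of IP and TCP headers. */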
3523	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3524	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3525	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3526	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3527	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3528	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3529		tp->rcv_scale = tp->request_r_scale;
3530}
3531
3532/*
3533 * Completes some final bits of initialization for just established connections
3534 * and changes their state to TCPS_ESTABLISHED.
3535 *
3536 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3537 */
3538static void
3539make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3540{
3541	struct tcpcb *tp = so_sototcpcb(so);
3542	struct toepcb *toep = tp->t_toe;
3543
3544	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3545	assign_rxopt(so, opt);
3546
3547	/*
3548	 *XXXXXXXXXXX
3549	 *
3550	 */
3551#ifdef notyet
3552	so->so_proto->pr_ctloutput = t3_ctloutput;
3553#endif
3554
3555#if 0
3556	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3557#endif
3558	/*
3559	 * XXX not clear what rcv_wup maps to
3560	 */
3561	/*
3562	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3563	 * pass through opt0.
3564	 */
3565	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3566		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3567
3568	dump_toepcb(toep);
3569
3570#ifdef notyet
3571/*
3572 * no clean interface for marking ARP up to date
3573 */
3574	dst_confirm(sk->sk_dst_cache);
3575#endif
3576	tp->t_starttime = ticks;
3577	tp->t_state = TCPS_ESTABLISHED;
3578	soisconnected(so);
3579}
3580
3581static int
3582syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3583{
3584
3585	struct in_conninfo inc;
3586	struct tcpopt to;
3587	struct tcphdr th;
3588	int mss, wsf, sack, ts;
3589	struct mbuf *m = NULL;
3590	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3591	unsigned int opt;
3592
3593#ifdef MAC
3594#error	"no MAC support"
3595#endif
3596
3597	opt = ntohs(req->tcp_opt);
3598
3599	bzero(&to, sizeof(struct tcpopt));
3600
3601	/*
3602	 * Fill out information for looking up our entry in the syncache
3603	 */
3604	bzero(&inc, sizeof(inc));
3605	inc.inc_fport = th.th_sport = req->peer_port;
3606	inc.inc_lport = th.th_dport = req->local_port;
3607	th.th_seq = req->rcv_isn;
3608	th.th_flags = TH_ACK;
3609
3610	inc.inc_isipv6 = 0;
3611	inc.inc_len = 0;
3612	inc.inc_faddr.s_addr = req->peer_ip;
3613	inc.inc_laddr.s_addr = req->local_ip;
3614
3615	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3616	wsf  = G_TCPOPT_WSCALE_OK(opt);
3617	ts   = G_TCPOPT_TSTAMP(opt);
3618	sack = G_TCPOPT_SACK(opt);
3619
3620	to.to_mss = mss;
3621	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3622	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3623
3624	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3625	    ntohl(req->local_ip), ntohs(req->local_port),
3626	    ntohl(req->peer_ip), ntohs(req->peer_port),
3627	    mss, wsf, ts, sack);
3628	return (tcp_offload_syncache_expand(&inc, &to, &th, so, m));
3629}
3630
3631
3632/*
3633 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3634 * if we are in TCP_SYN_RECV due to crossed SYNs
3635 */
3636static int
3637do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3638{
3639	struct cpl_pass_establish *req = cplhdr(m);
3640	struct toepcb *toep = (struct toepcb *)ctx;
3641	struct tcpcb *tp = toep->tp_tp;
3642	struct socket *so, *lso;
3643	struct t3c_data *td = T3C_DATA(cdev);
3644	struct sockbuf *snd, *rcv;
3645
3646	// Complete socket initialization now that we have the SND_ISN
3647
3648	struct toedev *tdev;
3649
3650
3651	tdev = toep->tp_toedev;
3652
3653	inp_wlock(tp->t_inpcb);
3654
3655	/*
3656	 * XXX need to add a reference to the socket while we
3657	 * are manipulating it
3658	 */
3659	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3660
3661	inp_wunlock(tp->t_inpcb);
3662
3663	so_lock(so);
3664	LIST_REMOVE(toep, synq_entry);
3665	so_unlock(so);
3666
3667	if (!syncache_expand_establish_req(req, &so, toep)) {
3668		/*
3669		 * No entry
3670		 */
3671		CXGB_UNIMPLEMENTED();
3672	}
3673	if (so == NULL) {
3674		/*
3675		 * Couldn't create the socket
3676		 */
3677		CXGB_UNIMPLEMENTED();
3678	}
3679
3680	tp = so_sototcpcb(so);
3681	inp_wlock(tp->t_inpcb);
3682
3683	snd = so_sockbuf_snd(so);
3684	rcv = so_sockbuf_rcv(so);
3685
3686	snd->sb_flags |= SB_NOCOALESCE;
3687	rcv->sb_flags |= SB_NOCOALESCE;
3688
3689	toep->tp_tp = tp;
3690	toep->tp_flags = 0;
3691	tp->t_toe = toep;
3692	reset_wr_list(toep);
3693	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3694	tp->rcv_nxt = toep->tp_copied_seq;
3695	install_offload_ops(so);
3696
3697	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3698	toep->tp_wr_unacked = 0;
3699	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3700	toep->tp_qset_idx = 0;
3701	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3702
3703	/*
3704	 * XXX Cancel any keep alive timer
3705	 */
3706
3707	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3708
3709	/*
3710	 * XXX workaround for lack of syncache drop
3711	 */
3712	toepcb_release(toep);
3713	inp_wunlock(tp->t_inpcb);
3714
3715	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3716	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3717#ifdef notyet
3718	/*
3719	 * XXX not sure how these checks map to us
3720	 */
3721	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3722		sk->sk_state_change(sk);
3723		sk_wake_async(so, 0, POLL_OUT);
3724	}
3725	/*
3726	 * The state for the new connection is now up to date.
3727	 * Next check if we should add the connection to the parent's
3728	 * accept queue.  When the parent closes it resets connections
3729	 * on its SYN queue, so check if we are being reset.  If so we
3730	 * don't need to do anything more, the coming ABORT_RPL will
3731	 * destroy this socket.  Otherwise move the connection to the
3732	 * accept queue.
3733	 *
3734	 * Note that we reset the synq before closing the server so if
3735	 * we are not being reset the stid is still open.
3736	 */
3737	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3738		__kfree_skb(skb);
3739		goto unlock;
3740	}
3741#endif
3742	m_free(m);
3743
3744	return (0);
3745}
3746
3747/*
3748 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3749 * and send them to the TOE.
3750 */
3751static void
3752fixup_and_send_ofo(struct toepcb *toep)
3753{
3754	struct mbuf *m;
3755	struct toedev *tdev = toep->tp_toedev;
3756	struct tcpcb *tp = toep->tp_tp;
3757	unsigned int tid = toep->tp_tid;
3758
3759	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3760
3761	inp_lock_assert(tp->t_inpcb);
3762	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3763		/*
3764		 * A variety of messages can be waiting but the fields we'll
3765		 * be touching are common to all so any message type will do.
3766		 */
3767		struct cpl_close_con_req *p = cplhdr(m);
3768
3769		p->wr.wr_lo = htonl(V_WR_TID(tid));
3770		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3771		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3772	}
3773}
3774
3775/*
3776 * Updates socket state from an active establish CPL message.  Runs with the
3777 * socket lock held.
3778 */
3779static void
3780socket_act_establish(struct socket *so, struct mbuf *m)
3781{
3782	INIT_VNET_INET(so->so_vnet);
3783	struct cpl_act_establish *req = cplhdr(m);
3784	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3785	struct tcpcb *tp = so_sototcpcb(so);
3786	struct toepcb *toep = tp->t_toe;
3787
3788	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3789		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3790		    toep->tp_tid, tp->t_state);
3791
3792	tp->ts_recent_age = ticks;
3793	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3794	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3795
3796	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3797
3798	/*
3799	 * Now that we finally have a TID send any CPL messages that we had to
3800	 * defer for lack of a TID.
3801	 */
3802	if (mbufq_len(&toep->out_of_order_queue))
3803		fixup_and_send_ofo(toep);
3804
3805	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3806		/*
3807		 * XXX does this even make sense?
3808		 */
3809		so_sorwakeup(so);
3810	}
3811	m_free(m);
3812#ifdef notyet
3813/*
3814 * XXX assume no write requests permitted while socket connection is
3815 * incomplete
3816 */
3817	/*
3818	 * Currently the send queue must be empty at this point because the
3819	 * socket layer does not send anything before a connection is
3820	 * established.  To be future proof though we handle the possibility
3821	 * that there are pending buffers to send (either TX_DATA or
3822	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3823	 * buffers according to the just learned write_seq, and then we send
3824	 * them on their way.
3825	 */
3826	fixup_pending_writeq_buffers(sk);
3827	if (t3_push_frames(so, 1))
3828		sk->sk_write_space(sk);
3829#endif
3830
3831	toep->tp_state = tp->t_state;
3832	V_tcpstat.tcps_connects++;
3833
3834}
3835
3836/*
3837 * Process a CPL_ACT_ESTABLISH message.
3838 */
3839static int
3840do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3841{
3842	struct cpl_act_establish *req = cplhdr(m);
3843	unsigned int tid = GET_TID(req);
3844	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3845	struct toepcb *toep = (struct toepcb *)ctx;
3846	struct tcpcb *tp = toep->tp_tp;
3847	struct socket *so;
3848	struct toedev *tdev;
3849	struct tom_data *d;
3850
3851	if (tp == NULL) {
3852		free_atid(cdev, atid);
3853		return (0);
3854	}
3855	inp_wlock(tp->t_inpcb);
3856
3857	/*
3858	 * XXX
3859	 */
3860	so = inp_inpcbtosocket(tp->t_inpcb);
3861	tdev = toep->tp_toedev; /* blow up here if link was down */
3862	d = TOM_DATA(tdev);
3863
3864	/*
3865	 * It's OK if the TID is currently in use, the owning socket may have
3866	 * backlogged its last CPL message(s).  Just take it away.
3867	 */
3868	toep->tp_tid = tid;
3869	toep->tp_tp = tp;
3870	so_insert_tid(d, toep, tid);
3871	free_atid(cdev, atid);
3872	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3873
3874	socket_act_establish(so, m);
3875	inp_wunlock(tp->t_inpcb);
3876	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3877	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3878
3879	return (0);
3880}
3881
3882/*
3883 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3884 * next batch of work requests from the write queue.
3885 */
3886static void
3887wr_ack(struct toepcb *toep, struct mbuf *m)
3888{
3889	struct tcpcb *tp = toep->tp_tp;
3890	struct cpl_wr_ack *hdr = cplhdr(m);
3891	struct socket *so;
3892	unsigned int credits = ntohs(hdr->credits);
3893	u32 snd_una = ntohl(hdr->snd_una);
3894	int bytes = 0;
3895	struct sockbuf *snd;
3896
3897	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3898
3899	inp_wlock(tp->t_inpcb);
3900	so = inp_inpcbtosocket(tp->t_inpcb);
3901	toep->tp_wr_avail += credits;
3902	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3903		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3904
3905	while (credits) {
3906		struct mbuf *p = peek_wr(toep);
3907
3908		if (__predict_false(!p)) {
3909			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3910			    "nothing pending, state %u wr_avail=%u\n",
3911			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3912			break;
3913		}
3914		CTR2(KTR_TOM,
3915			"wr_ack: p->credits=%d p->bytes=%d",
3916		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3917		KASSERT(p->m_pkthdr.csum_data != 0,
3918		    ("empty request still on list"));
3919
3920		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3922#if DEBUG_WR > 1
3923			struct tx_data_wr *w = cplhdr(p);
3924			log(LOG_ERR,
3925			       "TID %u got %u WR credits, need %u, len %u, "
3926			       "seq # %u, ACK una %u, ACK nxt %u, "
3927			       "WR_AVAIL %u, WRs pending %u\n",
3928			       toep->tp_tid, credits, p->m_pkthdr.csum_data,
3929			       p->m_pkthdr.len, ntohl(w->sndseq), snd_una,
3930			       ntohl(hdr->snd_nxt), toep->tp_wr_avail,
3931			       count_pending_wrs(tp) - credits);
3932#endif
3933			p->m_pkthdr.csum_data -= credits;
3934			break;
3935		} else {
3936			dequeue_wr(toep);
3937			credits -= p->m_pkthdr.csum_data;
3938			bytes += p->m_pkthdr.len;
3939			CTR3(KTR_TOM,
3940			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3941			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3942
3943			m_free(p);
3944		}
3945	}
3946
3947#if DEBUG_WR
3948	check_wr_invariants(tp);
3949#endif
3950
3951	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3952#if VALIDATE_SEQ
3953		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3954
3955		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3956		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3957		    toep->tp_tid, tp->snd_una);
3958#endif
3959		goto out_free;
3960	}
3961
3962	if (tp->snd_una != snd_una) {
3963		tp->snd_una = snd_una;
3964		tp->ts_recent_age = ticks;
3965#ifdef notyet
3966		/*
3967		 * Keep ARP entry "minty fresh"
3968		 */
3969		dst_confirm(sk->sk_dst_cache);
3970#endif
3971		if (tp->snd_una == tp->snd_nxt)
3972			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3973	}
3974
3975	snd = so_sockbuf_snd(so);
3976	if (bytes) {
3977		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3979		sockbuf_lock(snd);
3980		sbdrop_locked(snd, bytes);
3981		so_sowwakeup_locked(so);
3982	}
3983
3984	if (snd->sb_sndptroff < snd->sb_cc)
3985		t3_push_frames(so, 0);
3986
3987out_free:
3988	inp_wunlock(tp->t_inpcb);
3989	m_free(m);
3990}
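
/*
 * Illustrative sketch (hypothetical helper, not driver code): the loop in
 * wr_ack() retires whole WRs while credits last and leaves a partially
 * acknowledged WR at the queue head with its remaining credit count.  A
 * minimal standalone model of that accounting:
 */
#if 0
static unsigned int
example_consume_credits(unsigned int *wr_credits, int nwrs,
    unsigned int credits)
{
	unsigned int retired = 0;
	int i;

	for (i = 0; i < nwrs && credits != 0; i++) {
		if (credits < wr_credits[i]) {
			wr_credits[i] -= credits;	/* partially ACKed WR */
			credits = 0;
		} else {
			credits -= wr_credits[i];	/* WR fully ACKed */
			retired++;
		}
	}
	return (retired);
}
#endif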
3991
3992/*
3993 * Handler for TX_DMA_ACK CPL messages.
3994 */
3995static int
3996do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3997{
3998	struct toepcb *toep = (struct toepcb *)ctx;
3999
4000	VALIDATE_SOCK(so);
4001
4002	wr_ack(toep, m);
4003	return (0);
4004}
4005
4006/*
4007 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4008 */
4009static int
4010do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4011{
4012	m_freem(m);
4013	return (0);
4014}
4015
4016/*
4017 * Reset a connection that is on a listener's SYN queue or accept queue,
4018 * i.e., one that has not had a struct socket associated with it.
4019 * Must be called from process context.
4020 *
4021 * Modeled after code in inet_csk_listen_stop().
4022 */
4023static void
4024t3_reset_listen_child(struct socket *child)
4025{
4026	struct tcpcb *tp = so_sototcpcb(child);
4027
4028	t3_send_reset(tp->t_toe);
4029}
4030
4031
4032static void
4033t3_child_disconnect(struct socket *so, void *arg)
4034{
4035	struct tcpcb *tp = so_sototcpcb(so);
4036
4037	if (tp->t_flags & TF_TOE) {
4038		inp_wlock(tp->t_inpcb);
4039		t3_reset_listen_child(so);
4040		inp_wunlock(tp->t_inpcb);
4041	}
4042}
4043
4044/*
4045 * Disconnect offloaded established but not yet accepted connections sitting
4046 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4047 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4048 */
4049void
4050t3_disconnect_acceptq(struct socket *listen_so)
4051{
4052
4053	so_lock(listen_so);
4054	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4055	so_unlock(listen_so);
4056}
4057
4058/*
4059 * Reset offloaded connections sitting on a server's syn queue.  As above
4060 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4061 */
4063void
4064t3_reset_synq(struct listen_ctx *lctx)
4065{
4066	struct toepcb *toep;
4067
4068	so_lock(lctx->lso);
4069	while (!LIST_EMPTY(&lctx->synq_head)) {
4070		toep = LIST_FIRST(&lctx->synq_head);
4071		LIST_REMOVE(toep, synq_entry);
4072		toep->tp_tp = NULL;
4073		t3_send_reset(toep);
4074		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4075		toepcb_release(toep);
4076	}
4077	so_unlock(lctx->lso);
4078}
4079
4080
4081int
4082t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4083		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4084		   unsigned int pg_off, unsigned int color)
4085{
4086	unsigned int i, j, pidx;
4087	struct pagepod *p;
4088	struct mbuf *m;
4089	struct ulp_mem_io *req;
4090	unsigned int tid = toep->tp_tid;
4091	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4092	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4093
4094	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4095	    gl, nppods, tag, maxoff, pg_off, color);
4096
4097	for (i = 0; i < nppods; ++i) {
4098		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4099		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4100		req = mtod(m, struct ulp_mem_io *);
4101		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4102		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4103		req->wr.wr_lo = 0;
4104		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4105					   V_ULPTX_CMD(ULP_MEM_WRITE));
4106		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4107				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4108
4109		p = (struct pagepod *)(req + 1);
4110		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4111			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4112			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4113						  V_PPOD_COLOR(color));
4114			p->pp_max_offset = htonl(maxoff);
4115			p->pp_page_offset = htonl(pg_off);
4116			p->pp_rsvd = 0;
4117			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4118				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4119				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4120		} else
4121			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4122		send_or_defer(toep, m, 0);
4123		ppod_addr += PPOD_SIZE;
4124	}
4125	return (0);
4126}
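
/*
 * Worked example (the sizing math here is an editorial assumption, not
 * taken from this function): the loop above writes 5 page addresses per
 * pod at a stride of 4 pages, so consecutive pods overlap by one page, and
 * each pod advances the ULP memory address by PPOD_SIZE bytes from the
 * base ddp_llimit + tag * PPOD_SIZE.  A plausible pod count for an
 * npages-long gather list would then be:
 */
#if 0
	unsigned int nppods = (npages + 3) / 4 + NUM_SENTINEL_PPODS;
	unsigned int first_addr = td->ddp_llimit + tag * PPOD_SIZE;
	unsigned int last_addr = first_addr + (nppods - 1) * PPOD_SIZE;
#endif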
4127
4128/*
4129 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4130 */
4131static inline void
4132mk_cpl_barrier_ulp(struct cpl_barrier *b)
4133{
4134	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4135
4136	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4137	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4138	b->opcode = CPL_BARRIER;
4139}
4140
4141/*
4142 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4143 */
4144static inline void
4145mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4146{
4147	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4148
4150	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4151	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4152	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4153	req->cpuno = htons(cpuno);
4154}
4155
4156/*
4157 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4158 */
4159static inline void
4160mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4161                     unsigned int word, uint64_t mask, uint64_t val)
4162{
4163	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4164
4165	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4166	    tid, word, mask, val);
4167
4168	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4169	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4170	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4171	req->reply = V_NO_REPLY(1);
4172	req->cpu_idx = 0;
4173	req->word = htons(word);
4174	req->mask = htobe64(mask);
4175	req->val = htobe64(val);
4176}
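
/*
 * Editorial sketch of the intended TP semantics (an assumption, not
 * defined in this file): a SET_TCB_FIELD performs a read-modify-write of
 * one 64-bit TCB word, clearing the bits in mask and OR-ing in val:
 */
#if 0
static uint64_t
example_apply_tcb_field(uint64_t tcb_word, uint64_t mask, uint64_t val)
{
	return ((tcb_word & ~mask) | (val & mask));
}
#endif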
4177
4178/*
4179 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4180 */
4181static void
4182mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4183    unsigned int tid, unsigned int credits)
4184{
4185	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4186
4187	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4188	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4189	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4190	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4191	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4192				 V_RX_CREDITS(credits));
4193}
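
/*
 * Usage note, with a sketch mirroring the caller in t3_setup_ddpbufs()
 * below: the credit count handed to this builder is the number of bytes
 * consumed since the last window update, after which tp_rcv_wup is brought
 * current:
 */
#if 0
	unsigned int credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	/* ... mk_rx_data_ack_ulp(toep, ack, tid, credits) ... */
	toep->tp_rcv_wup = toep->tp_copied_seq;
#endif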
4194
4195void
4196t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4197{
4198	unsigned int wrlen;
4199	struct mbuf *m;
4200	struct work_request_hdr *wr;
4201	struct cpl_barrier *lock;
4202	struct cpl_set_tcb_field *req;
4203	struct cpl_get_tcb *getreq;
4204	struct ddp_state *p = &toep->tp_ddp_state;
4205
4206#if 0
4207	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4208#endif
4209	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4210		sizeof(*getreq);
4211	m = m_gethdr_nofail(wrlen);
4212	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4213	wr = mtod(m, struct work_request_hdr *);
4214	bzero(wr, wrlen);
4215
4216	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4217	m->m_pkthdr.len = m->m_len = wrlen;
4218
4219	lock = (struct cpl_barrier *)(wr + 1);
4220	mk_cpl_barrier_ulp(lock);
4221
4222	req = (struct cpl_set_tcb_field *)(lock + 1);
4223
4224	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4225
4226	/* Hmmm, not sure if this is actually a good thing: reactivating
4227	 * the other buffer might be an issue if it has been completed
4228	 * already.  However, that is unlikely, since the fact that the UBUF
4229	 * is not completed indicates that there is no outstanding data.
4230	 */
4231	if (bufidx == 0)
4232		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4233				     V_TF_DDP_ACTIVE_BUF(1) |
4234				     V_TF_DDP_BUF0_VALID(1),
4235				     V_TF_DDP_ACTIVE_BUF(1));
4236	else
4237		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4238				     V_TF_DDP_ACTIVE_BUF(1) |
4239				     V_TF_DDP_BUF1_VALID(1), 0);
4240
4241	getreq = (struct cpl_get_tcb *)(req + 1);
4242	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4243
4244	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4245
4246	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4248	p->get_tcb_count++;
4249
4250#ifdef T3_TRACE
4251	T3_TRACE1(TIDTB(so),
4252		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4253#endif
4254	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4255}
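
/*
 * Editorial note: the compound WR built above is laid out back to back in
 * a single mbuf as
 *
 *	work_request_hdr | CPL_BARRIER | CPL_SET_TCB_FIELD |
 *	CPL_GET_TCB | CPL_BARRIER
 *
 * which is exactly what the wrlen computation accounts for:
 */
#if 0
	wrlen = sizeof(struct work_request_hdr) +
	    sizeof(struct cpl_set_tcb_field) + 2 * sizeof(struct cpl_barrier) +
	    sizeof(struct cpl_get_tcb);
#endif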
4256
4257/**
4258 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4259 * @toep: the offload control block associated with the buffers
4260 * @bufidx: index of HW DDP buffer (0 or 1)
4261 * @tag0: new tag for HW buffer 0
4262 * @tag1: new tag for HW buffer 1
4263 * @len: new length for HW buf @bufidx
4264 *
4265 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4266 * buffer by changing the buffer tag and length and setting the valid and
4267 * active flag accordingly.  The caller must ensure the new buffer is at
4268 * least as big as the existing one.  Since we typically reprogram both HW
4269 * buffers this function sets both tags for convenience. Read the TCB to
4270 * determine how made data was written into the buffer before the overlay
4271 * took place.
4272 */
4273void
4274t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4275	 	       unsigned int tag1, unsigned int len)
4276{
4277	unsigned int wrlen;
4278	struct mbuf *m;
4279	struct work_request_hdr *wr;
4280	struct cpl_get_tcb *getreq;
4281	struct cpl_set_tcb_field *req;
4282	struct ddp_state *p = &toep->tp_ddp_state;
4283
4284	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4285	    bufidx, tag0, tag1, len);
4286#if 0
4287	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4288#endif
4289	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4290	m = m_gethdr_nofail(wrlen);
4291	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4292	wr = mtod(m, struct work_request_hdr *);
4293	m->m_pkthdr.len = m->m_len = wrlen;
4294	bzero(wr, wrlen);
4295
4297	/* Set the ATOMIC flag to make sure that TP processes the following
4298	 * CPLs in an atomic manner and no wire segments can be interleaved.
4299	 */
4300	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4301	req = (struct cpl_set_tcb_field *)(wr + 1);
4302	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4303			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4304			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4305			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4306			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4307	req++;
4308	if (bufidx == 0) {
4309		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4310			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4311			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4312		req++;
4313		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4314			    V_TF_DDP_PUSH_DISABLE_0(1) |
4315			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4316			    V_TF_DDP_PUSH_DISABLE_0(0) |
4317			    V_TF_DDP_BUF0_VALID(1));
4318	} else {
4319		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4320			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4321			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4322		req++;
4323		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4324			    V_TF_DDP_PUSH_DISABLE_1(1) |
4325			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4326			    V_TF_DDP_PUSH_DISABLE_1(0) |
4327			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4328	}
4329
4330	getreq = (struct cpl_get_tcb *)(req + 1);
4331	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4332
4333	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4335	p->get_tcb_count++;
4336
4337#ifdef T3_TRACE
4338	T3_TRACE4(TIDTB(sk),
4339		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4340		  "len %d",
4341		  bufidx, tag0, tag1, len);
4342#endif
4343	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4344}
4345
4346/*
4347 * Sends a compound WR containing all the CPL messages needed to program the
4348 * two HW DDP buffers, namely optionally setting up the length and offset of
4349 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4350 */
4351void
4352t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4353		      unsigned int len1, unsigned int offset1,
4354                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4355{
4356	unsigned int wrlen;
4357	struct mbuf *m;
4358	struct work_request_hdr *wr;
4359	struct cpl_set_tcb_field *req;
4360
4361	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4362	    len0, offset0, len1, offset1, (unsigned int)(ddp_flags >> 32),
4362	    (unsigned int)(ddp_flags & 0xffffffff));
4363
4364#if 0
4365	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4366#endif
4367	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4368		(len1 ? sizeof(*req) : 0) +
4369		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4370	m = m_gethdr_nofail(wrlen);
4371	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4372	wr = mtod(m, struct work_request_hdr *);
4373	bzero(wr, wrlen);
4374
4375	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4376	m->m_pkthdr.len = m->m_len = wrlen;
4377
4378	req = (struct cpl_set_tcb_field *)(wr + 1);
4379	if (len0) {                  /* program buffer 0 offset and length */
4380		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4381			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4382			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4383			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4384			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4385		req++;
4386	}
4387	if (len1) {                  /* program buffer 1 offset and length */
4388		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4389			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4390			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4391			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4392			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4393		req++;
4394	}
4395
4396	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4397			     ddp_flags);
4398
4399	if (modulate) {
4400		mk_rx_data_ack_ulp(toep,
4401		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4402		    toep->tp_copied_seq - toep->tp_rcv_wup);
4403		toep->tp_rcv_wup = toep->tp_copied_seq;
4404	}
4405
4406#ifdef T3_TRACE
4407	T3_TRACE5(TIDTB(sk),
4408		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4409		  "modulate %d",
4410		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4411		  modulate);
4412#endif
4413
4414	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4415}
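
/*
 * Usage sketch (hypothetical caller, hedged): to make HW buffer 0 valid
 * and the active buffer while leaving the remaining DDP flags untouched, a
 * caller could program:
 */
#if 0
	uint64_t mask = V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
	uint64_t flags = V_TF_DDP_BUF0_VALID(1);	/* active buf index 0 */

	t3_setup_ddpbufs(toep, len0, offset0, 0, 0, flags, mask, 1);
#endif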
4416
4417void
4418t3_init_wr_tab(unsigned int wr_len)
4419{
4420	int i;
4421
4422	if (mbuf_wrs[1])     /* already initialized */
4423		return;
4424
4425	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4426		int sgl_len = (3 * i) / 2 + (i & 1);
4427
4428		sgl_len += 3;
4429		mbuf_wrs[i] = sgl_len <= wr_len ?
4430		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4431	}
4432
4433	wrlen = wr_len * 8;
4434}
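
/*
 * Worked example of the sizing math above (flits are 8 bytes on T3): an
 * SGL for i buffers takes (3 * i) / 2 + (i & 1) flits, plus 3 more flits,
 * presumably for the WR and TX_DATA header overhead.  With wr_len = 9
 * flits:
 *
 *	i = 4: sgl_len = 6 + 3 = 9   -> mbuf_wrs[4] = 1 WR
 *	i = 8: sgl_len = 12 + 3 = 15 -> 1 + (15 - 2) / (9 - 1) = 2 WRs
 */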
4435
4436int
4437t3_init_cpl_io(void)
4438{
4439#ifdef notyet
4440	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4441	if (!tcphdr_skb) {
4442		log(LOG_ERR,
4443		       "Chelsio TCP offload: can't allocate sk_buff\n");
4444		return -1;
4445	}
4446	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4447	tcphdr_skb->h.raw = tcphdr_skb->data;
4448	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4449#endif
4450
4451	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4452	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4453	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4454	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4455	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4456	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4457	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4458	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4459	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4460	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4461	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4462	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4463	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4464	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4465	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4466	return (0);
4467}
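
/*
 * Editorial sketch (hypothetical handler): every handler registered above
 * follows the same contract: it is handed the t3cdev, the CPL message in
 * an mbuf, and the per-TID context registered for the connection, and it
 * is responsible for consuming the mbuf.  A minimal sink handler:
 */
#if 0
static int
example_do_sink(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	m_freem(m);		/* this handler just consumes the CPL */
	return (0);
}
#endif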
4468
4469