cxgb_cpl_io.c revision 191816
/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 191816 2009-05-05 10:56:12Z zec $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_tcp_do_autosndbuf tcp_do_autosndbuf
#define V_tcp_autosndbuf_max tcp_autosndbuf_max
#define V_tcp_do_rfc1323 tcp_do_rfc1323
#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
#define V_tcpstat tcpstat
#endif

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <cxgb_osdep.h>
#include <sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#if __FreeBSD_version >= 800056
#include <netinet/vinet.h>
#endif

#include <t3cdev.h>
#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <sys/mvec.h>
#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_l2t.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
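
/*
 * Illustrative sketch (not part of the driver): a sender that queues a ULP
 * packet with submode `submode` must advance its sequence-space counters by
 * the payload length plus the HW-inserted bytes from the table above.  The
 * helper name below is hypothetical.
 *
 *	static inline unsigned int
 *	ulp_extra_len_example(unsigned int submode)
 *	{
 *		return (t3_ulp_extra_len[submode & 3]);
 *	}
 *
 *	tp->snd_nxt += payload_len + ulp_extra_len_example(submode);
 */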

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * number of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
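
/*
 * Sketch of how a table like mbuf_wrs[] is typically populated at attach
 * time (hypothetical code, the real initialization lives elsewhere in the
 * TOM; `sgl_per_wr` stands for the number of gather-list entries one WR of
 * size wrlen can carry):
 *
 *	int i, sgl_per_wr = (wrlen - sizeof(struct tx_data_wr)) / 8;
 *
 *	for (i = 1; i <= TX_MAX_SEGS; i++)
 *		mbuf_wrs[i] = howmany(i, sgl_per_wr);
 *
 * t3_push_frames() can then look up the WR cost of an i-entry gather list
 * in constant time.
 */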

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
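
/*
 * Usage sketch (illustrative only): releasing a TID follows the usual CPL
 * pattern of sizing an mbuf for the request, populating it, and handing it
 * to the offload send path:
 *
 *	struct mbuf *m = m_gethdr_nofail(sizeof(struct cpl_tid_release));
 *
 *	mk_tid_release(m, toep, toep->tp_tid);
 *	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 */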

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	INIT_VNET_INET(so->so_vnet);
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Send buffer is expressed in units of 32KB. */
		if (V_tcp_do_autosndbuf && (snd->sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}
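
/*
 * Worked example for the V_TX_SNDBUF field above: it is expressed in 32KB
 * (1 << 15) units, so a 256KB send buffer programs 262144 >> 15 == 8, while
 * the autosize case programs V_tcp_autosndbuf_max >> 15 regardless of the
 * current sb_hiwat.
 */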

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits in-line, then
		 * make an immediate-data WR.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}

		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}
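
/*
 * Credit-accounting sketch for the loop above (numbers hypothetical): each
 * WR costs mbuf_wrs[count] credits, taken from tp_wr_avail and parked in
 * tp_wr_unacked until the HW completes it.  With tp_wr_max = 16:
 *
 *	WR of 2 credits sent:	tp_wr_avail 16 -> 14, tp_wr_unacked 0 -> 2
 *	further WRs sent:	tp_wr_unacked grows to 8 == tp_wr_max / 2,
 *				so F_WR_COMPL is set and tp_wr_unacked resets
 *				to 0; the completion returns the credits to
 *				tp_wr_avail.
 */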

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no-RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}
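
/*
 * Caller sketch (illustrative): `credits` is the number of bytes the host
 * has consumed since the last update, i.e. the gap between tp_copied_seq
 * and tp_rcv_wup, and the caller advances tp_rcv_wup by the return value:
 *
 *	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
 *	toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, 0);
 *
 * This is exactly the pattern t3_cleanup_rbuf() uses below.
 */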

/*
 * Send an RX_DATA_ACK CPL message to request that a modulation timer be
 * scheduled.  This is only used in DDP mode, so we take the opportunity to
 * also set the DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
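
/*
 * Worked example for the must_send test above (numbers hypothetical): with
 * rcv_wnd = 64KB, pending credits of 48KB force an update because
 * 49152 + 16384 >= 65536, even if 48KB is below rx_credit_thres; smaller
 * amounts are batched until credits >= rx_credit_thres.
 */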

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}


static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};


static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	if (toep == NULL)
		return;

	tp = toep->tp_tp;
	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to the freelist).  [Note that the peer
 * should set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}
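
/*
 * Usage sketch (illustrative, parameter values hypothetical): posting a
 * 16KB buffer as DDP buffer 0 at offset 0 and turning DDP on for the
 * connection:
 *
 *	t3_set_ddp_tag(toep, 0, tag);
 *	t3_set_ddp_buf(toep, 0, 0, 16 * 1024);
 *	t3_enable_ddp(toep, 1);
 */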

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
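
/*
 * Worked example (hypothetical MTU table): with mtus[] = { 576, 1492, 1500,
 * 9000 }, find_best_mtu(d, 1500) returns index 2, find_best_mtu(d, 1499)
 * returns index 1, and any target below 576 returns index 0 (the smallest
 * entry acts as the floor).
 */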

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
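
/*
 * Worked example (assuming the hypothetical table above and a 1500-byte
 * path MTU): select_mss() clamps t_maxseg to pmtu - 40 = 1460 (40 bytes of
 * IPv4 + TCP headers), looks up the best table index for 1460 + 40 = 1500,
 * and re-derives t_maxseg = mtus[idx] - 40 = 1460 from the entry that is
 * actually programmed into the HW.
 */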

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space, struct vnet *vnet)
{
	INIT_VNET_INET(vnet);
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale)
			;

	return (wscale);
}
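
/*
 * Worked example: a 256KB target window yields wscale = 3, since
 * 262144 >> 1 = 131072, >> 2 = 65536, >> 3 = 32768 <= 65535; the loop stops
 * at the first shift that fits the advertised window into 16 bits, capped
 * at 14 per RFC 1323.
 */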

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	INIT_VNET_INET(so->so_vnet);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);

	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}
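
/*
 * Usage sketch (illustrative): a setsockopt(2) on an offloaded connection
 * is routed through here, e.g. disabling Nagle from userland,
 *
 *	int one = 1;
 *	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *
 * flows t3_ctloutput() -> t3_tcp_ctloutput() -> t3_set_nagle(), which flips
 * the TCB flag in HW via a SET_TCB_FIELD CPL.
 */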
1781
1782/*
1783 * Returns true if we need to explicitly request RST when we receive new data
1784 * on an RX-closed connection.
1785 */
1786static inline int
1787need_rst_on_excess_rx(const struct toepcb *toep)
1788{
1789	return (1);
1790}
1791
1792/*
1793 * Handles Rx data that arrives in a state where the socket isn't accepting
1794 * new data.
1795 */
1796static void
1797handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1798{
1799
1800	if (need_rst_on_excess_rx(toep) &&
1801	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1802		t3_send_reset(toep);
1803	m_freem(m);
1804}
1805
1806/*
1807 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1808 * by getting the DDP offset from the TCB.
1809 */
1810static void
1811tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1812{
1813	struct ddp_state *q = &toep->tp_ddp_state;
1814	struct ddp_buf_state *bsp;
1815	struct cpl_get_tcb_rpl *hdr;
1816	unsigned int ddp_offset;
1817	struct socket *so;
1818	struct tcpcb *tp;
1819	struct sockbuf *rcv;
1820	int state;
1821
1822	uint64_t t;
1823	__be64 *tcb;
1824
1825	tp = toep->tp_tp;
1826	so = inp_inpcbtosocket(tp->t_inpcb);
1827
1828	inp_lock_assert(tp->t_inpcb);
1829	rcv = so_sockbuf_rcv(so);
1830	sockbuf_lock(rcv);
1831
1832	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1833	 * We really need a cookie in order to dispatch the RPLs.
1834	 */
1835	q->get_tcb_count--;
1836
1837	/* It is a possible that a previous CPL already invalidated UBUF DDP
1838	 * and moved the cur_buf idx and hence no further processing of this
1839	 * skb is required. However, the app might be sleeping on
1840	 * !q->get_tcb_count and we need to wake it up.
1841	 */
1842	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1843		int state = so_state_get(so);
1844
1845		m_freem(m);
1846		if (__predict_true((state & SS_NOFDREF) == 0))
1847			so_sorwakeup_locked(so);
1848		else
1849			sockbuf_unlock(rcv);
1850
1851		return;
1852	}
1853
1854	bsp = &q->buf_state[q->cur_buf];
1855	hdr = cplhdr(m);
1856	tcb = (__be64 *)(hdr + 1);
1857	if (q->cur_buf == 0) {
1858		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1859		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1860	} else {
1861		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1862		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1863	}
1864	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1865	m->m_cur_offset = bsp->cur_offset;
1866	bsp->cur_offset = ddp_offset;
1867	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1868
1869	CTR5(KTR_TOM,
1870	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1871	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1872	KASSERT(ddp_offset >= m->m_cur_offset,
1873	    ("ddp_offset=%u less than cur_offset=%u",
1874		ddp_offset, m->m_cur_offset));
1875
1876#if 0
1877{
1878	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1879
1880	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1881	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1882
1883        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1884        rcv_nxt = t >> S_TCB_RCV_NXT;
1885        rcv_nxt &= M_TCB_RCV_NXT;
1886
1887        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1888        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1889        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1890
1891	T3_TRACE2(TIDTB(sk),
1892		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1893		  ddp_flags, rcv_nxt - rx_hdr_offset);
1894	T3_TRACE4(TB(q),
1895		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1896		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1897	T3_TRACE3(TB(q),
1898		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1899		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1900	T3_TRACE2(TB(q),
1901		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1902		 q->buf_state[0].flags, q->buf_state[1].flags);
1903
1904}
1905#endif
1906	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1907		handle_excess_rx(toep, m);
1908		return;
1909	}
1910
1911#ifdef T3_TRACE
1912	if ((int)m->m_pkthdr.len < 0) {
1913		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1914	}
1915#endif
1916	if (bsp->flags & DDP_BF_NOCOPY) {
1917#ifdef T3_TRACE
1918		T3_TRACE0(TB(q),
1919			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1920
1921		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1922			printk("!cancel_ubuf");
1923			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1924		}
1925#endif
1926		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1927		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1928		q->cur_buf ^= 1;
1929	} else if (bsp->flags & DDP_BF_NOFLIP) {
1930
1931		m->m_ddp_flags = 1;    /* always a kernel buffer */
1932
1933		/* now HW buffer carries a user buffer */
1934		bsp->flags &= ~DDP_BF_NOFLIP;
1935		bsp->flags |= DDP_BF_NOCOPY;
1936
1937		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1938		 * any new data in which case we're done. If in addition the
1939		 * offset is 0, then there wasn't a completion for the kbuf
1940		 * and we need to decrement the posted count.
1941		 */
1942		if (m->m_pkthdr.len == 0) {
1943			if (ddp_offset == 0) {
1944				q->kbuf_posted--;
1945				bsp->flags |= DDP_BF_NODATA;
1946			}
1947			sockbuf_unlock(rcv);
1948			m_free(m);
1949			return;
1950		}
1951	} else {
1952		sockbuf_unlock(rcv);
1953
1954		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1955		 * but it got here way late and nobody cares anymore.
1956		 */
1957		m_free(m);
1958		return;
1959	}
1960
1961	m->m_ddp_gl = (unsigned char *)bsp->gl;
1962	m->m_flags |= M_DDP;
1963	m->m_seq = tp->rcv_nxt;
1964	tp->rcv_nxt += m->m_pkthdr.len;
1965	tp->t_rcvtime = ticks;
1966	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1967		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1968	if (m->m_pkthdr.len == 0) {
1969		q->user_ddp_pending = 0;
1970		m_free(m);
1971	} else
1972		SBAPPEND(rcv, m);
1973
1974	state = so_state_get(so);
1975	if (__predict_true((state & SS_NOFDREF) == 0))
1976		so_sorwakeup_locked(so);
1977	else
1978		sockbuf_unlock(rcv);
1979}
1980
1981/*
1982 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1983 * in that case they are similar to DDP completions.
1984 */
1985static int
1986do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1987{
1988	struct toepcb *toep = (struct toepcb *)ctx;
1989
1990	/* OK if socket doesn't exist */
1991	if (toep == NULL) {
1992		printf("null toep in do_get_tcb_rpl\n");
1993		return (CPL_RET_BUF_DONE);
1994	}
1995
1996	inp_wlock(toep->tp_tp->t_inpcb);
1997	tcb_rpl_as_ddp_complete(toep, m);
1998	inp_wunlock(toep->tp_tp->t_inpcb);
1999
2000	return (0);
2001}
2002
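/*
 * Handle the DDP component of a CPL_RX_DATA message: any gap between
 * tp->rcv_nxt and the sequence number carried in the CPL is payload the HW
 * has already placed into the current DDP buffer, so describe that data in
 * this mbuf's DDP fields before the caller processes the in-CPL payload.
 */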
2003static void
2004handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2005{
2006	struct tcpcb *tp = toep->tp_tp;
2007	struct socket *so;
2008	struct ddp_state *q;
2009	struct ddp_buf_state *bsp;
2010	struct cpl_rx_data *hdr = cplhdr(m);
2011	unsigned int rcv_nxt = ntohl(hdr->seq);
2012	struct sockbuf *rcv;
2013
2014	if (tp->rcv_nxt == rcv_nxt)
2015		return;
2016
2017	inp_lock_assert(tp->t_inpcb);
2018	so  = inp_inpcbtosocket(tp->t_inpcb);
2019	rcv = so_sockbuf_rcv(so);
2020	sockbuf_lock(rcv);
2021
2022	q = &toep->tp_ddp_state;
2023	bsp = &q->buf_state[q->cur_buf];
2024	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x did not advance past tp->rcv_nxt=0x%08x",
2025		rcv_nxt, tp->rcv_nxt));
2026	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2027	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2028	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2029	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2030
2031#ifdef T3_TRACE
2032	if ((int)m->m_pkthdr.len < 0) {
2033		t3_ddp_error(so, "handle_ddp_data: neg len");
2034	}
2035#endif
2036	m->m_ddp_gl = (unsigned char *)bsp->gl;
2037	m->m_flags |= M_DDP;
2038	m->m_cur_offset = bsp->cur_offset;
2039	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2040	if (bsp->flags & DDP_BF_NOCOPY)
2041		bsp->flags &= ~DDP_BF_NOCOPY;
2042
2043	m->m_seq = tp->rcv_nxt;
2044	tp->rcv_nxt = rcv_nxt;
2045	bsp->cur_offset += m->m_pkthdr.len;
2046	if (!(bsp->flags & DDP_BF_NOFLIP))
2047		q->cur_buf ^= 1;
2048	/*
2049	 * For now, don't re-enable DDP after a connection fell out of DDP
2050	 * mode.
2051	 */
2052	q->ubuf_ddp_ready = 0;
2053	sockbuf_unlock(rcv);
2054}
2055
2056/*
2057 * Process new data received for a connection.
2058 */
2059static void
2060new_rx_data(struct toepcb *toep, struct mbuf *m)
2061{
2062	struct cpl_rx_data *hdr = cplhdr(m);
2063	struct tcpcb *tp = toep->tp_tp;
2064	struct socket *so;
2065	struct sockbuf *rcv;
2066	int state;
2067	int len = be16toh(hdr->len);
2068
2069	inp_wlock(tp->t_inpcb);
2070
2071	so  = inp_inpcbtosocket(tp->t_inpcb);
2072
2073	if (__predict_false(so_no_receive(so))) {
2074		handle_excess_rx(toep, m);
2075		inp_wunlock(tp->t_inpcb);
2076		TRACE_EXIT;
2077		return;
2078	}
2079
2080	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2081		handle_ddp_data(toep, m);
2082
2083	m->m_seq = ntohl(hdr->seq);
2084	m->m_ulp_mode = 0;                    /* for iSCSI */
2085
2086#if VALIDATE_SEQ
2087	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2088		log(LOG_ERR,
2089		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2090		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2091		       tp->rcv_nxt);
2092		m_freem(m);
2093		inp_wunlock(tp->t_inpcb);
2094		return;
2095	}
2096#endif
2097	m_adj(m, sizeof(*hdr));
2098
2099#ifdef URGENT_DATA_SUPPORTED
2100	/*
2101	 * We don't handle urgent data yet
2102	 */
2103	if (__predict_false(hdr->urg))
2104		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2105	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2106		     tp->urg_seq - tp->rcv_nxt < skb->len))
2107		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2108							 tp->rcv_nxt];
2109#endif
2110	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2111		toep->tp_delack_mode = hdr->dack_mode;
2112		toep->tp_delack_seq = tp->rcv_nxt;
2113	}
2114	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2115	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2116
2117	if (len < m->m_pkthdr.len)
2118		m->m_pkthdr.len = m->m_len = len;
2119
2120	tp->rcv_nxt += m->m_pkthdr.len;
2121	tp->t_rcvtime = ticks;
2122	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2123	CTR2(KTR_TOM,
2124	    "new_rx_data: seq 0x%x len %u",
2125	    m->m_seq, m->m_pkthdr.len);
2126	inp_wunlock(tp->t_inpcb);
2127	rcv = so_sockbuf_rcv(so);
2128	sockbuf_lock(rcv);
2129#if 0
2130	if (sb_notify(rcv))
2131		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2132#endif
2133	SBAPPEND(rcv, m);
2134
2135#ifdef notyet
2136	/*
2137	 * We're giving too many credits to the card, so this check is
2138	 * disabled for now to keep things moving.
2139	 */
2140	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2142	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2143		so, rcv->sb_cc, rcv->sb_mbmax));
2144#endif
2145
2146
2147	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2148	    rcv->sb_cc, rcv->sb_mbcnt);
2149
2150	state = so_state_get(so);
2151	if (__predict_true((state & SS_NOFDREF) == 0))
2152		so_sorwakeup_locked(so);
2153	else
2154		sockbuf_unlock(rcv);
2155}
2156
2157/*
2158 * Handler for RX_DATA CPL messages.
2159 */
2160static int
2161do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2162{
2163	struct toepcb *toep = (struct toepcb *)ctx;
2164
2165	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2166
2167	new_rx_data(toep, m);
2168
2169	return (0);
2170}
2171
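/*
 * Process new data received via DDP: the payload has already been placed
 * into a posted DDP buffer and the CPL_RX_DATA_DDP merely reports where,
 * so build a zero-copy mbuf describing it and append that to the receive
 * buffer.
 */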
2172static void
2173new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2174{
2175	struct tcpcb *tp;
2176	struct ddp_state *q;
2177	struct ddp_buf_state *bsp;
2178	struct cpl_rx_data_ddp *hdr;
2179	struct socket *so;
2180	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2181	int nomoredata = 0;
2182	unsigned int delack_mode;
2183	struct sockbuf *rcv;
2184
2185	tp = toep->tp_tp;
2186	inp_wlock(tp->t_inpcb);
2187	so = inp_inpcbtosocket(tp->t_inpcb);
2188
2189	if (__predict_false(so_no_receive(so))) {
2190
2191		handle_excess_rx(toep, m);
2192		inp_wunlock(tp->t_inpcb);
2193		return;
2194	}
2195
2196	q = &toep->tp_ddp_state;
2197	hdr = cplhdr(m);
2198	ddp_report = ntohl(hdr->u.ddp_report);
2199	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2200	bsp = &q->buf_state[buf_idx];
2201
2202	CTR4(KTR_TOM,
2203	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2204	    "hdr seq 0x%x len %u",
2205	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2206	    ntohs(hdr->len));
2207	CTR3(KTR_TOM,
2208	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2209	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2210
2211	ddp_len = ntohs(hdr->len);
2212	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2213
2214	delack_mode = G_DDP_DACK_MODE(ddp_report);
2215	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2216		toep->tp_delack_mode = delack_mode;
2217		toep->tp_delack_seq = tp->rcv_nxt;
2218	}
2219
2220	m->m_seq = tp->rcv_nxt;
2221	tp->rcv_nxt = rcv_nxt;
2222
2223	tp->t_rcvtime = ticks;
2224	/*
2225	 * Store the length in m->m_len.  We are changing the meaning of
2226	 * m->m_len here, so we need to be very careful that nothing from now
2227	 * on interprets the length of this packet the usual way.
2228	 */
2229	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2230	inp_wunlock(tp->t_inpcb);
2231	CTR3(KTR_TOM,
2232	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2233	    m->m_len, rcv_nxt, m->m_seq);
2234	/*
2235	 * Figure out where the new data was placed in the buffer and store it
2236	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the
2237	 * consumer needs to account for the page pod's pg_offset.
2238	 */
2239	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2240	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2241
2242	rcv = so_sockbuf_rcv(so);
2243	sockbuf_lock(rcv);
2244
2245	m->m_ddp_gl = (unsigned char *)bsp->gl;
2246	m->m_flags |= M_DDP;
2247	bsp->cur_offset = end_offset;
2248	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2249
2250	/*
2251	 * Length is only meaningful for kbuf
2252	 */
2253	if (!(bsp->flags & DDP_BF_NOCOPY))
2254		KASSERT(m->m_len <= bsp->gl->dgl_length,
2255		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2256			m->m_len, bsp->gl->dgl_length));
2257
2258	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2259	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2260        /*
2261	 * Bit 0 of flags stores whether the DDP buffer is completed.
2262	 * Note that other parts of the code depend on this being in bit 0.
2263	 */
2264	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2265		panic("spurious ddp completion");
2266	} else {
2267		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2268		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2269			q->cur_buf ^= 1;                     /* flip buffers */
2270	}
2271
2272	if (bsp->flags & DDP_BF_NOCOPY) {
2273		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2274		bsp->flags &= ~DDP_BF_NOCOPY;
2275	}
2276
2277	if (ddp_report & F_DDP_PSH)
2278		m->m_ddp_flags |= DDP_BF_PSH;
2279	if (nomoredata)
2280		m->m_ddp_flags |= DDP_BF_NODATA;
2281
2282#ifdef notyet
2283	skb_reset_transport_header(skb);
2284	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2285#endif
2286	SBAPPEND(rcv, m);
2287
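	/*
	 * Wake the receiver if the peer pushed data, if a user (NOCOPY)
	 * buffer completed, or if the data landed in a kernel buffer and so
	 * can be copied out right away.
	 */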
2288	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2289	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2290		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2291		so_sorwakeup_locked(so);
2292	else
2293		sockbuf_unlock(rcv);
2294}
2295
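/*
 * Mask of the DDP error bits in the ddpvld_status field of a
 * CPL_RX_DATA_DDP message.
 */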
2296#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2297		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2298		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2299		 F_DDP_INVALID_PPOD)
2300
2301/*
2302 * Handler for RX_DATA_DDP CPL messages.
2303 */
2304static int
2305do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2306{
2307	struct toepcb *toep = ctx;
2308	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2309
2310	VALIDATE_SOCK(so);
2311
2312	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2313		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2314		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2315		return (CPL_RET_BUF_DONE);
2316	}
2317#if 0
2318	skb->h.th = tcphdr_skb->h.th;
2319#endif
2320	new_rx_data_ddp(toep, m);
2321	return (0);
2322}
2323
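/*
 * Process a CPL_RX_DDP_COMPLETE message: a posted DDP buffer has been
 * completed, so figure out how much new data it holds relative to our
 * recorded offset and pass that up as a zero-copy mbuf.
 */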
2324static void
2325process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2326{
2327	struct tcpcb *tp = toep->tp_tp;
2328	struct socket *so;
2329	struct ddp_state *q;
2330	struct ddp_buf_state *bsp;
2331	struct cpl_rx_ddp_complete *hdr;
2332	unsigned int ddp_report, buf_idx, when, delack_mode;
2333	int nomoredata = 0;
2334	struct sockbuf *rcv;
2335
2336	inp_wlock(tp->t_inpcb);
2337	so = inp_inpcbtosocket(tp->t_inpcb);
2338
2339	if (__predict_false(so_no_receive(so))) {
2340		struct inpcb *inp = so_sotoinpcb(so);
2341
2342		handle_excess_rx(toep, m);
2343		inp_wunlock(inp);
2344		return;
2345	}
2346	q = &toep->tp_ddp_state;
2347	hdr = cplhdr(m);
2348	ddp_report = ntohl(hdr->ddp_report);
2349	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2350	m->m_pkthdr.csum_data = tp->rcv_nxt;
2351
2352	rcv = so_sockbuf_rcv(so);
2353	sockbuf_lock(rcv);
2354
2355	bsp = &q->buf_state[buf_idx];
2356	when = bsp->cur_offset;
2357	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2358	tp->rcv_nxt += m->m_len;
2359	tp->t_rcvtime = ticks;
2360
2361	delack_mode = G_DDP_DACK_MODE(ddp_report);
2362	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2363		toep->tp_delack_mode = delack_mode;
2364		toep->tp_delack_seq = tp->rcv_nxt;
2365	}
2366#ifdef notyet
2367	skb_reset_transport_header(skb);
2368	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2369#endif
2370	inp_wunlock(tp->t_inpcb);
2371
2372	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2373	CTR5(KTR_TOM,
2374		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2375		  "ddp_report 0x%x offset %u, len %u",
2376		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2377		   G_DDP_OFFSET(ddp_report), m->m_len);
2378
2379	m->m_cur_offset = bsp->cur_offset;
2380	bsp->cur_offset += m->m_len;
2381
2382	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2383		q->cur_buf ^= 1;                     /* flip buffers */
2384		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2385			nomoredata = 1;
2386	}
2387
2388	CTR4(KTR_TOM,
2389		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2390		  "ddp_report %u offset %u",
2391		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2392		   G_DDP_OFFSET(ddp_report));
2393
2394	m->m_ddp_gl = (unsigned char *)bsp->gl;
2395	m->m_flags |= M_DDP;
2396	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2397	if (bsp->flags & DDP_BF_NOCOPY)
2398		bsp->flags &= ~DDP_BF_NOCOPY;
2399	if (nomoredata)
2400		m->m_ddp_flags |= DDP_BF_NODATA;
2401
2402	SBAPPEND(rcv, m);
2403	if ((so_state_get(so) & SS_NOFDREF) == 0)
2404		so_sorwakeup_locked(so);
2405	else
2406		sockbuf_unlock(rcv);
2407}
2408
2409/*
2410 * Handler for RX_DDP_COMPLETE CPL messages.
2411 */
2412static int
2413do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2414{
2415	struct toepcb *toep = ctx;
2416
2417	VALIDATE_SOCK(so);
2418#if 0
2419	skb->h.th = tcphdr_skb->h.th;
2420#endif
2421	process_ddp_complete(toep, m);
2422	return (0);
2423}
2424
2425/*
2426 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2427 * socket state before calling tcp_time_wait to comply with its expectations.
2428 */
2429static void
2430enter_timewait(struct tcpcb *tp)
2431{
2432	/*
2433	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2434	 * process peer_close because we don't want to carry the peer FIN in
2435	 * the socket's receive queue and if we increment rcv_nxt without
2436	 * having the FIN in the receive queue we'll confuse facilities such
2437	 * as SIOCINQ.
2438	 */
2439	inp_wlock(tp->t_inpcb);
2440	tp->rcv_nxt++;
2441
2442	tp->ts_recent_age = 0;	     /* defeat recycling */
2443	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2444	inp_wunlock(tp->t_inpcb);
2445	tcp_offload_twstart(tp);
2446}
2447
2448/*
2449 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2450 * function deals with the data that may be reported along with the FIN.
2451 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2452 * perform normal FIN-related processing.  In the latter case 1 indicates that
2453 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
2454 * that the mbuf can be freed.
2455 */
2456static int
2457handle_peer_close_data(struct socket *so, struct mbuf *m)
2458{
2459	struct tcpcb *tp = so_sototcpcb(so);
2460	struct toepcb *toep = tp->t_toe;
2461	struct ddp_state *q;
2462	struct ddp_buf_state *bsp;
2463	struct cpl_peer_close *req = cplhdr(m);
2464	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2465	struct sockbuf *rcv;
2466
2467	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2468		return (0);
2469
2470	CTR0(KTR_TOM, "handle_peer_close_data");
2471	if (__predict_false(so_no_receive(so))) {
2472		handle_excess_rx(toep, m);
2473
2474		/*
2475		 * Although we discard the data we want to process the FIN so
2476		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2477		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2478		 * may be what will close the connection.  We return 1 because
2479		 * handle_excess_rx() already freed the packet.
2480		 */
2481		return (1);
2482	}
2483
2484	inp_lock_assert(tp->t_inpcb);
2485	q = &toep->tp_ddp_state;
2486	rcv = so_sockbuf_rcv(so);
2487	sockbuf_lock(rcv);
2488
2489	bsp = &q->buf_state[q->cur_buf];
2490	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2491	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2492	m->m_ddp_gl = (unsigned char *)bsp->gl;
2493	m->m_flags |= M_DDP;
2494	m->m_cur_offset = bsp->cur_offset;
2495	m->m_ddp_flags =
2496	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2497	m->m_seq = tp->rcv_nxt;
2498	tp->rcv_nxt = rcv_nxt;
2499	bsp->cur_offset += m->m_pkthdr.len;
2500	if (!(bsp->flags & DDP_BF_NOFLIP))
2501		q->cur_buf ^= 1;
2502#ifdef notyet
2503	skb_reset_transport_header(skb);
2504	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2505#endif
2506	tp->t_rcvtime = ticks;
2507	SBAPPEND(rcv, m);
2508	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2509		so_sorwakeup_locked(so);
2510	else
2511		sockbuf_unlock(rcv);
2512
2513	return (1);
2514}
2515
2516/*
2517 * Handle a peer FIN.
2518 */
2519static void
2520do_peer_fin(struct toepcb *toep, struct mbuf *m)
2521{
2522	struct socket *so;
2523	struct tcpcb *tp = toep->tp_tp;
2524	int keep, action;
2525
2526	action = keep = 0;
2527	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2528	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2529		printf("abort_pending set\n");
2530
2531		goto out;
2532	}
2533	inp_wlock(tp->t_inpcb);
2534	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2535	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2536		keep = handle_peer_close_data(so, m);
2537		if (keep < 0) {
2538			inp_wunlock(tp->t_inpcb);
2539			return;
2540		}
2541	}
2542	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2543		CTR1(KTR_TOM,
2544		    "waking up waiters for cantrcvmore on %p ", so);
2545		socantrcvmore(so);
2546
2547		/*
2548		 * If connection is half-synchronized
2549		 * (ie NEEDSYN flag on) then delay ACK,
2550		 * so it may be piggybacked when SYN is sent.
2551		 * Otherwise, since we received a FIN then no
2552		 * more input can be expected, send ACK now.
2553		 */
2554		if (tp->t_flags & TF_NEEDSYN)
2555			tp->t_flags |= TF_DELACK;
2556		else
2557			tp->t_flags |= TF_ACKNOW;
2558		tp->rcv_nxt++;
2559	}
2560
2561	switch (tp->t_state) {
2562	case TCPS_SYN_RECEIVED:
2563	    tp->t_starttime = ticks;
2564	/* FALLTHROUGH */
2565	case TCPS_ESTABLISHED:
2566		tp->t_state = TCPS_CLOSE_WAIT;
2567		break;
2568	case TCPS_FIN_WAIT_1:
2569		tp->t_state = TCPS_CLOSING;
2570		break;
2571	case TCPS_FIN_WAIT_2:
2572		/*
2573		 * If we've sent an abort_req we must have sent it too late;
2574		 * HW will send us a reply telling us so, and this peer_close
2575		 * is really the last message for this connection and needs to
2576		 * be treated as an abort_rpl, i.e., transition the connection
2577		 * to TCP_CLOSE (note that the host stack does this at the
2578		 * time of generating the RST but we must wait for HW).
2579		 * Otherwise we enter TIME_WAIT.
2580		 */
2581		t3_release_offload_resources(toep);
2582		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2583			action = TCP_CLOSE;
2584		} else {
2585			action = TCP_TIMEWAIT;
2586		}
2587		break;
2588	default:
2589		log(LOG_ERR,
2590		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2591		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2592	}
2593	inp_wunlock(tp->t_inpcb);
2594
2595	if (action == TCP_TIMEWAIT) {
2596		enter_timewait(tp);
2597	} else if (action == TCP_DROP) {
2598		tcp_offload_drop(tp, 0);
2599	} else if (action == TCP_CLOSE) {
2600		tcp_offload_close(tp);
2601	}
2602
2603#ifdef notyet
2604	/* Do not send POLL_HUP for half duplex close. */
2605	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2606	    sk->sk_state == TCP_CLOSE)
2607		sk_wake_async(so, 1, POLL_HUP);
2608	else
2609		sk_wake_async(so, 1, POLL_IN);
2610#endif
2611
2612out:
2613	if (!keep)
2614		m_free(m);
2615}
2616
2617/*
2618 * Handler for PEER_CLOSE CPL messages.
2619 */
2620static int
2621do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2622{
2623	struct toepcb *toep = (struct toepcb *)ctx;
2624
2625	VALIDATE_SOCK(so);
2626
2627	do_peer_fin(toep, m);
2628	return (0);
2629}
2630
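/*
 * Process a CPL_CLOSE_CON_RPL message: the HW has acknowledged our FIN, so
 * record the final snd_una and move the connection to its next state.
 */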
2631static void
2632process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2633{
2634	struct cpl_close_con_rpl *rpl = cplhdr(m);
2635	struct tcpcb *tp = toep->tp_tp;
2636	struct socket *so;
2637	int action = 0;
2638	struct sockbuf *rcv;
2639
2640	inp_wlock(tp->t_inpcb);
2641	so = inp_inpcbtosocket(tp->t_inpcb);
2642
2643	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2644
2645	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2646		inp_wunlock(tp->t_inpcb);
2647		goto out;
2648	}
2649
2650	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2651	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2652
2653	switch (tp->t_state) {
2654	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2655		t3_release_offload_resources(toep);
2656		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2657			action = TCP_CLOSE;
2658
2659		} else {
2660			action = TCP_TIMEWAIT;
2661		}
2662		break;
2663	case TCPS_LAST_ACK:
2664		/*
2665		 * In this state we don't care about pending abort_rpl.
2666		 * If we've sent abort_req it was post-close and was sent too
2667		 * late, this close_con_rpl is the actual last message.
2668		 */
2669		t3_release_offload_resources(toep);
2670		action = TCP_CLOSE;
2671		break;
2672	case TCPS_FIN_WAIT_1:
2673		/*
2674		 * If we can't receive any more
2675		 * data, then closing user can proceed.
2676		 * Starting the timer is contrary to the
2677		 * specification, but if we don't get a FIN
2678		 * we'll hang forever.
2679		 *
2680		 * XXXjl:
2681		 * we should release the tp also, and use a
2682		 * compressed state.
2683		 */
2684		if (so)
2685			rcv = so_sockbuf_rcv(so);
2686		else
2687			break;
2688
2689		if (rcv->sb_state & SBS_CANTRCVMORE) {
2690			int timeout;
2691
2692			if (so)
2693				soisdisconnected(so);
2694			timeout = (tcp_fast_finwait2_recycle) ?
2695			    tcp_finwait2_timeout : tcp_maxidle;
2696			tcp_timer_activate(tp, TT_2MSL, timeout);
2697		}
2698		tp->t_state = TCPS_FIN_WAIT_2;
2699		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2700		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2701			action = TCP_DROP;
2702		}
2703
2704		break;
2705	default:
2706		log(LOG_ERR,
2707		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2708		       toep->tp_toedev->tod_name, toep->tp_tid,
2709		       tp->t_state);
2710	}
2711	inp_wunlock(tp->t_inpcb);
2712
2713
2714	if (action == TCP_TIMEWAIT) {
2715		enter_timewait(tp);
2716	} else if (action == TCP_DROP) {
2717		tcp_offload_drop(tp, 0);
2718	} else if (action == TCP_CLOSE) {
2719		tcp_offload_close(tp);
2720	}
2721out:
2722	m_freem(m);
2723}
2724
2725/*
2726 * Handler for CLOSE_CON_RPL CPL messages.
2727 */
2728static int
2729do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2730			    void *ctx)
2731{
2732	struct toepcb *toep = (struct toepcb *)ctx;
2733
2734	process_close_con_rpl(toep, m);
2735	return (0);
2736}
2737
2738/*
2739 * Process abort replies.  We only process these messages if we anticipate
2740 * them as the coordination between SW and HW in this area is somewhat lacking
2741 * and sometimes we get ABORT_RPLs after we are done with the connection that
2742 * originated the ABORT_REQ.
2743 */
2744static void
2745process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2746{
2747	struct tcpcb *tp = toep->tp_tp;
2748	struct socket *so;
2749	int needclose = 0;
2750
2751#ifdef T3_TRACE
2752	T3_TRACE1(TIDTB(sk),
2753		  "process_abort_rpl: GTS rpl pending %d",
2754		  sock_flag(sk, ABORT_RPL_PENDING));
2755#endif
2756
2757	inp_wlock(tp->t_inpcb);
2758	so = inp_inpcbtosocket(tp->t_inpcb);
2759
2760	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2761		/*
2762		 * XXX panic on tcpdrop
2763		 */
2764		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2765			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2766		else {
2767			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2768			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2769			    !is_t3a(toep->tp_toedev)) {
2770				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2771					panic("TP_ABORT_REQ_RCVD set");
2772				t3_release_offload_resources(toep);
2773				needclose = 1;
2774			}
2775		}
2776	}
2777	inp_wunlock(tp->t_inpcb);
2778
2779	if (needclose)
2780		tcp_offload_close(tp);
2781
2782	m_free(m);
2783}
2784
2785/*
2786 * Handle an ABORT_RPL_RSS CPL message.
2787 */
2788static int
2789do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2790{
2791	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2792	struct toepcb *toep;
2793
2794	/*
2795	 * Ignore replies to post-close aborts indicating that the abort was
2796	 * requested too late.  These connections are terminated when we get
2797	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2798	 * arrives the TID is either no longer used or it has been recycled.
2799	 */
2800	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2801discard:
2802		m_free(m);
2803		return (0);
2804	}
2805
2806	toep = (struct toepcb *)ctx;
2807
2808        /*
2809	 * Sometimes we've already closed the socket, e.g., a post-close
2810	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2811	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2812	 * but FW turns the ABORT_REQ into a regular one and so we get
2813	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2814	 */
2815	if (!toep)
2816		goto discard;
2817
2818	if (toep->tp_tp == NULL) {
2819		log(LOG_NOTICE, "removing tid for abort\n");
2820		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2821		if (toep->tp_l2t)
2822			l2t_release(L2DATA(cdev), toep->tp_l2t);
2823
2824		toepcb_release(toep);
2825		goto discard;
2826	}
2827
2828	log(LOG_NOTICE, "toep=%p\n", toep);
2829	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2830
2831	toepcb_hold(toep);
2832	process_abort_rpl(toep, m);
2833	toepcb_release(toep);
2834	return (0);
2835}
2836
2837/*
2838 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2839 * indicate whether RST should be sent in response.
2840 */
2841static int
2842abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2843{
2844	struct tcpcb *tp = so_sototcpcb(so);
2845
2846	switch (abort_reason) {
2847	case CPL_ERR_BAD_SYN:
2848#if 0
2849		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2850#endif
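	/* FALLTHROUGH */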
2851	case CPL_ERR_CONN_RESET:
2852		// XXX need to handle SYN_RECV due to crossed SYNs
2853		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2854	case CPL_ERR_XMIT_TIMEDOUT:
2855	case CPL_ERR_PERSIST_TIMEDOUT:
2856	case CPL_ERR_FINWAIT2_TIMEDOUT:
2857	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2858#if 0
2859		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2860#endif
2861		return (ETIMEDOUT);
2862	default:
2863		return (EIO);
2864	}
2865}
2866
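/*
 * Fill in an mbuf with an ABORT_RPL work request for the given TID.
 */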
2867static inline void
2868set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2869{
2870	struct cpl_abort_rpl *rpl = cplhdr(m);
2871
2872	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2873	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2874	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2875
2876	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2877	rpl->cmd = cmd;
2878}
2879
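/*
 * Deferred handler that sends an ABORT_RPL once an mbuf can be allocated;
 * the reply's status was stashed in the original request by
 * send_abort_rpl().
 */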
2880static void
2881send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2882{
2883	struct mbuf *reply_mbuf;
2884	struct cpl_abort_req_rss *req = cplhdr(m);
2885
2886	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2887	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2889	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2890	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2891	m_free(m);
2892}
2893
2894/*
2895 * Returns whether an ABORT_REQ_RSS message is negative advice.
2896 */
2897static inline int
2898is_neg_adv_abort(unsigned int status)
2899{
2900	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2901	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2902}
2903
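/*
 * Reply to an ABORT_REQ_RSS message; rst_status tells HW whether to send
 * an RST.  If no mbuf is available the reply is deferred.
 */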
2904static void
2905send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2906{
2907	struct mbuf  *reply_mbuf;
2908	struct cpl_abort_req_rss *req = cplhdr(m);
2909
2910	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2911
2912	if (!reply_mbuf) {
2913		/* Defer the reply.  Stick rst_status into req->status. */
2914		req->status = rst_status;
2915		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2916		return;
2917	}
2918
2919	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2920	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2921	m_free(m);
2922
2923	/*
2924	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2925	 * these messages while ARP is pending.  For other connection states
2926	 * it's not a problem.
2927	 */
2928	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2929}
2930
2931#ifdef notyet
2932static void
2933cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2934{
2935	CXGB_UNIMPLEMENTED();
2936#ifdef notyet
2937	struct request_sock *req = child->sk_user_data;
2938
2939	inet_csk_reqsk_queue_removed(parent, req);
2940	synq_remove(tcp_sk(child));
2941	__reqsk_free(req);
2942	child->sk_user_data = NULL;
2943#endif
2944}
2945
2946
2947/*
2948 * Performs the actual work to abort a SYN_RECV connection.
2949 */
2950static void
2951do_abort_syn_rcv(struct socket *child, struct socket *parent)
2952{
2953	struct tcpcb *parenttp = so_sototcpcb(parent);
2954	struct tcpcb *childtp = so_sototcpcb(child);
2955
2956	/*
2957	 * If the server is still open we clean up the child connection,
2958	 * otherwise the server already did the clean up as it was purging
2959	 * its SYN queue and the skb was just sitting in its backlog.
2960	 */
2961	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2962		cleanup_syn_rcv_conn(child, parent);
2963		inp_wlock(childtp->t_inpcb);
2964		t3_release_offload_resources(childtp->t_toe);
2965		inp_wunlock(childtp->t_inpcb);
2966		tcp_offload_close(childtp);
2967	}
2968}
2969#endif
2970
2971/*
2972 * Handle abort requests for a SYN_RECV connection.  These need extra work
2973 * because the socket is on its parent's SYN queue.
2974 */
2975static int
2976abort_syn_rcv(struct socket *so, struct mbuf *m)
2977{
2978	CXGB_UNIMPLEMENTED();
2979#ifdef notyet
2980	struct socket *parent;
2981	struct toedev *tdev = toep->tp_toedev;
2982	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2983	struct socket *oreq = so->so_incomp;
2984	struct t3c_tid_entry *t3c_stid;
2985	struct tid_info *t;
2986
2987	if (!oreq)
2988		return -1;        /* somehow we are not on the SYN queue */
2989
2990	t = &(T3C_DATA(cdev))->tid_maps;
2991	t3c_stid = lookup_stid(t, oreq->ts_recent);
2992	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2993
2994	so_lock(parent);
2995	do_abort_syn_rcv(so, parent);
2996	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2997	so_unlock(parent);
2998#endif
2999	return (0);
3000}
3001
3002/*
3003 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3004 * request except that we need to reply to it.
3005 */
3006static void
3007process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3008{
3009	int rst_status = CPL_ABORT_NO_RST;
3010	const struct cpl_abort_req_rss *req = cplhdr(m);
3011	struct tcpcb *tp = toep->tp_tp;
3012	struct socket *so;
3013	int needclose = 0;
3014
3015	inp_wlock(tp->t_inpcb);
3016	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3017	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3018		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3019		m_free(m);
3020		goto skip;
3021	}
3022
3023	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3024	/*
3025	 * Three cases to consider:
3026	 * a) We haven't sent an abort_req; close the connection.
3027	 * b) We have sent a post-close abort_req that will get to TP too late
3028	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3029	 *    be ignored and the connection should be closed now.
3030	 * c) We have sent a regular abort_req that will get to TP too late.
3031	 *    That will generate an abort_rpl with status 0, wait for it.
3032	 */
3033	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3034	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3035		int error;
3036
3037		error = abort_status_to_errno(so, req->status,
3038		    &rst_status);
3039		so_error_set(so, error);
3040
3041		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3042			so_sorwakeup(so);
3043		/*
3044		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3045		 * returns 0 it has taken care of the abort.
3046		 */
3047		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3048			goto skip;
3049
3050		t3_release_offload_resources(toep);
3051		needclose = 1;
3052	}
3053	inp_wunlock(tp->t_inpcb);
3054
3055	if (needclose)
3056		tcp_offload_close(tp);
3057
3058	send_abort_rpl(m, tdev, rst_status);
3059	return;
3060skip:
3061	inp_wunlock(tp->t_inpcb);
3062}
3063
3064/*
3065 * Handle an ABORT_REQ_RSS CPL message.
3066 */
3067static int
3068do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3069{
3070	const struct cpl_abort_req_rss *req = cplhdr(m);
3071	struct toepcb *toep = (struct toepcb *)ctx;
3072
3073	if (is_neg_adv_abort(req->status)) {
3074		m_free(m);
3075		return (0);
3076	}
3077
3078	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3079
3080	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3081		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3082		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3083
3084		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3085		if (toep->tp_l2t)
3086			l2t_release(L2DATA(cdev), toep->tp_l2t);
3087
3088		/*
3089		 *  Unhook
3090		 */
3091		toep->tp_tp->t_toe = NULL;
3092		toep->tp_tp->t_flags &= ~TF_TOE;
3093		toep->tp_tp = NULL;
3094		/*
3095		 * XXX need to call syncache_chkrst - but we don't
3096		 * have a way of doing that yet
3097		 */
3098		toepcb_release(toep);
3099		log(LOG_ERR, "abort for unestablished connection :-(\n");
3100		return (0);
3101	}
3102	if (toep->tp_tp == NULL) {
3103		log(LOG_NOTICE, "disconnected toepcb\n");
3104		/* should be freed momentarily */
3105		return (0);
3106	}
3107
3108
3109	toepcb_hold(toep);
3110	process_abort_req(toep, m, toep->tp_toedev);
3111	toepcb_release(toep);
3112	return (0);
3113}
3114#ifdef notyet
3115static void
3116pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3117{
3118	struct toedev *tdev = TOE_DEV(parent);
3119
3120	do_abort_syn_rcv(child, parent);
3121	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3122		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3123
3124		rpl->opt0h = htonl(F_TCAM_BYPASS);
3125		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3126		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3127	} else
3128		m_free(m);
3129}
3130#endif
3131static void
3132handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3133{
3134	CXGB_UNIMPLEMENTED();
3135
3136#ifdef notyet
3137	struct t3cdev *cdev;
3138	struct socket *parent;
3139	struct socket *oreq;
3140	struct t3c_tid_entry *t3c_stid;
3141	struct tid_info *t;
3142	struct tcpcb *otp, *tp = so_sototcpcb(so);
3143	struct toepcb *toep = tp->t_toe;
3144
3145	/*
3146	 * If the connection is being aborted due to the parent listening
3147	 * socket going away there's nothing to do, the ABORT_REQ will close
3148	 * the connection.
3149	 */
3150	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3151		m_free(m);
3152		return;
3153	}
3154
3155	oreq = so->so_incomp;
3156	otp = so_sototcpcb(oreq);
3157
3158	cdev = T3C_DEV(so);
3159	t = &(T3C_DATA(cdev))->tid_maps;
3160	t3c_stid = lookup_stid(t, otp->ts_recent);
3161	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3162
3163	so_lock(parent);
3164	pass_open_abort(so, parent, m);
3165	so_unlock(parent);
3166#endif
3167}
3168
3169/*
3170 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3171 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3172 * connection.
3173 */
3174static void
3175pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3176{
3177
3178#ifdef notyet
3179	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3180	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3181#endif
3182	handle_pass_open_arp_failure(m_get_socket(m), m);
3183}
3184
3185/*
3186 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3187 */
3188static void
3189mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3190{
3191	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3192	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3193	unsigned int tid = GET_TID(req);
3194
3195	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3196	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3197	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3198	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3199	rpl->opt0h = htonl(F_TCAM_BYPASS);
3200	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3201	rpl->opt2 = 0;
3202	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3203}
3204
3205/*
3206 * Send a deferred reject to an accept request.
3207 */
3208static void
3209reject_pass_request(struct toedev *tdev, struct mbuf *m)
3210{
3211	struct mbuf *reply_mbuf;
3212
3213	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3214	mk_pass_accept_rpl(reply_mbuf, m);
3215	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3216	m_free(m);
3217}
3218
3219static void
3220handle_syncache_event(int event, void *arg)
3221{
3222	struct toepcb *toep = arg;
3223
3224	switch (event) {
3225	case TOE_SC_ENTRY_PRESENT:
3226		/*
3227		 * entry already exists - free toepcb
3228		 * and l2t
3229		 */
3230		printf("syncache entry present\n");
3231		toepcb_release(toep);
3232		break;
3233	case TOE_SC_DROP:
3234		/*
3235		 * The syncache has given up on this entry
3236		 * either it timed out, or it was evicted
3237		 * we need to explicitly release the tid
3238		 */
3239		printf("syncache entry dropped\n");
3240		toepcb_release(toep);
3241		break;
3242	default:
3243		log(LOG_ERR, "unknown syncache event %d\n", event);
3244		break;
3245	}
3246}
3247
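/*
 * Enter an embryonic connection into the host stack's syncache, using the
 * addresses and TCP options carried in the CPL_PASS_ACCEPT_REQ.
 */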
3248static void
3249syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3250{
3251	struct in_conninfo inc;
3252	struct tcpopt to;
3253	struct tcphdr th;
3254	struct inpcb *inp;
3255	int mss, wsf, sack, ts;
3256	uint32_t rcv_isn = ntohl(req->rcv_isn);
3257
3258	bzero(&to, sizeof(struct tcpopt));
3259	inp = so_sotoinpcb(lso);
3260
3261	/*
3262	 * Fill out information for entering us into the syncache
3263	 */
3264	bzero(&inc, sizeof(inc));
3265	inc.inc_fport = th.th_sport = req->peer_port;
3266	inc.inc_lport = th.th_dport = req->local_port;
3267	th.th_seq = req->rcv_isn;
3268	th.th_flags = TH_SYN;
3269
3270	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3271
3272	inc.inc_len = 0;
3273	inc.inc_faddr.s_addr = req->peer_ip;
3274	inc.inc_laddr.s_addr = req->local_ip;
3275
3276	DPRINTF("syncache add of %d:%d %d:%d\n",
3277	    ntohl(req->local_ip), ntohs(req->local_port),
3278	    ntohl(req->peer_ip), ntohs(req->peer_port));
3279
3280	mss = req->tcp_options.mss;
3281	wsf = req->tcp_options.wsf;
3282	ts = req->tcp_options.tstamp;
3283	sack = req->tcp_options.sack;
3284	to.to_mss = mss;
3285	to.to_wscale = wsf;
3286	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3287	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3288}
3289
3290
3291/*
3292 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3293 * lock held.  Note that the sock here is a listening socket that is not owned
3294 * by the TOE.
3295 */
3296static void
3297process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3298    struct listen_ctx *lctx)
3299{
3300	int rt_flags;
3301	struct l2t_entry *e;
3302	struct iff_mac tim;
3303	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3304	struct cpl_pass_accept_rpl *rpl;
3305	struct cpl_pass_accept_req *req = cplhdr(m);
3306	unsigned int tid = GET_TID(req);
3307	struct tom_data *d = TOM_DATA(tdev);
3308	struct t3cdev *cdev = d->cdev;
3309	struct tcpcb *tp = so_sototcpcb(so);
3310	struct toepcb *newtoep;
3311	struct rtentry *dst;
3312	struct sockaddr_in nam;
3313	struct t3c_data *td = T3C_DATA(cdev);
3314
3315	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3316	if (__predict_false(reply_mbuf == NULL)) {
3317		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3318			t3_defer_reply(m, tdev, reject_pass_request);
3319		else {
3320			cxgb_queue_tid_release(cdev, tid);
3321			m_free(m);
3322		}
3323		DPRINTF("failed to get reply_mbuf\n");
3324
3325		goto out;
3326	}
3327
3328	if (tp->t_state != TCPS_LISTEN) {
3329		DPRINTF("socket not in listen state\n");
3330
3331		goto reject;
3332	}
3333
3334	tim.mac_addr = req->dst_mac;
3335	tim.vlan_tag = ntohs(req->vlan_tag);
3336	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3337		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3338		goto reject;
3339	}
3340
3341#ifdef notyet
3342	/*
3343	 * XXX do route lookup to confirm that we're still listening on this
3344	 * address
3345	 */
3346	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3347			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3348		goto reject;
3349	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3350		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3351	dst_release(skb->dst);	// done with the input route, release it
3352	skb->dst = NULL;
3353
3354	if ((rt_flags & RTF_LOCAL) == 0)
3355		goto reject;
3356#endif
3357	/*
3358	 * XXX
3359	 */
3360	rt_flags = RTF_LOCAL;
3361	if ((rt_flags & RTF_LOCAL) == 0)
3362		goto reject;
3363
3364	/*
3365	 * Calculate values and add to syncache
3366	 */
3367
3368	newtoep = toepcb_alloc();
3369	if (newtoep == NULL)
3370		goto reject;
3371
3372	bzero(&nam, sizeof(struct sockaddr_in));
3373
3374	nam.sin_len = sizeof(struct sockaddr_in);
3375	nam.sin_family = AF_INET;
3376	nam.sin_addr.s_addr = req->peer_ip;
3377	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3378
3379	if (dst == NULL) {
3380		printf("failed to find route\n");
3381		goto reject;
3382	}
3383	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3384	    (struct sockaddr *)&nam);
3385	if (e == NULL) {
3386		DPRINTF("failed to get l2t\n");
		goto reject;
3387	}
3388	/*
3389	 * Point to our listen socket until accept
3390	 */
3391	newtoep->tp_tp = tp;
3392	newtoep->tp_flags = TP_SYN_RCVD;
3393	newtoep->tp_tid = tid;
3394	newtoep->tp_toedev = tdev;
3395	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3396
3397	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3398	so_lock(so);
3399	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3400	so_unlock(so);
3401
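	/*
	 * Use DDP only if it is enabled by the tunable, the socket has not
	 * opted out, and the receive window is large enough for it to pay
	 * off.
	 */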
3402	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3403		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3404
3405	if (newtoep->tp_ulp_mode) {
3406		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3407
3408		if (ddp_mbuf == NULL)
3409			newtoep->tp_ulp_mode = 0;
3410	}
3411
3412	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3413	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3414	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3415	/*
3416	 * XXX workaround for lack of syncache drop
3417	 */
3418	toepcb_hold(newtoep);
3419	syncache_add_accept_req(req, so, newtoep);
3420
3421	rpl = cplhdr(reply_mbuf);
3422	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3423	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3424	rpl->wr.wr_lo = 0;
3425	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3426	rpl->opt2 = htonl(calc_opt2(so, tdev));
3427	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3428	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3429
3430	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3431	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3432	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3433				  CPL_PASS_OPEN_ACCEPT);
3434
3435	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3436
3437	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3438
3439	l2t_send(cdev, reply_mbuf, e);
3440	m_free(m);
3441	if (newtoep->tp_ulp_mode) {
3442		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3443				V_TF_DDP_OFF(1) |
3444				TP_DDP_TIMER_WORKAROUND_MASK,
3445				V_TF_DDP_OFF(1) |
3446		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3447	} else
3448		DPRINTF("no DDP\n");
3449
3450	return;
3451reject:
3452	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3453		mk_pass_accept_rpl(reply_mbuf, m);
3454	else
3455		mk_tid_release(reply_mbuf, newtoep, tid);
3456	cxgb_ofld_send(cdev, reply_mbuf);
3457	m_free(m);
3458out:
3459#if 0
3460	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3461#else
3462	return;
3463#endif
3464}
3465
3466/*
3467 * Handle a CPL_PASS_ACCEPT_REQ message.
3468 */
3469static int
3470do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3471{
3472	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3473	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3474	struct tom_data *d = listen_ctx->tom_data;
3475
3476#if VALIDATE_TID
3477	struct cpl_pass_accept_req *req = cplhdr(m);
3478	unsigned int tid = GET_TID(req);
3479	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3480
3481	if (unlikely(!lsk)) {
3482		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3483		       cdev->name,
3484		       (unsigned long)((union listen_entry *)ctx -
3485					t->stid_tab));
3486		return CPL_RET_BUF_DONE;
3487	}
3488	if (unlikely(tid >= t->ntids)) {
3489		printk(KERN_ERR "%s: passive open TID %u too large\n",
3490		       cdev->name, tid);
3491		return CPL_RET_BUF_DONE;
3492	}
3493	/*
3494	 * For T3A the current user of the TID may have closed but its last
3495	 * message(s) may have been backlogged so the TID appears to be still
3496	 * in use.  Just take the TID away, the connection can close at its
3497	 * own leisure.  For T3B this situation is a bug.
3498	 */
3499	if (!valid_new_tid(t, tid) &&
3500	    cdev->type != T3A) {
3501		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3502		       cdev->name, tid);
3503		return CPL_RET_BUF_DONE;
3504	}
3505#endif
3506
3507	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3508	return (0);
3509}
3510
3511/*
3512 * Called when a connection is established to translate the TCP options
3513 * reported by HW to FreeBSD's native format.
3514 */
3515static void
3516assign_rxopt(struct socket *so, unsigned int opt)
3517{
3518	struct tcpcb *tp = so_sototcpcb(so);
3519	struct toepcb *toep = tp->t_toe;
3520	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3521
3522	inp_lock_assert(tp->t_inpcb);
3523
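	/* The 40 below accounts for the minimal IP and TCP headers. */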
3524	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3525	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3526	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3527	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3528	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3529	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3530		tp->rcv_scale = tp->request_r_scale;
3531}
3532
3533/*
3534 * Completes some final bits of initialization for just established connections
3535 * and changes their state to TCP_ESTABLISHED.
3536 *
3537 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3538 */
3539static void
3540make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3541{
3542	struct tcpcb *tp = so_sototcpcb(so);
3543	struct toepcb *toep = tp->t_toe;
3544
3545	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3546	assign_rxopt(so, opt);
3547
3552#ifdef notyet
3553	so->so_proto->pr_ctloutput = t3_ctloutput;
3554#endif
3555
3556#if 0
3557	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3558#endif
3559	/*
3560	 * XXX not clear what rcv_wup maps to
3561	 */
3562	/*
3563	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3564	 * pass through opt0.
3565	 */
3566	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3567		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3568
3569	dump_toepcb(toep);
3570
3571#ifdef notyet
3572/*
3573 * no clean interface for marking ARP up to date
3574 */
3575	dst_confirm(sk->sk_dst_cache);
3576#endif
3577	tp->t_starttime = ticks;
3578	tp->t_state = TCPS_ESTABLISHED;
3579	soisconnected(so);
3580}
3581
3582static int
3583syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3584{
3585
3586	struct in_conninfo inc;
3587	struct tcpopt to;
3588	struct tcphdr th;
3589	int mss, wsf, sack, ts;
3590	struct mbuf *m = NULL;
3591	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3592	unsigned int opt;
3593
3594#ifdef MAC
3595#error	"no MAC support"
3596#endif
3597
3598	opt = ntohs(req->tcp_opt);
3599
3600	bzero(&to, sizeof(struct tcpopt));
3601
3602	/*
3603	 * Fill out information for entering us into the syncache
3604	 */
3605	bzero(&inc, sizeof(inc));
3606	inc.inc_fport = th.th_sport = req->peer_port;
3607	inc.inc_lport = th.th_dport = req->local_port;
3608	th.th_seq = req->rcv_isn;
3609	th.th_flags = TH_ACK;
3610
3611	inc.inc_len = 0;
3612	inc.inc_faddr.s_addr = req->peer_ip;
3613	inc.inc_laddr.s_addr = req->local_ip;
3614
3615	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3616	wsf  = G_TCPOPT_WSCALE_OK(opt);
3617	ts   = G_TCPOPT_TSTAMP(opt);
3618	sack = G_TCPOPT_SACK(opt);
3619
3620	to.to_mss = mss;
3621	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3622	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3623
3624	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3625	    ntohl(req->local_ip), ntohs(req->local_port),
3626	    ntohl(req->peer_ip), ntohs(req->peer_port),
3627	    mss, wsf, ts, sack);
3628	return (tcp_offload_syncache_expand(&inc, &to, &th, so, m));
3629}
3630
3631
3632/*
3633 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3634 * if we are in TCP_SYN_RECV due to crossed SYNs
3635 */
3636static int
3637do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3638{
3639	struct cpl_pass_establish *req = cplhdr(m);
3640	struct toepcb *toep = (struct toepcb *)ctx;
3641	struct tcpcb *tp = toep->tp_tp;
3642	struct socket *so, *lso;
3643	struct t3c_data *td = T3C_DATA(cdev);
3644	struct sockbuf *snd, *rcv;
3645
3646	// Complete socket initialization now that we have the SND_ISN
3647
3648	struct toedev *tdev;
3649
3650
3651	tdev = toep->tp_toedev;
3652
3653	inp_wlock(tp->t_inpcb);
3654
3655	/*
3656	 * XXX need to add a reference to the socket while we're
3657	 * manipulating it
3658	 */
3659	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3660
3661	inp_wunlock(tp->t_inpcb);
3662
3663	so_lock(so);
3664	LIST_REMOVE(toep, synq_entry);
3665	so_unlock(so);
3666
3667	if (!syncache_expand_establish_req(req, &so, toep)) {
3668		/*
3669		 * No entry
3670		 */
3671		CXGB_UNIMPLEMENTED();
3672	}
3673	if (so == NULL) {
3674		/*
3675		 * Couldn't create the socket
3676		 */
3677		CXGB_UNIMPLEMENTED();
3678	}
3679
3680	tp = so_sototcpcb(so);
3681	inp_wlock(tp->t_inpcb);
3682
3683	snd = so_sockbuf_snd(so);
3684	rcv = so_sockbuf_rcv(so);
3685
3686	snd->sb_flags |= SB_NOCOALESCE;
3687	rcv->sb_flags |= SB_NOCOALESCE;
3688
3689	toep->tp_tp = tp;
3690	toep->tp_flags = 0;
3691	tp->t_toe = toep;
3692	reset_wr_list(toep);
3693	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3694	tp->rcv_nxt = toep->tp_copied_seq;
3695	install_offload_ops(so);
3696
3697	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3698	toep->tp_wr_unacked = 0;
3699	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3700	toep->tp_qset_idx = 0;
3701	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3702
3703	/*
3704	 * XXX Cancel any keep alive timer
3705	 */
3706
3707	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3708
3709	/*
3710	 * XXX workaround for lack of syncache drop
3711	 */
3712	toepcb_release(toep);
3713	inp_wunlock(tp->t_inpcb);
3714
3715	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3716	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3717#ifdef notyet
3718	/*
3719	 * XXX not sure how these checks map to us
3720	 */
3721	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3722		sk->sk_state_change(sk);
3723		sk_wake_async(so, 0, POLL_OUT);
3724	}
3725	/*
3726	 * The state for the new connection is now up to date.
3727	 * Next check if we should add the connection to the parent's
3728	 * accept queue.  When the parent closes it resets connections
3729	 * on its SYN queue, so check if we are being reset.  If so we
3730	 * don't need to do anything more, the coming ABORT_RPL will
3731	 * destroy this socket.  Otherwise move the connection to the
3732	 * accept queue.
3733	 *
3734	 * Note that we reset the synq before closing the server so if
3735	 * we are not being reset the stid is still open.
3736	 */
3737	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3738		__kfree_skb(skb);
3739		goto unlock;
3740	}
3741#endif
3742	m_free(m);
3743
3744	return (0);
3745}
3746
3747/*
3748 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3749 * and send them to the TOE.
3750 */
3751static void
3752fixup_and_send_ofo(struct toepcb *toep)
3753{
3754	struct mbuf *m;
3755	struct toedev *tdev = toep->tp_toedev;
3756	struct tcpcb *tp = toep->tp_tp;
3757	unsigned int tid = toep->tp_tid;
3758
3759	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3760
3761	inp_lock_assert(tp->t_inpcb);
3762	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3763		/*
3764		 * A variety of messages can be waiting but the fields we'll
3765		 * be touching are common to all so any message type will do.
3766		 */
3767		struct cpl_close_con_req *p = cplhdr(m);
3768
3769		p->wr.wr_lo = htonl(V_WR_TID(tid));
3770		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3771		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3772	}
3773}
3774
3775/*
3776 * Updates socket state from an active establish CPL message.  Runs with the
3777 * socket lock held.
3778 */
3779static void
3780socket_act_establish(struct socket *so, struct mbuf *m)
3781{
3782	INIT_VNET_INET(so->so_vnet);
3783	struct cpl_act_establish *req = cplhdr(m);
3784	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3785	struct tcpcb *tp = so_sototcpcb(so);
3786	struct toepcb *toep = tp->t_toe;
3787
3788	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3789		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3790		    toep->tp_tid, tp->t_state);
3791
3792	tp->ts_recent_age = ticks;
3793	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3794	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3795
3796	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3797
3798	/*
3799	 * Now that we finally have a TID send any CPL messages that we had to
3800	 * defer for lack of a TID.
3801	 */
3802	if (mbufq_len(&toep->out_of_order_queue))
3803		fixup_and_send_ofo(toep);
3804
3805	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3806		/*
3807		 * XXX does this even make sense?
3808		 */
3809		so_sorwakeup(so);
3810	}
3811	m_free(m);
3812#ifdef notyet
3813/*
3814 * XXX assume no write requests permitted while socket connection is
3815 * incomplete
3816 */
3817	/*
3818	 * Currently the send queue must be empty at this point because the
3819	 * socket layer does not send anything before a connection is
3820	 * established.  To be future proof though we handle the possibility
3821	 * that there are pending buffers to send (either TX_DATA or
3822	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3823	 * buffers according to the just learned write_seq, and then we send
3824	 * them on their way.
3825	 */
3826	fixup_pending_writeq_buffers(sk);
3827	if (t3_push_frames(so, 1))
3828		sk->sk_write_space(sk);
3829#endif
3830
3831	toep->tp_state = tp->t_state;
3832	TCPSTAT_INC(tcps_connects);
3833
3834}
3835
3836/*
3837 * Process a CPL_ACT_ESTABLISH message.
3838 */
3839static int
3840do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3841{
3842	struct cpl_act_establish *req = cplhdr(m);
3843	unsigned int tid = GET_TID(req);
3844	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3845	struct toepcb *toep = (struct toepcb *)ctx;
3846	struct tcpcb *tp = toep->tp_tp;
3847	struct socket *so;
3848	struct toedev *tdev;
3849	struct tom_data *d;
3850
3851	if (tp == NULL) {
3852		free_atid(cdev, atid);
3853		return (0);
3854	}
3855	inp_wlock(tp->t_inpcb);
3856
3857	/*
3858	 * XXX
3859	 */
3860	so = inp_inpcbtosocket(tp->t_inpcb);
3861	tdev = toep->tp_toedev; /* blow up here if link was down */
3862	d = TOM_DATA(tdev);
3863
3864	/*
3865	 * It's OK if the TID is currently in use, the owning socket may have
3866	 * backlogged its last CPL message(s).  Just take it away.
3867	 */
3868	toep->tp_tid = tid;
3869	toep->tp_tp = tp;
3870	so_insert_tid(d, toep, tid);
3871	free_atid(cdev, atid);
3872	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3873
3874	socket_act_establish(so, m);
3875	inp_wunlock(tp->t_inpcb);
3876	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3877	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3878
3879	return (0);
3880}
3881
3882/*
3883 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3884 * next batch of work requests from the write queue.
3885 */
3886static void
3887wr_ack(struct toepcb *toep, struct mbuf *m)
3888{
3889	struct tcpcb *tp = toep->tp_tp;
3890	struct cpl_wr_ack *hdr = cplhdr(m);
3891	struct socket *so;
3892	unsigned int credits = ntohs(hdr->credits);
3893	u32 snd_una = ntohl(hdr->snd_una);
3894	int bytes = 0;
3895	struct sockbuf *snd;
3896
3897	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3898
3899	inp_wlock(tp->t_inpcb);
3900	so = inp_inpcbtosocket(tp->t_inpcb);
3901	toep->tp_wr_avail += credits;
3902	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3903		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3904
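	/*
	 * Each mbuf on the pending-WR queue records the number of WR credits
	 * it consumed in m_pkthdr.csum_data.  Retire the WRs this ACK covers
	 * completely; a partially acked WR stays queued with its remaining
	 * credit count.
	 */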
3905	while (credits) {
3906		struct mbuf *p = peek_wr(toep);
3907
3908		if (__predict_false(!p)) {
3909			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3910			    "nothing pending, state %u wr_avail=%u\n",
3911			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3912			break;
3913		}
3914		CTR2(KTR_TOM,
3915			"wr_ack: p->credits=%d p->bytes=%d",
3916		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3917		KASSERT(p->m_pkthdr.csum_data != 0,
3918		    ("empty request still on list"));
3919
3920		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3921
3922#if DEBUG_WR > 1
3923			struct tx_data_wr *w = cplhdr(p);
3924			log(LOG_ERR,
3925			       "TID %u got %u WR credits, need %u, len %u, "
3926			       "main body %u, frags %u, seq # %u, ACK una %u,"
3927			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3928			       toep->tp_tid, credits, p->csum, p->len,
3929			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3930			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3931			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3932#endif
3933			p->m_pkthdr.csum_data -= credits;
3934			break;
3935		} else {
3936			dequeue_wr(toep);
3937			credits -= p->m_pkthdr.csum_data;
3938			bytes += p->m_pkthdr.len;
3939			CTR3(KTR_TOM,
3940			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3941			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3942
3943			m_free(p);
3944		}
3945	}
3946
3947#if DEBUG_WR
3948	check_wr_invariants(tp);
3949#endif
3950
3951	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3952#if VALIDATE_SEQ
3953		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3954
3955		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3956		    "for TID %u, snd_una %u\n", d->tdev.name, snd_una,
3957		    toep->tp_tid, tp->snd_una);
3958#endif
3959		goto out_free;
3960	}
3961
3962	if (tp->snd_una != snd_una) {
3963		tp->snd_una = snd_una;
3964		tp->ts_recent_age = ticks;
3965#ifdef notyet
3966		/*
3967		 * Keep ARP entry "minty fresh"
3968		 */
3969		dst_confirm(sk->sk_dst_cache);
3970#endif
3971		if (tp->snd_una == tp->snd_nxt)
3972			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3973	}
3974
3975	snd = so_sockbuf_snd(so);
3976	if (bytes) {
3977		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3979		sockbuf_lock(snd);
3980		sbdrop_locked(snd, bytes);
3981		so_sowwakeup_locked(so);
3982	}
3983
3984	if (snd->sb_sndptroff < snd->sb_cc)
3985		t3_push_frames(so, 0);
3986
3987out_free:
3988	inp_wunlock(tp->t_inpcb);
3989	m_free(m);
3990}
3991
3992/*
3993 * Handler for TX_DMA_ACK CPL messages.
3994 */
3995static int
3996do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3997{
3998	struct toepcb *toep = (struct toepcb *)ctx;
3999
4000	VALIDATE_SOCK(so);
4001
4002	wr_ack(toep, m);
4003	return (0);
4004}
4005
4006/*
4007 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4008 */
4009static int
4010do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4011{
4012	m_freem(m);
4013	return (0);
4014}
4015
4016/*
4017 * Reset a connection that is on a listener's SYN queue or accept queue,
4018 * i.e., one that has not yet been returned by accept(2).
4019 * Must be called from process context.
4020 *
4021 * Modeled after code in inet_csk_listen_stop().
4022 */
4023static void
4024t3_reset_listen_child(struct socket *child)
4025{
4026	struct tcpcb *tp = so_sototcpcb(child);
4027
4028	t3_send_reset(tp->t_toe);
4029}
4030
4031
4032static void
4033t3_child_disconnect(struct socket *so, void *arg)
4034{
4035	struct tcpcb *tp = so_sototcpcb(so);
4036
4037	if (tp->t_flags & TF_TOE) {
4038		inp_wlock(tp->t_inpcb);
4039		t3_reset_listen_child(so);
4040		inp_wunlock(tp->t_inpcb);
4041	}
4042}
4043
4044/*
4045 * Disconnect offloaded established but not yet accepted connections sitting
4046 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4047 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4048 */
4049void
4050t3_disconnect_acceptq(struct socket *listen_so)
4051{
4052
4053	so_lock(listen_so);
4054	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4055	so_unlock(listen_so);
4056}
4057
4058/*
4059 * Reset offloaded connections sitting on a server's syn queue.  As above
4060 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4061 */
4063void
4064t3_reset_synq(struct listen_ctx *lctx)
4065{
4066	struct toepcb *toep;
4067
4068	so_lock(lctx->lso);
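	/*
	 * Each entry on the SYN queue still holds a hardware TID and a
	 * toepcb reference; release both once the reset has been sent.
	 */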
4069	while (!LIST_EMPTY(&lctx->synq_head)) {
4070		toep = LIST_FIRST(&lctx->synq_head);
4071		LIST_REMOVE(toep, synq_entry);
4072		toep->tp_tp = NULL;
4073		t3_send_reset(toep);
4074		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4075		toepcb_release(toep);
4076	}
4077	so_unlock(lctx->lso);
4078}
4079
4080
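/*
 * Write the page pods for a DDP gather list into adapter memory.  Each pod
 * is sent in its own ULP_MEM_WRITE work request; the final
 * NUM_SENTINEL_PPODS pods are written with an invalid tid so they act as
 * sentinels.
 */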
4081int
4082t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4083		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4084		   unsigned int pg_off, unsigned int color)
4085{
4086	unsigned int i, j, pidx;
4087	struct pagepod *p;
4088	struct mbuf *m;
4089	struct ulp_mem_io *req;
4090	unsigned int tid = toep->tp_tid;
4091	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4092	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4093
4094	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4095	    gl, nppods, tag, maxoff, pg_off, color);
4096
4097	for (i = 0; i < nppods; ++i) {
4098		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4099		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4100		req = mtod(m, struct ulp_mem_io *);
4101		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4102		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4103		req->wr.wr_lo = 0;
4104		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4105					   V_ULPTX_CMD(ULP_MEM_WRITE));
4106		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4107				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4108
4109		p = (struct pagepod *)(req + 1);
4110		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4111			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4112			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4113						  V_PPOD_COLOR(color));
4114			p->pp_max_offset = htonl(maxoff);
4115			p->pp_page_offset = htonl(pg_off);
4116			p->pp_rsvd = 0;
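			/*
			 * A pod holds 5 page pointers but advances over only
			 * 4 pages (pidx starts at 4 * i), so consecutive pods
			 * overlap by one page; slots past the end of the
			 * gather list are zeroed.
			 */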
4117			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4118				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4119				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4120		} else
4121			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4122		send_or_defer(toep, m, 0);
4123		ppod_addr += PPOD_SIZE;
4124	}
4125	return (0);
4126}
4127
4128/*
4129 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4130 */
4131static inline void
4132mk_cpl_barrier_ulp(struct cpl_barrier *b)
4133{
4134	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4135
4136	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4137	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4138	b->opcode = CPL_BARRIER;
4139}
4140
4141/*
4142 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4143 */
4144static inline void
4145mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4146{
4147	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4148
4150	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4151	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4152	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4153	req->cpuno = htons(cpuno);
4154}
4155
4156/*
4157 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4158 */
4159static inline void
4160mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4161                     unsigned int word, uint64_t mask, uint64_t val)
4162{
4163	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4164
4165	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4166	    tid, word, mask, val);
4167
4168	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4169	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4170	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4171	req->reply = V_NO_REPLY(1);
4172	req->cpu_idx = 0;
4173	req->word = htons(word);
4174	req->mask = htobe64(mask);
4175	req->val = htobe64(val);
4176}
4177
4178/*
4179 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4180 */
4181static void
4182mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4183    unsigned int tid, unsigned int credits)
4184{
4185	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4186
4187	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4188	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4189	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4190	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4191	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4192				 V_RX_CREDITS(credits));
4193}
4194
4195void
4196t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4197{
4198	unsigned int wrlen;
4199	struct mbuf *m;
4200	struct work_request_hdr *wr;
4201	struct cpl_barrier *lock;
4202	struct cpl_set_tcb_field *req;
4203	struct cpl_get_tcb *getreq;
4204	struct ddp_state *p = &toep->tp_ddp_state;
4205
4206#if 0
4207	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4208#endif
4209	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4210		sizeof(*getreq);
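	/*
	 * The compound WR built below is laid out as BARRIER, SET_TCB_FIELD,
	 * GET_TCB, BARRIER; sizeof(*lock) is counted twice above to cover
	 * both barriers.
	 */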
4211	m = m_gethdr_nofail(wrlen);
4212	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4213	wr = mtod(m, struct work_request_hdr *);
4214	bzero(wr, wrlen);
4215
4216	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4217	m->m_pkthdr.len = m->m_len = wrlen;
4218
4219	lock = (struct cpl_barrier *)(wr + 1);
4220	mk_cpl_barrier_ulp(lock);
4221
4222	req = (struct cpl_set_tcb_field *)(lock + 1);
4223
4224	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4225
4226	/* Hmmm, not sure if this is actually a good thing: reactivating
4227	 * the other buffer might be an issue if it has been completed
4228	 * already.  However, that is unlikely, since the fact that the UBUF
4229	 * is not completed indicates that there is no outstanding data.
4230	 */
4231	if (bufidx == 0)
4232		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4233				     V_TF_DDP_ACTIVE_BUF(1) |
4234				     V_TF_DDP_BUF0_VALID(1),
4235				     V_TF_DDP_ACTIVE_BUF(1));
4236	else
4237		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4238				     V_TF_DDP_ACTIVE_BUF(1) |
4239				     V_TF_DDP_BUF1_VALID(1), 0);
4240
4241	getreq = (struct cpl_get_tcb *)(req + 1);
4242	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4243
4244	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4245
4246	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4248	p->get_tcb_count++;
4249
4250#ifdef T3_TRACE
4251	T3_TRACE1(TIDTB(so),
4252		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4253#endif
4254	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4255}
4256
4257/**
4258 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4259 * @toep: the offload control block associated with the buffers
4260 * @bufidx: index of HW DDP buffer (0 or 1)
4261 * @tag0: new tag for HW buffer 0
4262 * @tag1: new tag for HW buffer 1
4263 * @len: new length for HW buf @bufidx
4264 *
4265 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4266 * buffer by changing the buffer tag and length and setting the valid and
4267 * active flag accordingly.  The caller must ensure the new buffer is at
4268 * least as big as the existing one.  Since we typically reprogram both HW
4269 * buffers, this function sets both tags for convenience.  The TCB is read
4270 * back to determine how much data was written into the buffer before the
4271 * overlay took place.
4272 */
4273void
4274t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4275	 	       unsigned int tag1, unsigned int len)
4276{
4277	unsigned int wrlen;
4278	struct mbuf *m;
4279	struct work_request_hdr *wr;
4280	struct cpl_get_tcb *getreq;
4281	struct cpl_set_tcb_field *req;
4282	struct ddp_state *p = &toep->tp_ddp_state;
4283
4284	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4285	    bufidx, tag0, tag1, len);
4286#if 0
4287	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4288#endif
4289	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
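	/*
	 * Three SET_TCB_FIELD messages are budgeted for below: one for the
	 * buffer tags, one for the selected buffer's length, and one for the
	 * DDP flags, followed by a GET_TCB readback of the pre-overlay
	 * buffer state.
	 */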
4290	m = m_gethdr_nofail(wrlen);
4291	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4292	wr = mtod(m, struct work_request_hdr *);
4293	m->m_pkthdr.len = m->m_len = wrlen;
4294	bzero(wr, wrlen);
4295
4297	/* Set the ATOMIC flag to make sure that TP processes the following
4298	 * CPLs in an atomic manner and no wire segments can be interleaved.
4299	 */
4300	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4301	req = (struct cpl_set_tcb_field *)(wr + 1);
4302	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4303			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4304			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4305			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4306			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4307	req++;
4308	if (bufidx == 0) {
4309		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4310			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4311			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4312		req++;
4313		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4314			    V_TF_DDP_PUSH_DISABLE_0(1) |
4315			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4316			    V_TF_DDP_PUSH_DISABLE_0(0) |
4317			    V_TF_DDP_BUF0_VALID(1));
4318	} else {
4319		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4320			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4321			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4322		req++;
4323		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4324			    V_TF_DDP_PUSH_DISABLE_1(1) |
4325			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4326			    V_TF_DDP_PUSH_DISABLE_1(0) |
4327			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4328	}
4329
4330	getreq = (struct cpl_get_tcb *)(req + 1);
4331	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4332
4333	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4335	p->get_tcb_count++;
4336
4337#ifdef T3_TRACE
4338	T3_TRACE4(TIDTB(sk),
4339		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4340		  "len %d",
4341		  bufidx, tag0, tag1, len);
4342#endif
4343	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4344}
4345
4346/*
4347 * Sends a compound WR containing all the CPL messages needed to program the
4348 * two HW DDP buffers, namely optionally setting up the length and offset of
4349 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4350 */
4351void
4352t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4353		      unsigned int len1, unsigned int offset1,
4354                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4355{
4356	unsigned int wrlen;
4357	struct mbuf *m;
4358	struct work_request_hdr *wr;
4359	struct cpl_set_tcb_field *req;
4360
4361	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4362	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4363
4364#if 0
4365	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4366#endif
4367	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4368		(len1 ? sizeof(*req) : 0) +
4369		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
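	/*
	 * Budget the WR for only the messages actually emitted below: the
	 * DDP-flags update is always present, while the two buffer
	 * programming messages and the RX_DATA_ACK are optional.
	 */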
4370	m = m_gethdr_nofail(wrlen);
4371	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4372	wr = mtod(m, struct work_request_hdr *);
4373	bzero(wr, wrlen);
4374
4375	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4376	m->m_pkthdr.len = m->m_len = wrlen;
4377
4378	req = (struct cpl_set_tcb_field *)(wr + 1);
4379	if (len0) {                  /* program buffer 0 offset and length */
4380		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4381			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4382			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4383			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4384			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4385		req++;
4386	}
4387	if (len1) {                  /* program buffer 1 offset and length */
4388		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4389			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4390			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4391			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4392			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4393		req++;
4394	}
4395
4396	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4397			     ddp_flags);
4398
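	/*
	 * Optionally piggy-back an RX_DATA_ACK that returns the receive
	 * credits accumulated since the last window update.
	 */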
4399	if (modulate) {
4400		mk_rx_data_ack_ulp(toep,
4401		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4402		    toep->tp_copied_seq - toep->tp_rcv_wup);
4403		toep->tp_rcv_wup = toep->tp_copied_seq;
4404	}
4405
4406#ifdef T3_TRACE
4407	T3_TRACE5(TIDTB(sk),
4408		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4409		  "modulate %d",
4410		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4411		  modulate);
4412#endif
4413
4414	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4415}
4416
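/*
 * Precompute mbuf_wrs[i], the number of work requests needed to send an
 * mbuf chain of i fragments.  A minimal reading of the arithmetic below:
 * the scatter-gather list appears to cost 3 flits per pair of entries
 * (roughly ceil(3 * i / 2)) plus 3 flits of WR and TX_DATA header, split
 * across work requests of wr_len flits each.
 */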
4417void
4418t3_init_wr_tab(unsigned int wr_len)
4419{
4420	int i;
4421
4422	if (mbuf_wrs[1])     /* already initialized */
4423		return;
4424
4425	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4426		int sgl_len = (3 * i) / 2 + (i & 1);
4427
4428		sgl_len += 3;
4429		mbuf_wrs[i] = sgl_len <= wr_len ?
4430		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4431	}
4432
4433	wrlen = wr_len * 8;
4434}
4435
4436int
4437t3_init_cpl_io(void)
4438{
4439#ifdef notyet
4440	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4441	if (!tcphdr_skb) {
4442		log(LOG_ERR,
4443		       "Chelsio TCP offload: can't allocate sk_buff\n");
4444		return -1;
4445	}
4446	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4447	tcphdr_skb->h.raw = tcphdr_skb->data;
4448	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4449#endif
4450
4451	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4452	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4453	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4454	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4455	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4456	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4457	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4458	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4459	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4460	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4461	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4462	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4463	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4464	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4465	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4466	return (0);
4467}
4468
4469