/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 185571 2008-12-02 21:37:28Z bz $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_tcp_do_autosndbuf tcp_do_autosndbuf
#define V_tcp_autosndbuf_max tcp_autosndbuf_max
#define V_tcp_do_rfc1323 tcp_do_rfc1323
#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
#define V_tcpstat tcpstat
#endif

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <cxgb_osdep.h>
#include <sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#if __FreeBSD_version >= 800056
#include <netinet/vinet.h>
#endif

#include <t3cdev.h>
#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <sys/mvec.h>
#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_l2t.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
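
/*
 * Illustration only (not code from this file): a sender tracking sequence
 * space for a ULP packet would fold the table in roughly like this, where
 * "submode" stands for the packet's 2-bit ULP submode:
 *
 *	len += t3_ulp_extra_len[submode & 3];
 */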

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

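/*
 * Debug wrapper around sbappendstream_locked().  It walks both the existing
 * sockbuf chain and the new chain before and after the append, asserting
 * that every mbuf is either plain or EXT_EXTREF external storage and that no
 * m_next pointer has been poisoned.  The caller must hold the sockbuf lock,
 * and the sockbuf must have SB_NOCOALESCE set.
 */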
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	INIT_VNET_INET(so->so_vnet);
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Send buffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && (snd->sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}
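
/*
 * Worked example of the 32KB send-buffer units above: a socket with
 * sb_hiwat = 256KB programs V_TX_SNDBUF(262144 >> 15) = V_TX_SNDBUF(8),
 * i.e. eight 32KB units; with autosizing enabled the cap comes from
 * V_tcp_autosndbuf_max instead.
 */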

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}
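
/*
 * Sketch of how the credits stashed in m_pkthdr.csum_data above come back.
 * The names peek_wr()/dequeue_wr() are assumed counterparts of the
 * enqueue_wr() used above and are illustrative only; the real completion
 * handling is driven by CPL_WR_ACK messages:
 *
 *	while (credits && (m = peek_wr(toep)) != NULL) {
 *		if (credits < m->m_pkthdr.csum_data)
 *			break;				// partially acked WR
 *		credits -= m->m_pkthdr.csum_data;
 *		toep->tp_wr_avail += m->m_pkthdr.csum_data;
 *		dequeue_wr(toep);
 *	}
 */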

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
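
/*
 * Example of the 16KB rule above: with rcv_wnd = 64KB and rx_credit_thres =
 * 32KB, credits are normally returned once 32KB has been read; but if the
 * unreturned credits ever reach 48KB (leaving less than 16KB of window open)
 * the return is forced regardless of the threshold.
 */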

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	if (toep == NULL)
		return;
	tp = toep->tp_tp;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to the freelist).  [Note that the peer
 * should set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))
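
/*
 * Both macros build a single 64-bit SET_TCB_FIELD mask/value pair anchored
 * at W_TCB_RX_DDP_FLAGS: the DDP flag bits live in the low 32-bit TCB word
 * and the buffer 0 offset/length fields sit in the adjacent word, hence the
 * << 32 on the offset/len terms.  The "partially placed" state is offset 1
 * into a buffer of length 2, which is enough to make TP arm its push timer.
 */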

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return (-EINVAL);
#endif
	return (0);
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
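
/*
 * Example: with an MTU table of {1500, 2000, 4096, 9000}, a target MTU of
 * 3000 yields index 1 (2000), the largest entry not exceeding the target.
 * A target smaller than the first entry still yields index 0.
 */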

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
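
/*
 * The constant 40 above is the size of the fixed IPv4 + TCP headers
 * (20 + 20 bytes); the hardware MTU table stores full packet sizes, so MSS
 * values are converted by adding/subtracting it.  E.g. a path MTU of 1500
 * gives t_maxseg = 1460, which maps back to the 1500-byte table entry.
 */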

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	INIT_VNET_INET(curvnet);	/* no socket in scope, use the current vnet */
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
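
/*
 * Example: a 256KB target window needs three shifts before it fits in the
 * 16-bit TCP window field (262144 -> 131072 -> 65536 -> 32768), so
 * wscale = 3; anything at or below 65535 bytes uses wscale = 0.
 */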

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	INIT_VNET_INET(so->so_vnet);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return (V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx));
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}
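
/*
 * The receive buffer size in opt0l is expressed in 1KB units (rcv_wnd >> 10)
 * and saturates at M_RCV_BUFSIZ; e.g. a 48KB window encodes as 48, while
 * anything larger than M_RCV_BUFSIZ KB is clamped and the remainder of the
 * window is opened later via RX_DATA_ACK credits (see MAX_RCV_WND above).
 */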

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return (status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS);
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);
	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * mbuf is required.  However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done.  If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}
1980
1981/*
1982 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1983 * in that case they are similar to DDP completions.
1984 */
1985static int
1986do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1987{
1988	struct toepcb *toep = (struct toepcb *)ctx;
1989
1990	/* OK if socket doesn't exist */
1991	if (toep == NULL) {
1992		printf("null toep in do_get_tcb_rpl\n");
1993		return (CPL_RET_BUF_DONE);
1994	}
1995
1996	inp_wlock(toep->tp_tp->t_inpcb);
1997	tcb_rpl_as_ddp_complete(toep, m);
1998	inp_wunlock(toep->tp_tp->t_inpcb);
1999
2000	return (0);
2001}
2002
2003static void
2004handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2005{
2006	struct tcpcb *tp = toep->tp_tp;
2007	struct socket *so;
2008	struct ddp_state *q;
2009	struct ddp_buf_state *bsp;
2010	struct cpl_rx_data *hdr = cplhdr(m);
2011	unsigned int rcv_nxt = ntohl(hdr->seq);
2012	struct sockbuf *rcv;
2013
2014	if (tp->rcv_nxt == rcv_nxt)
2015		return;
2016
2017	inp_lock_assert(tp->t_inpcb);
2018	so  = inp_inpcbtosocket(tp->t_inpcb);
2019	rcv = so_sockbuf_rcv(so);
2020	sockbuf_lock(rcv);
2021
2022	q = &toep->tp_ddp_state;
2023	bsp = &q->buf_state[q->cur_buf];
2024	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not greater than tp->rcv_nxt=0x%08x",
2025		rcv_nxt, tp->rcv_nxt));
2026	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2027	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2028	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2029	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2030
2031#ifdef T3_TRACE
2032	if ((int)m->m_pkthdr.len < 0) {
2033		t3_ddp_error(so, "handle_ddp_data: neg len");
2034	}
2035#endif
2036	m->m_ddp_gl = (unsigned char *)bsp->gl;
2037	m->m_flags |= M_DDP;
2038	m->m_cur_offset = bsp->cur_offset;
2039	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2040	if (bsp->flags & DDP_BF_NOCOPY)
2041		bsp->flags &= ~DDP_BF_NOCOPY;
2042
2043	m->m_seq = tp->rcv_nxt;
2044	tp->rcv_nxt = rcv_nxt;
2045	bsp->cur_offset += m->m_pkthdr.len;
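	/*
	 * The DDP state alternates between two HW buffers; flipping the
	 * cur_buf bit advances to the other one unless NOFLIP pins us to
	 * the current buffer.
	 */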
2046	if (!(bsp->flags & DDP_BF_NOFLIP))
2047		q->cur_buf ^= 1;
2048	/*
2049	 * For now, don't re-enable DDP after a connection fell out of DDP
2050	 * mode.
2051	 */
2052	q->ubuf_ddp_ready = 0;
2053	sockbuf_unlock(rcv);
2054}
2055
2056/*
2057 * Process new data received for a connection.
2058 */
2059static void
2060new_rx_data(struct toepcb *toep, struct mbuf *m)
2061{
2062	struct cpl_rx_data *hdr = cplhdr(m);
2063	struct tcpcb *tp = toep->tp_tp;
2064	struct socket *so;
2065	struct sockbuf *rcv;
2066	int state;
2067	int len = be16toh(hdr->len);
2068
2069	inp_wlock(tp->t_inpcb);
2070
2071	so  = inp_inpcbtosocket(tp->t_inpcb);
2072
2073	if (__predict_false(so_no_receive(so))) {
2074		handle_excess_rx(toep, m);
2075		inp_wunlock(tp->t_inpcb);
2076		TRACE_EXIT;
2077		return;
2078	}
2079
2080	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2081		handle_ddp_data(toep, m);
2082
2083	m->m_seq = ntohl(hdr->seq);
2084	m->m_ulp_mode = 0;                    /* for iSCSI */
2085
2086#if VALIDATE_SEQ
2087	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2088		log(LOG_ERR,
2089		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2090		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2091		       tp->rcv_nxt);
2092		m_freem(m);
2093		inp_wunlock(tp->t_inpcb);
2094		return;
2095	}
2096#endif
2097	m_adj(m, sizeof(*hdr));
2098
2099#ifdef URGENT_DATA_SUPPORTED
2100	/*
2101	 * We don't handle urgent data yet
2102	 */
2103	if (__predict_false(hdr->urg))
2104		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2105	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2106		     tp->urg_seq - tp->rcv_nxt < skb->len))
2107		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2108							 tp->rcv_nxt];
2109#endif
2110	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2111		toep->tp_delack_mode = hdr->dack_mode;
2112		toep->tp_delack_seq = tp->rcv_nxt;
2113	}
2114	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2115	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2116
2117	if (len < m->m_pkthdr.len)
2118		m->m_pkthdr.len = m->m_len = len;
2119
2120	tp->rcv_nxt += m->m_pkthdr.len;
2121	tp->t_rcvtime = ticks;
2122	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2123	CTR2(KTR_TOM,
2124	    "new_rx_data: seq 0x%x len %u",
2125	    m->m_seq, m->m_pkthdr.len);
2126	inp_wunlock(tp->t_inpcb);
2127	rcv = so_sockbuf_rcv(so);
2128	sockbuf_lock(rcv);
2129#if 0
2130	if (sb_notify(rcv))
2131		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2132#endif
2133	SBAPPEND(rcv, m);
2134
2135#ifdef notyet
2136	/*
2137	 * We're giving too many credits to the card, so this check is
2138	 * disabled for now to keep things moving.
2139	 */
2140	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2142	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2143		so, rcv->sb_cc, rcv->sb_mbmax));
2144#endif
2145
2146
2147	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2148	    rcv->sb_cc, rcv->sb_mbcnt);
2149
2150	state = so_state_get(so);
2151	if (__predict_true((state & SS_NOFDREF) == 0))
2152		so_sorwakeup_locked(so);
2153	else
2154		sockbuf_unlock(rcv);
2155}
2156
2157/*
2158 * Handler for RX_DATA CPL messages.
2159 */
2160static int
2161do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2162{
2163	struct toepcb *toep = (struct toepcb *)ctx;
2164
2165	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2166
2167	new_rx_data(toep, m);
2168
2169	return (0);
2170}
2171
2172static void
2173new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2174{
2175	struct tcpcb *tp;
2176	struct ddp_state *q;
2177	struct ddp_buf_state *bsp;
2178	struct cpl_rx_data_ddp *hdr;
2179	struct socket *so;
2180	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2181	int nomoredata = 0;
2182	unsigned int delack_mode;
2183	struct sockbuf *rcv;
2184
2185	tp = toep->tp_tp;
2186	inp_wlock(tp->t_inpcb);
2187	so = inp_inpcbtosocket(tp->t_inpcb);
2188
2189	if (__predict_false(so_no_receive(so))) {
2190
2191		handle_excess_rx(toep, m);
2192		inp_wunlock(tp->t_inpcb);
2193		return;
2194	}
2195
2196	q = &toep->tp_ddp_state;
2197	hdr = cplhdr(m);
2198	ddp_report = ntohl(hdr->u.ddp_report);
2199	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2200	bsp = &q->buf_state[buf_idx];
2201
2202	CTR4(KTR_TOM,
2203	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2204	    "hdr seq 0x%x len %u",
2205	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2206	    ntohs(hdr->len));
2207	CTR3(KTR_TOM,
2208	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2209	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2210
2211	ddp_len = ntohs(hdr->len);
2212	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2213
2214	delack_mode = G_DDP_DACK_MODE(ddp_report);
2215	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2216		toep->tp_delack_mode = delack_mode;
2217		toep->tp_delack_seq = tp->rcv_nxt;
2218	}
2219
2220	m->m_seq = tp->rcv_nxt;
2221	tp->rcv_nxt = rcv_nxt;
2222
2223	tp->t_rcvtime = ticks;
2224	/*
2225	 * Store the length in m->m_len.  We are changing the meaning of
2226	 * m->m_len here, we need to be very careful that nothing from now on
2227	 * interprets ->len of this packet the usual way.
2228	 */
2229	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2230	inp_wunlock(tp->t_inpcb);
2231	CTR3(KTR_TOM,
2232	    "new_rx_data_ddp: m_len=%u rcv_nxt 0x%08x rcv_nxt_prev=0x%08x",
2233	    m->m_len, rcv_nxt, m->m_seq);
2234	/*
2235	 * Figure out where the new data was placed in the buffer and store it
2236	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the
2237	 * consumer needs to account for the page pod's pg_offset.
2238	 */
2239	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2240	m->m_cur_offset = end_offset - m->m_pkthdr.len;
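	/*
	 * E.g. with G_DDP_OFFSET(ddp_report) = 0x2000 and ddp_len = 0x400,
	 * the payload occupies [0x2000, 0x2400) in the buffer: end_offset is
	 * 0x2400 and, with SW and HW rcv_nxt in sync so that pkthdr.len ==
	 * ddp_len, m_cur_offset comes back out as 0x2000.
	 */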
2241
2242	rcv = so_sockbuf_rcv(so);
2243	sockbuf_lock(rcv);
2244
2245	m->m_ddp_gl = (unsigned char *)bsp->gl;
2246	m->m_flags |= M_DDP;
2247	bsp->cur_offset = end_offset;
2248	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2249
2250	/*
2251	 * Length is only meaningful for kbuf
2252	 */
2253	if (!(bsp->flags & DDP_BF_NOCOPY))
2254		KASSERT(m->m_len <= bsp->gl->dgl_length,
2255		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2256			m->m_len, bsp->gl->dgl_length));
2257
2258	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2259	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2260        /*
2261	 * Bit 0 of flags stores whether the DDP buffer is completed.
2262	 * Note that other parts of the code depend on this being in bit 0.
2263	 */
2264	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2265		panic("spurious ddp completion");
2266	} else {
2267		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2268		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2269			q->cur_buf ^= 1;                     /* flip buffers */
2270	}
2271
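	/*
	 * At this point m_ddp_flags is a small bit vector: bit 0 means the
	 * HW buffer completed, and DDP_BF_NOCOPY/DDP_BF_PSH/DDP_BF_NODATA
	 * may still be or'ed in below before the mbuf is appended.
	 */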
2272	if (bsp->flags & DDP_BF_NOCOPY) {
2273		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2274		bsp->flags &= ~DDP_BF_NOCOPY;
2275	}
2276
2277	if (ddp_report & F_DDP_PSH)
2278		m->m_ddp_flags |= DDP_BF_PSH;
2279	if (nomoredata)
2280		m->m_ddp_flags |= DDP_BF_NODATA;
2281
2282#ifdef notyet
2283	skb_reset_transport_header(skb);
2284	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2285#endif
2286	SBAPPEND(rcv, m);
2287
2288	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2289	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2290		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2291		so_sorwakeup_locked(so);
2292	else
2293		sockbuf_unlock(rcv);
2294}
2295
2296#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2297		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2298		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2299		 F_DDP_INVALID_PPOD)
2300
2301/*
2302 * Handler for RX_DATA_DDP CPL messages.
2303 */
2304static int
2305do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2306{
2307	struct toepcb *toep = ctx;
2308	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2309
2310	VALIDATE_SOCK(so);
2311
2312	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2313		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2314		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2315		return (CPL_RET_BUF_DONE);
2316	}
2317#if 0
2318	skb->h.th = tcphdr_skb->h.th;
2319#endif
2320	new_rx_data_ddp(toep, m);
2321	return (0);
2322}
2323
2324static void
2325process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2326{
2327	struct tcpcb *tp = toep->tp_tp;
2328	struct socket *so;
2329	struct ddp_state *q;
2330	struct ddp_buf_state *bsp;
2331	struct cpl_rx_ddp_complete *hdr;
2332	unsigned int ddp_report, buf_idx, when, delack_mode;
2333	int nomoredata = 0;
2334	struct sockbuf *rcv;
2335
2336	inp_wlock(tp->t_inpcb);
2337	so = inp_inpcbtosocket(tp->t_inpcb);
2338
2339	if (__predict_false(so_no_receive(so))) {
2340		struct inpcb *inp = so_sotoinpcb(so);
2341
2342		handle_excess_rx(toep, m);
2343		inp_wunlock(inp);
2344		return;
2345	}
2346	q = &toep->tp_ddp_state;
2347	hdr = cplhdr(m);
2348	ddp_report = ntohl(hdr->ddp_report);
2349	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2350	m->m_pkthdr.csum_data = tp->rcv_nxt;
2351
2352	rcv = so_sockbuf_rcv(so);
2353	sockbuf_lock(rcv);
2354
2355	bsp = &q->buf_state[buf_idx];
2356	when = bsp->cur_offset;
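	/*
	 * The completion reports the final HW write offset within the
	 * buffer, so the number of newly placed bytes is that offset minus
	 * the offset recorded when the previous chunk was consumed.
	 */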
2357	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2358	tp->rcv_nxt += m->m_len;
2359	tp->t_rcvtime = ticks;
2360
2361	delack_mode = G_DDP_DACK_MODE(ddp_report);
2362	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2363		toep->tp_delack_mode = delack_mode;
2364		toep->tp_delack_seq = tp->rcv_nxt;
2365	}
2366#ifdef notyet
2367	skb_reset_transport_header(skb);
2368	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2369#endif
2370	inp_wunlock(tp->t_inpcb);
2371
2372	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2373	CTR5(KTR_TOM,
2374		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2375		  "ddp_report 0x%x offset %u, len %u",
2376		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2377		   G_DDP_OFFSET(ddp_report), m->m_len);
2378
2379	m->m_cur_offset = bsp->cur_offset;
2380	bsp->cur_offset += m->m_len;
2381
2382	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2383		q->cur_buf ^= 1;                     /* flip buffers */
2384		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2385			nomoredata = 1;
2386	}
2387
2388	CTR4(KTR_TOM,
2389		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2390		  "ddp_report %u offset %u",
2391		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2392		   G_DDP_OFFSET(ddp_report));
2393
2394	m->m_ddp_gl = (unsigned char *)bsp->gl;
2395	m->m_flags |= M_DDP;
2396	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2397	if (bsp->flags & DDP_BF_NOCOPY)
2398		bsp->flags &= ~DDP_BF_NOCOPY;
2399	if (nomoredata)
2400		m->m_ddp_flags |= DDP_BF_NODATA;
2401
2402	SBAPPEND(rcv, m);
2403	if ((so_state_get(so) & SS_NOFDREF) == 0)
2404		so_sorwakeup_locked(so);
2405	else
2406		sockbuf_unlock(rcv);
2407}
2408
2409/*
2410 * Handler for RX_DDP_COMPLETE CPL messages.
2411 */
2412static int
2413do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2414{
2415	struct toepcb *toep = ctx;
2416
2417	VALIDATE_SOCK(so);
2418#if 0
2419	skb->h.th = tcphdr_skb->h.th;
2420#endif
2421	process_ddp_complete(toep, m);
2422	return (0);
2423}
2424
2425/*
2426 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2427 * socket state before calling tcp_time_wait to comply with its expectations.
2428 */
2429static void
2430enter_timewait(struct tcpcb *tp)
2431{
2432	/*
2433	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2434	 * process peer_close because we don't want to carry the peer FIN in
2435	 * the socket's receive queue and if we increment rcv_nxt without
2436	 * having the FIN in the receive queue we'll confuse facilities such
2437	 * as SIOCINQ.
2438	 */
2439	inp_wlock(tp->t_inpcb);
2440	tp->rcv_nxt++;
2441
2442	tp->ts_recent_age = 0;	     /* defeat recycling */
2443	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2444	inp_wunlock(tp->t_inpcb);
2445	tcp_offload_twstart(tp);
2446}
2447
2448/*
2449 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2450 * function deals with the data that may be reported along with the FIN.
2451 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2452 * perform normal FIN-related processing.  In the latter case 1 indicates that
2453	 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed,
2454	 * 0 that the mbuf can be freed.
2455 */
2456static int
2457handle_peer_close_data(struct socket *so, struct mbuf *m)
2458{
2459	struct tcpcb *tp = so_sototcpcb(so);
2460	struct toepcb *toep = tp->t_toe;
2461	struct ddp_state *q;
2462	struct ddp_buf_state *bsp;
2463	struct cpl_peer_close *req = cplhdr(m);
2464	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2465	struct sockbuf *rcv;
2466
2467	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2468		return (0);
2469
2470	CTR0(KTR_TOM, "handle_peer_close_data");
2471	if (__predict_false(so_no_receive(so))) {
2472		handle_excess_rx(toep, m);
2473
2474		/*
2475		 * Although we discard the data we want to process the FIN so
2476		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2477		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2478		 * may be what will close the connection.  We return 1 because
2479		 * handle_excess_rx() already freed the packet.
2480		 */
2481		return (1);
2482	}
2483
2484	inp_lock_assert(tp->t_inpcb);
2485	q = &toep->tp_ddp_state;
2486	rcv = so_sockbuf_rcv(so);
2487	sockbuf_lock(rcv);
2488
2489	bsp = &q->buf_state[q->cur_buf];
2490	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2491	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2492	m->m_ddp_gl = (unsigned char *)bsp->gl;
2493	m->m_flags |= M_DDP;
2494	m->m_cur_offset = bsp->cur_offset;
2495	m->m_ddp_flags =
2496	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2497	m->m_seq = tp->rcv_nxt;
2498	tp->rcv_nxt = rcv_nxt;
2499	bsp->cur_offset += m->m_pkthdr.len;
2500	if (!(bsp->flags & DDP_BF_NOFLIP))
2501		q->cur_buf ^= 1;
2502#ifdef notyet
2503	skb_reset_transport_header(skb);
2504	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2505#endif
2506	tp->t_rcvtime = ticks;
2507	SBAPPEND(rcv, m);
2508	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2509		so_sorwakeup_locked(so);
2510	else
2511		sockbuf_unlock(rcv);
2512
2513	return (1);
2514}
2515
2516/*
2517 * Handle a peer FIN.
2518 */
2519static void
2520do_peer_fin(struct toepcb *toep, struct mbuf *m)
2521{
2522	struct socket *so;
2523	struct tcpcb *tp = toep->tp_tp;
2524	int keep, action;
2525
2526	action = keep = 0;
2527	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2528	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2529		printf("abort_pending set\n");
2530
2531		goto out;
2532	}
2533	inp_wlock(tp->t_inpcb);
2534	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2535	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2536		keep = handle_peer_close_data(so, m);
2537		if (keep < 0) {
2538			inp_wunlock(tp->t_inpcb);
2539			return;
2540		}
2541	}
2542	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2543		CTR1(KTR_TOM,
2544		    "waking up waiters for cantrcvmore on %p ", so);
2545		socantrcvmore(so);
2546
2547		/*
2548		 * If connection is half-synchronized
2549		 * (ie NEEDSYN flag on) then delay ACK,
2550		 * so it may be piggybacked when SYN is sent.
2551		 * Otherwise, since we received a FIN then no
2552		 * more input can be expected, send ACK now.
2553		 */
2554		if (tp->t_flags & TF_NEEDSYN)
2555			tp->t_flags |= TF_DELACK;
2556		else
2557			tp->t_flags |= TF_ACKNOW;
2558		tp->rcv_nxt++;
2559	}
2560
2561	switch (tp->t_state) {
2562	case TCPS_SYN_RECEIVED:
2563		tp->t_starttime = ticks;
2564	/* FALLTHROUGH */
2565	case TCPS_ESTABLISHED:
2566		tp->t_state = TCPS_CLOSE_WAIT;
2567		break;
2568	case TCPS_FIN_WAIT_1:
2569		tp->t_state = TCPS_CLOSING;
2570		break;
2571	case TCPS_FIN_WAIT_2:
2572		/*
2573		 * If we've sent an abort_req we must have sent it too late,
2574		 * HW will send us a reply telling us so, and this peer_close
2575		 * is really the last message for this connection and needs to
2576		 * be treated as an abort_rpl, i.e., transition the connection
2577		 * to TCP_CLOSE (note that the host stack does this at the
2578		 * time of generating the RST but we must wait for HW).
2579		 * Otherwise we enter TIME_WAIT.
2580		 */
2581		t3_release_offload_resources(toep);
2582		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2583			action = TCP_CLOSE;
2584		} else {
2585			action = TCP_TIMEWAIT;
2586		}
2587		break;
2588	default:
2589		log(LOG_ERR,
2590		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2591		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2592	}
2593	inp_wunlock(tp->t_inpcb);
2594
2595	if (action == TCP_TIMEWAIT) {
2596		enter_timewait(tp);
2597	} else if (action == TCP_DROP) {
2598		tcp_offload_drop(tp, 0);
2599	} else if (action == TCP_CLOSE) {
2600		tcp_offload_close(tp);
2601	}
2602
2603#ifdef notyet
2604	/* Do not send POLL_HUP for half duplex close. */
2605	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2606	    sk->sk_state == TCP_CLOSE)
2607		sk_wake_async(so, 1, POLL_HUP);
2608	else
2609		sk_wake_async(so, 1, POLL_IN);
2610#endif
2611
2612out:
2613	if (!keep)
2614		m_free(m);
2615}
2616
2617/*
2618 * Handler for PEER_CLOSE CPL messages.
2619 */
2620static int
2621do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2622{
2623	struct toepcb *toep = (struct toepcb *)ctx;
2624
2625	VALIDATE_SOCK(so);
2626
2627	do_peer_fin(toep, m);
2628	return (0);
2629}
2630
2631static void
2632process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2633{
2634	struct cpl_close_con_rpl *rpl = cplhdr(m);
2635	struct tcpcb *tp = toep->tp_tp;
2636	struct socket *so;
2637	int action = 0;
2638	struct sockbuf *rcv;
2639
2640	inp_wlock(tp->t_inpcb);
2641	so = inp_inpcbtosocket(tp->t_inpcb);
2642
2643	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2644
2645	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2646		inp_wunlock(tp->t_inpcb);
2647		goto out;
2648	}
2649
2650	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2651	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2652
2653	switch (tp->t_state) {
2654	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2655		t3_release_offload_resources(toep);
2656		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2657			action = TCP_CLOSE;
2658
2659		} else {
2660			action = TCP_TIMEWAIT;
2661		}
2662		break;
2663	case TCPS_LAST_ACK:
2664		/*
2665		 * In this state we don't care about pending abort_rpl.
2666		 * If we've sent abort_req it was post-close and was sent too
2667		 * late, this close_con_rpl is the actual last message.
2668		 */
2669		t3_release_offload_resources(toep);
2670		action = TCP_CLOSE;
2671		break;
2672	case TCPS_FIN_WAIT_1:
2673		/*
2674		 * If we can't receive any more
2675		 * data, then closing user can proceed.
2676		 * Starting the timer is contrary to the
2677		 * specification, but if we don't get a FIN
2678		 * we'll hang forever.
2679		 *
2680		 * XXXjl:
2681		 * we should release the tp also, and use a
2682		 * compressed state.
2683		 */
2684		if (so)
2685			rcv = so_sockbuf_rcv(so);
2686		else
2687			break;
2688
2689		if (rcv->sb_state & SBS_CANTRCVMORE) {
2690			int timeout;
2691
2692			if (so)
2693				soisdisconnected(so);
2694			timeout = (tcp_fast_finwait2_recycle) ?
2695			    tcp_finwait2_timeout : tcp_maxidle;
2696			tcp_timer_activate(tp, TT_2MSL, timeout);
2697		}
2698		tp->t_state = TCPS_FIN_WAIT_2;
2699		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2700		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2701			action = TCP_DROP;
2702		}
2703
2704		break;
2705	default:
2706		log(LOG_ERR,
2707		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2708		       toep->tp_toedev->tod_name, toep->tp_tid,
2709		       tp->t_state);
2710	}
2711	inp_wunlock(tp->t_inpcb);
2712
2713
2714	if (action == TCP_TIMEWAIT) {
2715		enter_timewait(tp);
2716	} else if (action == TCP_DROP) {
2717		tcp_offload_drop(tp, 0);
2718	} else if (action == TCP_CLOSE) {
2719		tcp_offload_close(tp);
2720	}
2721out:
2722	m_freem(m);
2723}
2724
2725/*
2726 * Handler for CLOSE_CON_RPL CPL messages.
2727 */
2728static int
2729do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2730			    void *ctx)
2731{
2732	struct toepcb *toep = (struct toepcb *)ctx;
2733
2734	process_close_con_rpl(toep, m);
2735	return (0);
2736}
2737
2738/*
2739 * Process abort replies.  We only process these messages if we anticipate
2740 * them as the coordination between SW and HW in this area is somewhat lacking
2741 * and sometimes we get ABORT_RPLs after we are done with the connection that
2742 * originated the ABORT_REQ.
2743 */
2744static void
2745process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2746{
2747	struct tcpcb *tp = toep->tp_tp;
2748	struct socket *so;
2749	int needclose = 0;
2750
2751#ifdef T3_TRACE
2752	T3_TRACE1(TIDTB(sk),
2753		  "process_abort_rpl: GTS rpl pending %d",
2754		  sock_flag(sk, ABORT_RPL_PENDING));
2755#endif
2756
2757	inp_wlock(tp->t_inpcb);
2758	so = inp_inpcbtosocket(tp->t_inpcb);
2759
2760	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2761		/*
2762		 * XXX panic on tcpdrop
2763		 */
2764		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2765			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2766		else {
2767			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2768			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2769			    !is_t3a(toep->tp_toedev)) {
2770				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2771					panic("TP_ABORT_REQ_RCVD set");
2772				t3_release_offload_resources(toep);
2773				needclose = 1;
2774			}
2775		}
2776	}
2777	inp_wunlock(tp->t_inpcb);
2778
2779	if (needclose)
2780		tcp_offload_close(tp);
2781
2782	m_free(m);
2783}
2784
2785/*
2786 * Handle an ABORT_RPL_RSS CPL message.
2787 */
2788static int
2789do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2790{
2791	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2792	struct toepcb *toep;
2793
2794	/*
2795	 * Ignore replies to post-close aborts indicating that the abort was
2796	 * requested too late.  These connections are terminated when we get
2797	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2798	 * arrives the TID is either no longer used or it has been recycled.
2799	 */
2800	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2801discard:
2802		m_free(m);
2803		return (0);
2804	}
2805
2806	toep = (struct toepcb *)ctx;
2807
2808        /*
2809	 * Sometimes we've already closed the socket, e.g., a post-close
2810	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2811	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2812	 * but FW turns the ABORT_REQ into a regular one and so we get
2813	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2814	 */
2815	if (!toep)
2816		goto discard;
2817
2818	if (toep->tp_tp == NULL) {
2819		log(LOG_NOTICE, "removing tid for abort\n");
2820		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2821		if (toep->tp_l2t)
2822			l2t_release(L2DATA(cdev), toep->tp_l2t);
2823
2824		toepcb_release(toep);
2825		goto discard;
2826	}
2827
2828	log(LOG_NOTICE, "toep=%p\n", toep);
2829	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2830
2831	toepcb_hold(toep);
2832	process_abort_rpl(toep, m);
2833	toepcb_release(toep);
2834	return (0);
2835}
2836
2837/*
2838 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2839 * indicate whether RST should be sent in response.
2840 */
2841static int
2842abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2843{
2844	struct tcpcb *tp = so_sototcpcb(so);
2845
2846	switch (abort_reason) {
2847	case CPL_ERR_BAD_SYN:
2848#if 0
2849		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2850#endif
2851	case CPL_ERR_CONN_RESET:
2852		// XXX need to handle SYN_RECV due to crossed SYNs
2853		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2854	case CPL_ERR_XMIT_TIMEDOUT:
2855	case CPL_ERR_PERSIST_TIMEDOUT:
2856	case CPL_ERR_FINWAIT2_TIMEDOUT:
2857	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2858#if 0
2859		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2860#endif
2861		return (ETIMEDOUT);
2862	default:
2863		return (EIO);
2864	}
2865}
2866
2867static inline void
2868set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2869{
2870	struct cpl_abort_rpl *rpl = cplhdr(m);
2871
2872	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2873	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2874	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2875
2876	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2877	rpl->cmd = cmd;
2878}
2879
2880static void
2881send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2882{
2883	struct mbuf *reply_mbuf;
2884	struct cpl_abort_req_rss *req = cplhdr(m);
2885
2886	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2887	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2888	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2889	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2890	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2891	m_free(m);
2892}
2893
2894/*
2895 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2896 */
2897static inline int
2898is_neg_adv_abort(unsigned int status)
2899{
2900	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2901	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2902}
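
/*
 * Negative advice presumably reflects retransmit/persist timer trouble on
 * the path; do_abort_req() below just drops such messages, leaving the
 * connection offloaded.
 */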
2903
2904static void
2905send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2906{
2907	struct mbuf  *reply_mbuf;
2908	struct cpl_abort_req_rss *req = cplhdr(m);
2909
2910	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2911
2912	if (!reply_mbuf) {
2913		/* Defer the reply.  Stick rst_status into req->status. */
2914		req->status = rst_status;
2915		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2916		return;
2917	}
2918
2919	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2920	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2921	m_free(m);
2922
2923	/*
2924	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2925	 * these messages while ARP is pending.  For other connection states
2926	 * it's not a problem.
2927	 */
2928	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2929}
2930
2931#ifdef notyet
2932static void
2933cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2934{
2935	CXGB_UNIMPLEMENTED();
2936#ifdef notyet
2937	struct request_sock *req = child->sk_user_data;
2938
2939	inet_csk_reqsk_queue_removed(parent, req);
2940	synq_remove(tcp_sk(child));
2941	__reqsk_free(req);
2942	child->sk_user_data = NULL;
2943#endif
2944}
2945
2946
2947/*
2948 * Performs the actual work to abort a SYN_RECV connection.
2949 */
2950static void
2951do_abort_syn_rcv(struct socket *child, struct socket *parent)
2952{
2953	struct tcpcb *parenttp = so_sototcpcb(parent);
2954	struct tcpcb *childtp = so_sototcpcb(child);
2955
2956	/*
2957	 * If the server is still open we clean up the child connection,
2958	 * otherwise the server already did the clean up as it was purging
2959	 * its SYN queue and the skb was just sitting in its backlog.
2960	 */
2961	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2962		cleanup_syn_rcv_conn(child, parent);
2963		inp_wlock(childtp->t_inpcb);
2964		t3_release_offload_resources(childtp->t_toe);
2965		inp_wunlock(childtp->t_inpcb);
2966		tcp_offload_close(childtp);
2967	}
2968}
2969#endif
2970
2971/*
2972 * Handle abort requests for a SYN_RECV connection.  These need extra work
2973 * because the socket is on its parent's SYN queue.
2974 */
2975static int
2976abort_syn_rcv(struct socket *so, struct mbuf *m)
2977{
2978	CXGB_UNIMPLEMENTED();
2979#ifdef notyet
2980	struct socket *parent;
2981	struct toedev *tdev = toep->tp_toedev;
2982	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2983	struct socket *oreq = so->so_incomp;
2984	struct t3c_tid_entry *t3c_stid;
2985	struct tid_info *t;
2986
2987	if (!oreq)
2988		return -1;        /* somehow we are not on the SYN queue */
2989
2990	t = &(T3C_DATA(cdev))->tid_maps;
2991	t3c_stid = lookup_stid(t, oreq->ts_recent);
2992	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2993
2994	so_lock(parent);
2995	do_abort_syn_rcv(so, parent);
2996	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2997	so_unlock(parent);
2998#endif
2999	return (0);
3000}
3001
3002/*
3003 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3004 * request except that we need to reply to it.
3005 */
3006static void
3007process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3008{
3009	int rst_status = CPL_ABORT_NO_RST;
3010	const struct cpl_abort_req_rss *req = cplhdr(m);
3011	struct tcpcb *tp = toep->tp_tp;
3012	struct socket *so;
3013	int needclose = 0;
3014
3015	inp_wlock(tp->t_inpcb);
3016	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3017	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3018		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3019		m_free(m);
3020		goto skip;
3021	}
3022
3023	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3024	/*
3025	 * Three cases to consider:
3026	 * a) We haven't sent an abort_req; close the connection.
3027	 * b) We have sent a post-close abort_req that will get to TP too late
3028	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3029	 *    be ignored and the connection should be closed now.
3030	 * c) We have sent a regular abort_req that will get to TP too late.
3031	 *    That will generate an abort_rpl with status 0, wait for it.
3032	 */
3033	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3034	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3035		int error;
3036
3037		error = abort_status_to_errno(so, req->status,
3038		    &rst_status);
3039		so_error_set(so, error);
3040
3041		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3042			so_sorwakeup(so);
3043		/*
3044		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3045		 * returns 0 it has taken care of the abort.
3046		 */
3047		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3048			goto skip;
3049
3050		t3_release_offload_resources(toep);
3051		needclose = 1;
3052	}
3053	inp_wunlock(tp->t_inpcb);
3054
3055	if (needclose)
3056		tcp_offload_close(tp);
3057
3058	send_abort_rpl(m, tdev, rst_status);
3059	return;
3060skip:
3061	inp_wunlock(tp->t_inpcb);
3062}
3063
3064/*
3065 * Handle an ABORT_REQ_RSS CPL message.
3066 */
3067static int
3068do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3069{
3070	const struct cpl_abort_req_rss *req = cplhdr(m);
3071	struct toepcb *toep = (struct toepcb *)ctx;
3072
3073	if (is_neg_adv_abort(req->status)) {
3074		m_free(m);
3075		return (0);
3076	}
3077
3078	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3079
3080	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3081		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3082		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3083
3084		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3085		if (toep->tp_l2t)
3086			l2t_release(L2DATA(cdev), toep->tp_l2t);
3087
3088		/*
3089		 *  Unhook
3090		 */
3091		toep->tp_tp->t_toe = NULL;
3092		toep->tp_tp->t_flags &= ~TF_TOE;
3093		toep->tp_tp = NULL;
3094		/*
3095		 * XXX need to call syncache_chkrst - but we don't
3096		 * have a way of doing that yet
3097		 */
3098		toepcb_release(toep);
3099		log(LOG_ERR, "abort for unestablished connection :-(\n");
3100		return (0);
3101	}
3102	if (toep->tp_tp == NULL) {
3103		log(LOG_NOTICE, "disconnected toepcb\n");
3104		/* should be freed momentarily */
3105		return (0);
3106	}
3107
3108
3109	toepcb_hold(toep);
3110	process_abort_req(toep, m, toep->tp_toedev);
3111	toepcb_release(toep);
3112	return (0);
3113}
3114#ifdef notyet
3115static void
3116pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3117{
3118	struct toedev *tdev = TOE_DEV(parent);
3119
3120	do_abort_syn_rcv(child, parent);
3121	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3122		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3123
3124		rpl->opt0h = htonl(F_TCAM_BYPASS);
3125		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3126		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3127	} else
3128		m_free(m);
3129}
3130#endif
3131static void
3132handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3133{
3134	CXGB_UNIMPLEMENTED();
3135
3136#ifdef notyet
3137	struct t3cdev *cdev;
3138	struct socket *parent;
3139	struct socket *oreq;
3140	struct t3c_tid_entry *t3c_stid;
3141	struct tid_info *t;
3142	struct tcpcb *otp, *tp = so_sototcpcb(so);
3143	struct toepcb *toep = tp->t_toe;
3144
3145	/*
3146	 * If the connection is being aborted due to the parent listening
3147	 * socket going away there's nothing to do, the ABORT_REQ will close
3148	 * the connection.
3149	 */
3150	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3151		m_free(m);
3152		return;
3153	}
3154
3155	oreq = so->so_incomp;
3156	otp = so_sototcpcb(oreq);
3157
3158	cdev = T3C_DEV(so);
3159	t = &(T3C_DATA(cdev))->tid_maps;
3160	t3c_stid = lookup_stid(t, otp->ts_recent);
3161	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3162
3163	so_lock(parent);
3164	pass_open_abort(so, parent, m);
3165	so_unlock(parent);
3166#endif
3167}
3168
3169/*
3170 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3171 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3172 * connection.
3173 */
3174static void
3175pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3176{
3177
3178#ifdef notyet
3179	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3180	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3181#endif
3182	handle_pass_open_arp_failure(m_get_socket(m), m);
3183}
3184
3185/*
3186 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3187 */
3188static void
3189mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3190{
3191	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3192	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3193	unsigned int tid = GET_TID(req);
3194
3195	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3196	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3197	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3198	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3199	rpl->opt0h = htonl(F_TCAM_BYPASS);
3200	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3201	rpl->opt2 = 0;
3202	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3203}
3204
3205/*
3206 * Send a deferred reject to an accept request.
3207 */
3208static void
3209reject_pass_request(struct toedev *tdev, struct mbuf *m)
3210{
3211	struct mbuf *reply_mbuf;
3212
3213	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3214	mk_pass_accept_rpl(reply_mbuf, m);
3215	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3216	m_free(m);
3217}
3218
3219static void
3220handle_syncache_event(int event, void *arg)
3221{
3222	struct toepcb *toep = arg;
3223
3224	switch (event) {
3225	case TOE_SC_ENTRY_PRESENT:
3226		/*
3227		 * entry already exists - free toepcb
3228		 * and l2t
3229		 */
3230		printf("syncache entry present\n");
3231		toepcb_release(toep);
3232		break;
3233	case TOE_SC_DROP:
3234		/*
3235		 * The syncache has given up on this entry
3236		 * either it timed out, or it was evicted
3237		 * we need to explicitly release the tid
3238		 */
3239		printf("syncache entry dropped\n");
3240		toepcb_release(toep);
3241		break;
3242	default:
3243		log(LOG_ERR, "unknown syncache event %d\n", event);
3244		break;
3245	}
3246}
3247
3248static void
3249syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3250{
3251	struct in_conninfo inc;
3252	struct tcpopt to;
3253	struct tcphdr th;
3254	struct inpcb *inp;
3255	int mss, wsf, sack, ts;
3256	uint32_t rcv_isn = ntohl(req->rcv_isn);
3257
3258	bzero(&to, sizeof(struct tcpopt));
3259	inp = so_sotoinpcb(lso);
3260
3261	/*
3262	 * Fill out information for entering us into the syncache
3263	 */
3264	bzero(&inc, sizeof(inc));
3265	inc.inc_fport = th.th_sport = req->peer_port;
3266	inc.inc_lport = th.th_dport = req->local_port;
3267	th.th_seq = req->rcv_isn;
3268	th.th_flags = TH_SYN;
3269
3270	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3271
3272
3273	inc.inc_isipv6 = 0;
3274	inc.inc_len = 0;
3275	inc.inc_faddr.s_addr = req->peer_ip;
3276	inc.inc_laddr.s_addr = req->local_ip;
3277
3278	DPRINTF("syncache add of %d:%d %d:%d\n",
3279	    ntohl(req->local_ip), ntohs(req->local_port),
3280	    ntohl(req->peer_ip), ntohs(req->peer_port));
3281
3282	mss = req->tcp_options.mss;
3283	wsf = req->tcp_options.wsf;
3284	ts = req->tcp_options.tstamp;
3285	sack = req->tcp_options.sack;
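	/*
	 * Re-encode the HW-parsed SYN options as a struct tcpopt so the
	 * generic syncache code sees the same information it would have
	 * extracted from a software-processed SYN.
	 */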
3286	to.to_mss = mss;
3287	to.to_wscale = wsf;
3288	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3289	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3290}
3291
3292
3293/*
3294 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3295 * lock held.  Note that the sock here is a listening socket that is not owned
3296 * by the TOE.
3297 */
3298static void
3299process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3300    struct listen_ctx *lctx)
3301{
3302	int rt_flags;
3303	struct l2t_entry *e;
3304	struct iff_mac tim;
3305	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3306	struct cpl_pass_accept_rpl *rpl;
3307	struct cpl_pass_accept_req *req = cplhdr(m);
3308	unsigned int tid = GET_TID(req);
3309	struct tom_data *d = TOM_DATA(tdev);
3310	struct t3cdev *cdev = d->cdev;
3311	struct tcpcb *tp = so_sototcpcb(so);
3312	struct toepcb *newtoep = NULL;
3313	struct rtentry *dst;
3314	struct sockaddr_in nam;
3315	struct t3c_data *td = T3C_DATA(cdev);
3316
3317	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3318	if (__predict_false(reply_mbuf == NULL)) {
3319		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3320			t3_defer_reply(m, tdev, reject_pass_request);
3321		else {
3322			cxgb_queue_tid_release(cdev, tid);
3323			m_free(m);
3324		}
3325		DPRINTF("failed to get reply_mbuf\n");
3326
3327		goto out;
3328	}
3329
3330	if (tp->t_state != TCPS_LISTEN) {
3331		DPRINTF("socket not in listen state\n");
3332
3333		goto reject;
3334	}
3335
3336	tim.mac_addr = req->dst_mac;
3337	tim.vlan_tag = ntohs(req->vlan_tag);
3338	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3339		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3340		goto reject;
3341	}
3342
3343#ifdef notyet
3344	/*
3345	 * XXX do route lookup to confirm that we're still listening on this
3346	 * address
3347	 */
3348	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3349			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3350		goto reject;
3351	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3352		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3353	dst_release(skb->dst);	// done with the input route, release it
3354	skb->dst = NULL;
3355
3356	if ((rt_flags & RTF_LOCAL) == 0)
3357		goto reject;
3358#endif
3359	/*
3360	 * XXX
3361	 */
3362	rt_flags = RTF_LOCAL;
3363	if ((rt_flags & RTF_LOCAL) == 0)
3364		goto reject;
3365
3366	/*
3367	 * Calculate values and add to syncache
3368	 */
3369
3370	newtoep = toepcb_alloc();
3371	if (newtoep == NULL)
3372		goto reject;
3373
3374	bzero(&nam, sizeof(struct sockaddr_in));
3375
3376	nam.sin_len = sizeof(struct sockaddr_in);
3377	nam.sin_family = AF_INET;
3378	nam.sin_addr.s_addr = req->peer_ip;
3379	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3380
3381	if (dst == NULL) {
3382		printf("failed to find route\n");
3383		goto reject;
3384	}
3385	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3386	    (struct sockaddr *)&nam);
3387	if (e == NULL) {
3388		DPRINTF("failed to get l2t\n");
		goto reject;	/* e is dereferenced below when building the reply */
3389	}
3390	/*
3391	 * Point to our listen socket until accept
3392	 */
3393	newtoep->tp_tp = tp;
3394	newtoep->tp_flags = TP_SYN_RCVD;
3395	newtoep->tp_tid = tid;
3396	newtoep->tp_toedev = tdev;
3397	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3398
3399	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3400	so_lock(so);
3401	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3402	so_unlock(so);
3403
3404	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3405		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
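	/*
	 * DDP is only selected when the tunable allows it, the socket has
	 * not opted out via SO_NO_DDP, and the receive window is at least
	 * MIN_DDP_RCV_WIN, presumably so that posting HW buffers pays off.
	 */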
3406
3407	if (newtoep->tp_ulp_mode) {
3408		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3409
3410		if (ddp_mbuf == NULL)
3411			newtoep->tp_ulp_mode = 0;
3412	}
3413
3414	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3415	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3416	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3417	/*
3418	 * XXX workaround for lack of syncache drop
3419	 */
3420	toepcb_hold(newtoep);
3421	syncache_add_accept_req(req, so, newtoep);
3422
3423	rpl = cplhdr(reply_mbuf);
3424	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3425	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3426	rpl->wr.wr_lo = 0;
3427	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3428	rpl->opt2 = htonl(calc_opt2(so, tdev));
3429	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3430	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3431
3432	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3433	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3434	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3435				  CPL_PASS_OPEN_ACCEPT);
3436
3437	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3438
3439	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3440
3441	l2t_send(cdev, reply_mbuf, e);
3442	m_free(m);
3443	if (newtoep->tp_ulp_mode) {
3444		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3445				V_TF_DDP_OFF(1) |
3446				TP_DDP_TIMER_WORKAROUND_MASK,
3447				V_TF_DDP_OFF(1) |
3448		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3449	} else
3450		DPRINTF("no DDP\n");
3451
3452	return;
3453reject:
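	/*
	 * XXX the early failure paths jump here before newtoep is
	 * allocated; it is then still NULL (see its initialization above),
	 * and the tid-release path has to tolerate that.
	 */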
3454	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3455		mk_pass_accept_rpl(reply_mbuf, m);
3456	else
3457		mk_tid_release(reply_mbuf, newtoep, tid);
3458	cxgb_ofld_send(cdev, reply_mbuf);
3459	m_free(m);
3460out:
3461#if 0
3462	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3463#else
3464	return;
3465#endif
3466}
3467
3468/*
3469 * Handle a CPL_PASS_ACCEPT_REQ message.
3470 */
3471static int
3472do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3473{
3474	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3475	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3476	struct tom_data *d = listen_ctx->tom_data;
3477
3478#if VALIDATE_TID
3479	struct cpl_pass_accept_req *req = cplhdr(m);
3480	unsigned int tid = GET_TID(req);
3481	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3482
3483	if (unlikely(!lsk)) {
3484		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3485		       cdev->name,
3486		       (unsigned long)((union listen_entry *)ctx -
3487					t->stid_tab));
3488		return CPL_RET_BUF_DONE;
3489	}
3490	if (unlikely(tid >= t->ntids)) {
3491		printk(KERN_ERR "%s: passive open TID %u too large\n",
3492		       cdev->name, tid);
3493		return CPL_RET_BUF_DONE;
3494	}
3495	/*
3496	 * For T3A the current user of the TID may have closed but its last
3497	 * message(s) may have been backlogged so the TID appears to be still
3498	 * in use.  Just take the TID away, the connection can close at its
3499	 * own leisure.  For T3B this situation is a bug.
3500	 */
3501	if (!valid_new_tid(t, tid) &&
3502	    cdev->type != T3A) {
3503		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3504		       cdev->name, tid);
3505		return CPL_RET_BUF_DONE;
3506	}
3507#endif
3508
3509	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3510	return (0);
3511}
3512
3513/*
3514 * Called when a connection is established to translate the TCP options
3515 * reported by HW to FreeBSD's native format.
3516 */
3517static void
3518assign_rxopt(struct socket *so, unsigned int opt)
3519{
3520	struct tcpcb *tp = so_sototcpcb(so);
3521	struct toepcb *toep = tp->t_toe;
3522	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3523
3524	inp_lock_assert(tp->t_inpcb);
3525
3526	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
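	/* The 40 bytes subtracted are the fixed IPv4 + TCP header lengths. */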
3527	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3528	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3529	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3530	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3531	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3532		tp->rcv_scale = tp->request_r_scale;
3533}
3534
3535/*
3536 * Completes some final bits of initialization for just established connections
3537 * and changes their state to TCP_ESTABLISHED.
3538 *
3539 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3540 */
3541static void
3542make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3543{
3544	struct tcpcb *tp = so_sototcpcb(so);
3545	struct toepcb *toep = tp->t_toe;
3546
3547	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3548	assign_rxopt(so, opt);
3549
3550	/*
3551	 * XXX: the t3_ctloutput hookup below is still pending.
3552	 */
3554#ifdef notyet
3555	so->so_proto->pr_ctloutput = t3_ctloutput;
3556#endif
3557
3558#if 0
3559	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3560#endif
3561	/*
3562	 * XXX not clear what rcv_wup maps to
3563	 */
3564	/*
3565	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3566	 * pass through opt0.
3567	 */
3568	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3569		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3570
3571	dump_toepcb(toep);
3572
3573#ifdef notyet
3574/*
3575 * no clean interface for marking ARP up to date
3576 */
3577	dst_confirm(sk->sk_dst_cache);
3578#endif
3579	tp->t_starttime = ticks;
3580	tp->t_state = TCPS_ESTABLISHED;
3581	soisconnected(so);
3582}
3583
3584static int
3585syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3586{
3587
3588	struct in_conninfo inc;
3589	struct tcpopt to;
3590	struct tcphdr th;
3591	int mss, wsf, sack, ts;
3592	struct mbuf *m = NULL;
3593	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3594	unsigned int opt;
3595
3596#ifdef MAC
3597#error	"no MAC support"
3598#endif
3599
3600	opt = ntohs(req->tcp_opt);
3601
3602	bzero(&to, sizeof(struct tcpopt));
3603
3604	/*
3605	 * Fill out information for entering us into the syncache
3606	 */
3607	bzero(&inc, sizeof(inc));
3608	inc.inc_fport = th.th_sport = req->peer_port;
3609	inc.inc_lport = th.th_dport = req->local_port;
3610	th.th_seq = req->rcv_isn;
3611	th.th_flags = TH_ACK;
3612
3613	inc.inc_isipv6 = 0;
3614	inc.inc_len = 0;
3615	inc.inc_faddr.s_addr = req->peer_ip;
3616	inc.inc_laddr.s_addr = req->local_ip;
3617
3618	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3619	wsf  = G_TCPOPT_WSCALE_OK(opt);
3620	ts   = G_TCPOPT_TSTAMP(opt);
3621	sack = G_TCPOPT_SACK(opt);
3622
3623	to.to_mss = mss;
3624	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3625	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3626
3627	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3628	    ntohl(req->local_ip), ntohs(req->local_port),
3629	    ntohl(req->peer_ip), ntohs(req->peer_port),
3630	    mss, wsf, ts, sack);
3631	return (tcp_offload_syncache_expand(&inc, &to, &th, so, m));
3632}
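
/*
 * On success tcp_offload_syncache_expand() above hands back the fully
 * created socket for this connection, much as syncache_expand() does for
 * the final ACK of a software handshake.
 */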
3633
3634
3635/*
3636 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3637 * if we are in TCP_SYN_RECV due to crossed SYNs
3638 */
3639static int
3640do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3641{
3642	struct cpl_pass_establish *req = cplhdr(m);
3643	struct toepcb *toep = (struct toepcb *)ctx;
3644	struct tcpcb *tp = toep->tp_tp;
3645	struct socket *so, *lso;
3646	struct t3c_data *td = T3C_DATA(cdev);
3647	struct sockbuf *snd, *rcv;
3648
3649	struct toedev *tdev;
3650
3651	/* Complete socket initialization now that we have the SND_ISN. */
3652	tdev = toep->tp_toedev;
3655
3656	inp_wlock(tp->t_inpcb);
3657
3658	/*
3659	 *
3660	 * XXX need to add reference while we're manipulating
3661	 */
3662	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3663
3664	inp_wunlock(tp->t_inpcb);
3665
3666	so_lock(so);
3667	LIST_REMOVE(toep, synq_entry);
3668	so_unlock(so);
3669
3670	if (!syncache_expand_establish_req(req, &so, toep)) {
3671		/*
3672		 * No entry
3673		 */
3674		CXGB_UNIMPLEMENTED();
3675	}
3676	if (so == NULL) {
3677		/*
3678		 * Couldn't create the socket
3679		 */
3680		CXGB_UNIMPLEMENTED();
3681	}
3682
3683	tp = so_sototcpcb(so);
3684	inp_wlock(tp->t_inpcb);
3685
3686	snd = so_sockbuf_snd(so);
3687	rcv = so_sockbuf_rcv(so);
3688
3689	snd->sb_flags |= SB_NOCOALESCE;
3690	rcv->sb_flags |= SB_NOCOALESCE;
3691
3692	toep->tp_tp = tp;
3693	toep->tp_flags = 0;
3694	tp->t_toe = toep;
3695	reset_wr_list(toep);
3696	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3697	tp->rcv_nxt = toep->tp_copied_seq;
3698	install_offload_ops(so);
3699
3700	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3701	toep->tp_wr_unacked = 0;
3702	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3703	toep->tp_qset_idx = 0;
3704	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3705
3706	/*
3707	 * XXX Cancel any keep alive timer
3708	 */
3709
3710	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3711
3712	/*
3713	 * XXX workaround for lack of syncache drop
3714	 */
3715	toepcb_release(toep);
3716	inp_wunlock(tp->t_inpcb);
3717
3718	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3719	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3720#ifdef notyet
3721	/*
3722	 * XXX not sure how these checks map to us
3723	 */
3724	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3725		sk->sk_state_change(sk);
3726		sk_wake_async(so, 0, POLL_OUT);
3727	}
3728	/*
3729	 * The state for the new connection is now up to date.
3730	 * Next check if we should add the connection to the parent's
3731	 * accept queue.  When the parent closes it resets connections
3732	 * on its SYN queue, so check if we are being reset.  If so we
3733	 * don't need to do anything more, the coming ABORT_RPL will
3734	 * destroy this socket.  Otherwise move the connection to the
3735	 * accept queue.
3736	 *
3737	 * Note that we reset the synq before closing the server so if
3738	 * we are not being reset the stid is still open.
3739	 */
3740	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3741		__kfree_skb(skb);
3742		goto unlock;
3743	}
3744#endif
3745	m_free(m);
3746
3747	return (0);
3748}
3749
3750/*
3751 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3752 * and send them to the TOE.
3753 */
3754static void
3755fixup_and_send_ofo(struct toepcb *toep)
3756{
3757	struct mbuf *m;
3758	struct toedev *tdev = toep->tp_toedev;
3759	struct tcpcb *tp = toep->tp_tp;
3760	unsigned int tid = toep->tp_tid;
3761
3762	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3763
3764	inp_lock_assert(tp->t_inpcb);
3765	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3766		/*
3767		 * A variety of messages can be waiting but the fields we'll
3768		 * be touching are common to all so any message type will do.
3769		 */
3770		struct cpl_close_con_req *p = cplhdr(m);
3771
3772		p->wr.wr_lo = htonl(V_WR_TID(tid));
3773		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3774		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3775	}
3776}
3777
3778/*
3779 * Updates socket state from an active establish CPL message.  Runs with the
3780 * socket lock held.
3781 */
3782static void
3783socket_act_establish(struct socket *so, struct mbuf *m)
3784{
3785	INIT_VNET_INET(so->so_vnet);
3786	struct cpl_act_establish *req = cplhdr(m);
3787	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3788	struct tcpcb *tp = so_sototcpcb(so);
3789	struct toepcb *toep = tp->t_toe;
3790
3791	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3792		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3793		    toep->tp_tid, tp->t_state);
3794
3795	tp->ts_recent_age = ticks;
3796	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3797	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3798
3799	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3800
3801	/*
3802	 * Now that we finally have a TID send any CPL messages that we had to
3803	 * defer for lack of a TID.
3804	 */
3805	if (mbufq_len(&toep->out_of_order_queue))
3806		fixup_and_send_ofo(toep);
3807
3808	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3809		/*
3810		 * XXX does this even make sense?
3811		 */
3812		so_sorwakeup(so);
3813	}
3814	m_free(m);
3815#ifdef notyet
3816/*
3817 * XXX assume no write requests permitted while socket connection is
3818 * incomplete
3819 */
3820	/*
3821	 * Currently the send queue must be empty at this point because the
3822	 * socket layer does not send anything before a connection is
3823	 * established.  To be future proof though we handle the possibility
3824	 * that there are pending buffers to send (either TX_DATA or
3825	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3826	 * buffers according to the just learned write_seq, and then we send
3827	 * them on their way.
3828	 */
3829	fixup_pending_writeq_buffers(sk);
3830	if (t3_push_frames(so, 1))
3831		sk->sk_write_space(sk);
3832#endif
3833
3834	toep->tp_state = tp->t_state;
3835	V_tcpstat.tcps_connects++;
3836
3837}
3838
3839/*
3840 * Process a CPL_ACT_ESTABLISH message.
3841 */
3842static int
3843do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3844{
3845	struct cpl_act_establish *req = cplhdr(m);
3846	unsigned int tid = GET_TID(req);
3847	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3848	struct toepcb *toep = (struct toepcb *)ctx;
3849	struct tcpcb *tp = toep->tp_tp;
3850	struct socket *so;
3851	struct toedev *tdev;
3852	struct tom_data *d;
3853
3854	if (tp == NULL) {
3855		free_atid(cdev, atid);
3856		return (0);
3857	}
3858	inp_wlock(tp->t_inpcb);
3859
3860	/*
3861	 * XXX
3862	 */
3863	so = inp_inpcbtosocket(tp->t_inpcb);
3864	tdev = toep->tp_toedev; /* blow up here if link was down */
3865	d = TOM_DATA(tdev);
3866
3867	/*
3868	 * It's OK if the TID is currently in use, the owning socket may have
3869	 * backlogged its last CPL message(s).  Just take it away.
3870	 */
3871	toep->tp_tid = tid;
3872	toep->tp_tp = tp;
3873	so_insert_tid(d, toep, tid);
3874	free_atid(cdev, atid);
3875	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3876
3877	socket_act_establish(so, m);
3878	inp_wunlock(tp->t_inpcb);
3879	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3880	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3881
3882	return (0);
3883}
3884
3885/*
3886 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3887 * next batch of work requests from the write queue.
3888 */
3889static void
3890wr_ack(struct toepcb *toep, struct mbuf *m)
3891{
3892	struct tcpcb *tp = toep->tp_tp;
3893	struct cpl_wr_ack *hdr = cplhdr(m);
3894	struct socket *so;
3895	unsigned int credits = ntohs(hdr->credits);
3896	u32 snd_una = ntohl(hdr->snd_una);
3897	int bytes = 0;
3898	struct sockbuf *snd;
3899
3900	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3901
3902	inp_wlock(tp->t_inpcb);
3903	so = inp_inpcbtosocket(tp->t_inpcb);
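	/*
	 * Returned credits shrink the number of WRs in flight.  tp_wr_unacked
	 * (credits sent since we last requested a completion) can never
	 * exceed what is still outstanding, so clamp it.
	 */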
3904	toep->tp_wr_avail += credits;
3905	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3906		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3907
3908	while (credits) {
3909		struct mbuf *p = peek_wr(toep);
3910
3911		if (__predict_false(!p)) {
3912			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3913			    "nothing pending, state %u wr_avail=%u\n",
3914			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3915			break;
3916		}
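		/*
		 * m_pkthdr.csum_data on a queued WR holds the number of WR
		 * credits that request consumes.
		 */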
		CTR2(KTR_TOM, "wr_ack: p->credits=%d p->bytes=%d",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3920		KASSERT(p->m_pkthdr.csum_data != 0,
3921		    ("empty request still on list"));
3922
3923		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3924
3925#if DEBUG_WR > 1
3926			struct tx_data_wr *w = cplhdr(p);
3927			log(LOG_ERR,
3928			       "TID %u got %u WR credits, need %u, len %u, "
3929			       "main body %u, frags %u, seq # %u, ACK una %u,"
3930			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3931			       toep->tp_tid, credits, p->csum, p->len,
3932			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3933			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3934			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3935#endif
3936			p->m_pkthdr.csum_data -= credits;
3937			break;
3938		} else {
3939			dequeue_wr(toep);
3940			credits -= p->m_pkthdr.csum_data;
3941			bytes += p->m_pkthdr.len;
3942			CTR3(KTR_TOM,
3943			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3944			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3945
3946			m_free(p);
3947		}
3948	}
3949
3950#if DEBUG_WR
3951	check_wr_invariants(tp);
3952#endif
3953
3954	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3955#if VALIDATE_SEQ
3956		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3957
		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", d->tdev.name, snd_una,
		    toep->tp_tid, tp->snd_una);
3961#endif
3962		goto out_free;
3963	}
3964
3965	if (tp->snd_una != snd_una) {
3966		tp->snd_una = snd_una;
3967		tp->ts_recent_age = ticks;
3968#ifdef notyet
3969		/*
3970		 * Keep ARP entry "minty fresh"
3971		 */
3972		dst_confirm(sk->sk_dst_cache);
3973#endif
3974		if (tp->snd_una == tp->snd_nxt)
3975			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3976	}
3977
	snd = so_sockbuf_snd(so);
	if (bytes) {
		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
		sockbuf_lock(snd);
3983		sbdrop_locked(snd, bytes);
3984		so_sowwakeup_locked(so);
3985	}
3986
3987	if (snd->sb_sndptroff < snd->sb_cc)
3988		t3_push_frames(so, 0);
3989
3990out_free:
3991	inp_wunlock(tp->t_inpcb);
3992	m_free(m);
3993}
3994
3995/*
3996 * Handler for TX_DATA_ACK CPL messages.
3997 */
3998static int
3999do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
4000{
4001	struct toepcb *toep = (struct toepcb *)ctx;
4002
4003	VALIDATE_SOCK(so);
4004
4005	wr_ack(toep, m);
	return (0);
4007}
4008
4009/*
4010 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4011 */
4012static int
4013do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4014{
4015	m_freem(m);
	return (0);
4017}
4018
4019/*
4020 * Reset a connection that is on a listener's SYN queue or accept queue,
4021 * i.e., one that has not had a struct socket associated with it.
4022 * Must be called from process context.
4023 *
4024 * Modeled after code in inet_csk_listen_stop().
4025 */
4026static void
4027t3_reset_listen_child(struct socket *child)
4028{
4029	struct tcpcb *tp = so_sototcpcb(child);
4030
4031	t3_send_reset(tp->t_toe);
4032}
4033
4035static void
4036t3_child_disconnect(struct socket *so, void *arg)
4037{
4038	struct tcpcb *tp = so_sototcpcb(so);
4039
4040	if (tp->t_flags & TF_TOE) {
4041		inp_wlock(tp->t_inpcb);
4042		t3_reset_listen_child(so);
4043		inp_wunlock(tp->t_inpcb);
4044	}
4045}
4046
4047/*
4048 * Disconnect offloaded established but not yet accepted connections sitting
4049 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4050 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4051 */
4052void
4053t3_disconnect_acceptq(struct socket *listen_so)
4054{
4055
4056	so_lock(listen_so);
4057	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4058	so_unlock(listen_so);
4059}
4060
4061/*
 * Reset offloaded connections sitting on a server's syn queue.  As above,
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */
void
4067t3_reset_synq(struct listen_ctx *lctx)
4068{
4069	struct toepcb *toep;
4070
4071	so_lock(lctx->lso);
4072	while (!LIST_EMPTY(&lctx->synq_head)) {
4073		toep = LIST_FIRST(&lctx->synq_head);
4074		LIST_REMOVE(toep, synq_entry);
4075		toep->tp_tp = NULL;
4076		t3_send_reset(toep);
4077		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4078		toepcb_release(toep);
4079	}
4080	so_unlock(lctx->lso);
4081}
4082
4084int
4085t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4086		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4087		   unsigned int pg_off, unsigned int color)
4088{
4089	unsigned int i, j, pidx;
4090	struct pagepod *p;
4091	struct mbuf *m;
4092	struct ulp_mem_io *req;
4093	unsigned int tid = toep->tp_tid;
4094	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4095	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4096
4097	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4098	    gl, nppods, tag, maxoff, pg_off, color);
4099
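	/*
	 * A worked example of the addressing below, assuming PPOD_SIZE is
	 * the 64-byte size of struct pagepod: tag 3 places its first pod at
	 * ddp_llimit + 3 * 64, and pod i publishes the physical addresses of
	 * gl->dgl_pages[4*i .. 4*i + 4].  The fifth address duplicates the
	 * first page of pod i + 1, presumably so the hardware can cross a
	 * pod boundary without first fetching the next pod.
	 */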
4100	for (i = 0; i < nppods; ++i) {
4101		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4102		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4103		req = mtod(m, struct ulp_mem_io *);
4104		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4105		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4106		req->wr.wr_lo = 0;
4107		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4108					   V_ULPTX_CMD(ULP_MEM_WRITE));
4109		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4110				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4111
4112		p = (struct pagepod *)(req + 1);
4113		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4114			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4115			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4116						  V_PPOD_COLOR(color));
4117			p->pp_max_offset = htonl(maxoff);
4118			p->pp_page_offset = htonl(pg_off);
4119			p->pp_rsvd = 0;
4120			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4121				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4122				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4123		} else
4124			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4125		send_or_defer(toep, m, 0);
4126		ppod_addr += PPOD_SIZE;
4127	}
4128	return (0);
4129}
4130
4131/*
4132 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4133 */
4134static inline void
4135mk_cpl_barrier_ulp(struct cpl_barrier *b)
4136{
4137	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4138
4139	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4140	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4141	b->opcode = CPL_BARRIER;
4142}
4143
4144/*
4145 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4146 */
4147static inline void
4148mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4149{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4154	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4155	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4156	req->cpuno = htons(cpuno);
4157}
4158
4159/*
4160 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4161 */
4162static inline void
4163mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4164                     unsigned int word, uint64_t mask, uint64_t val)
4165{
4166	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4167
	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4169	    tid, word, mask, val);
4170
4171	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4172	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4173	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4174	req->reply = V_NO_REPLY(1);
4175	req->cpu_idx = 0;
4176	req->word = htons(word);
4177	req->mask = htobe64(mask);
4178	req->val = htobe64(val);
4179}
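
/*
 * The mask/val pair follows the usual TCB-update convention: mask selects
 * the bits of TCB word "word" to change and val supplies their new
 * contents.  For example (a sketch mirroring the DDP callers below), to
 * mark HW buffer 0 valid while leaving the rest of W_TCB_RX_DDP_FLAGS
 * intact:
 *
 *	mk_set_tcb_field_ulp(req, tid, W_TCB_RX_DDP_FLAGS,
 *	    V_TF_DDP_BUF0_VALID(1), V_TF_DDP_BUF0_VALID(1));
 */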
4180
4181/*
4182 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
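 * Callers typically pass the bytes consumed since the last window update
 * (tp_copied_seq - tp_rcv_wup) as the credit count; see the modulate case
 * in t3_setup_ddpbufs() below.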
4183 */
4184static void
4185mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4186    unsigned int tid, unsigned int credits)
4187{
4188	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4189
4190	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4191	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4192	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4193	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4194	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4195				 V_RX_CREDITS(credits));
4196}
4197
4198void
4199t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4200{
4201	unsigned int wrlen;
4202	struct mbuf *m;
4203	struct work_request_hdr *wr;
4204	struct cpl_barrier *lock;
4205	struct cpl_set_tcb_field *req;
4206	struct cpl_get_tcb *getreq;
4207	struct ddp_state *p = &toep->tp_ddp_state;
4208
4209#if 0
4210	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4211#endif
4212	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4213		sizeof(*getreq);
4214	m = m_gethdr_nofail(wrlen);
4215	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4216	wr = mtod(m, struct work_request_hdr *);
4217	bzero(wr, wrlen);
4218
4219	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4220	m->m_pkthdr.len = m->m_len = wrlen;
4221
4222	lock = (struct cpl_barrier *)(wr + 1);
4223	mk_cpl_barrier_ulp(lock);
4224
4225	req = (struct cpl_set_tcb_field *)(lock + 1);
4226
4227	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4228
	/*
	 * Hmmm, not sure if this is actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already.  However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
4234	if (bufidx == 0)
4235		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4236				     V_TF_DDP_ACTIVE_BUF(1) |
4237				     V_TF_DDP_BUF0_VALID(1),
4238				     V_TF_DDP_ACTIVE_BUF(1));
4239	else
4240		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4241				     V_TF_DDP_ACTIVE_BUF(1) |
4242				     V_TF_DDP_BUF1_VALID(1), 0);
4243
4244	getreq = (struct cpl_get_tcb *)(req + 1);
4245	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4246
4247	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4248
	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4251	p->get_tcb_count++;
4252
4253#ifdef T3_TRACE
4254	T3_TRACE1(TIDTB(so),
4255		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4256#endif
4257	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4258}
4259
4260/**
4261 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the toepcb associated with the buffers
4263 * @bufidx: index of HW DDP buffer (0 or 1)
4264 * @tag0: new tag for HW buffer 0
4265 * @tag1: new tag for HW buffer 1
4266 * @len: new length for HW buf @bufidx
4267 *
4268 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4269 * buffer by changing the buffer tag and length and setting the valid and
4270 * active flag accordingly.  The caller must ensure the new buffer is at
4271 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers, this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
4275 */
4276void
4277t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4278	 	       unsigned int tag1, unsigned int len)
4279{
4280	unsigned int wrlen;
4281	struct mbuf *m;
4282	struct work_request_hdr *wr;
4283	struct cpl_get_tcb *getreq;
4284	struct cpl_set_tcb_field *req;
4285	struct ddp_state *p = &toep->tp_ddp_state;
4286
	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4288	    bufidx, tag0, tag1, len);
4289#if 0
4290	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4291#endif
4292	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4293	m = m_gethdr_nofail(wrlen);
4294	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4295	wr = mtod(m, struct work_request_hdr *);
4296	m->m_pkthdr.len = m->m_len = wrlen;
4297	bzero(wr, wrlen);
4298
4300	/* Set the ATOMIC flag to make sure that TP processes the following
4301	 * CPLs in an atomic manner and no wire segments can be interleaved.
4302	 */
4303	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4304	req = (struct cpl_set_tcb_field *)(wr + 1);
4305	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4306			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4307			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4308			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4309			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4310	req++;
4311	if (bufidx == 0) {
4312		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4313			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4314			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4315		req++;
4316		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4317			    V_TF_DDP_PUSH_DISABLE_0(1) |
4318			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4319			    V_TF_DDP_PUSH_DISABLE_0(0) |
4320			    V_TF_DDP_BUF0_VALID(1));
4321	} else {
4322		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4323			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4324			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4325		req++;
4326		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4327			    V_TF_DDP_PUSH_DISABLE_1(1) |
4328			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4329			    V_TF_DDP_PUSH_DISABLE_1(0) |
4330			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4331	}
4332
4333	getreq = (struct cpl_get_tcb *)(req + 1);
4334	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4335
	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4338	p->get_tcb_count++;
4339
4340#ifdef T3_TRACE
4341	T3_TRACE4(TIDTB(sk),
4342		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4343		  "len %d",
4344		  bufidx, tag0, tag1, len);
4345#endif
4346	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4347}
4348
4349/*
4350 * Sends a compound WR containing all the CPL messages needed to program the
4351 * two HW DDP buffers, namely optionally setting up the length and offset of
4352 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4353 */
4354void
4355t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4356		      unsigned int len1, unsigned int offset1,
4357                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4358{
4359	unsigned int wrlen;
4360	struct mbuf *m;
4361	struct work_request_hdr *wr;
4362	struct cpl_set_tcb_field *req;
4363
	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4365	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4366
4367#if 0
4368	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4369#endif
4370	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4371		(len1 ? sizeof(*req) : 0) +
4372		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4373	m = m_gethdr_nofail(wrlen);
4374	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4375	wr = mtod(m, struct work_request_hdr *);
4376	bzero(wr, wrlen);
4377
4378	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4379	m->m_pkthdr.len = m->m_len = wrlen;
4380
4381	req = (struct cpl_set_tcb_field *)(wr + 1);
4382	if (len0) {                  /* program buffer 0 offset and length */
4383		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4384			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4385			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4386			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4387			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4388		req++;
4389	}
4390	if (len1) {                  /* program buffer 1 offset and length */
4391		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4392			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4393			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4394			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4395			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4396		req++;
4397	}
4398
4399	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4400			     ddp_flags);
4401
4402	if (modulate) {
4403		mk_rx_data_ack_ulp(toep,
4404		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4405		    toep->tp_copied_seq - toep->tp_rcv_wup);
4406		toep->tp_rcv_wup = toep->tp_copied_seq;
4407	}
4408
4409#ifdef T3_TRACE
4410	T3_TRACE5(TIDTB(sk),
4411		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4412		  "modulate %d",
4413		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4414		  modulate);
4415#endif
4416
4417	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4418}
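
/*
 * A minimal usage sketch (an illustrative assumption, not a prescribed
 * recipe): make HW buffer 0 of length "len" valid and active and
 * piggyback an RX credit return:
 *
 *	t3_setup_ddpbufs(toep, len, 0, 0, 0,
 *	    V_TF_DDP_BUF0_VALID(1),
 *	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 1);
 *
 * Clearing V_TF_DDP_ACTIVE_BUF in the value while including it in the
 * mask selects buffer 0 as the active buffer, mirroring the
 * mk_set_tcb_field_ulp() calls in t3_cancel_ddpbuf() above.
 */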
4419
4420void
4421t3_init_wr_tab(unsigned int wr_len)
4422{
4423	int i;
4424
4425	if (mbuf_wrs[1])     /* already initialized */
4426		return;
4427
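	/*
	 * A worked example, assuming a hypothetical wr_len of 8 flits: an
	 * mbuf with i = 14 buffers needs sgl_len = (3 * 14) / 2 + (14 & 1)
	 * + 3 = 24 flits (each pair of SGL entries packs into three 8-byte
	 * flits, plus what appears to be 3 flits of header overhead), so
	 * mbuf_wrs[14] = 1 + (24 - 2) / (8 - 1) = 4 work requests.
	 */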
4428	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4429		int sgl_len = (3 * i) / 2 + (i & 1);
4430
4431		sgl_len += 3;
4432		mbuf_wrs[i] = sgl_len <= wr_len ?
		    1 : 1 + (sgl_len - 2) / (wr_len - 1);
4434	}
4435
4436	wrlen = wr_len * 8;
4437}
4438
4439int
4440t3_init_cpl_io(void)
4441{
4442#ifdef notyet
4443	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4444	if (!tcphdr_skb) {
4445		log(LOG_ERR,
4446		       "Chelsio TCP offload: can't allocate sk_buff\n");
4447		return -1;
4448	}
4449	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4450	tcphdr_skb->h.raw = tcphdr_skb->data;
4451	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4452#endif
4453
4454	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4455	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4456	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4457	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4458	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4459	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4460	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4461	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4462	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4463	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4464	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4465	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4466	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4467	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4468	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4469	return (0);
4470}
4471