/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183289 2008-09-23 02:22:24Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_tcp_do_autosndbuf tcp_do_autosndbuf
#define V_tcp_autosndbuf_max tcp_autosndbuf_max
#define V_tcp_do_rfc1323 tcp_do_rfc1323
#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
#define V_tcpstat tcpstat
#endif

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
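
/*
 * For example (reading the table above): the submode appears to encode two
 * digest bits, each of which causes the HW to insert a 4-byte digest, so a
 * packet sent with submode 1 or 2 consumes 4 extra bytes of TCP sequence
 * space and one with submode 3 (both digests) consumes 8.
 */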

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
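
/*
 * Note: wrlen and mbuf_wrs[] are presumably filled in at module
 * initialization time (not in this file); t3_push_frames() below consults
 * mbuf_wrs[n] for the WR-credit cost of an n-segment gather list before
 * committing a send.
 */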

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}
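
/*
 * Messages parked on out_of_order_queue above are presumably replayed to
 * the card once the active open completes and the connection has a real
 * TID; the deferral only covers the SYN_SENT window.
 */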

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Send buffer size is expressed in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}
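
/*
 * Example: a 64KB send buffer high-water mark is encoded in the initial
 * TX_DATA WR as V_TX_SNDBUF(65536 >> 15) == V_TX_SNDBUF(2).
 */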

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}
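
/*
 * The caller is expected to account for the returned credit count itself;
 * t3_cleanup_rbuf() below adds the return value to tp_rcv_wup so that
 * copied_seq - rcv_wup keeps tracking the credits still owed to the HW.
 */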

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

693is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
694{
695	return (toep->tp_ulp_mode ||
696		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
697		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
698}
699

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	/* Check toep before dereferencing it for the tcpcb. */
	if (toep == NULL)
		return;
	tp = toep->tp_tp;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
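
/*
 * Worked example: assuming an MTU table of {576, 1500, 9000} (the real
 * table lives in the t3c_data state), find_best_mtu(d, 4352) returns
 * index 1, the 1500-byte entry, the largest value not exceeding 4352.
 */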

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		/* 40 == sizeof(struct ip) + sizeof(struct tcphdr) */
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
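
/*
 * For example, a 256KB target window yields wscale 3: 262144 halves to
 * 131072, 65536, and finally 32768 before dropping to 65535 or below.
 */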

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}
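
/*
 * Example: V_RCV_BUFSIZ is in 1KB units, so a 256KB receive window encodes
 * as min(262144 >> 10, M_RCV_BUFSIZ) == 256 (assuming M_RCV_BUFSIZ is at
 * least 256); window beyond what option0 can express is granted later via
 * RX_DATA_ACK credits, per the MAX_RCV_WND comment above.
 */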

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by toepcb_alloc */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);
	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}
1804
1805/*
1806 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1807 * by getting the DDP offset from the TCB.
1808 */
1809static void
1810tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1811{
1812	struct ddp_state *q = &toep->tp_ddp_state;
1813	struct ddp_buf_state *bsp;
1814	struct cpl_get_tcb_rpl *hdr;
1815	unsigned int ddp_offset;
1816	struct socket *so;
1817	struct tcpcb *tp;
1818	struct sockbuf *rcv;
1819	int state;
1820
1821	uint64_t t;
1822	__be64 *tcb;
1823
1824	tp = toep->tp_tp;
1825	so = inp_inpcbtosocket(tp->t_inpcb);
1826
1827	inp_lock_assert(tp->t_inpcb);
1828	rcv = so_sockbuf_rcv(so);
1829	sockbuf_lock(rcv);
1830
1831	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1832	 * We really need a cookie in order to dispatch the RPLs.
1833	 */
1834	q->get_tcb_count--;
1835
1836	/* It is a possible that a previous CPL already invalidated UBUF DDP
1837	 * and moved the cur_buf idx and hence no further processing of this
1838	 * skb is required. However, the app might be sleeping on
1839	 * !q->get_tcb_count and we need to wake it up.
1840	 */
1841	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1842		int state = so_state_get(so);
1843
1844		m_freem(m);
1845		if (__predict_true((state & SS_NOFDREF) == 0))
1846			so_sorwakeup_locked(so);
1847		else
1848			sockbuf_unlock(rcv);
1849
1850		return;
1851	}
1852
1853	bsp = &q->buf_state[q->cur_buf];
1854	hdr = cplhdr(m);
1855	tcb = (__be64 *)(hdr + 1);
1856	if (q->cur_buf == 0) {
1857		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1858		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1859	} else {
1860		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1861		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1862	}
1863	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1864	m->m_cur_offset = bsp->cur_offset;
1865	bsp->cur_offset = ddp_offset;
1866	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1867
1868	CTR5(KTR_TOM,
1869	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1870	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1871	KASSERT(ddp_offset >= m->m_cur_offset,
1872	    ("ddp_offset=%u less than cur_offset=%u",
1873		ddp_offset, m->m_cur_offset));
1874
1875#if 0
1876{
1877	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1878
1879	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1880	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1881
1882        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1883        rcv_nxt = t >> S_TCB_RCV_NXT;
1884        rcv_nxt &= M_TCB_RCV_NXT;
1885
1886        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1887        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1888        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1889
1890	T3_TRACE2(TIDTB(sk),
1891		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1892		  ddp_flags, rcv_nxt - rx_hdr_offset);
1893	T3_TRACE4(TB(q),
1894		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1895		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1896	T3_TRACE3(TB(q),
1897		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1898		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1899	T3_TRACE2(TB(q),
1900		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1901		 q->buf_state[0].flags, q->buf_state[1].flags);
1902
1903}
1904#endif
1905	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1906		handle_excess_rx(toep, m);
1907		return;
1908	}
1909
1910#ifdef T3_TRACE
1911	if ((int)m->m_pkthdr.len < 0) {
1912		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1913	}
1914#endif
1915	if (bsp->flags & DDP_BF_NOCOPY) {
1916#ifdef T3_TRACE
1917		T3_TRACE0(TB(q),
1918			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1919
1920		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1921			printk("!cancel_ubuf");
1922			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1923		}
1924#endif
1925		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1926		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1927		q->cur_buf ^= 1;
1928	} else if (bsp->flags & DDP_BF_NOFLIP) {
1929
1930		m->m_ddp_flags = 1;    /* always a kernel buffer */
1931
1932		/* now HW buffer carries a user buffer */
1933		bsp->flags &= ~DDP_BF_NOFLIP;
1934		bsp->flags |= DDP_BF_NOCOPY;
1935
1936		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1937		 * any new data in which case we're done. If in addition the
1938		 * offset is 0, then there wasn't a completion for the kbuf
1939		 * and we need to decrement the posted count.
1940		 */
1941		if (m->m_pkthdr.len == 0) {
1942			if (ddp_offset == 0) {
1943				q->kbuf_posted--;
1944				bsp->flags |= DDP_BF_NODATA;
1945			}
1946			sockbuf_unlock(rcv);
1947			m_free(m);
1948			return;
1949		}
1950	} else {
1951		sockbuf_unlock(rcv);
1952
1953		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1954		 * but it got here way late and nobody cares anymore.
1955		 */
1956		m_free(m);
1957		return;
1958	}
1959
1960	m->m_ddp_gl = (unsigned char *)bsp->gl;
1961	m->m_flags |= M_DDP;
1962	m->m_seq = tp->rcv_nxt;
1963	tp->rcv_nxt += m->m_pkthdr.len;
1964	tp->t_rcvtime = ticks;
1965	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1966		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1967	if (m->m_pkthdr.len == 0) {
1968		q->user_ddp_pending = 0;
1969		m_free(m);
1970	} else
1971		SBAPPEND(rcv, m);
1972
1973	state = so_state_get(so);
1974	if (__predict_true((state & SS_NOFDREF) == 0))
1975		so_sorwakeup_locked(so);
1976	else
1977		sockbuf_unlock(rcv);
1978}
1979
1980/*
1981 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1982 * in that case they are similar to DDP completions.
1983 */
1984static int
1985do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1986{
1987	struct toepcb *toep = (struct toepcb *)ctx;
1988
1989	/* OK if socket doesn't exist */
1990	if (toep == NULL) {
1991		printf("null toep in do_get_tcb_rpl\n");
1992		return (CPL_RET_BUF_DONE);
1993	}
1994
1995	inp_wlock(toep->tp_tp->t_inpcb);
1996	tcb_rpl_as_ddp_complete(toep, m);
1997	inp_wunlock(toep->tp_tp->t_inpcb);
1998
1999	return (0);
2000}
2001
2002static void
2003handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2004{
2005	struct tcpcb *tp = toep->tp_tp;
2006	struct socket *so;
2007	struct ddp_state *q;
2008	struct ddp_buf_state *bsp;
2009	struct cpl_rx_data *hdr = cplhdr(m);
2010	unsigned int rcv_nxt = ntohl(hdr->seq);
2011	struct sockbuf *rcv;
2012
2013	if (tp->rcv_nxt == rcv_nxt)
2014		return;
2015
2016	inp_lock_assert(tp->t_inpcb);
2017	so  = inp_inpcbtosocket(tp->t_inpcb);
2018	rcv = so_sockbuf_rcv(so);
2019	sockbuf_lock(rcv);
2020
2021	q = &toep->tp_ddp_state;
2022	bsp = &q->buf_state[q->cur_buf];
2023	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x did not advance past tp->rcv_nxt=0x%08x",
2024		rcv_nxt, tp->rcv_nxt));
2025	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2026	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2027	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2028	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2029
2030#ifdef T3_TRACE
2031	if ((int)m->m_pkthdr.len < 0) {
2032		t3_ddp_error(so, "handle_ddp_data: neg len");
2033	}
2034#endif
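	/*
	 * Hand the placed bytes up attached to the buffer's gather list:
	 * bit 0 marks the buffer data complete, DDP_BF_PSH forces delivery,
	 * and DDP_BF_NOCOPY is carried over (and cleared below) when a user
	 * buffer was posted.
	 */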
2035	m->m_ddp_gl = (unsigned char *)bsp->gl;
2036	m->m_flags |= M_DDP;
2037	m->m_cur_offset = bsp->cur_offset;
2038	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2039	if (bsp->flags & DDP_BF_NOCOPY)
2040		bsp->flags &= ~DDP_BF_NOCOPY;
2041
2042	m->m_seq = tp->rcv_nxt;
2043	tp->rcv_nxt = rcv_nxt;
2044	bsp->cur_offset += m->m_pkthdr.len;
2045	if (!(bsp->flags & DDP_BF_NOFLIP))
2046		q->cur_buf ^= 1;
2047	/*
2048	 * For now, don't re-enable DDP after a connection fell out of DDP
2049	 * mode.
2050	 */
2051	q->ubuf_ddp_ready = 0;
2052	sockbuf_unlock(rcv);
2053}
2054
2055/*
2056 * Process new data received for a connection.
2057 */
2058static void
2059new_rx_data(struct toepcb *toep, struct mbuf *m)
2060{
2061	struct cpl_rx_data *hdr = cplhdr(m);
2062	struct tcpcb *tp = toep->tp_tp;
2063	struct socket *so;
2064	struct sockbuf *rcv;
2065	int state;
2066	int len = be16toh(hdr->len);
2067
2068	inp_wlock(tp->t_inpcb);
2069
2070	so  = inp_inpcbtosocket(tp->t_inpcb);
2071
2072	if (__predict_false(so_no_receive(so))) {
2073		handle_excess_rx(toep, m);
2074		inp_wunlock(tp->t_inpcb);
2075		TRACE_EXIT;
2076		return;
2077	}
2078
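	/*
	 * With DDP active, account first for any payload the HW placed
	 * directly; handle_ddp_data() is a no-op when the sequence number
	 * in the CPL shows that nothing was placed ahead of rcv_nxt.
	 */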
2079	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2080		handle_ddp_data(toep, m);
2081
2082	m->m_seq = ntohl(hdr->seq);
2083	m->m_ulp_mode = 0;                    /* for iSCSI */
2084
2085#if VALIDATE_SEQ
2086	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2087		log(LOG_ERR,
2088		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2089		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2090		       tp->rcv_nxt);
2091		m_freem(m);
2092		inp_wunlock(tp->t_inpcb);
2093		return;
2094	}
2095#endif
2096	m_adj(m, sizeof(*hdr));
2097
2098#ifdef URGENT_DATA_SUPPORTED
2099	/*
2100	 * We don't handle urgent data yet
2101	 */
2102	if (__predict_false(hdr->urg))
2103		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2104	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2105		     tp->urg_seq - tp->rcv_nxt < skb->len))
2106		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2107							 tp->rcv_nxt];
2108#endif
2109	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2110		toep->tp_delack_mode = hdr->dack_mode;
2111		toep->tp_delack_seq = tp->rcv_nxt;
2112	}
2113	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2114	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2115
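	/*
	 * Trust the payload length reported in the CPL header over the
	 * mbuf's own notion and trim the mbuf if it carries more.
	 */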
2116	if (len < m->m_pkthdr.len)
2117		m->m_pkthdr.len = m->m_len = len;
2118
2119	tp->rcv_nxt += m->m_pkthdr.len;
2120	tp->t_rcvtime = ticks;
2121	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2122	CTR2(KTR_TOM,
2123	    "new_rx_data: seq 0x%x len %u",
2124	    m->m_seq, m->m_pkthdr.len);
2125	inp_wunlock(tp->t_inpcb);
2126	rcv = so_sockbuf_rcv(so);
2127	sockbuf_lock(rcv);
2128#if 0
2129	if (sb_notify(rcv))
2130		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2131#endif
2132	SBAPPEND(rcv, m);
2133
2134#ifdef notyet
2135	/*
2136	 * We appear to be giving too many credits to the card; disable this
2137	 * check for now so we can keep moving.
2138	 */
2139	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2141	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2142		so, rcv->sb_cc, rcv->sb_mbmax));
2143#endif
2144
2145
2146	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2147	    rcv->sb_cc, rcv->sb_mbcnt);
2148
2149	state = so_state_get(so);
2150	if (__predict_true((state & SS_NOFDREF) == 0))
2151		so_sorwakeup_locked(so);
2152	else
2153		sockbuf_unlock(rcv);
2154}
2155
2156/*
2157 * Handler for RX_DATA CPL messages.
2158 */
2159static int
2160do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2161{
2162	struct toepcb *toep = (struct toepcb *)ctx;
2163
2164	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2165
2166	new_rx_data(toep, m);
2167
2168	return (0);
2169}
2170
2171static void
2172new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2173{
2174	struct tcpcb *tp;
2175	struct ddp_state *q;
2176	struct ddp_buf_state *bsp;
2177	struct cpl_rx_data_ddp *hdr;
2178	struct socket *so;
2179	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2180	int nomoredata = 0;
2181	unsigned int delack_mode;
2182	struct sockbuf *rcv;
2183
2184	tp = toep->tp_tp;
2185	inp_wlock(tp->t_inpcb);
2186	so = inp_inpcbtosocket(tp->t_inpcb);
2187
2188	if (__predict_false(so_no_receive(so))) {
2189
2190		handle_excess_rx(toep, m);
2191		inp_wunlock(tp->t_inpcb);
2192		return;
2193	}
2194
2195	q = &toep->tp_ddp_state;
2196	hdr = cplhdr(m);
2197	ddp_report = ntohl(hdr->u.ddp_report);
2198	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2199	bsp = &q->buf_state[buf_idx];
2200
2201	CTR4(KTR_TOM,
2202	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2203	    "hdr seq 0x%x len %u",
2204	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2205	    ntohs(hdr->len));
2206	CTR3(KTR_TOM,
2207	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2208	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2209
2210	ddp_len = ntohs(hdr->len);
2211	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2212
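	/*
	 * The TOE reports the delayed-ACK mode it is using; if it changed,
	 * record the new mode and the sequence number from which it applies.
	 */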
2213	delack_mode = G_DDP_DACK_MODE(ddp_report);
2214	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2215		toep->tp_delack_mode = delack_mode;
2216		toep->tp_delack_seq = tp->rcv_nxt;
2217	}
2218
2219	m->m_seq = tp->rcv_nxt;
2220	tp->rcv_nxt = rcv_nxt;
2221
2222	tp->t_rcvtime = ticks;
2223	/*
2224	 * Store the length in m->m_len.  We are changing the meaning of
2225	 * m->m_len here, we need to be very careful that nothing from now on
2226	 * interprets ->len of this packet the usual way.
2227	 */
2228	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2229	inp_wunlock(tp->t_inpcb);
2230	CTR3(KTR_TOM,
2231	    "new_rx_data_ddp: m_len=%u rcv_nxt 0x%08x rcv_nxt_prev=0x%08x",
2232	    m->m_len, rcv_nxt, m->m_seq);
2233	/*
2234	 * Figure out where the new data was placed in the buffer and store it
2235	 * in m_cur_offset.  This assumes the buffer offset starts at 0; the
2236	 * consumer needs to account for the page pod's pg_offset.
2237	 */
2238	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2239	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2240
2241	rcv = so_sockbuf_rcv(so);
2242	sockbuf_lock(rcv);
2243
2244	m->m_ddp_gl = (unsigned char *)bsp->gl;
2245	m->m_flags |= M_DDP;
2246	bsp->cur_offset = end_offset;
2247	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2248
2249	/*
2250	 * Length is only meaningful for kbuf
2251	 */
2252	if (!(bsp->flags & DDP_BF_NOCOPY))
2253		KASSERT(m->m_len <= bsp->gl->dgl_length,
2254		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2255			m->m_len, bsp->gl->dgl_length));
2256
2257	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2258	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2259        /*
2260	 * Bit 0 of flags stores whether the DDP buffer is completed.
2261	 * Note that other parts of the code depend on this being in bit 0.
2262	 */
2263	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2264		panic("spurious ddp completion");
2265	} else {
2266		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2267		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2268			q->cur_buf ^= 1;                     /* flip buffers */
2269	}
2270
2271	if (bsp->flags & DDP_BF_NOCOPY) {
2272		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2273		bsp->flags &= ~DDP_BF_NOCOPY;
2274	}
2275
2276	if (ddp_report & F_DDP_PSH)
2277		m->m_ddp_flags |= DDP_BF_PSH;
2278	if (nomoredata)
2279		m->m_ddp_flags |= DDP_BF_NODATA;
2280
2281#ifdef notyet
2282	skb_reset_transport_header(skb);
2283	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2284#endif
2285	SBAPPEND(rcv, m);
2286
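	/*
	 * Wake the receiver if the data was pushed (PSH), if a user
	 * (NOCOPY) buffer completed, or if this is plain kernel-buffer
	 * data; otherwise just drop the sockbuf lock.
	 */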
2287	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2288	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2289		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2290		so_sorwakeup_locked(so);
2291	else
2292		sockbuf_unlock(rcv);
2293}
2294
2295#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2296		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2297		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2298		 F_DDP_INVALID_PPOD)
2299
2300/*
2301 * Handler for RX_DATA_DDP CPL messages.
2302 */
2303static int
2304do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2305{
2306	struct toepcb *toep = ctx;
2307	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2308
2309	VALIDATE_SOCK(so);
2310
2311	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2312		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2313		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2314		return (CPL_RET_BUF_DONE);
2315	}
2316#if 0
2317	skb->h.th = tcphdr_skb->h.th;
2318#endif
2319	new_rx_data_ddp(toep, m);
2320	return (0);
2321}
2322
2323static void
2324process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2325{
2326	struct tcpcb *tp = toep->tp_tp;
2327	struct socket *so;
2328	struct ddp_state *q;
2329	struct ddp_buf_state *bsp;
2330	struct cpl_rx_ddp_complete *hdr;
2331	unsigned int ddp_report, buf_idx, when, delack_mode;
2332	int nomoredata = 0;
2333	struct sockbuf *rcv;
2334
2335	inp_wlock(tp->t_inpcb);
2336	so = inp_inpcbtosocket(tp->t_inpcb);
2337
2338	if (__predict_false(so_no_receive(so))) {
2339		struct inpcb *inp = so_sotoinpcb(so);
2340
2341		handle_excess_rx(toep, m);
2342		inp_wunlock(inp);
2343		return;
2344	}
2345	q = &toep->tp_ddp_state;
2346	hdr = cplhdr(m);
2347	ddp_report = ntohl(hdr->ddp_report);
2348	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2349	m->m_pkthdr.csum_data = tp->rcv_nxt;
2350
2351	rcv = so_sockbuf_rcv(so);
2352	sockbuf_lock(rcv);
2353
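	/*
	 * The completion reports the final DDP offset within the buffer;
	 * the newly placed bytes run from the previous cur_offset up to
	 * that offset.
	 */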
2354	bsp = &q->buf_state[buf_idx];
2355	when = bsp->cur_offset;
2356	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2357	tp->rcv_nxt += m->m_len;
2358	tp->t_rcvtime = ticks;
2359
2360	delack_mode = G_DDP_DACK_MODE(ddp_report);
2361	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2362		toep->tp_delack_mode = delack_mode;
2363		toep->tp_delack_seq = tp->rcv_nxt;
2364	}
2365#ifdef notyet
2366	skb_reset_transport_header(skb);
2367	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2368#endif
2369	inp_wunlock(tp->t_inpcb);
2370
2371	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2372	CTR5(KTR_TOM,
2373		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2374		  "ddp_report 0x%x offset %u, len %u",
2375		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2376		   G_DDP_OFFSET(ddp_report), m->m_len);
2377
2378	m->m_cur_offset = bsp->cur_offset;
2379	bsp->cur_offset += m->m_len;
2380
2381	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2382		q->cur_buf ^= 1;                     /* flip buffers */
2383		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2384			nomoredata = 1;
2385	}
2386
2387	CTR4(KTR_TOM,
2388		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2389		  "ddp_report %u offset %u",
2390		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2391		   G_DDP_OFFSET(ddp_report));
2392
2393	m->m_ddp_gl = (unsigned char *)bsp->gl;
2394	m->m_flags |= M_DDP;
2395	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2396	if (bsp->flags & DDP_BF_NOCOPY)
2397		bsp->flags &= ~DDP_BF_NOCOPY;
2398	if (nomoredata)
2399		m->m_ddp_flags |= DDP_BF_NODATA;
2400
2401	SBAPPEND(rcv, m);
2402	if ((so_state_get(so) & SS_NOFDREF) == 0)
2403		so_sorwakeup_locked(so);
2404	else
2405		sockbuf_unlock(rcv);
2406}
2407
2408/*
2409 * Handler for RX_DDP_COMPLETE CPL messages.
2410 */
2411static int
2412do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2413{
2414	struct toepcb *toep = ctx;
2415
2416	VALIDATE_SOCK(so);
2417#if 0
2418	skb->h.th = tcphdr_skb->h.th;
2419#endif
2420	process_ddp_complete(toep, m);
2421	return (0);
2422}
2423
2424/*
2425 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2426 * socket state before calling tcp_time_wait to comply with its expectations.
2427 */
2428static void
2429enter_timewait(struct tcpcb *tp)
2430{
2431	/*
2432	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2433	 * process peer_close because we don't want to carry the peer FIN in
2434	 * the socket's receive queue and if we increment rcv_nxt without
2435	 * having the FIN in the receive queue we'll confuse facilities such
2436	 * as SIOCINQ.
2437	 */
2438	inp_wlock(tp->t_inpcb);
2439	tp->rcv_nxt++;
2440
2441	tp->ts_recent_age = 0;	     /* defeat recycling */
2442	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2443	inp_wunlock(tp->t_inpcb);
2444	tcp_offload_twstart(tp);
2445}
2446
2447/*
2448 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2449 * function deals with the data that may be reported along with the FIN.
2450 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2451 * perform normal FIN-related processing.  In the latter case 1 indicates that
2452 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2453 * skb can be freed.
2454 */
2455static int
2456handle_peer_close_data(struct socket *so, struct mbuf *m)
2457{
2458	struct tcpcb *tp = so_sototcpcb(so);
2459	struct toepcb *toep = tp->t_toe;
2460	struct ddp_state *q;
2461	struct ddp_buf_state *bsp;
2462	struct cpl_peer_close *req = cplhdr(m);
2463	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2464	struct sockbuf *rcv;
2465
2466	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2467		return (0);
2468
2469	CTR0(KTR_TOM, "handle_peer_close_data");
2470	if (__predict_false(so_no_receive(so))) {
2471		handle_excess_rx(toep, m);
2472
2473		/*
2474		 * Although we discard the data we want to process the FIN so
2475		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2476		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2477		 * may be what will close the connection.  We return 1 because
2478		 * handle_excess_rx() already freed the packet.
2479		 */
2480		return (1);
2481	}
2482
2483	inp_lock_assert(tp->t_inpcb);
2484	q = &toep->tp_ddp_state;
2485	rcv = so_sockbuf_rcv(so);
2486	sockbuf_lock(rcv);
2487
2488	bsp = &q->buf_state[q->cur_buf];
2489	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2490	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2491	m->m_ddp_gl = (unsigned char *)bsp->gl;
2492	m->m_flags |= M_DDP;
2493	m->m_cur_offset = bsp->cur_offset;
2494	m->m_ddp_flags =
2495	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2496	m->m_seq = tp->rcv_nxt;
2497	tp->rcv_nxt = rcv_nxt;
2498	bsp->cur_offset += m->m_pkthdr.len;
2499	if (!(bsp->flags & DDP_BF_NOFLIP))
2500		q->cur_buf ^= 1;
2501#ifdef notyet
2502	skb_reset_transport_header(skb);
2503	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2504#endif
2505	tp->t_rcvtime = ticks;
2506	SBAPPEND(rcv, m);
2507	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2508		so_sorwakeup_locked(so);
2509	else
2510		sockbuf_unlock(rcv);
2511
2512	return (1);
2513}
2514
2515/*
2516 * Handle a peer FIN.
2517 */
2518static void
2519do_peer_fin(struct toepcb *toep, struct mbuf *m)
2520{
2521	struct socket *so;
2522	struct tcpcb *tp = toep->tp_tp;
2523	int keep, action;
2524
2525	action = keep = 0;
2526	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2527	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2528		printf("abort_pending set\n");
2529
2530		goto out;
2531	}
2532	inp_wlock(tp->t_inpcb);
2533	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2534	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2535		keep = handle_peer_close_data(so, m);
2536		if (keep < 0) {
2537			inp_wunlock(tp->t_inpcb);
2538			return;
2539		}
2540	}
2541	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2542		CTR1(KTR_TOM,
2543		    "waking up waiters for cantrcvmore on %p ", so);
2544		socantrcvmore(so);
2545
2546		/*
2547		 * If connection is half-synchronized
2548		 * (ie NEEDSYN flag on) then delay ACK,
2549		 * so it may be piggybacked when SYN is sent.
2550		 * Otherwise, since we received a FIN then no
2551		 * more input can be expected, send ACK now.
2552		 */
2553		if (tp->t_flags & TF_NEEDSYN)
2554			tp->t_flags |= TF_DELACK;
2555		else
2556			tp->t_flags |= TF_ACKNOW;
2557		tp->rcv_nxt++;
2558	}
2559
2560	switch (tp->t_state) {
2561	case TCPS_SYN_RECEIVED:
2562		tp->t_starttime = ticks;
2563	/* FALLTHROUGH */
2564	case TCPS_ESTABLISHED:
2565		tp->t_state = TCPS_CLOSE_WAIT;
2566		break;
2567	case TCPS_FIN_WAIT_1:
2568		tp->t_state = TCPS_CLOSING;
2569		break;
2570	case TCPS_FIN_WAIT_2:
2571		/*
2572		 * If we've sent an abort_req we must have sent it too late,
2573		 * HW will send us a reply telling us so, and this peer_close
2574		 * is really the last message for this connection and needs to
2575		 * be treated as an abort_rpl, i.e., transition the connection
2576		 * to TCP_CLOSE (note that the host stack does this at the
2577		 * time of generating the RST but we must wait for HW).
2578		 * Otherwise we enter TIME_WAIT.
2579		 */
2580		t3_release_offload_resources(toep);
2581		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2582			action = TCP_CLOSE;
2583		} else {
2584			action = TCP_TIMEWAIT;
2585		}
2586		break;
2587	default:
2588		log(LOG_ERR,
2589		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2590		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2591	}
2592	inp_wunlock(tp->t_inpcb);
2593
2594	if (action == TCP_TIMEWAIT) {
2595		enter_timewait(tp);
2596	} else if (action == TCP_DROP) {
2597		tcp_offload_drop(tp, 0);
2598	} else if (action == TCP_CLOSE) {
2599		tcp_offload_close(tp);
2600	}
2601
2602#ifdef notyet
2603	/* Do not send POLL_HUP for half duplex close. */
2604	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2605	    sk->sk_state == TCP_CLOSE)
2606		sk_wake_async(so, 1, POLL_HUP);
2607	else
2608		sk_wake_async(so, 1, POLL_IN);
2609#endif
2610
2611out:
2612	if (!keep)
2613		m_free(m);
2614}
2615
2616/*
2617 * Handler for PEER_CLOSE CPL messages.
2618 */
2619static int
2620do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2621{
2622	struct toepcb *toep = (struct toepcb *)ctx;
2623
2624	VALIDATE_SOCK(so);
2625
2626	do_peer_fin(toep, m);
2627	return (0);
2628}
2629
2630static void
2631process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2632{
2633	struct cpl_close_con_rpl *rpl = cplhdr(m);
2634	struct tcpcb *tp = toep->tp_tp;
2635	struct socket *so;
2636	int action = 0;
2637	struct sockbuf *rcv;
2638
2639	inp_wlock(tp->t_inpcb);
2640	so = inp_inpcbtosocket(tp->t_inpcb);
2641
2642	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2643
2644	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2645		inp_wunlock(tp->t_inpcb);
2646		goto out;
2647	}
2648
2649	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2650	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2651
2652	switch (tp->t_state) {
2653	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2654		t3_release_offload_resources(toep);
2655		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2656			action = TCP_CLOSE;
2657
2658		} else {
2659			action = TCP_TIMEWAIT;
2660		}
2661		break;
2662	case TCPS_LAST_ACK:
2663		/*
2664		 * In this state we don't care about pending abort_rpl.
2665		 * If we've sent abort_req it was post-close and was sent too
2666		 * late, this close_con_rpl is the actual last message.
2667		 */
2668		t3_release_offload_resources(toep);
2669		action = TCP_CLOSE;
2670		break;
2671	case TCPS_FIN_WAIT_1:
2672		/*
2673		 * If we can't receive any more
2674		 * data, then closing user can proceed.
2675		 * Starting the timer is contrary to the
2676		 * specification, but if we don't get a FIN
2677		 * we'll hang forever.
2678		 *
2679		 * XXXjl:
2680		 * we should release the tp also, and use a
2681		 * compressed state.
2682		 */
2683		if (so)
2684			rcv = so_sockbuf_rcv(so);
2685		else
2686			break;
2687
2688		if (rcv->sb_state & SBS_CANTRCVMORE) {
2689			int timeout;
2690
2691			if (so)
2692				soisdisconnected(so);
2693			timeout = (tcp_fast_finwait2_recycle) ?
2694			    tcp_finwait2_timeout : tcp_maxidle;
2695			tcp_timer_activate(tp, TT_2MSL, timeout);
2696		}
2697		tp->t_state = TCPS_FIN_WAIT_2;
2698		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2699		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2700			action = TCP_DROP;
2701		}
2702
2703		break;
2704	default:
2705		log(LOG_ERR,
2706		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2707		       toep->tp_toedev->tod_name, toep->tp_tid,
2708		       tp->t_state);
2709	}
2710	inp_wunlock(tp->t_inpcb);
2711
2712
2713	if (action == TCP_TIMEWAIT) {
2714		enter_timewait(tp);
2715	} else if (action == TCP_DROP) {
2716		tcp_offload_drop(tp, 0);
2717	} else if (action == TCP_CLOSE) {
2718		tcp_offload_close(tp);
2719	}
2720out:
2721	m_freem(m);
2722}
2723
2724/*
2725 * Handler for CLOSE_CON_RPL CPL messages.
2726 */
2727static int
2728do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2729			    void *ctx)
2730{
2731	struct toepcb *toep = (struct toepcb *)ctx;
2732
2733	process_close_con_rpl(toep, m);
2734	return (0);
2735}
2736
2737/*
2738 * Process abort replies.  We only process these messages if we anticipate
2739 * them as the coordination between SW and HW in this area is somewhat lacking
2740 * and sometimes we get ABORT_RPLs after we are done with the connection that
2741 * originated the ABORT_REQ.
2742 */
2743static void
2744process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2745{
2746	struct tcpcb *tp = toep->tp_tp;
2747	struct socket *so;
2748	int needclose = 0;
2749
2750#ifdef T3_TRACE
2751	T3_TRACE1(TIDTB(sk),
2752		  "process_abort_rpl: GTS rpl pending %d",
2753		  sock_flag(sk, ABORT_RPL_PENDING));
2754#endif
2755
2756	inp_wlock(tp->t_inpcb);
2757	so = inp_inpcbtosocket(tp->t_inpcb);
2758
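	/*
	 * On non-T3A hardware the first ABORT_RPL merely latches
	 * TP_ABORT_RPL_RCVD and the offload resources are released when the
	 * companion message arrives.  On T3A a crossing ABORT_REQ
	 * (TP_ABORT_REQ_RCVD) suppresses the release here.
	 */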
2759	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2760		/*
2761		 * XXX panic on tcpdrop
2762		 */
2763		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2764			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2765		else {
2766			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2767			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2768			    !is_t3a(toep->tp_toedev)) {
2769				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2770					panic("TP_ABORT_REQ_RCVD set");
2771				t3_release_offload_resources(toep);
2772				needclose = 1;
2773			}
2774		}
2775	}
2776	inp_wunlock(tp->t_inpcb);
2777
2778	if (needclose)
2779		tcp_offload_close(tp);
2780
2781	m_free(m);
2782}
2783
2784/*
2785 * Handle an ABORT_RPL_RSS CPL message.
2786 */
2787static int
2788do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2789{
2790	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2791	struct toepcb *toep;
2792
2793	/*
2794	 * Ignore replies to post-close aborts indicating that the abort was
2795	 * requested too late.  These connections are terminated when we get
2796	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2797	 * arrives the TID is either no longer used or it has been recycled.
2798	 */
2799	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2800discard:
2801		m_free(m);
2802		return (0);
2803	}
2804
2805	toep = (struct toepcb *)ctx;
2806
2807        /*
2808	 * Sometimes we've already closed the socket, e.g., a post-close
2809	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2810	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2811	 * but FW turns the ABORT_REQ into a regular one and so we get
2812	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2813	 */
2814	if (!toep)
2815		goto discard;
2816
2817	if (toep->tp_tp == NULL) {
2818		log(LOG_NOTICE, "removing tid for abort\n");
2819		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2820		if (toep->tp_l2t)
2821			l2t_release(L2DATA(cdev), toep->tp_l2t);
2822
2823		toepcb_release(toep);
2824		goto discard;
2825	}
2826
2827	log(LOG_NOTICE, "toep=%p\n", toep);
2828	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2829
2830	toepcb_hold(toep);
2831	process_abort_rpl(toep, m);
2832	toepcb_release(toep);
2833	return (0);
2834}
2835
2836/*
2837 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2838 * indicate whether RST should be sent in response.
2839 */
2840static int
2841abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2842{
2843	struct tcpcb *tp = so_sototcpcb(so);
2844
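	/*
	 * Note that *need_rst is left untouched; the caller's default
	 * (currently CPL_ABORT_NO_RST) always stands.
	 */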
2845	switch (abort_reason) {
2846	case CPL_ERR_BAD_SYN:
2847#if 0
2848		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2849#endif
2850	case CPL_ERR_CONN_RESET:
2851		// XXX need to handle SYN_RECV due to crossed SYNs
2852		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2853	case CPL_ERR_XMIT_TIMEDOUT:
2854	case CPL_ERR_PERSIST_TIMEDOUT:
2855	case CPL_ERR_FINWAIT2_TIMEDOUT:
2856	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2857#if 0
2858		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2859#endif
2860		return (ETIMEDOUT);
2861	default:
2862		return (EIO);
2863	}
2864}
2865
2866static inline void
2867set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2868{
2869	struct cpl_abort_rpl *rpl = cplhdr(m);
2870
2871	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2872	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2873	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2874
2875	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2876	rpl->cmd = cmd;
2877}
2878
2879static void
2880send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2881{
2882	struct mbuf *reply_mbuf;
2883	struct cpl_abort_req_rss *req = cplhdr(m);
2884
2885	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2886	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2887	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2888	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2889	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2890	m_free(m);
2891}
2892
2893/*
2894 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2895 */
2896static inline int
2897is_neg_adv_abort(unsigned int status)
2898{
2899	return status == CPL_ERR_RTX_NEG_ADVICE ||
2900	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2901}
2902
2903static void
2904send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2905{
2906	struct mbuf  *reply_mbuf;
2907	struct cpl_abort_req_rss *req = cplhdr(m);
2908
2909	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2910
2911	if (!reply_mbuf) {
2912		/* Defer the reply.  Stash rst_status in req->status. */
2913		req->status = rst_status;
2914		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2915		return;
2916	}
2917
2918	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2919	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2920	m_free(m);
2921
2922	/*
2923	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2924	 * these messages while ARP is pending.  For other connection states
2925	 * it's not a problem.
2926	 */
2927	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2928}
2929
2930#ifdef notyet
2931static void
2932cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2933{
2934	CXGB_UNIMPLEMENTED();
2935#ifdef notyet
2936	struct request_sock *req = child->sk_user_data;
2937
2938	inet_csk_reqsk_queue_removed(parent, req);
2939	synq_remove(tcp_sk(child));
2940	__reqsk_free(req);
2941	child->sk_user_data = NULL;
2942#endif
2943}
2944
2945
2946/*
2947 * Performs the actual work to abort a SYN_RECV connection.
2948 */
2949static void
2950do_abort_syn_rcv(struct socket *child, struct socket *parent)
2951{
2952	struct tcpcb *parenttp = so_sototcpcb(parent);
2953	struct tcpcb *childtp = so_sototcpcb(child);
2954
2955	/*
2956	 * If the server is still open we clean up the child connection,
2957	 * otherwise the server already did the clean up as it was purging
2958	 * its SYN queue and the skb was just sitting in its backlog.
2959	 */
2960	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2961		cleanup_syn_rcv_conn(child, parent);
2962		inp_wlock(childtp->t_inpcb);
2963		t3_release_offload_resources(childtp->t_toe);
2964		inp_wunlock(childtp->t_inpcb);
2965		tcp_offload_close(childtp);
2966	}
2967}
2968#endif
2969
2970/*
2971 * Handle abort requests for a SYN_RECV connection.  These need extra work
2972 * because the socket is on its parent's SYN queue.
2973 */
2974static int
2975abort_syn_rcv(struct socket *so, struct mbuf *m)
2976{
2977	CXGB_UNIMPLEMENTED();
2978#ifdef notyet
2979	struct socket *parent;
2980	struct toedev *tdev = toep->tp_toedev;
2981	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2982	struct socket *oreq = so->so_incomp;
2983	struct t3c_tid_entry *t3c_stid;
2984	struct tid_info *t;
2985
2986	if (!oreq)
2987		return -1;        /* somehow we are not on the SYN queue */
2988
2989	t = &(T3C_DATA(cdev))->tid_maps;
2990	t3c_stid = lookup_stid(t, oreq->ts_recent);
2991	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2992
2993	so_lock(parent);
2994	do_abort_syn_rcv(so, parent);
2995	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2996	so_unlock(parent);
2997#endif
2998	return (0);
2999}
3000
3001/*
3002 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3003 * request except that we need to reply to it.
3004 */
3005static void
3006process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3007{
3008	int rst_status = CPL_ABORT_NO_RST;
3009	const struct cpl_abort_req_rss *req = cplhdr(m);
3010	struct tcpcb *tp = toep->tp_tp;
3011	struct socket *so;
3012	int needclose = 0;
3013
3014	inp_wlock(tp->t_inpcb);
3015	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3016	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3017		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3018		m_free(m);
3019		goto skip;
3020	}
3021
3022	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3023	/*
3024	 * Three cases to consider:
3025	 * a) We haven't sent an abort_req; close the connection.
3026	 * b) We have sent a post-close abort_req that will get to TP too late
3027	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3028	 *    be ignored and the connection should be closed now.
3029	 * c) We have sent a regular abort_req that will get to TP too late.
3030	 *    That will generate an abort_rpl with status 0, wait for it.
3031	 */
3032	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3033	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3034		int error;
3035
3036		error = abort_status_to_errno(so, req->status,
3037		    &rst_status);
3038		so_error_set(so, error);
3039
3040		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3041			so_sorwakeup(so);
3042		/*
3043		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3044		 * returns 0 it has taken care of the abort.
3045		 */
3046		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3047			goto skip;
3048
3049		t3_release_offload_resources(toep);
3050		needclose = 1;
3051	}
3052	inp_wunlock(tp->t_inpcb);
3053
3054	if (needclose)
3055		tcp_offload_close(tp);
3056
3057	send_abort_rpl(m, tdev, rst_status);
3058	return;
3059skip:
3060	inp_wunlock(tp->t_inpcb);
3061}
3062
3063/*
3064 * Handle an ABORT_REQ_RSS CPL message.
3065 */
3066static int
3067do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3068{
3069	const struct cpl_abort_req_rss *req = cplhdr(m);
3070	struct toepcb *toep = (struct toepcb *)ctx;
3071
3072	if (is_neg_adv_abort(req->status)) {
3073		m_free(m);
3074		return (0);
3075	}
3076
3077	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3078
3079	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3080		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3081		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3082
3083		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3084		if (toep->tp_l2t)
3085			l2t_release(L2DATA(cdev), toep->tp_l2t);
3086
3087		/*
3088		 *  Unhook
3089		 */
3090		toep->tp_tp->t_toe = NULL;
3091		toep->tp_tp->t_flags &= ~TF_TOE;
3092		toep->tp_tp = NULL;
3093		/*
3094		 * XXX need to call syncache_chkrst - but we don't
3095		 * have a way of doing that yet
3096		 */
3097		toepcb_release(toep);
3098		log(LOG_ERR, "abort for unestablished connection :-(\n");
3099		return (0);
3100	}
3101	if (toep->tp_tp == NULL) {
3102		log(LOG_NOTICE, "disconnected toepcb\n");
3103		/* should be freed momentarily */
3104		return (0);
3105	}
3106
3107
3108	toepcb_hold(toep);
3109	process_abort_req(toep, m, toep->tp_toedev);
3110	toepcb_release(toep);
3111	return (0);
3112}
3113#ifdef notyet
3114static void
3115pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3116{
3117	struct toedev *tdev = TOE_DEV(parent);
3118
3119	do_abort_syn_rcv(child, parent);
3120	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3121		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3122
3123		rpl->opt0h = htonl(F_TCAM_BYPASS);
3124		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3125		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3126	} else
3127		m_free(m);
3128}
3129#endif
3130static void
3131handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3132{
3133	CXGB_UNIMPLEMENTED();
3134
3135#ifdef notyet
3136	struct t3cdev *cdev;
3137	struct socket *parent;
3138	struct socket *oreq;
3139	struct t3c_tid_entry *t3c_stid;
3140	struct tid_info *t;
3141	struct tcpcb *otp, *tp = so_sototcpcb(so);
3142	struct toepcb *toep = tp->t_toe;
3143
3144	/*
3145	 * If the connection is being aborted due to the parent listening
3146	 * socket going away there's nothing to do, the ABORT_REQ will close
3147	 * the connection.
3148	 */
3149	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3150		m_free(m);
3151		return;
3152	}
3153
3154	oreq = so->so_incomp;
3155	otp = so_sototcpcb(oreq);
3156
3157	cdev = T3C_DEV(so);
3158	t = &(T3C_DATA(cdev))->tid_maps;
3159	t3c_stid = lookup_stid(t, otp->ts_recent);
3160	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3161
3162	so_lock(parent);
3163	pass_open_abort(so, parent, m);
3164	so_unlock(parent);
3165#endif
3166}
3167
3168/*
3169 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3170 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3171 * connection.
3172 */
3173static void
3174pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3175{
3176
3177#ifdef notyet
3178	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3179	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3180#endif
3181	handle_pass_open_arp_failure(m_get_socket(m), m);
3182}
3183
3184/*
3185 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3186 */
3187static void
3188mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3189{
3190	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3191	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3192	unsigned int tid = GET_TID(req);
3193
3194	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3195	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3196	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3197	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3198	rpl->opt0h = htonl(F_TCAM_BYPASS);
3199	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3200	rpl->opt2 = 0;
3201	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3202}
3203
3204/*
3205 * Send a deferred reject to an accept request.
3206 */
3207static void
3208reject_pass_request(struct toedev *tdev, struct mbuf *m)
3209{
3210	struct mbuf *reply_mbuf;
3211
3212	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3213	mk_pass_accept_rpl(reply_mbuf, m);
3214	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3215	m_free(m);
3216}
3217
3218static void
3219handle_syncache_event(int event, void *arg)
3220{
3221	struct toepcb *toep = arg;
3222
3223	switch (event) {
3224	case TOE_SC_ENTRY_PRESENT:
3225		/*
3226		 * entry already exists - free toepcb
3227		 * and l2t
3228		 */
3229		printf("syncache entry present\n");
3230		toepcb_release(toep);
3231		break;
3232	case TOE_SC_DROP:
3233		/*
3234		 * The syncache has given up on this entry
3235		 * either it timed out, or it was evicted
3236		 * we need to explicitly release the tid
3237		 */
3238		printf("syncache entry dropped\n");
3239		toepcb_release(toep);
3240		break;
3241	default:
3242		log(LOG_ERR, "unknown syncache event %d\n", event);
3243		break;
3244	}
3245}
3246
3247static void
3248syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3249{
3250	struct in_conninfo inc;
3251	struct tcpopt to;
3252	struct tcphdr th;
3253	struct inpcb *inp;
3254	int mss, wsf, sack, ts;
3255	uint32_t rcv_isn = ntohl(req->rcv_isn);
3256
3257	bzero(&to, sizeof(struct tcpopt));
3258	inp = so_sotoinpcb(lso);
3259
3260	/*
3261	 * Fill out information for entering us into the syncache
3262	 */
3263	bzero(&inc, sizeof(inc));
3264	inc.inc_fport = th.th_sport = req->peer_port;
3265	inc.inc_lport = th.th_dport = req->local_port;
3266	th.th_seq = req->rcv_isn;
3267	th.th_flags = TH_SYN;
3268
3269	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3270
3271
3272	inc.inc_isipv6 = 0;
3273	inc.inc_len = 0;
3274	inc.inc_faddr.s_addr = req->peer_ip;
3275	inc.inc_laddr.s_addr = req->local_ip;
3276
3277	DPRINTF("syncache add of %d:%d %d:%d\n",
3278	    ntohl(req->local_ip), ntohs(req->local_port),
3279	    ntohl(req->peer_ip), ntohs(req->peer_port));
3280
3281	mss = req->tcp_options.mss;
3282	wsf = req->tcp_options.wsf;
3283	ts = req->tcp_options.tstamp;
3284	sack = req->tcp_options.sack;
3285	to.to_mss = mss;
3286	to.to_wscale = wsf;
3287	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3288	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3289}
3290
3291
3292/*
3293 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3294 * lock held.  Note that the sock here is a listening socket that is not owned
3295 * by the TOE.
3296 */
3297static void
3298process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3299    struct listen_ctx *lctx)
3300{
3301	int rt_flags;
3302	struct l2t_entry *e;
3303	struct iff_mac tim;
3304	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3305	struct cpl_pass_accept_rpl *rpl;
3306	struct cpl_pass_accept_req *req = cplhdr(m);
3307	unsigned int tid = GET_TID(req);
3308	struct tom_data *d = TOM_DATA(tdev);
3309	struct t3cdev *cdev = d->cdev;
3310	struct tcpcb *tp = so_sototcpcb(so);
3311	struct toepcb *newtoep;
3312	struct rtentry *dst;
3313	struct sockaddr_in nam;
3314	struct t3c_data *td = T3C_DATA(cdev);
3315
3316	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3317	if (__predict_false(reply_mbuf == NULL)) {
3318		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3319			t3_defer_reply(m, tdev, reject_pass_request);
3320		else {
3321			cxgb_queue_tid_release(cdev, tid);
3322			m_free(m);
3323		}
3324		DPRINTF("failed to get reply_mbuf\n");
3325
3326		goto out;
3327	}
3328
3329	if (tp->t_state != TCPS_LISTEN) {
3330		DPRINTF("socket not in listen state\n");
3331
3332		goto reject;
3333	}
3334
3335	tim.mac_addr = req->dst_mac;
3336	tim.vlan_tag = ntohs(req->vlan_tag);
3337	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3338		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3339		goto reject;
3340	}
3341
3342#ifdef notyet
3343	/*
3344	 * XXX do route lookup to confirm that we're still listening on this
3345	 * address
3346	 */
3347	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3348			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3349		goto reject;
3350	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3351		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3352	dst_release(skb->dst);	// done with the input route, release it
3353	skb->dst = NULL;
3354
3355	if ((rt_flags & RTF_LOCAL) == 0)
3356		goto reject;
3357#endif
3358	/*
3359	 * XXX
3360	 */
3361	rt_flags = RTF_LOCAL;
3362	if ((rt_flags & RTF_LOCAL) == 0)
3363		goto reject;
3364
3365	/*
3366	 * Calculate values and add to syncache
3367	 */
3368
3369	newtoep = toepcb_alloc();
3370	if (newtoep == NULL)
3371		goto reject;
3372
3373	bzero(&nam, sizeof(struct sockaddr_in));
3374
3375	nam.sin_len = sizeof(struct sockaddr_in);
3376	nam.sin_family = AF_INET;
3377	nam.sin_addr.s_addr = req->peer_ip;
3378	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3379
3380	if (dst == NULL) {
3381		printf("failed to find route\n");
3382		goto reject;
3383	}
3384	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3385	    (struct sockaddr *)&nam);
3386	if (e == NULL) {
3387		DPRINTF("failed to get l2t\n");
		goto reject;	/* rpl->opt0h and l2t_send dereference e below */
3388	}
3389	/*
3390	 * Point to our listen socket until accept
3391	 */
3392	newtoep->tp_tp = tp;
3393	newtoep->tp_flags = TP_SYN_RCVD;
3394	newtoep->tp_tid = tid;
3395	newtoep->tp_toedev = tdev;
3396	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3397
3398	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3399	so_lock(so);
3400	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3401	so_unlock(so);
3402
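	/*
	 * Enable DDP only if the ddp tunable allows it, the socket has not
	 * opted out via SO_NO_DDP, and the receive window is at least
	 * MIN_DDP_RCV_WIN.
	 */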
3403	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3404		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3405
3406	if (newtoep->tp_ulp_mode) {
3407		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3408
3409		if (ddp_mbuf == NULL)
3410			newtoep->tp_ulp_mode = 0;
3411	}
3412
3413	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3414	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3415	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3416	/*
3417	 * XXX workaround for lack of syncache drop
3418	 */
3419	toepcb_hold(newtoep);
3420	syncache_add_accept_req(req, so, newtoep);
3421
3422	rpl = cplhdr(reply_mbuf);
3423	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3424	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3425	rpl->wr.wr_lo = 0;
3426	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3427	rpl->opt2 = htonl(calc_opt2(so, tdev));
3428	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3429	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3430
3431	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3432	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3433	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3434				  CPL_PASS_OPEN_ACCEPT);
3435
3436	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3437
3438	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3439
3440	l2t_send(cdev, reply_mbuf, e);
3441	m_free(m);
3442	if (newtoep->tp_ulp_mode) {
3443		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3444				V_TF_DDP_OFF(1) |
3445				TP_DDP_TIMER_WORKAROUND_MASK,
3446				V_TF_DDP_OFF(1) |
3447		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3448	} else
3449		printf("not offloading\n");
3450
3453	return;
3454reject:
3455	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3456		mk_pass_accept_rpl(reply_mbuf, m);
3457	else
3458		mk_tid_release(reply_mbuf, newtoep, tid);
3459	cxgb_ofld_send(cdev, reply_mbuf);
3460	m_free(m);
3461out:
3462#if 0
3463	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3464#else
3465	return;
3466#endif
3467}
3468
3469/*
3470 * Handle a CPL_PASS_ACCEPT_REQ message.
3471 */
3472static int
3473do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3474{
3475	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3476	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3477	struct tom_data *d = listen_ctx->tom_data;
3478
3479#if VALIDATE_TID
3480	struct cpl_pass_accept_req *req = cplhdr(m);
3481	unsigned int tid = GET_TID(req);
3482	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3483
3484	if (unlikely(!lsk)) {
3485		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3486		       cdev->name,
3487		       (unsigned long)((union listen_entry *)ctx -
3488					t->stid_tab));
3489		return CPL_RET_BUF_DONE;
3490	}
3491	if (unlikely(tid >= t->ntids)) {
3492		printk(KERN_ERR "%s: passive open TID %u too large\n",
3493		       cdev->name, tid);
3494		return CPL_RET_BUF_DONE;
3495	}
3496	/*
3497	 * For T3A the current user of the TID may have closed but its last
3498	 * message(s) may have been backlogged so the TID appears to be still
3499	 * in use.  Just take the TID away, the connection can close at its
3500	 * own leisure.  For T3B this situation is a bug.
3501	 */
3502	if (!valid_new_tid(t, tid) &&
3503	    cdev->type != T3A) {
3504		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3505		       cdev->name, tid);
3506		return CPL_RET_BUF_DONE;
3507	}
3508#endif
3509
3510	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3511	return (0);
3512}
3513
3514/*
3515 * Called when a connection is established to translate the TCP options
3516 * reported by HW to FreeBSD's native format.
3517 */
3518static void
3519assign_rxopt(struct socket *so, unsigned int opt)
3520{
3521	struct tcpcb *tp = so_sototcpcb(so);
3522	struct toepcb *toep = tp->t_toe;
3523	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3524
3525	inp_lock_assert(tp->t_inpcb);
3526
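	/*
	 * The MTU table holds full IP MTUs, so subtract 40 bytes of fixed
	 * IP and TCP header to arrive at the MSS clamp.
	 */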
3527	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3528	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3529	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3530	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3531	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3532	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3533		tp->rcv_scale = tp->request_r_scale;
3534}
3535
3536/*
3537 * Completes some final bits of initialization for just established connections
3538 * and changes their state to TCP_ESTABLISHED.
3539 *
3540 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3541 */
3542static void
3543make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3544{
3545	struct tcpcb *tp = so_sototcpcb(so);
3546	struct toepcb *toep = tp->t_toe;
3547
3548	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3549	assign_rxopt(so, opt);
3550
3551	/*
3552	 * XXX hook up t3_ctloutput once socket option handling is wired in.
3553	 */
3555#ifdef notyet
3556	so->so_proto->pr_ctloutput = t3_ctloutput;
3557#endif
3558
3559#if 0
3560	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3561#endif
3562	/*
3563	 * XXX not clear what rcv_wup maps to
3564	 */
3565	/*
3566	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3567	 * pass through opt0.
3568	 */
3569	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3570		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3571
3572	dump_toepcb(toep);
3573
3574#ifdef notyet
3575/*
3576 * no clean interface for marking ARP up to date
3577 */
3578	dst_confirm(sk->sk_dst_cache);
3579#endif
3580	tp->t_starttime = ticks;
3581	tp->t_state = TCPS_ESTABLISHED;
3582	soisconnected(so);
3583}
3584
3585static int
3586syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3587{
3588
3589	struct in_conninfo inc;
3590	struct tcpopt to;
3591	struct tcphdr th;
3592	int mss, wsf, sack, ts;
3593	struct mbuf *m = NULL;
3594	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3595	unsigned int opt;
3596
3597#ifdef MAC
3598#error	"no MAC support"
3599#endif
3600
3601	opt = ntohs(req->tcp_opt);
3602
3603	bzero(&to, sizeof(struct tcpopt));
3604
3605	/*
3606	 * Fill out information for entering us into the syncache
3607	 */
3608	bzero(&inc, sizeof(inc));
3609	inc.inc_fport = th.th_sport = req->peer_port;
3610	inc.inc_lport = th.th_dport = req->local_port;
3611	th.th_seq = req->rcv_isn;
3612	th.th_flags = TH_ACK;
3613
3614	inc.inc_isipv6 = 0;
3615	inc.inc_len = 0;
3616	inc.inc_faddr.s_addr = req->peer_ip;
3617	inc.inc_laddr.s_addr = req->local_ip;
3618
3619	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3620	wsf  = G_TCPOPT_WSCALE_OK(opt);
3621	ts   = G_TCPOPT_TSTAMP(opt);
3622	sack = G_TCPOPT_SACK(opt);
3623
3624	to.to_mss = mss;
3625	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3626	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3627
3628	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3629	    ntohl(req->local_ip), ntohs(req->local_port),
3630	    ntohl(req->peer_ip), ntohs(req->peer_port),
3631	    mss, wsf, ts, sack);
3632	return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3633}
3634
3635
3636/*
3637 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3638 * if we are in TCP_SYN_RECV due to crossed SYNs
3639 */
3640static int
3641do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3642{
3643	struct cpl_pass_establish *req = cplhdr(m);
3644	struct toepcb *toep = (struct toepcb *)ctx;
3645	struct tcpcb *tp = toep->tp_tp;
3646	struct socket *so, *lso;
3647	struct t3c_data *td = T3C_DATA(cdev);
3648	struct sockbuf *snd, *rcv;
3649
3650	// Complete socket initialization now that we have the SND_ISN
3651
3652	struct toedev *tdev;
3653
3655	tdev = toep->tp_toedev;
3656
3657	inp_wlock(tp->t_inpcb);
3658
3659	/*
3660	 *
3661	 * XXX need to add reference while we're manipulating
3662	 */
3663	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3664
3665	inp_wunlock(tp->t_inpcb);
3666
3667	so_lock(so);
3668	LIST_REMOVE(toep, synq_entry);
3669	so_unlock(so);
3670
3671	if (!syncache_expand_establish_req(req, &so, toep)) {
3672		/*
3673		 * No entry
3674		 */
3675		CXGB_UNIMPLEMENTED();
3676	}
3677	if (so == NULL) {
3678		/*
3679		 * Couldn't create the socket
3680		 */
3681		CXGB_UNIMPLEMENTED();
3682	}
3683
3684	tp = so_sototcpcb(so);
3685	inp_wlock(tp->t_inpcb);
3686
3687	snd = so_sockbuf_snd(so);
3688	rcv = so_sockbuf_rcv(so);
3689
3690	snd->sb_flags |= SB_NOCOALESCE;
3691	rcv->sb_flags |= SB_NOCOALESCE;
3692
3693	toep->tp_tp = tp;
3694	toep->tp_flags = 0;
3695	tp->t_toe = toep;
3696	reset_wr_list(toep);
3697	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3698	tp->rcv_nxt = toep->tp_copied_seq;
3699	install_offload_ops(so);
3700
3701	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3702	toep->tp_wr_unacked = 0;
3703	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3704	toep->tp_qset_idx = 0;
3705	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3706
3707	/*
3708	 * XXX Cancel any keep alive timer
3709	 */
3710
3711	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3712
3713	/*
3714	 * XXX workaround for lack of syncache drop
3715	 */
3716	toepcb_release(toep);
3717	inp_wunlock(tp->t_inpcb);
3718
3719	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3720	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3721#ifdef notyet
3722	/*
3723	 * XXX not sure how these checks map to us
3724	 */
3725	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3726		sk->sk_state_change(sk);
3727		sk_wake_async(so, 0, POLL_OUT);
3728	}
3729	/*
3730	 * The state for the new connection is now up to date.
3731	 * Next check if we should add the connection to the parent's
3732	 * accept queue.  When the parent closes it resets connections
3733	 * on its SYN queue, so check if we are being reset.  If so we
3734	 * don't need to do anything more, the coming ABORT_RPL will
3735	 * destroy this socket.  Otherwise move the connection to the
3736	 * accept queue.
3737	 *
3738	 * Note that we reset the synq before closing the server so if
3739	 * we are not being reset the stid is still open.
3740	 */
3741	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3742		__kfree_skb(skb);
3743		goto unlock;
3744	}
3745#endif
3746	m_free(m);
3747
3748	return (0);
3749}
3750
3751/*
3752 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3753 * and send them to the TOE.
3754 */
3755static void
3756fixup_and_send_ofo(struct toepcb *toep)
3757{
3758	struct mbuf *m;
3759	struct toedev *tdev = toep->tp_toedev;
3760	struct tcpcb *tp = toep->tp_tp;
3761	unsigned int tid = toep->tp_tid;
3762
3763	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3764
3765	inp_lock_assert(tp->t_inpcb);
3766	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3767		/*
3768		 * A variety of messages can be waiting but the fields we'll
3769		 * be touching are common to all so any message type will do.
3770		 */
3771		struct cpl_close_con_req *p = cplhdr(m);
3772
3773		p->wr.wr_lo = htonl(V_WR_TID(tid));
3774		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3775		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3776	}
3777}
3778
3779/*
3780 * Updates socket state from an active establish CPL message.  Runs with the
3781 * socket lock held.
3782 */
3783static void
3784socket_act_establish(struct socket *so, struct mbuf *m)
3785{
3786	struct cpl_act_establish *req = cplhdr(m);
3787	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3788	struct tcpcb *tp = so_sototcpcb(so);
3789	struct toepcb *toep = tp->t_toe;
3790
3791	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3792		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3793		    toep->tp_tid, tp->t_state);
3794
3795	tp->ts_recent_age = ticks;
3796	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3797	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3798
3799	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3800
3801	/*
3802	 * Now that we finally have a TID send any CPL messages that we had to
3803	 * defer for lack of a TID.
3804	 */
3805	if (mbufq_len(&toep->out_of_order_queue))
3806		fixup_and_send_ofo(toep);
3807
3808	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3809		/*
3810		 * XXX does this even make sense?
3811		 */
3812		so_sorwakeup(so);
3813	}
3814	m_free(m);
3815#ifdef notyet
3816/*
3817 * XXX assume no write requests permitted while socket connection is
3818 * incomplete
3819 */
3820	/*
3821	 * Currently the send queue must be empty at this point because the
3822	 * socket layer does not send anything before a connection is
3823	 * established.  To be future proof though we handle the possibility
3824	 * that there are pending buffers to send (either TX_DATA or
3825	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3826	 * buffers according to the just learned write_seq, and then we send
3827	 * them on their way.
3828	 */
3829	fixup_pending_writeq_buffers(sk);
3830	if (t3_push_frames(so, 1))
3831		sk->sk_write_space(sk);
3832#endif
3833
3834	toep->tp_state = tp->t_state;
3835	V_tcpstat.tcps_connects++;
3836
3837}
3838
3839/*
3840 * Process a CPL_ACT_ESTABLISH message.
3841 */
3842static int
3843do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3844{
3845	struct cpl_act_establish *req = cplhdr(m);
3846	unsigned int tid = GET_TID(req);
3847	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3848	struct toepcb *toep = (struct toepcb *)ctx;
3849	struct tcpcb *tp = toep->tp_tp;
3850	struct socket *so;
3851	struct toedev *tdev;
3852	struct tom_data *d;
3853
3854	if (tp == NULL) {
3855		free_atid(cdev, atid);
3856		return (0);
3857	}
3858	inp_wlock(tp->t_inpcb);
3859
3860	/*
3861	 * XXX
3862	 */
3863	so = inp_inpcbtosocket(tp->t_inpcb);
3864	tdev = toep->tp_toedev; /* blow up here if link was down */
3865	d = TOM_DATA(tdev);
3866
3867	/*
3868	 * It's OK if the TID is currently in use, the owning socket may have
3869	 * backlogged its last CPL message(s).  Just take it away.
3870	 */
3871	toep->tp_tid = tid;
3872	toep->tp_tp = tp;
3873	so_insert_tid(d, toep, tid);
3874	free_atid(cdev, atid);
3875	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3876
3877	socket_act_establish(so, m);
3878	inp_wunlock(tp->t_inpcb);
3879	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3880	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3881
3882	return (0);
3883}
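
/*
 * TID bookkeeping sketch for the active-open path (descriptive note):
 * the atid was allocated when the connection request went out and is
 * what the hardware echoes back in ACT_ESTABLISH.  Once the real tid is
 * known, the toepcb is entered into the TID table (so_insert_tid) and
 * the atid returned to its pool (free_atid); from this point on CPLs
 * for the connection are demuxed by tid rather than atid.
 */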
3884
3885/*
3886 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3887 * next batch of work requests from the write queue.
3888 */
3889static void
3890wr_ack(struct toepcb *toep, struct mbuf *m)
3891{
3892	struct tcpcb *tp = toep->tp_tp;
3893	struct cpl_wr_ack *hdr = cplhdr(m);
3894	struct socket *so;
3895	unsigned int credits = ntohs(hdr->credits);
3896	u32 snd_una = ntohl(hdr->snd_una);
3897	int bytes = 0;
3898	struct sockbuf *snd;
3899
3900	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3901
3902	inp_wlock(tp->t_inpcb);
3903	so = inp_inpcbtosocket(tp->t_inpcb);
3904	toep->tp_wr_avail += credits;
3905	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3906		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3907
3908	while (credits) {
3909		struct mbuf *p = peek_wr(toep);
3910
3911		if (__predict_false(!p)) {
3912			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3913			    "nothing pending, state %u wr_avail=%u\n",
3914			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3915			break;
3916		}
3917		CTR2(KTR_TOM,
3918			"wr_ack: p->credits=%d p->bytes=%d",
3919		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3920		KASSERT(p->m_pkthdr.csum_data != 0,
3921		    ("empty request still on list"));
3922
3923		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3924
3925#if DEBUG_WR > 1
3926			struct tx_data_wr *w = cplhdr(p);
3927			log(LOG_ERR,
3928			       "TID %u got %u WR credits, need %u, len %u, "
3929			       "seq # %u, ACK una %u, ACK nxt %u, "
3930			       "WR_AVAIL %u, WRs pending %u\n",
3931			       toep->tp_tid, credits, p->m_pkthdr.csum_data,
3932			       p->m_pkthdr.len, ntohl(w->sndseq), snd_una,
3933			       ntohl(hdr->snd_nxt), toep->tp_wr_avail,
3934			       count_pending_wrs(tp) - credits);
3935#endif
3936			p->m_pkthdr.csum_data -= credits;
3937			break;
3938		} else {
3939			dequeue_wr(toep);
3940			credits -= p->m_pkthdr.csum_data;
3941			bytes += p->m_pkthdr.len;
3942			CTR3(KTR_TOM,
3943			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3944			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3945
3946			m_free(p);
3947		}
3948	}
3949
3950#if DEBUG_WR
3951	check_wr_invariants(tp);
3952#endif
3953
3954	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3955#if VALIDATE_SEQ
3956		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3957
3958		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3959		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3960		    toep->tp_tid, tp->snd_una);
3961#endif
3962		goto out_free;
3963	}
3964
3965	if (tp->snd_una != snd_una) {
3966		tp->snd_una = snd_una;
3967		tp->ts_recent_age = ticks;
3968#ifdef notyet
3969		/*
3970		 * Keep ARP entry "minty fresh"
3971		 */
3972		dst_confirm(sk->sk_dst_cache);
3973#endif
3974		if (tp->snd_una == tp->snd_nxt)
3975			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3976	}
3977
3978	snd = so_sockbuf_snd(so);
3979	if (bytes) {
3980		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3982		sockbuf_lock(snd);
3983		sbdrop_locked(snd, bytes);
3984		so_sowwakeup_locked(so);
3985	}
3986
3987	if (snd->sb_sndptroff < snd->sb_cc)
3988		t3_push_frames(so, 0);
3989
3990out_free:
3991	inp_wunlock(tp->t_inpcb);
3992	m_free(m);
3993}
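
/*
 * Credit accounting example for the loop above (illustrative numbers):
 * each queued WR records its cost in m_pkthdr.csum_data.  With WRs of
 * cost 3 and 4 pending and a WR_ACK returning credits = 5, the first WR
 * is dequeued (5 - 3 = 2 credits left), the second cannot be covered
 * (2 < 4), so the remainder is charged against it (csum_data 4 -> 2)
 * and the loop stops.  The bytes freed by completed WRs are then
 * dropped from the send buffer and sleeping writers are woken up.
 */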
3994
3995/*
3996 * Handler for TX_DMA_ACK CPL messages.
3997 */
3998static int
3999do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
4000{
4001	struct toepcb *toep = (struct toepcb *)ctx;
4002
4003	VALIDATE_SOCK(so);
4004
4005	wr_ack(toep, m);
4006	return (0);
4007}
4008
4009/*
4010 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4011 */
4012static int
4013do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4014{
4015	m_freem(m);
4016	return (0);
4017}
4018
4019/*
4020 * Reset a connection that is on a listener's SYN queue or accept queue,
4021 * i.e., one that has not had a struct socket associated with it.
4022 * Must be called from process context.
4023 *
4024 * Modeled after code in inet_csk_listen_stop().
4025 */
4026static void
4027t3_reset_listen_child(struct socket *child)
4028{
4029	struct tcpcb *tp = so_sototcpcb(child);
4030
4031	t3_send_reset(tp->t_toe);
4032}
4033
4035static void
4036t3_child_disconnect(struct socket *so, void *arg)
4037{
4038	struct tcpcb *tp = so_sototcpcb(so);
4039
4040	if (tp->t_flags & TF_TOE) {
4041		inp_wlock(tp->t_inpcb);
4042		t3_reset_listen_child(so);
4043		inp_wunlock(tp->t_inpcb);
4044	}
4045}
4046
4047/*
4048 * Disconnect offloaded established but not yet accepted connections sitting
4049 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4050 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4051 */
4052void
4053t3_disconnect_acceptq(struct socket *listen_so)
4054{
4055
4056	so_lock(listen_so);
4057	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4058	so_unlock(listen_so);
4059}
4060
4061/*
4062 * Reset offloaded connections sitting on a server's syn queue.  As above
4063 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4064 */
4066void
4067t3_reset_synq(struct listen_ctx *lctx)
4068{
4069	struct toepcb *toep;
4070
4071	so_lock(lctx->lso);
4072	while (!LIST_EMPTY(&lctx->synq_head)) {
4073		toep = LIST_FIRST(&lctx->synq_head);
4074		LIST_REMOVE(toep, synq_entry);
4075		toep->tp_tp = NULL;
4076		t3_send_reset(toep);
4077		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4078		toepcb_release(toep);
4079	}
4080	so_unlock(lctx->lso);
4081}
4082
4084int
4085t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4086		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4087		   unsigned int pg_off, unsigned int color)
4088{
4089	unsigned int i, j, pidx;
4090	struct pagepod *p;
4091	struct mbuf *m;
4092	struct ulp_mem_io *req;
4093	unsigned int tid = toep->tp_tid;
4094	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4095	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4096
4097	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4098	    gl, nppods, tag, maxoff, pg_off, color);
4099
4100	for (i = 0; i < nppods; ++i) {
4101		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4102		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4103		req = mtod(m, struct ulp_mem_io *);
4104		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4105		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4106		req->wr.wr_lo = 0;
4107		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4108					   V_ULPTX_CMD(ULP_MEM_WRITE));
4109		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4110				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4111
4112		p = (struct pagepod *)(req + 1);
4113		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
4114			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4115			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4116						  V_PPOD_COLOR(color));
4117			p->pp_max_offset = htonl(maxoff);
4118			p->pp_page_offset = htonl(pg_off);
4119			p->pp_rsvd = 0;
4120			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4121				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4122				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4123		} else
4124			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4125		send_or_defer(toep, m, 0);
4126		ppod_addr += PPOD_SIZE;
4127	}
4128	return (0);
4129}
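
/*
 * Page pod layout note (derived from the loop above): pod i publishes
 * page indices 4*i .. 4*i+4, so consecutive pods overlap by one page,
 * letting the hardware cross a pod boundary without an extra lookup.
 * Pods live in adapter-local memory starting at
 * ddp_llimit + tag * PPOD_SIZE and are written 32 bytes at a time via
 * ULP_MEM_WRITE bypass WRs; the trailing NUM_SENTINEL_PPODS pods are
 * deliberately left invalid to act as end markers.
 */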
4130
4131/*
4132 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4133 */
4134static inline void
4135mk_cpl_barrier_ulp(struct cpl_barrier *b)
4136{
4137	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4138
4139	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4140	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4141	b->opcode = CPL_BARRIER;
4142}
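
/*
 * The mk_*_ulp() helpers here all follow the pattern above: the CPL is
 * prefixed in place with a ULP_TXPKT command whose length is given in
 * 8-byte flits.  That framing is what lets several CPLs be packed back
 * to back inside a single BYPASS work request and processed by the TP
 * as one unit.
 */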
4143
4144/*
4145 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4146 */
4147static inline void
4148mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4149{
4150	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4151
4153	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4154	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4155	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4156	req->cpuno = htons(cpuno);
4157}
4158
4159/*
4160 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4161 */
4162static inline void
4163mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4164                     unsigned int word, uint64_t mask, uint64_t val)
4165{
4166	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4167
4168	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4169	    tid, word, mask, val);
4170
4171	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4172	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4173	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4174	req->reply = V_NO_REPLY(1);
4175	req->cpu_idx = 0;
4176	req->word = htons(word);
4177	req->mask = htobe64(mask);
4178	req->val = htobe64(val);
4179}
4180
4181/*
4182 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4183 */
4184static void
4185mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4186    unsigned int tid, unsigned int credits)
4187{
4188	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4189
4190	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4191	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4192	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4193	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4194	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4195				 V_RX_CREDITS(credits));
4196}
4197
4198void
4199t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4200{
4201	unsigned int wrlen;
4202	struct mbuf *m;
4203	struct work_request_hdr *wr;
4204	struct cpl_barrier *lock;
4205	struct cpl_set_tcb_field *req;
4206	struct cpl_get_tcb *getreq;
4207	struct ddp_state *p = &toep->tp_ddp_state;
4208
4209#if 0
4210	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4211#endif
4212	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4213		sizeof(*getreq);
4214	m = m_gethdr_nofail(wrlen);
4215	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4216	wr = mtod(m, struct work_request_hdr *);
4217	bzero(wr, wrlen);
4218
4219	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4220	m->m_pkthdr.len = m->m_len = wrlen;
4221
4222	lock = (struct cpl_barrier *)(wr + 1);
4223	mk_cpl_barrier_ulp(lock);
4224
4225	req = (struct cpl_set_tcb_field *)(lock + 1);
4226
4227	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4228
4229	/* Not sure if this is actually a good thing: reactivating the
4230	 * other buffer might be an issue if it has already been completed.
4231	 * However, that is unlikely, since the fact that the UBUF is not
4232	 * completed indicates that there is no outstanding data.
4233	 */
4234	if (bufidx == 0)
4235		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4236				     V_TF_DDP_ACTIVE_BUF(1) |
4237				     V_TF_DDP_BUF0_VALID(1),
4238				     V_TF_DDP_ACTIVE_BUF(1));
4239	else
4240		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4241				     V_TF_DDP_ACTIVE_BUF(1) |
4242				     V_TF_DDP_BUF1_VALID(1), 0);
4243
4244	getreq = (struct cpl_get_tcb *)(req + 1);
4245	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4246
4247	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4248
4249	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4251	p->get_tcb_count++;
4252
4253#ifdef T3_TRACE
4254	T3_TRACE1(TIDTB(so),
4255		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4256#endif
4257	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4258}
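
/*
 * Wire-order shape of the compound WR built above (descriptive note):
 *
 *	BARRIER | SET_TCB_FIELD(RX_DDP_FLAGS) | GET_TCB | BARRIER
 *
 * The leading barrier drains traffic ahead of the flag flip, the
 * GET_TCB snapshots the TCB so the reply can report how much data the
 * cancelled buffer absorbed, and the trailing barrier keeps later
 * segments from overtaking the snapshot.  get_tcb_count records how
 * many GET_TCB replies are still expected.
 */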
4259
4260/**
4261 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4262 * @toep: the toepcb for the connection that owns the buffers
4263 * @bufidx: index of HW DDP buffer (0 or 1)
4264 * @tag0: new tag for HW buffer 0
4265 * @tag1: new tag for HW buffer 1
4266 * @len: new length for HW buf @bufidx
4267 *
4268 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4269 * buffer by changing the buffer tag and length and setting the valid and
4270 * active flag accordingly.  The caller must ensure the new buffer is at
4271 * least as big as the existing one.  Since we typically reprogram both HW
4272 * buffers this function sets both tags for convenience. Read the TCB to
4273 * determine how much data was written into the buffer before the overlay
4274 * took place.
4275 */
4276void
4277t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4278	 	       unsigned int tag1, unsigned int len)
4279{
4280	unsigned int wrlen;
4281	struct mbuf *m;
4282	struct work_request_hdr *wr;
4283	struct cpl_get_tcb *getreq;
4284	struct cpl_set_tcb_field *req;
4285	struct ddp_state *p = &toep->tp_ddp_state;
4286
4287	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4288	    bufidx, tag0, tag1, len);
4289#if 0
4290	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4291#endif
4292	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4293	m = m_gethdr_nofail(wrlen);
4294	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4295	wr = mtod(m, struct work_request_hdr *);
4296	m->m_pkthdr.len = m->m_len = wrlen;
4297	bzero(wr, wrlen);
4298
4300	/* Set the ATOMIC flag to make sure that TP processes the following
4301	 * CPLs in an atomic manner and no wire segments can be interleaved.
4302	 */
4303	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4304	req = (struct cpl_set_tcb_field *)(wr + 1);
4305	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4306			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4307			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4308			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4309			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4310	req++;
4311	if (bufidx == 0) {
4312		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4313			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4314			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4315		req++;
4316		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4317			    V_TF_DDP_PUSH_DISABLE_0(1) |
4318			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4319			    V_TF_DDP_PUSH_DISABLE_0(0) |
4320			    V_TF_DDP_BUF0_VALID(1));
4321	} else {
4322		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4323			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4324			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4325		req++;
4326		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4327			    V_TF_DDP_PUSH_DISABLE_1(1) |
4328			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4329			    V_TF_DDP_PUSH_DISABLE_1(0) |
4330			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4331	}
4332
4333	getreq = (struct cpl_get_tcb *)(req + 1);
4334	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4335
4336	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4338	p->get_tcb_count++;
4339
4340#ifdef T3_TRACE
4341	T3_TRACE4(TIDTB(sk),
4342		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4343		  "len %d",
4344		  bufidx, tag0, tag1, len);
4345#endif
4346	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4347}
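
/*
 * Note that the first SET_TCB_FIELD above rewrites both buffer tags at
 * once: the TCB word is written 64 bits at a time, with BUF0's tag in
 * the low half and BUF1's tag shifted into the high half, which is why
 * the mask and value are assembled with << 32.
 */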
4348
4349/*
4350 * Sends a compound WR containing all the CPL messages needed to program the
4351 * two HW DDP buffers, namely optionally setting up the length and offset of
4352 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4353 */
4354void
4355t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4356		      unsigned int len1, unsigned int offset1,
4357                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4358{
4359	unsigned int wrlen;
4360	struct mbuf *m;
4361	struct work_request_hdr *wr;
4362	struct cpl_set_tcb_field *req;
4363
4364	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4365	    len0, offset0, len1, offset1, (unsigned int)(ddp_flags >> 32), (unsigned int)(ddp_flags & 0xffffffff));
4366
4367#if 0
4368	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4369#endif
4370	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4371		(len1 ? sizeof(*req) : 0) +
4372		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4373	m = m_gethdr_nofail(wrlen);
4374	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4375	wr = mtod(m, struct work_request_hdr *);
4376	bzero(wr, wrlen);
4377
4378	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4379	m->m_pkthdr.len = m->m_len = wrlen;
4380
4381	req = (struct cpl_set_tcb_field *)(wr + 1);
4382	if (len0) {                  /* program buffer 0 offset and length */
4383		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4384			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4385			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4386			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4387			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4388		req++;
4389	}
4390	if (len1) {                  /* program buffer 1 offset and length */
4391		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4392			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4393			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4394			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4395			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4396		req++;
4397	}
4398
4399	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4400			     ddp_flags);
4401
4402	if (modulate) {
4403		mk_rx_data_ack_ulp(toep,
4404		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4405		    toep->tp_copied_seq - toep->tp_rcv_wup);
4406		toep->tp_rcv_wup = toep->tp_copied_seq;
4407	}
4408
4409#ifdef T3_TRACE
4410	T3_TRACE5(TIDTB(sk),
4411		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4412		  "modulate %d",
4413		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4414		  modulate);
4415#endif
4416
4417	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4418}
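
/*
 * Sizing example for the WR above (illustrative): programming only
 * buffer 0 with modulation enabled gives wrlen = one WR header + two
 * SET_TCB_FIELD CPLs (offset/length, then flags) + one RX_DATA_ACK.
 * The RX_DATA_ACK returns tp_copied_seq - tp_rcv_wup bytes of receive
 * window credit, and tp_rcv_wup is advanced so the same credit is
 * never returned twice.
 */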
4419
4420void
4421t3_init_wr_tab(unsigned int wr_len)
4422{
4423	int i;
4424
4425	if (mbuf_wrs[1])     /* already initialized */
4426		return;
4427
4428	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4429		int sgl_len = (3 * i) / 2 + (i & 1);
4430
4431		sgl_len += 3;
4432		mbuf_wrs[i] = sgl_len <= wr_len ?
4433		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4434	}
4435
4436	wrlen = wr_len * 8;
4437}
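
/*
 * Worked example for the table above (illustrative, assuming a WR size
 * of wr_len = 9 flits): an mbuf chain with i = 4 buffers needs
 * (3 * 4) / 2 + (4 & 1) + 3 = 9 flits and fits in a single WR, while
 * i = 8 needs 15 flits and therefore 1 + (15 - 2) / (9 - 1) = 2 WRs.
 * Each pair of SGL entries costs 3 flits (two 8-byte addresses plus two
 * 4-byte lengths) and the extra 3 flits cover the fixed WR header.
 */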
4438
4439int
4440t3_init_cpl_io(void)
4441{
4442#ifdef notyet
4443	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4444	if (!tcphdr_skb) {
4445		log(LOG_ERR,
4446		       "Chelsio TCP offload: can't allocate sk_buff\n");
4447		return -1;
4448	}
4449	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4450	tcphdr_skb->h.raw = tcphdr_skb->data;
4451	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4452#endif
4453
4454	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4455	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4456	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4457	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4458	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4459	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4460	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4461	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4462	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4463	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4464	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4465	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4466	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4467	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4468	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4469	return (0);
4470}
4471
4472