cxgb_cpl_io.c revision 183550
1/**************************************************************************
2
3Copyright (c) 2007-2008, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183550 2008-10-02 15:37:58Z zec $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/sockbuf.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/protosw.h>
49#include <sys/priv.h>
50
51#if __FreeBSD_version >= 800044
52#include <sys/vimage.h>
53#else
54#define V_tcp_do_autosndbuf tcp_do_autosndbuf
55#define V_tcp_autosndbuf_max tcp_autosndbuf_max
56#define V_tcp_do_rfc1323 tcp_do_rfc1323
57#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
58#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
59#define V_tcpstat tcpstat
60#endif
61
62#include <net/if.h>
63#include <net/route.h>
64
65#include <netinet/in.h>
66#include <netinet/in_pcb.h>
67#include <netinet/in_systm.h>
68#include <netinet/in_var.h>
69
70
71#include <cxgb_osdep.h>
72#include <sys/mbufq.h>
73
74#include <netinet/ip.h>
75#include <netinet/tcp_var.h>
76#include <netinet/tcp_fsm.h>
77#include <netinet/tcp_offload.h>
78#include <netinet/tcp_seq.h>
79#include <netinet/tcp_syncache.h>
80#include <netinet/tcp_timer.h>
81#include <net/route.h>
82
83#include <t3cdev.h>
84#include <common/cxgb_firmware_exports.h>
85#include <common/cxgb_t3_cpl.h>
86#include <common/cxgb_tcb.h>
87#include <common/cxgb_ctl_defs.h>
88#include <cxgb_offload.h>
89#include <vm/vm.h>
90#include <vm/pmap.h>
91#include <machine/bus.h>
92#include <sys/mvec.h>
93#include <ulp/toecore/cxgb_toedev.h>
94#include <ulp/tom/cxgb_l2t.h>
95#include <ulp/tom/cxgb_defs.h>
96#include <ulp/tom/cxgb_tom.h>
97#include <ulp/tom/cxgb_t3_ddp.h>
98#include <ulp/tom/cxgb_toepcb.h>
99#include <ulp/tom/cxgb_tcp.h>
100#include <ulp/tom/cxgb_tcp_offload.h>
101
102/*
103 * For ULP connections HW may add headers, e.g., for digests, that aren't part
104 * of the messages sent by the host but that are part of the TCP payload and
105 * therefore consume TCP sequence space.  Tx connection parameters that
106 * operate in TCP sequence space are affected by the HW additions and need to
107 * compensate for them to accurately track TCP sequence numbers. This array
108 * contains the compensating extra lengths for ULP packets.  It is indexed by
109 * a packet's ULP submode.
110 */
111const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
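/*
 * Example (illustrative): a packet sent with ULP submode 3 carries
 * t3_ulp_extra_len[3] == 8 bytes of HW-inserted digest/header data, and
 * that length must be counted when advancing tx sequence numbers.
 */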
112
113#ifdef notyet
114/*
115 * This sk_buff holds a fake header-only TCP segment that we use whenever we
116 * need to exploit SW TCP functionality that expects TCP headers, such as
117 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
118 * CPUs without locking.
119 */
120static struct mbuf *tcphdr_mbuf __read_mostly;
121#endif
122
123/*
124 * Size of WRs in bytes.  Note that we assume all devices we are handling have
125 * the same WR size.
126 */
127static unsigned int wrlen __read_mostly;
128
129/*
130 * The number of WRs needed for an mbuf depends on the number of page fragments
131 * in the mbuf and whether it has any payload in its main body.  This maps the
132 * length of the gather list represented by an mbuf into the # of necessary WRs.
133 */
134static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
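/*
 * The credit count looked up here is stashed in the mbuf's
 * m_pkthdr.csum_data when the WR is built (see t3_push_frames()) so the
 * WR credits can be accounted for when the WR is acknowledged.
 */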
135
136/*
137 * Max receive window supported by HW in bytes.  Only a small part of it can
138 * be set through option0, the rest needs to be set through RX_DATA_ACK.
139 */
140#define MAX_RCV_WND ((1U << 27) - 1)
141
142/*
143 * Min receive window.  We want it to be large enough to accommodate receive
144 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
145 */
146#define MIN_RCV_WND (24 * 1024U)
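/*
 * INP_TOS extracts the inpcb's IP TOS byte with the low two (ECN) bits
 * shifted out, masked to the width of the TCB's TOS field (M_TOS).
 */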
147#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
148
149#define VALIDATE_SEQ 0
150#define VALIDATE_SOCK(so)
151#define DEBUG_WR 0
152
153#define TCP_TIMEWAIT	1
154#define TCP_CLOSE	2
155#define TCP_DROP	3
156
157extern int tcp_do_autorcvbuf;
158extern int tcp_do_autosndbuf;
159extern int tcp_autorcvbuf_max;
160extern int tcp_autosndbuf_max;
161
162static void t3_send_reset(struct toepcb *toep);
163static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
164static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
165static void handle_syncache_event(int event, void *arg);
166
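/*
 * Debug wrapper around sbappendstream_locked(): walk the mbuf chains
 * already in the sockbuf and the chain being appended, sanity-checking
 * external storage types and next pointers, then append and re-walk.
 */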
167static inline void
168SBAPPEND(struct sockbuf *sb, struct mbuf *n)
169{
170	struct mbuf *m;
171
172	m = sb->sb_mb;
173	while (m) {
174		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
175		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
176			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
177		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
178			m->m_next, m->m_nextpkt, m->m_flags));
179		m = m->m_next;
180	}
181	m = n;
182	while (m) {
183		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
184		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
185			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
186		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
187			m->m_next, m->m_nextpkt, m->m_flags));
188		m = m->m_next;
189	}
190	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
191	sbappendstream_locked(sb, n);
192	m = sb->sb_mb;
193
194	while (m) {
195		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
196			m->m_next, m->m_nextpkt, m->m_flags));
197		m = m->m_next;
198	}
199}
200
201static inline int
202is_t3a(const struct toedev *dev)
203{
204	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
205}
206
207static void
208dump_toepcb(struct toepcb *toep)
209{
210	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
211	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
212	    toep->tp_mtu_idx, toep->tp_tid);
213
214	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
215	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
216	    toep->tp_mss_clamp, toep->tp_flags);
217}
218
219#ifndef RTALLOC2_DEFINED
220static struct rtentry *
221rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
222{
223	struct rtentry *rt = NULL;
224
225	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
226		RT_UNLOCK(rt);
227
228	return (rt);
229}
230#endif
231
232/*
233 * Determine whether to send a CPL message now or defer it.  A message is
234 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
235 * For connections in other states the message is sent immediately.
236 * If through_l2t is set the message is subject to ARP processing, otherwise
237 * it is sent directly.
238 */
239static inline void
240send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
241{
242	struct tcpcb *tp = toep->tp_tp;
243
244	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
245		inp_wlock(tp->t_inpcb);
246		mbufq_tail(&toep->out_of_order_queue, m);  // defer
247		inp_wunlock(tp->t_inpcb);
248	} else if (through_l2t)
249		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
250	else
251		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
252}
253
254static inline unsigned int
255mkprio(unsigned int cntrl, const struct toepcb *toep)
256{
257	return (cntrl);
258}
259
260/*
261 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
262 */
263static inline void
264mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
265{
266	struct cpl_tid_release *req;
267
268	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
269	m->m_pkthdr.len = m->m_len = sizeof(*req);
270	req = mtod(m, struct cpl_tid_release *);
271	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
272	req->wr.wr_lo = 0;
273	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
274}
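/*
 * Illustrative usage (the actual call sites live elsewhere in the TOM
 * code): allocate an mbuf sized for a struct cpl_tid_release, fill it
 * with mk_tid_release(m, toep, tid), and queue it via cxgb_ofld_send().
 */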
275
276static inline void
277make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
278{
279	INIT_VNET_INET(so->so_vnet);
280	struct tcpcb *tp = so_sototcpcb(so);
281	struct toepcb *toep = tp->t_toe;
282	struct tx_data_wr *req;
283	struct sockbuf *snd;
284
285	inp_lock_assert(tp->t_inpcb);
286	snd = so_sockbuf_snd(so);
287
288	req = mtod(m, struct tx_data_wr *);
289	m->m_len = sizeof(*req);
290	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
291	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
292	/* len includes the length of any HW ULP additions */
293	req->len = htonl(len);
294	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
295	/* V_TX_ULP_SUBMODE sets both the mode and submode */
296	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
297	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
298	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
299				   (tail ? 0 : 1))));
300	req->sndseq = htonl(tp->snd_nxt);
301	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
302		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
303				    V_TX_CPU_IDX(toep->tp_qset));
304
305		/* The send buffer size is passed to the HW in units of 32KB,
306		 * hence the >> 15 below. */
307		if (V_tcp_do_autosndbuf && (snd->sb_flags & SB_AUTOSIZE))
308			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
309		else {
310			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
311		}
312
313		toep->tp_flags |= TP_DATASENT;
314	}
315}
316
317#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
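/*
 * Payloads of at most IMM_LEN bytes are copied straight into the work
 * request as immediate data rather than being described by a gather
 * list; see the tail->m_len <= IMM_LEN branch in t3_push_frames().
 */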
318
319int
320t3_push_frames(struct socket *so, int req_completion)
321{
322	struct tcpcb *tp = so_sototcpcb(so);
323	struct toepcb *toep = tp->t_toe;
324
325	struct mbuf *tail, *m0, *last;
326	struct t3cdev *cdev;
327	struct tom_data *d;
328	int state, bytes, count, total_bytes;
329	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
330	struct sockbuf *snd;
331
332	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
333		DPRINTF("tcp state=%d\n", tp->t_state);
334		return (0);
335	}
336
337	state = so_state_get(so);
338
339	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
340		DPRINTF("disconnecting\n");
341
342		return (0);
343	}
344
345	inp_lock_assert(tp->t_inpcb);
346
347	snd = so_sockbuf_snd(so);
348	sockbuf_lock(snd);
349
350	d = TOM_DATA(toep->tp_toedev);
351	cdev = d->cdev;
352
353	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
354
355	total_bytes = 0;
356	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
357	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
358
359	if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
360		KASSERT(tail, ("sbdrop error"));
361		last = tail = tail->m_next;
362	}
363
364	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
365		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
366		sockbuf_unlock(snd);
367
368		return (0);
369	}
370
371	toep->tp_m_last = NULL;
372	while (toep->tp_wr_avail && (tail != NULL)) {
373		count = bytes = 0;
374		segp = segs;
375		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
376			sockbuf_unlock(snd);
377			return (0);
378		}
379		/*
380		 * If the data in tail fits as in-line, then
381		 * make an immediate data wr.
382		 */
383		if (tail->m_len <= IMM_LEN) {
384			count = 1;
385			bytes = tail->m_len;
386			last = tail;
387			tail = tail->m_next;
388			m_set_sgl(m0, NULL);
389			m_set_sgllen(m0, 0);
390			make_tx_data_wr(so, m0, bytes, tail);
391			m_append(m0, bytes, mtod(last, caddr_t));
392			KASSERT(!m0->m_next, ("bad append"));
393		} else {
394			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
395			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
396				bytes += tail->m_len;
397				last = tail;
398				count++;
399				/*
400				 * technically an abuse to be using this for a VA
401				 * but less gross than defining my own structure
402				 * or calling pmap_kextract from here :-|
403				 */
404				segp->ds_addr = (bus_addr_t)tail->m_data;
405				segp->ds_len = tail->m_len;
406				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
407				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
408				segp++;
409				tail = tail->m_next;
410			}
411			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
412			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
413
414			m_set_sgl(m0, segs);
415			m_set_sgllen(m0, count);
416			make_tx_data_wr(so, m0, bytes, tail);
417		}
418		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
419
420		if (tail) {
421			snd->sb_sndptr = tail;
422			toep->tp_m_last = NULL;
423		} else
424			toep->tp_m_last = snd->sb_sndptr = last;
425
426
427		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
428
429		snd->sb_sndptroff += bytes;
430		total_bytes += bytes;
431		toep->tp_write_seq += bytes;
432		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
433		    " tail=%p sndptr=%p sndptroff=%d",
434		    toep->tp_wr_avail, count, mbuf_wrs[count],
435		    tail, snd->sb_sndptr, snd->sb_sndptroff);
436		if (tail)
437			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
438			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
439			    total_bytes, toep->tp_m_last, tail->m_data,
440			    tp->snd_una);
441		else
442			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
443			    " tp_m_last=%p snd_una=0x%08x",
444			    total_bytes, toep->tp_m_last, tp->snd_una);
445
446
447#ifdef KTR
448{
449		int i;
450
451		i = 0;
452		while (i < count && m_get_sgllen(m0)) {
453			if ((count - i) >= 3) {
454				CTR6(KTR_TOM,
455				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
456				    " len=%d pa=0x%zx len=%d",
457				    segs[i].ds_addr, segs[i].ds_len,
458				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
459				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
460				i += 3;
461			} else if ((count - i) == 2) {
462				CTR4(KTR_TOM,
463				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
464				    " len=%d",
465				    segs[i].ds_addr, segs[i].ds_len,
466				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
467				i += 2;
468			} else {
469				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
470				    segs[i].ds_addr, segs[i].ds_len);
471				i++;
472			}
473
474		}
475}
476#endif
477		/*
478		 * Remember the credits consumed by this WR.
479		 */
480		m0->m_pkthdr.csum_data = mbuf_wrs[count];
481		m0->m_pkthdr.len = bytes;
482		toep->tp_wr_avail -= mbuf_wrs[count];
483		toep->tp_wr_unacked += mbuf_wrs[count];
484
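		/*
		 * Ask the firmware for a completion (F_WR_COMPL) when the
		 * caller requested one and this WR is the only unacked WR,
		 * or once half of the WR credits are outstanding, so that
		 * credits flow back before the connection runs dry.
		 */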
485		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
486		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
487			struct work_request_hdr *wr = cplhdr(m0);
488
489			wr->wr_hi |= htonl(F_WR_COMPL);
490			toep->tp_wr_unacked = 0;
491		}
492		KASSERT((m0->m_pkthdr.csum_data > 0) &&
493		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
494			m0->m_pkthdr.csum_data));
495		m0->m_type = MT_DONTFREE;
496		enqueue_wr(toep, m0);
497		DPRINTF("sending offload tx with %d bytes in %d segments\n",
498		    bytes, count);
499		l2t_send(cdev, m0, toep->tp_l2t);
500	}
501	sockbuf_unlock(snd);
502	return (total_bytes);
503}
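/*
 * t3_push_frames() is called from the TOE send hook (cxgb_toe_send()
 * below) and from close_conn(), which pushes out any remaining send
 * buffer data before issuing CPL_CLOSE_CON_REQ.
 */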
504
505/*
506 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
507 * under any circumstances.  We take the easy way out and always queue the
508 * message to the write_queue.  We can optimize the case where the queue is
509 * already empty though the optimization is probably not worth it.
510 */
511static void
512close_conn(struct socket *so)
513{
514	struct mbuf *m;
515	struct cpl_close_con_req *req;
516	struct tom_data *d;
517	struct inpcb *inp = so_sotoinpcb(so);
518	struct tcpcb *tp;
519	struct toepcb *toep;
520	unsigned int tid;
521
522
523	inp_wlock(inp);
524	tp = so_sototcpcb(so);
525	toep = tp->t_toe;
526
527	if (tp->t_state != TCPS_SYN_SENT)
528		t3_push_frames(so, 1);
529
530	if (toep->tp_flags & TP_FIN_SENT) {
531		inp_wunlock(inp);
532		return;
533	}
534
535	tid = toep->tp_tid;
536
537	d = TOM_DATA(toep->tp_toedev);
538
539	m = m_gethdr_nofail(sizeof(*req));
540	m_set_priority(m, CPL_PRIORITY_DATA);
541	m_set_sgl(m, NULL);
542	m_set_sgllen(m, 0);
543
544	toep->tp_flags |= TP_FIN_SENT;
545	req = mtod(m, struct cpl_close_con_req *);
546
547	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
548	req->wr.wr_lo = htonl(V_WR_TID(tid));
549	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
550	req->rsvd = 0;
551	inp_wunlock(inp);
552	/*
553	 * XXX - need to defer shutdown while there is still data in the queue
554	 *
555	 */
556	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
557	cxgb_ofld_send(d->cdev, m);
558
559}
560
561/*
562 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
563 * and send it along.
564 */
565static void
566abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
567{
568	struct cpl_abort_req *req = cplhdr(m);
569
570	req->cmd = CPL_ABORT_NO_RST;
571	cxgb_ofld_send(cdev, m);
572}
573
574/*
575 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
576 * permitted to return without sending the message in case we cannot allocate
577 * an mbuf.  Returns the number of credits sent.
578 */
579uint32_t
580t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
581{
582	struct mbuf *m;
583	struct cpl_rx_data_ack *req;
584	struct toepcb *toep = tp->t_toe;
585	struct toedev *tdev = toep->tp_toedev;
586
587	m = m_gethdr_nofail(sizeof(*req));
588
589	DPRINTF("returning %u credits to HW\n", credits);
590
591	req = mtod(m, struct cpl_rx_data_ack *);
592	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
593	req->wr.wr_lo = 0;
594	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
595	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
596	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
597	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
598	return (credits);
599}
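/*
 * Callers are expected to advance their rcv_wup bookkeeping by the
 * returned credit count, as t3_cleanup_rbuf() does below.
 */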
600
601/*
602 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
603 * This is only used in DDP mode, so we take the opportunity to also set the
604 * DACK mode and flush any Rx credits.
605 */
606void
607t3_send_rx_modulate(struct toepcb *toep)
608{
609	struct mbuf *m;
610	struct cpl_rx_data_ack *req;
611
612	m = m_gethdr_nofail(sizeof(*req));
613
614	req = mtod(m, struct cpl_rx_data_ack *);
615	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
616	req->wr.wr_lo = 0;
617	m->m_pkthdr.len = m->m_len = sizeof(*req);
618
619	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
620	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
621				 V_RX_DACK_MODE(1) |
622				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
623	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
624	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
625	toep->tp_rcv_wup = toep->tp_copied_seq;
626}
627
628/*
629 * Handle receipt of an urgent pointer.
630 */
631static void
632handle_urg_ptr(struct socket *so, uint32_t urg_seq)
633{
634#ifdef URGENT_DATA_SUPPORTED
635	struct tcpcb *tp = so_sototcpcb(so);
636
637	urg_seq--;   /* initially points past the urgent data, per BSD */
638
639	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
640		return;                                 /* duplicate pointer */
641	sk_send_sigurg(sk);
642	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
643	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
644		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
645
646		tp->copied_seq++;
647		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
648			tom_eat_skb(sk, skb, 0);
649	}
650	tp->urg_data = TCP_URG_NOTYET;
651	tp->urg_seq = urg_seq;
652#endif
653}
654
655/*
656 * Returns true if a socket cannot accept new Rx data.
657 */
658static inline int
659so_no_receive(const struct socket *so)
660{
661	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
662}
663
664/*
665 * Process an urgent data notification.
666 */
667static void
668rx_urg_notify(struct toepcb *toep, struct mbuf *m)
669{
670	struct cpl_rx_urg_notify *hdr = cplhdr(m);
671	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
672
673	VALIDATE_SOCK(so);
674
675	if (!so_no_receive(so))
676		handle_urg_ptr(so, ntohl(hdr->seq));
677
678	m_freem(m);
679}
680
681/*
682 * Handler for RX_URG_NOTIFY CPL messages.
683 */
684static int
685do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
686{
687	struct toepcb *toep = (struct toepcb *)ctx;
688
689	rx_urg_notify(toep, m);
690	return (0);
691}
692
693static __inline int
694is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
695{
696	return (toep->tp_ulp_mode ||
697		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
698		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
699}
700
701/*
702 * Set of states for which we should return RX credits.
703 */
704#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
705
706/*
707 * Called after some received data has been read.  It returns RX credits
708 * to the HW for the amount of data processed.
709 */
710void
711t3_cleanup_rbuf(struct tcpcb *tp, int copied)
712{
713	struct toepcb *toep = tp->t_toe;
714	struct socket *so;
715	struct toedev *dev;
716	int dack_mode, must_send, read;
717	u32 thres, credits, dack = 0;
718	struct sockbuf *rcv;
719
720	so = inp_inpcbtosocket(tp->t_inpcb);
721	rcv = so_sockbuf_rcv(so);
722
723	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
724		(tp->t_state == TCPS_FIN_WAIT_2))) {
725		if (copied) {
726			sockbuf_lock(rcv);
727			toep->tp_copied_seq += copied;
728			sockbuf_unlock(rcv);
729		}
730
731		return;
732	}
733
734	inp_lock_assert(tp->t_inpcb);
735
736	sockbuf_lock(rcv);
737	if (copied)
738		toep->tp_copied_seq += copied;
739	else {
740		read = toep->tp_enqueued_bytes - rcv->sb_cc;
741		toep->tp_copied_seq += read;
742	}
743	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
744	toep->tp_enqueued_bytes = rcv->sb_cc;
745	sockbuf_unlock(rcv);
746
747	if (credits > rcv->sb_mbmax) {
748		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
749		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
750		credits = rcv->sb_mbmax;
751	}
752
753
754	/*
755	 * XXX this won't accurately reflect credit return - we need
756	 * to look at the difference between the amount that has been
757	 * put in the recv sockbuf and what is there now
758	 */
759
760	if (__predict_false(!credits))
761		return;
762
763	dev = toep->tp_toedev;
764	thres = TOM_TUNABLE(dev, rx_credit_thres);
765
766	if (__predict_false(thres == 0))
767		return;
768
769	if (is_delack_mode_valid(dev, toep)) {
770		dack_mode = TOM_TUNABLE(dev, delack);
771		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
772			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
773
774			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
775				dack = F_RX_DACK_CHANGE |
776				       V_RX_DACK_MODE(dack_mode);
777		}
778	} else
779		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
780
781	/*
782	 * For coalescing to work effectively ensure the receive window has
783	 * at least 16KB left.
784	 */
785	must_send = credits + 16384 >= tp->rcv_wnd;
786
787	if (must_send || credits >= thres)
788		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
789}
790
791static int
792cxgb_toe_disconnect(struct tcpcb *tp)
793{
794	struct socket *so;
795
796	DPRINTF("cxgb_toe_disconnect\n");
797
798	so = inp_inpcbtosocket(tp->t_inpcb);
799	close_conn(so);
800	return (0);
801}
802
803static int
804cxgb_toe_reset(struct tcpcb *tp)
805{
806	struct toepcb *toep = tp->t_toe;
807
808	t3_send_reset(toep);
809
810	/*
811	 * unhook from socket
812	 */
813	tp->t_flags &= ~TF_TOE;
814	toep->tp_tp = NULL;
815	tp->t_toe = NULL;
816	return (0);
817}
818
819static int
820cxgb_toe_send(struct tcpcb *tp)
821{
822	struct socket *so;
823
824	DPRINTF("cxgb_toe_send\n");
825	dump_toepcb(tp->t_toe);
826
827	so = inp_inpcbtosocket(tp->t_inpcb);
828	t3_push_frames(so, 1);
829	return (0);
830}
831
832static int
833cxgb_toe_rcvd(struct tcpcb *tp)
834{
835
836	inp_lock_assert(tp->t_inpcb);
837
838	t3_cleanup_rbuf(tp, 0);
839
840	return (0);
841}
842
843static void
844cxgb_toe_detach(struct tcpcb *tp)
845{
846	struct toepcb *toep;
847
848	/*
849	 * XXX how do we handle teardown in the SYN_SENT state?
850	 */
852	inp_lock_assert(tp->t_inpcb);
853	toep = tp->t_toe;
854	toep->tp_tp = NULL;
855
856	/*
857	 * unhook from socket
858	 */
859	tp->t_flags &= ~TF_TOE;
860	tp->t_toe = NULL;
861}
862
863
864static struct toe_usrreqs cxgb_toe_usrreqs = {
865	.tu_disconnect = cxgb_toe_disconnect,
866	.tu_reset = cxgb_toe_reset,
867	.tu_send = cxgb_toe_send,
868	.tu_rcvd = cxgb_toe_rcvd,
869	.tu_detach = cxgb_toe_detach,
871	.tu_syncache_event = handle_syncache_event,
872};
873
874
875static void
876__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
877			    uint64_t mask, uint64_t val, int no_reply)
878{
879	struct cpl_set_tcb_field *req;
880
881	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
882	    toep->tp_tid, word, mask, val);
883
884	req = mtod(m, struct cpl_set_tcb_field *);
885	m->m_pkthdr.len = m->m_len = sizeof(*req);
886	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
887	req->wr.wr_lo = 0;
888	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
889	req->reply = V_NO_REPLY(no_reply);
890	req->cpu_idx = 0;
891	req->word = htons(word);
892	req->mask = htobe64(mask);
893	req->val = htobe64(val);
894
895	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
896	send_or_defer(toep, m, 0);
897}
898
899static void
900t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
901{
902	struct mbuf *m;
903	struct tcpcb *tp;
904
905	if (toep == NULL)
906		return;
907	tp = toep->tp_tp;
908	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
909		printf("not setting field\n");
910		return;
911	}
912
913	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
914
915	__set_tcb_field(toep, m, word, mask, val, 1);
916}
917
918/*
919 * Set one of the t_flags bits in the TCB.
920 */
921static void
922set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
923{
924
925	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
926}
927
928/*
929 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
930 */
931static void
932t3_set_nagle(struct toepcb *toep)
933{
934	struct tcpcb *tp = toep->tp_tp;
935
936	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
937}
938
939/*
940 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
941 */
942void
943t3_set_keepalive(struct toepcb *toep, int on_off)
944{
945
946	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
947}
948
949void
950t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
951{
952	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
953}
954
955void
956t3_set_dack_mss(struct toepcb *toep, int on_off)
957{
958
959	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
960}
961
962/*
963 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
964 */
965static void
966t3_set_tos(struct toepcb *toep)
967{
968	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
969
970	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
971			 V_TCB_TOS(tos));
972}
973
974
975/*
976 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
977 * DDP is disabled (data is delivered to the freelist).  [Note that the peer
978 * should set the PSH bit in the last segment, which would trigger delivery.]
979 * We work around the issue by setting a DDP buffer in a partially placed state,
980 * which guarantees that TP will schedule a timer.
981 */
982#define TP_DDP_TIMER_WORKAROUND_MASK\
983    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
984     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
985       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
986#define TP_DDP_TIMER_WORKAROUND_VAL\
987    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
988     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
989      32))
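/*
 * The VAL bits above mark DDP buffer 0 valid but inactive with a nonzero
 * offset (1) into a 2-byte buffer, i.e. a partially placed buffer, which
 * is the state that forces TP to schedule its push timer.
 */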
990
991static void
992t3_enable_ddp(struct toepcb *toep, int on)
993{
994	if (on) {
995
996		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
997				 V_TF_DDP_OFF(0));
998	} else
999		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
1000				 V_TF_DDP_OFF(1) |
1001				 TP_DDP_TIMER_WORKAROUND_MASK,
1002				 V_TF_DDP_OFF(1) |
1003				 TP_DDP_TIMER_WORKAROUND_VAL);
1004
1005}
1006
1007void
1008t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1009{
1010	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1011			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1012			 tag_color);
1013}
1014
1015void
1016t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1017		    unsigned int len)
1018{
1019	if (buf_idx == 0)
1020		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1021			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1022			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1023			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1024			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1025	else
1026		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1027			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1028			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1029			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1030			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1031}
1032
1033static int
1034t3_set_cong_control(struct socket *so, const char *name)
1035{
1036#ifdef CONGESTION_CONTROL_SUPPORTED
1037	int cong_algo;
1038
1039	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1040		if (!strcmp(name, t3_cong_ops[cong_algo].name))
1041			break;
1042
1043	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1044		return (EINVAL);
1045#endif
1046	return 0;
1047}
1048
1049int
1050t3_get_tcb(struct toepcb *toep)
1051{
1052	struct cpl_get_tcb *req;
1053	struct tcpcb *tp = toep->tp_tp;
1054	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1055
1056	if (!m)
1057		return (ENOMEM);
1058
1059	inp_lock_assert(tp->t_inpcb);
1060	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1061	req = mtod(m, struct cpl_get_tcb *);
1062	m->m_pkthdr.len = m->m_len = sizeof(*req);
1063	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1064	req->wr.wr_lo = 0;
1065	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1066	req->cpuno = htons(toep->tp_qset);
1067	req->rsvd = 0;
1068	if (tp->t_state == TCPS_SYN_SENT)
1069		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1070	else
1071		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1072	return 0;
1073}
1074
1075static inline void
1076so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1077{
1078
1079	toepcb_hold(toep);
1080
1081	cxgb_insert_tid(d->cdev, d->client, toep, tid);
1082}
1083
1084/**
1085 *	find_best_mtu - find the entry in the MTU table closest to an MTU
1086 *	@d: TOM state
1087 *	@mtu: the target MTU
1088 *
1089 *	Returns the index of the value in the MTU table that is closest to but
1090 *	does not exceed the target MTU.
1091 */
1092static unsigned int
1093find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1094{
1095	int i = 0;
1096
1097	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1098		++i;
1099	return (i);
1100}
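/*
 * Example (illustrative values): with an MTU table of { 1500, 4420, 9000 }
 * a target MTU of 4000 yields index 0, since 4420 already exceeds the
 * target; a target of 9600 yields index 2.
 */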
1101
1102static unsigned int
1103select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1104{
1105	unsigned int idx;
1106
1107#ifdef notyet
1108	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1109#endif
1110	if (tp) {
1111		tp->t_maxseg = pmtu - 40;
1112		if (tp->t_maxseg < td->mtus[0] - 40)
1113			tp->t_maxseg = td->mtus[0] - 40;
1114		idx = find_best_mtu(td, tp->t_maxseg + 40);
1115
1116		tp->t_maxseg = td->mtus[idx] - 40;
1117	} else
1118		idx = find_best_mtu(td, pmtu);
1119
1120	return (idx);
1121}
1122
1123static inline void
1124free_atid(struct t3cdev *cdev, unsigned int tid)
1125{
1126	struct toepcb *toep = cxgb_free_atid(cdev, tid);
1127
1128	if (toep)
1129		toepcb_release(toep);
1130}
1131
1132/*
1133 * Release resources held by an offload connection (TID, L2T entry, etc.)
1134 */
1135static void
1136t3_release_offload_resources(struct toepcb *toep)
1137{
1138	struct tcpcb *tp = toep->tp_tp;
1139	struct toedev *tdev = toep->tp_toedev;
1140	struct t3cdev *cdev;
1141	struct socket *so;
1142	unsigned int tid = toep->tp_tid;
1143	struct sockbuf *rcv;
1144
1145	CTR0(KTR_TOM, "t3_release_offload_resources");
1146
1147	if (!tdev)
1148		return;
1149
1150	cdev = TOEP_T3C_DEV(toep);
1151	if (!cdev)
1152		return;
1153
1154	toep->tp_qset = 0;
1155	t3_release_ddp_resources(toep);
1156
1157#ifdef CTRL_SKB_CACHE
1158	kfree_skb(CTRL_SKB_CACHE(tp));
1159	CTRL_SKB_CACHE(tp) = NULL;
1160#endif
1161
1162	if (toep->tp_wr_avail != toep->tp_wr_max) {
1163		purge_wr_queue(toep);
1164		reset_wr_list(toep);
1165	}
1166
1167	if (toep->tp_l2t) {
1168		l2t_release(L2DATA(cdev), toep->tp_l2t);
1169		toep->tp_l2t = NULL;
1170	}
1171	toep->tp_tp = NULL;
1172	if (tp) {
1173		inp_lock_assert(tp->t_inpcb);
1174		so = inp_inpcbtosocket(tp->t_inpcb);
1175		rcv = so_sockbuf_rcv(so);
1176		/*
1177		 * cancel any offloaded reads
1178		 *
1179		 */
1180		sockbuf_lock(rcv);
1181		tp->t_toe = NULL;
1182		tp->t_flags &= ~TF_TOE;
1183		if (toep->tp_ddp_state.user_ddp_pending) {
1184			t3_cancel_ubuf(toep, rcv);
1185			toep->tp_ddp_state.user_ddp_pending = 0;
1186		}
1187		so_sorwakeup_locked(so);
1188
1189	}
1190
1191	if (toep->tp_state == TCPS_SYN_SENT) {
1192		free_atid(cdev, tid);
1193#ifdef notyet
1194		__skb_queue_purge(&tp->out_of_order_queue);
1195#endif
1196	} else {                                          // we have TID
1197		cxgb_remove_tid(cdev, toep, tid);
1198		toepcb_release(toep);
1199	}
1200#if 0
1201	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1202#endif
1203}
1204
1205static void
1206install_offload_ops(struct socket *so)
1207{
1208	struct tcpcb *tp = so_sototcpcb(so);
1209
1210	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1211
1212	t3_install_socket_ops(so);
1213	tp->t_flags |= TF_TOE;
1214	tp->t_tu = &cxgb_toe_usrreqs;
1215}
1216
1217/*
1218 * Determine the receive window scaling factor given a target max
1219 * receive window.
1220 */
1221static __inline int
1222select_rcv_wscale(int space)
1223{
1224	INIT_VNET_INET(so->so_vnet);
1225	int wscale = 0;
1226
1227	if (space > MAX_RCV_WND)
1228		space = MAX_RCV_WND;
1229
1230	if (V_tcp_do_rfc1323)
1231		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1232
1233	return (wscale);
1234}
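/*
 * Worked example: space = 256KB halves to 131072, 65536, 32768 before
 * dropping to 65535 or below, so the computed window scale is 3.
 */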
1235
1236/*
1237 * Determine the receive window size for a socket.
1238 */
1239static unsigned long
1240select_rcv_wnd(struct toedev *dev, struct socket *so)
1241{
1242	INIT_VNET_INET(so->so_vnet);
1243	struct tom_data *d = TOM_DATA(dev);
1244	unsigned int wnd;
1245	unsigned int max_rcv_wnd;
1246	struct sockbuf *rcv;
1247
1248	rcv = so_sockbuf_rcv(so);
1249
1250	if (V_tcp_do_autorcvbuf)
1251		wnd = V_tcp_autorcvbuf_max;
1252	else
1253		wnd = rcv->sb_hiwat;
1254
1257	/* XXX
1258	 * For receive coalescing to work effectively we need a receive window
1259	 * that can accommodate a coalesced segment.
1260	 */
1261	if (wnd < MIN_RCV_WND)
1262		wnd = MIN_RCV_WND;
1263
1264	/* PR 5138 */
1265	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1266				    (uint32_t)d->rx_page_size * 23 :
1267				    MAX_RCV_WND);
1268
1269	return min(wnd, max_rcv_wnd);
1270}
1271
1272/*
1273 * Assign offload parameters to some socket fields.  This code is used by
1274 * both active and passive opens.
1275 */
1276static inline void
1277init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1278    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1279{
1280	struct tcpcb *tp = so_sototcpcb(so);
1281	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1282	struct sockbuf *snd, *rcv;
1283
1284#ifdef notyet
1285	SOCK_LOCK_ASSERT(so);
1286#endif
1287
1288	snd = so_sockbuf_snd(so);
1289	rcv = so_sockbuf_rcv(so);
1290
1291	log(LOG_INFO, "initializing offload socket\n");
1292	/*
1293	 * We either need to fix push frames to work with sbcompress
1294	 * or we need to add this
1295	 */
1296	snd->sb_flags |= SB_NOCOALESCE;
1297	rcv->sb_flags |= SB_NOCOALESCE;
1298
1299	tp->t_toe = toep;
1300	toep->tp_tp = tp;
1301	toep->tp_toedev = dev;
1302
1303	toep->tp_tid = tid;
1304	toep->tp_l2t = e;
1305	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1306	toep->tp_wr_unacked = 0;
1307	toep->tp_delack_mode = 0;
1308
1309	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1310	/*
1311	 * XXX broken
1312	 *
1313	 */
1314	tp->rcv_wnd = select_rcv_wnd(dev, so);
1315
1316	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1317	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1318	toep->tp_qset_idx = 0;
1319
1320	reset_wr_list(toep);
1321	DPRINTF("initialization done\n");
1322}
1323
1324/*
1325 * The next two functions calculate the option 0 value for a socket.
1326 */
1327static inline unsigned int
1328calc_opt0h(struct socket *so, int mtu_idx)
1329{
1330	struct tcpcb *tp = so_sototcpcb(so);
1331	int wscale = select_rcv_wscale(so, tp->rcv_wnd);
1332
1333	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1334	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1335	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1336}
1337
1338static inline unsigned int
1339calc_opt0l(struct socket *so, int ulp_mode)
1340{
1341	struct tcpcb *tp = so_sototcpcb(so);
1342	unsigned int val;
1343
1344	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1345	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1346
1347	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1348	return (val);
1349}
1350
1351static inline unsigned int
1352calc_opt2(const struct socket *so, struct toedev *dev)
1353{
1354	int flv_valid;
1355
1356	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1357
1358	return (V_FLAVORS_VALID(flv_valid) |
1359	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1360}
1361
1362#if DEBUG_WR > 1
1363static int
1364count_pending_wrs(const struct toepcb *toep)
1365{
1366	const struct mbuf *m;
1367	int n = 0;
1368
1369	wr_queue_walk(toep, m)
1370		n += m->m_pkthdr.csum_data;
1371	return (n);
1372}
1373#endif
1374
1375#if 0
1376(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1377#endif
1378
1379static void
1380mk_act_open_req(struct socket *so, struct mbuf *m,
1381    unsigned int atid, const struct l2t_entry *e)
1382{
1383	struct cpl_act_open_req *req;
1384	struct inpcb *inp = so_sotoinpcb(so);
1385	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1386	struct toepcb *toep = tp->t_toe;
1387	struct toedev *tdev = toep->tp_toedev;
1388
1389	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1390
1391	req = mtod(m, struct cpl_act_open_req *);
1392	m->m_pkthdr.len = m->m_len = sizeof(*req);
1393
1394	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1395	req->wr.wr_lo = 0;
1396	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1397	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1398#if 0
1399	req->local_port = inp->inp_lport;
1400	req->peer_port = inp->inp_fport;
1401	memcpy(&req->local_ip, &inp->inp_laddr, 4);
1402	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1403#endif
1404	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1405			   V_TX_CHANNEL(e->smt_idx));
1406	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1407	req->params = 0;
1408	req->opt2 = htonl(calc_opt2(so, tdev));
1409}
1410
1411
1412/*
1413 * Convert an ACT_OPEN_RPL status to an errno.
1414 */
1415static int
1416act_open_rpl_status_to_errno(int status)
1417{
1418	switch (status) {
1419	case CPL_ERR_CONN_RESET:
1420		return (ECONNREFUSED);
1421	case CPL_ERR_ARP_MISS:
1422		return (EHOSTUNREACH);
1423	case CPL_ERR_CONN_TIMEDOUT:
1424		return (ETIMEDOUT);
1425	case CPL_ERR_TCAM_FULL:
1426		return (ENOMEM);
1427	case CPL_ERR_CONN_EXIST:
1428		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1429		return (EADDRINUSE);
1430	default:
1431		return (EIO);
1432	}
1433}
1434
1435static void
1436fail_act_open(struct toepcb *toep, int errno)
1437{
1438	struct tcpcb *tp = toep->tp_tp;
1439
1440	t3_release_offload_resources(toep);
1441	if (tp) {
1442		inp_wunlock(tp->t_inpcb);
1443		tcp_offload_drop(tp, errno);
1444	}
1445
1446#ifdef notyet
1447	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1448#endif
1449}
1450
1451/*
1452 * Handle active open failures.
1453 */
1454static void
1455active_open_failed(struct toepcb *toep, struct mbuf *m)
1456{
1457	struct cpl_act_open_rpl *rpl = cplhdr(m);
1458	struct inpcb *inp;
1459
1460	if (toep->tp_tp == NULL)
1461		goto done;
1462
1463	inp = toep->tp_tp->t_inpcb;
1464
1465	/*
1466	 * Don't handle connection retry for now.
1467	 */
1468#ifdef notyet
1469	struct inet_connection_sock *icsk = inet_csk(sk);
1470
1471	if (rpl->status == CPL_ERR_CONN_EXIST &&
1472	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1473		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1474		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1475			       jiffies + HZ / 2);
1476	} else
1477#endif
1478	{
1479		inp_wlock(inp);
1480		/*
1481		 * drops the inpcb lock
1482		 */
1483		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1484	}
1485
1486	done:
1487	m_free(m);
1488}
1489
1490/*
1491 * Return whether a failed active open has allocated a TID
1492 */
1493static inline int
1494act_open_has_tid(int status)
1495{
1496	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1497	       status != CPL_ERR_ARP_MISS;
1498}
1499
1500/*
1501 * Process an ACT_OPEN_RPL CPL message.
1502 */
1503static int
1504do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1505{
1506	struct toepcb *toep = (struct toepcb *)ctx;
1507	struct cpl_act_open_rpl *rpl = cplhdr(m);
1508
1509	if (cdev->type != T3A && act_open_has_tid(rpl->status))
1510		cxgb_queue_tid_release(cdev, GET_TID(rpl));
1511
1512	active_open_failed(toep, m);
1513	return (0);
1514}
1515
1516/*
1517 * Handle an ARP failure for an active open.   XXX purge ofo queue
1518 *
1519 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1520 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1521 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
1522 * free the atid.  Hmm.
1523 */
1524#ifdef notyet
1525static void
1526act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1527{
1528	struct toepcb *toep = m_get_toep(m);
1529	struct tcpcb *tp = toep->tp_tp;
1530	struct inpcb *inp = tp->t_inpcb;
1532
1533	inp_wlock(inp);
1534	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1535		/*
1536		 * drops the inpcb lock
1537		 */
1538		fail_act_open(toep, EHOSTUNREACH);
1539		printf("freeing %p\n", m);
1540
1541		m_free(m);
1542	} else
1543		inp_wunlock(inp);
1544}
1545#endif
1546/*
1547 * Send an active open request.
1548 */
1549int
1550t3_connect(struct toedev *tdev, struct socket *so,
1551    struct rtentry *rt, struct sockaddr *nam)
1552{
1553	struct mbuf *m;
1554	struct l2t_entry *e;
1555	struct tom_data *d = TOM_DATA(tdev);
1556	struct inpcb *inp = so_sotoinpcb(so);
1557	struct tcpcb *tp = intotcpcb(inp);
1558	struct toepcb *toep; /* allocated by init_offload_socket */
1559
1560	int atid;
1561
1562	toep = toepcb_alloc();
1563	if (toep == NULL)
1564		goto out_err;
1565
1566	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1567		goto out_err;
1568
1569	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1570	if (!e)
1571		goto free_tid;
1572
1573	inp_lock_assert(inp);
1574	m = m_gethdr(M_WAITOK, MT_DATA);
1575
1576#if 0
1577	m->m_toe.mt_toepcb = tp->t_toe;
1578	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1579#endif
1580	so_lock(so);
1581
1582	init_offload_socket(so, tdev, atid, e, rt, toep);
1583
1584	install_offload_ops(so);
1585
1586	mk_act_open_req(so, m, atid, e);
1587	so_unlock(so);
1588
1589	soisconnecting(so);
1590	toep = tp->t_toe;
1591	m_set_toep(m, tp->t_toe);
1592
1593	toep->tp_state = TCPS_SYN_SENT;
1594	l2t_send(d->cdev, (struct mbuf *)m, e);
1595
1596	if (toep->tp_ulp_mode)
1597		t3_enable_ddp(toep, 0);
1598	return (0);
1599
1600free_tid:
1601	printf("failing connect - free atid\n");
1602
1603	free_atid(d->cdev, atid);
1604out_err:
1605	printf("return ENOMEM\n");
1606	return (ENOMEM);
1607}
1608
1609/*
1610 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1611 * not send multiple ABORT_REQs for the same connection and also that we do
1612 * not try to send a message after the connection has closed.
1614 */
1615static void
1616t3_send_reset(struct toepcb *toep)
1617{
1618
1619	struct cpl_abort_req *req;
1620	unsigned int tid = toep->tp_tid;
1621	int mode = CPL_ABORT_SEND_RST;
1622	struct tcpcb *tp = toep->tp_tp;
1623	struct toedev *tdev = toep->tp_toedev;
1624	struct socket *so = NULL;
1625	struct mbuf *m;
1627
1628	if (tp) {
1629		inp_lock_assert(tp->t_inpcb);
1630		so = inp_inpcbtosocket(tp->t_inpcb);
1631	}
1632
1633	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1634		tdev == NULL))
1635		return;
1636	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1637
1638	/* Purge the send queue so we don't send anything after an abort. */
1639	if (so)
1640		sbflush(so_sockbuf_snd(so));
1642	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1643		mode |= CPL_ABORT_POST_CLOSE_REQ;
1644
1645	m = m_gethdr_nofail(sizeof(*req));
1646	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1647	set_arp_failure_handler(m, abort_arp_failure);
1648
1649	req = mtod(m, struct cpl_abort_req *);
1650	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1651	req->wr.wr_lo = htonl(V_WR_TID(tid));
1652	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1653	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1654	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1655	req->cmd = mode;
1656	if (tp && (tp->t_state == TCPS_SYN_SENT))
1657		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1658	else
1659		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1660}
1661
1662static int
1663t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1664{
1665	struct inpcb *inp;
1666	int error, optval;
1667
1668	if (sopt->sopt_name == IP_OPTIONS)
1669		return (ENOPROTOOPT);
1670
1671	if (sopt->sopt_name != IP_TOS)
1672		return (EOPNOTSUPP);
1673
1674	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1675
1676	if (error)
1677		return (error);
1678
1679	if (optval > IPTOS_PREC_CRITIC_ECP)
1680		return (EINVAL);
1681
1682	inp = so_sotoinpcb(so);
1683	inp_wlock(inp);
1684	inp_ip_tos_set(inp, optval);
1685#if 0
1686	inp->inp_ip_tos = optval;
1687#endif
1688	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1689	inp_wunlock(inp);
1690
1691	return (0);
1692}
1693
1694static int
1695t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1696{
1697	int err = 0;
1698	size_t copied;
1699
1700	if (sopt->sopt_name != TCP_CONGESTION &&
1701	    sopt->sopt_name != TCP_NODELAY)
1702		return (EOPNOTSUPP);
1703
1704	if (sopt->sopt_name == TCP_CONGESTION) {
1705		char name[TCP_CA_NAME_MAX];
1706		int optlen = sopt->sopt_valsize;
1707		struct tcpcb *tp;
1708
1709		if (sopt->sopt_dir == SOPT_GET) {
1710			KASSERT(0, ("unimplemented"));
1711			return (EOPNOTSUPP);
1712		}
1713
1714		if (optlen < 1)
1715			return (EINVAL);
1716
1717		err = copyinstr(sopt->sopt_val, name,
1718		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1719		if (err)
1720			return (err);
1721		if (copied < 1)
1722			return (EINVAL);
1723
1724		tp = so_sototcpcb(so);
1725		/*
1726		 * XXX I need to revisit this
1727		 */
1728		if ((err = t3_set_cong_control(so, name)) == 0) {
1729#ifdef CONGESTION_CONTROL_SUPPORTED
1730			tp->t_cong_control = strdup(name, M_CXGB);
1731#endif
1732		} else
1733			return (err);
1734	} else {
1735		int optval, oldval;
1736		struct inpcb *inp;
1737		struct tcpcb *tp;
1738
1739		if (sopt->sopt_dir == SOPT_GET)
1740			return (EOPNOTSUPP);
1741
1742		err = sooptcopyin(sopt, &optval, sizeof optval,
1743		    sizeof optval);
1744
1745		if (err)
1746			return (err);
1747
1748		inp = so_sotoinpcb(so);
1749		inp_wlock(inp);
1750		tp = inp_inpcbtotcpcb(inp);
1751
1752		oldval = tp->t_flags;
1753		if (optval)
1754			tp->t_flags |= TF_NODELAY;
1755		else
1756			tp->t_flags &= ~TF_NODELAY;
1757		inp_wunlock(inp);
1758
1759
1760		if (oldval != tp->t_flags && (tp->t_toe != NULL))
1761			t3_set_nagle(tp->t_toe);
1762
1763	}
1764
1765	return (0);
1766}
1767
1768int
1769t3_ctloutput(struct socket *so, struct sockopt *sopt)
1770{
1771	int err;
1772
1773	if (sopt->sopt_level != IPPROTO_TCP)
1774		err = t3_ip_ctloutput(so, sopt);
1775	else
1776		err = t3_tcp_ctloutput(so, sopt);
1777
1778	if (err != EOPNOTSUPP)
1779		return (err);
1780
1781	return (tcp_ctloutput(so, sopt));
1782}
1783
1784/*
1785 * Returns true if we need to explicitly request RST when we receive new data
1786 * on an RX-closed connection.
1787 */
1788static inline int
1789need_rst_on_excess_rx(const struct toepcb *toep)
1790{
1791	return (1);
1792}
1793
1794/*
1795 * Handles Rx data that arrives in a state where the socket isn't accepting
1796 * new data.
1797 */
1798static void
1799handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1800{
1801
1802	if (need_rst_on_excess_rx(toep) &&
1803	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1804		t3_send_reset(toep);
1805	m_freem(m);
1806}
1807
1808/*
1809 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1810 * by getting the DDP offset from the TCB.
1811 */
1812static void
1813tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1814{
1815	struct ddp_state *q = &toep->tp_ddp_state;
1816	struct ddp_buf_state *bsp;
1817	struct cpl_get_tcb_rpl *hdr;
1818	unsigned int ddp_offset;
1819	struct socket *so;
1820	struct tcpcb *tp;
1821	struct sockbuf *rcv;
1822	int state;
1823
1824	uint64_t t;
1825	__be64 *tcb;
1826
1827	tp = toep->tp_tp;
1828	so = inp_inpcbtosocket(tp->t_inpcb);
1829
1830	inp_lock_assert(tp->t_inpcb);
1831	rcv = so_sockbuf_rcv(so);
1832	sockbuf_lock(rcv);
1833
1834	/* Note that we only account for CPL_GET_TCBs issued by the DDP code.
1835	 * We really need a cookie in order to dispatch the RPLs.
1836	 */
1837	q->get_tcb_count--;
1838
1839	/* It is possible that a previous CPL already invalidated UBUF DDP
1840	 * and moved the cur_buf idx, in which case no further processing of
1841	 * this mbuf is required.  However, the app might be sleeping on
1842	 * !q->get_tcb_count and we need to wake it up.
1843	 */
1844	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1845		int state = so_state_get(so);
1846
1847		m_freem(m);
1848		if (__predict_true((state & SS_NOFDREF) == 0))
1849			so_sorwakeup_locked(so);
1850		else
1851			sockbuf_unlock(rcv);
1852
1853		return;
1854	}
1855
1856	bsp = &q->buf_state[q->cur_buf];
1857	hdr = cplhdr(m);
1858	tcb = (__be64 *)(hdr + 1);
1859	if (q->cur_buf == 0) {
1860		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1861		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1862	} else {
1863		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1864		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1865	}
1866	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1867	m->m_cur_offset = bsp->cur_offset;
1868	bsp->cur_offset = ddp_offset;
1869	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1870
1871	CTR5(KTR_TOM,
1872	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1873	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1874	KASSERT(ddp_offset >= m->m_cur_offset,
1875	    ("ddp_offset=%u less than cur_offset=%u",
1876		ddp_offset, m->m_cur_offset));
1877
1878#if 0
1879{
1880	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1881
1882	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1883	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1884
1885        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1886        rcv_nxt = t >> S_TCB_RCV_NXT;
1887        rcv_nxt &= M_TCB_RCV_NXT;
1888
1889        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1890        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1891        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1892
1893	T3_TRACE2(TIDTB(sk),
1894		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1895		  ddp_flags, rcv_nxt - rx_hdr_offset);
1896	T3_TRACE4(TB(q),
1897		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1898		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1899	T3_TRACE3(TB(q),
1900		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1901		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1902	T3_TRACE2(TB(q),
1903		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1904		 q->buf_state[0].flags, q->buf_state[1].flags);
1905
1906}
1907#endif
1908	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1909		handle_excess_rx(toep, m);
1910		return;
1911	}
1912
1913#ifdef T3_TRACE
1914	if ((int)m->m_pkthdr.len < 0) {
1915		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1916	}
1917#endif
1918	if (bsp->flags & DDP_BF_NOCOPY) {
1919#ifdef T3_TRACE
1920		T3_TRACE0(TB(q),
1921			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1922
1923		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1924			printk("!cancel_ubuf");
1925			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1926		}
1927#endif
1928		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1929		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1930		q->cur_buf ^= 1;
1931	} else if (bsp->flags & DDP_BF_NOFLIP) {
1932
1933		m->m_ddp_flags = 1;    /* always a kernel buffer */
1934
1935		/* now HW buffer carries a user buffer */
1936		bsp->flags &= ~DDP_BF_NOFLIP;
1937		bsp->flags |= DDP_BF_NOCOPY;
1938
1939		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1940		 * any new data in which case we're done. If in addition the
1941		 * offset is 0, then there wasn't a completion for the kbuf
1942		 * and we need to decrement the posted count.
1943		 */
1944		if (m->m_pkthdr.len == 0) {
1945			if (ddp_offset == 0) {
1946				q->kbuf_posted--;
1947				bsp->flags |= DDP_BF_NODATA;
1948			}
1949			sockbuf_unlock(rcv);
1950			m_free(m);
1951			return;
1952		}
1953	} else {
1954		sockbuf_unlock(rcv);
1955
1956		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1957		 * but it got here way late and nobody cares anymore.
1958		 */
1959		m_free(m);
1960		return;
1961	}
1962
1963	m->m_ddp_gl = (unsigned char *)bsp->gl;
1964	m->m_flags |= M_DDP;
1965	m->m_seq = tp->rcv_nxt;
1966	tp->rcv_nxt += m->m_pkthdr.len;
1967	tp->t_rcvtime = ticks;
1968	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1969		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1970	if (m->m_pkthdr.len == 0) {
1971		q->user_ddp_pending = 0;
1972		m_free(m);
1973	} else
1974		SBAPPEND(rcv, m);
1975
1976	state = so_state_get(so);
1977	if (__predict_true((state & SS_NOFDREF) == 0))
1978		so_sorwakeup_locked(so);
1979	else
1980		sockbuf_unlock(rcv);
1981}
1982
1983/*
1984 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1985 * in that case they are similar to DDP completions.
1986 */
1987static int
1988do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1989{
1990	struct toepcb *toep = (struct toepcb *)ctx;
1991
1992	/* OK if socket doesn't exist */
1993	if (toep == NULL) {
1994		printf("null toep in do_get_tcb_rpl\n");
1995		return (CPL_RET_BUF_DONE);
1996	}
1997
1998	inp_wlock(toep->tp_tp->t_inpcb);
1999	tcb_rpl_as_ddp_complete(toep, m);
2000	inp_wunlock(toep->tp_tp->t_inpcb);
2001
2002	return (0);
2003}
2004
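/*
 * Account for data that was placed directly into the current DDP buffer
 * before this CPL_RX_DATA arrived: the gap between the header's sequence
 * number and tp->rcv_nxt is data already delivered via DDP.
 */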
2005static void
2006handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2007{
2008	struct tcpcb *tp = toep->tp_tp;
2009	struct socket *so;
2010	struct ddp_state *q;
2011	struct ddp_buf_state *bsp;
2012	struct cpl_rx_data *hdr = cplhdr(m);
2013	unsigned int rcv_nxt = ntohl(hdr->seq);
2014	struct sockbuf *rcv;
2015
2016	if (tp->rcv_nxt == rcv_nxt)
2017		return;
2018
2019	inp_lock_assert(tp->t_inpcb);
2020	so  = inp_inpcbtosocket(tp->t_inpcb);
2021	rcv = so_sockbuf_rcv(so);
2022	sockbuf_lock(rcv);
2023
2024	q = &toep->tp_ddp_state;
2025	bsp = &q->buf_state[q->cur_buf];
2026	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not greater than tp->rcv_nxt=0x%08x",
2027		rcv_nxt, tp->rcv_nxt));
2028	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2029	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2030	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2031	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2032
2033#ifdef T3_TRACE
2034	if ((int)m->m_pkthdr.len < 0) {
2035		t3_ddp_error(so, "handle_ddp_data: neg len");
2036	}
2037#endif
2038	m->m_ddp_gl = (unsigned char *)bsp->gl;
2039	m->m_flags |= M_DDP;
2040	m->m_cur_offset = bsp->cur_offset;
2041	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2042	if (bsp->flags & DDP_BF_NOCOPY)
2043		bsp->flags &= ~DDP_BF_NOCOPY;
2044
2045	m->m_seq = tp->rcv_nxt;
2046	tp->rcv_nxt = rcv_nxt;
2047	bsp->cur_offset += m->m_pkthdr.len;
2048	if (!(bsp->flags & DDP_BF_NOFLIP))
2049		q->cur_buf ^= 1;
2050	/*
2051	 * For now, don't re-enable DDP after a connection fell out of DDP
2052	 * mode.
2053	 */
2054	q->ubuf_ddp_ready = 0;
2055	sockbuf_unlock(rcv);
2056}
2057
2058/*
2059 * Process new data received for a connection.
2060 */
2061static void
2062new_rx_data(struct toepcb *toep, struct mbuf *m)
2063{
2064	struct cpl_rx_data *hdr = cplhdr(m);
2065	struct tcpcb *tp = toep->tp_tp;
2066	struct socket *so;
2067	struct sockbuf *rcv;
2068	int state;
2069	int len = be16toh(hdr->len);
2070
2071	inp_wlock(tp->t_inpcb);
2072
2073	so  = inp_inpcbtosocket(tp->t_inpcb);
2074
2075	if (__predict_false(so_no_receive(so))) {
2076		handle_excess_rx(toep, m);
2077		inp_wunlock(tp->t_inpcb);
2078		TRACE_EXIT;
2079		return;
2080	}
2081
2082	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2083		handle_ddp_data(toep, m);
2084
2085	m->m_seq = ntohl(hdr->seq);
2086	m->m_ulp_mode = 0;                    /* for iSCSI */
2087
2088#if VALIDATE_SEQ
2089	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2090		log(LOG_ERR,
2091		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2092		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2093		       tp->rcv_nxt);
2094		m_freem(m);
2095		inp_wunlock(tp->t_inpcb);
2096		return;
2097	}
2098#endif
2099	m_adj(m, sizeof(*hdr));
2100
2101#ifdef URGENT_DATA_SUPPORTED
2102	/*
2103	 * We don't handle urgent data yet
2104	 */
2105	if (__predict_false(hdr->urg))
2106		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2107	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2108		     tp->urg_seq - tp->rcv_nxt < skb->len))
2109		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2110							 tp->rcv_nxt];
2111#endif
2112	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2113		toep->tp_delack_mode = hdr->dack_mode;
2114		toep->tp_delack_seq = tp->rcv_nxt;
2115	}
2116	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2117	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2118
2119	if (len < m->m_pkthdr.len)
2120		m->m_pkthdr.len = m->m_len = len;
2121
2122	tp->rcv_nxt += m->m_pkthdr.len;
2123	tp->t_rcvtime = ticks;
2124	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2125	CTR2(KTR_TOM,
2126	    "new_rx_data: seq 0x%x len %u",
2127	    m->m_seq, m->m_pkthdr.len);
2128	inp_wunlock(tp->t_inpcb);
2129	rcv = so_sockbuf_rcv(so);
2130	sockbuf_lock(rcv);
2131#if 0
2132	if (sb_notify(rcv))
2133		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2134#endif
2135	SBAPPEND(rcv, m);
2136
2137#ifdef notyet
2138	/*
2139	 * We're currently giving the card too many credits, so this check
2140	 * would fire; keep it disabled until the accounting is fixed.
2141	 */
2142	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2144	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2145		so, rcv->sb_cc, rcv->sb_mbmax));
2146#endif
2147
2149	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2150	    rcv->sb_cc, rcv->sb_mbcnt);
2151
2152	state = so_state_get(so);
2153	if (__predict_true((state & SS_NOFDREF) == 0))
2154		so_sorwakeup_locked(so);
2155	else
2156		sockbuf_unlock(rcv);
2157}
2158
2159/*
2160 * Handler for RX_DATA CPL messages.
2161 */
2162static int
2163do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2164{
2165	struct toepcb *toep = (struct toepcb *)ctx;
2166
2167	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2168
2169	new_rx_data(toep, m);
2170
2171	return (0);
2172}
2173
2174static void
2175new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2176{
2177	struct tcpcb *tp;
2178	struct ddp_state *q;
2179	struct ddp_buf_state *bsp;
2180	struct cpl_rx_data_ddp *hdr;
2181	struct socket *so;
2182	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2183	int nomoredata = 0;
2184	unsigned int delack_mode;
2185	struct sockbuf *rcv;
2186
2187	tp = toep->tp_tp;
2188	inp_wlock(tp->t_inpcb);
2189	so = inp_inpcbtosocket(tp->t_inpcb);
2190
2191	if (__predict_false(so_no_receive(so))) {
2193		handle_excess_rx(toep, m);
2194		inp_wunlock(tp->t_inpcb);
2195		return;
2196	}
2197
2198	q = &toep->tp_ddp_state;
2199	hdr = cplhdr(m);
2200	ddp_report = ntohl(hdr->u.ddp_report);
2201	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2202	bsp = &q->buf_state[buf_idx];
2203
2204	CTR4(KTR_TOM,
2205	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2206	    "hdr seq 0x%x len %u",
2207	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2208	    ntohs(hdr->len));
2209	CTR3(KTR_TOM,
2210	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2211	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2212
2213	ddp_len = ntohs(hdr->len);
2214	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2215
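	/*
	 * Mirror the delayed-ACK mode the HW reports so SW bookkeeping
	 * stays in sync with the TCB.
	 */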
2216	delack_mode = G_DDP_DACK_MODE(ddp_report);
2217	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2218		toep->tp_delack_mode = delack_mode;
2219		toep->tp_delack_seq = tp->rcv_nxt;
2220	}
2221
2222	m->m_seq = tp->rcv_nxt;
2223	tp->rcv_nxt = rcv_nxt;
2224
2225	tp->t_rcvtime = ticks;
2226	/*
2227	 * Store the length in m->m_len.  We are changing the meaning of
2228	 * m->m_len here, we need to be very careful that nothing from now on
2229	 * interprets ->len of this packet the usual way.
2230	 */
2231	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2232	inp_wunlock(tp->t_inpcb);
2233	CTR3(KTR_TOM,
2234	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2235	    m->m_len, rcv_nxt, m->m_seq);
2236	/*
2237	 * Figure out where the new data was placed in the buffer and store it
2238	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the
2239	 * consumer needs to account for the page pod's pg_offset.
2240	 */
2241	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2242	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2243
2244	rcv = so_sockbuf_rcv(so);
2245	sockbuf_lock(rcv);
2246
2247	m->m_ddp_gl = (unsigned char *)bsp->gl;
2248	m->m_flags |= M_DDP;
2249	bsp->cur_offset = end_offset;
2250	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2251
2252	/*
2253	 * Length is only meaningful for kbuf
2254	 */
2255	if (!(bsp->flags & DDP_BF_NOCOPY))
2256		KASSERT(m->m_len <= bsp->gl->dgl_length,
2257		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2258			m->m_len, bsp->gl->dgl_length));
2259
2260	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2261	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2262	/*
2263	 * Bit 0 of flags stores whether the DDP buffer is completed.
2264	 * Note that other parts of the code depend on this being in bit 0.
2265	 */
2266	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2267		panic("spurious ddp completion");
2268	} else {
2269		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2270		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2271			q->cur_buf ^= 1;                     /* flip buffers */
2272	}
2273
2274	if (bsp->flags & DDP_BF_NOCOPY) {
2275		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2276		bsp->flags &= ~DDP_BF_NOCOPY;
2277	}
2278
2279	if (ddp_report & F_DDP_PSH)
2280		m->m_ddp_flags |= DDP_BF_PSH;
2281	if (nomoredata)
2282		m->m_ddp_flags |= DDP_BF_NODATA;
2283
2284#ifdef notyet
2285	skb_reset_transport_header(skb);
2286	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2287#endif
2288	SBAPPEND(rcv, m);
2289
2290	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2291	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2292		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2293		so_sorwakeup_locked(so);
2294	else
2295		sockbuf_unlock(rcv);
2296}
2297
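/* Mask of ddp_report status bits that indicate a DDP error. */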
2298#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2299		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2300		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2301		 F_DDP_INVALID_PPOD)
2302
2303/*
2304 * Handler for RX_DATA_DDP CPL messages.
2305 */
2306static int
2307do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2308{
2309	struct toepcb *toep = ctx;
2310	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2311
2312	VALIDATE_SOCK(so);
2313
2314	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2315		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2316		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2317		return (CPL_RET_BUF_DONE);
2318	}
2319#if 0
2320	skb->h.th = tcphdr_skb->h.th;
2321#endif
2322	new_rx_data_ddp(toep, m);
2323	return (0);
2324}
2325
2326static void
2327process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2328{
2329	struct tcpcb *tp = toep->tp_tp;
2330	struct socket *so;
2331	struct ddp_state *q;
2332	struct ddp_buf_state *bsp;
2333	struct cpl_rx_ddp_complete *hdr;
2334	unsigned int ddp_report, buf_idx, when, delack_mode;
2335	int nomoredata = 0;
2336	struct sockbuf *rcv;
2337
2338	inp_wlock(tp->t_inpcb);
2339	so = inp_inpcbtosocket(tp->t_inpcb);
2340
2341	if (__predict_false(so_no_receive(so))) {
2342		struct inpcb *inp = so_sotoinpcb(so);
2343
2344		handle_excess_rx(toep, m);
2345		inp_wunlock(inp);
2346		return;
2347	}
2348	q = &toep->tp_ddp_state;
2349	hdr = cplhdr(m);
2350	ddp_report = ntohl(hdr->ddp_report);
2351	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
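	/* csum_data is not a checksum here; it stashes the pre-update rcv_nxt. */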
2352	m->m_pkthdr.csum_data = tp->rcv_nxt;
2353
2354	rcv = so_sockbuf_rcv(so);
2355	sockbuf_lock(rcv);
2356
2357	bsp = &q->buf_state[buf_idx];
2358	when = bsp->cur_offset;
2359	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2360	tp->rcv_nxt += m->m_len;
2361	tp->t_rcvtime = ticks;
2362
2363	delack_mode = G_DDP_DACK_MODE(ddp_report);
2364	if (__predict_false(delack_mode != toep->tp_delack_mode)) {
2365		toep->tp_delack_mode = delack_mode;
2366		toep->tp_delack_seq = tp->rcv_nxt;
2367	}
2368#ifdef notyet
2369	skb_reset_transport_header(skb);
2370	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2371#endif
2372	inp_wunlock(tp->t_inpcb);
2373
2374	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2375	CTR5(KTR_TOM,
2376		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2377		  "ddp_report 0x%x offset %u, len %u",
2378		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2379		   G_DDP_OFFSET(ddp_report), m->m_len);
2380
2381	m->m_cur_offset = bsp->cur_offset;
2382	bsp->cur_offset += m->m_len;
2383
2384	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2385		q->cur_buf ^= 1;                     /* flip buffers */
2386		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2387			nomoredata = 1;
2388	}
2389
2390	CTR4(KTR_TOM,
2391		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2392		  "ddp_report %u offset %u",
2393		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2394		   G_DDP_OFFSET(ddp_report));
2395
2396	m->m_ddp_gl = (unsigned char *)bsp->gl;
2397	m->m_flags |= M_DDP;
2398	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2399	if (bsp->flags & DDP_BF_NOCOPY)
2400		bsp->flags &= ~DDP_BF_NOCOPY;
2401	if (nomoredata)
2402		m->m_ddp_flags |= DDP_BF_NODATA;
2403
2404	SBAPPEND(rcv, m);
2405	if ((so_state_get(so) & SS_NOFDREF) == 0)
2406		so_sorwakeup_locked(so);
2407	else
2408		sockbuf_unlock(rcv);
2409}
2410
2411/*
2412 * Handler for RX_DDP_COMPLETE CPL messages.
2413 */
2414static int
2415do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2416{
2417	struct toepcb *toep = ctx;
2418
2419	VALIDATE_SOCK(so);
2420#if 0
2421	skb->h.th = tcphdr_skb->h.th;
2422#endif
2423	process_ddp_complete(toep, m);
2424	return (0);
2425}
2426
2427/*
2428 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2429 * socket state before calling tcp_time_wait to comply with its expectations.
2430 */
2431static void
2432enter_timewait(struct tcpcb *tp)
2433{
2434	/*
2435	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2436	 * process peer_close because we don't want to carry the peer FIN in
2437	 * the socket's receive queue and if we increment rcv_nxt without
2438	 * having the FIN in the receive queue we'll confuse facilities such
2439	 * as SIOCINQ.
2440	 */
2441	inp_wlock(tp->t_inpcb);
2442	tp->rcv_nxt++;
2443
2444	tp->ts_recent_age = 0;	     /* defeat recycling */
2445	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2446	inp_wunlock(tp->t_inpcb);
2447	tcp_offload_twstart(tp);
2448}
2449
2450/*
2451 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2452 * function deals with the data that may be reported along with the FIN.
2453 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2454 * perform normal FIN-related processing.  In the latter case 1 indicates that
2455 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
2456 * that the mbuf can be freed.
2457 */
2458static int
2459handle_peer_close_data(struct socket *so, struct mbuf *m)
2460{
2461	struct tcpcb *tp = so_sototcpcb(so);
2462	struct toepcb *toep = tp->t_toe;
2463	struct ddp_state *q;
2464	struct ddp_buf_state *bsp;
2465	struct cpl_peer_close *req = cplhdr(m);
2466	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2467	struct sockbuf *rcv;
2468
2469	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2470		return (0);
2471
2472	CTR0(KTR_TOM, "handle_peer_close_data");
2473	if (__predict_false(so_no_receive(so))) {
2474		handle_excess_rx(toep, m);
2475
2476		/*
2477		 * Although we discard the data we want to process the FIN so
2478		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2479		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2480		 * may be what will close the connection.  We return 1 because
2481		 * handle_excess_rx() already freed the packet.
2482		 */
2483		return (1);
2484	}
2485
2486	inp_lock_assert(tp->t_inpcb);
2487	q = &toep->tp_ddp_state;
2488	rcv = so_sockbuf_rcv(so);
2489	sockbuf_lock(rcv);
2490
2491	bsp = &q->buf_state[q->cur_buf];
2492	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2493	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2494	m->m_ddp_gl = (unsigned char *)bsp->gl;
2495	m->m_flags |= M_DDP;
2496	m->m_cur_offset = bsp->cur_offset;
2497	m->m_ddp_flags =
2498	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2499	m->m_seq = tp->rcv_nxt;
2500	tp->rcv_nxt = rcv_nxt;
2501	bsp->cur_offset += m->m_pkthdr.len;
2502	if (!(bsp->flags & DDP_BF_NOFLIP))
2503		q->cur_buf ^= 1;
2504#ifdef notyet
2505	skb_reset_transport_header(skb);
2506	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2507#endif
2508	tp->t_rcvtime = ticks;
2509	SBAPPEND(rcv, m);
2510	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2511		so_sorwakeup_locked(so);
2512	else
2513		sockbuf_unlock(rcv);
2514
2515	return (1);
2516}
2517
2518/*
2519 * Handle a peer FIN.
2520 */
2521static void
2522do_peer_fin(struct toepcb *toep, struct mbuf *m)
2523{
2524	struct socket *so;
2525	struct tcpcb *tp = toep->tp_tp;
2526	int keep, action;
2527
2528	action = keep = 0;
2529	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2530	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2531		printf("abort_pending set\n");
2532
2533		goto out;
2534	}
2535	inp_wlock(tp->t_inpcb);
2536	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2537	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2538		keep = handle_peer_close_data(so, m);
2539		if (keep < 0) {
2540			inp_wunlock(tp->t_inpcb);
2541			return;
2542		}
2543	}
2544	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2545		CTR1(KTR_TOM,
2546		    "waking up waiters for cantrcvmore on %p ", so);
2547		socantrcvmore(so);
2548
2549		/*
2550		 * If connection is half-synchronized
2551		 * (ie NEEDSYN flag on) then delay ACK,
2552		 * so it may be piggybacked when SYN is sent.
2553		 * Otherwise, since we received a FIN then no
2554		 * more input can be expected, send ACK now.
2555		 */
2556		if (tp->t_flags & TF_NEEDSYN)
2557			tp->t_flags |= TF_DELACK;
2558		else
2559			tp->t_flags |= TF_ACKNOW;
2560		tp->rcv_nxt++;
2561	}
2562
2563	switch (tp->t_state) {
2564	case TCPS_SYN_RECEIVED:
2565	    tp->t_starttime = ticks;
2566	/* FALLTHROUGH */
2567	case TCPS_ESTABLISHED:
2568		tp->t_state = TCPS_CLOSE_WAIT;
2569		break;
2570	case TCPS_FIN_WAIT_1:
2571		tp->t_state = TCPS_CLOSING;
2572		break;
2573	case TCPS_FIN_WAIT_2:
2574		/*
2575		 * If we've sent an abort_req we must have sent it too late,
2576		 * HW will send us a reply telling us so, and this peer_close
2577		 * is really the last message for this connection and needs to
2578		 * be treated as an abort_rpl, i.e., transition the connection
2579		 * to TCP_CLOSE (note that the host stack does this at the
2580		 * time of generating the RST but we must wait for HW).
2581		 * Otherwise we enter TIME_WAIT.
2582		 */
2583		t3_release_offload_resources(toep);
2584		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2585			action = TCP_CLOSE;
2586		} else {
2587			action = TCP_TIMEWAIT;
2588		}
2589		break;
2590	default:
2591		log(LOG_ERR,
2592		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2593		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2594	}
2595	inp_wunlock(tp->t_inpcb);
2596
2597	if (action == TCP_TIMEWAIT) {
2598		enter_timewait(tp);
2599	} else if (action == TCP_DROP) {
2600		tcp_offload_drop(tp, 0);
2601	} else if (action == TCP_CLOSE) {
2602		tcp_offload_close(tp);
2603	}
2604
2605#ifdef notyet
2606	/* Do not send POLL_HUP for half duplex close. */
2607	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2608	    sk->sk_state == TCP_CLOSE)
2609		sk_wake_async(so, 1, POLL_HUP);
2610	else
2611		sk_wake_async(so, 1, POLL_IN);
2612#endif
2613
2614out:
2615	if (!keep)
2616		m_free(m);
2617}
2618
2619/*
2620 * Handler for PEER_CLOSE CPL messages.
2621 */
2622static int
2623do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2624{
2625	struct toepcb *toep = (struct toepcb *)ctx;
2626
2627	VALIDATE_SOCK(so);
2628
2629	do_peer_fin(toep, m);
2630	return (0);
2631}
2632
2633static void
2634process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2635{
2636	struct cpl_close_con_rpl *rpl = cplhdr(m);
2637	struct tcpcb *tp = toep->tp_tp;
2638	struct socket *so;
2639	int action = 0;
2640	struct sockbuf *rcv;
2641
2642	inp_wlock(tp->t_inpcb);
2643	so = inp_inpcbtosocket(tp->t_inpcb);
2644
2645	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2646
2647	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2648		inp_wunlock(tp->t_inpcb);
2649		goto out;
2650	}
2651
2652	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2653	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2654
2655	switch (tp->t_state) {
2656	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2657		t3_release_offload_resources(toep);
2658		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2659			action = TCP_CLOSE;
2661		} else {
2662			action = TCP_TIMEWAIT;
2663		}
2664		break;
2665	case TCPS_LAST_ACK:
2666		/*
2667		 * In this state we don't care about pending abort_rpl.
2668		 * If we've sent abort_req it was post-close and was sent too
2669		 * late, this close_con_rpl is the actual last message.
2670		 */
2671		t3_release_offload_resources(toep);
2672		action = TCP_CLOSE;
2673		break;
2674	case TCPS_FIN_WAIT_1:
2675		/*
2676		 * If we can't receive any more
2677		 * data, then closing user can proceed.
2678		 * Starting the timer is contrary to the
2679		 * specification, but if we don't get a FIN
2680		 * we'll hang forever.
2681		 *
2682		 * XXXjl:
2683		 * we should release the tp also, and use a
2684		 * compressed state.
2685		 */
2686		if (so)
2687			rcv = so_sockbuf_rcv(so);
2688		else
2689			break;
2690
2691		if (rcv->sb_state & SBS_CANTRCVMORE) {
2692			int timeout;
2693
2694			if (so)
2695				soisdisconnected(so);
2696			timeout = (tcp_fast_finwait2_recycle) ?
2697			    tcp_finwait2_timeout : tcp_maxidle;
2698			tcp_timer_activate(tp, TT_2MSL, timeout);
2699		}
2700		tp->t_state = TCPS_FIN_WAIT_2;
2701		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2702		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2703			action = TCP_DROP;
2704		}
2705
2706		break;
2707	default:
2708		log(LOG_ERR,
2709		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2710		       toep->tp_toedev->tod_name, toep->tp_tid,
2711		       tp->t_state);
2712	}
2713	inp_wunlock(tp->t_inpcb);
2714
2716	if (action == TCP_TIMEWAIT) {
2717		enter_timewait(tp);
2718	} else if (action == TCP_DROP) {
2719		tcp_offload_drop(tp, 0);
2720	} else if (action == TCP_CLOSE) {
2721		tcp_offload_close(tp);
2722	}
2723out:
2724	m_freem(m);
2725}
2726
2727/*
2728 * Handler for CLOSE_CON_RPL CPL messages.
2729 */
2730static int
2731do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2732			    void *ctx)
2733{
2734	struct toepcb *toep = (struct toepcb *)ctx;
2735
2736	process_close_con_rpl(toep, m);
2737	return (0);
2738}
2739
2740/*
2741 * Process abort replies.  We only process these messages if we anticipate
2742 * them as the coordination between SW and HW in this area is somewhat lacking
2743 * and sometimes we get ABORT_RPLs after we are done with the connection that
2744 * originated the ABORT_REQ.
2745 */
2746static void
2747process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2748{
2749	struct tcpcb *tp = toep->tp_tp;
2750	struct socket *so;
2751	int needclose = 0;
2752
2753#ifdef T3_TRACE
2754	T3_TRACE1(TIDTB(sk),
2755		  "process_abort_rpl: GTS rpl pending %d",
2756		  sock_flag(sk, ABORT_RPL_PENDING));
2757#endif
2758
2759	inp_wlock(tp->t_inpcb);
2760	so = inp_inpcbtosocket(tp->t_inpcb);
2761
2762	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2763		/*
2764		 * XXX panic on tcpdrop
2765		 */
2766		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2767			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2768		else {
2769			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2770			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2771			    !is_t3a(toep->tp_toedev)) {
2772				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2773					panic("TP_ABORT_REQ_RCVD set");
2774				t3_release_offload_resources(toep);
2775				needclose = 1;
2776			}
2777		}
2778	}
2779	inp_wunlock(tp->t_inpcb);
2780
2781	if (needclose)
2782		tcp_offload_close(tp);
2783
2784	m_free(m);
2785}
2786
2787/*
2788 * Handle an ABORT_RPL_RSS CPL message.
2789 */
2790static int
2791do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2792{
2793	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2794	struct toepcb *toep;
2795
2796	/*
2797	 * Ignore replies to post-close aborts indicating that the abort was
2798	 * requested too late.  These connections are terminated when we get
2799	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2800	 * arrives the TID is either no longer used or it has been recycled.
2801	 */
2802	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2803discard:
2804		m_free(m);
2805		return (0);
2806	}
2807
2808	toep = (struct toepcb *)ctx;
2809
2810        /*
2811	 * Sometimes we've already closed the socket, e.g., a post-close
2812	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2813	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2814	 * but FW turns the ABORT_REQ into a regular one and so we get
2815	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2816	 */
2817	if (!toep)
2818		goto discard;
2819
2820	if (toep->tp_tp == NULL) {
2821		log(LOG_NOTICE, "removing tid for abort\n");
2822		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2823		if (toep->tp_l2t)
2824			l2t_release(L2DATA(cdev), toep->tp_l2t);
2825
2826		toepcb_release(toep);
2827		goto discard;
2828	}
2829
2830	log(LOG_NOTICE, "toep=%p\n", toep);
2831	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2832
2833	toepcb_hold(toep);
2834	process_abort_rpl(toep, m);
2835	toepcb_release(toep);
2836	return (0);
2837}
2838
2839/*
2840 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2841 * indicate whether RST should be sent in response.
2842 */
2843static int
2844abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2845{
2846	struct tcpcb *tp = so_sototcpcb(so);
2847
2848	switch (abort_reason) {
2849	case CPL_ERR_BAD_SYN:
2850#if 0
2851		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2852#endif
2853	case CPL_ERR_CONN_RESET:
2854		/* XXX need to handle SYN_RECV due to crossed SYNs */
2855		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2856	case CPL_ERR_XMIT_TIMEDOUT:
2857	case CPL_ERR_PERSIST_TIMEDOUT:
2858	case CPL_ERR_FINWAIT2_TIMEDOUT:
2859	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2860#if 0
2861		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2862#endif
2863		return (ETIMEDOUT);
2864	default:
2865		return (EIO);
2866	}
2867}
2868
2869static inline void
2870set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2871{
2872	struct cpl_abort_rpl *rpl = cplhdr(m);
2873
2874	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2875	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2876	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2877
2878	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2879	rpl->cmd = cmd;
2880}
2881
2882static void
2883send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2884{
2885	struct mbuf *reply_mbuf;
2886	struct cpl_abort_req_rss *req = cplhdr(m);
2887
2888	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2889	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2891	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2892	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2893	m_free(m);
2894}
2895
2896/*
2897 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2898 */
2899static inline int
2900is_neg_adv_abort(unsigned int status)
2901{
2902	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2903	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2904}
2905
2906static void
2907send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2908{
2909	struct mbuf  *reply_mbuf;
2910	struct cpl_abort_req_rss *req = cplhdr(m);
2911
2912	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2913
2914	if (!reply_mbuf) {
2915		/* Defer the reply.  Stick rst_status into req->status. */
2916		req->status = rst_status;
2917		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2918		return;
2919	}
2920
2921	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2922	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2923	m_free(m);
2924
2925	/*
2926	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2927	 * these messages while ARP is pending.  For other connection states
2928	 * it's not a problem.
2929	 */
2930	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2931}
2932
2933#ifdef notyet
2934static void
2935cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2936{
2937	CXGB_UNIMPLEMENTED();
2938#ifdef notyet
2939	struct request_sock *req = child->sk_user_data;
2940
2941	inet_csk_reqsk_queue_removed(parent, req);
2942	synq_remove(tcp_sk(child));
2943	__reqsk_free(req);
2944	child->sk_user_data = NULL;
2945#endif
2946}
2947
2948
2949/*
2950 * Performs the actual work to abort a SYN_RECV connection.
2951 */
2952static void
2953do_abort_syn_rcv(struct socket *child, struct socket *parent)
2954{
2955	struct tcpcb *parenttp = so_sototcpcb(parent);
2956	struct tcpcb *childtp = so_sototcpcb(child);
2957
2958	/*
2959	 * If the server is still open we clean up the child connection,
2960	 * otherwise the server already did the clean up as it was purging
2961	 * its SYN queue and the skb was just sitting in its backlog.
2962	 */
2963	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2964		cleanup_syn_rcv_conn(child, parent);
2965		inp_wlock(childtp->t_inpcb);
2966		t3_release_offload_resources(childtp->t_toe);
2967		inp_wunlock(childtp->t_inpcb);
2968		tcp_offload_close(childtp);
2969	}
2970}
2971#endif
2972
2973/*
2974 * Handle abort requests for a SYN_RECV connection.  These need extra work
2975 * because the socket is on its parent's SYN queue.
2976 */
2977static int
2978abort_syn_rcv(struct socket *so, struct mbuf *m)
2979{
2980	CXGB_UNIMPLEMENTED();
2981#ifdef notyet
2982	struct socket *parent;
2983	struct toedev *tdev = toep->tp_toedev;
2984	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2985	struct socket *oreq = so->so_incomp;
2986	struct t3c_tid_entry *t3c_stid;
2987	struct tid_info *t;
2988
2989	if (!oreq)
2990		return -1;        /* somehow we are not on the SYN queue */
2991
2992	t = &(T3C_DATA(cdev))->tid_maps;
2993	t3c_stid = lookup_stid(t, oreq->ts_recent);
2994	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2995
2996	so_lock(parent);
2997	do_abort_syn_rcv(so, parent);
2998	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2999	so_unlock(parent);
3000#endif
3001	return (0);
3002}
3003
3004/*
3005 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3006 * request except that we need to reply to it.
3007 */
3008static void
3009process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3010{
3011	int rst_status = CPL_ABORT_NO_RST;
3012	const struct cpl_abort_req_rss *req = cplhdr(m);
3013	struct tcpcb *tp = toep->tp_tp;
3014	struct socket *so;
3015	int needclose = 0;
3016
3017	inp_wlock(tp->t_inpcb);
3018	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3019	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3020		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3021		m_free(m);
3022		goto skip;
3023	}
3024
3025	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3026	/*
3027	 * Three cases to consider:
3028	 * a) We haven't sent an abort_req; close the connection.
3029	 * b) We have sent a post-close abort_req that will get to TP too late
3030	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3031	 *    be ignored and the connection should be closed now.
3032	 * c) We have sent a regular abort_req that will get to TP too late.
3033	 *    That will generate an abort_rpl with status 0, wait for it.
3034	 */
3035	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3036	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3037		int error;
3038
3039		error = abort_status_to_errno(so, req->status,
3040		    &rst_status);
3041		so_error_set(so, error);
3042
3043		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3044			so_sorwakeup(so);
3045		/*
3046		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3047		 * returns 0 it has taken care of the abort.
3048		 */
3049		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3050			goto skip;
3051
3052		t3_release_offload_resources(toep);
3053		needclose = 1;
3054	}
3055	inp_wunlock(tp->t_inpcb);
3056
3057	if (needclose)
3058		tcp_offload_close(tp);
3059
3060	send_abort_rpl(m, tdev, rst_status);
3061	return;
3062skip:
3063	inp_wunlock(tp->t_inpcb);
3064}
3065
3066/*
3067 * Handle an ABORT_REQ_RSS CPL message.
3068 */
3069static int
3070do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3071{
3072	const struct cpl_abort_req_rss *req = cplhdr(m);
3073	struct toepcb *toep = (struct toepcb *)ctx;
3074
3075	if (is_neg_adv_abort(req->status)) {
3076		m_free(m);
3077		return (0);
3078	}
3079
3080	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3081
3082	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3083		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3084		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3085
3086		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3087		if (toep->tp_l2t)
3088			l2t_release(L2DATA(cdev), toep->tp_l2t);
3089
3090		/*
3091		 *  Unhook
3092		 */
3093		toep->tp_tp->t_toe = NULL;
3094		toep->tp_tp->t_flags &= ~TF_TOE;
3095		toep->tp_tp = NULL;
3096		/*
3097		 * XXX need to call syncache_chkrst - but we don't
3098		 * have a way of doing that yet
3099		 */
3100		toepcb_release(toep);
3101		log(LOG_ERR, "abort for unestablished connection :-(\n");
3102		return (0);
3103	}
3104	if (toep->tp_tp == NULL) {
3105		log(LOG_NOTICE, "disconnected toepcb\n");
3106		/* should be freed momentarily */
3107		return (0);
3108	}
3109
3111	toepcb_hold(toep);
3112	process_abort_req(toep, m, toep->tp_toedev);
3113	toepcb_release(toep);
3114	return (0);
3115}
3116#ifdef notyet
3117static void
3118pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3119{
3120	struct toedev *tdev = TOE_DEV(parent);
3121
3122	do_abort_syn_rcv(child, parent);
3123	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3124		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3125
3126		rpl->opt0h = htonl(F_TCAM_BYPASS);
3127		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3128		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3129	} else
3130		m_free(m);
3131}
3132#endif
3133static void
3134handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3135{
3136	CXGB_UNIMPLEMENTED();
3137
3138#ifdef notyet
3139	struct t3cdev *cdev;
3140	struct socket *parent;
3141	struct socket *oreq;
3142	struct t3c_tid_entry *t3c_stid;
3143	struct tid_info *t;
3144	struct tcpcb *otp, *tp = so_sototcpcb(so);
3145	struct toepcb *toep = tp->t_toe;
3146
3147	/*
3148	 * If the connection is being aborted due to the parent listening
3149	 * socket going away there's nothing to do, the ABORT_REQ will close
3150	 * the connection.
3151	 */
3152	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3153		m_free(m);
3154		return;
3155	}
3156
3157	oreq = so->so_incomp;
3158	otp = so_sototcpcb(oreq);
3159
3160	cdev = T3C_DEV(so);
3161	t = &(T3C_DATA(cdev))->tid_maps;
3162	t3c_stid = lookup_stid(t, otp->ts_recent);
3163	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3164
3165	so_lock(parent);
3166	pass_open_abort(so, parent, m);
3167	so_unlock(parent);
3168#endif
3169}
3170
3171/*
3172 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3173 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3174 * connection.
3175 */
3176static void
3177pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3178{
3179
3180#ifdef notyet
3181	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3182	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3183#endif
3184	handle_pass_open_arp_failure(m_get_socket(m), m);
3185}
3186
3187/*
3188 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3189 */
3190static void
3191mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3192{
3193	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3194	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3195	unsigned int tid = GET_TID(req);
3196
3197	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3198	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3199	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3200	rpl->peer_ip = req->peer_ip;	/* req->peer_ip not overwritten yet */
3201	rpl->opt0h = htonl(F_TCAM_BYPASS);
3202	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3203	rpl->opt2 = 0;
3204	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3205}
3206
3207/*
3208 * Send a deferred reject to an accept request.
3209 */
3210static void
3211reject_pass_request(struct toedev *tdev, struct mbuf *m)
3212{
3213	struct mbuf *reply_mbuf;
3214
3215	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3216	mk_pass_accept_rpl(reply_mbuf, m);
3217	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3218	m_free(m);
3219}
3220
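/*
 * Callback from the TCP syncache for events on entries added via
 * syncache_add_accept_req(); in both handled cases we drop our toepcb
 * reference.
 */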
3221static void
3222handle_syncache_event(int event, void *arg)
3223{
3224	struct toepcb *toep = arg;
3225
3226	switch (event) {
3227	case TOE_SC_ENTRY_PRESENT:
3228		/*
3229		 * entry already exists - free toepcb
3230		 * and l2t
3231		 */
3232		printf("syncache entry present\n");
3233		toepcb_release(toep);
3234		break;
3235	case TOE_SC_DROP:
3236		/*
3237		 * The syncache has given up on this entry
3238		 * either it timed out, or it was evicted
3239		 * we need to explicitly release the tid
3240		 */
3241		printf("syncache entry dropped\n");
3242		toepcb_release(toep);
3243		break;
3244	default:
3245		log(LOG_ERR, "unknown syncache event %d\n", event);
3246		break;
3247	}
3248}
3249
3250static void
3251syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3252{
3253	struct in_conninfo inc;
3254	struct tcpopt to;
3255	struct tcphdr th;
3256	struct inpcb *inp;
3257	int mss, wsf, sack, ts;
3258	uint32_t rcv_isn = ntohl(req->rcv_isn);
3259
3260	bzero(&to, sizeof(struct tcpopt));
3261	inp = so_sotoinpcb(lso);
3262
3263	/*
3264	 * Fill out information for entering us into the syncache
3265	 */
3266	bzero(&inc, sizeof(inc));
3267	inc.inc_fport = th.th_sport = req->peer_port;
3268	inc.inc_lport = th.th_dport = req->local_port;
3269	th.th_seq = req->rcv_isn;
3270	th.th_flags = TH_SYN;
3271
3272	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3273
3275	inc.inc_isipv6 = 0;
3276	inc.inc_len = 0;
3277	inc.inc_faddr.s_addr = req->peer_ip;
3278	inc.inc_laddr.s_addr = req->local_ip;
3279
3280	DPRINTF("syncache add of %d:%d %d:%d\n",
3281	    ntohl(req->local_ip), ntohs(req->local_port),
3282	    ntohl(req->peer_ip), ntohs(req->peer_port));
3283
3284	mss = req->tcp_options.mss;
3285	wsf = req->tcp_options.wsf;
3286	ts = req->tcp_options.tstamp;
3287	sack = req->tcp_options.sack;
3288	to.to_mss = mss;
3289	to.to_wscale = wsf;
3290	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3291	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3292}
3293
3294
3295/*
3296 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3297 * lock held.  Note that the sock here is a listening socket that is not owned
3298 * by the TOE.
3299 */
3300static void
3301process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3302    struct listen_ctx *lctx)
3303{
3304	int rt_flags;
3305	struct l2t_entry *e;
3306	struct iff_mac tim;
3307	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3308	struct cpl_pass_accept_rpl *rpl;
3309	struct cpl_pass_accept_req *req = cplhdr(m);
3310	unsigned int tid = GET_TID(req);
3311	struct tom_data *d = TOM_DATA(tdev);
3312	struct t3cdev *cdev = d->cdev;
3313	struct tcpcb *tp = so_sototcpcb(so);
3314	struct toepcb *newtoep;
3315	struct rtentry *dst;
3316	struct sockaddr_in nam;
3317	struct t3c_data *td = T3C_DATA(cdev);
3318
3319	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3320	if (__predict_false(reply_mbuf == NULL)) {
3321		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3322			t3_defer_reply(m, tdev, reject_pass_request);
3323		else {
3324			cxgb_queue_tid_release(cdev, tid);
3325			m_free(m);
3326		}
3327		DPRINTF("failed to get reply_mbuf\n");
3328
3329		goto out;
3330	}
3331
3332	if (tp->t_state != TCPS_LISTEN) {
3333		DPRINTF("socket not in listen state\n");
3334
3335		goto reject;
3336	}
3337
3338	tim.mac_addr = req->dst_mac;
3339	tim.vlan_tag = ntohs(req->vlan_tag);
3340	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3341		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3342		goto reject;
3343	}
3344
3345#ifdef notyet
3346	/*
3347	 * XXX do route lookup to confirm that we're still listening on this
3348	 * address
3349	 */
3350	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3351			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3352		goto reject;
3353	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3354		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3355	dst_release(skb->dst);	// done with the input route, release it
3356	skb->dst = NULL;
3357
3358	if ((rt_flags & RTF_LOCAL) == 0)
3359		goto reject;
3360#endif
3361	/*
3362	 * XXX
3363	 */
3364	rt_flags = RTF_LOCAL;
3365	if ((rt_flags & RTF_LOCAL) == 0)
3366		goto reject;
3367
3368	/*
3369	 * Calculate values and add to syncache
3370	 */
3371
3372	newtoep = toepcb_alloc();
3373	if (newtoep == NULL)
3374		goto reject;
3375
3376	bzero(&nam, sizeof(struct sockaddr_in));
3377
3378	nam.sin_len = sizeof(struct sockaddr_in);
3379	nam.sin_family = AF_INET;
3380	nam.sin_addr.s_addr = req->peer_ip;
3381	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3382
3383	if (dst == NULL) {
3384		printf("failed to find route\n");
3385		goto reject;
3386	}
3387	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3388	    (struct sockaddr *)&nam);
3389	if (e == NULL) {
3390		DPRINTF("failed to get l2t\n");
		goto reject;	/* XXX: avoid dereferencing a NULL l2t entry below */
3391	}
3392	/*
3393	 * Point to our listen socket until accept
3394	 */
3395	newtoep->tp_tp = tp;
3396	newtoep->tp_flags = TP_SYN_RCVD;
3397	newtoep->tp_tid = tid;
3398	newtoep->tp_toedev = tdev;
3399	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3400
3401	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3402	so_lock(so);
3403	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3404	so_unlock(so);
3405
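	/*
	 * Enable DDP only if the tunable allows it, the socket hasn't opted
	 * out, and the receive window is large enough for DDP to pay off.
	 */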
3406	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3407		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3408
3409	if (newtoep->tp_ulp_mode) {
3410		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3411
3412		if (ddp_mbuf == NULL)
3413			newtoep->tp_ulp_mode = 0;
3414	}
3415
3416	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3417	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3418	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3419	/*
3420	 * XXX workaround for lack of syncache drop
3421	 */
3422	toepcb_hold(newtoep);
3423	syncache_add_accept_req(req, so, newtoep);
3424
3425	rpl = cplhdr(reply_mbuf);
3426	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3427	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3428	rpl->wr.wr_lo = 0;
3429	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3430	rpl->opt2 = htonl(calc_opt2(so, tdev));
3431	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3432	rpl->peer_ip = req->peer_ip;	/* req->peer_ip is not overwritten */
3433
3434	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3435	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3436	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3437				  CPL_PASS_OPEN_ACCEPT);
3438
3439	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3440
3441	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3442
3443	l2t_send(cdev, reply_mbuf, e);
3444	m_free(m);
3445	if (newtoep->tp_ulp_mode) {
3446		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3447				V_TF_DDP_OFF(1) |
3448				TP_DDP_TIMER_WORKAROUND_MASK,
3449				V_TF_DDP_OFF(1) |
3450		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3451	} else
3452		printf("not offloading\n");
3453
3456	return;
3457reject:
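	/*
	 * XXX: reject can be reached before newtoep is allocated, in which
	 * case the mk_tid_release() branch below would use an uninitialized
	 * pointer.
	 */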
3458	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3459		mk_pass_accept_rpl(reply_mbuf, m);
3460	else
3461		mk_tid_release(reply_mbuf, newtoep, tid);
3462	cxgb_ofld_send(cdev, reply_mbuf);
3463	m_free(m);
3464out:
3465#if 0
3466	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3467#else
3468	return;
3469#endif
3470}
3471
3472/*
3473 * Handle a CPL_PASS_ACCEPT_REQ message.
3474 */
3475static int
3476do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3477{
3478	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3479	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3480	struct tom_data *d = listen_ctx->tom_data;
3481
3482#if VALIDATE_TID
3483	struct cpl_pass_accept_req *req = cplhdr(m);
3484	unsigned int tid = GET_TID(req);
3485	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3486
3487	if (unlikely(!lsk)) {
3488		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3489		       cdev->name,
3490		       (unsigned long)((union listen_entry *)ctx -
3491					t->stid_tab));
3492		return CPL_RET_BUF_DONE;
3493	}
3494	if (unlikely(tid >= t->ntids)) {
3495		printk(KERN_ERR "%s: passive open TID %u too large\n",
3496		       cdev->name, tid);
3497		return CPL_RET_BUF_DONE;
3498	}
3499	/*
3500	 * For T3A the current user of the TID may have closed but its last
3501	 * message(s) may have been backlogged so the TID appears to be still
3502	 * in use.  Just take the TID away, the connection can close at its
3503	 * own leisure.  For T3B this situation is a bug.
3504	 */
3505	if (!valid_new_tid(t, tid) &&
3506	    cdev->type != T3A) {
3507		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3508		       cdev->name, tid);
3509		return CPL_RET_BUF_DONE;
3510	}
3511#endif
3512
3513	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3514	return (0);
3515}
3516
3517/*
3518 * Called when a connection is established to translate the TCP options
3519 * reported by HW to FreeBSD's native format.
3520 */
3521static void
3522assign_rxopt(struct socket *so, unsigned int opt)
3523{
3524	struct tcpcb *tp = so_sototcpcb(so);
3525	struct toepcb *toep = tp->t_toe;
3526	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3527
3528	inp_lock_assert(tp->t_inpcb);
3529
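	/*
	 * The MTU table stores full MTUs; subtract 40 bytes of IP and TCP
	 * headers to arrive at the MSS.
	 */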
3530	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3531	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3532	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3533	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3534	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3535	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3536		tp->rcv_scale = tp->request_r_scale;
3537}
3538
3539/*
3540 * Completes some final bits of initialization for just established connections
3541 * and changes their state to TCP_ESTABLISHED.
3542 *
3543 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3544 */
3545static void
3546make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3547{
3548	struct tcpcb *tp = so_sototcpcb(so);
3549	struct toepcb *toep = tp->t_toe;
3550
3551	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3552	assign_rxopt(so, opt);
3553
3554	/*
3555	 *XXXXXXXXXXX
3556	 *
3557	 */
3558#ifdef notyet
3559	so->so_proto->pr_ctloutput = t3_ctloutput;
3560#endif
3561
3562#if 0
3563	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3564#endif
3565	/*
3566	 * XXX not clear what rcv_wup maps to
3567	 */
3568	/*
3569	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3570	 * pass through opt0.
3571	 */
3572	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3573		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3574
3575	dump_toepcb(toep);
3576
3577#ifdef notyet
3578/*
3579 * no clean interface for marking ARP up to date
3580 */
3581	dst_confirm(sk->sk_dst_cache);
3582#endif
3583	tp->t_starttime = ticks;
3584	tp->t_state = TCPS_ESTABLISHED;
3585	soisconnected(so);
3586}
3587
3588static int
3589syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3590{
3591
3592	struct in_conninfo inc;
3593	struct tcpopt to;
3594	struct tcphdr th;
3595	int mss, wsf, sack, ts;
3596	struct mbuf *m = NULL;
3597	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3598	unsigned int opt;
3599
3600#ifdef MAC
3601#error	"no MAC support"
3602#endif
3603
3604	opt = ntohs(req->tcp_opt);
3605
3606	bzero(&to, sizeof(struct tcpopt));
3607
3608	/*
3609	 * Fill out information for entering us into the syncache
3610	 */
3611	bzero(&inc, sizeof(inc));
3612	inc.inc_fport = th.th_sport = req->peer_port;
3613	inc.inc_lport = th.th_dport = req->local_port;
3614	th.th_seq = req->rcv_isn;
3615	th.th_flags = TH_ACK;
3616
3617	inc.inc_isipv6 = 0;
3618	inc.inc_len = 0;
3619	inc.inc_faddr.s_addr = req->peer_ip;
3620	inc.inc_laddr.s_addr = req->local_ip;
3621
3622	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3623	wsf  = G_TCPOPT_WSCALE_OK(opt);
3624	ts   = G_TCPOPT_TSTAMP(opt);
3625	sack = G_TCPOPT_SACK(opt);
3626
3627	to.to_mss = mss;
3628	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3629	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3630
3631	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3632	    ntohl(req->local_ip), ntohs(req->local_port),
3633	    ntohl(req->peer_ip), ntohs(req->peer_port),
3634	    mss, wsf, ts, sack);
3635	return (tcp_offload_syncache_expand(&inc, &to, &th, so, m));
3636}
3637
3638
3639/*
3640 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3641 * if we are in TCP_SYN_RECV due to crossed SYNs
3642 */
3643static int
3644do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3645{
3646	struct cpl_pass_establish *req = cplhdr(m);
3647	struct toepcb *toep = (struct toepcb *)ctx;
3648	struct tcpcb *tp = toep->tp_tp;
3649	struct socket *so, *lso;
3650	struct t3c_data *td = T3C_DATA(cdev);
3651	struct sockbuf *snd, *rcv;
3652
3653	// Complete socket initialization now that we have the SND_ISN
3654
3655	struct toedev *tdev;
3656
3657
3658	tdev = toep->tp_toedev;
3659
3660	inp_wlock(tp->t_inpcb);
3661
3662	/*
3663	 * XXX need to add a reference while we're manipulating the socket.
3664	 */
3666	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3667
3668	inp_wunlock(tp->t_inpcb);
3669
3670	so_lock(so);
3671	LIST_REMOVE(toep, synq_entry);
3672	so_unlock(so);
3673
3674	if (!syncache_expand_establish_req(req, &so, toep)) {
3675		/*
3676		 * No entry
3677		 */
3678		CXGB_UNIMPLEMENTED();
3679	}
3680	if (so == NULL) {
3681		/*
3682		 * Couldn't create the socket
3683		 */
3684		CXGB_UNIMPLEMENTED();
3685	}
3686
3687	tp = so_sototcpcb(so);
3688	inp_wlock(tp->t_inpcb);
3689
3690	snd = so_sockbuf_snd(so);
3691	rcv = so_sockbuf_rcv(so);
3692
3693	snd->sb_flags |= SB_NOCOALESCE;
3694	rcv->sb_flags |= SB_NOCOALESCE;
3695
3696	toep->tp_tp = tp;
3697	toep->tp_flags = 0;
3698	tp->t_toe = toep;
3699	reset_wr_list(toep);
3700	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3701	tp->rcv_nxt = toep->tp_copied_seq;
3702	install_offload_ops(so);
3703
3704	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3705	toep->tp_wr_unacked = 0;
3706	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3707	toep->tp_qset_idx = 0;
3708	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3709
3710	/*
3711	 * XXX Cancel any keep alive timer
3712	 */
3713
3714	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3715
3716	/*
3717	 * XXX workaround for lack of syncache drop
3718	 */
3719	toepcb_release(toep);
3720	inp_wunlock(tp->t_inpcb);
3721
3722	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3723	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3724#ifdef notyet
3725	/*
3726	 * XXX not sure how these checks map to us
3727	 */
3728	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3729		sk->sk_state_change(sk);
3730		sk_wake_async(so, 0, POLL_OUT);
3731	}
3732	/*
3733	 * The state for the new connection is now up to date.
3734	 * Next check if we should add the connection to the parent's
3735	 * accept queue.  When the parent closes it resets connections
3736	 * on its SYN queue, so check if we are being reset.  If so we
3737	 * don't need to do anything more, the coming ABORT_RPL will
3738	 * destroy this socket.  Otherwise move the connection to the
3739	 * accept queue.
3740	 *
3741	 * Note that we reset the synq before closing the server so if
3742	 * we are not being reset the stid is still open.
3743	 */
3744	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3745		__kfree_skb(skb);
3746		goto unlock;
3747	}
3748#endif
3749	m_free(m);
3750
3751	return (0);
3752}
3753
3754/*
3755 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3756 * and send them to the TOE.
3757 */
3758static void
3759fixup_and_send_ofo(struct toepcb *toep)
3760{
3761	struct mbuf *m;
3762	struct toedev *tdev = toep->tp_toedev;
3763	struct tcpcb *tp = toep->tp_tp;
3764	unsigned int tid = toep->tp_tid;
3765
3766	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3767
3768	inp_lock_assert(tp->t_inpcb);
3769	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3770		/*
3771		 * A variety of messages can be waiting but the fields we'll
3772		 * be touching are common to all so any message type will do.
3773		 */
3774		struct cpl_close_con_req *p = cplhdr(m);
3775
3776		p->wr.wr_lo = htonl(V_WR_TID(tid));
3777		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3778		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3779	}
3780}
3781
3782/*
3783 * Updates socket state from an active establish CPL message.  Runs with the
3784 * socket lock held.
3785 */
3786static void
3787socket_act_establish(struct socket *so, struct mbuf *m)
3788{
3789	INIT_VNET_INET(so->so_vnet);
3790	struct cpl_act_establish *req = cplhdr(m);
3791	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3792	struct tcpcb *tp = so_sototcpcb(so);
3793	struct toepcb *toep = tp->t_toe;
3794
3795	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3796		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3797		    toep->tp_tid, tp->t_state);
3798
3799	tp->ts_recent_age = ticks;
3800	tp->irs = tp->rcv_nxt = rcv_isn;
3801	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3802
3803	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3804
3805	/*
3806	 * Now that we finally have a TID send any CPL messages that we had to
3807	 * defer for lack of a TID.
3808	 */
3809	if (mbufq_len(&toep->out_of_order_queue))
3810		fixup_and_send_ofo(toep);
3811
3812	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3813		/*
3814		 * XXX does this even make sense?
3815		 */
3816		so_sorwakeup(so);
3817	}
3818	m_free(m);
3819#ifdef notyet
3820/*
3821 * XXX assume no write requests permitted while socket connection is
3822 * incomplete
3823 */
3824	/*
3825	 * Currently the send queue must be empty at this point because the
3826	 * socket layer does not send anything before a connection is
3827	 * established.  To be future proof though we handle the possibility
3828	 * that there are pending buffers to send (either TX_DATA or
3829	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3830	 * buffers according to the just learned write_seq, and then we send
3831	 * them on their way.
3832	 */
3833	fixup_pending_writeq_buffers(sk);
3834	if (t3_push_frames(so, 1))
3835		sk->sk_write_space(sk);
3836#endif
3837
3838	toep->tp_state = tp->t_state;
3839	V_tcpstat.tcps_connects++;
3840
3841}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

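/*
 * Illustrative sketch (not compiled): every CPL handler in this file has
 * the same shape as do_act_establish() above: it receives the t3cdev, the
 * mbuf carrying the CPL, and the per-TID context registered for the
 * connection, and returns 0 once the mbuf has been consumed.  The handler
 * name below is hypothetical; the real handlers are wired up by
 * t3_init_cpl_io() at the end of this file.
 */
#ifdef notyet
static int
do_example_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* Interpret cplhdr(m) and update toep/tp state here. */
	m_free(m);
	return (0);
}
#endif
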
/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct cpl_wr_ack *hdr = cplhdr(m);
	struct socket *so;
	unsigned int credits = ntohs(hdr->credits);
	u32 snd_una = ntohl(hdr->snd_una);
	int bytes = 0;
	struct sockbuf *snd;

	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);
	toep->tp_wr_avail += credits;
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	while (credits) {
		struct mbuf *p = peek_wr(toep);

		if (__predict_false(!p)) {
			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u wr_avail=%u\n",
			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
			break;
		}
		CTR2(KTR_TOM,
		    "wr_ack: p->credits=%d p->bytes=%d",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
		KASSERT(p->m_pkthdr.csum_data != 0,
		    ("empty request still on list"));

		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
#if DEBUG_WR > 1
			struct tx_data_wr *w = cplhdr(p);
			log(LOG_ERR,
			    "TID %u got %u WR credits, need %u, len %u, "
			    "main body %u, frags %u, seq # %u, ACK una %u,"
			    " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
			    toep->tp_tid, credits, p->csum, p->len,
			    p->len - p->data_len, skb_shinfo(p)->nr_frags,
			    ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
#endif
			p->m_pkthdr.csum_data -= credits;
			break;
		} else {
			dequeue_wr(toep);
			credits -= p->m_pkthdr.csum_data;
			bytes += p->m_pkthdr.len;
			CTR3(KTR_TOM,
			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);

			m_free(p);
		}
	}

#if DEBUG_WR
	check_wr_invariants(tp);
#endif

	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
#if VALIDATE_SEQ
		struct tom_data *d = TOM_DATA(TOE_DEV(so));

		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
		    toep->tp_tid, tp->snd_una);
#endif
		goto out_free;
	}

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = ticks;
#ifdef notyet
		/*
		 * Keep ARP entry "minty fresh"
		 */
		dst_confirm(sk->sk_dst_cache);
#endif
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}

	snd = so_sockbuf_snd(so);
	if (bytes) {
		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
		sockbuf_lock(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);
	}

	if (snd->sb_sndptroff < snd->sb_cc)
		t3_push_frames(so, 0);

out_free:
	inp_wunlock(tp->t_inpcb);
	m_free(m);
}

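/*
 * Illustrative sketch (not compiled): the credit accounting in wr_ack()
 * maintains the invariant that the credits still held by the queued WRs
 * equal tp_wr_max - tp_wr_avail.  The hypothetical helper below spells out
 * the check a DEBUG_WR build performs via check_wr_invariants(); it assumes
 * the write queue is linked through m_nextpkt.
 */
#ifdef notyet
static int
wr_credits_consistent(struct toepcb *toep)
{
	struct mbuf *p;
	unsigned int pending = 0;

	/* Sum the credits recorded in each queued work request. */
	for (p = peek_wr(toep); p != NULL; p = p->m_nextpkt)
		pending += p->m_pkthdr.csum_data;

	return (pending == toep->tp_wr_max - toep->tp_wr_avail);
}
#endif
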
/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	wr_ack(toep, m);
	return (0);
}

/*
 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	m_freem(m);
	return (0);
}

/*
 * Reset a connection that is on a listener's SYN queue or accept queue,
 * i.e., one that has not had a struct socket associated with it.
 * Must be called from process context.
 *
 * Modeled after code in inet_csk_listen_stop().
 */
static void
t3_reset_listen_child(struct socket *child)
{
	struct tcpcb *tp = so_sototcpcb(child);

	t3_send_reset(tp->t_toe);
}

static void
t3_child_disconnect(struct socket *so, void *arg)
{
	struct tcpcb *tp = so_sototcpcb(so);

	if (tp->t_flags & TF_TOE) {
		inp_wlock(tp->t_inpcb);
		t3_reset_listen_child(so);
		inp_wunlock(tp->t_inpcb);
	}
}

/*
 * Disconnect offloaded established but not yet accepted connections sitting
 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
 */
void
t3_disconnect_acceptq(struct socket *listen_so)
{

	so_lock(listen_so);
	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
	so_unlock(listen_so);
}

/*
 * Reset offloaded connections sitting on a server's syn queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */
void
t3_reset_synq(struct listen_ctx *lctx)
{
	struct toepcb *toep;

	so_lock(lctx->lso);
	while (!LIST_EMPTY(&lctx->synq_head)) {
		toep = LIST_FIRST(&lctx->synq_head);
		LIST_REMOVE(toep, synq_entry);
		toep->tp_tp = NULL;
		t3_send_reset(toep);
		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
		toepcb_release(toep);
	}
	so_unlock(lctx->lso);
}

int
t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
		   unsigned int pg_off, unsigned int color)
{
	unsigned int i, j, pidx;
	struct pagepod *p;
	struct mbuf *m;
	struct ulp_mem_io *req;
	unsigned int tid = toep->tp_tid;
	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;

	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
	    gl, nppods, tag, maxoff, pg_off, color);

	for (i = 0; i < nppods; ++i) {
		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
		req = mtod(m, struct ulp_mem_io *);
		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
		req->wr.wr_lo = 0;
		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
					   V_ULPTX_CMD(ULP_MEM_WRITE));
		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));

		p = (struct pagepod *)(req + 1);
		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
						  V_PPOD_COLOR(color));
			p->pp_max_offset = htonl(maxoff);
			p->pp_page_offset = htonl(pg_off);
			p->pp_rsvd = 0;
			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
				p->pp_addr[j] = pidx < gl->dgl_nelem ?
				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
		} else
			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
		send_or_defer(toep, m, 0);
		ppod_addr += PPOD_SIZE;
	}
	return (0);
}

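/*
 * Illustrative sketch (not compiled): each pagepod written above advances
 * the mapping by four pages (pp_addr[] holds five entries, the fifth
 * overlapping the first entry of the next pod), and NUM_SENTINEL_PPODS
 * invalid pods terminate the list.  A hypothetical helper sizing a pod
 * allocation for a gather list of npages pages would therefore be:
 */
#ifdef notyet
static unsigned int
ddp_nppods(unsigned int npages)
{

	return ((npages + 3) / 4 + NUM_SENTINEL_PPODS);
}
#endif
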
/*
 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
 */
static inline void
mk_cpl_barrier_ulp(struct cpl_barrier *b)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
	b->opcode = CPL_BARRIER;
}

/*
 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
 */
static inline void
mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
	req->cpuno = htons(cpuno);
}

/*
 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
 */
static inline void
mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
                     unsigned int word, uint64_t mask, uint64_t val)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
	    tid, word, mask, val);

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
	req->reply = V_NO_REPLY(1);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);
}

/*
 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
 */
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
    unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
	    V_RX_CREDITS(credits));
}

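/*
 * Illustrative sketch (not compiled): the mk_*_ulp() builders above are
 * meant to be composed into a single compound BYPASS work request, with
 * the CPLs laid out back to back behind the work_request_hdr, as
 * t3_cancel_ddpbuf() and t3_overlay_ddpbuf() below demonstrate.  This
 * hypothetical function shows the minimal pattern for one TCB field
 * update delivered that way.
 */
#ifdef notyet
static void
send_single_field_wr(struct toepcb *toep, uint64_t mask, uint64_t val)
{
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;
	unsigned int len = sizeof(*wr) + sizeof(*req);
	struct mbuf *m = m_gethdr_nofail(len);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, len);
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = len;

	/* The first (and here only) CPL immediately follows the header. */
	req = (struct cpl_set_tcb_field *)(wr + 1);
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, mask, val);

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
#endif
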
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/*
	 * Hmmm, not sure if this is actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already.  However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	if (bufidx == 0)
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the toepcb associated with the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
 */
void
t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
	 	       unsigned int tag1, unsigned int len)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_get_tcb *getreq;
	struct cpl_set_tcb_field *req;
	struct ddp_state *p = &toep->tp_ddp_state;

	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
	    bufidx, tag0, tag1, len);
#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	m->m_pkthdr.len = m->m_len = wrlen;
	bzero(wr, wrlen);

	/*
	 * Set the ATOMIC flag to make sure that TP processes the following
	 * CPLs in an atomic manner and no wire segments can be interleaved.
	 */
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
	req = (struct cpl_set_tcb_field *)(wr + 1);
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
	req++;
	if (bufidx == 0) {
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
		req++;
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			    V_TF_DDP_PUSH_DISABLE_0(1) |
			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			    V_TF_DDP_PUSH_DISABLE_0(0) |
			    V_TF_DDP_BUF0_VALID(1));
	} else {
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
		req++;
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			    V_TF_DDP_PUSH_DISABLE_1(1) |
			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			    V_TF_DDP_PUSH_DISABLE_1(0) |
			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
	}

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
		  "len %d",
		  bufidx, tag0, tag1, len);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 */
void
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
		      unsigned int len1, unsigned int offset1,
                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
		(len1 ? sizeof(*req) : 0) +
		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {                  /* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
		req++;
	}
	if (len1) {                  /* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
		req++;
	}

	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
			     ddp_flags);

	if (modulate) {
		mk_rx_data_ack_ulp(toep,
		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
		    toep->tp_copied_seq - toep->tp_rcv_wup);
		toep->tp_rcv_wup = toep->tp_copied_seq;
	}

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
		  "modulate %d",
		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
		  modulate);
#endif

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

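/*
 * Illustrative usage (hypothetical, for exposition only): a caller that
 * wants HW buffer 0 activated for a len0-byte placement, buffer 1 left
 * alone, and RX credits returned immediately might invoke the routine
 * above as
 *
 *	t3_setup_ddpbufs(toep, len0, 0, 0, 0,
 *	    V_TF_DDP_BUF0_VALID(1),
 *	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 1);
 *
 * where the value/mask pair sets BUF0_VALID and clears ACTIVE_BUF so the
 * hardware places data into buffer 0.  The exact flag combination is an
 * assumption for illustration.
 */
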
void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])     /* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;
		mbuf_wrs[i] = sgl_len <= wr_len ?
		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	wrlen = wr_len * 8;
}

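/*
 * Worked example for the table above (numbers hypothetical): an SGL entry
 * occupies 1.5 flits, so a payload of i fragments needs
 * sgl_len = (3 * i) / 2 + (i & 1) + 3 flits, including the fixed 3-flit
 * overhead added above.  With, say, wr_len = 9 flits per work request,
 * i = 8 fragments give sgl_len = 15 > 9, hence
 * mbuf_wrs[8] = 1 + (15 - 2) / (9 - 1) = 2 work requests.
 */
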
int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		       "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif

	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
	return (0);
}