cxgb_cpl_io.c revision 181011
/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 181011 2008-07-30 20:08:34Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockbuf.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <net/route.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
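
/*
 * Illustrative example (added commentary, not original driver code): an
 * iSCSI packet whose ULP submode has both the header- and data-digest bits
 * set (submode 3) carries 8 bytes of HW-generated digest, so Tx sequence
 * tracking must advance by the payload length plus t3_ulp_extra_len[3].
 * A hypothetical helper might look like:
 */
#if 0
static inline unsigned int
ulp_extra_len(const struct mbuf *m)
{
	/* m_ulp_mode holds the packet's ULP submode in this driver */
	return (t3_ulp_extra_len[m->m_ulp_mode & 3]);
}
#endif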

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * number of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
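
/*
 * Sketch of how such a table could be initialized (an illustrative
 * assumption; the driver's real setup lives elsewhere in this file and may
 * differ): if each WR carries (wrlen - sizeof(struct tx_data_wr)) / 8
 * eight-byte SGL entries, the WR count grows stepwise with the number of
 * segments.
 */
#if 0
static void
example_init_mbuf_wrs(void)
{
	unsigned int i, sgl_per_wr = (wrlen - sizeof(struct tx_data_wr)) / 8;

	mbuf_wrs[0] = mbuf_wrs[1] = 1;
	for (i = 2; i <= TX_MAX_SEGS; i++)
		mbuf_wrs[i] = 1 + howmany(i - 1, sgl_per_wr);
}
#endif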

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

695 * Called after some received data has been read.  It returns RX credits
696 * to the HW for the amount of data processed.
697 */
698void
699t3_cleanup_rbuf(struct tcpcb *tp, int copied)
700{
701	struct toepcb *toep = tp->t_toe;
702	struct socket *so;
703	struct toedev *dev;
704	int dack_mode, must_send, read;
705	u32 thres, credits, dack = 0;
706	struct sockbuf *rcv;
707
708	so = inp_inpcbtosocket(tp->t_inpcb);
709	rcv = so_sockbuf_rcv(so);
710
711	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
712		(tp->t_state == TCPS_FIN_WAIT_2))) {
713		if (copied) {
714			sockbuf_lock(rcv);
715			toep->tp_copied_seq += copied;
716			sockbuf_unlock(rcv);
717		}
718
719		return;
720	}
721
722	inp_lock_assert(tp->t_inpcb);
723
724	sockbuf_lock(rcv);
725	if (copied)
726		toep->tp_copied_seq += copied;
727	else {
728		read = toep->tp_enqueued_bytes - rcv->sb_cc;
729		toep->tp_copied_seq += read;
730	}
731	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
732	toep->tp_enqueued_bytes = rcv->sb_cc;
733	sockbuf_unlock(rcv);
734
735	if (credits > rcv->sb_mbmax) {
736		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
737		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
738	    credits = rcv->sb_mbmax;
739	}
740
741
742	/*
743	 * XXX this won't accurately reflect credit return - we need
744	 * to look at the difference between the amount that has been
745	 * put in the recv sockbuf and what is there now
746	 */
747
748	if (__predict_false(!credits))
749		return;
750
751	dev = toep->tp_toedev;
752	thres = TOM_TUNABLE(dev, rx_credit_thres);
753
754	if (__predict_false(thres == 0))
755		return;
756
757	if (is_delack_mode_valid(dev, toep)) {
758		dack_mode = TOM_TUNABLE(dev, delack);
759		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
760			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
761
762			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
763				dack = F_RX_DACK_CHANGE |
764				       V_RX_DACK_MODE(dack_mode);
765		}
766	} else
767		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
768
769	/*
770	 * For coalescing to work effectively ensure the receive window has
771	 * at least 16KB left.
772	 */
773	must_send = credits + 16384 >= tp->rcv_wnd;
774
775	if (must_send || credits >= thres)
776		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
777}
778
static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}


static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};


static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	if (toep == NULL)
		return;

	tp = toep->tp_tp;
	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
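
/*
 * Example (hypothetical table values): with mtus = {1500, 2000, 9000} and a
 * target MTU of 4000, the loop above advances past 1500 and 2000, stops
 * before 9000, and returns index 1, i.e., the largest entry not exceeding
 * the target.
 */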

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
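
/*
 * Example: for a path MTU of 1500 and an MTU table containing a 1500-byte
 * entry, the code above clamps t_maxseg to 1460 (1500 less the 40-byte
 * IP + TCP headers) and returns that entry's index.
 */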

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
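
/*
 * Example: a 256KB target window yields wscale 3, since 256K >> 3 = 32K is
 * the first shifted value that fits the 16-bit (65535) window field.
 */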

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		tp = inp_inpcbtotcpcb(inp);

		inp_wlock(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);
	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * mbuf is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		 q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {
		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

1992static void
1993handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1994{
1995	struct tcpcb *tp = toep->tp_tp;
1996	struct socket *so;
1997	struct ddp_state *q;
1998	struct ddp_buf_state *bsp;
1999	struct cpl_rx_data *hdr = cplhdr(m);
2000	unsigned int rcv_nxt = ntohl(hdr->seq);
2001	struct sockbuf *rcv;
2002
2003	if (tp->rcv_nxt == rcv_nxt)
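	/*
	 * The CPL's sequence number matches what the host already expects,
	 * so no data was placed into a DDP buffer ahead of this message.
	 */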
2004		return;
2005
2006	inp_lock_assert(tp->t_inpcb);
2007	so  = inp_inpcbtosocket(tp->t_inpcb);
2008	rcv = so_sockbuf_rcv(so);
2009	sockbuf_lock(rcv);
2010
2011	q = &toep->tp_ddp_state;
2012	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not past tp->rcv_nxt=0x%08x",
		rcv_nxt, tp->rcv_nxt));
2015	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2016	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2017	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2018	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2019
2020#ifdef T3_TRACE
2021	if ((int)m->m_pkthdr.len < 0) {
2022		t3_ddp_error(so, "handle_ddp_data: neg len");
2023	}
2024#endif
2025	m->m_ddp_gl = (unsigned char *)bsp->gl;
2026	m->m_flags |= M_DDP;
2027	m->m_cur_offset = bsp->cur_offset;
2028	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2029	if (bsp->flags & DDP_BF_NOCOPY)
2030		bsp->flags &= ~DDP_BF_NOCOPY;
2031
2032	m->m_seq = tp->rcv_nxt;
2033	tp->rcv_nxt = rcv_nxt;
2034	bsp->cur_offset += m->m_pkthdr.len;
2035	if (!(bsp->flags & DDP_BF_NOFLIP))
2036		q->cur_buf ^= 1;
2037	/*
	 * For now, don't re-enable DDP after a connection fell out of DDP
2039	 * mode.
2040	 */
2041	q->ubuf_ddp_ready = 0;
2042	sockbuf_unlock(rcv);
2043}
2044
2045/*
2046 * Process new data received for a connection.
2047 */
2048static void
2049new_rx_data(struct toepcb *toep, struct mbuf *m)
2050{
2051	struct cpl_rx_data *hdr = cplhdr(m);
2052	struct tcpcb *tp = toep->tp_tp;
2053	struct socket *so;
2054	struct sockbuf *rcv;
2055	int state;
2056	int len = be16toh(hdr->len);
2057
2058	inp_wlock(tp->t_inpcb);
2059
2060	so  = inp_inpcbtosocket(tp->t_inpcb);
2061
2062	if (__predict_false(so_no_receive(so))) {
2063		handle_excess_rx(toep, m);
2064		inp_wunlock(tp->t_inpcb);
2066		return;
2067	}
2068
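	/*
	 * On a DDP connection some of this data may already have been
	 * placed directly into a posted buffer; let the DDP code account
	 * for it before the payload is appended below.
	 */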
2069	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2070		handle_ddp_data(toep, m);
2071
2072	m->m_seq = ntohl(hdr->seq);
2073	m->m_ulp_mode = 0;                    /* for iSCSI */
2074
2075#if VALIDATE_SEQ
2076	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2077		log(LOG_ERR,
2078		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2079		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2080		       tp->rcv_nxt);
2081		m_freem(m);
2082		inp_wunlock(tp->t_inpcb);
2083		return;
2084	}
2085#endif
2086	m_adj(m, sizeof(*hdr));
2087
2088#ifdef URGENT_DATA_SUPPORTED
	/*
	 * Urgent data is not handled yet; the block below is unported
	 * Linux code retained for reference.
	 */
2092	if (__predict_false(hdr->urg))
2093		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2094	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2095		     tp->urg_seq - tp->rcv_nxt < skb->len))
2096		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2097							 tp->rcv_nxt];
2098#endif
2099	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2100		toep->tp_delack_mode = hdr->dack_mode;
2101		toep->tp_delack_seq = tp->rcv_nxt;
2102	}
2103	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2104	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2105
2106	if (len < m->m_pkthdr.len)
2107		m->m_pkthdr.len = m->m_len = len;
2108
2109	tp->rcv_nxt += m->m_pkthdr.len;
2110	tp->t_rcvtime = ticks;
2111	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2112	CTR2(KTR_TOM,
2113	    "new_rx_data: seq 0x%x len %u",
2114	    m->m_seq, m->m_pkthdr.len);
2115	inp_wunlock(tp->t_inpcb);
2116	rcv = so_sockbuf_rcv(so);
2117	sockbuf_lock(rcv);
2118#if 0
2119	if (sb_notify(rcv))
2120		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2121#endif
2122	SBAPPEND(rcv, m);
2123
#ifdef notyet
	/*
	 * We are giving the card more credits than we should; the check is
	 * disabled for now so we can keep moving.
	 */
	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, rcv->sb_cc, rcv->sb_mbmax));
#endif

2136	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2137	    rcv->sb_cc, rcv->sb_mbcnt);
2138
2139	state = so_state_get(so);
2140	if (__predict_true((state & SS_NOFDREF) == 0))
2141		so_sorwakeup_locked(so);
2142	else
2143		sockbuf_unlock(rcv);
2144}
2145
2146/*
2147 * Handler for RX_DATA CPL messages.
2148 */
2149static int
2150do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2151{
2152	struct toepcb *toep = (struct toepcb *)ctx;
2153
2154	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2155
2156	new_rx_data(toep, m);
2157
2158	return (0);
2159}
2160
2161static void
2162new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2163{
2164	struct tcpcb *tp;
2165	struct ddp_state *q;
2166	struct ddp_buf_state *bsp;
2167	struct cpl_rx_data_ddp *hdr;
2168	struct socket *so;
2169	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2170	int nomoredata = 0;
2171	unsigned int delack_mode;
2172	struct sockbuf *rcv;
2173
2174	tp = toep->tp_tp;
2175	inp_wlock(tp->t_inpcb);
2176	so = inp_inpcbtosocket(tp->t_inpcb);
2177
2178	if (__predict_false(so_no_receive(so))) {
2179
2180		handle_excess_rx(toep, m);
2181		inp_wunlock(tp->t_inpcb);
2182		return;
2183	}
2184
2185	q = &toep->tp_ddp_state;
2186	hdr = cplhdr(m);
2187	ddp_report = ntohl(hdr->u.ddp_report);
2188	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2189	bsp = &q->buf_state[buf_idx];
2190
2191	CTR4(KTR_TOM,
2192	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2193	    "hdr seq 0x%x len %u",
2194	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2195	    ntohs(hdr->len));
2196	CTR3(KTR_TOM,
2197	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2198	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2199
2200	ddp_len = ntohs(hdr->len);
2201	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2202
2203	delack_mode = G_DDP_DACK_MODE(ddp_report);
2204	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2205		toep->tp_delack_mode = delack_mode;
2206		toep->tp_delack_seq = tp->rcv_nxt;
2207	}
2208
2209	m->m_seq = tp->rcv_nxt;
2210	tp->rcv_nxt = rcv_nxt;
2211
2212	tp->t_rcvtime = ticks;
2213	/*
2214	 * Store the length in m->m_len.  We are changing the meaning of
2215	 * m->m_len here, we need to be very careful that nothing from now on
2216	 * interprets ->len of this packet the usual way.
2217	 */
2218	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2219	inp_wunlock(tp->t_inpcb);
2220	CTR3(KTR_TOM,
	    "new_rx_data_ddp: m_len=%u rcv_nxt=0x%08x rcv_nxt_prev=0x%08x",
2222	    m->m_len, rcv_nxt, m->m_seq);
	/*
	 * Figure out where the new data was placed in the buffer and store
	 * it in m_cur_offset.  Assumes the buffer offset starts at 0; the
	 * consumer needs to account for the page pod's pg_offset.
	 */
2228	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2229	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2230
2231	rcv = so_sockbuf_rcv(so);
2232	sockbuf_lock(rcv);
2233
2234	m->m_ddp_gl = (unsigned char *)bsp->gl;
2235	m->m_flags |= M_DDP;
2236	bsp->cur_offset = end_offset;
2237	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2238
2239	/*
2240	 * Length is only meaningful for kbuf
2241	 */
2242	if (!(bsp->flags & DDP_BF_NOCOPY))
2243		KASSERT(m->m_len <= bsp->gl->dgl_length,
2244		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2245			m->m_len, bsp->gl->dgl_length));
2246
2247	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2249        /*
2250	 * Bit 0 of flags stores whether the DDP buffer is completed.
2251	 * Note that other parts of the code depend on this being in bit 0.
2252	 */
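	/*
	 * For example, a completed user-buffer placement leaves
	 * (m_ddp_flags & (DDP_BF_NOCOPY | 1)) == (DDP_BF_NOCOPY | 1),
	 * which is exactly the test used before the wakeup below.
	 */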
2253	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2254		panic("spurious ddp completion");
2255	} else {
2256		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2257		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2258			q->cur_buf ^= 1;                     /* flip buffers */
2259	}
2260
2261	if (bsp->flags & DDP_BF_NOCOPY) {
2262		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2263		bsp->flags &= ~DDP_BF_NOCOPY;
2264	}
2265
2266	if (ddp_report & F_DDP_PSH)
2267		m->m_ddp_flags |= DDP_BF_PSH;
2268	if (nomoredata)
2269		m->m_ddp_flags |= DDP_BF_NODATA;
2270
2271#ifdef notyet
2272	skb_reset_transport_header(skb);
2273	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2274#endif
2275	SBAPPEND(rcv, m);
2276
2277	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2278	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2279		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2280		so_sorwakeup_locked(so);
2281	else
2282		sockbuf_unlock(rcv);
2283}
2284
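/*
 * Union of the fatal error bits a CPL_RX_DATA_DDP can report; checked
 * against ddpvld_status before the message is processed any further.
 */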
2285#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2286		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2287		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2288		 F_DDP_INVALID_PPOD)
2289
2290/*
2291 * Handler for RX_DATA_DDP CPL messages.
2292 */
2293static int
2294do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2295{
2296	struct toepcb *toep = ctx;
2297	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2298
2299	VALIDATE_SOCK(so);
2300
2301	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2302		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2303		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2304		return (CPL_RET_BUF_DONE);
2305	}
2306#if 0
2307	skb->h.th = tcphdr_skb->h.th;
2308#endif
2309	new_rx_data_ddp(toep, m);
2310	return (0);
2311}
2312
2313static void
2314process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2315{
2316	struct tcpcb *tp = toep->tp_tp;
2317	struct socket *so;
2318	struct ddp_state *q;
2319	struct ddp_buf_state *bsp;
2320	struct cpl_rx_ddp_complete *hdr;
2321	unsigned int ddp_report, buf_idx, when, delack_mode;
2322	int nomoredata = 0;
2323	struct sockbuf *rcv;
2324
2325	inp_wlock(tp->t_inpcb);
2326	so = inp_inpcbtosocket(tp->t_inpcb);
2327
2328	if (__predict_false(so_no_receive(so))) {
2329		struct inpcb *inp = so_sotoinpcb(so);
2330
2331		handle_excess_rx(toep, m);
2332		inp_wunlock(inp);
2333		return;
2334	}
2335	q = &toep->tp_ddp_state;
2336	hdr = cplhdr(m);
2337	ddp_report = ntohl(hdr->ddp_report);
2338	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2339	m->m_pkthdr.csum_data = tp->rcv_nxt;
2340
2341	rcv = so_sockbuf_rcv(so);
2342	sockbuf_lock(rcv);
2343
2344	bsp = &q->buf_state[buf_idx];
2345	when = bsp->cur_offset;
2346	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2347	tp->rcv_nxt += m->m_len;
2348	tp->t_rcvtime = ticks;
2349
2350	delack_mode = G_DDP_DACK_MODE(ddp_report);
2351	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2352		toep->tp_delack_mode = delack_mode;
2353		toep->tp_delack_seq = tp->rcv_nxt;
2354	}
2355#ifdef notyet
2356	skb_reset_transport_header(skb);
2357	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2358#endif
2359	inp_wunlock(tp->t_inpcb);
2360
2361	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2362	CTR5(KTR_TOM,
2363		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2364		  "ddp_report 0x%x offset %u, len %u",
2365		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2366		   G_DDP_OFFSET(ddp_report), m->m_len);
2367
2368	m->m_cur_offset = bsp->cur_offset;
2369	bsp->cur_offset += m->m_len;
2370
2371	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2372		q->cur_buf ^= 1;                     /* flip buffers */
2373		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata = 1;
2375	}
2376
2377	CTR4(KTR_TOM,
2378		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2379		  "ddp_report %u offset %u",
2380		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2381		   G_DDP_OFFSET(ddp_report));
2382
2383	m->m_ddp_gl = (unsigned char *)bsp->gl;
2384	m->m_flags |= M_DDP;
2385	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2386	if (bsp->flags & DDP_BF_NOCOPY)
2387		bsp->flags &= ~DDP_BF_NOCOPY;
2388	if (nomoredata)
2389		m->m_ddp_flags |= DDP_BF_NODATA;
2390
2391	SBAPPEND(rcv, m);
2392	if ((so_state_get(so) & SS_NOFDREF) == 0)
2393		so_sorwakeup_locked(so);
2394	else
2395		sockbuf_unlock(rcv);
2396}
2397
2398/*
2399 * Handler for RX_DDP_COMPLETE CPL messages.
2400 */
2401static int
2402do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2403{
2404	struct toepcb *toep = ctx;
2405
2406	VALIDATE_SOCK(so);
2407#if 0
2408	skb->h.th = tcphdr_skb->h.th;
2409#endif
2410	process_ddp_complete(toep, m);
2411	return (0);
2412}
2413
2414/*
2415 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2416 * socket state before calling tcp_time_wait to comply with its expectations.
2417 */
2418static void
2419enter_timewait(struct tcpcb *tp)
2420{
2421	/*
2422	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2423	 * process peer_close because we don't want to carry the peer FIN in
2424	 * the socket's receive queue and if we increment rcv_nxt without
2425	 * having the FIN in the receive queue we'll confuse facilities such
2426	 * as SIOCINQ.
2427	 */
2428	inp_wlock(tp->t_inpcb);
2429	tp->rcv_nxt++;
2430
2431	tp->ts_recent_age = 0;	     /* defeat recycling */
2432	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2433	inp_wunlock(tp->t_inpcb);
2434	tcp_offload_twstart(tp);
2435}
2436
2437/*
2438 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2439 * function deals with the data that may be reported along with the FIN.
2440 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2441 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed,
 * 0 that the mbuf can be freed.
2444 */
2445static int
2446handle_peer_close_data(struct socket *so, struct mbuf *m)
2447{
2448	struct tcpcb *tp = so_sototcpcb(so);
2449	struct toepcb *toep = tp->t_toe;
2450	struct ddp_state *q;
2451	struct ddp_buf_state *bsp;
2452	struct cpl_peer_close *req = cplhdr(m);
2453	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2454	struct sockbuf *rcv;
2455
2456	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2457		return (0);
2458
2459	CTR0(KTR_TOM, "handle_peer_close_data");
2460	if (__predict_false(so_no_receive(so))) {
2461		handle_excess_rx(toep, m);
2462
2463		/*
2464		 * Although we discard the data we want to process the FIN so
2465		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2466		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2467		 * may be what will close the connection.  We return 1 because
2468		 * handle_excess_rx() already freed the packet.
2469		 */
2470		return (1);
2471	}
2472
2473	inp_lock_assert(tp->t_inpcb);
2474	q = &toep->tp_ddp_state;
2475	rcv = so_sockbuf_rcv(so);
2476	sockbuf_lock(rcv);
2477
2478	bsp = &q->buf_state[q->cur_buf];
2479	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2480	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2481	m->m_ddp_gl = (unsigned char *)bsp->gl;
2482	m->m_flags |= M_DDP;
2483	m->m_cur_offset = bsp->cur_offset;
2484	m->m_ddp_flags =
2485	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2486	m->m_seq = tp->rcv_nxt;
2487	tp->rcv_nxt = rcv_nxt;
2488	bsp->cur_offset += m->m_pkthdr.len;
2489	if (!(bsp->flags & DDP_BF_NOFLIP))
2490		q->cur_buf ^= 1;
2491#ifdef notyet
2492	skb_reset_transport_header(skb);
2493	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2494#endif
2495	tp->t_rcvtime = ticks;
2496	SBAPPEND(rcv, m);
2497	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2498		so_sorwakeup_locked(so);
2499	else
2500		sockbuf_unlock(rcv);
2501
2502	return (1);
2503}
2504
2505/*
2506 * Handle a peer FIN.
2507 */
2508static void
2509do_peer_fin(struct toepcb *toep, struct mbuf *m)
2510{
2511	struct socket *so;
2512	struct tcpcb *tp = toep->tp_tp;
2513	int keep, action;
2514
2515	action = keep = 0;
2516	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2517	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2518		printf("abort_pending set\n");
2519
2520		goto out;
2521	}
2522	inp_wlock(tp->t_inpcb);
2523	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2524	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2525		keep = handle_peer_close_data(so, m);
2526		if (keep < 0) {
2527			inp_wunlock(tp->t_inpcb);
2528			return;
2529		}
2530	}
2531	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2532		CTR1(KTR_TOM,
2533		    "waking up waiters for cantrcvmore on %p ", so);
2534		socantrcvmore(so);
2535
2536		/*
2537		 * If connection is half-synchronized
2538		 * (ie NEEDSYN flag on) then delay ACK,
2539		 * so it may be piggybacked when SYN is sent.
2540		 * Otherwise, since we received a FIN then no
2541		 * more input can be expected, send ACK now.
2542		 */
2543		if (tp->t_flags & TF_NEEDSYN)
2544			tp->t_flags |= TF_DELACK;
2545		else
2546			tp->t_flags |= TF_ACKNOW;
2547		tp->rcv_nxt++;
2548	}
2549
2550	switch (tp->t_state) {
2551	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
2554	case TCPS_ESTABLISHED:
2555		tp->t_state = TCPS_CLOSE_WAIT;
2556		break;
2557	case TCPS_FIN_WAIT_1:
2558		tp->t_state = TCPS_CLOSING;
2559		break;
2560	case TCPS_FIN_WAIT_2:
2561		/*
2562		 * If we've sent an abort_req we must have sent it too late,
2563		 * HW will send us a reply telling us so, and this peer_close
2564		 * is really the last message for this connection and needs to
2565		 * be treated as an abort_rpl, i.e., transition the connection
2566		 * to TCP_CLOSE (note that the host stack does this at the
2567		 * time of generating the RST but we must wait for HW).
2568		 * Otherwise we enter TIME_WAIT.
2569		 */
2570		t3_release_offload_resources(toep);
2571		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2572			action = TCP_CLOSE;
2573		} else {
2574			action = TCP_TIMEWAIT;
2575		}
2576		break;
2577	default:
2578		log(LOG_ERR,
2579		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2580		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2581	}
2582	inp_wunlock(tp->t_inpcb);
2583
2584	if (action == TCP_TIMEWAIT) {
2585		enter_timewait(tp);
2586	} else if (action == TCP_DROP) {
2587		tcp_offload_drop(tp, 0);
2588	} else if (action == TCP_CLOSE) {
2589		tcp_offload_close(tp);
2590	}
2591
2592#ifdef notyet
2593	/* Do not send POLL_HUP for half duplex close. */
2594	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2595	    sk->sk_state == TCP_CLOSE)
2596		sk_wake_async(so, 1, POLL_HUP);
2597	else
2598		sk_wake_async(so, 1, POLL_IN);
2599#endif
2600
2601out:
2602	if (!keep)
2603		m_free(m);
2604}
2605
2606/*
2607 * Handler for PEER_CLOSE CPL messages.
2608 */
2609static int
2610do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2611{
2612	struct toepcb *toep = (struct toepcb *)ctx;
2613
2614	VALIDATE_SOCK(so);
2615
2616	do_peer_fin(toep, m);
2617	return (0);
2618}
2619
2620static void
2621process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2622{
2623	struct cpl_close_con_rpl *rpl = cplhdr(m);
2624	struct tcpcb *tp = toep->tp_tp;
2625	struct socket *so;
2626	int action = 0;
2627	struct sockbuf *rcv;
2628
2629	inp_wlock(tp->t_inpcb);
2630	so = inp_inpcbtosocket(tp->t_inpcb);
2631
2632	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2633
2634	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2635		inp_wunlock(tp->t_inpcb);
2636		goto out;
2637	}
2638
2639	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2640	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2641
2642	switch (tp->t_state) {
2643	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2644		t3_release_offload_resources(toep);
2645		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2646			action = TCP_CLOSE;
2647
2648		} else {
2649			action = TCP_TIMEWAIT;
2650		}
2651		break;
2652	case TCPS_LAST_ACK:
2653		/*
2654		 * In this state we don't care about pending abort_rpl.
2655		 * If we've sent abort_req it was post-close and was sent too
2656		 * late, this close_con_rpl is the actual last message.
2657		 */
2658		t3_release_offload_resources(toep);
2659		action = TCP_CLOSE;
2660		break;
2661	case TCPS_FIN_WAIT_1:
2662		/*
2663		 * If we can't receive any more
2664		 * data, then closing user can proceed.
2665		 * Starting the timer is contrary to the
2666		 * specification, but if we don't get a FIN
2667		 * we'll hang forever.
2668		 *
2669		 * XXXjl:
2670		 * we should release the tp also, and use a
2671		 * compressed state.
2672		 */
2673		if (so)
2674			rcv = so_sockbuf_rcv(so);
2675		else
2676			break;
2677
2678		if (rcv->sb_state & SBS_CANTRCVMORE) {
2679			int timeout;
2680
2681			if (so)
2682				soisdisconnected(so);
2683			timeout = (tcp_fast_finwait2_recycle) ?
2684			    tcp_finwait2_timeout : tcp_maxidle;
2685			tcp_timer_activate(tp, TT_2MSL, timeout);
2686		}
2687		tp->t_state = TCPS_FIN_WAIT_2;
2688		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2689		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2690			action = TCP_DROP;
2691		}
2692
2693		break;
2694	default:
2695		log(LOG_ERR,
2696		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2697		       toep->tp_toedev->tod_name, toep->tp_tid,
2698		       tp->t_state);
2699	}
2700	inp_wunlock(tp->t_inpcb);
2701
2702
2703	if (action == TCP_TIMEWAIT) {
2704		enter_timewait(tp);
2705	} else if (action == TCP_DROP) {
2706		tcp_offload_drop(tp, 0);
2707	} else if (action == TCP_CLOSE) {
2708		tcp_offload_close(tp);
2709	}
2710out:
2711	m_freem(m);
2712}
2713
2714/*
2715 * Handler for CLOSE_CON_RPL CPL messages.
2716 */
2717static int
2718do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2719			    void *ctx)
2720{
2721	struct toepcb *toep = (struct toepcb *)ctx;
2722
2723	process_close_con_rpl(toep, m);
2724	return (0);
2725}
2726
2727/*
2728 * Process abort replies.  We only process these messages if we anticipate
2729 * them as the coordination between SW and HW in this area is somewhat lacking
2730 * and sometimes we get ABORT_RPLs after we are done with the connection that
2731 * originated the ABORT_REQ.
2732 */
2733static void
2734process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2735{
2736	struct tcpcb *tp = toep->tp_tp;
2737	struct socket *so;
2738	int needclose = 0;
2739
2740#ifdef T3_TRACE
2741	T3_TRACE1(TIDTB(sk),
2742		  "process_abort_rpl: GTS rpl pending %d",
2743		  sock_flag(sk, ABORT_RPL_PENDING));
2744#endif
2745
2746	inp_wlock(tp->t_inpcb);
2747	so = inp_inpcbtosocket(tp->t_inpcb);
2748
2749	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2750		/*
2751		 * XXX panic on tcpdrop
2752		 */
2753		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2754			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2755		else {
2756			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2757			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2758			    !is_t3a(toep->tp_toedev)) {
2759				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2760					panic("TP_ABORT_REQ_RCVD set");
2761				t3_release_offload_resources(toep);
2762				needclose = 1;
2763			}
2764		}
2765	}
2766	inp_wunlock(tp->t_inpcb);
2767
2768	if (needclose)
2769		tcp_offload_close(tp);
2770
2771	m_free(m);
2772}
2773
2774/*
2775 * Handle an ABORT_RPL_RSS CPL message.
2776 */
2777static int
2778do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2779{
2780	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2781	struct toepcb *toep;
2782
2783	/*
2784	 * Ignore replies to post-close aborts indicating that the abort was
2785	 * requested too late.  These connections are terminated when we get
2786	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2787	 * arrives the TID is either no longer used or it has been recycled.
2788	 */
2789	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2790discard:
2791		m_free(m);
2792		return (0);
2793	}
2794
2795	toep = (struct toepcb *)ctx;
2796
2797        /*
2798	 * Sometimes we've already closed the socket, e.g., a post-close
2799	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2800	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2801	 * but FW turns the ABORT_REQ into a regular one and so we get
2802	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2803	 */
2804	if (!toep)
2805		goto discard;
2806
2807	if (toep->tp_tp == NULL) {
2808		log(LOG_NOTICE, "removing tid for abort\n");
2809		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2810		if (toep->tp_l2t)
2811			l2t_release(L2DATA(cdev), toep->tp_l2t);
2812
2813		toepcb_release(toep);
2814		goto discard;
2815	}
2816
2817	log(LOG_NOTICE, "toep=%p\n", toep);
2818	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2819
2820	toepcb_hold(toep);
2821	process_abort_rpl(toep, m);
2822	toepcb_release(toep);
2823	return (0);
2824}
2825
2826/*
2827 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2828 * indicate whether RST should be sent in response.
2829 */
2830static int
2831abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2832{
2833	struct tcpcb *tp = so_sototcpcb(so);
2834
2835	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
#if 0
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
#endif
		/* FALLTHROUGH */
	case CPL_ERR_CONN_RESET:
		/* XXX need to handle SYN_RECV due to crossed SYNs */
2842		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2843	case CPL_ERR_XMIT_TIMEDOUT:
2844	case CPL_ERR_PERSIST_TIMEDOUT:
2845	case CPL_ERR_FINWAIT2_TIMEDOUT:
2846	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2847#if 0
2848		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2849#endif
2850		return (ETIMEDOUT);
2851	default:
2852		return (EIO);
2853	}
2854}
2855
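/*
 * Fill in an ABORT_RPL work request: the WR header addressed to the TID,
 * the CPL opcode/TID, and the command indicating whether to send an RST.
 */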
2856static inline void
2857set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2858{
2859	struct cpl_abort_rpl *rpl = cplhdr(m);
2860
2861	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2862	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2863	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2864
2865	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2866	rpl->cmd = cmd;
2867}
2868
2869static void
2870send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2871{
2872	struct mbuf *reply_mbuf;
2873	struct cpl_abort_req_rss *req = cplhdr(m);
2874
2875	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2878	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2879	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2880	m_free(m);
2881}
2882
2883/*
 * Returns whether an ABORT_REQ_RSS message is negative advice.
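 * Negative advice reports transient trouble (retransmit or persist
 * backoff) rather than a dead connection; do_abort_req drops such
 * requests without tearing the connection down.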
2885 */
2886static inline int
2887is_neg_adv_abort(unsigned int status)
2888{
2889	return status == CPL_ERR_RTX_NEG_ADVICE ||
2890	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2891}
2892
2893static void
2894send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2895{
2896	struct mbuf  *reply_mbuf;
2897	struct cpl_abort_req_rss *req = cplhdr(m);
2898
2899	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2900
2901	if (!reply_mbuf) {
		/* Defer the reply.  Stick rst_status into req->status. */
2903		req->status = rst_status;
2904		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2905		return;
2906	}
2907
2908	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2909	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2910	m_free(m);
2911
2912	/*
2913	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2914	 * these messages while ARP is pending.  For other connection states
2915	 * it's not a problem.
2916	 */
2917	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2918}
2919
2920#ifdef notyet
2921static void
2922cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2923{
2924	CXGB_UNIMPLEMENTED();
2925#ifdef notyet
2926	struct request_sock *req = child->sk_user_data;
2927
2928	inet_csk_reqsk_queue_removed(parent, req);
2929	synq_remove(tcp_sk(child));
2930	__reqsk_free(req);
2931	child->sk_user_data = NULL;
2932#endif
2933}
2934
2935
2936/*
2937 * Performs the actual work to abort a SYN_RECV connection.
2938 */
2939static void
2940do_abort_syn_rcv(struct socket *child, struct socket *parent)
2941{
2942	struct tcpcb *parenttp = so_sototcpcb(parent);
2943	struct tcpcb *childtp = so_sototcpcb(child);
2944
2945	/*
2946	 * If the server is still open we clean up the child connection,
2947	 * otherwise the server already did the clean up as it was purging
2948	 * its SYN queue and the skb was just sitting in its backlog.
2949	 */
2950	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2951		cleanup_syn_rcv_conn(child, parent);
2952		inp_wlock(childtp->t_inpcb);
2953		t3_release_offload_resources(childtp->t_toe);
2954		inp_wunlock(childtp->t_inpcb);
2955		tcp_offload_close(childtp);
2956	}
2957}
2958#endif
2959
2960/*
2961 * Handle abort requests for a SYN_RECV connection.  These need extra work
2962 * because the socket is on its parent's SYN queue.
2963 */
2964static int
2965abort_syn_rcv(struct socket *so, struct mbuf *m)
2966{
2967	CXGB_UNIMPLEMENTED();
2968#ifdef notyet
2969	struct socket *parent;
2970	struct toedev *tdev = toep->tp_toedev;
2971	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2972	struct socket *oreq = so->so_incomp;
2973	struct t3c_tid_entry *t3c_stid;
2974	struct tid_info *t;
2975
2976	if (!oreq)
2977		return -1;        /* somehow we are not on the SYN queue */
2978
2979	t = &(T3C_DATA(cdev))->tid_maps;
2980	t3c_stid = lookup_stid(t, oreq->ts_recent);
2981	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2982
2983	so_lock(parent);
2984	do_abort_syn_rcv(so, parent);
2985	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2986	so_unlock(parent);
2987#endif
2988	return (0);
2989}
2990
2991/*
2992 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2993 * request except that we need to reply to it.
2994 */
2995static void
2996process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2997{
2998	int rst_status = CPL_ABORT_NO_RST;
2999	const struct cpl_abort_req_rss *req = cplhdr(m);
3000	struct tcpcb *tp = toep->tp_tp;
3001	struct socket *so;
3002	int needclose = 0;
3003
3004	inp_wlock(tp->t_inpcb);
3005	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3006	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3007		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3008		m_free(m);
3009		goto skip;
3010	}
3011
3012	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3013	/*
3014	 * Three cases to consider:
3015	 * a) We haven't sent an abort_req; close the connection.
3016	 * b) We have sent a post-close abort_req that will get to TP too late
3017	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3018	 *    be ignored and the connection should be closed now.
3019	 * c) We have sent a regular abort_req that will get to TP too late.
3020	 *    That will generate an abort_rpl with status 0, wait for it.
3021	 */
3022	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3023	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3024		int error;
3025
3026		error = abort_status_to_errno(so, req->status,
3027		    &rst_status);
3028		so_error_set(so, error);
3029
3030		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3031			so_sorwakeup(so);
3032		/*
3033		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
3035		 */
3036		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3037			goto skip;
3038
3039		t3_release_offload_resources(toep);
3040		needclose = 1;
3041	}
3042	inp_wunlock(tp->t_inpcb);
3043
3044	if (needclose)
3045		tcp_offload_close(tp);
3046
3047	send_abort_rpl(m, tdev, rst_status);
3048	return;
3049skip:
3050	inp_wunlock(tp->t_inpcb);
3051}
3052
3053/*
3054 * Handle an ABORT_REQ_RSS CPL message.
3055 */
3056static int
3057do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3058{
3059	const struct cpl_abort_req_rss *req = cplhdr(m);
3060	struct toepcb *toep = (struct toepcb *)ctx;
3061
3062	if (is_neg_adv_abort(req->status)) {
3063		m_free(m);
3064		return (0);
3065	}
3066
3067	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3068
3069	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3070		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3071		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3072
3073		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3074		if (toep->tp_l2t)
3075			l2t_release(L2DATA(cdev), toep->tp_l2t);
3076
3077		/*
3078		 *  Unhook
3079		 */
3080		toep->tp_tp->t_toe = NULL;
3081		toep->tp_tp->t_flags &= ~TF_TOE;
3082		toep->tp_tp = NULL;
3083		/*
3084		 * XXX need to call syncache_chkrst - but we don't
3085		 * have a way of doing that yet
3086		 */
3087		toepcb_release(toep);
3088		log(LOG_ERR, "abort for unestablished connection :-(\n");
3089		return (0);
3090	}
3091	if (toep->tp_tp == NULL) {
3092		log(LOG_NOTICE, "disconnected toepcb\n");
3093		/* should be freed momentarily */
3094		return (0);
3095	}
3096
3097
3098	toepcb_hold(toep);
3099	process_abort_req(toep, m, toep->tp_toedev);
3100	toepcb_release(toep);
3101	return (0);
3102}
3103#ifdef notyet
3104static void
3105pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3106{
3107	struct toedev *tdev = TOE_DEV(parent);
3108
3109	do_abort_syn_rcv(child, parent);
3110	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3111		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3112
3113		rpl->opt0h = htonl(F_TCAM_BYPASS);
3114		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3115		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3116	} else
3117		m_free(m);
3118}
3119#endif
3120static void
3121handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3122{
3123	CXGB_UNIMPLEMENTED();
3124
3125#ifdef notyet
3126	struct t3cdev *cdev;
3127	struct socket *parent;
3128	struct socket *oreq;
3129	struct t3c_tid_entry *t3c_stid;
3130	struct tid_info *t;
3131	struct tcpcb *otp, *tp = so_sototcpcb(so);
3132	struct toepcb *toep = tp->t_toe;
3133
3134	/*
3135	 * If the connection is being aborted due to the parent listening
3136	 * socket going away there's nothing to do, the ABORT_REQ will close
3137	 * the connection.
3138	 */
3139	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3140		m_free(m);
3141		return;
3142	}
3143
3144	oreq = so->so_incomp;
3145	otp = so_sototcpcb(oreq);
3146
3147	cdev = T3C_DEV(so);
3148	t = &(T3C_DATA(cdev))->tid_maps;
3149	t3c_stid = lookup_stid(t, otp->ts_recent);
3150	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3151
3152	so_lock(parent);
3153	pass_open_abort(so, parent, m);
3154	so_unlock(parent);
3155#endif
3156}
3157
3158/*
3159 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3160 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3161 * connection.
3162 */
3163static void
3164pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3165{
3166
3167#ifdef notyet
3168	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3169	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3170#endif
3171	handle_pass_open_arp_failure(m_get_socket(m), m);
3172}
3173
3174/*
3175 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3176 */
3177static void
3178mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3179{
3180	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3181	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3182	unsigned int tid = GET_TID(req);
3183
3184	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3185	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3186	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3187	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3188	rpl->opt0h = htonl(F_TCAM_BYPASS);
3189	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3190	rpl->opt2 = 0;
3191	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3192}
3193
3194/*
3195 * Send a deferred reject to an accept request.
3196 */
3197static void
3198reject_pass_request(struct toedev *tdev, struct mbuf *m)
3199{
3200	struct mbuf *reply_mbuf;
3201
3202	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3203	mk_pass_accept_rpl(reply_mbuf, m);
3204	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3205	m_free(m);
3206}
3207
3208static void
3209handle_syncache_event(int event, void *arg)
3210{
3211	struct toepcb *toep = arg;
3212
3213	switch (event) {
3214	case TOE_SC_ENTRY_PRESENT:
3215		/*
3216		 * entry already exists - free toepcb
3217		 * and l2t
3218		 */
3219		printf("syncache entry present\n");
3220		toepcb_release(toep);
3221		break;
3222	case TOE_SC_DROP:
3223		/*
3224		 * The syncache has given up on this entry
3225		 * either it timed out, or it was evicted
3226		 * we need to explicitly release the tid
3227		 */
3228		printf("syncache entry dropped\n");
3229		toepcb_release(toep);
3230		break;
3231	default:
3232		log(LOG_ERR, "unknown syncache event %d\n", event);
3233		break;
3234	}
3235}
3236
3237static void
3238syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3239{
3240	struct in_conninfo inc;
3241	struct tcpopt to;
3242	struct tcphdr th;
3243	struct inpcb *inp;
3244	int mss, wsf, sack, ts;
3245	uint32_t rcv_isn = ntohl(req->rcv_isn);
3246
3247	bzero(&to, sizeof(struct tcpopt));
3248	inp = so_sotoinpcb(lso);
3249
3250	/*
3251	 * Fill out information for entering us into the syncache
3252	 */
3253	inc.inc_fport = th.th_sport = req->peer_port;
3254	inc.inc_lport = th.th_dport = req->local_port;
3255	th.th_seq = req->rcv_isn;
3256	th.th_flags = TH_SYN;
3257
3258	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
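	/*
	 * All receive-side bookkeeping starts at the peer's ISN + 1,
	 * since the SYN itself consumes one sequence number.
	 */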
3259
3260
3261	inc.inc_isipv6 = 0;
3262	inc.inc_len = 0;
3263	inc.inc_faddr.s_addr = req->peer_ip;
3264	inc.inc_laddr.s_addr = req->local_ip;
3265
3266	DPRINTF("syncache add of %d:%d %d:%d\n",
3267	    ntohl(req->local_ip), ntohs(req->local_port),
3268	    ntohl(req->peer_ip), ntohs(req->peer_port));
3269
3270	mss = req->tcp_options.mss;
3271	wsf = req->tcp_options.wsf;
3272	ts = req->tcp_options.tstamp;
3273	sack = req->tcp_options.sack;
3274	to.to_mss = mss;
3275	to.to_wscale = wsf;
3276	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3277	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3278}
3279
3280
3281/*
3282 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3283 * lock held.  Note that the sock here is a listening socket that is not owned
3284 * by the TOE.
3285 */
3286static void
3287process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3288    struct listen_ctx *lctx)
3289{
3290	int rt_flags;
3291	struct l2t_entry *e;
3292	struct iff_mac tim;
3293	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3294	struct cpl_pass_accept_rpl *rpl;
3295	struct cpl_pass_accept_req *req = cplhdr(m);
3296	unsigned int tid = GET_TID(req);
3297	struct tom_data *d = TOM_DATA(tdev);
3298	struct t3cdev *cdev = d->cdev;
3299	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *newtoep = NULL;
3301	struct rtentry *dst;
3302	struct sockaddr_in nam;
3303	struct t3c_data *td = T3C_DATA(cdev);
3304
3305	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3306	if (__predict_false(reply_mbuf == NULL)) {
3307		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3308			t3_defer_reply(m, tdev, reject_pass_request);
3309		else {
3310			cxgb_queue_tid_release(cdev, tid);
3311			m_free(m);
3312		}
3313		DPRINTF("failed to get reply_mbuf\n");
3314
3315		goto out;
3316	}
3317
3318	if (tp->t_state != TCPS_LISTEN) {
3319		DPRINTF("socket not in listen state\n");
3320
3321		goto reject;
3322	}
3323
3324	tim.mac_addr = req->dst_mac;
3325	tim.vlan_tag = ntohs(req->vlan_tag);
3326	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3327		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3328		goto reject;
3329	}
3330
3331#ifdef notyet
3332	/*
3333	 * XXX do route lookup to confirm that we're still listening on this
3334	 * address
3335	 */
3336	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3337			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3338		goto reject;
3339	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3340		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3341	dst_release(skb->dst);	// done with the input route, release it
3342	skb->dst = NULL;
3343
3344	if ((rt_flags & RTF_LOCAL) == 0)
3345		goto reject;
3346#endif
3347	/*
3348	 * XXX
3349	 */
3350	rt_flags = RTF_LOCAL;
3351	if ((rt_flags & RTF_LOCAL) == 0)
3352		goto reject;
3353
3354	/*
3355	 * Calculate values and add to syncache
3356	 */
3357
3358	newtoep = toepcb_alloc();
3359	if (newtoep == NULL)
3360		goto reject;
3361
3362	bzero(&nam, sizeof(struct sockaddr_in));
3363
3364	nam.sin_len = sizeof(struct sockaddr_in);
3365	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr = req->peer_ip;
3367	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3368
3369	if (dst == NULL) {
3370		printf("failed to find route\n");
3371		goto reject;
3372	}
3373	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3374	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
		goto reject;
	}
3378	/*
3379	 * Point to our listen socket until accept
3380	 */
3381	newtoep->tp_tp = tp;
3382	newtoep->tp_flags = TP_SYN_RCVD;
3383	newtoep->tp_tid = tid;
3384	newtoep->tp_toedev = tdev;
3385	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3386
3387	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3388	so_lock(so);
3389	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3390	so_unlock(so);
3391
3392	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
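	/*
	 * DDP is used only if enabled by the tunable, not disabled on the
	 * socket, and the receive window is big enough to be worthwhile.
	 */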
	newtoep->tp_ulp_mode = (TOM_TUNABLE(tdev, ddp) &&
	    !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN) ? ULP_MODE_TCPDDP : 0;
3395	if (newtoep->tp_ulp_mode) {
3396		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3397
3398		if (ddp_mbuf == NULL)
3399			newtoep->tp_ulp_mode = 0;
3400	}
3401
3402	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3403	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3404	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3405	/*
3406	 * XXX workaround for lack of syncache drop
3407	 */
3408	toepcb_hold(newtoep);
3409	syncache_add_accept_req(req, so, newtoep);
3410
3411	rpl = cplhdr(reply_mbuf);
3412	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3413	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3414	rpl->wr.wr_lo = 0;
3415	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3416	rpl->opt2 = htonl(calc_opt2(so, tdev));
3417	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3418	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3419
3420	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3421	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3422	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3423				  CPL_PASS_OPEN_ACCEPT);
3424
3425	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3426
3427	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3428
3429	l2t_send(cdev, reply_mbuf, e);
3430	m_free(m);
3431	if (newtoep->tp_ulp_mode) {
3432		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3433				V_TF_DDP_OFF(1) |
3434				TP_DDP_TIMER_WORKAROUND_MASK,
3435				V_TF_DDP_OFF(1) |
3436		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
	} else
		printf("not offloading\n");

	return;
3443reject:
3444	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3445		mk_pass_accept_rpl(reply_mbuf, m);
3446	else
3447		mk_tid_release(reply_mbuf, newtoep, tid);
3448	cxgb_ofld_send(cdev, reply_mbuf);
3449	m_free(m);
3450out:
3451#if 0
3452	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3453#else
3454	return;
3455#endif
3456}
3457
3458/*
3459 * Handle a CPL_PASS_ACCEPT_REQ message.
3460 */
3461static int
3462do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3463{
3464	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3465	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3466	struct tom_data *d = listen_ctx->tom_data;
3467
3468#if VALIDATE_TID
3469	struct cpl_pass_accept_req *req = cplhdr(m);
3470	unsigned int tid = GET_TID(req);
3471	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3472
3473	if (unlikely(!lsk)) {
3474		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3475		       cdev->name,
3476		       (unsigned long)((union listen_entry *)ctx -
3477					t->stid_tab));
3478		return CPL_RET_BUF_DONE;
3479	}
3480	if (unlikely(tid >= t->ntids)) {
3481		printk(KERN_ERR "%s: passive open TID %u too large\n",
3482		       cdev->name, tid);
3483		return CPL_RET_BUF_DONE;
3484	}
3485	/*
3486	 * For T3A the current user of the TID may have closed but its last
3487	 * message(s) may have been backlogged so the TID appears to be still
3488	 * in use.  Just take the TID away, the connection can close at its
3489	 * own leisure.  For T3B this situation is a bug.
3490	 */
3491	if (!valid_new_tid(t, tid) &&
3492	    cdev->type != T3A) {
3493		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3494		       cdev->name, tid);
3495		return CPL_RET_BUF_DONE;
3496	}
3497#endif
3498
3499	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3500	return (0);
3501}
3502
3503/*
3504 * Called when a connection is established to translate the TCP options
3505 * reported by HW to FreeBSD's native format.
3506 */
3507static void
3508assign_rxopt(struct socket *so, unsigned int opt)
3509{
3510	struct tcpcb *tp = so_sototcpcb(so);
3511	struct toepcb *toep = tp->t_toe;
3512	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3513
3514	inp_lock_assert(tp->t_inpcb);
3515
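	/* The MTU table entry less 40 bytes of IP and TCP headers is the MSS. */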
3516	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3517	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3518	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3519	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3520	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3521	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3522		tp->rcv_scale = tp->request_r_scale;
3523}
3524
3525/*
3526 * Completes some final bits of initialization for just established connections
3527 * and changes their state to TCP_ESTABLISHED.
3528 *
3529 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3530 */
3531static void
3532make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3533{
3534	struct tcpcb *tp = so_sototcpcb(so);
3535	struct toepcb *toep = tp->t_toe;
3536
3537	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3538	assign_rxopt(so, opt);
3539
	/*
	 * XXX: pr_ctloutput is not redirected to t3_ctloutput yet.
	 */
3544#ifdef notyet
3545	so->so_proto->pr_ctloutput = t3_ctloutput;
3546#endif
3547
3548#if 0
3549	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3550#endif
3551	/*
3552	 * XXX not clear what rcv_wup maps to
3553	 */
3554	/*
3555	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3556	 * pass through opt0.
3557	 */
3558	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3559		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3560
3561	dump_toepcb(toep);
3562
3563#ifdef notyet
3564/*
3565 * no clean interface for marking ARP up to date
3566 */
3567	dst_confirm(sk->sk_dst_cache);
3568#endif
3569	tp->t_starttime = ticks;
3570	tp->t_state = TCPS_ESTABLISHED;
3571	soisconnected(so);
3572}
3573
3574static int
3575syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3576{
3577
3578	struct in_conninfo inc;
3579	struct tcpopt to;
3580	struct tcphdr th;
3581	int mss, wsf, sack, ts;
3582	struct mbuf *m = NULL;
3583	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3584	unsigned int opt;
3585
3586#ifdef MAC
3587#error	"no MAC support"
3588#endif
3589
3590	opt = ntohs(req->tcp_opt);
3591
3592	bzero(&to, sizeof(struct tcpopt));
3593
3594	/*
3595	 * Fill out information for entering us into the syncache
3596	 */
3597	inc.inc_fport = th.th_sport = req->peer_port;
3598	inc.inc_lport = th.th_dport = req->local_port;
3599	th.th_seq = req->rcv_isn;
3600	th.th_flags = TH_ACK;
3601
3602	inc.inc_isipv6 = 0;
3603	inc.inc_len = 0;
3604	inc.inc_faddr.s_addr = req->peer_ip;
3605	inc.inc_laddr.s_addr = req->local_ip;
3606
3607	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3608	wsf  = G_TCPOPT_WSCALE_OK(opt);
3609	ts   = G_TCPOPT_TSTAMP(opt);
3610	sack = G_TCPOPT_SACK(opt);
3611
3612	to.to_mss = mss;
3613	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3614	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3615
3616	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3617	    ntohl(req->local_ip), ntohs(req->local_port),
3618	    ntohl(req->peer_ip), ntohs(req->peer_port),
3619	    mss, wsf, ts, sack);
3620	return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3621}
3622
3623
3624/*
3625 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3626 * if we are in TCP_SYN_RECV due to crossed SYNs
3627 */
3628static int
3629do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3630{
3631	struct cpl_pass_establish *req = cplhdr(m);
3632	struct toepcb *toep = (struct toepcb *)ctx;
3633	struct tcpcb *tp = toep->tp_tp;
3634	struct socket *so, *lso;
3635	struct t3c_data *td = T3C_DATA(cdev);
3636	struct sockbuf *snd, *rcv;

	/* Complete socket initialization now that we have the SND_ISN. */
	struct toedev *tdev;

	tdev = toep->tp_toedev;
3644
3645	inp_wlock(tp->t_inpcb);
3646
	/*
	 * XXX need to add a reference while we're manipulating the socket.
	 */
3651	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3652
3653	inp_wunlock(tp->t_inpcb);
3654
3655	so_lock(so);
3656	LIST_REMOVE(toep, synq_entry);
3657	so_unlock(so);
3658
3659	if (!syncache_expand_establish_req(req, &so, toep)) {
3660		/*
3661		 * No entry
3662		 */
3663		CXGB_UNIMPLEMENTED();
3664	}
3665	if (so == NULL) {
3666		/*
3667		 * Couldn't create the socket
3668		 */
3669		CXGB_UNIMPLEMENTED();
3670	}
3671
3672	tp = so_sototcpcb(so);
3673	inp_wlock(tp->t_inpcb);
3674
3675	snd = so_sockbuf_snd(so);
3676	rcv = so_sockbuf_rcv(so);
3677
3678	snd->sb_flags |= SB_NOCOALESCE;
3679	rcv->sb_flags |= SB_NOCOALESCE;
3680
3681	toep->tp_tp = tp;
3682	toep->tp_flags = 0;
3683	tp->t_toe = toep;
3684	reset_wr_list(toep);
3685	tp->rcv_wnd = select_rcv_wnd(tdev, so);
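	/* tp_copied_seq was seeded with rcv_isn + 1 when the SYN arrived. */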
3686	tp->rcv_nxt = toep->tp_copied_seq;
3687	install_offload_ops(so);
3688
3689	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3690	toep->tp_wr_unacked = 0;
3691	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3692	toep->tp_qset_idx = 0;
3693	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3694
3695	/*
3696	 * XXX Cancel any keep alive timer
3697	 */
3698
3699	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3700
3701	/*
3702	 * XXX workaround for lack of syncache drop
3703	 */
3704	toepcb_release(toep);
3705	inp_wunlock(tp->t_inpcb);
3706
3707	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3708	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3709#ifdef notyet
3710	/*
3711	 * XXX not sure how these checks map to us
3712	 */
3713	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3714		sk->sk_state_change(sk);
3715		sk_wake_async(so, 0, POLL_OUT);
3716	}
3717	/*
3718	 * The state for the new connection is now up to date.
3719	 * Next check if we should add the connection to the parent's
3720	 * accept queue.  When the parent closes it resets connections
3721	 * on its SYN queue, so check if we are being reset.  If so we
3722	 * don't need to do anything more, the coming ABORT_RPL will
3723	 * destroy this socket.  Otherwise move the connection to the
3724	 * accept queue.
3725	 *
3726	 * Note that we reset the synq before closing the server so if
3727	 * we are not being reset the stid is still open.
3728	 */
3729	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3730		__kfree_skb(skb);
3731		goto unlock;
3732	}
3733#endif
3734	m_free(m);
3735
3736	return (0);
3737}
3738
3739/*
3740 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3741 * and send them to the TOE.
3742 */
3743static void
3744fixup_and_send_ofo(struct toepcb *toep)
3745{
3746	struct mbuf *m;
3747	struct toedev *tdev = toep->tp_toedev;
3748	struct tcpcb *tp = toep->tp_tp;
3749	unsigned int tid = toep->tp_tid;
3750
3751	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3752
3753	inp_lock_assert(tp->t_inpcb);
3754	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3755		/*
3756		 * A variety of messages can be waiting but the fields we'll
3757		 * be touching are common to all so any message type will do.
3758		 */
3759		struct cpl_close_con_req *p = cplhdr(m);
3760
3761		p->wr.wr_lo = htonl(V_WR_TID(tid));
3762		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3763		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3764	}
3765}
3766
3767/*
3768 * Updates socket state from an active establish CPL message.  Runs with the
3769 * socket lock held.
3770 */
3771static void
3772socket_act_establish(struct socket *so, struct mbuf *m)
3773{
3774	struct cpl_act_establish *req = cplhdr(m);
3775	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3776	struct tcpcb *tp = so_sototcpcb(so);
3777	struct toepcb *toep = tp->t_toe;
3778
3779	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3780		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3781		    toep->tp_tid, tp->t_state);
3782
3783	tp->ts_recent_age = ticks;
3784	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3785	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3786
3787	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3788
3789	/*
3790	 * Now that we finally have a TID send any CPL messages that we had to
3791	 * defer for lack of a TID.
3792	 */
3793	if (mbufq_len(&toep->out_of_order_queue))
3794		fixup_and_send_ofo(toep);
3795
3796	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3797		/*
3798		 * XXX does this even make sense?
3799		 */
3800		so_sorwakeup(so);
3801	}
3802	m_free(m);
3803#ifdef notyet
3804/*
3805 * XXX assume no write requests permitted while socket connection is
3806 * incomplete
3807 */
3808	/*
3809	 * Currently the send queue must be empty at this point because the
3810	 * socket layer does not send anything before a connection is
3811	 * established.  To be future proof though we handle the possibility
3812	 * that there are pending buffers to send (either TX_DATA or
3813	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3814	 * buffers according to the just learned write_seq, and then we send
3815	 * them on their way.
3816	 */
3817	fixup_pending_writeq_buffers(sk);
3818	if (t3_push_frames(so, 1))
3819		sk->sk_write_space(sk);
3820#endif
3821
3822	toep->tp_state = tp->t_state;
3823	tcpstat.tcps_connects++;
}
3826
3827/*
3828 * Process a CPL_ACT_ESTABLISH message.
3829 */
3830static int
3831do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3832{
3833	struct cpl_act_establish *req = cplhdr(m);
3834	unsigned int tid = GET_TID(req);
3835	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3836	struct toepcb *toep = (struct toepcb *)ctx;
3837	struct tcpcb *tp = toep->tp_tp;
3838	struct socket *so;
3839	struct toedev *tdev;
3840	struct tom_data *d;
3841
3842	if (tp == NULL) {
3843		free_atid(cdev, atid);
3844		return (0);
3845	}
3846	inp_wlock(tp->t_inpcb);
3847
3848	/*
3849	 * XXX
3850	 */
3851	so = inp_inpcbtosocket(tp->t_inpcb);
3852	tdev = toep->tp_toedev; /* blow up here if link was down */
3853	d = TOM_DATA(tdev);
3854
3855	/*
3856	 * It's OK if the TID is currently in use, the owning socket may have
3857	 * backlogged its last CPL message(s).  Just take it away.
3858	 */
3859	toep->tp_tid = tid;
3860	toep->tp_tp = tp;
3861	so_insert_tid(d, toep, tid);
3862	free_atid(cdev, atid);
3863	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3864
3865	socket_act_establish(so, m);
3866	inp_wunlock(tp->t_inpcb);
3867	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3868	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3869
3870	return (0);
3871}
3872
3873/*
3874 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3875 * next batch of work requests from the write queue.
3876 */
3877static void
3878wr_ack(struct toepcb *toep, struct mbuf *m)
3879{
3880	struct tcpcb *tp = toep->tp_tp;
3881	struct cpl_wr_ack *hdr = cplhdr(m);
3882	struct socket *so;
3883	unsigned int credits = ntohs(hdr->credits);
3884	u32 snd_una = ntohl(hdr->snd_una);
3885	int bytes = 0;
3886	struct sockbuf *snd;
3887
3888	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3889
3890	inp_wlock(tp->t_inpcb);
3891	so = inp_inpcbtosocket(tp->t_inpcb);
3892	toep->tp_wr_avail += credits;
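	/*
	 * tp_wr_unacked counts credits sent since we last asked for a
	 * completion; it can never exceed the credits still outstanding
	 * (tp_wr_max - tp_wr_avail), so clamp it now that some returned.
	 */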
3893	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3894		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3895
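	/*
	 * Retire queued work requests covered by the returned credits.
	 * Each queued mbuf reuses m_pkthdr.csum_data to record the number
	 * of WR credits its request consumes; a request is freed (and its
	 * payload bytes counted) only once all of its credits come back.
	 */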
3896	while (credits) {
3897		struct mbuf *p = peek_wr(toep);
3898
3899		if (__predict_false(!p)) {
3900			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3901			    "nothing pending, state %u wr_avail=%u\n",
3902			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3903			break;
3904		}
3905		CTR2(KTR_TOM,
3906			"wr_ack: p->credits=%d p->bytes=%d",
3907		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3908		KASSERT(p->m_pkthdr.csum_data != 0,
3909		    ("empty request still on list"));
3910
3911		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3912
3913#if DEBUG_WR > 1
3914			struct tx_data_wr *w = cplhdr(p);
3915			log(LOG_ERR,
3916			       "TID %u got %u WR credits, need %u, len %u, "
3917			       "seq # %u, ACK una %u, ACK nxt %u, "
3918			       "WR_AVAIL %u, WRs pending %u\n",
3919			       toep->tp_tid, credits, p->m_pkthdr.csum_data,
3920			       p->m_pkthdr.len, ntohl(w->sndseq), snd_una,
3921			       ntohl(hdr->snd_nxt), toep->tp_wr_avail,
3922			       count_pending_wrs(tp) - credits);
3923#endif
3924			p->m_pkthdr.csum_data -= credits;
3925			break;
3926		} else {
3927			dequeue_wr(toep);
3928			credits -= p->m_pkthdr.csum_data;
3929			bytes += p->m_pkthdr.len;
3930			CTR3(KTR_TOM,
3931			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3932			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3933
3934			m_free(p);
3935		}
3936	}
3937
3938#if DEBUG_WR
3939	check_wr_invariants(tp);
3940#endif
3941
3942	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3943#if VALIDATE_SEQ
3944		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3945
3946		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3947		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3948		    toep->tp_tid, tp->snd_una);
3949#endif
3950		goto out_free;
3951	}
3952
3953	if (tp->snd_una != snd_una) {
3954		tp->snd_una = snd_una;
3955		tp->ts_recent_age = ticks;
3956#ifdef notyet
3957		/*
3958		 * Keep ARP entry "minty fresh"
3959		 */
3960		dst_confirm(sk->sk_dst_cache);
3961#endif
3962		if (tp->snd_una == tp->snd_nxt)
3963			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3964	}
3965
3966	snd = so_sockbuf_snd(so);
3967	if (bytes) {
3968		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3970		sockbuf_lock(snd);
3971		sbdrop_locked(snd, bytes);
3972		so_sowwakeup_locked(so);
3973	}
3974
3975	if (snd->sb_sndptroff < snd->sb_cc)
3976		t3_push_frames(so, 0);
3977
3978out_free:
3979	inp_wunlock(tp->t_inpcb);
3980	m_free(m);
3981}
3982
3983/*
3984 * Handler for TX_DATA_ACK CPL messages.
3985 */
3986static int
3987do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3988{
3989	struct toepcb *toep = (struct toepcb *)ctx;
3990
3991	VALIDATE_SOCK(so);
3992
3993	wr_ack(toep, m);
3994	return (0);
3995}
3996
3997/*
3998 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
3999 */
4000static int
4001do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4002{
4003	m_freem(m);
4004	return (0);
4005}
4006
4007/*
4008 * Reset a connection that is on a listener's SYN queue or accept queue,
4009 * i.e., one that has not had a struct socket associated with it.
4010 * Must be called from process context.
4011 *
4012 * Modeled after code in inet_csk_listen_stop().
4013 */
4014static void
4015t3_reset_listen_child(struct socket *child)
4016{
4017	struct tcpcb *tp = so_sototcpcb(child);
4018
4019	t3_send_reset(tp->t_toe);
4020}
4021
4022
4023static void
4024t3_child_disconnect(struct socket *so, void *arg)
4025{
4026	struct tcpcb *tp = so_sototcpcb(so);
4027
4028	if (tp->t_flags & TF_TOE) {
4029		inp_wlock(tp->t_inpcb);
4030		t3_reset_listen_child(so);
4031		inp_wunlock(tp->t_inpcb);
4032	}
4033}
4034
4035/*
4036 * Disconnect offloaded established but not yet accepted connections sitting
4037 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4038 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4039 */
4040void
4041t3_disconnect_acceptq(struct socket *listen_so)
4042{
4043
4044	so_lock(listen_so);
4045	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4046	so_unlock(listen_so);
4047}
4048
4049/*
4050 * Reset offloaded connections sitting on a server's syn queue.  As above
4051 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4052 */
4053
4054void
4055t3_reset_synq(struct listen_ctx *lctx)
4056{
4057	struct toepcb *toep;
4058
4059	so_lock(lctx->lso);
4060	while (!LIST_EMPTY(&lctx->synq_head)) {
4061		toep = LIST_FIRST(&lctx->synq_head);
4062		LIST_REMOVE(toep, synq_entry);
4063		toep->tp_tp = NULL;
4064		t3_send_reset(toep);
4065		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4066		toepcb_release(toep);
4067	}
4068	so_unlock(lctx->lso);
4069}
4070
4071
4072int
4073t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4074		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4075		   unsigned int pg_off, unsigned int color)
4076{
4077	unsigned int i, j, pidx;
4078	struct pagepod *p;
4079	struct mbuf *m;
4080	struct ulp_mem_io *req;
4081	unsigned int tid = toep->tp_tid;
4082	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4083	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4084
4085	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4086	    gl, nppods, tag, maxoff, pg_off, color);
4087
4088	for (i = 0; i < nppods; ++i) {
4089		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4090		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4091		req = mtod(m, struct ulp_mem_io *);
4092		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4093		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4094		req->wr.wr_lo = 0;
4095		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4096					   V_ULPTX_CMD(ULP_MEM_WRITE));
4097		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4098				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4099
4100		p = (struct pagepod *)(req + 1);
4101		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4102			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4103			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4104						  V_PPOD_COLOR(color));
4105			p->pp_max_offset = htonl(maxoff);
4106			p->pp_page_offset = htonl(pg_off);
4107			p->pp_rsvd = 0;
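			/*
			 * A pod holds five page addresses but consecutive
			 * pods advance by only four pages, so a pod's last
			 * entry repeats the first entry of its successor;
			 * slots past the end of the gather list are zeroed.
			 */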
4108			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4109				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4110				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4111		} else
4112			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4113		send_or_defer(toep, m, 0);
4114		ppod_addr += PPOD_SIZE;
4115	}
4116	return (0);
4117}
4118
4119/*
4120 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4121 */
4122static inline void
4123mk_cpl_barrier_ulp(struct cpl_barrier *b)
4124{
4125	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4126
4127	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4128	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4129	b->opcode = CPL_BARRIER;
4130}
4131
4132/*
4133 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4134 */
4135static inline void
4136mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4137{
4138	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4139
4141	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4142	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4143	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4144	req->cpuno = htons(cpuno);
4145}
4146
4147/*
4148 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4149 */
4150static inline void
4151mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4152                     unsigned int word, uint64_t mask, uint64_t val)
4153{
4154	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4155
4156	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4157	    tid, word, mask, val);
4158
4159	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4160	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4161	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4162	req->reply = V_NO_REPLY(1);
4163	req->cpu_idx = 0;
4164	req->word = htons(word);
4165	req->mask = htobe64(mask);
4166	req->val = htobe64(val);
4167}
4168
4169/*
4170 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4171 */
4172static void
4173mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4174    unsigned int tid, unsigned int credits)
4175{
4176	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4177
4178	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4179	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4180	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4181	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4182	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4183				 V_RX_CREDITS(credits));
4184}
4185
4186void
4187t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4188{
4189	unsigned int wrlen;
4190	struct mbuf *m;
4191	struct work_request_hdr *wr;
4192	struct cpl_barrier *lock;
4193	struct cpl_set_tcb_field *req;
4194	struct cpl_get_tcb *getreq;
4195	struct ddp_state *p = &toep->tp_ddp_state;
4196
4197#if 0
4198	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4199#endif
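	/*
	 * Build one compound work request laid out as:
	 *   BYPASS WR hdr | CPL_BARRIER | CPL_SET_TCB_FIELD | CPL_GET_TCB |
	 *   CPL_BARRIER
	 * The barriers fence the flag update and the readback, and the
	 * CPL_GET_TCB reply lets the DDP code see how much data had
	 * already landed in the buffer being cancelled.
	 */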
4200	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4201		sizeof(*getreq);
4202	m = m_gethdr_nofail(wrlen);
4203	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4204	wr = mtod(m, struct work_request_hdr *);
4205	bzero(wr, wrlen);
4206
4207	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4208	m->m_pkthdr.len = m->m_len = wrlen;
4209
4210	lock = (struct cpl_barrier *)(wr + 1);
4211	mk_cpl_barrier_ulp(lock);
4212
4213	req = (struct cpl_set_tcb_field *)(lock + 1);
4214
4215	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4216
4217	/* Hmmm, not sure if this is actually a good thing: reactivating
4218	 * the other buffer might be an issue if it has been completed
4219	 * already.  However, that is unlikely, since the fact that the UBUF
4220	 * is not completed indicates that there is no outstanding data.
4221	 */
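	/*
	 * Invalidate the buffer being cancelled and point DDP_ACTIVE_BUF
	 * at the other one: cancelling buffer 0 leaves ACTIVE_BUF set to 1,
	 * cancelling buffer 1 clears it back to 0.
	 */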
4222	if (bufidx == 0)
4223		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4224				     V_TF_DDP_ACTIVE_BUF(1) |
4225				     V_TF_DDP_BUF0_VALID(1),
4226				     V_TF_DDP_ACTIVE_BUF(1));
4227	else
4228		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4229				     V_TF_DDP_ACTIVE_BUF(1) |
4230				     V_TF_DDP_BUF1_VALID(1), 0);
4231
4232	getreq = (struct cpl_get_tcb *)(req + 1);
4233	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4234
4235	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4236
4237	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4239	p->get_tcb_count++;
4240
4241#ifdef T3_TRACE
4242	T3_TRACE1(TIDTB(so),
4243		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4244#endif
4245	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4246}
4247
4248/**
4249 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4250 * @toep: the TOE control block for the connection
4251 * @bufidx: index of HW DDP buffer (0 or 1)
4252 * @tag0: new tag for HW buffer 0
4253 * @tag1: new tag for HW buffer 1
4254 * @len: new length for HW buf @bufidx
4255 *
4256 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4257 * buffer by changing the buffer tag and length and setting the valid and
4258 * active flag accordingly.  The caller must ensure the new buffer is at
4259 * least as big as the existing one.  Since we typically reprogram both HW
4260 * buffers, this function sets both tags for convenience.  Read the TCB to
4261 * determine how much data was written into the buffer before the overlay
4262 * took place.
4263 */
4264void
4265t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4266	 	       unsigned int tag1, unsigned int len)
4267{
4268	unsigned int wrlen;
4269	struct mbuf *m;
4270	struct work_request_hdr *wr;
4271	struct cpl_get_tcb *getreq;
4272	struct cpl_set_tcb_field *req;
4273	struct ddp_state *p = &toep->tp_ddp_state;
4274
4275	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4276	    bufidx, tag0, tag1, len);
4277#if 0
4278	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4279#endif
4280	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4281	m = m_gethdr_nofail(wrlen);
4282	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4283	wr = mtod(m, struct work_request_hdr *);
4284	m->m_pkthdr.len = m->m_len = wrlen;
4285	bzero(wr, wrlen);
4286
4288	/* Set the ATOMIC flag to make sure that TP processes the following
4289	 * CPLs in an atomic manner and no wire segments can be interleaved.
4290	 */
4291	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
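	/*
	 * Both buffer tags live in the same 64-bit TCB word, so a single
	 * SET_TCB_FIELD with a 64-bit mask rewrites tag0 and tag1 at once.
	 */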
4292	req = (struct cpl_set_tcb_field *)(wr + 1);
4293	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4294			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4295			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4296			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4297			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4298	req++;
4299	if (bufidx == 0) {
4300		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4301			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4302			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4303		req++;
4304		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4305			    V_TF_DDP_PUSH_DISABLE_0(1) |
4306			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4307			    V_TF_DDP_PUSH_DISABLE_0(0) |
4308			    V_TF_DDP_BUF0_VALID(1));
4309	} else {
4310		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4311			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4312			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4313		req++;
4314		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4315			    V_TF_DDP_PUSH_DISABLE_1(1) |
4316			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4317			    V_TF_DDP_PUSH_DISABLE_1(0) |
4318			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4319	}
4320
4321	getreq = (struct cpl_get_tcb *)(req + 1);
4322	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4323
4324	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4326	p->get_tcb_count++;
4327
4328#ifdef T3_TRACE
4329	T3_TRACE4(TIDTB(sk),
4330		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4331		  "len %d",
4332		  bufidx, tag0, tag1, len);
4333#endif
4334	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4335}
4336
4337/*
4338 * Sends a compound WR containing all the CPL messages needed to program the
4339 * two HW DDP buffers, namely optionally setting up the length and offset of
4340 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4341 */
4342void
4343t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4344		      unsigned int len1, unsigned int offset1,
4345                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4346{
4347	unsigned int wrlen;
4348	struct mbuf *m;
4349	struct work_request_hdr *wr;
4350	struct cpl_set_tcb_field *req;
4351
4352	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4353	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4354
4355#if 0
4356	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4357#endif
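	/*
	 * The compound WR is laid out as:
	 *   BYPASS WR hdr | [SET_TCB buf0 offset/len] |
	 *   [SET_TCB buf1 offset/len] | SET_TCB DDP flags | [RX_DATA_ACK]
	 * where the bracketed messages are included only when needed, as
	 * reflected in the wrlen computation below.
	 */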
4358	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4359		(len1 ? sizeof(*req) : 0) +
4360		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4361	m = m_gethdr_nofail(wrlen);
4362	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4363	wr = mtod(m, struct work_request_hdr *);
4364	bzero(wr, wrlen);
4365
4366	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4367	m->m_pkthdr.len = m->m_len = wrlen;
4368
4369	req = (struct cpl_set_tcb_field *)(wr + 1);
4370	if (len0) {                  /* program buffer 0 offset and length */
4371		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4372			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4373			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4374			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4375			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4376		req++;
4377	}
4378	if (len1) {                  /* program buffer 1 offset and length */
4379		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4380			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4381			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4382			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4383			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4384		req++;
4385	}
4386
4387	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4388			     ddp_flags);
4389
4390	if (modulate) {
4391		mk_rx_data_ack_ulp(toep,
4392		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4393		    toep->tp_copied_seq - toep->tp_rcv_wup);
4394		toep->tp_rcv_wup = toep->tp_copied_seq;
4395	}
4396
4397#ifdef T3_TRACE
4398	T3_TRACE5(TIDTB(sk),
4399		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4400		  "modulate %d",
4401		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4402		  modulate);
4403#endif
4404
4405	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4406}
4407
4408void
4409t3_init_wr_tab(unsigned int wr_len)
4410{
4411	int i;
4412
4413	if (mbuf_wrs[1])     /* already initialized */
4414		return;
4415
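	/*
	 * mbuf_wrs[i] is the number of WRs (each wr_len flits long) needed
	 * to send an mbuf chain of i buffers.  An SGL entry describes two
	 * buffers in three flits, so i buffers need (3 * i) / 2 flits
	 * rounded up, plus three more flits, presumably for the TX_DATA WR
	 * header.  For example, with wr_len = 10, a 16-buffer chain needs
	 * 24 + 3 = 27 flits and thus 1 + (27 - 2) / (10 - 1) = 3 WRs.
	 */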
4416	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4417		int sgl_len = (3 * i) / 2 + (i & 1);
4418
4419		sgl_len += 3;
4420		mbuf_wrs[i] = sgl_len <= wr_len ?
4421		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4422	}
4423
4424	wrlen = wr_len * 8;
4425}
4426
4427int
4428t3_init_cpl_io(void)
4429{
4430#ifdef notyet
4431	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4432	if (!tcphdr_skb) {
4433		log(LOG_ERR,
4434		       "Chelsio TCP offload: can't allocate sk_buff\n");
4435		return -1;
4436	}
4437	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4438	tcphdr_skb->h.raw = tcphdr_skb->data;
4439	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4440#endif
4441
4442	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4443	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4444	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4445	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4446	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4447	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4448	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4449	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4450	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4451	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4452	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4453	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4454	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4455	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4456	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4457	return (0);
4458}
4459