1/**************************************************************************
2
3Copyright (c) 2007-2008, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 180644 2008-07-21 01:23:19Z kmacy $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/socket.h>
43#include <sys/sysctl.h>
44#include <sys/syslog.h>
45#include <sys/protosw.h>
46#include <sys/priv.h>
47
48#include <net/if.h>
49#include <net/route.h>
50
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/in_systm.h>
54#include <netinet/in_var.h>
55
56
57#include <dev/cxgb/cxgb_osdep.h>
58#include <dev/cxgb/sys/mbufq.h>
59
60#include <netinet/ip.h>
61#include <netinet/tcp_var.h>
62#include <netinet/tcp_fsm.h>
63#include <netinet/tcp_offload.h>
64#include <netinet/tcp_seq.h>
65#include <netinet/tcp_syncache.h>
66#include <netinet/tcp_timer.h>
67#include <net/route.h>
68
69#include <dev/cxgb/t3cdev.h>
70#include <dev/cxgb/common/cxgb_firmware_exports.h>
71#include <dev/cxgb/common/cxgb_t3_cpl.h>
72#include <dev/cxgb/common/cxgb_tcb.h>
73#include <dev/cxgb/common/cxgb_ctl_defs.h>
74#include <dev/cxgb/cxgb_offload.h>
75#include <vm/vm.h>
76#include <vm/pmap.h>
77#include <machine/bus.h>
78#include <dev/cxgb/sys/mvec.h>
79#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
80#include <dev/cxgb/ulp/tom/cxgb_defs.h>
81#include <dev/cxgb/ulp/tom/cxgb_tom.h>
82#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
83#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
84#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
85
86#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
87
88/*
89 * For ULP connections HW may add headers, e.g., for digests, that aren't part
90 * of the messages sent by the host but that are part of the TCP payload and
91 * therefore consume TCP sequence space.  Tx connection parameters that
92 * operate in TCP sequence space are affected by the HW additions and need to
93 * compensate for them to accurately track TCP sequence numbers. This array
94 * contains the compensating extra lengths for ULP packets.  It is indexed by
95 * a packet's ULP submode.
96 */
97const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
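/*
 * Worked example (the digest reading of the submode bits is an assumption,
 * not something this file states): with ULP submode 3 the HW inserts both a
 * 4-byte header digest and a 4-byte data digest, so t3_ulp_extra_len[3] == 8
 * and a TX_DATA_WR carrying an N-byte payload must account for N + 8 bytes
 * of TCP sequence space.
 */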
98
99#ifdef notyet
100/*
101  * This mbuf holds a fake header-only TCP segment that we use whenever we
102  * need to exploit SW TCP functionality that expects TCP headers, such as
103  * tcp_create_openreq_child().  It is a read-only buffer that may be used by
104  * multiple CPUs without locking.
105 */
106static struct mbuf *tcphdr_mbuf __read_mostly;
107#endif
108
109/*
110 * Size of WRs in bytes.  Note that we assume all devices we are handling have
111 * the same WR size.
112 */
113static unsigned int wrlen __read_mostly;
114
115/*
116  * The number of WRs needed for an mbuf chain depends on the number of
117  * segments in the chain and whether it has any payload in its first mbuf.
118  * This maps the length of the gather list for a chain into the # of necessary WRs.
119 */
120static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
121
122/*
123 * Max receive window supported by HW in bytes.  Only a small part of it can
124 * be set through option0, the rest needs to be set through RX_DATA_ACK.
125 */
126#define MAX_RCV_WND ((1U << 27) - 1)
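/*
 * (1U << 27) - 1 is just under 128MB; whatever part of the window does not
 * fit in the option 0 RCV_BUFSIZ field at connection setup is handed to the
 * HW later as RX_DATA_ACK credits (see t3_send_rx_credits() below).
 */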
127
128/*
129 * Min receive window.  We want it to be large enough to accommodate receive
130 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
131 */
132#define MIN_RCV_WND (24 * 1024U)
133#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
134
135#define VALIDATE_SEQ 0
136#define VALIDATE_SOCK(so)
137#define DEBUG_WR 0
138
139#define TCP_TIMEWAIT	1
140#define TCP_CLOSE	2
141#define TCP_DROP	3
142
143extern int tcp_do_autorcvbuf;
144extern int tcp_do_autosndbuf;
145extern int tcp_autorcvbuf_max;
146extern int tcp_autosndbuf_max;
147
148static void t3_send_reset(struct toepcb *toep);
149static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
150static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
151static void handle_syncache_event(int event, void *arg);
152
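/*
 * Debug wrapper around sbappendstream_locked(): walk both the sockbuf's
 * current chain and the chain being appended, asserting that every mbuf is
 * either plain or an EXT_EXTREF external buffer and that no m_next pointer
 * carries the 0xffffffff poison value, then append and re-check the result.
 */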
153static inline void
154SBAPPEND(struct sockbuf *sb, struct mbuf *n)
155{
156	struct mbuf *m;
157
158	m = sb->sb_mb;
159	while (m) {
160		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
161		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
162			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
163		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
164			m->m_next, m->m_nextpkt, m->m_flags));
165		m = m->m_next;
166	}
167	m = n;
168	while (m) {
169		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
170		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
171			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
172		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
173			m->m_next, m->m_nextpkt, m->m_flags));
174		m = m->m_next;
175	}
176	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
177	sbappendstream_locked(sb, n);
178	m = sb->sb_mb;
179
180	while (m) {
181		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
182			m->m_next, m->m_nextpkt, m->m_flags));
183		m = m->m_next;
184	}
185}
186
187static inline int
188is_t3a(const struct toedev *dev)
189{
190	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
191}
192
193static void
194dump_toepcb(struct toepcb *toep)
195{
196	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
197	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
198	    toep->tp_mtu_idx, toep->tp_tid);
199
200	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
201	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
202	    toep->tp_mss_clamp, toep->tp_flags);
203}
204
205#ifndef RTALLOC2_DEFINED
206static struct rtentry *
207rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
208{
209	struct rtentry *rt = NULL;
210
211	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
212		RT_UNLOCK(rt);
213
214	return (rt);
215}
216#endif
217
218/*
219 * Determine whether to send a CPL message now or defer it.  A message is
220 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
221 * For connections in other states the message is sent immediately.
222 * If through_l2t is set the message is subject to ARP processing, otherwise
223 * it is sent directly.
224 */
225static inline void
226send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
227{
228	struct tcpcb *tp = toep->tp_tp;
229
230	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
231		inp_wlock(tp->t_inpcb);
232		mbufq_tail(&toep->out_of_order_queue, m);  // defer
233		inp_wunlock(tp->t_inpcb);
234	} else if (through_l2t)
235		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
236	else
237		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
238}
239
240static inline unsigned int
241mkprio(unsigned int cntrl, const struct toepcb *toep)
242{
243        return (cntrl);
244}
245
246/*
247  * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
248 */
249static inline void
250mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
251{
252	struct cpl_tid_release *req;
253
254	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
255	m->m_pkthdr.len = m->m_len = sizeof(*req);
256	req = mtod(m, struct cpl_tid_release *);
257	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
258	req->wr.wr_lo = 0;
259	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
260}
261
262static inline void
263make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
264{
265	struct tcpcb *tp = so_sototcpcb(so);
266	struct toepcb *toep = tp->t_toe;
267	struct tx_data_wr *req;
268	struct sockbuf *snd;
269
270	inp_lock_assert(tp->t_inpcb);
271	snd = so_sockbuf_snd(so);
272
273	req = mtod(m, struct tx_data_wr *);
274	m->m_len = sizeof(*req);
275	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
276	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
277	/* len includes the length of any HW ULP additions */
278	req->len = htonl(len);
279	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
280	/* V_TX_ULP_SUBMODE sets both the mode and submode */
281	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
282	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
283	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
284				   (tail ? 0 : 1))));
285	req->sndseq = htonl(tp->snd_nxt);
286	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
287		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
288				    V_TX_CPU_IDX(toep->tp_qset));
289
290 		/* The send buffer is encoded in units of 32KB (hence the shift
291 		 * by 15); e.g. a 256KB send buffer is encoded as 8. */
292		if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
293			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
294		else {
295			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
296		}
297
298		toep->tp_flags |= TP_DATASENT;
299	}
300}
301
302#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
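/*
 * Payloads of at most IMM_LEN bytes are copied inline into the TX_DATA_WR by
 * t3_push_frames() below; larger payloads are described by a gather list of
 * bus_dma segments and fetched by the HW via DMA.
 */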
303
304int
305t3_push_frames(struct socket *so, int req_completion)
306{
307	struct tcpcb *tp = so_sototcpcb(so);
308	struct toepcb *toep = tp->t_toe;
309
310	struct mbuf *tail, *m0, *last;
311	struct t3cdev *cdev;
312	struct tom_data *d;
313	int state, bytes, count, total_bytes;
314	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
315	struct sockbuf *snd;
316
317	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
318		DPRINTF("tcp state=%d\n", tp->t_state);
319		return (0);
320	}
321
322	state = so_state_get(so);
323
324	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
325		DPRINTF("disconnecting\n");
326
327		return (0);
328	}
329
330	inp_lock_assert(tp->t_inpcb);
331
332	snd = so_sockbuf_snd(so);
333	sockbuf_lock(snd);
334
335	d = TOM_DATA(toep->tp_toedev);
336	cdev = d->cdev;
337
338	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
339
340	total_bytes = 0;
341	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
342	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
343
344	if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
345		KASSERT(tail, ("sbdrop error"));
346		last = tail = tail->m_next;
347	}
348
349	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
350		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
351		sockbuf_unlock(snd);
352
353		return (0);
354	}
355
356	toep->tp_m_last = NULL;
357	while (toep->tp_wr_avail && (tail != NULL)) {
358		count = bytes = 0;
359		segp = segs;
360		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
361			sockbuf_unlock(snd);
362			return (0);
363		}
364		/*
365		 * If the data in tail fits as in-line, then
366		 * make an immediate data wr.
367		 */
368		if (tail->m_len <= IMM_LEN) {
369			count = 1;
370			bytes = tail->m_len;
371			last = tail;
372			tail = tail->m_next;
373			m_set_sgl(m0, NULL);
374			m_set_sgllen(m0, 0);
375			make_tx_data_wr(so, m0, bytes, tail);
376			m_append(m0, bytes, mtod(last, caddr_t));
377			KASSERT(!m0->m_next, ("bad append"));
378		} else {
379			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
380			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
381				bytes += tail->m_len;
382				last = tail;
383				count++;
384				/*
385				 * technically an abuse to be using this for a VA
386				 * but less gross than defining my own structure
387				 * or calling pmap_kextract from here :-|
388				 */
389				segp->ds_addr = (bus_addr_t)tail->m_data;
390				segp->ds_len = tail->m_len;
391				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
392				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
393				segp++;
394				tail = tail->m_next;
395			}
396			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
397			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
398
399			m_set_sgl(m0, segs);
400			m_set_sgllen(m0, count);
401			make_tx_data_wr(so, m0, bytes, tail);
402		}
403		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
404
405		if (tail) {
406			snd->sb_sndptr = tail;
407			toep->tp_m_last = NULL;
408		} else
409			toep->tp_m_last = snd->sb_sndptr = last;
410
411
412		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
413
414		snd->sb_sndptroff += bytes;
415		total_bytes += bytes;
416		toep->tp_write_seq += bytes;
417		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
418		    " tail=%p sndptr=%p sndptroff=%d",
419		    toep->tp_wr_avail, count, mbuf_wrs[count],
420		    tail, snd->sb_sndptr, snd->sb_sndptroff);
421		if (tail)
422			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
423			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
424			    total_bytes, toep->tp_m_last, tail->m_data,
425			    tp->snd_una);
426		else
427			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
428			    " tp_m_last=%p snd_una=0x%08x",
429			    total_bytes, toep->tp_m_last, tp->snd_una);
430
431
432#ifdef KTR
433{
434		int i;
435
436		i = 0;
437		while (i < count && m_get_sgllen(m0)) {
438			if ((count - i) >= 3) {
439				CTR6(KTR_TOM,
440				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
441				    " len=%d pa=0x%zx len=%d",
442				    segs[i].ds_addr, segs[i].ds_len,
443				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
444				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
445				    i += 3;
446			} else if ((count - i) == 2) {
447				CTR4(KTR_TOM,
448				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
449				    " len=%d",
450				    segs[i].ds_addr, segs[i].ds_len,
451				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
452				    i += 2;
453			} else {
454				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
455				    segs[i].ds_addr, segs[i].ds_len);
456				i++;
457			}
458
459		}
460}
461#endif
462 		/*
463 		 * Remember the WR credits consumed (stashed in csum_data).
464 		 */
465		m0->m_pkthdr.csum_data = mbuf_wrs[count];
466		m0->m_pkthdr.len = bytes;
467		toep->tp_wr_avail -= mbuf_wrs[count];
468		toep->tp_wr_unacked += mbuf_wrs[count];
469
470		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
471		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
472			struct work_request_hdr *wr = cplhdr(m0);
473
474			wr->wr_hi |= htonl(F_WR_COMPL);
475			toep->tp_wr_unacked = 0;
476		}
477		KASSERT((m0->m_pkthdr.csum_data > 0) &&
478		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
479			m0->m_pkthdr.csum_data));
480		m0->m_type = MT_DONTFREE;
481		enqueue_wr(toep, m0);
482		DPRINTF("sending offload tx with %d bytes in %d segments\n",
483		    bytes, count);
484		l2t_send(cdev, m0, toep->tp_l2t);
485	}
486	sockbuf_unlock(snd);
487	return (total_bytes);
488}
489
490/*
491 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
492 * under any circumstances.  We take the easy way out and always queue the
493 * message to the write_queue.  We can optimize the case where the queue is
494 * already empty though the optimization is probably not worth it.
495 */
496static void
497close_conn(struct socket *so)
498{
499	struct mbuf *m;
500	struct cpl_close_con_req *req;
501	struct tom_data *d;
502	struct inpcb *inp = so_sotoinpcb(so);
503	struct tcpcb *tp;
504	struct toepcb *toep;
505	unsigned int tid;
506
507
508	inp_wlock(inp);
509	tp = so_sototcpcb(so);
510	toep = tp->t_toe;
511
512	if (tp->t_state != TCPS_SYN_SENT)
513		t3_push_frames(so, 1);
514
515	if (toep->tp_flags & TP_FIN_SENT) {
516		inp_wunlock(inp);
517		return;
518	}
519
520	tid = toep->tp_tid;
521
522	d = TOM_DATA(toep->tp_toedev);
523
524	m = m_gethdr_nofail(sizeof(*req));
525	m_set_priority(m, CPL_PRIORITY_DATA);
526	m_set_sgl(m, NULL);
527	m_set_sgllen(m, 0);
528
529	toep->tp_flags |= TP_FIN_SENT;
530	req = mtod(m, struct cpl_close_con_req *);
531
532	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
533	req->wr.wr_lo = htonl(V_WR_TID(tid));
534	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
535	req->rsvd = 0;
536	inp_wunlock(inp);
537	/*
538	 * XXX - need to defer shutdown while there is still data in the queue
539	 *
540	 */
541	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
542	cxgb_ofld_send(d->cdev, m);
543
544}
545
546/*
547 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
548 * and send it along.
549 */
550static void
551abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
552{
553	struct cpl_abort_req *req = cplhdr(m);
554
555	req->cmd = CPL_ABORT_NO_RST;
556	cxgb_ofld_send(cdev, m);
557}
558
559/*
560 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
561 * permitted to return without sending the message in case we cannot allocate
562  * an mbuf.  Returns the number of credits sent.
563 */
564uint32_t
565t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
566{
567	struct mbuf *m;
568	struct cpl_rx_data_ack *req;
569	struct toepcb *toep = tp->t_toe;
570	struct toedev *tdev = toep->tp_toedev;
571
572	m = m_gethdr_nofail(sizeof(*req));
573
574	DPRINTF("returning %u credits to HW\n", credits);
575
576	req = mtod(m, struct cpl_rx_data_ack *);
577	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
578	req->wr.wr_lo = 0;
579	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
580	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
581	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
582	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
583	return (credits);
584}
585
586/*
587 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
588 * This is only used in DDP mode, so we take the opportunity to also set the
589 * DACK mode and flush any Rx credits.
590 */
591void
592t3_send_rx_modulate(struct toepcb *toep)
593{
594	struct mbuf *m;
595	struct cpl_rx_data_ack *req;
596
597	m = m_gethdr_nofail(sizeof(*req));
598
599	req = mtod(m, struct cpl_rx_data_ack *);
600	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
601	req->wr.wr_lo = 0;
602	m->m_pkthdr.len = m->m_len = sizeof(*req);
603
604	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
605	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
606				 V_RX_DACK_MODE(1) |
607				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
608	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
609	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
610	toep->tp_rcv_wup = toep->tp_copied_seq;
611}
612
613/*
614 * Handle receipt of an urgent pointer.
615 */
616static void
617handle_urg_ptr(struct socket *so, uint32_t urg_seq)
618{
619#ifdef URGENT_DATA_SUPPORTED
620	struct tcpcb *tp = so_sototcpcb(so);
621
622	urg_seq--;   /* initially points past the urgent data, per BSD */
623
624	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
625		return;                                 /* duplicate pointer */
626	sk_send_sigurg(sk);
627	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
628	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
629		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
630
631		tp->copied_seq++;
632		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
633			tom_eat_skb(sk, skb, 0);
634	}
635	tp->urg_data = TCP_URG_NOTYET;
636	tp->urg_seq = urg_seq;
637#endif
638}
639
640/*
641 * Returns true if a socket cannot accept new Rx data.
642 */
643static inline int
644so_no_receive(const struct socket *so)
645{
646	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
647}
648
649/*
650 * Process an urgent data notification.
651 */
652static void
653rx_urg_notify(struct toepcb *toep, struct mbuf *m)
654{
655	struct cpl_rx_urg_notify *hdr = cplhdr(m);
656	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
657
658	VALIDATE_SOCK(so);
659
660	if (!so_no_receive(so))
661		handle_urg_ptr(so, ntohl(hdr->seq));
662
663	m_freem(m);
664}
665
666/*
667 * Handler for RX_URG_NOTIFY CPL messages.
668 */
669static int
670do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
671{
672	struct toepcb *toep = (struct toepcb *)ctx;
673
674	rx_urg_notify(toep, m);
675	return (0);
676}
677
678static __inline int
679is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
680{
681	return (toep->tp_ulp_mode ||
682		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
683		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
684}
685
686/*
687 * Set of states for which we should return RX credits.
688 */
689#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
690
691/*
692 * Called after some received data has been read.  It returns RX credits
693 * to the HW for the amount of data processed.
694 */
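/*
 * A rough sketch of the arithmetic (the tunable value is hypothetical): with
 * rx_credit_thres set to 4KB, credits accumulate as tp_copied_seq advances
 * past tp_rcv_wup; once at least 4KB is pending, or whenever the remaining
 * window would otherwise drop below 16KB (the must_send case), an
 * RX_DATA_ACK is issued via t3_send_rx_credits() and tp_rcv_wup catches up.
 */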
695void
696t3_cleanup_rbuf(struct tcpcb *tp, int copied)
697{
698	struct toepcb *toep = tp->t_toe;
699	struct socket *so;
700	struct toedev *dev;
701	int dack_mode, must_send, read;
702	u32 thres, credits, dack = 0;
703	struct sockbuf *rcv;
704
705	so = inp_inpcbtosocket(tp->t_inpcb);
706	rcv = so_sockbuf_rcv(so);
707
708	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
709		(tp->t_state == TCPS_FIN_WAIT_2))) {
710		if (copied) {
711			sockbuf_lock(rcv);
712			toep->tp_copied_seq += copied;
713			sockbuf_unlock(rcv);
714		}
715
716		return;
717	}
718
719	inp_lock_assert(tp->t_inpcb);
720
721	sockbuf_lock(rcv);
722	if (copied)
723		toep->tp_copied_seq += copied;
724	else {
725		read = toep->tp_enqueued_bytes - rcv->sb_cc;
726		toep->tp_copied_seq += read;
727	}
728	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
729	toep->tp_enqueued_bytes = rcv->sb_cc;
730	sockbuf_unlock(rcv);
731
732	if (credits > rcv->sb_mbmax) {
733		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
734		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
735 		credits = rcv->sb_mbmax;
736	}
737
738
739	/*
740	 * XXX this won't accurately reflect credit return - we need
741	 * to look at the difference between the amount that has been
742	 * put in the recv sockbuf and what is there now
743	 */
744
745	if (__predict_false(!credits))
746		return;
747
748	dev = toep->tp_toedev;
749	thres = TOM_TUNABLE(dev, rx_credit_thres);
750
751	if (__predict_false(thres == 0))
752		return;
753
754	if (is_delack_mode_valid(dev, toep)) {
755		dack_mode = TOM_TUNABLE(dev, delack);
756		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
757			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
758
759			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
760				dack = F_RX_DACK_CHANGE |
761				       V_RX_DACK_MODE(dack_mode);
762		}
763	} else
764		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
765
766	/*
767	 * For coalescing to work effectively ensure the receive window has
768	 * at least 16KB left.
769	 */
770	must_send = credits + 16384 >= tp->rcv_wnd;
771
772	if (must_send || credits >= thres)
773		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
774}
775
776static int
777cxgb_toe_disconnect(struct tcpcb *tp)
778{
779	struct socket *so;
780
781	DPRINTF("cxgb_toe_disconnect\n");
782
783	so = inp_inpcbtosocket(tp->t_inpcb);
784	close_conn(so);
785	return (0);
786}
787
788static int
789cxgb_toe_reset(struct tcpcb *tp)
790{
791	struct toepcb *toep = tp->t_toe;
792
793	t3_send_reset(toep);
794
795	/*
796	 * unhook from socket
797	 */
798	tp->t_flags &= ~TF_TOE;
799	toep->tp_tp = NULL;
800	tp->t_toe = NULL;
801	return (0);
802}
803
804static int
805cxgb_toe_send(struct tcpcb *tp)
806{
807	struct socket *so;
808
809	DPRINTF("cxgb_toe_send\n");
810	dump_toepcb(tp->t_toe);
811
812	so = inp_inpcbtosocket(tp->t_inpcb);
813	t3_push_frames(so, 1);
814	return (0);
815}
816
817static int
818cxgb_toe_rcvd(struct tcpcb *tp)
819{
820
821	inp_lock_assert(tp->t_inpcb);
822
823	t3_cleanup_rbuf(tp, 0);
824
825	return (0);
826}
827
828static void
829cxgb_toe_detach(struct tcpcb *tp)
830{
831	struct toepcb *toep;
832
833        /*
834	 * XXX how do we handle teardown in the SYN_SENT state?
835	 *
836	 */
837	inp_lock_assert(tp->t_inpcb);
838	toep = tp->t_toe;
839	toep->tp_tp = NULL;
840
841	/*
842	 * unhook from socket
843	 */
844	tp->t_flags &= ~TF_TOE;
845	tp->t_toe = NULL;
846}
847
848
849static struct toe_usrreqs cxgb_toe_usrreqs = {
850	.tu_disconnect = cxgb_toe_disconnect,
851	.tu_reset = cxgb_toe_reset,
852	.tu_send = cxgb_toe_send,
853	.tu_rcvd = cxgb_toe_rcvd,
854 	.tu_detach = cxgb_toe_detach,
856	.tu_syncache_event = handle_syncache_event,
857};
858
859
860static void
861__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
862			    uint64_t mask, uint64_t val, int no_reply)
863{
864	struct cpl_set_tcb_field *req;
865
866 	CTR4(KTR_TCB, "__set_tcb_field: tid=%u word=0x%x mask=%jx val=%jx",
867	    toep->tp_tid, word, mask, val);
868
869	req = mtod(m, struct cpl_set_tcb_field *);
870	m->m_pkthdr.len = m->m_len = sizeof(*req);
871	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
872	req->wr.wr_lo = 0;
873	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
874	req->reply = V_NO_REPLY(no_reply);
875	req->cpu_idx = 0;
876	req->word = htons(word);
877	req->mask = htobe64(mask);
878	req->val = htobe64(val);
879
880	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
881	send_or_defer(toep, m, 0);
882}
883
884static void
885t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
886{
887 	struct mbuf *m;
888 	struct tcpcb *tp;
889 
890 	if (toep == NULL)
891 		return;
892 	tp = toep->tp_tp;
893 	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
894 		printf("not setting field\n");
895 		return;
896 	}
897
898	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
899
900	__set_tcb_field(toep, m, word, mask, val, 1);
901}
902
903/*
904 * Set one of the t_flags bits in the TCB.
905 */
906static void
907set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
908{
909
910	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
911}
912
913/*
914 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
915 */
916static void
917t3_set_nagle(struct toepcb *toep)
918{
919	struct tcpcb *tp = toep->tp_tp;
920
921	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
922}
923
924/*
925 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
926 */
927void
928t3_set_keepalive(struct toepcb *toep, int on_off)
929{
930
931	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
932}
933
934void
935t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
936{
937	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
938}
939
940void
941t3_set_dack_mss(struct toepcb *toep, int on_off)
942{
943
944	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
945}
946
947/*
948 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
949 */
950static void
951t3_set_tos(struct toepcb *toep)
952{
953	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
954
955	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
956			 V_TCB_TOS(tos));
957}
958
959
960/*
961 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
962 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
963 * set the PSH bit in the last segment, which would trigger delivery.]
964 * We work around the issue by setting a DDP buffer in a partial placed state,
965 * which guarantees that TP will schedule a timer.
966 */
967#define TP_DDP_TIMER_WORKAROUND_MASK\
968    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
969     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
970       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
971#define TP_DDP_TIMER_WORKAROUND_VAL\
972    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
973     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
974      32))
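/*
 * Spelled out, the workaround value marks HW buffer 0 as valid but not
 * active, with offset 1 and length 2, i.e. a partially placed buffer;
 * t3_enable_ddp() below ORs this mask/value pair into the SET_TCB_FIELD
 * update when DDP is switched off so that TP keeps scheduling its push timer.
 */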
975
976static void
977t3_enable_ddp(struct toepcb *toep, int on)
978{
979	if (on) {
980
981		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
982				 V_TF_DDP_OFF(0));
983	} else
984		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
985				 V_TF_DDP_OFF(1) |
986				 TP_DDP_TIMER_WORKAROUND_MASK,
987				 V_TF_DDP_OFF(1) |
988				 TP_DDP_TIMER_WORKAROUND_VAL);
989
990}
991
992void
993t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
994{
995	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
996			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
997			 tag_color);
998}
999
1000void
1001t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1002		    unsigned int len)
1003{
1004	if (buf_idx == 0)
1005		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1006			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1007			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1008			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1009			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1010	else
1011		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1012			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1013			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1014			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1015			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1016}
1017
1018static int
1019t3_set_cong_control(struct socket *so, const char *name)
1020{
1021#ifdef CONGESTION_CONTROL_SUPPORTED
1022	int cong_algo;
1023
1024	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1025		if (!strcmp(name, t3_cong_ops[cong_algo].name))
1026			break;
1027
1028	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1029		return -EINVAL;
1030#endif
1031	return 0;
1032}
1033
1034int
1035t3_get_tcb(struct toepcb *toep)
1036{
1037	struct cpl_get_tcb *req;
1038	struct tcpcb *tp = toep->tp_tp;
1039	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1040
1041	if (!m)
1042		return (ENOMEM);
1043
1044	inp_lock_assert(tp->t_inpcb);
1045	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1046	req = mtod(m, struct cpl_get_tcb *);
1047	m->m_pkthdr.len = m->m_len = sizeof(*req);
1048	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1049	req->wr.wr_lo = 0;
1050	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1051	req->cpuno = htons(toep->tp_qset);
1052	req->rsvd = 0;
1053	if (tp->t_state == TCPS_SYN_SENT)
1054		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1055	else
1056		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1057	return 0;
1058}
1059
1060static inline void
1061so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1062{
1063
1064	toepcb_hold(toep);
1065
1066	cxgb_insert_tid(d->cdev, d->client, toep, tid);
1067}
1068
1069/**
1070 *	find_best_mtu - find the entry in the MTU table closest to an MTU
1071 *	@d: TOM state
1072 *	@mtu: the target MTU
1073 *
1074 *	Returns the index of the value in the MTU table that is closest to but
1075 *	does not exceed the target MTU.
1076 */
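/*
 * For instance, with a hypothetical MTU table of {1500, 4096, 9000},
 * find_best_mtu(d, 5000) advances while the next entry still fits and stops
 * at index 1 (4096); select_mss() below then clamps t_maxseg to that entry
 * minus 40 bytes of IP and TCP headers.
 */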
1077static unsigned int
1078find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1079{
1080	int i = 0;
1081
1082	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1083		++i;
1084	return (i);
1085}
1086
1087static unsigned int
1088select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1089{
1090	unsigned int idx;
1091
1092#ifdef notyet
1093	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1094#endif
1095	if (tp) {
1096		tp->t_maxseg = pmtu - 40;
1097		if (tp->t_maxseg < td->mtus[0] - 40)
1098			tp->t_maxseg = td->mtus[0] - 40;
1099		idx = find_best_mtu(td, tp->t_maxseg + 40);
1100
1101		tp->t_maxseg = td->mtus[idx] - 40;
1102	} else
1103		idx = find_best_mtu(td, pmtu);
1104
1105	return (idx);
1106}
1107
1108static inline void
1109free_atid(struct t3cdev *cdev, unsigned int tid)
1110{
1111	struct toepcb *toep = cxgb_free_atid(cdev, tid);
1112
1113	if (toep)
1114		toepcb_release(toep);
1115}
1116
1117/*
1118 * Release resources held by an offload connection (TID, L2T entry, etc.)
1119 */
1120static void
1121t3_release_offload_resources(struct toepcb *toep)
1122{
1123	struct tcpcb *tp = toep->tp_tp;
1124	struct toedev *tdev = toep->tp_toedev;
1125	struct t3cdev *cdev;
1126	struct socket *so;
1127	unsigned int tid = toep->tp_tid;
1128	struct sockbuf *rcv;
1129
1130	CTR0(KTR_TOM, "t3_release_offload_resources");
1131
1132	if (!tdev)
1133		return;
1134
1135	cdev = TOEP_T3C_DEV(toep);
1136	if (!cdev)
1137		return;
1138
1139	toep->tp_qset = 0;
1140	t3_release_ddp_resources(toep);
1141
1142#ifdef CTRL_SKB_CACHE
1143	kfree_skb(CTRL_SKB_CACHE(tp));
1144	CTRL_SKB_CACHE(tp) = NULL;
1145#endif
1146
1147	if (toep->tp_wr_avail != toep->tp_wr_max) {
1148		purge_wr_queue(toep);
1149		reset_wr_list(toep);
1150	}
1151
1152	if (toep->tp_l2t) {
1153		l2t_release(L2DATA(cdev), toep->tp_l2t);
1154		toep->tp_l2t = NULL;
1155	}
1156	toep->tp_tp = NULL;
1157	if (tp) {
1158		inp_lock_assert(tp->t_inpcb);
1159		so = inp_inpcbtosocket(tp->t_inpcb);
1160		rcv = so_sockbuf_rcv(so);
1161		/*
1162		 * cancel any offloaded reads
1163		 *
1164		 */
1165		sockbuf_lock(rcv);
1166		tp->t_toe = NULL;
1167		tp->t_flags &= ~TF_TOE;
1168		if (toep->tp_ddp_state.user_ddp_pending) {
1169			t3_cancel_ubuf(toep, rcv);
1170			toep->tp_ddp_state.user_ddp_pending = 0;
1171		}
1172		so_sorwakeup_locked(so);
1173
1174	}
1175
1176	if (toep->tp_state == TCPS_SYN_SENT) {
1177		free_atid(cdev, tid);
1178#ifdef notyet
1179		__skb_queue_purge(&tp->out_of_order_queue);
1180#endif
1181	} else {                                          // we have TID
1182		cxgb_remove_tid(cdev, toep, tid);
1183		toepcb_release(toep);
1184	}
1185#if 0
1186	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1187#endif
1188}
1189
1190static void
1191install_offload_ops(struct socket *so)
1192{
1193	struct tcpcb *tp = so_sototcpcb(so);
1194
1195	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1196
1197	t3_install_socket_ops(so);
1198	tp->t_flags |= TF_TOE;
1199	tp->t_tu = &cxgb_toe_usrreqs;
1200}
1201
1202/*
1203 * Determine the receive window scaling factor given a target max
1204 * receive window.
1205 */
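/*
 * Example: a 256KB target window needs three halvings before it fits in the
 * 16-bit window field (262144 -> 131072 -> 65536 -> 32768), so this returns
 * a window scale of 3 when tcp_do_rfc1323 is enabled.
 */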
1206static __inline int
1207select_rcv_wscale(int space)
1208{
1209	int wscale = 0;
1210
1211	if (space > MAX_RCV_WND)
1212		space = MAX_RCV_WND;
1213
1214	if (tcp_do_rfc1323)
1215		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1216
1217	return (wscale);
1218}
1219
1220/*
1221 * Determine the receive window size for a socket.
1222 */
1223static unsigned long
1224select_rcv_wnd(struct toedev *dev, struct socket *so)
1225{
1226	struct tom_data *d = TOM_DATA(dev);
1227	unsigned int wnd;
1228	unsigned int max_rcv_wnd;
1229	struct sockbuf *rcv;
1230
1231	rcv = so_sockbuf_rcv(so);
1232
1233	if (tcp_do_autorcvbuf)
1234		wnd = tcp_autorcvbuf_max;
1235	else
1236		wnd = rcv->sb_hiwat;
1237
1238
1239
1240	/* XXX
1241	 * For receive coalescing to work effectively we need a receive window
1242 	 * that can accommodate a coalesced segment.
1243	 */
1244	if (wnd < MIN_RCV_WND)
1245		wnd = MIN_RCV_WND;
1246
1247	/* PR 5138 */
1248	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1249				    (uint32_t)d->rx_page_size * 23 :
1250				    MAX_RCV_WND);
1251
1252	return min(wnd, max_rcv_wnd);
1253}
1254
1255/*
1256 * Assign offload parameters to some socket fields.  This code is used by
1257 * both active and passive opens.
1258 */
1259static inline void
1260init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1261    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1262{
1263	struct tcpcb *tp = so_sototcpcb(so);
1264	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1265	struct sockbuf *snd, *rcv;
1266
1267#ifdef notyet
1268	SOCK_LOCK_ASSERT(so);
1269#endif
1270
1271	snd = so_sockbuf_snd(so);
1272	rcv = so_sockbuf_rcv(so);
1273
1274	log(LOG_INFO, "initializing offload socket\n");
1275	/*
1276	 * We either need to fix push frames to work with sbcompress
1277	 * or we need to add this
1278	 */
1279	snd->sb_flags |= SB_NOCOALESCE;
1280	rcv->sb_flags |= SB_NOCOALESCE;
1281
1282	tp->t_toe = toep;
1283	toep->tp_tp = tp;
1284	toep->tp_toedev = dev;
1285
1286	toep->tp_tid = tid;
1287	toep->tp_l2t = e;
1288	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1289	toep->tp_wr_unacked = 0;
1290	toep->tp_delack_mode = 0;
1291
1292	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1293	/*
1294	 * XXX broken
1295	 *
1296	 */
1297	tp->rcv_wnd = select_rcv_wnd(dev, so);
1298
1299 	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1300 	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1301	toep->tp_qset_idx = 0;
1302
1303	reset_wr_list(toep);
1304	DPRINTF("initialization done\n");
1305}
1306
1307/*
1308 * The next two functions calculate the option 0 value for a socket.
1309 */
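/*
 * Sketch of what ends up in option 0, assuming a 256KB receive window and
 * default socket options: opt0h carries the Nagle and keepalive bits, the
 * window scale from select_rcv_wscale() (3 for 256KB) and the MSS table
 * index; opt0l carries the TOS, the ULP mode and the initial receive buffer
 * size in 1KB units (256 here, capped at M_RCV_BUFSIZ).
 */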
1310static inline unsigned int
1311calc_opt0h(struct socket *so, int mtu_idx)
1312{
1313	struct tcpcb *tp = so_sototcpcb(so);
1314	int wscale = select_rcv_wscale(tp->rcv_wnd);
1315
1316	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1317	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1318	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1319}
1320
1321static inline unsigned int
1322calc_opt0l(struct socket *so, int ulp_mode)
1323{
1324	struct tcpcb *tp = so_sototcpcb(so);
1325	unsigned int val;
1326
1327	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1328	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1329
1330	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1331	return (val);
1332}
1333
1334static inline unsigned int
1335calc_opt2(const struct socket *so, struct toedev *dev)
1336{
1337	int flv_valid;
1338
1339	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1340
1341	return (V_FLAVORS_VALID(flv_valid) |
1342	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1343}
1344
1345#if DEBUG_WR > 1
1346static int
1347count_pending_wrs(const struct toepcb *toep)
1348{
1349	const struct mbuf *m;
1350	int n = 0;
1351
1352	wr_queue_walk(toep, m)
1353		n += m->m_pkthdr.csum_data;
1354	return (n);
1355}
1356#endif
1357
1358#if 0
1359(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1360#endif
1361
1362static void
1363mk_act_open_req(struct socket *so, struct mbuf *m,
1364    unsigned int atid, const struct l2t_entry *e)
1365{
1366	struct cpl_act_open_req *req;
1367	struct inpcb *inp = so_sotoinpcb(so);
1368	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1369	struct toepcb *toep = tp->t_toe;
1370	struct toedev *tdev = toep->tp_toedev;
1371
1372	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1373
1374	req = mtod(m, struct cpl_act_open_req *);
1375	m->m_pkthdr.len = m->m_len = sizeof(*req);
1376
1377	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1378	req->wr.wr_lo = 0;
1379	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1380	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1381#if 0
1382	req->local_port = inp->inp_lport;
1383	req->peer_port = inp->inp_fport;
1384	memcpy(&req->local_ip, &inp->inp_laddr, 4);
1385	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1386#endif
1387	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1388			   V_TX_CHANNEL(e->smt_idx));
1389	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1390	req->params = 0;
1391	req->opt2 = htonl(calc_opt2(so, tdev));
1392}
1393
1394
1395/*
1396 * Convert an ACT_OPEN_RPL status to an errno.
1397 */
1398static int
1399act_open_rpl_status_to_errno(int status)
1400{
1401	switch (status) {
1402	case CPL_ERR_CONN_RESET:
1403		return (ECONNREFUSED);
1404	case CPL_ERR_ARP_MISS:
1405		return (EHOSTUNREACH);
1406	case CPL_ERR_CONN_TIMEDOUT:
1407		return (ETIMEDOUT);
1408	case CPL_ERR_TCAM_FULL:
1409		return (ENOMEM);
1410	case CPL_ERR_CONN_EXIST:
1411		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1412		return (EADDRINUSE);
1413	default:
1414		return (EIO);
1415	}
1416}
1417
1418static void
1419fail_act_open(struct toepcb *toep, int errno)
1420{
1421	struct tcpcb *tp = toep->tp_tp;
1422
1423	t3_release_offload_resources(toep);
1424	if (tp) {
1425		inp_wunlock(tp->t_inpcb);
1426		tcp_offload_drop(tp, errno);
1427	}
1428
1429#ifdef notyet
1430	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1431#endif
1432}
1433
1434/*
1435 * Handle active open failures.
1436 */
1437static void
1438active_open_failed(struct toepcb *toep, struct mbuf *m)
1439{
1440	struct cpl_act_open_rpl *rpl = cplhdr(m);
1441	struct inpcb *inp;
1442
1443	if (toep->tp_tp == NULL)
1444		goto done;
1445
1446	inp = toep->tp_tp->t_inpcb;
1447
1448/*
1449 * Don't handle connection retry for now
1450 */
1451#ifdef notyet
1452	struct inet_connection_sock *icsk = inet_csk(sk);
1453
1454	if (rpl->status == CPL_ERR_CONN_EXIST &&
1455	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1456		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1457		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1458			       jiffies + HZ / 2);
1459	} else
1460#endif
1461	{
1462		inp_wlock(inp);
1463		/*
1464		 * drops the inpcb lock
1465		 */
1466		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1467	}
1468
1469	done:
1470	m_free(m);
1471}
1472
1473/*
1474 * Return whether a failed active open has allocated a TID
1475 */
1476static inline int
1477act_open_has_tid(int status)
1478{
1479	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1480	       status != CPL_ERR_ARP_MISS;
1481}
1482
1483/*
1484 * Process an ACT_OPEN_RPL CPL message.
1485 */
1486static int
1487do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1488{
1489	struct toepcb *toep = (struct toepcb *)ctx;
1490	struct cpl_act_open_rpl *rpl = cplhdr(m);
1491
1492	if (cdev->type != T3A && act_open_has_tid(rpl->status))
1493		cxgb_queue_tid_release(cdev, GET_TID(rpl));
1494
1495	active_open_failed(toep, m);
1496	return (0);
1497}
1498
1499/*
1500 * Handle an ARP failure for an active open.   XXX purge ofo queue
1501 *
1502 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1503 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1504 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
1505 * free the atid.  Hmm.
1506 */
1507#ifdef notyet
1508static void
1509act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1510{
1511	struct toepcb *toep = m_get_toep(m);
1512	struct tcpcb *tp = toep->tp_tp;
1513	struct inpcb *inp = tp->t_inpcb;
1514	struct socket *so;
1515
1516	inp_wlock(inp);
1517	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1518		/*
1519		 * drops the inpcb lock
1520		 */
1521		fail_act_open(so, EHOSTUNREACH);
1522		printf("freeing %p\n", m);
1523
1524		m_free(m);
1525	} else
1526		inp_wunlock(inp);
1527}
1528#endif
1529/*
1530 * Send an active open request.
1531 */
1532int
1533t3_connect(struct toedev *tdev, struct socket *so,
1534    struct rtentry *rt, struct sockaddr *nam)
1535{
1536	struct mbuf *m;
1537	struct l2t_entry *e;
1538	struct tom_data *d = TOM_DATA(tdev);
1539	struct inpcb *inp = so_sotoinpcb(so);
1540	struct tcpcb *tp = intotcpcb(inp);
1541	struct toepcb *toep; /* allocated by init_offload_socket */
1542
1543	int atid;
1544
1545	toep = toepcb_alloc();
1546	if (toep == NULL)
1547		goto out_err;
1548
1549	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1550		goto out_err;
1551
1552	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1553	if (!e)
1554		goto free_tid;
1555
1556	inp_lock_assert(inp);
1557 	m = m_gethdr(M_WAITOK, MT_DATA);
1558
1559#if 0
1560	m->m_toe.mt_toepcb = tp->t_toe;
1561	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1562#endif
1563	so_lock(so);
1564
1565	init_offload_socket(so, tdev, atid, e, rt, toep);
1566
1567	install_offload_ops(so);
1568
1569	mk_act_open_req(so, m, atid, e);
1570	so_unlock(so);
1571
1572	soisconnecting(so);
1573	toep = tp->t_toe;
1574	m_set_toep(m, tp->t_toe);
1575
1576	toep->tp_state = TCPS_SYN_SENT;
1577	l2t_send(d->cdev, (struct mbuf *)m, e);
1578
1579	if (toep->tp_ulp_mode)
1580		t3_enable_ddp(toep, 0);
1581	return 	(0);
1582
1583free_tid:
1584	printf("failing connect - free atid\n");
1585
1586	free_atid(d->cdev, atid);
1587out_err:
1588	printf("return ENOMEM\n");
1589       return (ENOMEM);
1590}
1591
1592/*
1593 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1594 * not send multiple ABORT_REQs for the same connection and also that we do
1595  * not try to send a message after the connection has closed.  It returns
1596  * silently if an ABORT_REQ turns out not to be needed.
1597 */
1598static void
1599t3_send_reset(struct toepcb *toep)
1600{
1601
1602	struct cpl_abort_req *req;
1603	unsigned int tid = toep->tp_tid;
1604	int mode = CPL_ABORT_SEND_RST;
1605	struct tcpcb *tp = toep->tp_tp;
1606	struct toedev *tdev = toep->tp_toedev;
1607	struct socket *so = NULL;
1608	struct mbuf *m;
1609	struct sockbuf *snd;
1610
1611	if (tp) {
1612		inp_lock_assert(tp->t_inpcb);
1613		so = inp_inpcbtosocket(tp->t_inpcb);
1614	}
1615
1616	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1617		tdev == NULL))
1618		return;
1619	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1620
1621 	/* Purge the send queue so we don't send anything after an abort. */
1622 	if (so) {
1623 		snd = so_sockbuf_snd(so);
1624 		sbflush(snd);
1625 	}
1625	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1626		mode |= CPL_ABORT_POST_CLOSE_REQ;
1627
1628	m = m_gethdr_nofail(sizeof(*req));
1629	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1630	set_arp_failure_handler(m, abort_arp_failure);
1631
1632	req = mtod(m, struct cpl_abort_req *);
1633	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1634	req->wr.wr_lo = htonl(V_WR_TID(tid));
1635	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1636	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1637	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1638	req->cmd = mode;
1639	if (tp && (tp->t_state == TCPS_SYN_SENT))
1640		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1641	else
1642		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1643}
1644
1645static int
1646t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1647{
1648	struct inpcb *inp;
1649	int error, optval;
1650
1651	if (sopt->sopt_name == IP_OPTIONS)
1652		return (ENOPROTOOPT);
1653
1654	if (sopt->sopt_name != IP_TOS)
1655		return (EOPNOTSUPP);
1656
1657	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1658
1659	if (error)
1660		return (error);
1661
1662 	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
1663		return (EPERM);
1664
1665	inp = so_sotoinpcb(so);
1666	inp_wlock(inp);
1667	inp_ip_tos_set(inp, optval);
1668#if 0
1669	inp->inp_ip_tos = optval;
1670#endif
1671	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1672	inp_wunlock(inp);
1673
1674	return (0);
1675}
1676
1677static int
1678t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1679{
1680	int err = 0;
1681	size_t copied;
1682
1683	if (sopt->sopt_name != TCP_CONGESTION &&
1684	    sopt->sopt_name != TCP_NODELAY)
1685		return (EOPNOTSUPP);
1686
1687	if (sopt->sopt_name == TCP_CONGESTION) {
1688		char name[TCP_CA_NAME_MAX];
1689		int optlen = sopt->sopt_valsize;
1690		struct tcpcb *tp;
1691
1692		if (sopt->sopt_dir == SOPT_GET) {
1693			KASSERT(0, ("unimplemented"));
1694			return (EOPNOTSUPP);
1695		}
1696
1697		if (optlen < 1)
1698			return (EINVAL);
1699
1700		err = copyinstr(sopt->sopt_val, name,
1701		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1702		if (err)
1703			return (err);
1704		if (copied < 1)
1705			return (EINVAL);
1706
1707		tp = so_sototcpcb(so);
1708		/*
1709		 * XXX I need to revisit this
1710		 */
1711		if ((err = t3_set_cong_control(so, name)) == 0) {
1712#ifdef CONGESTION_CONTROL_SUPPORTED
1713			tp->t_cong_control = strdup(name, M_CXGB);
1714#endif
1715		} else
1716			return (err);
1717	} else {
1718		int optval, oldval;
1719		struct inpcb *inp;
1720		struct tcpcb *tp;
1721
1722		if (sopt->sopt_dir == SOPT_GET)
1723			return (EOPNOTSUPP);
1724
1725		err = sooptcopyin(sopt, &optval, sizeof optval,
1726		    sizeof optval);
1727
1728		if (err)
1729			return (err);
1730
1731		inp = so_sotoinpcb(so);
1732		tp = inp_inpcbtotcpcb(inp);
1733
1734		inp_wlock(inp);
1735
1736		oldval = tp->t_flags;
1737		if (optval)
1738			tp->t_flags |= TF_NODELAY;
1739		else
1740			tp->t_flags &= ~TF_NODELAY;
1741		inp_wunlock(inp);
1742
1743
1744		if (oldval != tp->t_flags && (tp->t_toe != NULL))
1745			t3_set_nagle(tp->t_toe);
1746
1747	}
1748
1749	return (0);
1750}
1751
1752int
1753t3_ctloutput(struct socket *so, struct sockopt *sopt)
1754{
1755	int err;
1756
1757	if (sopt->sopt_level != IPPROTO_TCP)
1758		err =  t3_ip_ctloutput(so, sopt);
1759	else
1760		err = t3_tcp_ctloutput(so, sopt);
1761
1762	if (err != EOPNOTSUPP)
1763		return (err);
1764
1765	return (tcp_ctloutput(so, sopt));
1766}
1767
1768/*
1769 * Returns true if we need to explicitly request RST when we receive new data
1770 * on an RX-closed connection.
1771 */
1772static inline int
1773need_rst_on_excess_rx(const struct toepcb *toep)
1774{
1775	return (1);
1776}
1777
1778/*
1779 * Handles Rx data that arrives in a state where the socket isn't accepting
1780 * new data.
1781 */
1782static void
1783handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1784{
1785
1786	if (need_rst_on_excess_rx(toep) &&
1787	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1788		t3_send_reset(toep);
1789	m_freem(m);
1790}
1791
1792/*
1793 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1794 * by getting the DDP offset from the TCB.
1795 */
1796static void
1797tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1798{
1799	struct ddp_state *q = &toep->tp_ddp_state;
1800	struct ddp_buf_state *bsp;
1801	struct cpl_get_tcb_rpl *hdr;
1802	unsigned int ddp_offset;
1803	struct socket *so;
1804	struct tcpcb *tp;
1805	struct sockbuf *rcv;
1806	int state;
1807
1808	uint64_t t;
1809	__be64 *tcb;
1810
1811	tp = toep->tp_tp;
1812	so = inp_inpcbtosocket(tp->t_inpcb);
1813
1814	inp_lock_assert(tp->t_inpcb);
1815	rcv = so_sockbuf_rcv(so);
1816	sockbuf_lock(rcv);
1817
1818 	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
1819	 * We really need a cookie in order to dispatch the RPLs.
1820	 */
1821	q->get_tcb_count--;
1822
1823 	/* It is possible that a previous CPL already invalidated UBUF DDP
1824 	 * and moved the cur_buf idx, so that no further processing of this
1825 	 * mbuf is required. However, the app might be sleeping on
1826	 * !q->get_tcb_count and we need to wake it up.
1827	 */
1828	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1829		int state = so_state_get(so);
1830
1831		m_freem(m);
1832		if (__predict_true((state & SS_NOFDREF) == 0))
1833			so_sorwakeup_locked(so);
1834		else
1835			sockbuf_unlock(rcv);
1836
1837		return;
1838	}
1839
1840	bsp = &q->buf_state[q->cur_buf];
1841	hdr = cplhdr(m);
1842	tcb = (__be64 *)(hdr + 1);
1843	if (q->cur_buf == 0) {
1844		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1845		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1846	} else {
1847		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1848		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1849	}
1850	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1851	m->m_cur_offset = bsp->cur_offset;
1852	bsp->cur_offset = ddp_offset;
1853	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1854
1855	CTR5(KTR_TOM,
1856	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1857	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1858	KASSERT(ddp_offset >= m->m_cur_offset,
1859	    ("ddp_offset=%u less than cur_offset=%u",
1860		ddp_offset, m->m_cur_offset));
1861
1862#if 0
1863{
1864	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1865
1866	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1867	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1868
1869        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1870        rcv_nxt = t >> S_TCB_RCV_NXT;
1871        rcv_nxt &= M_TCB_RCV_NXT;
1872
1873        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1874        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1875        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1876
1877	T3_TRACE2(TIDTB(sk),
1878		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1879		  ddp_flags, rcv_nxt - rx_hdr_offset);
1880	T3_TRACE4(TB(q),
1881		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1882		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1883	T3_TRACE3(TB(q),
1884		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1885		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1886	T3_TRACE2(TB(q),
1887		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1888		 q->buf_state[0].flags, q->buf_state[1].flags);
1889
1890}
1891#endif
1892	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1893		handle_excess_rx(toep, m);
1894		return;
1895	}
1896
1897#ifdef T3_TRACE
1898	if ((int)m->m_pkthdr.len < 0) {
1899		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1900	}
1901#endif
1902	if (bsp->flags & DDP_BF_NOCOPY) {
1903#ifdef T3_TRACE
1904		T3_TRACE0(TB(q),
1905			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1906
1907		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1908			printk("!cancel_ubuf");
1909			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1910		}
1911#endif
1912		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1913		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1914		q->cur_buf ^= 1;
1915	} else if (bsp->flags & DDP_BF_NOFLIP) {
1916
1917		m->m_ddp_flags = 1;    /* always a kernel buffer */
1918
1919		/* now HW buffer carries a user buffer */
1920		bsp->flags &= ~DDP_BF_NOFLIP;
1921		bsp->flags |= DDP_BF_NOCOPY;
1922
1923		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1924		 * any new data in which case we're done. If in addition the
1925		 * offset is 0, then there wasn't a completion for the kbuf
1926		 * and we need to decrement the posted count.
1927		 */
1928		if (m->m_pkthdr.len == 0) {
1929			if (ddp_offset == 0) {
1930				q->kbuf_posted--;
1931				bsp->flags |= DDP_BF_NODATA;
1932			}
1933			sockbuf_unlock(rcv);
1934			m_free(m);
1935			return;
1936		}
1937	} else {
1938		sockbuf_unlock(rcv);
1939
1940		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1941		 * but it got here way late and nobody cares anymore.
1942		 */
1943		m_free(m);
1944		return;
1945	}
1946
1947	m->m_ddp_gl = (unsigned char *)bsp->gl;
1948	m->m_flags |= M_DDP;
1949	m->m_seq = tp->rcv_nxt;
1950	tp->rcv_nxt += m->m_pkthdr.len;
1951	tp->t_rcvtime = ticks;
1952	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1953		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1954	if (m->m_pkthdr.len == 0) {
1955		q->user_ddp_pending = 0;
1956		m_free(m);
1957	} else
1958		SBAPPEND(rcv, m);
1959
1960	state = so_state_get(so);
1961	if (__predict_true((state & SS_NOFDREF) == 0))
1962		so_sorwakeup_locked(so);
1963	else
1964		sockbuf_unlock(rcv);
1965}
1966
1967/*
1968 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1969 * in that case they are similar to DDP completions.
1970 */
1971static int
1972do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1973{
1974	struct toepcb *toep = (struct toepcb *)ctx;
1975
1976	/* OK if socket doesn't exist */
1977	if (toep == NULL) {
1978		printf("null toep in do_get_tcb_rpl\n");
1979		return (CPL_RET_BUF_DONE);
1980	}
1981
1982	inp_wlock(toep->tp_tp->t_inpcb);
1983	tcb_rpl_as_ddp_complete(toep, m);
1984	inp_wunlock(toep->tp_tp->t_inpcb);
1985
1986	return (0);
1987}
1988
1989static void
1990handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1991{
1992	struct tcpcb *tp = toep->tp_tp;
1993	struct socket *so;
1994	struct ddp_state *q;
1995	struct ddp_buf_state *bsp;
1996	struct cpl_rx_data *hdr = cplhdr(m);
1997	unsigned int rcv_nxt = ntohl(hdr->seq);
1998	struct sockbuf *rcv;
1999
2000	if (tp->rcv_nxt == rcv_nxt)
2001		return;
2002
2003	inp_lock_assert(tp->t_inpcb);
2004	so  = inp_inpcbtosocket(tp->t_inpcb);
2005	rcv = so_sockbuf_rcv(so);
2006	sockbuf_lock(rcv);
2007
2008	q = &toep->tp_ddp_state;
2009	bsp = &q->buf_state[q->cur_buf];
2010	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2011		rcv_nxt, tp->rcv_nxt));
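	/*
	 * The bytes between the stack's rcv_nxt and the sequence number in
	 * this CPL were already placed in the current DDP buffer by HW;
	 * account for them here.
	 */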
2012	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2013	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2014	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2015	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2016
2017#ifdef T3_TRACE
2018	if ((int)m->m_pkthdr.len < 0) {
2019		t3_ddp_error(so, "handle_ddp_data: neg len");
2020	}
2021#endif
2022	m->m_ddp_gl = (unsigned char *)bsp->gl;
2023	m->m_flags |= M_DDP;
2024	m->m_cur_offset = bsp->cur_offset;
2025	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2026	if (bsp->flags & DDP_BF_NOCOPY)
2027		bsp->flags &= ~DDP_BF_NOCOPY;
2028
2029	m->m_seq = tp->rcv_nxt;
2030	tp->rcv_nxt = rcv_nxt;
2031	bsp->cur_offset += m->m_pkthdr.len;
2032	if (!(bsp->flags & DDP_BF_NOFLIP))
2033		q->cur_buf ^= 1;
2034	/*
2035	 * For now, don't re-enable DDP after a connection fell out of  DDP
2036	 * mode.
2037	 */
2038	q->ubuf_ddp_ready = 0;
2039	sockbuf_unlock(rcv);
2040}
2041
2042/*
2043 * Process new data received for a connection.
2044 */
2045static void
2046new_rx_data(struct toepcb *toep, struct mbuf *m)
2047{
2048	struct cpl_rx_data *hdr = cplhdr(m);
2049	struct tcpcb *tp = toep->tp_tp;
2050	struct socket *so;
2051	struct sockbuf *rcv;
2052	int state;
2053	int len = be16toh(hdr->len);
2054
2055	inp_wlock(tp->t_inpcb);
2056
2057	so  = inp_inpcbtosocket(tp->t_inpcb);
2058
2059	if (__predict_false(so_no_receive(so))) {
2060		handle_excess_rx(toep, m);
2061		inp_wunlock(tp->t_inpcb);
2062		TRACE_EXIT;
2063		return;
2064	}
2065
2066	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2067		handle_ddp_data(toep, m);
2068
2069	m->m_seq = ntohl(hdr->seq);
2070	m->m_ulp_mode = 0;                    /* for iSCSI */
2071
2072#if VALIDATE_SEQ
2073	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2074		log(LOG_ERR,
2075		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2076		    toep->tp_toedev->tod_name, toep->tp_tid, m->m_seq,
2077		       tp->rcv_nxt);
2078		m_freem(m);
2079		inp_wunlock(tp->t_inpcb);
2080		return;
2081	}
2082#endif
2083	m_adj(m, sizeof(*hdr));
2084
2085#ifdef URGENT_DATA_SUPPORTED
2086	/*
2087	 * We don't handle urgent data yet
2088	 */
2089	if (__predict_false(hdr->urg))
2090		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2091	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2092		     tp->urg_seq - tp->rcv_nxt < skb->len))
2093		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2094							 tp->rcv_nxt];
2095#endif
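	/*
	 * HW reports the delayed-ACK mode it is currently using; if it
	 * changed, remember the new mode and the sequence number at which
	 * the change was observed.
	 */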
2096	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2097		toep->tp_delack_mode = hdr->dack_mode;
2098		toep->tp_delack_seq = tp->rcv_nxt;
2099	}
2100	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2101	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2102
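	/*
	 * If the mbuf carries more data than the CPL header reports, trim
	 * it down to the reported payload length.
	 */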
2103	if (len < m->m_pkthdr.len)
2104		m->m_pkthdr.len = m->m_len = len;
2105
2106	tp->rcv_nxt += m->m_pkthdr.len;
2107	tp->t_rcvtime = ticks;
2108	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2109	CTR2(KTR_TOM,
2110	    "new_rx_data: seq 0x%x len %u",
2111	    m->m_seq, m->m_pkthdr.len);
2112	inp_wunlock(tp->t_inpcb);
2113	rcv = so_sockbuf_rcv(so);
2114	sockbuf_lock(rcv);
2115#if 0
2116	if (sb_notify(rcv))
2117		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2118#endif
2119	SBAPPEND(rcv, m);
2120
2121#ifdef notyet
2122	/*
2123	 * We're giving too many credits to the card, so this check is disabled for now.
2124	 *
2125	 */
2126	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2127
2128	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2129		so, rcv->sb_cc, rcv->sb_mbmax));
2130#endif
2131
2132
2133	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2134	    rcv->sb_cc, rcv->sb_mbcnt);
2135
2136	state = so_state_get(so);
2137	if (__predict_true((state & SS_NOFDREF) == 0))
2138		so_sorwakeup_locked(so);
2139	else
2140		sockbuf_unlock(rcv);
2141}
2142
2143/*
2144 * Handler for RX_DATA CPL messages.
2145 */
2146static int
2147do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2148{
2149	struct toepcb *toep = (struct toepcb *)ctx;
2150
2151	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2152
2153	new_rx_data(toep, m);
2154
2155	return (0);
2156}
2157
2158static void
2159new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2160{
2161	struct tcpcb *tp;
2162	struct ddp_state *q;
2163	struct ddp_buf_state *bsp;
2164	struct cpl_rx_data_ddp *hdr;
2165	struct socket *so;
2166	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2167	int nomoredata = 0;
2168	unsigned int delack_mode;
2169	struct sockbuf *rcv;
2170
2171	tp = toep->tp_tp;
2172	inp_wlock(tp->t_inpcb);
2173	so = inp_inpcbtosocket(tp->t_inpcb);
2174
2175	if (__predict_false(so_no_receive(so))) {
2176
2177		handle_excess_rx(toep, m);
2178		inp_wunlock(tp->t_inpcb);
2179		return;
2180	}
2181
2182	q = &toep->tp_ddp_state;
2183	hdr = cplhdr(m);
2184	ddp_report = ntohl(hdr->u.ddp_report);
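	/* The DDP report says which of the two HW buffers the data landed in. */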
2185	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2186	bsp = &q->buf_state[buf_idx];
2187
2188	CTR4(KTR_TOM,
2189	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2190	    "hdr seq 0x%x len %u",
2191	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2192	    ntohs(hdr->len));
2193	CTR3(KTR_TOM,
2194	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2195	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2196
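	/*
	 * The CPL gives the starting sequence and the number of bytes HW
	 * placed via DDP; the new rcv_nxt is just past the placed data.
	 */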
2197	ddp_len = ntohs(hdr->len);
2198	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2199
2200	delack_mode = G_DDP_DACK_MODE(ddp_report);
2201	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2202		toep->tp_delack_mode = delack_mode;
2203		toep->tp_delack_seq = tp->rcv_nxt;
2204	}
2205
2206	m->m_seq = tp->rcv_nxt;
2207	tp->rcv_nxt = rcv_nxt;
2208
2209	tp->t_rcvtime = ticks;
2210	/*
2211	 * Store the length in m->m_len.  We are changing the meaning of
2212	 * m->m_len here, so we need to be very careful that nothing from now
2213	 * on interprets the length of this mbuf the usual way.
2214	 */
2215	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2216	inp_wunlock(tp->t_inpcb);
2217	CTR3(KTR_TOM,
2218	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2219	    m->m_len, rcv_nxt, m->m_seq);
2220	/*
2221	 * Figure out where the new data was placed in the buffer and store it
2222	 * in m_cur_offset.  This assumes the buffer offset starts at 0; the
2223	 * consumer needs to account for the page pod's pg_offset.
2224	 */
2225	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2226	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2227
2228	rcv = so_sockbuf_rcv(so);
2229	sockbuf_lock(rcv);
2230
2231	m->m_ddp_gl = (unsigned char *)bsp->gl;
2232	m->m_flags |= M_DDP;
2233	bsp->cur_offset = end_offset;
2234	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2235
2236	/*
2237	 * Length is only meaningful for kbuf
2238	 */
2239	if (!(bsp->flags & DDP_BF_NOCOPY))
2240		KASSERT(m->m_len <= bsp->gl->dgl_length,
2241		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2242			m->m_len, bsp->gl->dgl_length));
2243
2244	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2245	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2246        /*
2247	 * Bit 0 of flags stores whether the DDP buffer is completed.
2248	 * Note that other parts of the code depend on this being in bit 0.
2249	 */
2250	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2251		panic("spurious ddp completion");
2252	} else {
2253		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2254		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2255			q->cur_buf ^= 1;                     /* flip buffers */
2256	}
2257
2258	if (bsp->flags & DDP_BF_NOCOPY) {
2259		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2260		bsp->flags &= ~DDP_BF_NOCOPY;
2261	}
2262
2263	if (ddp_report & F_DDP_PSH)
2264		m->m_ddp_flags |= DDP_BF_PSH;
2265	if (nomoredata)
2266		m->m_ddp_flags |= DDP_BF_NODATA;
2267
2268#ifdef notyet
2269	skb_reset_transport_header(skb);
2270	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2271#endif
2272	SBAPPEND(rcv, m);
2273
2274	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2275	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2276		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2277		so_sorwakeup_locked(so);
2278	else
2279		sockbuf_unlock(rcv);
2280}
2281
2282#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2283		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2284		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2285		 F_DDP_INVALID_PPOD)
2286
2287/*
2288 * Handler for RX_DATA_DDP CPL messages.
2289 */
2290static int
2291do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2292{
2293	struct toepcb *toep = ctx;
2294	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2295
2296	VALIDATE_SOCK(so);
2297
2298	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2299		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2300		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2301		return (CPL_RET_BUF_DONE);
2302	}
2303#if 0
2304	skb->h.th = tcphdr_skb->h.th;
2305#endif
2306	new_rx_data_ddp(toep, m);
2307	return (0);
2308}
2309
2310static void
2311process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2312{
2313	struct tcpcb *tp = toep->tp_tp;
2314	struct socket *so;
2315	struct ddp_state *q;
2316	struct ddp_buf_state *bsp;
2317	struct cpl_rx_ddp_complete *hdr;
2318	unsigned int ddp_report, buf_idx, when, delack_mode;
2319	int nomoredata = 0;
2320	struct sockbuf *rcv;
2321
2322	inp_wlock(tp->t_inpcb);
2323	so = inp_inpcbtosocket(tp->t_inpcb);
2324
2325	if (__predict_false(so_no_receive(so))) {
2326		struct inpcb *inp = so_sotoinpcb(so);
2327
2328		handle_excess_rx(toep, m);
2329		inp_wunlock(inp);
2330		return;
2331	}
2332	q = &toep->tp_ddp_state;
2333	hdr = cplhdr(m);
2334	ddp_report = ntohl(hdr->ddp_report);
2335	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2336	m->m_pkthdr.csum_data = tp->rcv_nxt;
2337
2338	rcv = so_sockbuf_rcv(so);
2339	sockbuf_lock(rcv);
2340
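	/*
	 * The completion reports the final offset within the HW buffer; the
	 * newly placed data runs from the previous cur_offset up to that
	 * offset.
	 */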
2341	bsp = &q->buf_state[buf_idx];
2342	when = bsp->cur_offset;
2343	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2344	tp->rcv_nxt += m->m_len;
2345	tp->t_rcvtime = ticks;
2346
2347	delack_mode = G_DDP_DACK_MODE(ddp_report);
2348	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2349		toep->tp_delack_mode = delack_mode;
2350		toep->tp_delack_seq = tp->rcv_nxt;
2351	}
2352#ifdef notyet
2353	skb_reset_transport_header(skb);
2354	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2355#endif
2356	inp_wunlock(tp->t_inpcb);
2357
2358	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2359	CTR5(KTR_TOM,
2360		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2361		  "ddp_report 0x%x offset %u, len %u",
2362		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2363		   G_DDP_OFFSET(ddp_report), m->m_len);
2364
2365	m->m_cur_offset = bsp->cur_offset;
2366	bsp->cur_offset += m->m_len;
2367
2368	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2369		q->cur_buf ^= 1;                     /* flip buffers */
2370		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2371			nomoredata = 1;
2372	}
2373
2374	CTR4(KTR_TOM,
2375		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2376		  "ddp_report %u offset %u",
2377		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2378		   G_DDP_OFFSET(ddp_report));
2379
2380	m->m_ddp_gl = (unsigned char *)bsp->gl;
2381	m->m_flags |= M_DDP;
2382	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2383	if (bsp->flags & DDP_BF_NOCOPY)
2384		bsp->flags &= ~DDP_BF_NOCOPY;
2385	if (nomoredata)
2386		m->m_ddp_flags |= DDP_BF_NODATA;
2387
2388	SBAPPEND(rcv, m);
2389	if ((so_state_get(so) & SS_NOFDREF) == 0)
2390		so_sorwakeup_locked(so);
2391	else
2392		sockbuf_unlock(rcv);
2393}
2394
2395/*
2396 * Handler for RX_DDP_COMPLETE CPL messages.
2397 */
2398static int
2399do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2400{
2401	struct toepcb *toep = ctx;
2402
2403	VALIDATE_SOCK(so);
2404#if 0
2405	skb->h.th = tcphdr_skb->h.th;
2406#endif
2407	process_ddp_complete(toep, m);
2408	return (0);
2409}
2410
2411/*
2412 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2413 * socket state before calling tcp_time_wait to comply with its expectations.
2414 */
2415static void
2416enter_timewait(struct tcpcb *tp)
2417{
2418	/*
2419	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2420	 * process peer_close because we don't want to carry the peer FIN in
2421	 * the socket's receive queue and if we increment rcv_nxt without
2422	 * having the FIN in the receive queue we'll confuse facilities such
2423	 * as SIOCINQ.
2424	 */
2425	inp_wlock(tp->t_inpcb);
2426	tp->rcv_nxt++;
2427
2428	tp->ts_recent_age = 0;	     /* defeat recycling */
2429	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2430	inp_wunlock(tp->t_inpcb);
2431	tcp_offload_twstart(tp);
2432}
2433
2434static void
2435enter_timewait_disconnect(struct tcpcb *tp)
2436{
2437	/*
2438	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2439	 * process peer_close because we don't want to carry the peer FIN in
2440	 * the socket's receive queue and if we increment rcv_nxt without
2441	 * having the FIN in the receive queue we'll confuse facilities such
2442	 * as SIOCINQ.
2443	 */
2444	inp_wlock(tp->t_inpcb);
2445	tp->rcv_nxt++;
2446
2447	tp->ts_recent_age = 0;	     /* defeat recycling */
2448	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2449	inp_wunlock(tp->t_inpcb);
2450	tcp_offload_twstart_disconnect(tp);
2451}
2452
2453/*
2454 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2455 * function deals with the data that may be reported along with the FIN.
2456 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2457 * perform normal FIN-related processing.  In the latter case 1 indicates that
2458 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
2459 * that the mbuf can be freed.
2460 */
2461static int
2462handle_peer_close_data(struct socket *so, struct mbuf *m)
2463{
2464	struct tcpcb *tp = so_sototcpcb(so);
2465	struct toepcb *toep = tp->t_toe;
2466	struct ddp_state *q;
2467	struct ddp_buf_state *bsp;
2468	struct cpl_peer_close *req = cplhdr(m);
2469	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2470	struct sockbuf *rcv;
2471
2472	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2473		return (0);
2474
2475	CTR0(KTR_TOM, "handle_peer_close_data");
2476	if (__predict_false(so_no_receive(so))) {
2477		handle_excess_rx(toep, m);
2478
2479		/*
2480		 * Although we discard the data we want to process the FIN so
2481		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2482		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2483		 * may be what will close the connection.  We return 1 because
2484		 * handle_excess_rx() already freed the packet.
2485		 */
2486		return (1);
2487	}
2488
2489	inp_lock_assert(tp->t_inpcb);
2490	q = &toep->tp_ddp_state;
2491	rcv = so_sockbuf_rcv(so);
2492	sockbuf_lock(rcv);
2493
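	/*
	 * The bytes between the stack's rcv_nxt and the sequence just before
	 * the FIN were placed in the current DDP buffer; build an mbuf that
	 * describes them.
	 */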
2494	bsp = &q->buf_state[q->cur_buf];
2495	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2496	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2497	m->m_ddp_gl = (unsigned char *)bsp->gl;
2498	m->m_flags |= M_DDP;
2499	m->m_cur_offset = bsp->cur_offset;
2500	m->m_ddp_flags =
2501	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2502	m->m_seq = tp->rcv_nxt;
2503	tp->rcv_nxt = rcv_nxt;
2504	bsp->cur_offset += m->m_pkthdr.len;
2505	if (!(bsp->flags & DDP_BF_NOFLIP))
2506		q->cur_buf ^= 1;
2507#ifdef notyet
2508	skb_reset_transport_header(skb);
2509	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2510#endif
2511	tp->t_rcvtime = ticks;
2512	SBAPPEND(rcv, m);
2513	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2514		so_sorwakeup_locked(so);
2515	else
2516		sockbuf_unlock(rcv);
2517
2518	return (1);
2519}
2520
2521/*
2522 * Handle a peer FIN.
2523 */
2524static void
2525do_peer_fin(struct toepcb *toep, struct mbuf *m)
2526{
2527	struct socket *so;
2528	struct tcpcb *tp = toep->tp_tp;
2529	int keep, action;
2530
2531	action = keep = 0;
2532	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2533	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2534		printf("abort_pending set\n");
2535
2536		goto out;
2537	}
2538	inp_wlock(tp->t_inpcb);
2539	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2540	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2541		keep = handle_peer_close_data(so, m);
2542		if (keep < 0) {
2543			inp_wunlock(tp->t_inpcb);
2544			return;
2545		}
2546	}
2547	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2548		CTR1(KTR_TOM,
2549		    "waking up waiters for cantrcvmore on %p ", so);
2550		socantrcvmore(so);
2551
2552		/*
2553		 * If connection is half-synchronized
2554		 * (ie NEEDSYN flag on) then delay ACK,
2555		 * so it may be piggybacked when SYN is sent.
2556		 * Otherwise, since we received a FIN then no
2557		 * more input can be expected, send ACK now.
2558		 */
2559		if (tp->t_flags & TF_NEEDSYN)
2560			tp->t_flags |= TF_DELACK;
2561		else
2562			tp->t_flags |= TF_ACKNOW;
2563		tp->rcv_nxt++;
2564	}
2565
2566	switch (tp->t_state) {
2567	case TCPS_SYN_RECEIVED:
2568		tp->t_starttime = ticks;
2569	/* FALLTHROUGH */
2570	case TCPS_ESTABLISHED:
2571		tp->t_state = TCPS_CLOSE_WAIT;
2572		break;
2573	case TCPS_FIN_WAIT_1:
2574		tp->t_state = TCPS_CLOSING;
2575		break;
2576	case TCPS_FIN_WAIT_2:
2577		/*
2578		 * If we've sent an abort_req we must have sent it too late,
2579		 * HW will send us a reply telling us so, and this peer_close
2580		 * is really the last message for this connection and needs to
2581		 * be treated as an abort_rpl, i.e., transition the connection
2582		 * to TCP_CLOSE (note that the host stack does this at the
2583		 * time of generating the RST but we must wait for HW).
2584		 * Otherwise we enter TIME_WAIT.
2585		 */
2586		t3_release_offload_resources(toep);
2587		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2588			action = TCP_CLOSE;
2589		} else {
2590			action = TCP_TIMEWAIT;
2591		}
2592		break;
2593	default:
2594		log(LOG_ERR,
2595		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2596		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2597	}
2598	inp_wunlock(tp->t_inpcb);
2599
2600	if (action == TCP_TIMEWAIT) {
2601		enter_timewait(tp);
2602	} else if (action == TCP_DROP) {
2603		tcp_offload_drop(tp, 0);
2604	} else if (action == TCP_CLOSE) {
2605		tcp_offload_close(tp);
2606	}
2607
2608#ifdef notyet
2609	/* Do not send POLL_HUP for half duplex close. */
2610	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2611	    sk->sk_state == TCP_CLOSE)
2612		sk_wake_async(so, 1, POLL_HUP);
2613	else
2614		sk_wake_async(so, 1, POLL_IN);
2615#endif
2616
2617out:
2618	if (!keep)
2619		m_free(m);
2620}
2621
2622/*
2623 * Handler for PEER_CLOSE CPL messages.
2624 */
2625static int
2626do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2627{
2628	struct toepcb *toep = (struct toepcb *)ctx;
2629
2630	VALIDATE_SOCK(so);
2631
2632	do_peer_fin(toep, m);
2633	return (0);
2634}
2635
2636static void
2637process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2638{
2639	struct cpl_close_con_rpl *rpl = cplhdr(m);
2640	struct tcpcb *tp = toep->tp_tp;
2641	struct socket *so;
2642	int action = 0;
2643	struct sockbuf *rcv;
2644
2645	inp_wlock(tp->t_inpcb);
2646	so = inp_inpcbtosocket(tp->t_inpcb);
2647
2648	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2649
2650	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2651		inp_wunlock(tp->t_inpcb);
2652		goto out;
2653	}
2654
2655	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2656	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2657
2658	switch (tp->t_state) {
2659	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2660		t3_release_offload_resources(toep);
2661		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2662			action = TCP_CLOSE;
2663
2664		} else {
2665			action = TCP_TIMEWAIT;
2666		}
2667		break;
2668	case TCPS_LAST_ACK:
2669		/*
2670		 * In this state we don't care about pending abort_rpl.
2671		 * If we've sent abort_req it was post-close and was sent too
2672		 * late, this close_con_rpl is the actual last message.
2673		 */
2674		t3_release_offload_resources(toep);
2675		action = TCP_CLOSE;
2676		break;
2677	case TCPS_FIN_WAIT_1:
2678		/*
2679		 * If we can't receive any more
2680		 * data, then closing user can proceed.
2681		 * Starting the timer is contrary to the
2682		 * specification, but if we don't get a FIN
2683		 * we'll hang forever.
2684		 *
2685		 * XXXjl:
2686		 * we should release the tp also, and use a
2687		 * compressed state.
2688		 */
2689		if (so)
2690			rcv = so_sockbuf_rcv(so);
2691		else
2692			break;
2693
2694		if (rcv->sb_state & SBS_CANTRCVMORE) {
2695			int timeout;
2696
2697			if (so)
2698				soisdisconnected(so);
2699			timeout = (tcp_fast_finwait2_recycle) ?
2700			    tcp_finwait2_timeout : tcp_maxidle;
2701			tcp_timer_activate(tp, TT_2MSL, timeout);
2702		}
2703		tp->t_state = TCPS_FIN_WAIT_2;
2704		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2705		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2706			action = TCP_DROP;
2707		}
2708
2709		break;
2710	default:
2711		log(LOG_ERR,
2712		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2713		       toep->tp_toedev->tod_name, toep->tp_tid,
2714		       tp->t_state);
2715	}
2716	inp_wunlock(tp->t_inpcb);
2717
2718
2719	if (action == TCP_TIMEWAIT) {
2720		enter_timewait_disconnect(tp);
2721	} else if (action == TCP_DROP) {
2722		tcp_offload_drop(tp, 0);
2723	} else if (action == TCP_CLOSE) {
2724		tcp_offload_close(tp);
2725	}
2726out:
2727	m_freem(m);
2728}
2729
2730/*
2731 * Handler for CLOSE_CON_RPL CPL messages.
2732 */
2733static int
2734do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2735			    void *ctx)
2736{
2737	struct toepcb *toep = (struct toepcb *)ctx;
2738
2739	process_close_con_rpl(toep, m);
2740	return (0);
2741}
2742
2743/*
2744 * Process abort replies.  We only process these messages if we anticipate
2745 * them as the coordination between SW and HW in this area is somewhat lacking
2746 * and sometimes we get ABORT_RPLs after we are done with the connection that
2747 * originated the ABORT_REQ.
2748 */
2749static void
2750process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2751{
2752	struct tcpcb *tp = toep->tp_tp;
2753	struct socket *so;
2754	int needclose = 0;
2755
2756#ifdef T3_TRACE
2757	T3_TRACE1(TIDTB(sk),
2758		  "process_abort_rpl: GTS rpl pending %d",
2759		  sock_flag(sk, ABORT_RPL_PENDING));
2760#endif
2761
2762	inp_wlock(tp->t_inpcb);
2763	so = inp_inpcbtosocket(tp->t_inpcb);
2764
2765	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2766		/*
2767		 * XXX panic on tcpdrop
2768		 */
2769		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2770			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2771		else {
2772			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2773			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2774			    !is_t3a(toep->tp_toedev)) {
2775				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2776					panic("TP_ABORT_REQ_RCVD set");
2777				t3_release_offload_resources(toep);
2778				needclose = 1;
2779			}
2780		}
2781	}
2782	inp_wunlock(tp->t_inpcb);
2783
2784	if (needclose)
2785		tcp_offload_close(tp);
2786
2787	m_free(m);
2788}
2789
2790/*
2791 * Handle an ABORT_RPL_RSS CPL message.
2792 */
2793static int
2794do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2795{
2796	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2797	struct toepcb *toep;
2798
2799	/*
2800	 * Ignore replies to post-close aborts indicating that the abort was
2801	 * requested too late.  These connections are terminated when we get
2802	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2803	 * arrives the TID is either no longer used or it has been recycled.
2804	 */
2805	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2806discard:
2807		m_free(m);
2808		return (0);
2809	}
2810
2811	toep = (struct toepcb *)ctx;
2812
2813        /*
2814	 * Sometimes we've already closed the socket, e.g., a post-close
2815	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2816	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2817	 * but FW turns the ABORT_REQ into a regular one and so we get
2818	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2819	 */
2820	if (!toep)
2821		goto discard;
2822
2823	if (toep->tp_tp == NULL) {
2824		log(LOG_NOTICE, "removing tid for abort\n");
2825		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2826		if (toep->tp_l2t)
2827			l2t_release(L2DATA(cdev), toep->tp_l2t);
2828
2829		toepcb_release(toep);
2830		goto discard;
2831	}
2832
2833	log(LOG_NOTICE, "toep=%p\n", toep);
2834	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2835
2836	toepcb_hold(toep);
2837	process_abort_rpl(toep, m);
2838	toepcb_release(toep);
2839	return (0);
2840}
2841
2842/*
2843 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2844 * indicate whether RST should be sent in response.
2845 */
2846static int
2847abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2848{
2849	struct tcpcb *tp = so_sototcpcb(so);
2850
2851	switch (abort_reason) {
2852	case CPL_ERR_BAD_SYN:
2853#if 0
2854		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2855#endif
2856	case CPL_ERR_CONN_RESET:
2857		// XXX need to handle SYN_RECV due to crossed SYNs
2858		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2859	case CPL_ERR_XMIT_TIMEDOUT:
2860	case CPL_ERR_PERSIST_TIMEDOUT:
2861	case CPL_ERR_FINWAIT2_TIMEDOUT:
2862	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2863#if 0
2864		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2865#endif
2866		return (ETIMEDOUT);
2867	default:
2868		return (EIO);
2869	}
2870}
2871
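/*
 * Fill in an ABORT_RPL work request.  "cmd" tells HW whether an RST should
 * be sent to the peer (CPL_ABORT_NO_RST suppresses it).
 */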
2872static inline void
2873set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2874{
2875	struct cpl_abort_rpl *rpl = cplhdr(m);
2876
2877	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2878	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2879	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2880
2881	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2882	rpl->cmd = cmd;
2883}
2884
2885static void
2886send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2887{
2888	struct mbuf *reply_mbuf;
2889	struct cpl_abort_req_rss *req = cplhdr(m);
2890
2891	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2892	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2893	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2894	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2895	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2896	m_free(m);
2897}
2898
2899/*
2900 * Returns whether an ABORT_REQ_RSS message is negative advice.
2901 */
2902static inline int
2903is_neg_adv_abort(unsigned int status)
2904{
2905	return status == CPL_ERR_RTX_NEG_ADVICE ||
2906	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2907}
2908
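/*
 * Send an ABORT_RPL in response to an incoming ABORT_REQ.  If no mbuf can be
 * allocated the reply is deferred via t3_defer_reply().
 */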
2909static void
2910send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2911{
2912	struct mbuf  *reply_mbuf;
2913	struct cpl_abort_req_rss *req = cplhdr(m);
2914
2915	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2916
2917	if (!reply_mbuf) {
2918		/* Defer the reply.  Stick rst_status into req->status. */
2919		req->status = rst_status;
2920		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2921		return;
2922	}
2923
2924	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2925	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2926	m_free(m);
2927
2928	/*
2929	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2930	 * these messages while ARP is pending.  For other connection states
2931	 * it's not a problem.
2932	 */
2933	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2934}
2935
2936#ifdef notyet
2937static void
2938cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2939{
2940	CXGB_UNIMPLEMENTED();
2941#ifdef notyet
2942	struct request_sock *req = child->sk_user_data;
2943
2944	inet_csk_reqsk_queue_removed(parent, req);
2945	synq_remove(tcp_sk(child));
2946	__reqsk_free(req);
2947	child->sk_user_data = NULL;
2948#endif
2949}
2950
2951
2952/*
2953 * Performs the actual work to abort a SYN_RECV connection.
2954 */
2955static void
2956do_abort_syn_rcv(struct socket *child, struct socket *parent)
2957{
2958	struct tcpcb *parenttp = so_sototcpcb(parent);
2959	struct tcpcb *childtp = so_sototcpcb(child);
2960
2961	/*
2962	 * If the server is still open we clean up the child connection,
2963	 * otherwise the server already did the clean up as it was purging
2964	 * its SYN queue and the skb was just sitting in its backlog.
2965	 */
2966	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2967		cleanup_syn_rcv_conn(child, parent);
2968		inp_wlock(childtp->t_inpcb);
2969		t3_release_offload_resources(childtp->t_toe);
2970		inp_wunlock(childtp->t_inpcb);
2971		tcp_offload_close(childtp);
2972	}
2973}
2974#endif
2975
2976/*
2977 * Handle abort requests for a SYN_RECV connection.  These need extra work
2978 * because the socket is on its parent's SYN queue.
2979 */
2980static int
2981abort_syn_rcv(struct socket *so, struct mbuf *m)
2982{
2983	CXGB_UNIMPLEMENTED();
2984#ifdef notyet
2985	struct socket *parent;
2986	struct toedev *tdev = toep->tp_toedev;
2987	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2988	struct socket *oreq = so->so_incomp;
2989	struct t3c_tid_entry *t3c_stid;
2990	struct tid_info *t;
2991
2992	if (!oreq)
2993		return -1;        /* somehow we are not on the SYN queue */
2994
2995	t = &(T3C_DATA(cdev))->tid_maps;
2996	t3c_stid = lookup_stid(t, oreq->ts_recent);
2997	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2998
2999	so_lock(parent);
3000	do_abort_syn_rcv(so, parent);
3001	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
3002	so_unlock(parent);
3003#endif
3004	return (0);
3005}
3006
3007/*
3008 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3009 * request except that we need to reply to it.
3010 */
3011static void
3012process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3013{
3014	int rst_status = CPL_ABORT_NO_RST;
3015	const struct cpl_abort_req_rss *req = cplhdr(m);
3016	struct tcpcb *tp = toep->tp_tp;
3017	struct socket *so;
3018	int needclose = 0;
3019
3020	inp_wlock(tp->t_inpcb);
3021	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3022	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3023		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3024		m_free(m);
3025		goto skip;
3026	}
3027
3028	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3029	/*
3030	 * Three cases to consider:
3031	 * a) We haven't sent an abort_req; close the connection.
3032	 * b) We have sent a post-close abort_req that will get to TP too late
3033	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3034	 *    be ignored and the connection should be closed now.
3035	 * c) We have sent a regular abort_req that will get to TP too late.
3036	 *    That will generate an abort_rpl with status 0, wait for it.
3037	 */
3038	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3039	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3040		int error;
3041
3042		error = abort_status_to_errno(so, req->status,
3043		    &rst_status);
3044		so_error_set(so, error);
3045
3046		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3047			so_sorwakeup(so);
3048		/*
3049		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3050		 * returns 0 it has taken care of the abort.
3051		 */
3052		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3053			goto skip;
3054
3055		t3_release_offload_resources(toep);
3056		needclose = 1;
3057	}
3058	inp_wunlock(tp->t_inpcb);
3059
3060	if (needclose)
3061		tcp_offload_close(tp);
3062
3063	send_abort_rpl(m, tdev, rst_status);
3064	return;
3065skip:
3066	inp_wunlock(tp->t_inpcb);
3067}
3068
3069/*
3070 * Handle an ABORT_REQ_RSS CPL message.
3071 */
3072static int
3073do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3074{
3075	const struct cpl_abort_req_rss *req = cplhdr(m);
3076	struct toepcb *toep = (struct toepcb *)ctx;
3077
3078	if (is_neg_adv_abort(req->status)) {
3079		m_free(m);
3080		return (0);
3081	}
3082
3083	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3084
3085	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3086		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3087		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3088
3089		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3090		if (toep->tp_l2t)
3091			l2t_release(L2DATA(cdev), toep->tp_l2t);
3092
3093		/*
3094		 *  Unhook the toepcb from the tcpcb
3095		 */
3096		toep->tp_tp->t_toe = NULL;
3097		toep->tp_tp->t_flags &= ~TF_TOE;
3098		toep->tp_tp = NULL;
3099		/*
3100		 * XXX need to call syncache_chkrst - but we don't
3101		 * have a way of doing that yet
3102		 */
3103		toepcb_release(toep);
3104		log(LOG_ERR, "abort for unestablished connection :-(\n");
3105		return (0);
3106	}
3107	if (toep->tp_tp == NULL) {
3108		log(LOG_NOTICE, "disconnected toepcb\n");
3109		/* should be freed momentarily */
3110		return (0);
3111	}
3112
3113
3114	toepcb_hold(toep);
3115	process_abort_req(toep, m, toep->tp_toedev);
3116	toepcb_release(toep);
3117	return (0);
3118}
3119#ifdef notyet
3120static void
3121pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3122{
3123	struct toedev *tdev = TOE_DEV(parent);
3124
3125	do_abort_syn_rcv(child, parent);
3126	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3127		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3128
3129		rpl->opt0h = htonl(F_TCAM_BYPASS);
3130		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3131		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3132	} else
3133		m_free(m);
3134}
3135#endif
3136static void
3137handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3138{
3139	CXGB_UNIMPLEMENTED();
3140
3141#ifdef notyet
3142	struct t3cdev *cdev;
3143	struct socket *parent;
3144	struct socket *oreq;
3145	struct t3c_tid_entry *t3c_stid;
3146	struct tid_info *t;
3147	struct tcpcb *otp, *tp = so_sototcpcb(so);
3148	struct toepcb *toep = tp->t_toe;
3149
3150	/*
3151	 * If the connection is being aborted due to the parent listening
3152	 * socket going away there's nothing to do, the ABORT_REQ will close
3153	 * the connection.
3154	 */
3155	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3156		m_free(m);
3157		return;
3158	}
3159
3160	oreq = so->so_incomp;
3161	otp = so_sototcpcb(oreq);
3162
3163	cdev = T3C_DEV(so);
3164	t = &(T3C_DATA(cdev))->tid_maps;
3165	t3c_stid = lookup_stid(t, otp->ts_recent);
3166	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3167
3168	so_lock(parent);
3169	pass_open_abort(so, parent, m);
3170	so_unlock(parent);
3171#endif
3172}
3173
3174/*
3175 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3176 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3177 * connection.
3178 */
3179static void
3180pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3181{
3182
3183#ifdef notyet
3184	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3185	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3186#endif
3187	handle_pass_open_arp_failure(m_get_socket(m), m);
3188}
3189
3190/*
3191 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3192 */
3193static void
3194mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3195{
3196	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3197	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3198	unsigned int tid = GET_TID(req);
3199
3200	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3201	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3202	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3203	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3204	rpl->opt0h = htonl(F_TCAM_BYPASS);
3205	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3206	rpl->opt2 = 0;
3207	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3208}
3209
3210/*
3211 * Send a deferred reject to an accept request.
3212 */
3213static void
3214reject_pass_request(struct toedev *tdev, struct mbuf *m)
3215{
3216	struct mbuf *reply_mbuf;
3217
3218	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3219	mk_pass_accept_rpl(reply_mbuf, m);
3220	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3221	m_free(m);
3222}
3223
3224static void
3225handle_syncache_event(int event, void *arg)
3226{
3227	struct toepcb *toep = arg;
3228
3229	switch (event) {
3230	case TOE_SC_ENTRY_PRESENT:
3231		/*
3232		 * entry already exists - free toepcb
3233		 * and l2t
3234		 */
3235		printf("syncache entry present\n");
3236		toepcb_release(toep);
3237		break;
3238	case TOE_SC_DROP:
3239		/*
3240		 * The syncache has given up on this entry:
3241		 * either it timed out or it was evicted.
3242		 * We need to explicitly release the TID.
3243		 */
3244		printf("syncache entry dropped\n");
3245		toepcb_release(toep);
3246		break;
3247	default:
3248		log(LOG_ERR, "unknown syncache event %d\n", event);
3249		break;
3250	}
3251}
3252
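/*
 * Add a passive-open request to the host syncache so that it can later be
 * expanded into a full socket when the CPL_PASS_ESTABLISH arrives.
 */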
3253static void
3254syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3255{
3256	struct in_conninfo inc;
3257	struct tcpopt to;
3258	struct tcphdr th;
3259	struct inpcb *inp;
3260	int mss, wsf, sack, ts;
3261	uint32_t rcv_isn = ntohl(req->rcv_isn);
3262
3263	bzero(&to, sizeof(struct tcpopt));
3264	inp = so_sotoinpcb(lso);
3265
3266	/*
3267	 * Fill out information for entering us into the syncache
3268	 */
3269	inc.inc_fport = th.th_sport = req->peer_port;
3270	inc.inc_lport = th.th_dport = req->local_port;
3271	th.th_seq = req->rcv_isn;
3272	th.th_flags = TH_SYN;
3273
3274	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3275
3276
3277	inc.inc_isipv6 = 0;
3278	inc.inc_len = 0;
3279	inc.inc_faddr.s_addr = req->peer_ip;
3280	inc.inc_laddr.s_addr = req->local_ip;
3281
3282	DPRINTF("syncache add of %d:%d %d:%d\n",
3283	    ntohl(req->local_ip), ntohs(req->local_port),
3284	    ntohl(req->peer_ip), ntohs(req->peer_port));
3285
3286	mss = req->tcp_options.mss;
3287	wsf = req->tcp_options.wsf;
3288	ts = req->tcp_options.tstamp;
3289	sack = req->tcp_options.sack;
3290	to.to_mss = mss;
3291	to.to_wscale = wsf;
3292	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3293	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3294}
3295
3296
3297/*
3298 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3299 * lock held.  Note that the sock here is a listening socket that is not owned
3300 * by the TOE.
3301 */
3302static void
3303process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3304    struct listen_ctx *lctx)
3305{
3306	int rt_flags;
3307	struct l2t_entry *e;
3308	struct iff_mac tim;
3309	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3310	struct cpl_pass_accept_rpl *rpl;
3311	struct cpl_pass_accept_req *req = cplhdr(m);
3312	unsigned int tid = GET_TID(req);
3313	struct tom_data *d = TOM_DATA(tdev);
3314	struct t3cdev *cdev = d->cdev;
3315	struct tcpcb *tp = so_sototcpcb(so);
3316	struct toepcb *newtoep;
3317	struct rtentry *dst;
3318	struct sockaddr_in nam;
3319	struct t3c_data *td = T3C_DATA(cdev);
3320
3321	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3322	if (__predict_false(reply_mbuf == NULL)) {
3323		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3324			t3_defer_reply(m, tdev, reject_pass_request);
3325		else {
3326			cxgb_queue_tid_release(cdev, tid);
3327			m_free(m);
3328		}
3329		DPRINTF("failed to get reply_mbuf\n");
3330
3331		goto out;
3332	}
3333
3334	if (tp->t_state != TCPS_LISTEN) {
3335		DPRINTF("socket not in listen state\n");
3336
3337		goto reject;
3338	}
3339
3340	tim.mac_addr = req->dst_mac;
3341	tim.vlan_tag = ntohs(req->vlan_tag);
3342	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3343		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3344		goto reject;
3345	}
3346
3347#ifdef notyet
3348	/*
3349	 * XXX do route lookup to confirm that we're still listening on this
3350	 * address
3351	 */
3352	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3353			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3354		goto reject;
3355	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3356		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3357	dst_release(skb->dst);	// done with the input route, release it
3358	skb->dst = NULL;
3359
3360	if ((rt_flags & RTF_LOCAL) == 0)
3361		goto reject;
3362#endif
3363	/*
3364	 * XXX route lookup not implemented; assume the destination is local.
3365	 */
3366	rt_flags = RTF_LOCAL;
3367	if ((rt_flags & RTF_LOCAL) == 0)
3368		goto reject;
3369
3370	/*
3371	 * Calculate values and add to syncache
3372	 */
3373
3374	newtoep = toepcb_alloc();
3375	if (newtoep == NULL)
3376		goto reject;
3377
3378	bzero(&nam, sizeof(struct sockaddr_in));
3379
3380	nam.sin_len = sizeof(struct sockaddr_in);
3381	nam.sin_family = AF_INET;
3382	nam.sin_addr.s_addr = req->peer_ip;
3383	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3384
3385	if (dst == NULL) {
3386		printf("failed to find route\n");
3387		goto reject;
3388	}
3389	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3390	    (struct sockaddr *)&nam);
3391	if (e == NULL) {
3392		DPRINTF("failed to get l2t\n");
		goto reject;
3393	}
3394	/*
3395	 * Point to our listen socket until accept
3396	 */
3397	newtoep->tp_tp = tp;
3398	newtoep->tp_flags = TP_SYN_RCVD;
3399	newtoep->tp_tid = tid;
3400	newtoep->tp_toedev = tdev;
3401	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3402
3403	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3404	so_lock(so);
3405	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3406	so_unlock(so);
3407
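	/*
	 * Use DDP only if the tunable enables it, the socket hasn't opted
	 * out with SO_NO_DDP, and the receive window is large enough.
	 */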
3408	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3409		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3410
3411	if (newtoep->tp_ulp_mode) {
3412		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3413
3414		if (ddp_mbuf == NULL)
3415			newtoep->tp_ulp_mode = 0;
3416	}
3417
3418	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3419	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3420	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3421	/*
3422	 * XXX workaround for lack of syncache drop
3423	 */
3424	toepcb_hold(newtoep);
3425	syncache_add_accept_req(req, so, newtoep);
3426
3427	rpl = cplhdr(reply_mbuf);
3428	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3429	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3430	rpl->wr.wr_lo = 0;
3431	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3432	rpl->opt2 = htonl(calc_opt2(so, tdev));
3433	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3434	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3435
3436	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3437	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3438	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3439				  CPL_PASS_OPEN_ACCEPT);
3440
3441	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3442
3443	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3444
3445	l2t_send(cdev, reply_mbuf, e);
3446	m_free(m);
3447	if (newtoep->tp_ulp_mode) {
3448		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3449				V_TF_DDP_OFF(1) |
3450				TP_DDP_TIMER_WORKAROUND_MASK,
3451				V_TF_DDP_OFF(1) |
3452		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3453	} else
3454		printf("not offloading\n");
3455
3456
3457
3458	return;
3459reject:
3460	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3461		mk_pass_accept_rpl(reply_mbuf, m);
3462	else
3463		mk_tid_release(reply_mbuf, newtoep, tid);
3464	cxgb_ofld_send(cdev, reply_mbuf);
3465	m_free(m);
3466out:
3467#if 0
3468	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3469#else
3470	return;
3471#endif
3472}
3473
3474/*
3475 * Handle a CPL_PASS_ACCEPT_REQ message.
3476 */
3477static int
3478do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3479{
3480	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3481	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3482	struct tom_data *d = listen_ctx->tom_data;
3483
3484#if VALIDATE_TID
3485	struct cpl_pass_accept_req *req = cplhdr(m);
3486	unsigned int tid = GET_TID(req);
3487	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3488
3489	if (unlikely(!lsk)) {
3490		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3491		       cdev->name,
3492		       (unsigned long)((union listen_entry *)ctx -
3493					t->stid_tab));
3494		return CPL_RET_BUF_DONE;
3495	}
3496	if (unlikely(tid >= t->ntids)) {
3497		printk(KERN_ERR "%s: passive open TID %u too large\n",
3498		       cdev->name, tid);
3499		return CPL_RET_BUF_DONE;
3500	}
3501	/*
3502	 * For T3A the current user of the TID may have closed but its last
3503	 * message(s) may have been backlogged so the TID appears to be still
3504	 * in use.  Just take the TID away, the connection can close at its
3505	 * own leisure.  For T3B this situation is a bug.
3506	 */
3507	if (!valid_new_tid(t, tid) &&
3508	    cdev->type != T3A) {
3509		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3510		       cdev->name, tid);
3511		return CPL_RET_BUF_DONE;
3512	}
3513#endif
3514
3515	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3516	return (0);
3517}
3518
3519/*
3520 * Called when a connection is established to translate the TCP options
3521 * reported by HW to FreeBSD's native format.
3522 */
3523static void
3524assign_rxopt(struct socket *so, unsigned int opt)
3525{
3526	struct tcpcb *tp = so_sototcpcb(so);
3527	struct toepcb *toep = tp->t_toe;
3528	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3529
3530	inp_lock_assert(tp->t_inpcb);
3531
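	/*
	 * The MTU selected by HW, less 40 bytes of IP and TCP headers,
	 * gives the MSS to clamp to.
	 */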
3532	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3533	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3534	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3535	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3536	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3537	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3538		tp->rcv_scale = tp->request_r_scale;
3539}
3540
3541/*
3542 * Completes some final bits of initialization for just established connections
3543 * and changes their state to TCP_ESTABLISHED.
3544 *
3545 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3546 */
3547static void
3548make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3549{
3550	struct tcpcb *tp = so_sototcpcb(so);
3551	struct toepcb *toep = tp->t_toe;
3552
3553	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3554	assign_rxopt(so, opt);
3555
3556	/*
3557	 *XXXXXXXXXXX
3558	 *
3559	 */
3560#ifdef notyet
3561	so->so_proto->pr_ctloutput = t3_ctloutput;
3562#endif
3563
3564#if 0
3565	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3566#endif
3567	/*
3568	 * XXX not clear what rcv_wup maps to
3569	 */
3570	/*
3571	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3572	 * pass through opt0.
3573	 */
3574	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3575		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3576
3577	dump_toepcb(toep);
3578
3579#ifdef notyet
3580/*
3581 * no clean interface for marking ARP up to date
3582 */
3583	dst_confirm(sk->sk_dst_cache);
3584#endif
3585	tp->t_starttime = ticks;
3586	tp->t_state = TCPS_ESTABLISHED;
3587	soisconnected(so);
3588}
3589
3590static int
3591syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3592{
3593
3594	struct in_conninfo inc;
3595	struct tcpopt to;
3596	struct tcphdr th;
3597	int mss, wsf, sack, ts;
3598	struct mbuf *m = NULL;
3599	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3600	unsigned int opt;
3601
3602#ifdef MAC
3603#error	"no MAC support"
3604#endif
3605
3606	opt = ntohs(req->tcp_opt);
3607
3608	bzero(&to, sizeof(struct tcpopt));
3609
3610	/*
3611	 * Fill out information for entering us into the syncache
3612	 */
3613	inc.inc_fport = th.th_sport = req->peer_port;
3614	inc.inc_lport = th.th_dport = req->local_port;
3615	th.th_seq = req->rcv_isn;
3616	th.th_flags = TH_ACK;
3617
3618	inc.inc_isipv6 = 0;
3619	inc.inc_len = 0;
3620	inc.inc_faddr.s_addr = req->peer_ip;
3621	inc.inc_laddr.s_addr = req->local_ip;
3622
3623	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3624	wsf  = G_TCPOPT_WSCALE_OK(opt);
3625	ts   = G_TCPOPT_TSTAMP(opt);
3626	sack = G_TCPOPT_SACK(opt);
3627
3628	to.to_mss = mss;
3629	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3630	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3631
3632	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3633	    ntohl(req->local_ip), ntohs(req->local_port),
3634	    ntohl(req->peer_ip), ntohs(req->peer_port),
3635	    mss, wsf, ts, sack);
3636	return syncache_offload_expand(&inc, &to, &th, so, m);
3637}
3638
3639
3640/*
3641 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3642 * if we are in TCP_SYN_RECV due to crossed SYNs
3643 */
3644static int
3645do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3646{
3647	struct cpl_pass_establish *req = cplhdr(m);
3648	struct toepcb *toep = (struct toepcb *)ctx;
3649	struct tcpcb *tp = toep->tp_tp;
3650	struct socket *so, *lso;
3651	struct t3c_data *td = T3C_DATA(cdev);
3652	struct sockbuf *snd, *rcv;
3653
3654	// Complete socket initialization now that we have the SND_ISN
3655
3656	struct toedev *tdev;
3657
3658
3659	tdev = toep->tp_toedev;
3660
3661	inp_wlock(tp->t_inpcb);
3662
3663	/*
3664	 *
3665	 * XXX need to add reference while we're manipulating
3666	 */
3667	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3668
3669	inp_wunlock(tp->t_inpcb);
3670
3671	so_lock(so);
3672	LIST_REMOVE(toep, synq_entry);
3673	so_unlock(so);
3674
3675	if (!syncache_expand_establish_req(req, &so, toep)) {
3676		/*
3677		 * No entry
3678		 */
3679		CXGB_UNIMPLEMENTED();
3680	}
3681	if (so == NULL) {
3682		/*
3683		 * Couldn't create the socket
3684		 */
3685		CXGB_UNIMPLEMENTED();
3686	}
3687
3688	tp = so_sototcpcb(so);
3689	inp_wlock(tp->t_inpcb);
3690
3691	snd = so_sockbuf_snd(so);
3692	rcv = so_sockbuf_rcv(so);
3693
3694	snd->sb_flags |= SB_NOCOALESCE;
3695	rcv->sb_flags |= SB_NOCOALESCE;
3696
3697	toep->tp_tp = tp;
3698	toep->tp_flags = 0;
3699	tp->t_toe = toep;
3700	reset_wr_list(toep);
3701	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3702	tp->rcv_nxt = toep->tp_copied_seq;
3703	install_offload_ops(so);
3704
3705	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3706	toep->tp_wr_unacked = 0;
3707	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3708	toep->tp_qset_idx = 0;
3709	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3710
3711	/*
3712	 * XXX Cancel any keep alive timer
3713	 */
3714
3715	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3716
3717	/*
3718	 * XXX workaround for lack of syncache drop
3719	 */
3720	toepcb_release(toep);
3721	inp_wunlock(tp->t_inpcb);
3722
3723	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3724	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3725#ifdef notyet
3726	/*
3727	 * XXX not sure how these checks map to us
3728	 */
3729	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3730		sk->sk_state_change(sk);
3731		sk_wake_async(so, 0, POLL_OUT);
3732	}
3733	/*
3734	 * The state for the new connection is now up to date.
3735	 * Next check if we should add the connection to the parent's
3736	 * accept queue.  When the parent closes it resets connections
3737	 * on its SYN queue, so check if we are being reset.  If so we
3738	 * don't need to do anything more, the coming ABORT_RPL will
3739	 * destroy this socket.  Otherwise move the connection to the
3740	 * accept queue.
3741	 *
3742	 * Note that we reset the synq before closing the server so if
3743	 * we are not being reset the stid is still open.
3744	 */
3745	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3746		__kfree_skb(skb);
3747		goto unlock;
3748	}
3749#endif
3750	m_free(m);
3751
3752	return (0);
3753}
3754
3755/*
3756 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3757 * and send them to the TOE.
3758 */
3759static void
3760fixup_and_send_ofo(struct toepcb *toep)
3761{
3762	struct mbuf *m;
3763	struct toedev *tdev = toep->tp_toedev;
3764	struct tcpcb *tp = toep->tp_tp;
3765	unsigned int tid = toep->tp_tid;
3766
3767	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3768
3769	inp_lock_assert(tp->t_inpcb);
3770	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3771		/*
3772		 * A variety of messages can be waiting but the fields we'll
3773		 * be touching are common to all so any message type will do.
3774		 */
3775		struct cpl_close_con_req *p = cplhdr(m);
3776
3777		p->wr.wr_lo = htonl(V_WR_TID(tid));
3778		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3779		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3780	}
3781}
3782
3783/*
3784 * Updates socket state from an active establish CPL message.  Runs with the
3785 * socket lock held.
3786 */
3787static void
3788socket_act_establish(struct socket *so, struct mbuf *m)
3789{
3790	struct cpl_act_establish *req = cplhdr(m);
3791	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3792	struct tcpcb *tp = so_sototcpcb(so);
3793	struct toepcb *toep = tp->t_toe;
3794
3795	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3796		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3797		    toep->tp_tid, tp->t_state);
3798
3799	tp->ts_recent_age = ticks;
3800	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3801	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3802
3803	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3804
3805	/*
3806	 * Now that we finally have a TID send any CPL messages that we had to
3807	 * defer for lack of a TID.
3808	 */
3809	if (mbufq_len(&toep->out_of_order_queue))
3810		fixup_and_send_ofo(toep);
3811
3812	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3813		/*
3814		 * XXX does this even make sense?
3815		 */
3816		so_sorwakeup(so);
3817	}
3818	m_free(m);
3819#ifdef notyet
3820/*
3821 * XXX assume no write requests permitted while socket connection is
3822 * incomplete
3823 */
3824	/*
3825	 * Currently the send queue must be empty at this point because the
3826	 * socket layer does not send anything before a connection is
3827	 * established.  To be future proof though we handle the possibility
3828	 * that there are pending buffers to send (either TX_DATA or
3829	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3830	 * buffers according to the just learned write_seq, and then we send
3831	 * them on their way.
3832	 */
3833	fixup_pending_writeq_buffers(sk);
3834	if (t3_push_frames(so, 1))
3835		sk->sk_write_space(sk);
3836#endif
3837
3838	toep->tp_state = tp->t_state;
3839	tcpstat.tcps_connects++;
3840
3841}
3842
3843/*
3844 * Process a CPL_ACT_ESTABLISH message.
3845 */
3846static int
3847do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3848{
3849	struct cpl_act_establish *req = cplhdr(m);
3850	unsigned int tid = GET_TID(req);
3851	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3852	struct toepcb *toep = (struct toepcb *)ctx;
3853	struct tcpcb *tp = toep->tp_tp;
3854	struct socket *so;
3855	struct toedev *tdev;
3856	struct tom_data *d;
3857
3858	if (tp == NULL) {
3859		free_atid(cdev, atid);
3860		return (0);
3861	}
3862	inp_wlock(tp->t_inpcb);
3863
3864	/*
3865	 * XXX
3866	 */
3867	so = inp_inpcbtosocket(tp->t_inpcb);
3868	tdev = toep->tp_toedev; /* blow up here if link was down */
3869	d = TOM_DATA(tdev);
3870
3871	/*
3872	 * It's OK if the TID is currently in use; the owning socket may have
3873	 * backlogged its last CPL message(s).  Just take it away.
3874	 */
3875	toep->tp_tid = tid;
3876	toep->tp_tp = tp;
3877	so_insert_tid(d, toep, tid);
3878	free_atid(cdev, atid);
3879	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3880
3881	socket_act_establish(so, m);
3882	inp_wunlock(tp->t_inpcb);
3883	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3884	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3885
3886	return (0);
3887}
3888
3889/*
3890 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3891 * next batch of work requests from the write queue.
3892 */
3893static void
3894wr_ack(struct toepcb *toep, struct mbuf *m)
3895{
3896	struct tcpcb *tp = toep->tp_tp;
3897	struct cpl_wr_ack *hdr = cplhdr(m);
3898	struct socket *so;
3899	unsigned int credits = ntohs(hdr->credits);
3900	u32 snd_una = ntohl(hdr->snd_una);
3901	int bytes = 0;
3902	struct sockbuf *snd;
3903
3904	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3905
3906	inp_wlock(tp->t_inpcb);
3907	so = inp_inpcbtosocket(tp->t_inpcb);
3908	toep->tp_wr_avail += credits;
3909	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3910		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3911
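	/*
	 * Consume the returned credits against the queue of pending WRs.
	 * Each queued mbuf records its credit cost in m_pkthdr.csum_data;
	 * a partially acknowledged WR stays at the head of the queue with
	 * its remaining credit count decremented.
	 */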
3912	while (credits) {
3913		struct mbuf *p = peek_wr(toep);
3914
3915		if (__predict_false(!p)) {
3916			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3917			    "nothing pending, state %u wr_avail=%u\n",
3918			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3919			break;
3920		}
3921		CTR2(KTR_TOM,
3922			"wr_ack: p->credits=%d p->bytes=%d",
3923		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3924		KASSERT(p->m_pkthdr.csum_data != 0,
3925		    ("empty request still on list"));
3926
3927		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3928
3929#if DEBUG_WR > 1
3930			struct tx_data_wr *w = cplhdr(p);
3931			log(LOG_ERR,
3932			       "TID %u got %u WR credits, need %u, len %u, "
3933			       "main body %u, frags %u, seq # %u, ACK una %u,"
3934			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3935			       toep->tp_tid, credits, p->csum, p->len,
3936			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3937			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3938			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3939#endif
3940			p->m_pkthdr.csum_data -= credits;
3941			break;
3942		} else {
3943			dequeue_wr(toep);
3944			credits -= p->m_pkthdr.csum_data;
3945			bytes += p->m_pkthdr.len;
3946			CTR3(KTR_TOM,
3947			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3948			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3949
3950			m_free(p);
3951		}
3952	}
3953
3954#if DEBUG_WR
3955	check_wr_invariants(tp);
3956#endif
3957
3958	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3959#if VALIDATE_SEQ
3960		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3961
3962		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3963		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3964		    toep->tp_tid, tp->snd_una);
3965#endif
3966		goto out_free;
3967	}
3968
3969	if (tp->snd_una != snd_una) {
3970		tp->snd_una = snd_una;
3971		tp->ts_recent_age = ticks;
3972#ifdef notyet
3973		/*
3974		 * Keep ARP entry "minty fresh"
3975		 */
3976		dst_confirm(sk->sk_dst_cache);
3977#endif
3978		if (tp->snd_una == tp->snd_nxt)
3979			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3980	}
3981
3982	snd = so_sockbuf_snd(so);
3983	if (bytes) {
3984		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3986		sockbuf_lock(snd);
3987		sbdrop_locked(snd, bytes);
3988		so_sowwakeup_locked(so);
3989	}
3990
3991	if (snd->sb_sndptroff < snd->sb_cc)
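	/* Unsent data remains in the send buffer; push it now that WR credits are back. */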
3992		t3_push_frames(so, 0);
3993
3994out_free:
3995	inp_wunlock(tp->t_inpcb);
3996	m_free(m);
3997}
3998
3999/*
4000 * Handler for TX_DATA_ACK CPL messages.
4001 */
4002static int
4003do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
4004{
4005	struct toepcb *toep = (struct toepcb *)ctx;
4006
4007	VALIDATE_SOCK(so);
4008
4009	wr_ack(toep, m);
4010	return (0);
4011}
4012
4013/*
4014 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4015 */
4016static int
4017do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4018{
4019	m_freem(m);
4020	return (0);
4021}
4022
4023/*
4024 * Reset a connection that is on a listener's SYN queue or accept queue,
4025 * i.e., one that has not had a struct socket associated with it.
4026 * Must be called from process context.
4027 *
4028 * Modeled after code in inet_csk_listen_stop().
4029 */
4030static void
4031t3_reset_listen_child(struct socket *child)
4032{
4033	struct tcpcb *tp = so_sototcpcb(child);
4034
4035	t3_send_reset(tp->t_toe);
4036}
4037
4038
4039static void
4040t3_child_disconnect(struct socket *so, void *arg)
4041{
4042	struct tcpcb *tp = so_sototcpcb(so);
4043
4044	if (tp->t_flags & TF_TOE) {
4045		inp_wlock(tp->t_inpcb);
4046		t3_reset_listen_child(so);
4047		inp_wunlock(tp->t_inpcb);
4048	}
4049}
4050
4051/*
4052 * Disconnect offloaded established but not yet accepted connections sitting
4053 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4054 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4055 */
4056void
4057t3_disconnect_acceptq(struct socket *listen_so)
4058{
4059
4060	so_lock(listen_so);
4061	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4062	so_unlock(listen_so);
4063}
4064
4065/*
4066 * Reset offloaded connections sitting on a server's syn queue.  As above
4067 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4068 */
4069
4070void
4071t3_reset_synq(struct listen_ctx *lctx)
4072{
4073	struct toepcb *toep;
4074
4075	so_lock(lctx->lso);
4076	while (!LIST_EMPTY(&lctx->synq_head)) {
4077		toep = LIST_FIRST(&lctx->synq_head);
4078		LIST_REMOVE(toep, synq_entry);
4079		toep->tp_tp = NULL;
4080		t3_send_reset(toep);
4081		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4082		toepcb_release(toep);
4083	}
4084	so_unlock(lctx->lso);
4085}
4086
4087
4088int
4089t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4090		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4091		   unsigned int pg_off, unsigned int color)
4092{
4093	unsigned int i, j, pidx;
4094	struct pagepod *p;
4095	struct mbuf *m;
4096	struct ulp_mem_io *req;
4097	unsigned int tid = toep->tp_tid;
4098	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4099	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4100
4101	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4102	    gl, nppods, tag, maxoff, pg_off, color);
4103
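	/*
	 * Write one page pod per ULP_MEM_WRITE work request.  Each pod
	 * advances four pages through the gather list but carries five page
	 * addresses, so consecutive pods overlap by one page; trailing
	 * sentinel pods are written with their valid bit clear.
	 */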
4104	for (i = 0; i < nppods; ++i) {
4105		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4106		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4107		req = mtod(m, struct ulp_mem_io *);
4108		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4109		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4110		req->wr.wr_lo = 0;
4111		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4112					   V_ULPTX_CMD(ULP_MEM_WRITE));
4113		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4114				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4115
4116		p = (struct pagepod *)(req + 1);
4117		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4118			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4119			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4120						  V_PPOD_COLOR(color));
4121			p->pp_max_offset = htonl(maxoff);
4122			p->pp_page_offset = htonl(pg_off);
4123			p->pp_rsvd = 0;
4124			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4125				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4126				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4127		} else
4128			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4129		send_or_defer(toep, m, 0);
4130		ppod_addr += PPOD_SIZE;
4131	}
4132	return (0);
4133}
4134
4135/*
4136 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4137 */
4138static inline void
4139mk_cpl_barrier_ulp(struct cpl_barrier *b)
4140{
4141	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4142
4143	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4144	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4145	b->opcode = CPL_BARRIER;
4146}
4147
4148/*
4149 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4150 */
4151static inline void
4152mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4153{
4154	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4155
4157	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4158	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4159	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4160	req->cpuno = htons(cpuno);
4161}
4162
4163/*
4164 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4165 */
4166static inline void
4167mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4168                     unsigned int word, uint64_t mask, uint64_t val)
4169{
4170	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4171
4172	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4173	    tid, word, mask, val);
4174
4175	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4176	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4177	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4178	req->reply = V_NO_REPLY(1);
4179	req->cpu_idx = 0;
4180	req->word = htons(word);
4181	req->mask = htobe64(mask);
4182	req->val = htobe64(val);
4183}
4184
4185/*
4186 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4187 */
4188static void
4189mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4190    unsigned int tid, unsigned int credits)
4191{
4192	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4193
4194	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4195	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4196	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4197	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4198	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4199				 V_RX_CREDITS(credits));
4200}
4201
4202void
4203t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4204{
4205	unsigned int wrlen;
4206	struct mbuf *m;
4207	struct work_request_hdr *wr;
4208	struct cpl_barrier *lock;
4209	struct cpl_set_tcb_field *req;
4210	struct cpl_get_tcb *getreq;
4211	struct ddp_state *p = &toep->tp_ddp_state;
4212
4213#if 0
4214	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4215#endif
4216	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4217		sizeof(*getreq);
4218	m = m_gethdr_nofail(wrlen);
4219	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4220	wr = mtod(m, struct work_request_hdr *);
4221	bzero(wr, wrlen);
4222
4223	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4224	m->m_pkthdr.len = m->m_len = wrlen;
4225
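	/*
	 * Compound BYPASS work request: barrier, SET_TCB_FIELD flipping the
	 * DDP flags for the cancelled buffer, GET_TCB to read back how much
	 * data already landed in it, and a closing barrier.
	 */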
4226	lock = (struct cpl_barrier *)(wr + 1);
4227	mk_cpl_barrier_ulp(lock);
4228
4229	req = (struct cpl_set_tcb_field *)(lock + 1);
4230
4231	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4232
4233	/* Hmmm, not sure if this is actually a good thing: reactivating
4234	 * the other buffer might be an issue if it has been completed
4235	 * already. However, that is unlikely, since the fact that the UBUF
4236	 * is not completed indicates that there is no outstanding data.
4237	 */
4238	if (bufidx == 0)
4239		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4240				     V_TF_DDP_ACTIVE_BUF(1) |
4241				     V_TF_DDP_BUF0_VALID(1),
4242				     V_TF_DDP_ACTIVE_BUF(1));
4243	else
4244		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4245				     V_TF_DDP_ACTIVE_BUF(1) |
4246				     V_TF_DDP_BUF1_VALID(1), 0);
4247
4248	getreq = (struct cpl_get_tcb *)(req + 1);
4249	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4250
4251	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4252
4253	/* Keep track of the number of outstanding CPL_GET_TCB requests
4254	 */
4255	p->get_tcb_count++;
4256
4257#ifdef T3_TRACE
4258	T3_TRACE1(TIDTB(so),
4259		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4260#endif
4261	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4262}
4263
4264/**
4265 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4266 * @toep: the toepcb associated with the buffers
4267 * @bufidx: index of HW DDP buffer (0 or 1)
4268 * @tag0: new tag for HW buffer 0
4269 * @tag1: new tag for HW buffer 1
4270 * @len: new length for HW buf @bufidx
4271 *
4272 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4273 * buffer by changing the buffer tag and length and setting the valid and
4274 * active flag accordingly.  The caller must ensure the new buffer is at
4275 * least as big as the existing one.  Since we typically reprogram both HW
4276 * buffers, this function sets both tags for convenience. Read the TCB to
4277 * determine how much data was written into the buffer before the overlay
4278 * took place.
4279 */
4280void
4281t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4282	 	       unsigned int tag1, unsigned int len)
4283{
4284	unsigned int wrlen;
4285	struct mbuf *m;
4286	struct work_request_hdr *wr;
4287	struct cpl_get_tcb *getreq;
4288	struct cpl_set_tcb_field *req;
4289	struct ddp_state *p = &toep->tp_ddp_state;
4290
4291	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4292	    bufidx, tag0, tag1, len);
4293#if 0
4294	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4295#endif
4296	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4297	m = m_gethdr_nofail(wrlen);
4298	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4299	wr = mtod(m, struct work_request_hdr *);
4300	m->m_pkthdr.len = m->m_len = wrlen;
4301	bzero(wr, wrlen);
4302
4303
4304	/* Set the ATOMIC flag to make sure that TP processes the following
4305	 * CPLs in an atomic manner and no wire segments can be interleaved.
4306	 */
4307	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4308	req = (struct cpl_set_tcb_field *)(wr + 1);
4309	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4310			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4311			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4312			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4313			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4314	req++;
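	/*
	 * Program the new length for the overlaid buffer, mark it valid and
	 * active, and clear its push-disable bit.
	 */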
4315	if (bufidx == 0) {
4316		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4317			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4318			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4319		req++;
4320		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4321			    V_TF_DDP_PUSH_DISABLE_0(1) |
4322			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4323			    V_TF_DDP_PUSH_DISABLE_0(0) |
4324			    V_TF_DDP_BUF0_VALID(1));
4325	} else {
4326		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4327			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4328			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4329		req++;
4330		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4331			    V_TF_DDP_PUSH_DISABLE_1(1) |
4332			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4333			    V_TF_DDP_PUSH_DISABLE_1(0) |
4334			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4335	}
4336
4337	getreq = (struct cpl_get_tcb *)(req + 1);
4338	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4339
4340	/* Keep track of the number of outstanding CPL_GET_TCB requests
4341	 */
4342	p->get_tcb_count++;
4343
4344#ifdef T3_TRACE
4345	T3_TRACE4(TIDTB(sk),
4346		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4347		  "len %d",
4348		  bufidx, tag0, tag1, len);
4349#endif
4350	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4351}
4352
4353/*
4354 * Sends a compound WR containing all the CPL messages needed to program the
4355 * two HW DDP buffers, namely optionally setting up the length and offset of
4356 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4357 */
4358void
4359t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4360		      unsigned int len1, unsigned int offset1,
4361                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4362{
4363	unsigned int wrlen;
4364	struct mbuf *m;
4365	struct work_request_hdr *wr;
4366	struct cpl_set_tcb_field *req;
4367
4368	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4369	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4370
4371#if 0
4372	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4373#endif
4374	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4375		(len1 ? sizeof(*req) : 0) +
4376		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4377	m = m_gethdr_nofail(wrlen);
4378	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4379	wr = mtod(m, struct work_request_hdr *);
4380	bzero(wr, wrlen);
4381
4382	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4383	m->m_pkthdr.len = m->m_len = wrlen;
4384
4385	req = (struct cpl_set_tcb_field *)(wr + 1);
4386	if (len0) {                  /* program buffer 0 offset and length */
4387		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4388			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4389			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4390			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4391			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4392		req++;
4393	}
4394	if (len1) {                  /* program buffer 1 offset and length */
4395		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4396			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4397			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4398			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4399			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4400		req++;
4401	}
4402
4403	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4404			     ddp_flags);
4405
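	/*
	 * Optionally piggy-back an RX_DATA_ACK that returns the receive
	 * window credits accumulated since the last update.
	 */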
4406	if (modulate) {
4407		mk_rx_data_ack_ulp(toep,
4408		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4409		    toep->tp_copied_seq - toep->tp_rcv_wup);
4410		toep->tp_rcv_wup = toep->tp_copied_seq;
4411	}
4412
4413#ifdef T3_TRACE
4414	T3_TRACE5(TIDTB(sk),
4415		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4416		  "modulate %d",
4417		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4418		  modulate);
4419#endif
4420
4421	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4422}
4423
4424void
4425t3_init_wr_tab(unsigned int wr_len)
4426{
4427	int i;
4428
4429	if (mbuf_wrs[1])     /* already initialized */
4430		return;
4431
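	/*
	 * Precompute mbuf_wrs[i], the number of work requests needed for a
	 * packet with i scatter/gather entries: each SGL entry takes 1.5
	 * flits, plus a 3-flit header per packet, split across work
	 * requests of wr_len flits each.
	 */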
4432	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4433		int sgl_len = (3 * i) / 2 + (i & 1);
4434
4435		sgl_len += 3;
4436		mbuf_wrs[i] = sgl_len <= wr_len ?
4437		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4438	}
4439
4440	wrlen = wr_len * 8;
4441}
4442
4443int
4444t3_init_cpl_io(void)
4445{
4446#ifdef notyet
4447	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4448	if (!tcphdr_skb) {
4449		log(LOG_ERR,
4450		       "Chelsio TCP offload: can't allocate sk_buff\n");
4451		return -1;
4452	}
4453	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4454	tcphdr_skb->h.raw = tcphdr_skb->data;
4455	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4456#endif
4457
4458	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4459	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4460	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4461	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4462	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4463	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4464	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4465	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4466	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4467	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4468	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4469	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4470	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4471	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4472	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4473	return (0);
4474}
4475
4476