cxgb_cpl_io.c revision 196019
1/**************************************************************************
2
3Copyright (c) 2007-2008, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 196019 2009-08-01 19:26:27Z rwatson $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/sockbuf.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/protosw.h>
49#include <sys/priv.h>
50
51#if __FreeBSD_version < 800044
52#define V_tcp_do_autosndbuf tcp_do_autosndbuf
53#define V_tcp_autosndbuf_max tcp_autosndbuf_max
54#define V_tcp_do_rfc1323 tcp_do_rfc1323
55#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
56#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
57#define V_tcpstat tcpstat
58#endif
59
60#include <net/if.h>
61#include <net/route.h>
62
63#include <netinet/in.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_systm.h>
66#include <netinet/in_var.h>
67
68
69#include <cxgb_osdep.h>
70#include <sys/mbufq.h>
71
72#include <netinet/ip.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcp_fsm.h>
75#include <netinet/tcp_offload.h>
76#include <netinet/tcp_seq.h>
77#include <netinet/tcp_syncache.h>
78#include <netinet/tcp_timer.h>
79#include <net/route.h>
80
81#include <t3cdev.h>
82#include <common/cxgb_firmware_exports.h>
83#include <common/cxgb_t3_cpl.h>
84#include <common/cxgb_tcb.h>
85#include <common/cxgb_ctl_defs.h>
86#include <cxgb_offload.h>
87#include <vm/vm.h>
88#include <vm/pmap.h>
89#include <machine/bus.h>
90#include <sys/mvec.h>
91#include <ulp/toecore/cxgb_toedev.h>
92#include <ulp/tom/cxgb_l2t.h>
93#include <ulp/tom/cxgb_defs.h>
94#include <ulp/tom/cxgb_tom.h>
95#include <ulp/tom/cxgb_t3_ddp.h>
96#include <ulp/tom/cxgb_toepcb.h>
97#include <ulp/tom/cxgb_tcp.h>
98#include <ulp/tom/cxgb_tcp_offload.h>
99
100/*
101 * For ULP connections HW may add headers, e.g., for digests, that aren't part
102 * of the messages sent by the host but that are part of the TCP payload and
103 * therefore consume TCP sequence space.  Tx connection parameters that
104 * operate in TCP sequence space are affected by the HW additions and need to
105 * compensate for them to accurately track TCP sequence numbers. This array
106 * contains the compensating extra lengths for ULP packets.  It is indexed by
107 * a packet's ULP submode.
108 */
109const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
110
111#ifdef notyet
112/*
113 * This sk_buff holds a fake header-only TCP segment that we use whenever we
114 * need to exploit SW TCP functionality that expects TCP headers, such as
115 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
116 * CPUs without locking.
117 */
118static struct mbuf *tcphdr_mbuf __read_mostly;
119#endif
120
121/*
122 * Size of WRs in bytes.  Note that we assume all devices we are handling have
123 * the same WR size.
124 */
125static unsigned int wrlen __read_mostly;
126
127/*
128 * The number of WRs needed for an skb depends on the number of page fragments
129 * in the skb and whether it has any payload in its main body.  This maps the
130 * length of the gather list represented by an skb into the # of necessary WRs.
131 */
132static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
133
134/*
135 * Max receive window supported by HW in bytes.  Only a small part of it can
136 * be set through option0, the rest needs to be set through RX_DATA_ACK.
137 */
138#define MAX_RCV_WND ((1U << 27) - 1)
139
140/*
141 * Min receive window.  We want it to be large enough to accommodate receive
142 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
143 */
144#define MIN_RCV_WND (24 * 1024U)
145#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
146
147#define VALIDATE_SEQ 0
148#define VALIDATE_SOCK(so)
149#define DEBUG_WR 0
150
151#define TCP_TIMEWAIT	1
152#define TCP_CLOSE	2
153#define TCP_DROP	3
154
155static void t3_send_reset(struct toepcb *toep);
156static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
157static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
158static void handle_syncache_event(int event, void *arg);
159
160static inline void
161SBAPPEND(struct sockbuf *sb, struct mbuf *n)
162{
163	struct mbuf *m;
164
165	m = sb->sb_mb;
166	while (m) {
167		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
168		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
169			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
170		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
171			m->m_next, m->m_nextpkt, m->m_flags));
172		m = m->m_next;
173	}
174	m = n;
175	while (m) {
176		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
177		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
178			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
179		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
180			m->m_next, m->m_nextpkt, m->m_flags));
181		m = m->m_next;
182	}
183	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
184	sbappendstream_locked(sb, n);
185	m = sb->sb_mb;
186
187	while (m) {
188		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
189			m->m_next, m->m_nextpkt, m->m_flags));
190		m = m->m_next;
191	}
192}
193
194static inline int
195is_t3a(const struct toedev *dev)
196{
197	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
198}
199
200static void
201dump_toepcb(struct toepcb *toep)
202{
203	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
204	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
205	    toep->tp_mtu_idx, toep->tp_tid);
206
207	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
208	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
209	    toep->tp_mss_clamp, toep->tp_flags);
210}
211
212#ifndef RTALLOC2_DEFINED
213static struct rtentry *
214rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
215{
216	struct rtentry *rt = NULL;
217
218	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
219		RT_UNLOCK(rt);
220
221	return (rt);
222}
223#endif
224
225/*
226 * Determine whether to send a CPL message now or defer it.  A message is
227 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
228 * For connections in other states the message is sent immediately.
229 * If through_l2t is set the message is subject to ARP processing, otherwise
230 * it is sent directly.
231 */
232static inline void
233send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
234{
235	struct tcpcb *tp = toep->tp_tp;
236
237	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
238		inp_wlock(tp->t_inpcb);
239		mbufq_tail(&toep->out_of_order_queue, m);  // defer
240		inp_wunlock(tp->t_inpcb);
241	} else if (through_l2t)
242		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
243	else
244		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
245}
246
247static inline unsigned int
248mkprio(unsigned int cntrl, const struct toepcb *toep)
249{
250        return (cntrl);
251}
252
253/*
254 * Populate a TID_RELEASE WR.  The skb must be already propely sized.
255 */
256static inline void
257mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
258{
259	struct cpl_tid_release *req;
260
261	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
262	m->m_pkthdr.len = m->m_len = sizeof(*req);
263	req = mtod(m, struct cpl_tid_release *);
264	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
265	req->wr.wr_lo = 0;
266	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
267}
268
269static inline void
270make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
271{
272	struct tcpcb *tp = so_sototcpcb(so);
273	struct toepcb *toep = tp->t_toe;
274	struct tx_data_wr *req;
275	struct sockbuf *snd;
276
277	inp_lock_assert(tp->t_inpcb);
278	snd = so_sockbuf_snd(so);
279
280	req = mtod(m, struct tx_data_wr *);
281	m->m_len = sizeof(*req);
282	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
283	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
284	/* len includes the length of any HW ULP additions */
285	req->len = htonl(len);
286	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
287	/* V_TX_ULP_SUBMODE sets both the mode and submode */
288	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
289	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
290	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
291				   (tail ? 0 : 1))));
292	req->sndseq = htonl(tp->snd_nxt);
293	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
294		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
295				    V_TX_CPU_IDX(toep->tp_qset));
296
297		/* Sendbuffer is in units of 32KB.
298		 */
299		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
300			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
301		else {
302			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
303		}
304
305		toep->tp_flags |= TP_DATASENT;
306	}
307}
308
309#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
310
311int
312t3_push_frames(struct socket *so, int req_completion)
313{
314	struct tcpcb *tp = so_sototcpcb(so);
315	struct toepcb *toep = tp->t_toe;
316
317	struct mbuf *tail, *m0, *last;
318	struct t3cdev *cdev;
319	struct tom_data *d;
320	int state, bytes, count, total_bytes;
321	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
322	struct sockbuf *snd;
323
324	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
325		DPRINTF("tcp state=%d\n", tp->t_state);
326		return (0);
327	}
328
329	state = so_state_get(so);
330
331	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
332		DPRINTF("disconnecting\n");
333
334		return (0);
335	}
336
337	inp_lock_assert(tp->t_inpcb);
338
339	snd = so_sockbuf_snd(so);
340	sockbuf_lock(snd);
341
342	d = TOM_DATA(toep->tp_toedev);
343	cdev = d->cdev;
344
345	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
346
347	total_bytes = 0;
348	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
349	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
350
351	if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
352		KASSERT(tail, ("sbdrop error"));
353		last = tail = tail->m_next;
354	}
355
356	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
357		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
358		sockbuf_unlock(snd);
359
360		return (0);
361	}
362
363	toep->tp_m_last = NULL;
364	while (toep->tp_wr_avail && (tail != NULL)) {
365		count = bytes = 0;
366		segp = segs;
367		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
368			sockbuf_unlock(snd);
369			return (0);
370		}
371		/*
372		 * If the data in tail fits as in-line, then
373		 * make an immediate data wr.
374		 */
375		if (tail->m_len <= IMM_LEN) {
376			count = 1;
377			bytes = tail->m_len;
378			last = tail;
379			tail = tail->m_next;
380			m_set_sgl(m0, NULL);
381			m_set_sgllen(m0, 0);
382			make_tx_data_wr(so, m0, bytes, tail);
383			m_append(m0, bytes, mtod(last, caddr_t));
384			KASSERT(!m0->m_next, ("bad append"));
385		} else {
386			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
387			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
388				bytes += tail->m_len;
389				last = tail;
390				count++;
391				/*
392				 * technically an abuse to be using this for a VA
393				 * but less gross than defining my own structure
394				 * or calling pmap_kextract from here :-|
395				 */
396				segp->ds_addr = (bus_addr_t)tail->m_data;
397				segp->ds_len = tail->m_len;
398				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
399				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
400				segp++;
401				tail = tail->m_next;
402			}
403			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
404			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
405
406			m_set_sgl(m0, segs);
407			m_set_sgllen(m0, count);
408			make_tx_data_wr(so, m0, bytes, tail);
409		}
410		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
411
412		if (tail) {
413			snd->sb_sndptr = tail;
414			toep->tp_m_last = NULL;
415		} else
416			toep->tp_m_last = snd->sb_sndptr = last;
417
418
419		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
420
421		snd->sb_sndptroff += bytes;
422		total_bytes += bytes;
423		toep->tp_write_seq += bytes;
424		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
425		    " tail=%p sndptr=%p sndptroff=%d",
426		    toep->tp_wr_avail, count, mbuf_wrs[count],
427		    tail, snd->sb_sndptr, snd->sb_sndptroff);
428		if (tail)
429			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
430			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
431			    total_bytes, toep->tp_m_last, tail->m_data,
432			    tp->snd_una);
433		else
434			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
435			    " tp_m_last=%p snd_una=0x%08x",
436			    total_bytes, toep->tp_m_last, tp->snd_una);
437
438
439#ifdef KTR
440{
441		int i;
442
443		i = 0;
444		while (i < count && m_get_sgllen(m0)) {
445			if ((count - i) >= 3) {
446				CTR6(KTR_TOM,
447				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
448				    " len=%d pa=0x%zx len=%d",
449				    segs[i].ds_addr, segs[i].ds_len,
450				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
451				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
452				    i += 3;
453			} else if ((count - i) == 2) {
454				CTR4(KTR_TOM,
455				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
456				    " len=%d",
457				    segs[i].ds_addr, segs[i].ds_len,
458				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
459				    i += 2;
460			} else {
461				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
462				    segs[i].ds_addr, segs[i].ds_len);
463				i++;
464			}
465
466		}
467}
468#endif
469                 /*
470		 * remember credits used
471		 */
472		m0->m_pkthdr.csum_data = mbuf_wrs[count];
473		m0->m_pkthdr.len = bytes;
474		toep->tp_wr_avail -= mbuf_wrs[count];
475		toep->tp_wr_unacked += mbuf_wrs[count];
476
477		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
478		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
479			struct work_request_hdr *wr = cplhdr(m0);
480
481			wr->wr_hi |= htonl(F_WR_COMPL);
482			toep->tp_wr_unacked = 0;
483		}
484		KASSERT((m0->m_pkthdr.csum_data > 0) &&
485		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
486			m0->m_pkthdr.csum_data));
487		m0->m_type = MT_DONTFREE;
488		enqueue_wr(toep, m0);
489		DPRINTF("sending offload tx with %d bytes in %d segments\n",
490		    bytes, count);
491		l2t_send(cdev, m0, toep->tp_l2t);
492	}
493	sockbuf_unlock(snd);
494	return (total_bytes);
495}
496
497/*
498 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
499 * under any circumstances.  We take the easy way out and always queue the
500 * message to the write_queue.  We can optimize the case where the queue is
501 * already empty though the optimization is probably not worth it.
502 */
503static void
504close_conn(struct socket *so)
505{
506	struct mbuf *m;
507	struct cpl_close_con_req *req;
508	struct tom_data *d;
509	struct inpcb *inp = so_sotoinpcb(so);
510	struct tcpcb *tp;
511	struct toepcb *toep;
512	unsigned int tid;
513
514
515	inp_wlock(inp);
516	tp = so_sototcpcb(so);
517	toep = tp->t_toe;
518
519	if (tp->t_state != TCPS_SYN_SENT)
520		t3_push_frames(so, 1);
521
522	if (toep->tp_flags & TP_FIN_SENT) {
523		inp_wunlock(inp);
524		return;
525	}
526
527	tid = toep->tp_tid;
528
529	d = TOM_DATA(toep->tp_toedev);
530
531	m = m_gethdr_nofail(sizeof(*req));
532	m_set_priority(m, CPL_PRIORITY_DATA);
533	m_set_sgl(m, NULL);
534	m_set_sgllen(m, 0);
535
536	toep->tp_flags |= TP_FIN_SENT;
537	req = mtod(m, struct cpl_close_con_req *);
538
539	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
540	req->wr.wr_lo = htonl(V_WR_TID(tid));
541	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
542	req->rsvd = 0;
543	inp_wunlock(inp);
544	/*
545	 * XXX - need to defer shutdown while there is still data in the queue
546	 *
547	 */
548	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
549	cxgb_ofld_send(d->cdev, m);
550
551}
552
553/*
554 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
555 * and send it along.
556 */
557static void
558abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
559{
560	struct cpl_abort_req *req = cplhdr(m);
561
562	req->cmd = CPL_ABORT_NO_RST;
563	cxgb_ofld_send(cdev, m);
564}
565
566/*
567 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
568 * permitted to return without sending the message in case we cannot allocate
569 * an sk_buff.  Returns the number of credits sent.
570 */
571uint32_t
572t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
573{
574	struct mbuf *m;
575	struct cpl_rx_data_ack *req;
576	struct toepcb *toep = tp->t_toe;
577	struct toedev *tdev = toep->tp_toedev;
578
579	m = m_gethdr_nofail(sizeof(*req));
580
581	DPRINTF("returning %u credits to HW\n", credits);
582
583	req = mtod(m, struct cpl_rx_data_ack *);
584	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
585	req->wr.wr_lo = 0;
586	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
587	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
588	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
589	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
590	return (credits);
591}
592
593/*
594 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
595 * This is only used in DDP mode, so we take the opportunity to also set the
596 * DACK mode and flush any Rx credits.
597 */
598void
599t3_send_rx_modulate(struct toepcb *toep)
600{
601	struct mbuf *m;
602	struct cpl_rx_data_ack *req;
603
604	m = m_gethdr_nofail(sizeof(*req));
605
606	req = mtod(m, struct cpl_rx_data_ack *);
607	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
608	req->wr.wr_lo = 0;
609	m->m_pkthdr.len = m->m_len = sizeof(*req);
610
611	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
612	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
613				 V_RX_DACK_MODE(1) |
614				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
615	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
616	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
617	toep->tp_rcv_wup = toep->tp_copied_seq;
618}
619
620/*
621 * Handle receipt of an urgent pointer.
622 */
623static void
624handle_urg_ptr(struct socket *so, uint32_t urg_seq)
625{
626#ifdef URGENT_DATA_SUPPORTED
627	struct tcpcb *tp = so_sototcpcb(so);
628
629	urg_seq--;   /* initially points past the urgent data, per BSD */
630
631	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
632		return;                                 /* duplicate pointer */
633	sk_send_sigurg(sk);
634	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
635	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
636		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
637
638		tp->copied_seq++;
639		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
640			tom_eat_skb(sk, skb, 0);
641	}
642	tp->urg_data = TCP_URG_NOTYET;
643	tp->urg_seq = urg_seq;
644#endif
645}
646
647/*
648 * Returns true if a socket cannot accept new Rx data.
649 */
650static inline int
651so_no_receive(const struct socket *so)
652{
653	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
654}
655
656/*
657 * Process an urgent data notification.
658 */
659static void
660rx_urg_notify(struct toepcb *toep, struct mbuf *m)
661{
662	struct cpl_rx_urg_notify *hdr = cplhdr(m);
663	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
664
665	VALIDATE_SOCK(so);
666
667	if (!so_no_receive(so))
668		handle_urg_ptr(so, ntohl(hdr->seq));
669
670	m_freem(m);
671}
672
673/*
674 * Handler for RX_URG_NOTIFY CPL messages.
675 */
676static int
677do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
678{
679	struct toepcb *toep = (struct toepcb *)ctx;
680
681	rx_urg_notify(toep, m);
682	return (0);
683}
684
685static __inline int
686is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
687{
688	return (toep->tp_ulp_mode ||
689		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
690		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
691}
692
693/*
694 * Set of states for which we should return RX credits.
695 */
696#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
697
698/*
699 * Called after some received data has been read.  It returns RX credits
700 * to the HW for the amount of data processed.
701 */
702void
703t3_cleanup_rbuf(struct tcpcb *tp, int copied)
704{
705	struct toepcb *toep = tp->t_toe;
706	struct socket *so;
707	struct toedev *dev;
708	int dack_mode, must_send, read;
709	u32 thres, credits, dack = 0;
710	struct sockbuf *rcv;
711
712	so = inp_inpcbtosocket(tp->t_inpcb);
713	rcv = so_sockbuf_rcv(so);
714
715	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
716		(tp->t_state == TCPS_FIN_WAIT_2))) {
717		if (copied) {
718			sockbuf_lock(rcv);
719			toep->tp_copied_seq += copied;
720			sockbuf_unlock(rcv);
721		}
722
723		return;
724	}
725
726	inp_lock_assert(tp->t_inpcb);
727
728	sockbuf_lock(rcv);
729	if (copied)
730		toep->tp_copied_seq += copied;
731	else {
732		read = toep->tp_enqueued_bytes - rcv->sb_cc;
733		toep->tp_copied_seq += read;
734	}
735	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
736	toep->tp_enqueued_bytes = rcv->sb_cc;
737	sockbuf_unlock(rcv);
738
739	if (credits > rcv->sb_mbmax) {
740		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
741		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
742	    credits = rcv->sb_mbmax;
743	}
744
745
746	/*
747	 * XXX this won't accurately reflect credit return - we need
748	 * to look at the difference between the amount that has been
749	 * put in the recv sockbuf and what is there now
750	 */
751
752	if (__predict_false(!credits))
753		return;
754
755	dev = toep->tp_toedev;
756	thres = TOM_TUNABLE(dev, rx_credit_thres);
757
758	if (__predict_false(thres == 0))
759		return;
760
761	if (is_delack_mode_valid(dev, toep)) {
762		dack_mode = TOM_TUNABLE(dev, delack);
763		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
764			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
765
766			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
767				dack = F_RX_DACK_CHANGE |
768				       V_RX_DACK_MODE(dack_mode);
769		}
770	} else
771		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
772
773	/*
774	 * For coalescing to work effectively ensure the receive window has
775	 * at least 16KB left.
776	 */
777	must_send = credits + 16384 >= tp->rcv_wnd;
778
779	if (must_send || credits >= thres)
780		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
781}
782
783static int
784cxgb_toe_disconnect(struct tcpcb *tp)
785{
786	struct socket *so;
787
788	DPRINTF("cxgb_toe_disconnect\n");
789
790	so = inp_inpcbtosocket(tp->t_inpcb);
791	close_conn(so);
792	return (0);
793}
794
795static int
796cxgb_toe_reset(struct tcpcb *tp)
797{
798	struct toepcb *toep = tp->t_toe;
799
800	t3_send_reset(toep);
801
802	/*
803	 * unhook from socket
804	 */
805	tp->t_flags &= ~TF_TOE;
806	toep->tp_tp = NULL;
807	tp->t_toe = NULL;
808	return (0);
809}
810
811static int
812cxgb_toe_send(struct tcpcb *tp)
813{
814	struct socket *so;
815
816	DPRINTF("cxgb_toe_send\n");
817	dump_toepcb(tp->t_toe);
818
819	so = inp_inpcbtosocket(tp->t_inpcb);
820	t3_push_frames(so, 1);
821	return (0);
822}
823
824static int
825cxgb_toe_rcvd(struct tcpcb *tp)
826{
827
828	inp_lock_assert(tp->t_inpcb);
829
830	t3_cleanup_rbuf(tp, 0);
831
832	return (0);
833}
834
835static void
836cxgb_toe_detach(struct tcpcb *tp)
837{
838	struct toepcb *toep;
839
840        /*
841	 * XXX how do we handle teardown in the SYN_SENT state?
842	 *
843	 */
844	inp_lock_assert(tp->t_inpcb);
845	toep = tp->t_toe;
846	toep->tp_tp = NULL;
847
848	/*
849	 * unhook from socket
850	 */
851	tp->t_flags &= ~TF_TOE;
852	tp->t_toe = NULL;
853}
854
855
856static struct toe_usrreqs cxgb_toe_usrreqs = {
857	.tu_disconnect = cxgb_toe_disconnect,
858	.tu_reset = cxgb_toe_reset,
859	.tu_send = cxgb_toe_send,
860	.tu_rcvd = cxgb_toe_rcvd,
861	.tu_detach = cxgb_toe_detach,
862	.tu_detach = cxgb_toe_detach,
863	.tu_syncache_event = handle_syncache_event,
864};
865
866
867static void
868__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
869			    uint64_t mask, uint64_t val, int no_reply)
870{
871	struct cpl_set_tcb_field *req;
872
873	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
874	    toep->tp_tid, word, mask, val);
875
876	req = mtod(m, struct cpl_set_tcb_field *);
877	m->m_pkthdr.len = m->m_len = sizeof(*req);
878	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
879	req->wr.wr_lo = 0;
880	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
881	req->reply = V_NO_REPLY(no_reply);
882	req->cpu_idx = 0;
883	req->word = htons(word);
884	req->mask = htobe64(mask);
885	req->val = htobe64(val);
886
887	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
888	send_or_defer(toep, m, 0);
889}
890
891static void
892t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
893{
894	struct mbuf *m;
895	struct tcpcb *tp = toep->tp_tp;
896
897	if (toep == NULL)
898		return;
899
900	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
901		printf("not seting field\n");
902		return;
903	}
904
905	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
906
907	__set_tcb_field(toep, m, word, mask, val, 1);
908}
909
910/*
911 * Set one of the t_flags bits in the TCB.
912 */
913static void
914set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
915{
916
917	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
918}
919
920/*
921 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
922 */
923static void
924t3_set_nagle(struct toepcb *toep)
925{
926	struct tcpcb *tp = toep->tp_tp;
927
928	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
929}
930
931/*
932 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
933 */
934void
935t3_set_keepalive(struct toepcb *toep, int on_off)
936{
937
938	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
939}
940
941void
942t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
943{
944	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
945}
946
947void
948t3_set_dack_mss(struct toepcb *toep, int on_off)
949{
950
951	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
952}
953
954/*
955 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
956 */
957static void
958t3_set_tos(struct toepcb *toep)
959{
960	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
961
962	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
963			 V_TCB_TOS(tos));
964}
965
966
967/*
968 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
969 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
970 * set the PSH bit in the last segment, which would trigger delivery.]
971 * We work around the issue by setting a DDP buffer in a partial placed state,
972 * which guarantees that TP will schedule a timer.
973 */
974#define TP_DDP_TIMER_WORKAROUND_MASK\
975    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
976     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
977       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
978#define TP_DDP_TIMER_WORKAROUND_VAL\
979    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
980     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
981      32))
982
983static void
984t3_enable_ddp(struct toepcb *toep, int on)
985{
986	if (on) {
987
988		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
989				 V_TF_DDP_OFF(0));
990	} else
991		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
992				 V_TF_DDP_OFF(1) |
993				 TP_DDP_TIMER_WORKAROUND_MASK,
994				 V_TF_DDP_OFF(1) |
995				 TP_DDP_TIMER_WORKAROUND_VAL);
996
997}
998
999void
1000t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1001{
1002	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1003			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1004			 tag_color);
1005}
1006
1007void
1008t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1009		    unsigned int len)
1010{
1011	if (buf_idx == 0)
1012		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1013			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1014			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1015			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1016			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1017	else
1018		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1019			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1020			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1021			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1022			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1023}
1024
1025static int
1026t3_set_cong_control(struct socket *so, const char *name)
1027{
1028#ifdef CONGESTION_CONTROL_SUPPORTED
1029	int cong_algo;
1030
1031	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1032		if (!strcmp(name, t3_cong_ops[cong_algo].name))
1033			break;
1034
1035	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1036		return -EINVAL;
1037#endif
1038	return 0;
1039}
1040
1041int
1042t3_get_tcb(struct toepcb *toep)
1043{
1044	struct cpl_get_tcb *req;
1045	struct tcpcb *tp = toep->tp_tp;
1046	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1047
1048	if (!m)
1049		return (ENOMEM);
1050
1051	inp_lock_assert(tp->t_inpcb);
1052	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1053	req = mtod(m, struct cpl_get_tcb *);
1054	m->m_pkthdr.len = m->m_len = sizeof(*req);
1055	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1056	req->wr.wr_lo = 0;
1057	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1058	req->cpuno = htons(toep->tp_qset);
1059	req->rsvd = 0;
1060	if (tp->t_state == TCPS_SYN_SENT)
1061		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1062	else
1063		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1064	return 0;
1065}
1066
1067static inline void
1068so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1069{
1070
1071	toepcb_hold(toep);
1072
1073	cxgb_insert_tid(d->cdev, d->client, toep, tid);
1074}
1075
1076/**
1077 *	find_best_mtu - find the entry in the MTU table closest to an MTU
1078 *	@d: TOM state
1079 *	@mtu: the target MTU
1080 *
1081 *	Returns the index of the value in the MTU table that is closest to but
1082 *	does not exceed the target MTU.
1083 */
1084static unsigned int
1085find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1086{
1087	int i = 0;
1088
1089	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1090		++i;
1091	return (i);
1092}
1093
1094static unsigned int
1095select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1096{
1097	unsigned int idx;
1098
1099#ifdef notyet
1100	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1101#endif
1102	if (tp) {
1103		tp->t_maxseg = pmtu - 40;
1104		if (tp->t_maxseg < td->mtus[0] - 40)
1105			tp->t_maxseg = td->mtus[0] - 40;
1106		idx = find_best_mtu(td, tp->t_maxseg + 40);
1107
1108		tp->t_maxseg = td->mtus[idx] - 40;
1109	} else
1110		idx = find_best_mtu(td, pmtu);
1111
1112	return (idx);
1113}
1114
1115static inline void
1116free_atid(struct t3cdev *cdev, unsigned int tid)
1117{
1118	struct toepcb *toep = cxgb_free_atid(cdev, tid);
1119
1120	if (toep)
1121		toepcb_release(toep);
1122}
1123
1124/*
1125 * Release resources held by an offload connection (TID, L2T entry, etc.)
1126 */
1127static void
1128t3_release_offload_resources(struct toepcb *toep)
1129{
1130	struct tcpcb *tp = toep->tp_tp;
1131	struct toedev *tdev = toep->tp_toedev;
1132	struct t3cdev *cdev;
1133	struct socket *so;
1134	unsigned int tid = toep->tp_tid;
1135	struct sockbuf *rcv;
1136
1137	CTR0(KTR_TOM, "t3_release_offload_resources");
1138
1139	if (!tdev)
1140		return;
1141
1142	cdev = TOEP_T3C_DEV(toep);
1143	if (!cdev)
1144		return;
1145
1146	toep->tp_qset = 0;
1147	t3_release_ddp_resources(toep);
1148
1149#ifdef CTRL_SKB_CACHE
1150	kfree_skb(CTRL_SKB_CACHE(tp));
1151	CTRL_SKB_CACHE(tp) = NULL;
1152#endif
1153
1154	if (toep->tp_wr_avail != toep->tp_wr_max) {
1155		purge_wr_queue(toep);
1156		reset_wr_list(toep);
1157	}
1158
1159	if (toep->tp_l2t) {
1160		l2t_release(L2DATA(cdev), toep->tp_l2t);
1161		toep->tp_l2t = NULL;
1162	}
1163	toep->tp_tp = NULL;
1164	if (tp) {
1165		inp_lock_assert(tp->t_inpcb);
1166		so = inp_inpcbtosocket(tp->t_inpcb);
1167		rcv = so_sockbuf_rcv(so);
1168		/*
1169		 * cancel any offloaded reads
1170		 *
1171		 */
1172		sockbuf_lock(rcv);
1173		tp->t_toe = NULL;
1174		tp->t_flags &= ~TF_TOE;
1175		if (toep->tp_ddp_state.user_ddp_pending) {
1176			t3_cancel_ubuf(toep, rcv);
1177			toep->tp_ddp_state.user_ddp_pending = 0;
1178		}
1179		so_sorwakeup_locked(so);
1180
1181	}
1182
1183	if (toep->tp_state == TCPS_SYN_SENT) {
1184		free_atid(cdev, tid);
1185#ifdef notyet
1186		__skb_queue_purge(&tp->out_of_order_queue);
1187#endif
1188	} else {                                          // we have TID
1189		cxgb_remove_tid(cdev, toep, tid);
1190		toepcb_release(toep);
1191	}
1192#if 0
1193	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1194#endif
1195}
1196
1197static void
1198install_offload_ops(struct socket *so)
1199{
1200	struct tcpcb *tp = so_sototcpcb(so);
1201
1202	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1203
1204	t3_install_socket_ops(so);
1205	tp->t_flags |= TF_TOE;
1206	tp->t_tu = &cxgb_toe_usrreqs;
1207}
1208
1209/*
1210 * Determine the receive window scaling factor given a target max
1211 * receive window.
1212 */
1213static __inline int
1214select_rcv_wscale(int space, struct vnet *vnet)
1215{
1216	int wscale = 0;
1217
1218	if (space > MAX_RCV_WND)
1219		space = MAX_RCV_WND;
1220
1221	if (V_tcp_do_rfc1323)
1222		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1223
1224	return (wscale);
1225}
1226
1227/*
1228 * Determine the receive window size for a socket.
1229 */
1230static unsigned long
1231select_rcv_wnd(struct toedev *dev, struct socket *so)
1232{
1233	struct tom_data *d = TOM_DATA(dev);
1234	unsigned int wnd;
1235	unsigned int max_rcv_wnd;
1236	struct sockbuf *rcv;
1237
1238	rcv = so_sockbuf_rcv(so);
1239
1240	if (V_tcp_do_autorcvbuf)
1241		wnd = V_tcp_autorcvbuf_max;
1242	else
1243		wnd = rcv->sb_hiwat;
1244
1245
1246
1247	/* XXX
1248	 * For receive coalescing to work effectively we need a receive window
1249	 * that can accomodate a coalesced segment.
1250	 */
1251	if (wnd < MIN_RCV_WND)
1252		wnd = MIN_RCV_WND;
1253
1254	/* PR 5138 */
1255	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1256				    (uint32_t)d->rx_page_size * 23 :
1257				    MAX_RCV_WND);
1258
1259	return min(wnd, max_rcv_wnd);
1260}
1261
1262/*
1263 * Assign offload parameters to some socket fields.  This code is used by
1264 * both active and passive opens.
1265 */
1266static inline void
1267init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1268    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1269{
1270	struct tcpcb *tp = so_sototcpcb(so);
1271	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1272	struct sockbuf *snd, *rcv;
1273
1274#ifdef notyet
1275	SOCK_LOCK_ASSERT(so);
1276#endif
1277
1278	snd = so_sockbuf_snd(so);
1279	rcv = so_sockbuf_rcv(so);
1280
1281	log(LOG_INFO, "initializing offload socket\n");
1282	/*
1283	 * We either need to fix push frames to work with sbcompress
1284	 * or we need to add this
1285	 */
1286	snd->sb_flags |= SB_NOCOALESCE;
1287	rcv->sb_flags |= SB_NOCOALESCE;
1288
1289	tp->t_toe = toep;
1290	toep->tp_tp = tp;
1291	toep->tp_toedev = dev;
1292
1293	toep->tp_tid = tid;
1294	toep->tp_l2t = e;
1295	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1296	toep->tp_wr_unacked = 0;
1297	toep->tp_delack_mode = 0;
1298
1299	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1300	/*
1301	 * XXX broken
1302	 *
1303	 */
1304	tp->rcv_wnd = select_rcv_wnd(dev, so);
1305
1306        toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1307		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1308	toep->tp_qset_idx = 0;
1309
1310	reset_wr_list(toep);
1311	DPRINTF("initialization done\n");
1312}
1313
1314/*
1315 * The next two functions calculate the option 0 value for a socket.
1316 */
1317static inline unsigned int
1318calc_opt0h(struct socket *so, int mtu_idx)
1319{
1320	struct tcpcb *tp = so_sototcpcb(so);
1321	int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);
1322
1323	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1324	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1325	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1326}
1327
1328static inline unsigned int
1329calc_opt0l(struct socket *so, int ulp_mode)
1330{
1331	struct tcpcb *tp = so_sototcpcb(so);
1332	unsigned int val;
1333
1334	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1335	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1336
1337	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1338	return (val);
1339}
1340
1341static inline unsigned int
1342calc_opt2(const struct socket *so, struct toedev *dev)
1343{
1344	int flv_valid;
1345
1346	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1347
1348	return (V_FLAVORS_VALID(flv_valid) |
1349	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1350}
1351
1352#if DEBUG_WR > 1
1353static int
1354count_pending_wrs(const struct toepcb *toep)
1355{
1356	const struct mbuf *m;
1357	int n = 0;
1358
1359	wr_queue_walk(toep, m)
1360		n += m->m_pkthdr.csum_data;
1361	return (n);
1362}
1363#endif
1364
1365#if 0
1366(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1367#endif
1368
1369static void
1370mk_act_open_req(struct socket *so, struct mbuf *m,
1371    unsigned int atid, const struct l2t_entry *e)
1372{
1373	struct cpl_act_open_req *req;
1374	struct inpcb *inp = so_sotoinpcb(so);
1375	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1376	struct toepcb *toep = tp->t_toe;
1377	struct toedev *tdev = toep->tp_toedev;
1378
1379	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1380
1381	req = mtod(m, struct cpl_act_open_req *);
1382	m->m_pkthdr.len = m->m_len = sizeof(*req);
1383
1384	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1385	req->wr.wr_lo = 0;
1386	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1387	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1388#if 0
1389	req->local_port = inp->inp_lport;
1390	req->peer_port = inp->inp_fport;
1391	memcpy(&req->local_ip, &inp->inp_laddr, 4);
1392	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1393#endif
1394	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1395			   V_TX_CHANNEL(e->smt_idx));
1396	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1397	req->params = 0;
1398	req->opt2 = htonl(calc_opt2(so, tdev));
1399}
1400
1401
1402/*
1403 * Convert an ACT_OPEN_RPL status to an errno.
1404 */
1405static int
1406act_open_rpl_status_to_errno(int status)
1407{
1408	switch (status) {
1409	case CPL_ERR_CONN_RESET:
1410		return (ECONNREFUSED);
1411	case CPL_ERR_ARP_MISS:
1412		return (EHOSTUNREACH);
1413	case CPL_ERR_CONN_TIMEDOUT:
1414		return (ETIMEDOUT);
1415	case CPL_ERR_TCAM_FULL:
1416		return (ENOMEM);
1417	case CPL_ERR_CONN_EXIST:
1418		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1419		return (EADDRINUSE);
1420	default:
1421		return (EIO);
1422	}
1423}
1424
1425static void
1426fail_act_open(struct toepcb *toep, int errno)
1427{
1428	struct tcpcb *tp = toep->tp_tp;
1429
1430	t3_release_offload_resources(toep);
1431	if (tp) {
1432		inp_wunlock(tp->t_inpcb);
1433		tcp_offload_drop(tp, errno);
1434	}
1435
1436#ifdef notyet
1437	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1438#endif
1439}
1440
1441/*
1442 * Handle active open failures.
1443 */
1444static void
1445active_open_failed(struct toepcb *toep, struct mbuf *m)
1446{
1447	struct cpl_act_open_rpl *rpl = cplhdr(m);
1448	struct inpcb *inp;
1449
1450	if (toep->tp_tp == NULL)
1451		goto done;
1452
1453	inp = toep->tp_tp->t_inpcb;
1454
1455/*
1456 * Don't handle connection retry for now
1457 */
1458#ifdef notyet
1459	struct inet_connection_sock *icsk = inet_csk(sk);
1460
1461	if (rpl->status == CPL_ERR_CONN_EXIST &&
1462	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1463		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1464		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1465			       jiffies + HZ / 2);
1466	} else
1467#endif
1468	{
1469		inp_wlock(inp);
1470		/*
1471		 * drops the inpcb lock
1472		 */
1473		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1474	}
1475
1476	done:
1477	m_free(m);
1478}
1479
1480/*
1481 * Return whether a failed active open has allocated a TID
1482 */
1483static inline int
1484act_open_has_tid(int status)
1485{
1486	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1487	       status != CPL_ERR_ARP_MISS;
1488}
1489
1490/*
1491 * Process an ACT_OPEN_RPL CPL message.
1492 */
1493static int
1494do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1495{
1496	struct toepcb *toep = (struct toepcb *)ctx;
1497	struct cpl_act_open_rpl *rpl = cplhdr(m);
1498
1499	if (cdev->type != T3A && act_open_has_tid(rpl->status))
1500		cxgb_queue_tid_release(cdev, GET_TID(rpl));
1501
1502	active_open_failed(toep, m);
1503	return (0);
1504}
1505
1506/*
1507 * Handle an ARP failure for an active open.   XXX purge ofo queue
1508 *
1509 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1510 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1511 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
1512 * free the atid.  Hmm.
1513 */
1514#ifdef notyet
1515static void
1516act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1517{
1518	struct toepcb *toep = m_get_toep(m);
1519	struct tcpcb *tp = toep->tp_tp;
1520	struct inpcb *inp = tp->t_inpcb;
1521	struct socket *so;
1522
1523	inp_wlock(inp);
1524	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1525		/*
1526		 * drops the inpcb lock
1527		 */
1528		fail_act_open(so, EHOSTUNREACH);
1529		printf("freeing %p\n", m);
1530
1531		m_free(m);
1532	} else
1533		inp_wunlock(inp);
1534}
1535#endif
1536/*
1537 * Send an active open request.
1538 */
1539int
1540t3_connect(struct toedev *tdev, struct socket *so,
1541    struct rtentry *rt, struct sockaddr *nam)
1542{
1543	struct mbuf *m;
1544	struct l2t_entry *e;
1545	struct tom_data *d = TOM_DATA(tdev);
1546	struct inpcb *inp = so_sotoinpcb(so);
1547	struct tcpcb *tp = intotcpcb(inp);
1548	struct toepcb *toep; /* allocated by init_offload_socket */
1549
1550	int atid;
1551
1552	toep = toepcb_alloc();
1553	if (toep == NULL)
1554		goto out_err;
1555
1556	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1557		goto out_err;
1558
1559	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1560	if (!e)
1561		goto free_tid;
1562
1563	inp_lock_assert(inp);
1564	m = m_gethdr(MT_DATA, M_WAITOK);
1565
1566#if 0
1567	m->m_toe.mt_toepcb = tp->t_toe;
1568	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1569#endif
1570	so_lock(so);
1571
1572	init_offload_socket(so, tdev, atid, e, rt, toep);
1573
1574	install_offload_ops(so);
1575
1576	mk_act_open_req(so, m, atid, e);
1577	so_unlock(so);
1578
1579	soisconnecting(so);
1580	toep = tp->t_toe;
1581	m_set_toep(m, tp->t_toe);
1582
1583	toep->tp_state = TCPS_SYN_SENT;
1584	l2t_send(d->cdev, (struct mbuf *)m, e);
1585
1586	if (toep->tp_ulp_mode)
1587		t3_enable_ddp(toep, 0);
1588	return 	(0);
1589
1590free_tid:
1591	printf("failing connect - free atid\n");
1592
1593	free_atid(d->cdev, atid);
1594out_err:
1595	printf("return ENOMEM\n");
1596       return (ENOMEM);
1597}
1598
1599/*
1600 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1601 * not send multiple ABORT_REQs for the same connection and also that we do
1602 * not try to send a message after the connection has closed.  Returns 1 if
1603 * an ABORT_REQ wasn't generated after all, 0 otherwise.
1604 */
1605static void
1606t3_send_reset(struct toepcb *toep)
1607{
1608
1609	struct cpl_abort_req *req;
1610	unsigned int tid = toep->tp_tid;
1611	int mode = CPL_ABORT_SEND_RST;
1612	struct tcpcb *tp = toep->tp_tp;
1613	struct toedev *tdev = toep->tp_toedev;
1614	struct socket *so = NULL;
1615	struct mbuf *m;
1616	struct sockbuf *snd;
1617
1618	if (tp) {
1619		inp_lock_assert(tp->t_inpcb);
1620		so = inp_inpcbtosocket(tp->t_inpcb);
1621	}
1622
1623	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1624		tdev == NULL))
1625		return;
1626	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1627
1628	snd = so_sockbuf_snd(so);
1629	/* Purge the send queue so we don't send anything after an abort. */
1630	if (so)
1631		sbflush(snd);
1632	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1633		mode |= CPL_ABORT_POST_CLOSE_REQ;
1634
1635	m = m_gethdr_nofail(sizeof(*req));
1636	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1637	set_arp_failure_handler(m, abort_arp_failure);
1638
1639	req = mtod(m, struct cpl_abort_req *);
1640	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1641	req->wr.wr_lo = htonl(V_WR_TID(tid));
1642	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1643	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1644	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1645	req->cmd = mode;
1646	if (tp && (tp->t_state == TCPS_SYN_SENT))
1647		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1648	else
1649		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1650}
1651
1652static int
1653t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1654{
1655	struct inpcb *inp;
1656	int error, optval;
1657
1658	if (sopt->sopt_name == IP_OPTIONS)
1659		return (ENOPROTOOPT);
1660
1661	if (sopt->sopt_name != IP_TOS)
1662		return (EOPNOTSUPP);
1663
1664	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1665
1666	if (error)
1667		return (error);
1668
1669	if (optval > IPTOS_PREC_CRITIC_ECP)
1670		return (EINVAL);
1671
1672	inp = so_sotoinpcb(so);
1673	inp_wlock(inp);
1674	inp_ip_tos_set(inp, optval);
1675#if 0
1676	inp->inp_ip_tos = optval;
1677#endif
1678	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1679	inp_wunlock(inp);
1680
1681	return (0);
1682}
1683
1684static int
1685t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1686{
1687	int err = 0;
1688	size_t copied;
1689
1690	if (sopt->sopt_name != TCP_CONGESTION &&
1691	    sopt->sopt_name != TCP_NODELAY)
1692		return (EOPNOTSUPP);
1693
1694	if (sopt->sopt_name == TCP_CONGESTION) {
1695		char name[TCP_CA_NAME_MAX];
1696		int optlen = sopt->sopt_valsize;
1697		struct tcpcb *tp;
1698
1699		if (sopt->sopt_dir == SOPT_GET) {
1700			KASSERT(0, ("unimplemented"));
1701			return (EOPNOTSUPP);
1702		}
1703
1704		if (optlen < 1)
1705			return (EINVAL);
1706
1707		err = copyinstr(sopt->sopt_val, name,
1708		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1709		if (err)
1710			return (err);
1711		if (copied < 1)
1712			return (EINVAL);
1713
1714		tp = so_sototcpcb(so);
1715		/*
1716		 * XXX I need to revisit this
1717		 */
1718		if ((err = t3_set_cong_control(so, name)) == 0) {
1719#ifdef CONGESTION_CONTROL_SUPPORTED
1720			tp->t_cong_control = strdup(name, M_CXGB);
1721#endif
1722		} else
1723			return (err);
1724	} else {
1725		int optval, oldval;
1726		struct inpcb *inp;
1727		struct tcpcb *tp;
1728
1729		if (sopt->sopt_dir == SOPT_GET)
1730			return (EOPNOTSUPP);
1731
1732		err = sooptcopyin(sopt, &optval, sizeof optval,
1733		    sizeof optval);
1734
1735		if (err)
1736			return (err);
1737
1738		inp = so_sotoinpcb(so);
1739		inp_wlock(inp);
1740		tp = inp_inpcbtotcpcb(inp);
1741
1742		oldval = tp->t_flags;
1743		if (optval)
1744			tp->t_flags |= TF_NODELAY;
1745		else
1746			tp->t_flags &= ~TF_NODELAY;
1747		inp_wunlock(inp);
1748
1749
1750		if (oldval != tp->t_flags && (tp->t_toe != NULL))
1751			t3_set_nagle(tp->t_toe);
1752
1753	}
1754
1755	return (0);
1756}
1757
1758int
1759t3_ctloutput(struct socket *so, struct sockopt *sopt)
1760{
1761	int err;
1762
1763	if (sopt->sopt_level != IPPROTO_TCP)
1764		err =  t3_ip_ctloutput(so, sopt);
1765	else
1766		err = t3_tcp_ctloutput(so, sopt);
1767
1768	if (err != EOPNOTSUPP)
1769		return (err);
1770
1771	return (tcp_ctloutput(so, sopt));
1772}
1773
1774/*
1775 * Returns true if we need to explicitly request RST when we receive new data
1776 * on an RX-closed connection.
1777 */
1778static inline int
1779need_rst_on_excess_rx(const struct toepcb *toep)
1780{
1781	return (1);
1782}
1783
1784/*
1785 * Handles Rx data that arrives in a state where the socket isn't accepting
1786 * new data.
1787 */
1788static void
1789handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1790{
1791
1792	if (need_rst_on_excess_rx(toep) &&
1793	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1794		t3_send_reset(toep);
1795	m_freem(m);
1796}
1797
1798/*
1799 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1800 * by getting the DDP offset from the TCB.
1801 */
1802static void
1803tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1804{
1805	struct ddp_state *q = &toep->tp_ddp_state;
1806	struct ddp_buf_state *bsp;
1807	struct cpl_get_tcb_rpl *hdr;
1808	unsigned int ddp_offset;
1809	struct socket *so;
1810	struct tcpcb *tp;
1811	struct sockbuf *rcv;
1812	int state;
1813
1814	uint64_t t;
1815	__be64 *tcb;
1816
1817	tp = toep->tp_tp;
1818	so = inp_inpcbtosocket(tp->t_inpcb);
1819
1820	inp_lock_assert(tp->t_inpcb);
1821	rcv = so_sockbuf_rcv(so);
1822	sockbuf_lock(rcv);
1823
1824	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1825	 * We really need a cookie in order to dispatch the RPLs.
1826	 */
1827	q->get_tcb_count--;
1828
1829	/* It is a possible that a previous CPL already invalidated UBUF DDP
1830	 * and moved the cur_buf idx and hence no further processing of this
1831	 * skb is required. However, the app might be sleeping on
1832	 * !q->get_tcb_count and we need to wake it up.
1833	 */
1834	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1835		int state = so_state_get(so);
1836
1837		m_freem(m);
1838		if (__predict_true((state & SS_NOFDREF) == 0))
1839			so_sorwakeup_locked(so);
1840		else
1841			sockbuf_unlock(rcv);
1842
1843		return;
1844	}
1845
1846	bsp = &q->buf_state[q->cur_buf];
1847	hdr = cplhdr(m);
1848	tcb = (__be64 *)(hdr + 1);
1849	if (q->cur_buf == 0) {
1850		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1851		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1852	} else {
1853		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1854		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1855	}
1856	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1857	m->m_cur_offset = bsp->cur_offset;
1858	bsp->cur_offset = ddp_offset;
1859	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1860
1861	CTR5(KTR_TOM,
1862	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1863	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1864	KASSERT(ddp_offset >= m->m_cur_offset,
1865	    ("ddp_offset=%u less than cur_offset=%u",
1866		ddp_offset, m->m_cur_offset));
1867
1868#if 0
1869{
1870	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1871
1872	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1873	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1874
1875        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1876        rcv_nxt = t >> S_TCB_RCV_NXT;
1877        rcv_nxt &= M_TCB_RCV_NXT;
1878
1879        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1880        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1881        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1882
1883	T3_TRACE2(TIDTB(sk),
1884		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1885		  ddp_flags, rcv_nxt - rx_hdr_offset);
1886	T3_TRACE4(TB(q),
1887		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1888		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1889	T3_TRACE3(TB(q),
1890		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1891		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1892	T3_TRACE2(TB(q),
1893		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1894		 q->buf_state[0].flags, q->buf_state[1].flags);
1895
1896}
1897#endif
1898	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1899		handle_excess_rx(toep, m);
1900		return;
1901	}
1902
1903#ifdef T3_TRACE
1904	if ((int)m->m_pkthdr.len < 0) {
1905		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1906	}
1907#endif
1908	if (bsp->flags & DDP_BF_NOCOPY) {
1909#ifdef T3_TRACE
1910		T3_TRACE0(TB(q),
1911			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1912
1913		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1914			printk("!cancel_ubuf");
1915			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1916		}
1917#endif
1918		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1919		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1920		q->cur_buf ^= 1;
1921	} else if (bsp->flags & DDP_BF_NOFLIP) {
1922
1923		m->m_ddp_flags = 1;    /* always a kernel buffer */
1924
1925		/* now HW buffer carries a user buffer */
1926		bsp->flags &= ~DDP_BF_NOFLIP;
1927		bsp->flags |= DDP_BF_NOCOPY;
1928
1929		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1930		 * any new data in which case we're done. If in addition the
1931		 * offset is 0, then there wasn't a completion for the kbuf
1932		 * and we need to decrement the posted count.
1933		 */
1934		if (m->m_pkthdr.len == 0) {
1935			if (ddp_offset == 0) {
1936				q->kbuf_posted--;
1937				bsp->flags |= DDP_BF_NODATA;
1938			}
1939			sockbuf_unlock(rcv);
1940			m_free(m);
1941			return;
1942		}
1943	} else {
1944		sockbuf_unlock(rcv);
1945
1946		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1947		 * but it got here way late and nobody cares anymore.
1948		 */
1949		m_free(m);
1950		return;
1951	}
1952
1953	m->m_ddp_gl = (unsigned char *)bsp->gl;
1954	m->m_flags |= M_DDP;
1955	m->m_seq = tp->rcv_nxt;
1956	tp->rcv_nxt += m->m_pkthdr.len;
1957	tp->t_rcvtime = ticks;
1958	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1959		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1960	if (m->m_pkthdr.len == 0) {
1961		q->user_ddp_pending = 0;
1962		m_free(m);
1963	} else
1964		SBAPPEND(rcv, m);
1965
1966	state = so_state_get(so);
1967	if (__predict_true((state & SS_NOFDREF) == 0))
1968		so_sorwakeup_locked(so);
1969	else
1970		sockbuf_unlock(rcv);
1971}
1972
1973/*
1974 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1975 * in that case they are similar to DDP completions.
1976 */
1977static int
1978do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1979{
1980	struct toepcb *toep = (struct toepcb *)ctx;
1981
1982	/* OK if socket doesn't exist */
1983	if (toep == NULL) {
1984		printf("null toep in do_get_tcb_rpl\n");
1985		return (CPL_RET_BUF_DONE);
1986	}
1987
1988	inp_wlock(toep->tp_tp->t_inpcb);
1989	tcb_rpl_as_ddp_complete(toep, m);
1990	inp_wunlock(toep->tp_tp->t_inpcb);
1991
1992	return (0);
1993}
1994
1995static void
1996handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1997{
1998	struct tcpcb *tp = toep->tp_tp;
1999	struct socket *so;
2000	struct ddp_state *q;
2001	struct ddp_buf_state *bsp;
2002	struct cpl_rx_data *hdr = cplhdr(m);
2003	unsigned int rcv_nxt = ntohl(hdr->seq);
2004	struct sockbuf *rcv;
2005
2006	if (tp->rcv_nxt == rcv_nxt)
2007		return;
2008
2009	inp_lock_assert(tp->t_inpcb);
2010	so  = inp_inpcbtosocket(tp->t_inpcb);
2011	rcv = so_sockbuf_rcv(so);
2012	sockbuf_lock(rcv);
2013
2014	q = &toep->tp_ddp_state;
2015	bsp = &q->buf_state[q->cur_buf];
2016	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2017		rcv_nxt, tp->rcv_nxt));
2018	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2019	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2020	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2021	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2022
2023#ifdef T3_TRACE
2024	if ((int)m->m_pkthdr.len < 0) {
2025		t3_ddp_error(so, "handle_ddp_data: neg len");
2026	}
2027#endif
2028	m->m_ddp_gl = (unsigned char *)bsp->gl;
2029	m->m_flags |= M_DDP;
2030	m->m_cur_offset = bsp->cur_offset;
2031	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2032	if (bsp->flags & DDP_BF_NOCOPY)
2033		bsp->flags &= ~DDP_BF_NOCOPY;
2034
2035	m->m_seq = tp->rcv_nxt;
2036	tp->rcv_nxt = rcv_nxt;
2037	bsp->cur_offset += m->m_pkthdr.len;
2038	if (!(bsp->flags & DDP_BF_NOFLIP))
2039		q->cur_buf ^= 1;
2040	/*
2041	 * For now, don't re-enable DDP after a connection fell out of  DDP
2042	 * mode.
2043	 */
2044	q->ubuf_ddp_ready = 0;
2045	sockbuf_unlock(rcv);
2046}
2047
2048/*
2049 * Process new data received for a connection.
2050 */
2051static void
2052new_rx_data(struct toepcb *toep, struct mbuf *m)
2053{
2054	struct cpl_rx_data *hdr = cplhdr(m);
2055	struct tcpcb *tp = toep->tp_tp;
2056	struct socket *so;
2057	struct sockbuf *rcv;
2058	int state;
2059	int len = be16toh(hdr->len);
2060
2061	inp_wlock(tp->t_inpcb);
2062
2063	so  = inp_inpcbtosocket(tp->t_inpcb);
2064
2065	if (__predict_false(so_no_receive(so))) {
2066		handle_excess_rx(toep, m);
2067		inp_wunlock(tp->t_inpcb);
2068		TRACE_EXIT;
2069		return;
2070	}
2071
2072	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2073		handle_ddp_data(toep, m);
2074
2075	m->m_seq = ntohl(hdr->seq);
2076	m->m_ulp_mode = 0;                    /* for iSCSI */
2077
2078#if VALIDATE_SEQ
2079	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2080		log(LOG_ERR,
2081		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2082		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2083		       tp->rcv_nxt);
2084		m_freem(m);
2085		inp_wunlock(tp->t_inpcb);
2086		return;
2087	}
2088#endif
2089	m_adj(m, sizeof(*hdr));
2090
2091#ifdef URGENT_DATA_SUPPORTED
2092	/*
2093	 * We don't handle urgent data yet
2094	 */
2095	if (__predict_false(hdr->urg))
2096		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2097	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2098		     tp->urg_seq - tp->rcv_nxt < skb->len))
2099		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2100							 tp->rcv_nxt];
2101#endif
2102	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2103		toep->tp_delack_mode = hdr->dack_mode;
2104		toep->tp_delack_seq = tp->rcv_nxt;
2105	}
2106	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2107	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2108
2109	if (len < m->m_pkthdr.len)
2110		m->m_pkthdr.len = m->m_len = len;
2111
2112	tp->rcv_nxt += m->m_pkthdr.len;
2113	tp->t_rcvtime = ticks;
2114	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2115	CTR2(KTR_TOM,
2116	    "new_rx_data: seq 0x%x len %u",
2117	    m->m_seq, m->m_pkthdr.len);
2118	inp_wunlock(tp->t_inpcb);
2119	rcv = so_sockbuf_rcv(so);
2120	sockbuf_lock(rcv);
2121#if 0
2122	if (sb_notify(rcv))
2123		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2124#endif
2125	SBAPPEND(rcv, m);
2126
2127#ifdef notyet
2128	/*
2129	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2130	 *
2131	 */
2132	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2133
2134	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2135		so, rcv->sb_cc, rcv->sb_mbmax));
2136#endif
2137
2138
2139	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2140	    rcv->sb_cc, rcv->sb_mbcnt);
2141
2142	state = so_state_get(so);
2143	if (__predict_true((state & SS_NOFDREF) == 0))
2144		so_sorwakeup_locked(so);
2145	else
2146		sockbuf_unlock(rcv);
2147}
2148
2149/*
2150 * Handler for RX_DATA CPL messages.
2151 */
2152static int
2153do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2154{
2155	struct toepcb *toep = (struct toepcb *)ctx;
2156
2157	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2158
2159	new_rx_data(toep, m);
2160
2161	return (0);
2162}
2163
2164static void
2165new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2166{
2167	struct tcpcb *tp;
2168	struct ddp_state *q;
2169	struct ddp_buf_state *bsp;
2170	struct cpl_rx_data_ddp *hdr;
2171	struct socket *so;
2172	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2173	int nomoredata = 0;
2174	unsigned int delack_mode;
2175	struct sockbuf *rcv;
2176
2177	tp = toep->tp_tp;
2178	inp_wlock(tp->t_inpcb);
2179	so = inp_inpcbtosocket(tp->t_inpcb);
2180
2181	if (__predict_false(so_no_receive(so))) {
2182
2183		handle_excess_rx(toep, m);
2184		inp_wunlock(tp->t_inpcb);
2185		return;
2186	}
2187
2188	q = &toep->tp_ddp_state;
2189	hdr = cplhdr(m);
2190	ddp_report = ntohl(hdr->u.ddp_report);
2191	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2192	bsp = &q->buf_state[buf_idx];
2193
2194	CTR4(KTR_TOM,
2195	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2196	    "hdr seq 0x%x len %u",
2197	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2198	    ntohs(hdr->len));
2199	CTR3(KTR_TOM,
2200	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2201	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2202
2203	ddp_len = ntohs(hdr->len);
2204	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2205
2206	delack_mode = G_DDP_DACK_MODE(ddp_report);
2207	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2208		toep->tp_delack_mode = delack_mode;
2209		toep->tp_delack_seq = tp->rcv_nxt;
2210	}
2211
2212	m->m_seq = tp->rcv_nxt;
2213	tp->rcv_nxt = rcv_nxt;
2214
2215	tp->t_rcvtime = ticks;
2216	/*
2217	 * Store the length in m->m_len.  We are changing the meaning of
2218	 * m->m_len here, we need to be very careful that nothing from now on
2219	 * interprets ->len of this packet the usual way.
2220	 */
2221	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2222	inp_wunlock(tp->t_inpcb);
2223	CTR3(KTR_TOM,
2224	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2225	    m->m_len, rcv_nxt, m->m_seq);
2226	/*
2227	 * Figure out where the new data was placed in the buffer and store it
2228	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
2229	 * account for page pod's pg_offset.
2230	 */
2231	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2232	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2233
2234	rcv = so_sockbuf_rcv(so);
2235	sockbuf_lock(rcv);
2236
2237	m->m_ddp_gl = (unsigned char *)bsp->gl;
2238	m->m_flags |= M_DDP;
2239	bsp->cur_offset = end_offset;
2240	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2241
2242	/*
2243	 * Length is only meaningful for kbuf
2244	 */
2245	if (!(bsp->flags & DDP_BF_NOCOPY))
2246		KASSERT(m->m_len <= bsp->gl->dgl_length,
2247		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2248			m->m_len, bsp->gl->dgl_length));
2249
2250	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2251	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2252        /*
2253	 * Bit 0 of flags stores whether the DDP buffer is completed.
2254	 * Note that other parts of the code depend on this being in bit 0.
2255	 */
2256	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2257		panic("spurious ddp completion");
2258	} else {
2259		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2260		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2261			q->cur_buf ^= 1;                     /* flip buffers */
2262	}
2263
2264	if (bsp->flags & DDP_BF_NOCOPY) {
2265		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2266		bsp->flags &= ~DDP_BF_NOCOPY;
2267	}
2268
2269	if (ddp_report & F_DDP_PSH)
2270		m->m_ddp_flags |= DDP_BF_PSH;
2271	if (nomoredata)
2272		m->m_ddp_flags |= DDP_BF_NODATA;
2273
2274#ifdef notyet
2275	skb_reset_transport_header(skb);
2276	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2277#endif
2278	SBAPPEND(rcv, m);
2279
2280	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2281	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2282		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2283		so_sorwakeup_locked(so);
2284	else
2285		sockbuf_unlock(rcv);
2286}
2287
2288#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2289		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2290		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2291		 F_DDP_INVALID_PPOD)
2292
2293/*
2294 * Handler for RX_DATA_DDP CPL messages.
2295 */
2296static int
2297do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2298{
2299	struct toepcb *toep = ctx;
2300	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2301
2302	VALIDATE_SOCK(so);
2303
2304	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2305		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2306		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2307		return (CPL_RET_BUF_DONE);
2308	}
2309#if 0
2310	skb->h.th = tcphdr_skb->h.th;
2311#endif
2312	new_rx_data_ddp(toep, m);
2313	return (0);
2314}
2315
2316static void
2317process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2318{
2319	struct tcpcb *tp = toep->tp_tp;
2320	struct socket *so;
2321	struct ddp_state *q;
2322	struct ddp_buf_state *bsp;
2323	struct cpl_rx_ddp_complete *hdr;
2324	unsigned int ddp_report, buf_idx, when, delack_mode;
2325	int nomoredata = 0;
2326	struct sockbuf *rcv;
2327
2328	inp_wlock(tp->t_inpcb);
2329	so = inp_inpcbtosocket(tp->t_inpcb);
2330
2331	if (__predict_false(so_no_receive(so))) {
2332		struct inpcb *inp = so_sotoinpcb(so);
2333
2334		handle_excess_rx(toep, m);
2335		inp_wunlock(inp);
2336		return;
2337	}
2338	q = &toep->tp_ddp_state;
2339	hdr = cplhdr(m);
2340	ddp_report = ntohl(hdr->ddp_report);
2341	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2342	m->m_pkthdr.csum_data = tp->rcv_nxt;
2343
2344	rcv = so_sockbuf_rcv(so);
2345	sockbuf_lock(rcv);
2346
2347	bsp = &q->buf_state[buf_idx];
2348	when = bsp->cur_offset;
2349	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2350	tp->rcv_nxt += m->m_len;
2351	tp->t_rcvtime = ticks;
2352
2353	delack_mode = G_DDP_DACK_MODE(ddp_report);
2354	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2355		toep->tp_delack_mode = delack_mode;
2356		toep->tp_delack_seq = tp->rcv_nxt;
2357	}
2358#ifdef notyet
2359	skb_reset_transport_header(skb);
2360	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2361#endif
2362	inp_wunlock(tp->t_inpcb);
2363
2364	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2365	CTR5(KTR_TOM,
2366		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2367		  "ddp_report 0x%x offset %u, len %u",
2368		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2369		   G_DDP_OFFSET(ddp_report), m->m_len);
2370
2371	m->m_cur_offset = bsp->cur_offset;
2372	bsp->cur_offset += m->m_len;
2373
2374	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2375		q->cur_buf ^= 1;                     /* flip buffers */
2376		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2377			nomoredata=1;
2378	}
2379
2380	CTR4(KTR_TOM,
2381		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2382		  "ddp_report %u offset %u",
2383		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2384		   G_DDP_OFFSET(ddp_report));
2385
2386	m->m_ddp_gl = (unsigned char *)bsp->gl;
2387	m->m_flags |= M_DDP;
2388	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2389	if (bsp->flags & DDP_BF_NOCOPY)
2390		bsp->flags &= ~DDP_BF_NOCOPY;
2391	if (nomoredata)
2392		m->m_ddp_flags |= DDP_BF_NODATA;
2393
2394	SBAPPEND(rcv, m);
2395	if ((so_state_get(so) & SS_NOFDREF) == 0)
2396		so_sorwakeup_locked(so);
2397	else
2398		sockbuf_unlock(rcv);
2399}
2400
2401/*
2402 * Handler for RX_DDP_COMPLETE CPL messages.
2403 */
2404static int
2405do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2406{
2407	struct toepcb *toep = ctx;
2408
2409	VALIDATE_SOCK(so);
2410#if 0
2411	skb->h.th = tcphdr_skb->h.th;
2412#endif
2413	process_ddp_complete(toep, m);
2414	return (0);
2415}
2416
2417/*
2418 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2419 * socket state before calling tcp_time_wait to comply with its expectations.
2420 */
2421static void
2422enter_timewait(struct tcpcb *tp)
2423{
2424	/*
2425	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2426	 * process peer_close because we don't want to carry the peer FIN in
2427	 * the socket's receive queue and if we increment rcv_nxt without
2428	 * having the FIN in the receive queue we'll confuse facilities such
2429	 * as SIOCINQ.
2430	 */
2431	inp_wlock(tp->t_inpcb);
2432	tp->rcv_nxt++;
2433
2434	tp->ts_recent_age = 0;	     /* defeat recycling */
2435	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2436	inp_wunlock(tp->t_inpcb);
2437	tcp_offload_twstart(tp);
2438}
2439
2440/*
2441 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2442 * function deals with the data that may be reported along with the FIN.
2443 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2444 * perform normal FIN-related processing.  In the latter case 1 indicates that
2445 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2446 * skb can be freed.
2447 */
2448static int
2449handle_peer_close_data(struct socket *so, struct mbuf *m)
2450{
2451	struct tcpcb *tp = so_sototcpcb(so);
2452	struct toepcb *toep = tp->t_toe;
2453	struct ddp_state *q;
2454	struct ddp_buf_state *bsp;
2455	struct cpl_peer_close *req = cplhdr(m);
2456	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2457	struct sockbuf *rcv;
2458
2459	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2460		return (0);
2461
2462	CTR0(KTR_TOM, "handle_peer_close_data");
2463	if (__predict_false(so_no_receive(so))) {
2464		handle_excess_rx(toep, m);
2465
2466		/*
2467		 * Although we discard the data we want to process the FIN so
2468		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2469		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2470		 * may be what will close the connection.  We return 1 because
2471		 * handle_excess_rx() already freed the packet.
2472		 */
2473		return (1);
2474	}
2475
2476	inp_lock_assert(tp->t_inpcb);
2477	q = &toep->tp_ddp_state;
2478	rcv = so_sockbuf_rcv(so);
2479	sockbuf_lock(rcv);
2480
2481	bsp = &q->buf_state[q->cur_buf];
2482	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2483	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2484	m->m_ddp_gl = (unsigned char *)bsp->gl;
2485	m->m_flags |= M_DDP;
2486	m->m_cur_offset = bsp->cur_offset;
2487	m->m_ddp_flags =
2488	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2489	m->m_seq = tp->rcv_nxt;
2490	tp->rcv_nxt = rcv_nxt;
2491	bsp->cur_offset += m->m_pkthdr.len;
2492	if (!(bsp->flags & DDP_BF_NOFLIP))
2493		q->cur_buf ^= 1;
2494#ifdef notyet
2495	skb_reset_transport_header(skb);
2496	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2497#endif
2498	tp->t_rcvtime = ticks;
2499	SBAPPEND(rcv, m);
2500	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2501		so_sorwakeup_locked(so);
2502	else
2503		sockbuf_unlock(rcv);
2504
2505	return (1);
2506}
2507
2508/*
2509 * Handle a peer FIN.
2510 */
2511static void
2512do_peer_fin(struct toepcb *toep, struct mbuf *m)
2513{
2514	struct socket *so;
2515	struct tcpcb *tp = toep->tp_tp;
2516	int keep, action;
2517
2518	action = keep = 0;
2519	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2520	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2521		printf("abort_pending set\n");
2522
2523		goto out;
2524	}
2525	inp_wlock(tp->t_inpcb);
2526	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2527	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2528		keep = handle_peer_close_data(so, m);
2529		if (keep < 0) {
2530			inp_wunlock(tp->t_inpcb);
2531			return;
2532		}
2533	}
2534	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2535		CTR1(KTR_TOM,
2536		    "waking up waiters for cantrcvmore on %p ", so);
2537		socantrcvmore(so);
2538
2539		/*
2540		 * If connection is half-synchronized
2541		 * (ie NEEDSYN flag on) then delay ACK,
2542		 * so it may be piggybacked when SYN is sent.
2543		 * Otherwise, since we received a FIN then no
2544		 * more input can be expected, send ACK now.
2545		 */
2546		if (tp->t_flags & TF_NEEDSYN)
2547			tp->t_flags |= TF_DELACK;
2548		else
2549			tp->t_flags |= TF_ACKNOW;
2550		tp->rcv_nxt++;
2551	}
2552
2553	switch (tp->t_state) {
2554	case TCPS_SYN_RECEIVED:
2555	    tp->t_starttime = ticks;
2556	/* FALLTHROUGH */
2557	case TCPS_ESTABLISHED:
2558		tp->t_state = TCPS_CLOSE_WAIT;
2559		break;
2560	case TCPS_FIN_WAIT_1:
2561		tp->t_state = TCPS_CLOSING;
2562		break;
2563	case TCPS_FIN_WAIT_2:
2564		/*
2565		 * If we've sent an abort_req we must have sent it too late,
2566		 * HW will send us a reply telling us so, and this peer_close
2567		 * is really the last message for this connection and needs to
2568		 * be treated as an abort_rpl, i.e., transition the connection
2569		 * to TCP_CLOSE (note that the host stack does this at the
2570		 * time of generating the RST but we must wait for HW).
2571		 * Otherwise we enter TIME_WAIT.
2572		 */
2573		t3_release_offload_resources(toep);
2574		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2575			action = TCP_CLOSE;
2576		} else {
2577			action = TCP_TIMEWAIT;
2578		}
2579		break;
2580	default:
2581		log(LOG_ERR,
2582		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2583		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2584	}
2585	inp_wunlock(tp->t_inpcb);
2586
2587	if (action == TCP_TIMEWAIT) {
2588		enter_timewait(tp);
2589	} else if (action == TCP_DROP) {
2590		tcp_offload_drop(tp, 0);
2591	} else if (action == TCP_CLOSE) {
2592		tcp_offload_close(tp);
2593	}
2594
2595#ifdef notyet
2596	/* Do not send POLL_HUP for half duplex close. */
2597	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2598	    sk->sk_state == TCP_CLOSE)
2599		sk_wake_async(so, 1, POLL_HUP);
2600	else
2601		sk_wake_async(so, 1, POLL_IN);
2602#endif
2603
2604out:
2605	if (!keep)
2606		m_free(m);
2607}
2608
2609/*
2610 * Handler for PEER_CLOSE CPL messages.
2611 */
2612static int
2613do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2614{
2615	struct toepcb *toep = (struct toepcb *)ctx;
2616
2617	VALIDATE_SOCK(so);
2618
2619	do_peer_fin(toep, m);
2620	return (0);
2621}
2622
2623static void
2624process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2625{
2626	struct cpl_close_con_rpl *rpl = cplhdr(m);
2627	struct tcpcb *tp = toep->tp_tp;
2628	struct socket *so;
2629	int action = 0;
2630	struct sockbuf *rcv;
2631
2632	inp_wlock(tp->t_inpcb);
2633	so = inp_inpcbtosocket(tp->t_inpcb);
2634
2635	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2636
2637	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2638		inp_wunlock(tp->t_inpcb);
2639		goto out;
2640	}
2641
2642	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2643	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2644
2645	switch (tp->t_state) {
2646	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2647		t3_release_offload_resources(toep);
2648		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2649			action = TCP_CLOSE;
2650
2651		} else {
2652			action = TCP_TIMEWAIT;
2653		}
2654		break;
2655	case TCPS_LAST_ACK:
2656		/*
2657		 * In this state we don't care about pending abort_rpl.
2658		 * If we've sent abort_req it was post-close and was sent too
2659		 * late, this close_con_rpl is the actual last message.
2660		 */
2661		t3_release_offload_resources(toep);
2662		action = TCP_CLOSE;
2663		break;
2664	case TCPS_FIN_WAIT_1:
2665		/*
2666		 * If we can't receive any more
2667		 * data, then closing user can proceed.
2668		 * Starting the timer is contrary to the
2669		 * specification, but if we don't get a FIN
2670		 * we'll hang forever.
2671		 *
2672		 * XXXjl:
2673		 * we should release the tp also, and use a
2674		 * compressed state.
2675		 */
2676		if (so)
2677			rcv = so_sockbuf_rcv(so);
2678		else
2679			break;
2680
2681		if (rcv->sb_state & SBS_CANTRCVMORE) {
2682			int timeout;
2683
2684			if (so)
2685				soisdisconnected(so);
2686			timeout = (tcp_fast_finwait2_recycle) ?
2687			    tcp_finwait2_timeout : tcp_maxidle;
2688			tcp_timer_activate(tp, TT_2MSL, timeout);
2689		}
2690		tp->t_state = TCPS_FIN_WAIT_2;
2691		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2692		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2693			action = TCP_DROP;
2694		}
2695
2696		break;
2697	default:
2698		log(LOG_ERR,
2699		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2700		       toep->tp_toedev->tod_name, toep->tp_tid,
2701		       tp->t_state);
2702	}
2703	inp_wunlock(tp->t_inpcb);
2704
2705
2706	if (action == TCP_TIMEWAIT) {
2707		enter_timewait(tp);
2708	} else if (action == TCP_DROP) {
2709		tcp_offload_drop(tp, 0);
2710	} else if (action == TCP_CLOSE) {
2711		tcp_offload_close(tp);
2712	}
2713out:
2714	m_freem(m);
2715}
2716
2717/*
2718 * Handler for CLOSE_CON_RPL CPL messages.
2719 */
2720static int
2721do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2722			    void *ctx)
2723{
2724	struct toepcb *toep = (struct toepcb *)ctx;
2725
2726	process_close_con_rpl(toep, m);
2727	return (0);
2728}
2729
2730/*
2731 * Process abort replies.  We only process these messages if we anticipate
2732 * them as the coordination between SW and HW in this area is somewhat lacking
2733 * and sometimes we get ABORT_RPLs after we are done with the connection that
2734 * originated the ABORT_REQ.
2735 */
2736static void
2737process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2738{
2739	struct tcpcb *tp = toep->tp_tp;
2740	struct socket *so;
2741	int needclose = 0;
2742
2743#ifdef T3_TRACE
2744	T3_TRACE1(TIDTB(sk),
2745		  "process_abort_rpl: GTS rpl pending %d",
2746		  sock_flag(sk, ABORT_RPL_PENDING));
2747#endif
2748
2749	inp_wlock(tp->t_inpcb);
2750	so = inp_inpcbtosocket(tp->t_inpcb);
2751
2752	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2753		/*
2754		 * XXX panic on tcpdrop
2755		 */
2756		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2757			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2758		else {
2759			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2760			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2761			    !is_t3a(toep->tp_toedev)) {
2762				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2763					panic("TP_ABORT_REQ_RCVD set");
2764				t3_release_offload_resources(toep);
2765				needclose = 1;
2766			}
2767		}
2768	}
2769	inp_wunlock(tp->t_inpcb);
2770
2771	if (needclose)
2772		tcp_offload_close(tp);
2773
2774	m_free(m);
2775}
2776
2777/*
2778 * Handle an ABORT_RPL_RSS CPL message.
2779 */
2780static int
2781do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2782{
2783	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2784	struct toepcb *toep;
2785
2786	/*
2787	 * Ignore replies to post-close aborts indicating that the abort was
2788	 * requested too late.  These connections are terminated when we get
2789	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2790	 * arrives the TID is either no longer used or it has been recycled.
2791	 */
2792	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2793discard:
2794		m_free(m);
2795		return (0);
2796	}
2797
2798	toep = (struct toepcb *)ctx;
2799
2800        /*
2801	 * Sometimes we've already closed the socket, e.g., a post-close
2802	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2803	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2804	 * but FW turns the ABORT_REQ into a regular one and so we get
2805	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2806	 */
2807	if (!toep)
2808		goto discard;
2809
2810	if (toep->tp_tp == NULL) {
2811		log(LOG_NOTICE, "removing tid for abort\n");
2812		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2813		if (toep->tp_l2t)
2814			l2t_release(L2DATA(cdev), toep->tp_l2t);
2815
2816		toepcb_release(toep);
2817		goto discard;
2818	}
2819
2820	log(LOG_NOTICE, "toep=%p\n", toep);
2821	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2822
2823	toepcb_hold(toep);
2824	process_abort_rpl(toep, m);
2825	toepcb_release(toep);
2826	return (0);
2827}
2828
2829/*
2830 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2831 * indicate whether RST should be sent in response.
2832 */
2833static int
2834abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2835{
2836	struct tcpcb *tp = so_sototcpcb(so);
2837
2838	switch (abort_reason) {
2839	case CPL_ERR_BAD_SYN:
2840#if 0
2841		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2842#endif
2843	case CPL_ERR_CONN_RESET:
2844		// XXX need to handle SYN_RECV due to crossed SYNs
2845		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2846	case CPL_ERR_XMIT_TIMEDOUT:
2847	case CPL_ERR_PERSIST_TIMEDOUT:
2848	case CPL_ERR_FINWAIT2_TIMEDOUT:
2849	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2850#if 0
2851		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2852#endif
2853		return (ETIMEDOUT);
2854	default:
2855		return (EIO);
2856	}
2857}
2858
2859static inline void
2860set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2861{
2862	struct cpl_abort_rpl *rpl = cplhdr(m);
2863
2864	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2865	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2866	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2867
2868	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2869	rpl->cmd = cmd;
2870}
2871
2872static void
2873send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2874{
2875	struct mbuf *reply_mbuf;
2876	struct cpl_abort_req_rss *req = cplhdr(m);
2877
2878	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2879	m_set_priority(m, CPL_PRIORITY_DATA);
2880	m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2881	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2882	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2883	m_free(m);
2884}
2885
2886/*
2887 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2888 */
2889static inline int
2890is_neg_adv_abort(unsigned int status)
2891{
2892	return status == CPL_ERR_RTX_NEG_ADVICE ||
2893	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2894}
2895
2896static void
2897send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2898{
2899	struct mbuf  *reply_mbuf;
2900	struct cpl_abort_req_rss *req = cplhdr(m);
2901
2902	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2903
2904	if (!reply_mbuf) {
2905		/* Defer the reply.  Stick rst_status into req->cmd. */
2906		req->status = rst_status;
2907		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2908		return;
2909	}
2910
2911	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2912	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2913	m_free(m);
2914
2915	/*
2916	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2917	 * these messages while ARP is pending.  For other connection states
2918	 * it's not a problem.
2919	 */
2920	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2921}
2922
2923#ifdef notyet
2924static void
2925cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2926{
2927	CXGB_UNIMPLEMENTED();
2928#ifdef notyet
2929	struct request_sock *req = child->sk_user_data;
2930
2931	inet_csk_reqsk_queue_removed(parent, req);
2932	synq_remove(tcp_sk(child));
2933	__reqsk_free(req);
2934	child->sk_user_data = NULL;
2935#endif
2936}
2937
2938
2939/*
2940 * Performs the actual work to abort a SYN_RECV connection.
2941 */
2942static void
2943do_abort_syn_rcv(struct socket *child, struct socket *parent)
2944{
2945	struct tcpcb *parenttp = so_sototcpcb(parent);
2946	struct tcpcb *childtp = so_sototcpcb(child);
2947
2948	/*
2949	 * If the server is still open we clean up the child connection,
2950	 * otherwise the server already did the clean up as it was purging
2951	 * its SYN queue and the skb was just sitting in its backlog.
2952	 */
2953	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2954		cleanup_syn_rcv_conn(child, parent);
2955		inp_wlock(childtp->t_inpcb);
2956		t3_release_offload_resources(childtp->t_toe);
2957		inp_wunlock(childtp->t_inpcb);
2958		tcp_offload_close(childtp);
2959	}
2960}
2961#endif
2962
2963/*
2964 * Handle abort requests for a SYN_RECV connection.  These need extra work
2965 * because the socket is on its parent's SYN queue.
2966 */
2967static int
2968abort_syn_rcv(struct socket *so, struct mbuf *m)
2969{
2970	CXGB_UNIMPLEMENTED();
2971#ifdef notyet
2972	struct socket *parent;
2973	struct toedev *tdev = toep->tp_toedev;
2974	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2975	struct socket *oreq = so->so_incomp;
2976	struct t3c_tid_entry *t3c_stid;
2977	struct tid_info *t;
2978
2979	if (!oreq)
2980		return -1;        /* somehow we are not on the SYN queue */
2981
2982	t = &(T3C_DATA(cdev))->tid_maps;
2983	t3c_stid = lookup_stid(t, oreq->ts_recent);
2984	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2985
2986	so_lock(parent);
2987	do_abort_syn_rcv(so, parent);
2988	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2989	so_unlock(parent);
2990#endif
2991	return (0);
2992}
2993
2994/*
2995 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2996 * request except that we need to reply to it.
2997 */
2998static void
2999process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3000{
3001	int rst_status = CPL_ABORT_NO_RST;
3002	const struct cpl_abort_req_rss *req = cplhdr(m);
3003	struct tcpcb *tp = toep->tp_tp;
3004	struct socket *so;
3005	int needclose = 0;
3006
3007	inp_wlock(tp->t_inpcb);
3008	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3009	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3010		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3011		m_free(m);
3012		goto skip;
3013	}
3014
3015	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3016	/*
3017	 * Three cases to consider:
3018	 * a) We haven't sent an abort_req; close the connection.
3019	 * b) We have sent a post-close abort_req that will get to TP too late
3020	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3021	 *    be ignored and the connection should be closed now.
3022	 * c) We have sent a regular abort_req that will get to TP too late.
3023	 *    That will generate an abort_rpl with status 0, wait for it.
3024	 */
3025	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3026	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3027		int error;
3028
3029		error = abort_status_to_errno(so, req->status,
3030		    &rst_status);
3031		so_error_set(so, error);
3032
3033		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3034			so_sorwakeup(so);
3035		/*
3036		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3037		 * returns 0 is has taken care of the abort.
3038		 */
3039		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3040			goto skip;
3041
3042		t3_release_offload_resources(toep);
3043		needclose = 1;
3044	}
3045	inp_wunlock(tp->t_inpcb);
3046
3047	if (needclose)
3048		tcp_offload_close(tp);
3049
3050	send_abort_rpl(m, tdev, rst_status);
3051	return;
3052skip:
3053	inp_wunlock(tp->t_inpcb);
3054}
3055
3056/*
3057 * Handle an ABORT_REQ_RSS CPL message.
3058 */
3059static int
3060do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3061{
3062	const struct cpl_abort_req_rss *req = cplhdr(m);
3063	struct toepcb *toep = (struct toepcb *)ctx;
3064
3065	if (is_neg_adv_abort(req->status)) {
3066		m_free(m);
3067		return (0);
3068	}
3069
3070	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3071
3072	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3073		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3074		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3075
3076		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3077		if (toep->tp_l2t)
3078			l2t_release(L2DATA(cdev), toep->tp_l2t);
3079
3080		/*
3081		 *  Unhook
3082		 */
3083		toep->tp_tp->t_toe = NULL;
3084		toep->tp_tp->t_flags &= ~TF_TOE;
3085		toep->tp_tp = NULL;
3086		/*
3087		 * XXX need to call syncache_chkrst - but we don't
3088		 * have a way of doing that yet
3089		 */
3090		toepcb_release(toep);
3091		log(LOG_ERR, "abort for unestablished connection :-(\n");
3092		return (0);
3093	}
3094	if (toep->tp_tp == NULL) {
3095		log(LOG_NOTICE, "disconnected toepcb\n");
3096		/* should be freed momentarily */
3097		return (0);
3098	}
3099
3100
3101	toepcb_hold(toep);
3102	process_abort_req(toep, m, toep->tp_toedev);
3103	toepcb_release(toep);
3104	return (0);
3105}
3106#ifdef notyet
3107static void
3108pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3109{
3110	struct toedev *tdev = TOE_DEV(parent);
3111
3112	do_abort_syn_rcv(child, parent);
3113	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3114		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3115
3116		rpl->opt0h = htonl(F_TCAM_BYPASS);
3117		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3118		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3119	} else
3120		m_free(m);
3121}
3122#endif
3123static void
3124handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3125{
3126	CXGB_UNIMPLEMENTED();
3127
3128#ifdef notyet
3129	struct t3cdev *cdev;
3130	struct socket *parent;
3131	struct socket *oreq;
3132	struct t3c_tid_entry *t3c_stid;
3133	struct tid_info *t;
3134	struct tcpcb *otp, *tp = so_sototcpcb(so);
3135	struct toepcb *toep = tp->t_toe;
3136
3137	/*
3138	 * If the connection is being aborted due to the parent listening
3139	 * socket going away there's nothing to do, the ABORT_REQ will close
3140	 * the connection.
3141	 */
3142	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3143		m_free(m);
3144		return;
3145	}
3146
3147	oreq = so->so_incomp;
3148	otp = so_sototcpcb(oreq);
3149
3150	cdev = T3C_DEV(so);
3151	t = &(T3C_DATA(cdev))->tid_maps;
3152	t3c_stid = lookup_stid(t, otp->ts_recent);
3153	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3154
3155	so_lock(parent);
3156	pass_open_abort(so, parent, m);
3157	so_unlock(parent);
3158#endif
3159}
3160
3161/*
3162 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3163 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3164 * connection.
3165 */
3166static void
3167pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3168{
3169
3170#ifdef notyet
3171	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3172	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3173#endif
3174	handle_pass_open_arp_failure(m_get_socket(m), m);
3175}
3176
3177/*
3178 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3179 */
3180static void
3181mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3182{
3183	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3184	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3185	unsigned int tid = GET_TID(req);
3186
3187	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3188	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3189	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3190	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3191	rpl->opt0h = htonl(F_TCAM_BYPASS);
3192	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3193	rpl->opt2 = 0;
3194	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3195}
3196
3197/*
3198 * Send a deferred reject to an accept request.
3199 */
3200static void
3201reject_pass_request(struct toedev *tdev, struct mbuf *m)
3202{
3203	struct mbuf *reply_mbuf;
3204
3205	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3206	mk_pass_accept_rpl(reply_mbuf, m);
3207	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3208	m_free(m);
3209}
3210
3211static void
3212handle_syncache_event(int event, void *arg)
3213{
3214	struct toepcb *toep = arg;
3215
3216	switch (event) {
3217	case TOE_SC_ENTRY_PRESENT:
3218		/*
3219		 * entry already exists - free toepcb
3220		 * and l2t
3221		 */
3222		printf("syncache entry present\n");
3223		toepcb_release(toep);
3224		break;
3225	case TOE_SC_DROP:
3226		/*
3227		 * The syncache has given up on this entry
3228		 * either it timed out, or it was evicted
3229		 * we need to explicitly release the tid
3230		 */
3231		printf("syncache entry dropped\n");
3232		toepcb_release(toep);
3233		break;
3234	default:
3235		log(LOG_ERR, "unknown syncache event %d\n", event);
3236		break;
3237	}
3238}
3239
3240static void
3241syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3242{
3243	struct in_conninfo inc;
3244	struct toeopt toeo;
3245	struct tcphdr th;
3246	struct inpcb *inp;
3247	int mss, wsf, sack, ts;
3248	uint32_t rcv_isn = ntohl(req->rcv_isn);
3249
3250	bzero(&toeo, sizeof(struct toeopt));
3251	inp = so_sotoinpcb(lso);
3252
3253	/*
3254	 * Fill out information for entering us into the syncache
3255	 */
3256	bzero(&inc, sizeof(inc));
3257	inc.inc_fport = th.th_sport = req->peer_port;
3258	inc.inc_lport = th.th_dport = req->local_port;
3259	th.th_seq = req->rcv_isn;
3260	th.th_flags = TH_SYN;
3261
3262	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3263
3264	inc.inc_len = 0;
3265	inc.inc_faddr.s_addr = req->peer_ip;
3266	inc.inc_laddr.s_addr = req->local_ip;
3267
3268	DPRINTF("syncache add of %d:%d %d:%d\n",
3269	    ntohl(req->local_ip), ntohs(req->local_port),
3270	    ntohl(req->peer_ip), ntohs(req->peer_port));
3271
3272	mss = req->tcp_options.mss;
3273	wsf = req->tcp_options.wsf;
3274	ts = req->tcp_options.tstamp;
3275	sack = req->tcp_options.sack;
3276	toeo.to_mss = mss;
3277	toeo.to_wscale = wsf;
3278	toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3279	tcp_offload_syncache_add(&inc, &toeo, &th, inp, &lso, &cxgb_toe_usrreqs,
3280toep);
3281}
3282
3283
3284/*
3285 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3286 * lock held.  Note that the sock here is a listening socket that is not owned
3287 * by the TOE.
3288 */
3289static void
3290process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3291    struct listen_ctx *lctx)
3292{
3293	int rt_flags;
3294	struct l2t_entry *e;
3295	struct iff_mac tim;
3296	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3297	struct cpl_pass_accept_rpl *rpl;
3298	struct cpl_pass_accept_req *req = cplhdr(m);
3299	unsigned int tid = GET_TID(req);
3300	struct tom_data *d = TOM_DATA(tdev);
3301	struct t3cdev *cdev = d->cdev;
3302	struct tcpcb *tp = so_sototcpcb(so);
3303	struct toepcb *newtoep;
3304	struct rtentry *dst;
3305	struct sockaddr_in nam;
3306	struct t3c_data *td = T3C_DATA(cdev);
3307
3308	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3309	if (__predict_false(reply_mbuf == NULL)) {
3310		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3311			t3_defer_reply(m, tdev, reject_pass_request);
3312		else {
3313			cxgb_queue_tid_release(cdev, tid);
3314			m_free(m);
3315		}
3316		DPRINTF("failed to get reply_mbuf\n");
3317
3318		goto out;
3319	}
3320
3321	if (tp->t_state != TCPS_LISTEN) {
3322		DPRINTF("socket not in listen state\n");
3323
3324		goto reject;
3325	}
3326
3327	tim.mac_addr = req->dst_mac;
3328	tim.vlan_tag = ntohs(req->vlan_tag);
3329	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3330		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3331		goto reject;
3332	}
3333
3334#ifdef notyet
3335	/*
3336	 * XXX do route lookup to confirm that we're still listening on this
3337	 * address
3338	 */
3339	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3340			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3341		goto reject;
3342	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3343		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3344	dst_release(skb->dst);	// done with the input route, release it
3345	skb->dst = NULL;
3346
3347	if ((rt_flags & RTF_LOCAL) == 0)
3348		goto reject;
3349#endif
3350	/*
3351	 * XXX
3352	 */
3353	rt_flags = RTF_LOCAL;
3354	if ((rt_flags & RTF_LOCAL) == 0)
3355		goto reject;
3356
3357	/*
3358	 * Calculate values and add to syncache
3359	 */
3360
3361	newtoep = toepcb_alloc();
3362	if (newtoep == NULL)
3363		goto reject;
3364
3365	bzero(&nam, sizeof(struct sockaddr_in));
3366
3367	nam.sin_len = sizeof(struct sockaddr_in);
3368	nam.sin_family = AF_INET;
3369	nam.sin_addr.s_addr =req->peer_ip;
3370	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3371
3372	if (dst == NULL) {
3373		printf("failed to find route\n");
3374		goto reject;
3375	}
3376	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3377	    (struct sockaddr *)&nam);
3378	if (e == NULL) {
3379		DPRINTF("failed to get l2t\n");
3380	}
3381	/*
3382	 * Point to our listen socket until accept
3383	 */
3384	newtoep->tp_tp = tp;
3385	newtoep->tp_flags = TP_SYN_RCVD;
3386	newtoep->tp_tid = tid;
3387	newtoep->tp_toedev = tdev;
3388	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3389
3390	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3391	so_lock(so);
3392	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3393	so_unlock(so);
3394
3395	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3396		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3397
3398	if (newtoep->tp_ulp_mode) {
3399		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3400
3401		if (ddp_mbuf == NULL)
3402			newtoep->tp_ulp_mode = 0;
3403	}
3404
3405	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3406	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3407	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3408	/*
3409	 * XXX workaround for lack of syncache drop
3410	 */
3411	toepcb_hold(newtoep);
3412	syncache_add_accept_req(req, so, newtoep);
3413
3414	rpl = cplhdr(reply_mbuf);
3415	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3416	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3417	rpl->wr.wr_lo = 0;
3418	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3419	rpl->opt2 = htonl(calc_opt2(so, tdev));
3420	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3421	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3422
3423	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3424	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3425	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3426				  CPL_PASS_OPEN_ACCEPT);
3427
3428	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3429
3430	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3431
3432	l2t_send(cdev, reply_mbuf, e);
3433	m_free(m);
3434	if (newtoep->tp_ulp_mode) {
3435		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3436				V_TF_DDP_OFF(1) |
3437				TP_DDP_TIMER_WORKAROUND_MASK,
3438				V_TF_DDP_OFF(1) |
3439		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3440	} else
3441		DPRINTF("no DDP\n");
3442
3443	return;
3444reject:
3445	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3446		mk_pass_accept_rpl(reply_mbuf, m);
3447	else
3448		mk_tid_release(reply_mbuf, newtoep, tid);
3449	cxgb_ofld_send(cdev, reply_mbuf);
3450	m_free(m);
3451out:
3452#if 0
3453	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3454#else
3455	return;
3456#endif
3457}
3458
3459/*
3460 * Handle a CPL_PASS_ACCEPT_REQ message.
3461 */
3462static int
3463do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3464{
3465	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3466	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3467	struct tom_data *d = listen_ctx->tom_data;
3468
3469#if VALIDATE_TID
3470	struct cpl_pass_accept_req *req = cplhdr(m);
3471	unsigned int tid = GET_TID(req);
3472	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3473
3474	if (unlikely(!lsk)) {
3475		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3476		       cdev->name,
3477		       (unsigned long)((union listen_entry *)ctx -
3478					t->stid_tab));
3479		return CPL_RET_BUF_DONE;
3480	}
3481	if (unlikely(tid >= t->ntids)) {
3482		printk(KERN_ERR "%s: passive open TID %u too large\n",
3483		       cdev->name, tid);
3484		return CPL_RET_BUF_DONE;
3485	}
3486	/*
3487	 * For T3A the current user of the TID may have closed but its last
3488	 * message(s) may have been backlogged so the TID appears to be still
3489	 * in use.  Just take the TID away, the connection can close at its
3490	 * own leisure.  For T3B this situation is a bug.
3491	 */
3492	if (!valid_new_tid(t, tid) &&
3493	    cdev->type != T3A) {
3494		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3495		       cdev->name, tid);
3496		return CPL_RET_BUF_DONE;
3497	}
3498#endif
3499
3500	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3501	return (0);
3502}
3503
3504/*
3505 * Called when a connection is established to translate the TCP options
3506 * reported by HW to FreeBSD's native format.
3507 */
3508static void
3509assign_rxopt(struct socket *so, unsigned int opt)
3510{
3511	struct tcpcb *tp = so_sototcpcb(so);
3512	struct toepcb *toep = tp->t_toe;
3513	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3514
3515	inp_lock_assert(tp->t_inpcb);
3516
3517	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3518	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3519	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3520	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3521	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3522	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3523		tp->rcv_scale = tp->request_r_scale;
3524}
3525
3526/*
3527 * Completes some final bits of initialization for just established connections
3528 * and changes their state to TCP_ESTABLISHED.
3529 *
3530 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3531 */
3532static void
3533make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3534{
3535	struct tcpcb *tp = so_sototcpcb(so);
3536	struct toepcb *toep = tp->t_toe;
3537
3538	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3539	assign_rxopt(so, opt);
3540
3541	/*
3542	 *XXXXXXXXXXX
3543	 *
3544	 */
3545#ifdef notyet
3546	so->so_proto->pr_ctloutput = t3_ctloutput;
3547#endif
3548
3549#if 0
3550	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3551#endif
3552	/*
3553	 * XXX not clear what rcv_wup maps to
3554	 */
3555	/*
3556	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3557	 * pass through opt0.
3558	 */
3559	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3560		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3561
3562	dump_toepcb(toep);
3563
3564#ifdef notyet
3565/*
3566 * no clean interface for marking ARP up to date
3567 */
3568	dst_confirm(sk->sk_dst_cache);
3569#endif
3570	tp->t_starttime = ticks;
3571	tp->t_state = TCPS_ESTABLISHED;
3572	soisconnected(so);
3573}
3574
3575static int
3576syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3577{
3578
3579	struct in_conninfo inc;
3580	struct toeopt toeo;
3581	struct tcphdr th;
3582	int mss, wsf, sack, ts;
3583	struct mbuf *m = NULL;
3584	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3585	unsigned int opt;
3586
3587#ifdef MAC
3588#error	"no MAC support"
3589#endif
3590
3591	opt = ntohs(req->tcp_opt);
3592
3593	bzero(&toeo, sizeof(struct toeopt));
3594
3595	/*
3596	 * Fill out information for entering us into the syncache
3597	 */
3598	bzero(&inc, sizeof(inc));
3599	inc.inc_fport = th.th_sport = req->peer_port;
3600	inc.inc_lport = th.th_dport = req->local_port;
3601	th.th_seq = req->rcv_isn;
3602	th.th_flags = TH_ACK;
3603
3604	inc.inc_len = 0;
3605	inc.inc_faddr.s_addr = req->peer_ip;
3606	inc.inc_laddr.s_addr = req->local_ip;
3607
3608	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3609	wsf  = G_TCPOPT_WSCALE_OK(opt);
3610	ts   = G_TCPOPT_TSTAMP(opt);
3611	sack = G_TCPOPT_SACK(opt);
3612
3613	toeo.to_mss = mss;
3614	toeo.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3615	toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3616
3617	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3618	    ntohl(req->local_ip), ntohs(req->local_port),
3619	    ntohl(req->peer_ip), ntohs(req->peer_port),
3620	    mss, wsf, ts, sack);
3621	return tcp_offload_syncache_expand(&inc, &toeo, &th, so, m);
3622}
3623
3624
3625/*
3626 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3627 * if we are in TCP_SYN_RECV due to crossed SYNs
3628 */
3629static int
3630do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3631{
3632	struct cpl_pass_establish *req = cplhdr(m);
3633	struct toepcb *toep = (struct toepcb *)ctx;
3634	struct tcpcb *tp = toep->tp_tp;
3635	struct socket *so, *lso;
3636	struct t3c_data *td = T3C_DATA(cdev);
3637	struct sockbuf *snd, *rcv;
3638
3639	// Complete socket initialization now that we have the SND_ISN
3640
3641	struct toedev *tdev;
3642
3643
3644	tdev = toep->tp_toedev;
3645
3646	inp_wlock(tp->t_inpcb);
3647
3648	/*
3649	 *
3650	 * XXX need to add reference while we're manipulating
3651	 */
3652	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3653
3654	inp_wunlock(tp->t_inpcb);
3655
3656	so_lock(so);
3657	LIST_REMOVE(toep, synq_entry);
3658	so_unlock(so);
3659
3660	if (!syncache_expand_establish_req(req, &so, toep)) {
3661		/*
3662		 * No entry
3663		 */
3664		CXGB_UNIMPLEMENTED();
3665	}
3666	if (so == NULL) {
3667		/*
3668		 * Couldn't create the socket
3669		 */
3670		CXGB_UNIMPLEMENTED();
3671	}
3672
3673	tp = so_sototcpcb(so);
3674	inp_wlock(tp->t_inpcb);
3675
3676	snd = so_sockbuf_snd(so);
3677	rcv = so_sockbuf_rcv(so);
3678
3679	snd->sb_flags |= SB_NOCOALESCE;
3680	rcv->sb_flags |= SB_NOCOALESCE;
3681
3682	toep->tp_tp = tp;
3683	toep->tp_flags = 0;
3684	tp->t_toe = toep;
3685	reset_wr_list(toep);
3686	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3687	tp->rcv_nxt = toep->tp_copied_seq;
3688	install_offload_ops(so);
3689
3690	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3691	toep->tp_wr_unacked = 0;
3692	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3693	toep->tp_qset_idx = 0;
3694	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3695
3696	/*
3697	 * XXX Cancel any keep alive timer
3698	 */
3699
3700	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3701
3702	/*
3703	 * XXX workaround for lack of syncache drop
3704	 */
3705	toepcb_release(toep);
3706	inp_wunlock(tp->t_inpcb);
3707
3708	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3709	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3710#ifdef notyet
3711	/*
3712	 * XXX not sure how these checks map to us
3713	 */
3714	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3715		sk->sk_state_change(sk);
3716		sk_wake_async(so, 0, POLL_OUT);
3717	}
3718	/*
3719	 * The state for the new connection is now up to date.
3720	 * Next check if we should add the connection to the parent's
3721	 * accept queue.  When the parent closes it resets connections
3722	 * on its SYN queue, so check if we are being reset.  If so we
3723	 * don't need to do anything more, the coming ABORT_RPL will
3724	 * destroy this socket.  Otherwise move the connection to the
3725	 * accept queue.
3726	 *
3727	 * Note that we reset the synq before closing the server so if
3728	 * we are not being reset the stid is still open.
3729	 */
3730	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3731		__kfree_skb(skb);
3732		goto unlock;
3733	}
3734#endif
3735	m_free(m);
3736
3737	return (0);
3738}
3739
3740/*
3741 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3742 * and send them to the TOE.
3743 */
3744static void
3745fixup_and_send_ofo(struct toepcb *toep)
3746{
3747	struct mbuf *m;
3748	struct toedev *tdev = toep->tp_toedev;
3749	struct tcpcb *tp = toep->tp_tp;
3750	unsigned int tid = toep->tp_tid;
3751
3752	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3753
3754	inp_lock_assert(tp->t_inpcb);
3755	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3756		/*
3757		 * A variety of messages can be waiting but the fields we'll
3758		 * be touching are common to all so any message type will do.
3759		 */
3760		struct cpl_close_con_req *p = cplhdr(m);
3761
3762		p->wr.wr_lo = htonl(V_WR_TID(tid));
3763		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3764		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3765	}
3766}
3767
3768/*
3769 * Updates socket state from an active establish CPL message.  Runs with the
3770 * socket lock held.
3771 */
3772static void
3773socket_act_establish(struct socket *so, struct mbuf *m)
3774{
3775	struct cpl_act_establish *req = cplhdr(m);
3776	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3777	struct tcpcb *tp = so_sototcpcb(so);
3778	struct toepcb *toep = tp->t_toe;
3779
3780	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3781		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3782		    toep->tp_tid, tp->t_state);
3783
3784	tp->ts_recent_age = ticks;
3785	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3786	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3787
3788	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3789
3790	/*
3791	 * Now that we finally have a TID send any CPL messages that we had to
3792	 * defer for lack of a TID.
3793	 */
3794	if (mbufq_len(&toep->out_of_order_queue))
3795		fixup_and_send_ofo(toep);
3796
3797	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3798		/*
3799		 * XXX does this even make sense?
3800		 */
3801		so_sorwakeup(so);
3802	}
3803	m_free(m);
3804#ifdef notyet
3805/*
3806 * XXX assume no write requests permitted while socket connection is
3807 * incomplete
3808 */
3809	/*
3810	 * Currently the send queue must be empty at this point because the
3811	 * socket layer does not send anything before a connection is
3812	 * established.  To be future proof though we handle the possibility
3813	 * that there are pending buffers to send (either TX_DATA or
3814	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3815	 * buffers according to the just learned write_seq, and then we send
3816	 * them on their way.
3817	 */
3818	fixup_pending_writeq_buffers(sk);
3819	if (t3_push_frames(so, 1))
3820		sk->sk_write_space(sk);
3821#endif
3822
3823	toep->tp_state = tp->t_state;
3824	TCPSTAT_INC(tcps_connects);
3825
3826}
3827
3828/*
3829 * Process a CPL_ACT_ESTABLISH message.
3830 */
3831static int
3832do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3833{
3834	struct cpl_act_establish *req = cplhdr(m);
3835	unsigned int tid = GET_TID(req);
3836	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3837	struct toepcb *toep = (struct toepcb *)ctx;
3838	struct tcpcb *tp = toep->tp_tp;
3839	struct socket *so;
3840	struct toedev *tdev;
3841	struct tom_data *d;
3842
3843	if (tp == NULL) {
3844		free_atid(cdev, atid);
3845		return (0);
3846	}
3847	inp_wlock(tp->t_inpcb);
3848
3849	/*
3850	 * XXX
3851	 */
3852	so = inp_inpcbtosocket(tp->t_inpcb);
3853	tdev = toep->tp_toedev; /* blow up here if link was down */
3854	d = TOM_DATA(tdev);
3855
3856	/*
3857	 * It's OK if the TID is currently in use, the owning socket may have
3858	 * backlogged its last CPL message(s).  Just take it away.
3859	 */
3860	toep->tp_tid = tid;
3861	toep->tp_tp = tp;
3862	so_insert_tid(d, toep, tid);
3863	free_atid(cdev, atid);
3864	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3865
3866	socket_act_establish(so, m);
3867	inp_wunlock(tp->t_inpcb);
3868	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3869	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3870
3871	return (0);
3872}
3873
3874/*
3875 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3876 * next batch of work requests from the write queue.
3877 */
3878static void
3879wr_ack(struct toepcb *toep, struct mbuf *m)
3880{
3881	struct tcpcb *tp = toep->tp_tp;
3882	struct cpl_wr_ack *hdr = cplhdr(m);
3883	struct socket *so;
3884	unsigned int credits = ntohs(hdr->credits);
3885	u32 snd_una = ntohl(hdr->snd_una);
3886	int bytes = 0;
3887	struct sockbuf *snd;
3888
3889	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3890
3891	inp_wlock(tp->t_inpcb);
3892	so = inp_inpcbtosocket(tp->t_inpcb);
3893	toep->tp_wr_avail += credits;
3894	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3895		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3896
3897	while (credits) {
3898		struct mbuf *p = peek_wr(toep);
3899
3900		if (__predict_false(!p)) {
3901			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3902			    "nothing pending, state %u wr_avail=%u\n",
3903			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3904			break;
3905		}
3906		CTR2(KTR_TOM,
3907			"wr_ack: p->credits=%d p->bytes=%d",
3908		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3909		KASSERT(p->m_pkthdr.csum_data != 0,
3910		    ("empty request still on list"));
3911
3912		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3913
3914#if DEBUG_WR > 1
3915			struct tx_data_wr *w = cplhdr(p);
3916			log(LOG_ERR,
3917			       "TID %u got %u WR credits, need %u, len %u, "
3918			       "main body %u, frags %u, seq # %u, ACK una %u,"
3919			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3920			       toep->tp_tid, credits, p->csum, p->len,
3921			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3922			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3923			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3924#endif
3925			p->m_pkthdr.csum_data -= credits;
3926			break;
3927		} else {
3928			dequeue_wr(toep);
3929			credits -= p->m_pkthdr.csum_data;
3930			bytes += p->m_pkthdr.len;
3931			CTR3(KTR_TOM,
3932			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3933			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3934
3935			m_free(p);
3936		}
3937	}
3938
3939#if DEBUG_WR
3940	check_wr_invariants(tp);
3941#endif
3942
3943	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3944#if VALIDATE_SEQ
3945		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3946
3947		log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3948		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3949		    toep->tp_tid, tp->snd_una);
3950#endif
3951		goto out_free;
3952	}
3953
3954	if (tp->snd_una != snd_una) {
3955		tp->snd_una = snd_una;
3956		tp->ts_recent_age = ticks;
3957#ifdef notyet
3958		/*
3959		 * Keep ARP entry "minty fresh"
3960		 */
3961		dst_confirm(sk->sk_dst_cache);
3962#endif
3963		if (tp->snd_una == tp->snd_nxt)
3964			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3965	}
3966
3967	snd = so_sockbuf_snd(so);
3968	if (bytes) {
3969		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3970		snd = so_sockbuf_snd(so);
3971		sockbuf_lock(snd);
3972		sbdrop_locked(snd, bytes);
3973		so_sowwakeup_locked(so);
3974	}
3975
3976	if (snd->sb_sndptroff < snd->sb_cc)
3977		t3_push_frames(so, 0);
3978
3979out_free:
3980	inp_wunlock(tp->t_inpcb);
3981	m_free(m);
3982}
3983
3984/*
3985 * Handler for TX_DATA_ACK CPL messages.
3986 */
3987static int
3988do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3989{
3990	struct toepcb *toep = (struct toepcb *)ctx;
3991
3992	VALIDATE_SOCK(so);
3993
3994	wr_ack(toep, m);
3995	return 0;
3996}
3997
3998/*
3999 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4000 */
4001static int
4002do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4003{
4004	m_freem(m);
4005	return 0;
4006}
4007
4008/*
4009 * Reset a connection that is on a listener's SYN queue or accept queue,
4010 * i.e., one that has not had a struct socket associated with it.
4011 * Must be called from process context.
4012 *
4013 * Modeled after code in inet_csk_listen_stop().
4014 */
4015static void
4016t3_reset_listen_child(struct socket *child)
4017{
4018	struct tcpcb *tp = so_sototcpcb(child);
4019
4020	t3_send_reset(tp->t_toe);
4021}
4022
4023
4024static void
4025t3_child_disconnect(struct socket *so, void *arg)
4026{
4027	struct tcpcb *tp = so_sototcpcb(so);
4028
4029	if (tp->t_flags & TF_TOE) {
4030		inp_wlock(tp->t_inpcb);
4031		t3_reset_listen_child(so);
4032		inp_wunlock(tp->t_inpcb);
4033	}
4034}
4035
4036/*
4037 * Disconnect offloaded established but not yet accepted connections sitting
4038 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4039 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4040 */
4041void
4042t3_disconnect_acceptq(struct socket *listen_so)
4043{
4044
4045	so_lock(listen_so);
4046	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4047	so_unlock(listen_so);
4048}
4049
4050/*
4051 * Reset offloaded connections sitting on a server's syn queue.  As above
4052 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4053 */
4054
4055void
4056t3_reset_synq(struct listen_ctx *lctx)
4057{
4058	struct toepcb *toep;
4059
4060	so_lock(lctx->lso);
4061	while (!LIST_EMPTY(&lctx->synq_head)) {
4062		toep = LIST_FIRST(&lctx->synq_head);
4063		LIST_REMOVE(toep, synq_entry);
4064		toep->tp_tp = NULL;
4065		t3_send_reset(toep);
4066		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4067		toepcb_release(toep);
4068	}
4069	so_unlock(lctx->lso);
4070}
4071
4072
4073int
4074t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4075		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4076		   unsigned int pg_off, unsigned int color)
4077{
4078	unsigned int i, j, pidx;
4079	struct pagepod *p;
4080	struct mbuf *m;
4081	struct ulp_mem_io *req;
4082	unsigned int tid = toep->tp_tid;
4083	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4084	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4085
4086	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4087	    gl, nppods, tag, maxoff, pg_off, color);
4088
4089	for (i = 0; i < nppods; ++i) {
4090		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4091		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4092		req = mtod(m, struct ulp_mem_io *);
4093		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4094		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4095		req->wr.wr_lo = 0;
4096		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4097					   V_ULPTX_CMD(ULP_MEM_WRITE));
4098		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4099				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4100
4101		p = (struct pagepod *)(req + 1);
4102		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4103			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4104			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4105						  V_PPOD_COLOR(color));
4106			p->pp_max_offset = htonl(maxoff);
4107			p->pp_page_offset = htonl(pg_off);
4108			p->pp_rsvd = 0;
4109			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4110				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4111				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4112		} else
4113			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4114		send_or_defer(toep, m, 0);
4115		ppod_addr += PPOD_SIZE;
4116	}
4117	return (0);
4118}
4119
4120/*
4121 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4122 */
4123static inline void
4124mk_cpl_barrier_ulp(struct cpl_barrier *b)
4125{
4126	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4127
4128	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4129	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4130	b->opcode = CPL_BARRIER;
4131}
4132
4133/*
4134 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4135 */
4136static inline void
4137mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4138{
4139	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4140
4141	txpkt = (struct ulp_txpkt *)req;
4142	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4143	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4144	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4145	req->cpuno = htons(cpuno);
4146}
4147
4148/*
4149 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4150 */
4151static inline void
4152mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4153                     unsigned int word, uint64_t mask, uint64_t val)
4154{
4155	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4156
4157	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4158	    tid, word, mask, val);
4159
4160	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4161	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4162	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4163	req->reply = V_NO_REPLY(1);
4164	req->cpu_idx = 0;
4165	req->word = htons(word);
4166	req->mask = htobe64(mask);
4167	req->val = htobe64(val);
4168}
4169
4170/*
4171 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4172 */
4173static void
4174mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4175    unsigned int tid, unsigned int credits)
4176{
4177	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4178
4179	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4180	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4181	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4182	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4183	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4184				 V_RX_CREDITS(credits));
4185}
4186
4187void
4188t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4189{
4190	unsigned int wrlen;
4191	struct mbuf *m;
4192	struct work_request_hdr *wr;
4193	struct cpl_barrier *lock;
4194	struct cpl_set_tcb_field *req;
4195	struct cpl_get_tcb *getreq;
4196	struct ddp_state *p = &toep->tp_ddp_state;
4197
4198#if 0
4199	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4200#endif
4201	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4202		sizeof(*getreq);
4203	m = m_gethdr_nofail(wrlen);
4204	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4205	wr = mtod(m, struct work_request_hdr *);
4206	bzero(wr, wrlen);
4207
4208	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4209	m->m_pkthdr.len = m->m_len = wrlen;
4210
4211	lock = (struct cpl_barrier *)(wr + 1);
4212	mk_cpl_barrier_ulp(lock);
4213
4214	req = (struct cpl_set_tcb_field *)(lock + 1);
4215
4216	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4217
4218	/* Hmmm, not sure if this actually a good thing: reactivating
4219	 * the other buffer might be an issue if it has been completed
4220	 * already. However, that is unlikely, since the fact that the UBUF
4221	 * is not completed indicates that there is no oustanding data.
4222	 */
4223	if (bufidx == 0)
4224		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4225				     V_TF_DDP_ACTIVE_BUF(1) |
4226				     V_TF_DDP_BUF0_VALID(1),
4227				     V_TF_DDP_ACTIVE_BUF(1));
4228	else
4229		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4230				     V_TF_DDP_ACTIVE_BUF(1) |
4231				     V_TF_DDP_BUF1_VALID(1), 0);
4232
4233	getreq = (struct cpl_get_tcb *)(req + 1);
4234	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4235
4236	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4237
4238	/* Keep track of the number of oustanding CPL_GET_TCB requests
4239	 */
4240	p->get_tcb_count++;
4241
4242#ifdef T3_TRACE
4243	T3_TRACE1(TIDTB(so),
4244		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4245#endif
4246	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4247}
4248
4249/**
4250 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4251 * @sk: the socket associated with the buffers
4252 * @bufidx: index of HW DDP buffer (0 or 1)
4253 * @tag0: new tag for HW buffer 0
4254 * @tag1: new tag for HW buffer 1
4255 * @len: new length for HW buf @bufidx
4256 *
4257 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4258 * buffer by changing the buffer tag and length and setting the valid and
4259 * active flag accordingly.  The caller must ensure the new buffer is at
4260 * least as big as the existing one.  Since we typically reprogram both HW
4261 * buffers this function sets both tags for convenience. Read the TCB to
4262 * determine how made data was written into the buffer before the overlay
4263 * took place.
4264 */
4265void
4266t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4267	 	       unsigned int tag1, unsigned int len)
4268{
4269	unsigned int wrlen;
4270	struct mbuf *m;
4271	struct work_request_hdr *wr;
4272	struct cpl_get_tcb *getreq;
4273	struct cpl_set_tcb_field *req;
4274	struct ddp_state *p = &toep->tp_ddp_state;
4275
4276	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4277	    bufidx, tag0, tag1, len);
4278#if 0
4279	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4280#endif
4281	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4282	m = m_gethdr_nofail(wrlen);
4283	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4284	wr = mtod(m, struct work_request_hdr *);
4285	m->m_pkthdr.len = m->m_len = wrlen;
4286	bzero(wr, wrlen);
4287
4288
4289	/* Set the ATOMIC flag to make sure that TP processes the following
4290	 * CPLs in an atomic manner and no wire segments can be interleaved.
4291	 */
4292	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4293	req = (struct cpl_set_tcb_field *)(wr + 1);
4294	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4295			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4296			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4297			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4298			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4299	req++;
4300	if (bufidx == 0) {
4301		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4302			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4303			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4304		req++;
4305		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4306			    V_TF_DDP_PUSH_DISABLE_0(1) |
4307			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4308			    V_TF_DDP_PUSH_DISABLE_0(0) |
4309			    V_TF_DDP_BUF0_VALID(1));
4310	} else {
4311		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4312			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4313			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4314		req++;
4315		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316			    V_TF_DDP_PUSH_DISABLE_1(1) |
4317			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318			    V_TF_DDP_PUSH_DISABLE_1(0) |
4319			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4320	}
4321
4322	getreq = (struct cpl_get_tcb *)(req + 1);
4323	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4324
4325	/* Keep track of the number of oustanding CPL_GET_TCB requests
4326	 */
4327	p->get_tcb_count++;
4328
4329#ifdef T3_TRACE
4330	T3_TRACE4(TIDTB(sk),
4331		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4332		  "len %d",
4333		  bufidx, tag0, tag1, len);
4334#endif
4335	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4336}
4337
4338/*
4339 * Sends a compound WR containing all the CPL messages needed to program the
4340 * two HW DDP buffers, namely optionally setting up the length and offset of
4341 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4342 */
4343void
4344t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4345		      unsigned int len1, unsigned int offset1,
4346                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4347{
4348	unsigned int wrlen;
4349	struct mbuf *m;
4350	struct work_request_hdr *wr;
4351	struct cpl_set_tcb_field *req;
4352
4353	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4354	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4355
4356#if 0
4357	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4358#endif
4359	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4360		(len1 ? sizeof(*req) : 0) +
4361		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4362	m = m_gethdr_nofail(wrlen);
4363	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4364	wr = mtod(m, struct work_request_hdr *);
4365	bzero(wr, wrlen);
4366
4367	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4368	m->m_pkthdr.len = m->m_len = wrlen;
4369
4370	req = (struct cpl_set_tcb_field *)(wr + 1);
4371	if (len0) {                  /* program buffer 0 offset and length */
4372		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4373			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4374			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4375			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4376			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4377		req++;
4378	}
4379	if (len1) {                  /* program buffer 1 offset and length */
4380		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4381			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4382			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4383			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4384			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4385		req++;
4386	}
4387
4388	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4389			     ddp_flags);
4390
4391	if (modulate) {
4392		mk_rx_data_ack_ulp(toep,
4393		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4394		    toep->tp_copied_seq - toep->tp_rcv_wup);
4395		toep->tp_rcv_wup = toep->tp_copied_seq;
4396	}
4397
4398#ifdef T3_TRACE
4399	T3_TRACE5(TIDTB(sk),
4400		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4401		  "modulate %d",
4402		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4403		  modulate);
4404#endif
4405
4406	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4407}
4408
4409void
4410t3_init_wr_tab(unsigned int wr_len)
4411{
4412	int i;
4413
4414	if (mbuf_wrs[1])     /* already initialized */
4415		return;
4416
4417	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4418		int sgl_len = (3 * i) / 2 + (i & 1);
4419
4420		sgl_len += 3;
4421		mbuf_wrs[i] = sgl_len <= wr_len ?
4422		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4423	}
4424
4425	wrlen = wr_len * 8;
4426}
4427
4428int
4429t3_init_cpl_io(void)
4430{
4431#ifdef notyet
4432	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4433	if (!tcphdr_skb) {
4434		log(LOG_ERR,
4435		       "Chelsio TCP offload: can't allocate sk_buff\n");
4436		return -1;
4437	}
4438	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4439	tcphdr_skb->h.raw = tcphdr_skb->data;
4440	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4441#endif
4442
4443	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4444	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4445	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4446	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4447	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4448	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4449	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4450	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4451	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4452	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4453	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4454	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4455	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4456	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4457	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4458	return (0);
4459}
4460
4461