cxgb_cpl_io.c revision 181803
1/**************************************************************************
2
3Copyright (c) 2007-2008, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 181803 2008-08-17 23:27:27Z bz $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sockbuf.h>
47#include <sys/sysctl.h>
48#include <sys/syslog.h>
49#include <sys/protosw.h>
50#include <sys/priv.h>
51#include <sys/vimage.h>
52
53#include <net/if.h>
54#include <net/route.h>
55
56#include <netinet/in.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_systm.h>
59#include <netinet/in_var.h>
60
61
62#include <dev/cxgb/cxgb_osdep.h>
63#include <dev/cxgb/sys/mbufq.h>
64
65#include <netinet/ip.h>
66#include <netinet/tcp_var.h>
67#include <netinet/tcp_fsm.h>
68#include <netinet/tcp_offload.h>
69#include <netinet/tcp_seq.h>
70#include <netinet/tcp_syncache.h>
71#include <netinet/tcp_timer.h>
72#include <net/route.h>
73
74#include <dev/cxgb/t3cdev.h>
75#include <dev/cxgb/common/cxgb_firmware_exports.h>
76#include <dev/cxgb/common/cxgb_t3_cpl.h>
77#include <dev/cxgb/common/cxgb_tcb.h>
78#include <dev/cxgb/common/cxgb_ctl_defs.h>
79#include <dev/cxgb/cxgb_offload.h>
80#include <vm/vm.h>
81#include <vm/pmap.h>
82#include <machine/bus.h>
83#include <dev/cxgb/sys/mvec.h>
84#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
85#include <dev/cxgb/ulp/tom/cxgb_defs.h>
86#include <dev/cxgb/ulp/tom/cxgb_tom.h>
87#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
88#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
89#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
90
91#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
92
93/*
94 * For ULP connections HW may add headers, e.g., for digests, that aren't part
95 * of the messages sent by the host but that are part of the TCP payload and
96 * therefore consume TCP sequence space.  Tx connection parameters that
97 * operate in TCP sequence space are affected by the HW additions and need to
98 * compensate for them to accurately track TCP sequence numbers. This array
99 * contains the compensating extra lengths for ULP packets.  It is indexed by
100 * a packet's ULP submode.
101 */
102const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
103
104#ifdef notyet
105/*
106 * This sk_buff holds a fake header-only TCP segment that we use whenever we
107 * need to exploit SW TCP functionality that expects TCP headers, such as
108 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
109 * CPUs without locking.
110 */
111static struct mbuf *tcphdr_mbuf __read_mostly;
112#endif
113
114/*
115 * Size of WRs in bytes.  Note that we assume all devices we are handling have
116 * the same WR size.
117 */
118static unsigned int wrlen __read_mostly;
119
120/*
121 * The number of WRs needed for an skb depends on the number of page fragments
122 * in the skb and whether it has any payload in its main body.  This maps the
123 * length of the gather list represented by an skb into the # of necessary WRs.
124 */
125static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
126
127/*
128 * Max receive window supported by HW in bytes.  Only a small part of it can
129 * be set through option0, the rest needs to be set through RX_DATA_ACK.
130 */
131#define MAX_RCV_WND ((1U << 27) - 1)
132
133/*
134 * Min receive window.  We want it to be large enough to accommodate receive
135 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
136 */
137#define MIN_RCV_WND (24 * 1024U)
138#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
139
140#define VALIDATE_SEQ 0
141#define VALIDATE_SOCK(so)
142#define DEBUG_WR 0
143
144#define TCP_TIMEWAIT	1
145#define TCP_CLOSE	2
146#define TCP_DROP	3
147
148extern int tcp_do_autorcvbuf;
149extern int tcp_do_autosndbuf;
150extern int tcp_autorcvbuf_max;
151extern int tcp_autosndbuf_max;
152
153static void t3_send_reset(struct toepcb *toep);
154static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
155static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
156static void handle_syncache_event(int event, void *arg);
157
158static inline void
159SBAPPEND(struct sockbuf *sb, struct mbuf *n)
160{
161	struct mbuf *m;
162
163	m = sb->sb_mb;
164	while (m) {
165		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
166		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
167			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
168		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
169			m->m_next, m->m_nextpkt, m->m_flags));
170		m = m->m_next;
171	}
172	m = n;
173	while (m) {
174		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
175		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
176			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
177		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
178			m->m_next, m->m_nextpkt, m->m_flags));
179		m = m->m_next;
180	}
181	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
182	sbappendstream_locked(sb, n);
183	m = sb->sb_mb;
184
185	while (m) {
186		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
187			m->m_next, m->m_nextpkt, m->m_flags));
188		m = m->m_next;
189	}
190}
191
192static inline int
193is_t3a(const struct toedev *dev)
194{
195	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
196}
197
198static void
199dump_toepcb(struct toepcb *toep)
200{
201	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
202	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
203	    toep->tp_mtu_idx, toep->tp_tid);
204
205	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
206	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
207	    toep->tp_mss_clamp, toep->tp_flags);
208}
209
210#ifndef RTALLOC2_DEFINED
211static struct rtentry *
212rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
213{
214	struct rtentry *rt = NULL;
215
216	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
217		RT_UNLOCK(rt);
218
219	return (rt);
220}
221#endif
222
223/*
224 * Determine whether to send a CPL message now or defer it.  A message is
225 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
226 * For connections in other states the message is sent immediately.
227 * If through_l2t is set the message is subject to ARP processing, otherwise
228 * it is sent directly.
229 */
230static inline void
231send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
232{
233	struct tcpcb *tp = toep->tp_tp;
234
235	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
236		inp_wlock(tp->t_inpcb);
237		mbufq_tail(&toep->out_of_order_queue, m);  // defer
238		inp_wunlock(tp->t_inpcb);
239	} else if (through_l2t)
240		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
241	else
242		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
243}
244
245static inline unsigned int
246mkprio(unsigned int cntrl, const struct toepcb *toep)
247{
248        return (cntrl);
249}
250
251/*
252 * Populate a TID_RELEASE WR.  The skb must be already propely sized.
253 */
254static inline void
255mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
256{
257	struct cpl_tid_release *req;
258
259	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
260	m->m_pkthdr.len = m->m_len = sizeof(*req);
261	req = mtod(m, struct cpl_tid_release *);
262	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
263	req->wr.wr_lo = 0;
264	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
265}
266
267static inline void
268make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
269{
270	struct tcpcb *tp = so_sototcpcb(so);
271	struct toepcb *toep = tp->t_toe;
272	struct tx_data_wr *req;
273	struct sockbuf *snd;
274
275	inp_lock_assert(tp->t_inpcb);
276	snd = so_sockbuf_snd(so);
277
278	req = mtod(m, struct tx_data_wr *);
279	m->m_len = sizeof(*req);
280	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
281	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
282	/* len includes the length of any HW ULP additions */
283	req->len = htonl(len);
284	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
285	/* V_TX_ULP_SUBMODE sets both the mode and submode */
286	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
287	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
288	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
289				   (tail ? 0 : 1))));
290	req->sndseq = htonl(tp->snd_nxt);
291	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
292		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
293				    V_TX_CPU_IDX(toep->tp_qset));
294
295		/* Sendbuffer is in units of 32KB.
296		 */
297		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
298			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
299		else {
300			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
301		}
302
303		toep->tp_flags |= TP_DATASENT;
304	}
305}
306
307#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
308
309int
310t3_push_frames(struct socket *so, int req_completion)
311{
312	struct tcpcb *tp = so_sototcpcb(so);
313	struct toepcb *toep = tp->t_toe;
314
315	struct mbuf *tail, *m0, *last;
316	struct t3cdev *cdev;
317	struct tom_data *d;
318	int state, bytes, count, total_bytes;
319	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
320	struct sockbuf *snd;
321
322	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
323		DPRINTF("tcp state=%d\n", tp->t_state);
324		return (0);
325	}
326
327	state = so_state_get(so);
328
329	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
330		DPRINTF("disconnecting\n");
331
332		return (0);
333	}
334
335	inp_lock_assert(tp->t_inpcb);
336
337	snd = so_sockbuf_snd(so);
338	sockbuf_lock(snd);
339
340	d = TOM_DATA(toep->tp_toedev);
341	cdev = d->cdev;
342
343	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
344
345	total_bytes = 0;
346	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
347	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
348
349	if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
350		KASSERT(tail, ("sbdrop error"));
351		last = tail = tail->m_next;
352	}
353
354	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
355		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
356		sockbuf_unlock(snd);
357
358		return (0);
359	}
360
361	toep->tp_m_last = NULL;
362	while (toep->tp_wr_avail && (tail != NULL)) {
363		count = bytes = 0;
364		segp = segs;
365		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
366			sockbuf_unlock(snd);
367			return (0);
368		}
369		/*
370		 * If the data in tail fits as in-line, then
371		 * make an immediate data wr.
372		 */
373		if (tail->m_len <= IMM_LEN) {
374			count = 1;
375			bytes = tail->m_len;
376			last = tail;
377			tail = tail->m_next;
378			m_set_sgl(m0, NULL);
379			m_set_sgllen(m0, 0);
380			make_tx_data_wr(so, m0, bytes, tail);
381			m_append(m0, bytes, mtod(last, caddr_t));
382			KASSERT(!m0->m_next, ("bad append"));
383		} else {
384			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
385			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
386				bytes += tail->m_len;
387				last = tail;
388				count++;
389				/*
390				 * technically an abuse to be using this for a VA
391				 * but less gross than defining my own structure
392				 * or calling pmap_kextract from here :-|
393				 */
394				segp->ds_addr = (bus_addr_t)tail->m_data;
395				segp->ds_len = tail->m_len;
396				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
397				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
398				segp++;
399				tail = tail->m_next;
400			}
401			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
402			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
403
404			m_set_sgl(m0, segs);
405			m_set_sgllen(m0, count);
406			make_tx_data_wr(so, m0, bytes, tail);
407		}
408		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
409
410		if (tail) {
411			snd->sb_sndptr = tail;
412			toep->tp_m_last = NULL;
413		} else
414			toep->tp_m_last = snd->sb_sndptr = last;
415
416
417		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
418
419		snd->sb_sndptroff += bytes;
420		total_bytes += bytes;
421		toep->tp_write_seq += bytes;
422		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
423		    " tail=%p sndptr=%p sndptroff=%d",
424		    toep->tp_wr_avail, count, mbuf_wrs[count],
425		    tail, snd->sb_sndptr, snd->sb_sndptroff);
426		if (tail)
427			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
428			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
429			    total_bytes, toep->tp_m_last, tail->m_data,
430			    tp->snd_una);
431		else
432			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
433			    " tp_m_last=%p snd_una=0x%08x",
434			    total_bytes, toep->tp_m_last, tp->snd_una);
435
436
437#ifdef KTR
438{
439		int i;
440
441		i = 0;
442		while (i < count && m_get_sgllen(m0)) {
443			if ((count - i) >= 3) {
444				CTR6(KTR_TOM,
445				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
446				    " len=%d pa=0x%zx len=%d",
447				    segs[i].ds_addr, segs[i].ds_len,
448				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
449				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
450				    i += 3;
451			} else if ((count - i) == 2) {
452				CTR4(KTR_TOM,
453				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
454				    " len=%d",
455				    segs[i].ds_addr, segs[i].ds_len,
456				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
457				    i += 2;
458			} else {
459				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
460				    segs[i].ds_addr, segs[i].ds_len);
461				i++;
462			}
463
464		}
465}
466#endif
467                 /*
468		 * remember credits used
469		 */
470		m0->m_pkthdr.csum_data = mbuf_wrs[count];
471		m0->m_pkthdr.len = bytes;
472		toep->tp_wr_avail -= mbuf_wrs[count];
473		toep->tp_wr_unacked += mbuf_wrs[count];
474
475		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
476		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
477			struct work_request_hdr *wr = cplhdr(m0);
478
479			wr->wr_hi |= htonl(F_WR_COMPL);
480			toep->tp_wr_unacked = 0;
481		}
482		KASSERT((m0->m_pkthdr.csum_data > 0) &&
483		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
484			m0->m_pkthdr.csum_data));
485		m0->m_type = MT_DONTFREE;
486		enqueue_wr(toep, m0);
487		DPRINTF("sending offload tx with %d bytes in %d segments\n",
488		    bytes, count);
489		l2t_send(cdev, m0, toep->tp_l2t);
490	}
491	sockbuf_unlock(snd);
492	return (total_bytes);
493}
494
495/*
496 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
497 * under any circumstances.  We take the easy way out and always queue the
498 * message to the write_queue.  We can optimize the case where the queue is
499 * already empty though the optimization is probably not worth it.
500 */
501static void
502close_conn(struct socket *so)
503{
504	struct mbuf *m;
505	struct cpl_close_con_req *req;
506	struct tom_data *d;
507	struct inpcb *inp = so_sotoinpcb(so);
508	struct tcpcb *tp;
509	struct toepcb *toep;
510	unsigned int tid;
511
512
513	inp_wlock(inp);
514	tp = so_sototcpcb(so);
515	toep = tp->t_toe;
516
517	if (tp->t_state != TCPS_SYN_SENT)
518		t3_push_frames(so, 1);
519
520	if (toep->tp_flags & TP_FIN_SENT) {
521		inp_wunlock(inp);
522		return;
523	}
524
525	tid = toep->tp_tid;
526
527	d = TOM_DATA(toep->tp_toedev);
528
529	m = m_gethdr_nofail(sizeof(*req));
530	m_set_priority(m, CPL_PRIORITY_DATA);
531	m_set_sgl(m, NULL);
532	m_set_sgllen(m, 0);
533
534	toep->tp_flags |= TP_FIN_SENT;
535	req = mtod(m, struct cpl_close_con_req *);
536
537	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
538	req->wr.wr_lo = htonl(V_WR_TID(tid));
539	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
540	req->rsvd = 0;
541	inp_wunlock(inp);
542	/*
543	 * XXX - need to defer shutdown while there is still data in the queue
544	 *
545	 */
546	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
547	cxgb_ofld_send(d->cdev, m);
548
549}
550
551/*
552 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
553 * and send it along.
554 */
555static void
556abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
557{
558	struct cpl_abort_req *req = cplhdr(m);
559
560	req->cmd = CPL_ABORT_NO_RST;
561	cxgb_ofld_send(cdev, m);
562}
563
564/*
565 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
566 * permitted to return without sending the message in case we cannot allocate
567 * an sk_buff.  Returns the number of credits sent.
568 */
569uint32_t
570t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
571{
572	struct mbuf *m;
573	struct cpl_rx_data_ack *req;
574	struct toepcb *toep = tp->t_toe;
575	struct toedev *tdev = toep->tp_toedev;
576
577	m = m_gethdr_nofail(sizeof(*req));
578
579	DPRINTF("returning %u credits to HW\n", credits);
580
581	req = mtod(m, struct cpl_rx_data_ack *);
582	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
583	req->wr.wr_lo = 0;
584	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
585	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
586	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
587	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
588	return (credits);
589}
590
591/*
592 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
593 * This is only used in DDP mode, so we take the opportunity to also set the
594 * DACK mode and flush any Rx credits.
595 */
596void
597t3_send_rx_modulate(struct toepcb *toep)
598{
599	struct mbuf *m;
600	struct cpl_rx_data_ack *req;
601
602	m = m_gethdr_nofail(sizeof(*req));
603
604	req = mtod(m, struct cpl_rx_data_ack *);
605	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
606	req->wr.wr_lo = 0;
607	m->m_pkthdr.len = m->m_len = sizeof(*req);
608
609	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
610	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
611				 V_RX_DACK_MODE(1) |
612				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
613	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
614	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
615	toep->tp_rcv_wup = toep->tp_copied_seq;
616}
617
618/*
619 * Handle receipt of an urgent pointer.
620 */
621static void
622handle_urg_ptr(struct socket *so, uint32_t urg_seq)
623{
624#ifdef URGENT_DATA_SUPPORTED
625	struct tcpcb *tp = so_sototcpcb(so);
626
627	urg_seq--;   /* initially points past the urgent data, per BSD */
628
629	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
630		return;                                 /* duplicate pointer */
631	sk_send_sigurg(sk);
632	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
633	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
634		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
635
636		tp->copied_seq++;
637		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
638			tom_eat_skb(sk, skb, 0);
639	}
640	tp->urg_data = TCP_URG_NOTYET;
641	tp->urg_seq = urg_seq;
642#endif
643}
644
645/*
646 * Returns true if a socket cannot accept new Rx data.
647 */
648static inline int
649so_no_receive(const struct socket *so)
650{
651	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
652}
653
654/*
655 * Process an urgent data notification.
656 */
657static void
658rx_urg_notify(struct toepcb *toep, struct mbuf *m)
659{
660	struct cpl_rx_urg_notify *hdr = cplhdr(m);
661	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
662
663	VALIDATE_SOCK(so);
664
665	if (!so_no_receive(so))
666		handle_urg_ptr(so, ntohl(hdr->seq));
667
668	m_freem(m);
669}
670
671/*
672 * Handler for RX_URG_NOTIFY CPL messages.
673 */
674static int
675do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
676{
677	struct toepcb *toep = (struct toepcb *)ctx;
678
679	rx_urg_notify(toep, m);
680	return (0);
681}
682
683static __inline int
684is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
685{
686	return (toep->tp_ulp_mode ||
687		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
688		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
689}
690
691/*
692 * Set of states for which we should return RX credits.
693 */
694#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
695
696/*
697 * Called after some received data has been read.  It returns RX credits
698 * to the HW for the amount of data processed.
699 */
700void
701t3_cleanup_rbuf(struct tcpcb *tp, int copied)
702{
703	struct toepcb *toep = tp->t_toe;
704	struct socket *so;
705	struct toedev *dev;
706	int dack_mode, must_send, read;
707	u32 thres, credits, dack = 0;
708	struct sockbuf *rcv;
709
710	so = inp_inpcbtosocket(tp->t_inpcb);
711	rcv = so_sockbuf_rcv(so);
712
713	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
714		(tp->t_state == TCPS_FIN_WAIT_2))) {
715		if (copied) {
716			sockbuf_lock(rcv);
717			toep->tp_copied_seq += copied;
718			sockbuf_unlock(rcv);
719		}
720
721		return;
722	}
723
724	inp_lock_assert(tp->t_inpcb);
725
726	sockbuf_lock(rcv);
727	if (copied)
728		toep->tp_copied_seq += copied;
729	else {
730		read = toep->tp_enqueued_bytes - rcv->sb_cc;
731		toep->tp_copied_seq += read;
732	}
733	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
734	toep->tp_enqueued_bytes = rcv->sb_cc;
735	sockbuf_unlock(rcv);
736
737	if (credits > rcv->sb_mbmax) {
738		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
739		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
740	    credits = rcv->sb_mbmax;
741	}
742
743
744	/*
745	 * XXX this won't accurately reflect credit return - we need
746	 * to look at the difference between the amount that has been
747	 * put in the recv sockbuf and what is there now
748	 */
749
750	if (__predict_false(!credits))
751		return;
752
753	dev = toep->tp_toedev;
754	thres = TOM_TUNABLE(dev, rx_credit_thres);
755
756	if (__predict_false(thres == 0))
757		return;
758
759	if (is_delack_mode_valid(dev, toep)) {
760		dack_mode = TOM_TUNABLE(dev, delack);
761		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
762			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
763
764			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
765				dack = F_RX_DACK_CHANGE |
766				       V_RX_DACK_MODE(dack_mode);
767		}
768	} else
769		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
770
771	/*
772	 * For coalescing to work effectively ensure the receive window has
773	 * at least 16KB left.
774	 */
775	must_send = credits + 16384 >= tp->rcv_wnd;
776
777	if (must_send || credits >= thres)
778		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
779}
780
781static int
782cxgb_toe_disconnect(struct tcpcb *tp)
783{
784	struct socket *so;
785
786	DPRINTF("cxgb_toe_disconnect\n");
787
788	so = inp_inpcbtosocket(tp->t_inpcb);
789	close_conn(so);
790	return (0);
791}
792
793static int
794cxgb_toe_reset(struct tcpcb *tp)
795{
796	struct toepcb *toep = tp->t_toe;
797
798	t3_send_reset(toep);
799
800	/*
801	 * unhook from socket
802	 */
803	tp->t_flags &= ~TF_TOE;
804	toep->tp_tp = NULL;
805	tp->t_toe = NULL;
806	return (0);
807}
808
809static int
810cxgb_toe_send(struct tcpcb *tp)
811{
812	struct socket *so;
813
814	DPRINTF("cxgb_toe_send\n");
815	dump_toepcb(tp->t_toe);
816
817	so = inp_inpcbtosocket(tp->t_inpcb);
818	t3_push_frames(so, 1);
819	return (0);
820}
821
822static int
823cxgb_toe_rcvd(struct tcpcb *tp)
824{
825
826	inp_lock_assert(tp->t_inpcb);
827
828	t3_cleanup_rbuf(tp, 0);
829
830	return (0);
831}
832
833static void
834cxgb_toe_detach(struct tcpcb *tp)
835{
836	struct toepcb *toep;
837
838        /*
839	 * XXX how do we handle teardown in the SYN_SENT state?
840	 *
841	 */
842	inp_lock_assert(tp->t_inpcb);
843	toep = tp->t_toe;
844	toep->tp_tp = NULL;
845
846	/*
847	 * unhook from socket
848	 */
849	tp->t_flags &= ~TF_TOE;
850	tp->t_toe = NULL;
851}
852
853
854static struct toe_usrreqs cxgb_toe_usrreqs = {
855	.tu_disconnect = cxgb_toe_disconnect,
856	.tu_reset = cxgb_toe_reset,
857	.tu_send = cxgb_toe_send,
858	.tu_rcvd = cxgb_toe_rcvd,
859	.tu_detach = cxgb_toe_detach,
860	.tu_detach = cxgb_toe_detach,
861	.tu_syncache_event = handle_syncache_event,
862};
863
864
865static void
866__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
867			    uint64_t mask, uint64_t val, int no_reply)
868{
869	struct cpl_set_tcb_field *req;
870
871	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
872	    toep->tp_tid, word, mask, val);
873
874	req = mtod(m, struct cpl_set_tcb_field *);
875	m->m_pkthdr.len = m->m_len = sizeof(*req);
876	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
877	req->wr.wr_lo = 0;
878	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
879	req->reply = V_NO_REPLY(no_reply);
880	req->cpu_idx = 0;
881	req->word = htons(word);
882	req->mask = htobe64(mask);
883	req->val = htobe64(val);
884
885	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
886	send_or_defer(toep, m, 0);
887}
888
889static void
890t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
891{
892	struct mbuf *m;
893	struct tcpcb *tp = toep->tp_tp;
894
895	if (toep == NULL)
896		return;
897
898	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
899		printf("not seting field\n");
900		return;
901	}
902
903	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
904
905	__set_tcb_field(toep, m, word, mask, val, 1);
906}
907
908/*
909 * Set one of the t_flags bits in the TCB.
910 */
911static void
912set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
913{
914
915	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
916}
917
918/*
919 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
920 */
921static void
922t3_set_nagle(struct toepcb *toep)
923{
924	struct tcpcb *tp = toep->tp_tp;
925
926	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
927}
928
929/*
930 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
931 */
932void
933t3_set_keepalive(struct toepcb *toep, int on_off)
934{
935
936	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
937}
938
939void
940t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
941{
942	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
943}
944
945void
946t3_set_dack_mss(struct toepcb *toep, int on_off)
947{
948
949	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
950}
951
952/*
953 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
954 */
955static void
956t3_set_tos(struct toepcb *toep)
957{
958	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
959
960	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
961			 V_TCB_TOS(tos));
962}
963
964
965/*
966 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
967 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
968 * set the PSH bit in the last segment, which would trigger delivery.]
969 * We work around the issue by setting a DDP buffer in a partial placed state,
970 * which guarantees that TP will schedule a timer.
971 */
972#define TP_DDP_TIMER_WORKAROUND_MASK\
973    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
974     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
975       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
976#define TP_DDP_TIMER_WORKAROUND_VAL\
977    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
978     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
979      32))
980
981static void
982t3_enable_ddp(struct toepcb *toep, int on)
983{
984	if (on) {
985
986		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
987				 V_TF_DDP_OFF(0));
988	} else
989		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
990				 V_TF_DDP_OFF(1) |
991				 TP_DDP_TIMER_WORKAROUND_MASK,
992				 V_TF_DDP_OFF(1) |
993				 TP_DDP_TIMER_WORKAROUND_VAL);
994
995}
996
997void
998t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
999{
1000	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1001			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1002			 tag_color);
1003}
1004
1005void
1006t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1007		    unsigned int len)
1008{
1009	if (buf_idx == 0)
1010		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1011			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1012			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1013			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1014			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1015	else
1016		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1017			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1018			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1019			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1020			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1021}
1022
1023static int
1024t3_set_cong_control(struct socket *so, const char *name)
1025{
1026#ifdef CONGESTION_CONTROL_SUPPORTED
1027	int cong_algo;
1028
1029	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1030		if (!strcmp(name, t3_cong_ops[cong_algo].name))
1031			break;
1032
1033	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1034		return -EINVAL;
1035#endif
1036	return 0;
1037}
1038
1039int
1040t3_get_tcb(struct toepcb *toep)
1041{
1042	struct cpl_get_tcb *req;
1043	struct tcpcb *tp = toep->tp_tp;
1044	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1045
1046	if (!m)
1047		return (ENOMEM);
1048
1049	inp_lock_assert(tp->t_inpcb);
1050	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1051	req = mtod(m, struct cpl_get_tcb *);
1052	m->m_pkthdr.len = m->m_len = sizeof(*req);
1053	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1054	req->wr.wr_lo = 0;
1055	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1056	req->cpuno = htons(toep->tp_qset);
1057	req->rsvd = 0;
1058	if (tp->t_state == TCPS_SYN_SENT)
1059		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1060	else
1061		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1062	return 0;
1063}
1064
1065static inline void
1066so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1067{
1068
1069	toepcb_hold(toep);
1070
1071	cxgb_insert_tid(d->cdev, d->client, toep, tid);
1072}
1073
1074/**
1075 *	find_best_mtu - find the entry in the MTU table closest to an MTU
1076 *	@d: TOM state
1077 *	@mtu: the target MTU
1078 *
1079 *	Returns the index of the value in the MTU table that is closest to but
1080 *	does not exceed the target MTU.
1081 */
1082static unsigned int
1083find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1084{
1085	int i = 0;
1086
1087	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1088		++i;
1089	return (i);
1090}
1091
1092static unsigned int
1093select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1094{
1095	unsigned int idx;
1096
1097#ifdef notyet
1098	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1099#endif
1100	if (tp) {
1101		tp->t_maxseg = pmtu - 40;
1102		if (tp->t_maxseg < td->mtus[0] - 40)
1103			tp->t_maxseg = td->mtus[0] - 40;
1104		idx = find_best_mtu(td, tp->t_maxseg + 40);
1105
1106		tp->t_maxseg = td->mtus[idx] - 40;
1107	} else
1108		idx = find_best_mtu(td, pmtu);
1109
1110	return (idx);
1111}
1112
1113static inline void
1114free_atid(struct t3cdev *cdev, unsigned int tid)
1115{
1116	struct toepcb *toep = cxgb_free_atid(cdev, tid);
1117
1118	if (toep)
1119		toepcb_release(toep);
1120}
1121
1122/*
1123 * Release resources held by an offload connection (TID, L2T entry, etc.)
1124 */
1125static void
1126t3_release_offload_resources(struct toepcb *toep)
1127{
1128	struct tcpcb *tp = toep->tp_tp;
1129	struct toedev *tdev = toep->tp_toedev;
1130	struct t3cdev *cdev;
1131	struct socket *so;
1132	unsigned int tid = toep->tp_tid;
1133	struct sockbuf *rcv;
1134
1135	CTR0(KTR_TOM, "t3_release_offload_resources");
1136
1137	if (!tdev)
1138		return;
1139
1140	cdev = TOEP_T3C_DEV(toep);
1141	if (!cdev)
1142		return;
1143
1144	toep->tp_qset = 0;
1145	t3_release_ddp_resources(toep);
1146
1147#ifdef CTRL_SKB_CACHE
1148	kfree_skb(CTRL_SKB_CACHE(tp));
1149	CTRL_SKB_CACHE(tp) = NULL;
1150#endif
1151
1152	if (toep->tp_wr_avail != toep->tp_wr_max) {
1153		purge_wr_queue(toep);
1154		reset_wr_list(toep);
1155	}
1156
1157	if (toep->tp_l2t) {
1158		l2t_release(L2DATA(cdev), toep->tp_l2t);
1159		toep->tp_l2t = NULL;
1160	}
1161	toep->tp_tp = NULL;
1162	if (tp) {
1163		inp_lock_assert(tp->t_inpcb);
1164		so = inp_inpcbtosocket(tp->t_inpcb);
1165		rcv = so_sockbuf_rcv(so);
1166		/*
1167		 * cancel any offloaded reads
1168		 *
1169		 */
1170		sockbuf_lock(rcv);
1171		tp->t_toe = NULL;
1172		tp->t_flags &= ~TF_TOE;
1173		if (toep->tp_ddp_state.user_ddp_pending) {
1174			t3_cancel_ubuf(toep, rcv);
1175			toep->tp_ddp_state.user_ddp_pending = 0;
1176		}
1177		so_sorwakeup_locked(so);
1178
1179	}
1180
1181	if (toep->tp_state == TCPS_SYN_SENT) {
1182		free_atid(cdev, tid);
1183#ifdef notyet
1184		__skb_queue_purge(&tp->out_of_order_queue);
1185#endif
1186	} else {                                          // we have TID
1187		cxgb_remove_tid(cdev, toep, tid);
1188		toepcb_release(toep);
1189	}
1190#if 0
1191	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1192#endif
1193}
1194
1195static void
1196install_offload_ops(struct socket *so)
1197{
1198	struct tcpcb *tp = so_sototcpcb(so);
1199
1200	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1201
1202	t3_install_socket_ops(so);
1203	tp->t_flags |= TF_TOE;
1204	tp->t_tu = &cxgb_toe_usrreqs;
1205}
1206
1207/*
1208 * Determine the receive window scaling factor given a target max
1209 * receive window.
1210 */
1211static __inline int
1212select_rcv_wscale(int space)
1213{
1214	int wscale = 0;
1215
1216	if (space > MAX_RCV_WND)
1217		space = MAX_RCV_WND;
1218
1219	if (V_tcp_do_rfc1323)
1220		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1221
1222	return (wscale);
1223}
1224
1225/*
1226 * Determine the receive window size for a socket.
1227 */
1228static unsigned long
1229select_rcv_wnd(struct toedev *dev, struct socket *so)
1230{
1231	struct tom_data *d = TOM_DATA(dev);
1232	unsigned int wnd;
1233	unsigned int max_rcv_wnd;
1234	struct sockbuf *rcv;
1235
1236	rcv = so_sockbuf_rcv(so);
1237
1238	if (V_tcp_do_autorcvbuf)
1239		wnd = V_tcp_autorcvbuf_max;
1240	else
1241		wnd = rcv->sb_hiwat;
1242
1243
1244
1245	/* XXX
1246	 * For receive coalescing to work effectively we need a receive window
1247	 * that can accomodate a coalesced segment.
1248	 */
1249	if (wnd < MIN_RCV_WND)
1250		wnd = MIN_RCV_WND;
1251
1252	/* PR 5138 */
1253	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1254				    (uint32_t)d->rx_page_size * 23 :
1255				    MAX_RCV_WND);
1256
1257	return min(wnd, max_rcv_wnd);
1258}
1259
1260/*
1261 * Assign offload parameters to some socket fields.  This code is used by
1262 * both active and passive opens.
1263 */
1264static inline void
1265init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1266    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1267{
1268	struct tcpcb *tp = so_sototcpcb(so);
1269	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1270	struct sockbuf *snd, *rcv;
1271
1272#ifdef notyet
1273	SOCK_LOCK_ASSERT(so);
1274#endif
1275
1276	snd = so_sockbuf_snd(so);
1277	rcv = so_sockbuf_rcv(so);
1278
1279	log(LOG_INFO, "initializing offload socket\n");
1280	/*
1281	 * We either need to fix push frames to work with sbcompress
1282	 * or we need to add this
1283	 */
1284	snd->sb_flags |= SB_NOCOALESCE;
1285	rcv->sb_flags |= SB_NOCOALESCE;
1286
1287	tp->t_toe = toep;
1288	toep->tp_tp = tp;
1289	toep->tp_toedev = dev;
1290
1291	toep->tp_tid = tid;
1292	toep->tp_l2t = e;
1293	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1294	toep->tp_wr_unacked = 0;
1295	toep->tp_delack_mode = 0;
1296
1297	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1298	/*
1299	 * XXX broken
1300	 *
1301	 */
1302	tp->rcv_wnd = select_rcv_wnd(dev, so);
1303
1304        toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1305		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1306	toep->tp_qset_idx = 0;
1307
1308	reset_wr_list(toep);
1309	DPRINTF("initialization done\n");
1310}
1311
1312/*
1313 * The next two functions calculate the option 0 value for a socket.
1314 */
1315static inline unsigned int
1316calc_opt0h(struct socket *so, int mtu_idx)
1317{
1318	struct tcpcb *tp = so_sototcpcb(so);
1319	int wscale = select_rcv_wscale(tp->rcv_wnd);
1320
1321	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1322	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1323	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1324}
1325
1326static inline unsigned int
1327calc_opt0l(struct socket *so, int ulp_mode)
1328{
1329	struct tcpcb *tp = so_sototcpcb(so);
1330	unsigned int val;
1331
1332	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1333	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1334
1335	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1336	return (val);
1337}
1338
1339static inline unsigned int
1340calc_opt2(const struct socket *so, struct toedev *dev)
1341{
1342	int flv_valid;
1343
1344	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1345
1346	return (V_FLAVORS_VALID(flv_valid) |
1347	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1348}
1349
1350#if DEBUG_WR > 1
1351static int
1352count_pending_wrs(const struct toepcb *toep)
1353{
1354	const struct mbuf *m;
1355	int n = 0;
1356
1357	wr_queue_walk(toep, m)
1358		n += m->m_pkthdr.csum_data;
1359	return (n);
1360}
1361#endif
1362
1363#if 0
1364(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1365#endif
1366
1367static void
1368mk_act_open_req(struct socket *so, struct mbuf *m,
1369    unsigned int atid, const struct l2t_entry *e)
1370{
1371	struct cpl_act_open_req *req;
1372	struct inpcb *inp = so_sotoinpcb(so);
1373	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1374	struct toepcb *toep = tp->t_toe;
1375	struct toedev *tdev = toep->tp_toedev;
1376
1377	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1378
1379	req = mtod(m, struct cpl_act_open_req *);
1380	m->m_pkthdr.len = m->m_len = sizeof(*req);
1381
1382	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1383	req->wr.wr_lo = 0;
1384	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1385	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1386#if 0
1387	req->local_port = inp->inp_lport;
1388	req->peer_port = inp->inp_fport;
1389	memcpy(&req->local_ip, &inp->inp_laddr, 4);
1390	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1391#endif
1392	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1393			   V_TX_CHANNEL(e->smt_idx));
1394	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1395	req->params = 0;
1396	req->opt2 = htonl(calc_opt2(so, tdev));
1397}
1398
1399
1400/*
1401 * Convert an ACT_OPEN_RPL status to an errno.
1402 */
1403static int
1404act_open_rpl_status_to_errno(int status)
1405{
1406	switch (status) {
1407	case CPL_ERR_CONN_RESET:
1408		return (ECONNREFUSED);
1409	case CPL_ERR_ARP_MISS:
1410		return (EHOSTUNREACH);
1411	case CPL_ERR_CONN_TIMEDOUT:
1412		return (ETIMEDOUT);
1413	case CPL_ERR_TCAM_FULL:
1414		return (ENOMEM);
1415	case CPL_ERR_CONN_EXIST:
1416		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1417		return (EADDRINUSE);
1418	default:
1419		return (EIO);
1420	}
1421}
1422
1423static void
1424fail_act_open(struct toepcb *toep, int errno)
1425{
1426	struct tcpcb *tp = toep->tp_tp;
1427
1428	t3_release_offload_resources(toep);
1429	if (tp) {
1430		inp_wunlock(tp->t_inpcb);
1431		tcp_offload_drop(tp, errno);
1432	}
1433
1434#ifdef notyet
1435	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1436#endif
1437}
1438
1439/*
1440 * Handle active open failures.
1441 */
1442static void
1443active_open_failed(struct toepcb *toep, struct mbuf *m)
1444{
1445	struct cpl_act_open_rpl *rpl = cplhdr(m);
1446	struct inpcb *inp;
1447
1448	if (toep->tp_tp == NULL)
1449		goto done;
1450
1451	inp = toep->tp_tp->t_inpcb;
1452
1453/*
1454 * Don't handle connection retry for now
1455 */
1456#ifdef notyet
1457	struct inet_connection_sock *icsk = inet_csk(sk);
1458
1459	if (rpl->status == CPL_ERR_CONN_EXIST &&
1460	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1461		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1462		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1463			       jiffies + HZ / 2);
1464	} else
1465#endif
1466	{
1467		inp_wlock(inp);
1468		/*
1469		 * drops the inpcb lock
1470		 */
1471		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1472	}
1473
1474	done:
1475	m_free(m);
1476}
1477
1478/*
1479 * Return whether a failed active open has allocated a TID
1480 */
1481static inline int
1482act_open_has_tid(int status)
1483{
1484	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1485	       status != CPL_ERR_ARP_MISS;
1486}
1487
1488/*
1489 * Process an ACT_OPEN_RPL CPL message.
1490 */
1491static int
1492do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1493{
1494	struct toepcb *toep = (struct toepcb *)ctx;
1495	struct cpl_act_open_rpl *rpl = cplhdr(m);
1496
1497	if (cdev->type != T3A && act_open_has_tid(rpl->status))
1498		cxgb_queue_tid_release(cdev, GET_TID(rpl));
1499
1500	active_open_failed(toep, m);
1501	return (0);
1502}
1503
1504/*
1505 * Handle an ARP failure for an active open.   XXX purge ofo queue
1506 *
1507 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1508 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1509 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
1510 * free the atid.  Hmm.
1511 */
1512#ifdef notyet
1513static void
1514act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1515{
1516	struct toepcb *toep = m_get_toep(m);
1517	struct tcpcb *tp = toep->tp_tp;
1518	struct inpcb *inp = tp->t_inpcb;
1519	struct socket *so;
1520
1521	inp_wlock(inp);
1522	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1523		/*
1524		 * drops the inpcb lock
1525		 */
1526		fail_act_open(so, EHOSTUNREACH);
1527		printf("freeing %p\n", m);
1528
1529		m_free(m);
1530	} else
1531		inp_wunlock(inp);
1532}
1533#endif
1534/*
1535 * Send an active open request.
1536 */
1537int
1538t3_connect(struct toedev *tdev, struct socket *so,
1539    struct rtentry *rt, struct sockaddr *nam)
1540{
1541	struct mbuf *m;
1542	struct l2t_entry *e;
1543	struct tom_data *d = TOM_DATA(tdev);
1544	struct inpcb *inp = so_sotoinpcb(so);
1545	struct tcpcb *tp = intotcpcb(inp);
1546	struct toepcb *toep; /* allocated by init_offload_socket */
1547
1548	int atid;
1549
1550	toep = toepcb_alloc();
1551	if (toep == NULL)
1552		goto out_err;
1553
1554	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1555		goto out_err;
1556
1557	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1558	if (!e)
1559		goto free_tid;
1560
1561	inp_lock_assert(inp);
1562	m = m_gethdr(MT_DATA, M_WAITOK);
1563
1564#if 0
1565	m->m_toe.mt_toepcb = tp->t_toe;
1566	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1567#endif
1568	so_lock(so);
1569
1570	init_offload_socket(so, tdev, atid, e, rt, toep);
1571
1572	install_offload_ops(so);
1573
1574	mk_act_open_req(so, m, atid, e);
1575	so_unlock(so);
1576
1577	soisconnecting(so);
1578	toep = tp->t_toe;
1579	m_set_toep(m, tp->t_toe);
1580
1581	toep->tp_state = TCPS_SYN_SENT;
1582	l2t_send(d->cdev, (struct mbuf *)m, e);
1583
1584	if (toep->tp_ulp_mode)
1585		t3_enable_ddp(toep, 0);
1586	return 	(0);
1587
1588free_tid:
1589	printf("failing connect - free atid\n");
1590
1591	free_atid(d->cdev, atid);
1592out_err:
1593	printf("return ENOMEM\n");
1594       return (ENOMEM);
1595}
1596
1597/*
1598 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1599 * not send multiple ABORT_REQs for the same connection and also that we do
1600 * not try to send a message after the connection has closed.  Returns 1 if
1601 * an ABORT_REQ wasn't generated after all, 0 otherwise.
1602 */
1603static void
1604t3_send_reset(struct toepcb *toep)
1605{
1606
1607	struct cpl_abort_req *req;
1608	unsigned int tid = toep->tp_tid;
1609	int mode = CPL_ABORT_SEND_RST;
1610	struct tcpcb *tp = toep->tp_tp;
1611	struct toedev *tdev = toep->tp_toedev;
1612	struct socket *so = NULL;
1613	struct mbuf *m;
1614	struct sockbuf *snd;
1615
1616	if (tp) {
1617		inp_lock_assert(tp->t_inpcb);
1618		so = inp_inpcbtosocket(tp->t_inpcb);
1619	}
1620
1621	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1622		tdev == NULL))
1623		return;
1624	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1625
1626	snd = so_sockbuf_snd(so);
1627	/* Purge the send queue so we don't send anything after an abort. */
1628	if (so)
1629		sbflush(snd);
1630	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1631		mode |= CPL_ABORT_POST_CLOSE_REQ;
1632
1633	m = m_gethdr_nofail(sizeof(*req));
1634	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1635	set_arp_failure_handler(m, abort_arp_failure);
1636
1637	req = mtod(m, struct cpl_abort_req *);
1638	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1639	req->wr.wr_lo = htonl(V_WR_TID(tid));
1640	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1641	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1642	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1643	req->cmd = mode;
1644	if (tp && (tp->t_state == TCPS_SYN_SENT))
1645		mbufq_tail(&toep->out_of_order_queue, m);	// defer
1646	else
1647		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1648}
1649
1650static int
1651t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1652{
1653	struct inpcb *inp;
1654	int error, optval;
1655
1656	if (sopt->sopt_name == IP_OPTIONS)
1657		return (ENOPROTOOPT);
1658
1659	if (sopt->sopt_name != IP_TOS)
1660		return (EOPNOTSUPP);
1661
1662	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1663
1664	if (error)
1665		return (error);
1666
1667	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1668		return (EPERM);
1669
1670	inp = so_sotoinpcb(so);
1671	inp_wlock(inp);
1672	inp_ip_tos_set(inp, optval);
1673#if 0
1674	inp->inp_ip_tos = optval;
1675#endif
1676	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1677	inp_wunlock(inp);
1678
1679	return (0);
1680}
1681
1682static int
1683t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1684{
1685	int err = 0;
1686	size_t copied;
1687
1688	if (sopt->sopt_name != TCP_CONGESTION &&
1689	    sopt->sopt_name != TCP_NODELAY)
1690		return (EOPNOTSUPP);
1691
1692	if (sopt->sopt_name == TCP_CONGESTION) {
1693		char name[TCP_CA_NAME_MAX];
1694		int optlen = sopt->sopt_valsize;
1695		struct tcpcb *tp;
1696
1697		if (sopt->sopt_dir == SOPT_GET) {
1698			KASSERT(0, ("unimplemented"));
1699			return (EOPNOTSUPP);
1700		}
1701
1702		if (optlen < 1)
1703			return (EINVAL);
1704
1705		err = copyinstr(sopt->sopt_val, name,
1706		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1707		if (err)
1708			return (err);
1709		if (copied < 1)
1710			return (EINVAL);
1711
1712		tp = so_sototcpcb(so);
1713		/*
1714		 * XXX I need to revisit this
1715		 */
1716		if ((err = t3_set_cong_control(so, name)) == 0) {
1717#ifdef CONGESTION_CONTROL_SUPPORTED
1718			tp->t_cong_control = strdup(name, M_CXGB);
1719#endif
1720		} else
1721			return (err);
1722	} else {
1723		int optval, oldval;
1724		struct inpcb *inp;
1725		struct tcpcb *tp;
1726
1727		if (sopt->sopt_dir == SOPT_GET)
1728			return (EOPNOTSUPP);
1729
1730		err = sooptcopyin(sopt, &optval, sizeof optval,
1731		    sizeof optval);
1732
1733		if (err)
1734			return (err);
1735
1736		inp = so_sotoinpcb(so);
1737		tp = inp_inpcbtotcpcb(inp);
1738
1739		inp_wlock(inp);
1740
1741		oldval = tp->t_flags;
1742		if (optval)
1743			tp->t_flags |= TF_NODELAY;
1744		else
1745			tp->t_flags &= ~TF_NODELAY;
1746		inp_wunlock(inp);
1747
1748
1749		if (oldval != tp->t_flags && (tp->t_toe != NULL))
1750			t3_set_nagle(tp->t_toe);
1751
1752	}
1753
1754	return (0);
1755}
1756
1757int
1758t3_ctloutput(struct socket *so, struct sockopt *sopt)
1759{
1760	int err;
1761
1762	if (sopt->sopt_level != IPPROTO_TCP)
1763		err =  t3_ip_ctloutput(so, sopt);
1764	else
1765		err = t3_tcp_ctloutput(so, sopt);
1766
1767	if (err != EOPNOTSUPP)
1768		return (err);
1769
1770	return (tcp_ctloutput(so, sopt));
1771}
1772
1773/*
1774 * Returns true if we need to explicitly request RST when we receive new data
1775 * on an RX-closed connection.
1776 */
1777static inline int
1778need_rst_on_excess_rx(const struct toepcb *toep)
1779{
1780	return (1);
1781}
1782
1783/*
1784 * Handles Rx data that arrives in a state where the socket isn't accepting
1785 * new data.
1786 */
1787static void
1788handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1789{
1790
1791	if (need_rst_on_excess_rx(toep) &&
1792	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1793		t3_send_reset(toep);
1794	m_freem(m);
1795}
1796
1797/*
1798 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1799 * by getting the DDP offset from the TCB.
1800 */
1801static void
1802tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1803{
1804	struct ddp_state *q = &toep->tp_ddp_state;
1805	struct ddp_buf_state *bsp;
1806	struct cpl_get_tcb_rpl *hdr;
1807	unsigned int ddp_offset;
1808	struct socket *so;
1809	struct tcpcb *tp;
1810	struct sockbuf *rcv;
1811	int state;
1812
1813	uint64_t t;
1814	__be64 *tcb;
1815
1816	tp = toep->tp_tp;
1817	so = inp_inpcbtosocket(tp->t_inpcb);
1818
1819	inp_lock_assert(tp->t_inpcb);
1820	rcv = so_sockbuf_rcv(so);
1821	sockbuf_lock(rcv);
1822
1823	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1824	 * We really need a cookie in order to dispatch the RPLs.
1825	 */
1826	q->get_tcb_count--;
1827
1828	/* It is a possible that a previous CPL already invalidated UBUF DDP
1829	 * and moved the cur_buf idx and hence no further processing of this
1830	 * skb is required. However, the app might be sleeping on
1831	 * !q->get_tcb_count and we need to wake it up.
1832	 */
1833	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1834		int state = so_state_get(so);
1835
1836		m_freem(m);
1837		if (__predict_true((state & SS_NOFDREF) == 0))
1838			so_sorwakeup_locked(so);
1839		else
1840			sockbuf_unlock(rcv);
1841
1842		return;
1843	}
1844
1845	bsp = &q->buf_state[q->cur_buf];
1846	hdr = cplhdr(m);
1847	tcb = (__be64 *)(hdr + 1);
1848	if (q->cur_buf == 0) {
1849		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1850		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1851	} else {
1852		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1853		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1854	}
1855	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1856	m->m_cur_offset = bsp->cur_offset;
1857	bsp->cur_offset = ddp_offset;
1858	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1859
1860	CTR5(KTR_TOM,
1861	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1862	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1863	KASSERT(ddp_offset >= m->m_cur_offset,
1864	    ("ddp_offset=%u less than cur_offset=%u",
1865		ddp_offset, m->m_cur_offset));
1866
1867#if 0
1868{
1869	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1870
1871	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1872	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1873
1874        t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1875        rcv_nxt = t >> S_TCB_RCV_NXT;
1876        rcv_nxt &= M_TCB_RCV_NXT;
1877
1878        t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1879        rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1880        rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1881
1882	T3_TRACE2(TIDTB(sk),
1883		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1884		  ddp_flags, rcv_nxt - rx_hdr_offset);
1885	T3_TRACE4(TB(q),
1886		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1887		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1888	T3_TRACE3(TB(q),
1889		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1890		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1891	T3_TRACE2(TB(q),
1892		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1893		 q->buf_state[0].flags, q->buf_state[1].flags);
1894
1895}
1896#endif
1897	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1898		handle_excess_rx(toep, m);
1899		return;
1900	}
1901
1902#ifdef T3_TRACE
1903	if ((int)m->m_pkthdr.len < 0) {
1904		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1905	}
1906#endif
1907	if (bsp->flags & DDP_BF_NOCOPY) {
1908#ifdef T3_TRACE
1909		T3_TRACE0(TB(q),
1910			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1911
1912		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1913			printk("!cancel_ubuf");
1914			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1915		}
1916#endif
1917		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1918		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1919		q->cur_buf ^= 1;
1920	} else if (bsp->flags & DDP_BF_NOFLIP) {
1921
1922		m->m_ddp_flags = 1;    /* always a kernel buffer */
1923
1924		/* now HW buffer carries a user buffer */
1925		bsp->flags &= ~DDP_BF_NOFLIP;
1926		bsp->flags |= DDP_BF_NOCOPY;
1927
1928		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1929		 * any new data in which case we're done. If in addition the
1930		 * offset is 0, then there wasn't a completion for the kbuf
1931		 * and we need to decrement the posted count.
1932		 */
1933		if (m->m_pkthdr.len == 0) {
1934			if (ddp_offset == 0) {
1935				q->kbuf_posted--;
1936				bsp->flags |= DDP_BF_NODATA;
1937			}
1938			sockbuf_unlock(rcv);
1939			m_free(m);
1940			return;
1941		}
1942	} else {
1943		sockbuf_unlock(rcv);
1944
1945		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1946		 * but it got here way late and nobody cares anymore.
1947		 */
1948		m_free(m);
1949		return;
1950	}
1951
1952	m->m_ddp_gl = (unsigned char *)bsp->gl;
1953	m->m_flags |= M_DDP;
1954	m->m_seq = tp->rcv_nxt;
1955	tp->rcv_nxt += m->m_pkthdr.len;
1956	tp->t_rcvtime = ticks;
1957	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1958		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
1959	if (m->m_pkthdr.len == 0) {
1960		q->user_ddp_pending = 0;
1961		m_free(m);
1962	} else
1963		SBAPPEND(rcv, m);
1964
1965	state = so_state_get(so);
1966	if (__predict_true((state & SS_NOFDREF) == 0))
1967		so_sorwakeup_locked(so);
1968	else
1969		sockbuf_unlock(rcv);
1970}
1971
1972/*
1973 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1974 * in that case they are similar to DDP completions.
1975 */
1976static int
1977do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1978{
1979	struct toepcb *toep = (struct toepcb *)ctx;
1980
1981	/* OK if socket doesn't exist */
1982	if (toep == NULL) {
1983		printf("null toep in do_get_tcb_rpl\n");
1984		return (CPL_RET_BUF_DONE);
1985	}
1986
1987	inp_wlock(toep->tp_tp->t_inpcb);
1988	tcb_rpl_as_ddp_complete(toep, m);
1989	inp_wunlock(toep->tp_tp->t_inpcb);
1990
1991	return (0);
1992}
1993
1994static void
1995handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1996{
1997	struct tcpcb *tp = toep->tp_tp;
1998	struct socket *so;
1999	struct ddp_state *q;
2000	struct ddp_buf_state *bsp;
2001	struct cpl_rx_data *hdr = cplhdr(m);
2002	unsigned int rcv_nxt = ntohl(hdr->seq);
2003	struct sockbuf *rcv;
2004
2005	if (tp->rcv_nxt == rcv_nxt)
2006		return;
2007
2008	inp_lock_assert(tp->t_inpcb);
2009	so  = inp_inpcbtosocket(tp->t_inpcb);
2010	rcv = so_sockbuf_rcv(so);
2011	sockbuf_lock(rcv);
2012
2013	q = &toep->tp_ddp_state;
2014	bsp = &q->buf_state[q->cur_buf];
2015	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2016		rcv_nxt, tp->rcv_nxt));
2017	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2018	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2019	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2020	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2021
2022#ifdef T3_TRACE
2023	if ((int)m->m_pkthdr.len < 0) {
2024		t3_ddp_error(so, "handle_ddp_data: neg len");
2025	}
2026#endif
2027	m->m_ddp_gl = (unsigned char *)bsp->gl;
2028	m->m_flags |= M_DDP;
2029	m->m_cur_offset = bsp->cur_offset;
2030	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2031	if (bsp->flags & DDP_BF_NOCOPY)
2032		bsp->flags &= ~DDP_BF_NOCOPY;
2033
2034	m->m_seq = tp->rcv_nxt;
2035	tp->rcv_nxt = rcv_nxt;
2036	bsp->cur_offset += m->m_pkthdr.len;
2037	if (!(bsp->flags & DDP_BF_NOFLIP))
2038		q->cur_buf ^= 1;
2039	/*
2040	 * For now, don't re-enable DDP after a connection fell out of  DDP
2041	 * mode.
2042	 */
2043	q->ubuf_ddp_ready = 0;
2044	sockbuf_unlock(rcv);
2045}
2046
2047/*
2048 * Process new data received for a connection.
2049 */
2050static void
2051new_rx_data(struct toepcb *toep, struct mbuf *m)
2052{
2053	struct cpl_rx_data *hdr = cplhdr(m);
2054	struct tcpcb *tp = toep->tp_tp;
2055	struct socket *so;
2056	struct sockbuf *rcv;
2057	int state;
2058	int len = be16toh(hdr->len);
2059
2060	inp_wlock(tp->t_inpcb);
2061
2062	so  = inp_inpcbtosocket(tp->t_inpcb);
2063
2064	if (__predict_false(so_no_receive(so))) {
2065		handle_excess_rx(toep, m);
2066		inp_wunlock(tp->t_inpcb);
2067		TRACE_EXIT;
2068		return;
2069	}
2070
2071	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2072		handle_ddp_data(toep, m);
2073
2074	m->m_seq = ntohl(hdr->seq);
2075	m->m_ulp_mode = 0;                    /* for iSCSI */
2076
2077#if VALIDATE_SEQ
2078	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2079		log(LOG_ERR,
2080		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2081		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2082		       tp->rcv_nxt);
2083		m_freem(m);
2084		inp_wunlock(tp->t_inpcb);
2085		return;
2086	}
2087#endif
2088	m_adj(m, sizeof(*hdr));
2089
2090#ifdef URGENT_DATA_SUPPORTED
2091	/*
2092	 * We don't handle urgent data yet
2093	 */
2094	if (__predict_false(hdr->urg))
2095		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2096	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2097		     tp->urg_seq - tp->rcv_nxt < skb->len))
2098		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2099							 tp->rcv_nxt];
2100#endif
2101	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2102		toep->tp_delack_mode = hdr->dack_mode;
2103		toep->tp_delack_seq = tp->rcv_nxt;
2104	}
2105	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2106	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2107
2108	if (len < m->m_pkthdr.len)
2109		m->m_pkthdr.len = m->m_len = len;
2110
2111	tp->rcv_nxt += m->m_pkthdr.len;
2112	tp->t_rcvtime = ticks;
2113	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2114	CTR2(KTR_TOM,
2115	    "new_rx_data: seq 0x%x len %u",
2116	    m->m_seq, m->m_pkthdr.len);
2117	inp_wunlock(tp->t_inpcb);
2118	rcv = so_sockbuf_rcv(so);
2119	sockbuf_lock(rcv);
2120#if 0
2121	if (sb_notify(rcv))
2122		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2123#endif
2124	SBAPPEND(rcv, m);
2125
2126#ifdef notyet
2127	/*
2128	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2129	 *
2130	 */
2131	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2132
2133	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2134		so, rcv->sb_cc, rcv->sb_mbmax));
2135#endif
2136
2137
2138	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2139	    rcv->sb_cc, rcv->sb_mbcnt);
2140
2141	state = so_state_get(so);
2142	if (__predict_true((state & SS_NOFDREF) == 0))
2143		so_sorwakeup_locked(so);
2144	else
2145		sockbuf_unlock(rcv);
2146}
2147
2148/*
2149 * Handler for RX_DATA CPL messages.
2150 */
2151static int
2152do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2153{
2154	struct toepcb *toep = (struct toepcb *)ctx;
2155
2156	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2157
2158	new_rx_data(toep, m);
2159
2160	return (0);
2161}
2162
2163static void
2164new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2165{
2166	struct tcpcb *tp;
2167	struct ddp_state *q;
2168	struct ddp_buf_state *bsp;
2169	struct cpl_rx_data_ddp *hdr;
2170	struct socket *so;
2171	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2172	int nomoredata = 0;
2173	unsigned int delack_mode;
2174	struct sockbuf *rcv;
2175
2176	tp = toep->tp_tp;
2177	inp_wlock(tp->t_inpcb);
2178	so = inp_inpcbtosocket(tp->t_inpcb);
2179
2180	if (__predict_false(so_no_receive(so))) {
2181
2182		handle_excess_rx(toep, m);
2183		inp_wunlock(tp->t_inpcb);
2184		return;
2185	}
2186
2187	q = &toep->tp_ddp_state;
2188	hdr = cplhdr(m);
2189	ddp_report = ntohl(hdr->u.ddp_report);
2190	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2191	bsp = &q->buf_state[buf_idx];
2192
2193	CTR4(KTR_TOM,
2194	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2195	    "hdr seq 0x%x len %u",
2196	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2197	    ntohs(hdr->len));
2198	CTR3(KTR_TOM,
2199	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2200	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2201
2202	ddp_len = ntohs(hdr->len);
2203	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2204
2205	delack_mode = G_DDP_DACK_MODE(ddp_report);
2206	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2207		toep->tp_delack_mode = delack_mode;
2208		toep->tp_delack_seq = tp->rcv_nxt;
2209	}
2210
2211	m->m_seq = tp->rcv_nxt;
2212	tp->rcv_nxt = rcv_nxt;
2213
2214	tp->t_rcvtime = ticks;
2215	/*
2216	 * Store the length in m->m_len.  We are changing the meaning of
2217	 * m->m_len here, we need to be very careful that nothing from now on
2218	 * interprets ->len of this packet the usual way.
2219	 */
2220	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2221	inp_wunlock(tp->t_inpcb);
2222	CTR3(KTR_TOM,
2223	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2224	    m->m_len, rcv_nxt, m->m_seq);
2225	/*
2226	 * Figure out where the new data was placed in the buffer and store it
2227	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
2228	 * account for page pod's pg_offset.
2229	 */
2230	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2231	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2232
2233	rcv = so_sockbuf_rcv(so);
2234	sockbuf_lock(rcv);
2235
2236	m->m_ddp_gl = (unsigned char *)bsp->gl;
2237	m->m_flags |= M_DDP;
2238	bsp->cur_offset = end_offset;
2239	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2240
2241	/*
2242	 * Length is only meaningful for kbuf
2243	 */
2244	if (!(bsp->flags & DDP_BF_NOCOPY))
2245		KASSERT(m->m_len <= bsp->gl->dgl_length,
2246		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2247			m->m_len, bsp->gl->dgl_length));
2248
2249	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2250	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2251        /*
2252	 * Bit 0 of flags stores whether the DDP buffer is completed.
2253	 * Note that other parts of the code depend on this being in bit 0.
2254	 */
2255	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2256		panic("spurious ddp completion");
2257	} else {
2258		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2259		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2260			q->cur_buf ^= 1;                     /* flip buffers */
2261	}
2262
2263	if (bsp->flags & DDP_BF_NOCOPY) {
2264		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2265		bsp->flags &= ~DDP_BF_NOCOPY;
2266	}
2267
2268	if (ddp_report & F_DDP_PSH)
2269		m->m_ddp_flags |= DDP_BF_PSH;
2270	if (nomoredata)
2271		m->m_ddp_flags |= DDP_BF_NODATA;
2272
2273#ifdef notyet
2274	skb_reset_transport_header(skb);
2275	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2276#endif
2277	SBAPPEND(rcv, m);
2278
2279	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2280	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2281		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2282		so_sorwakeup_locked(so);
2283	else
2284		sockbuf_unlock(rcv);
2285}
2286
2287#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2288		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2289		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2290		 F_DDP_INVALID_PPOD)
2291
2292/*
2293 * Handler for RX_DATA_DDP CPL messages.
2294 */
2295static int
2296do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2297{
2298	struct toepcb *toep = ctx;
2299	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2300
2301	VALIDATE_SOCK(so);
2302
2303	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2304		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2305		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2306		return (CPL_RET_BUF_DONE);
2307	}
2308#if 0
2309	skb->h.th = tcphdr_skb->h.th;
2310#endif
2311	new_rx_data_ddp(toep, m);
2312	return (0);
2313}
2314
2315static void
2316process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2317{
2318	struct tcpcb *tp = toep->tp_tp;
2319	struct socket *so;
2320	struct ddp_state *q;
2321	struct ddp_buf_state *bsp;
2322	struct cpl_rx_ddp_complete *hdr;
2323	unsigned int ddp_report, buf_idx, when, delack_mode;
2324	int nomoredata = 0;
2325	struct sockbuf *rcv;
2326
2327	inp_wlock(tp->t_inpcb);
2328	so = inp_inpcbtosocket(tp->t_inpcb);
2329
2330	if (__predict_false(so_no_receive(so))) {
2331		struct inpcb *inp = so_sotoinpcb(so);
2332
2333		handle_excess_rx(toep, m);
2334		inp_wunlock(inp);
2335		return;
2336	}
2337	q = &toep->tp_ddp_state;
2338	hdr = cplhdr(m);
2339	ddp_report = ntohl(hdr->ddp_report);
2340	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2341	m->m_pkthdr.csum_data = tp->rcv_nxt;
2342
2343	rcv = so_sockbuf_rcv(so);
2344	sockbuf_lock(rcv);
2345
2346	bsp = &q->buf_state[buf_idx];
2347	when = bsp->cur_offset;
2348	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2349	tp->rcv_nxt += m->m_len;
2350	tp->t_rcvtime = ticks;
2351
2352	delack_mode = G_DDP_DACK_MODE(ddp_report);
2353	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2354		toep->tp_delack_mode = delack_mode;
2355		toep->tp_delack_seq = tp->rcv_nxt;
2356	}
2357#ifdef notyet
2358	skb_reset_transport_header(skb);
2359	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2360#endif
2361	inp_wunlock(tp->t_inpcb);
2362
2363	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2364	CTR5(KTR_TOM,
2365		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2366		  "ddp_report 0x%x offset %u, len %u",
2367		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2368		   G_DDP_OFFSET(ddp_report), m->m_len);
2369
2370	m->m_cur_offset = bsp->cur_offset;
2371	bsp->cur_offset += m->m_len;
2372
2373	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2374		q->cur_buf ^= 1;                     /* flip buffers */
2375		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2376			nomoredata=1;
2377	}
2378
2379	CTR4(KTR_TOM,
2380		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2381		  "ddp_report %u offset %u",
2382		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2383		   G_DDP_OFFSET(ddp_report));
2384
2385	m->m_ddp_gl = (unsigned char *)bsp->gl;
2386	m->m_flags |= M_DDP;
2387	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2388	if (bsp->flags & DDP_BF_NOCOPY)
2389		bsp->flags &= ~DDP_BF_NOCOPY;
2390	if (nomoredata)
2391		m->m_ddp_flags |= DDP_BF_NODATA;
2392
2393	SBAPPEND(rcv, m);
2394	if ((so_state_get(so) & SS_NOFDREF) == 0)
2395		so_sorwakeup_locked(so);
2396	else
2397		sockbuf_unlock(rcv);
2398}
2399
2400/*
2401 * Handler for RX_DDP_COMPLETE CPL messages.
2402 */
2403static int
2404do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2405{
2406	struct toepcb *toep = ctx;
2407
2408	VALIDATE_SOCK(so);
2409#if 0
2410	skb->h.th = tcphdr_skb->h.th;
2411#endif
2412	process_ddp_complete(toep, m);
2413	return (0);
2414}
2415
2416/*
2417 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2418 * socket state before calling tcp_time_wait to comply with its expectations.
2419 */
2420static void
2421enter_timewait(struct tcpcb *tp)
2422{
2423	/*
2424	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2425	 * process peer_close because we don't want to carry the peer FIN in
2426	 * the socket's receive queue and if we increment rcv_nxt without
2427	 * having the FIN in the receive queue we'll confuse facilities such
2428	 * as SIOCINQ.
2429	 */
2430	inp_wlock(tp->t_inpcb);
2431	tp->rcv_nxt++;
2432
2433	tp->ts_recent_age = 0;	     /* defeat recycling */
2434	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2435	inp_wunlock(tp->t_inpcb);
2436	tcp_offload_twstart(tp);
2437}
2438
2439/*
2440 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2441 * function deals with the data that may be reported along with the FIN.
2442 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2443 * perform normal FIN-related processing.  In the latter case 1 indicates that
2444 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2445 * skb can be freed.
2446 */
2447static int
2448handle_peer_close_data(struct socket *so, struct mbuf *m)
2449{
2450	struct tcpcb *tp = so_sototcpcb(so);
2451	struct toepcb *toep = tp->t_toe;
2452	struct ddp_state *q;
2453	struct ddp_buf_state *bsp;
2454	struct cpl_peer_close *req = cplhdr(m);
2455	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2456	struct sockbuf *rcv;
2457
2458	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2459		return (0);
2460
2461	CTR0(KTR_TOM, "handle_peer_close_data");
2462	if (__predict_false(so_no_receive(so))) {
2463		handle_excess_rx(toep, m);
2464
2465		/*
2466		 * Although we discard the data we want to process the FIN so
2467		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2468		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2469		 * may be what will close the connection.  We return 1 because
2470		 * handle_excess_rx() already freed the packet.
2471		 */
2472		return (1);
2473	}
2474
2475	inp_lock_assert(tp->t_inpcb);
2476	q = &toep->tp_ddp_state;
2477	rcv = so_sockbuf_rcv(so);
2478	sockbuf_lock(rcv);
2479
2480	bsp = &q->buf_state[q->cur_buf];
2481	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2482	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2483	m->m_ddp_gl = (unsigned char *)bsp->gl;
2484	m->m_flags |= M_DDP;
2485	m->m_cur_offset = bsp->cur_offset;
2486	m->m_ddp_flags =
2487	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2488	m->m_seq = tp->rcv_nxt;
2489	tp->rcv_nxt = rcv_nxt;
2490	bsp->cur_offset += m->m_pkthdr.len;
2491	if (!(bsp->flags & DDP_BF_NOFLIP))
2492		q->cur_buf ^= 1;
2493#ifdef notyet
2494	skb_reset_transport_header(skb);
2495	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2496#endif
2497	tp->t_rcvtime = ticks;
2498	SBAPPEND(rcv, m);
2499	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2500		so_sorwakeup_locked(so);
2501	else
2502		sockbuf_unlock(rcv);
2503
2504	return (1);
2505}
2506
2507/*
2508 * Handle a peer FIN.
2509 */
2510static void
2511do_peer_fin(struct toepcb *toep, struct mbuf *m)
2512{
2513	struct socket *so;
2514	struct tcpcb *tp = toep->tp_tp;
2515	int keep, action;
2516
2517	action = keep = 0;
2518	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2519	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2520		printf("abort_pending set\n");
2521
2522		goto out;
2523	}
2524	inp_wlock(tp->t_inpcb);
2525	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2526	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2527		keep = handle_peer_close_data(so, m);
2528		if (keep < 0) {
2529			inp_wunlock(tp->t_inpcb);
2530			return;
2531		}
2532	}
2533	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2534		CTR1(KTR_TOM,
2535		    "waking up waiters for cantrcvmore on %p ", so);
2536		socantrcvmore(so);
2537
2538		/*
2539		 * If connection is half-synchronized
2540		 * (ie NEEDSYN flag on) then delay ACK,
2541		 * so it may be piggybacked when SYN is sent.
2542		 * Otherwise, since we received a FIN then no
2543		 * more input can be expected, send ACK now.
2544		 */
2545		if (tp->t_flags & TF_NEEDSYN)
2546			tp->t_flags |= TF_DELACK;
2547		else
2548			tp->t_flags |= TF_ACKNOW;
2549		tp->rcv_nxt++;
2550	}
2551
2552	switch (tp->t_state) {
2553	case TCPS_SYN_RECEIVED:
2554	    tp->t_starttime = ticks;
2555	/* FALLTHROUGH */
2556	case TCPS_ESTABLISHED:
2557		tp->t_state = TCPS_CLOSE_WAIT;
2558		break;
2559	case TCPS_FIN_WAIT_1:
2560		tp->t_state = TCPS_CLOSING;
2561		break;
2562	case TCPS_FIN_WAIT_2:
2563		/*
2564		 * If we've sent an abort_req we must have sent it too late,
2565		 * HW will send us a reply telling us so, and this peer_close
2566		 * is really the last message for this connection and needs to
2567		 * be treated as an abort_rpl, i.e., transition the connection
2568		 * to TCP_CLOSE (note that the host stack does this at the
2569		 * time of generating the RST but we must wait for HW).
2570		 * Otherwise we enter TIME_WAIT.
2571		 */
2572		t3_release_offload_resources(toep);
2573		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2574			action = TCP_CLOSE;
2575		} else {
2576			action = TCP_TIMEWAIT;
2577		}
2578		break;
2579	default:
2580		log(LOG_ERR,
2581		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2582		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2583	}
2584	inp_wunlock(tp->t_inpcb);
2585
2586	if (action == TCP_TIMEWAIT) {
2587		enter_timewait(tp);
2588	} else if (action == TCP_DROP) {
2589		tcp_offload_drop(tp, 0);
2590	} else if (action == TCP_CLOSE) {
2591		tcp_offload_close(tp);
2592	}
2593
2594#ifdef notyet
2595	/* Do not send POLL_HUP for half duplex close. */
2596	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2597	    sk->sk_state == TCP_CLOSE)
2598		sk_wake_async(so, 1, POLL_HUP);
2599	else
2600		sk_wake_async(so, 1, POLL_IN);
2601#endif
2602
2603out:
2604	if (!keep)
2605		m_free(m);
2606}
2607
2608/*
2609 * Handler for PEER_CLOSE CPL messages.
2610 */
2611static int
2612do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2613{
2614	struct toepcb *toep = (struct toepcb *)ctx;
2615
2616	VALIDATE_SOCK(so);
2617
2618	do_peer_fin(toep, m);
2619	return (0);
2620}
2621
2622static void
2623process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2624{
2625	struct cpl_close_con_rpl *rpl = cplhdr(m);
2626	struct tcpcb *tp = toep->tp_tp;
2627	struct socket *so;
2628	int action = 0;
2629	struct sockbuf *rcv;
2630
2631	inp_wlock(tp->t_inpcb);
2632	so = inp_inpcbtosocket(tp->t_inpcb);
2633
2634	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2635
2636	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2637		inp_wunlock(tp->t_inpcb);
2638		goto out;
2639	}
2640
2641	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2642	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2643
2644	switch (tp->t_state) {
2645	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2646		t3_release_offload_resources(toep);
2647		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2648			action = TCP_CLOSE;
2649
2650		} else {
2651			action = TCP_TIMEWAIT;
2652		}
2653		break;
2654	case TCPS_LAST_ACK:
2655		/*
2656		 * In this state we don't care about pending abort_rpl.
2657		 * If we've sent abort_req it was post-close and was sent too
2658		 * late, this close_con_rpl is the actual last message.
2659		 */
2660		t3_release_offload_resources(toep);
2661		action = TCP_CLOSE;
2662		break;
2663	case TCPS_FIN_WAIT_1:
2664		/*
2665		 * If we can't receive any more
2666		 * data, then closing user can proceed.
2667		 * Starting the timer is contrary to the
2668		 * specification, but if we don't get a FIN
2669		 * we'll hang forever.
2670		 *
2671		 * XXXjl:
2672		 * we should release the tp also, and use a
2673		 * compressed state.
2674		 */
2675		if (so)
2676			rcv = so_sockbuf_rcv(so);
2677		else
2678			break;
2679
2680		if (rcv->sb_state & SBS_CANTRCVMORE) {
2681			int timeout;
2682
2683			if (so)
2684				soisdisconnected(so);
2685			timeout = (tcp_fast_finwait2_recycle) ?
2686			    tcp_finwait2_timeout : tcp_maxidle;
2687			tcp_timer_activate(tp, TT_2MSL, timeout);
2688		}
2689		tp->t_state = TCPS_FIN_WAIT_2;
2690		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2691		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2692			action = TCP_DROP;
2693		}
2694
2695		break;
2696	default:
2697		log(LOG_ERR,
2698		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2699		       toep->tp_toedev->tod_name, toep->tp_tid,
2700		       tp->t_state);
2701	}
2702	inp_wunlock(tp->t_inpcb);
2703
2704
2705	if (action == TCP_TIMEWAIT) {
2706		enter_timewait(tp);
2707	} else if (action == TCP_DROP) {
2708		tcp_offload_drop(tp, 0);
2709	} else if (action == TCP_CLOSE) {
2710		tcp_offload_close(tp);
2711	}
2712out:
2713	m_freem(m);
2714}
2715
2716/*
2717 * Handler for CLOSE_CON_RPL CPL messages.
2718 */
2719static int
2720do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2721			    void *ctx)
2722{
2723	struct toepcb *toep = (struct toepcb *)ctx;
2724
2725	process_close_con_rpl(toep, m);
2726	return (0);
2727}
2728
2729/*
2730 * Process abort replies.  We only process these messages if we anticipate
2731 * them as the coordination between SW and HW in this area is somewhat lacking
2732 * and sometimes we get ABORT_RPLs after we are done with the connection that
2733 * originated the ABORT_REQ.
2734 */
2735static void
2736process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2737{
2738	struct tcpcb *tp = toep->tp_tp;
2739	struct socket *so;
2740	int needclose = 0;
2741
2742#ifdef T3_TRACE
2743	T3_TRACE1(TIDTB(sk),
2744		  "process_abort_rpl: GTS rpl pending %d",
2745		  sock_flag(sk, ABORT_RPL_PENDING));
2746#endif
2747
2748	inp_wlock(tp->t_inpcb);
2749	so = inp_inpcbtosocket(tp->t_inpcb);
2750
2751	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2752		/*
2753		 * XXX panic on tcpdrop
2754		 */
2755		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2756			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2757		else {
2758			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2759			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2760			    !is_t3a(toep->tp_toedev)) {
2761				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2762					panic("TP_ABORT_REQ_RCVD set");
2763				t3_release_offload_resources(toep);
2764				needclose = 1;
2765			}
2766		}
2767	}
2768	inp_wunlock(tp->t_inpcb);
2769
2770	if (needclose)
2771		tcp_offload_close(tp);
2772
2773	m_free(m);
2774}
2775
2776/*
2777 * Handle an ABORT_RPL_RSS CPL message.
2778 */
2779static int
2780do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2781{
2782	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2783	struct toepcb *toep;
2784
2785	/*
2786	 * Ignore replies to post-close aborts indicating that the abort was
2787	 * requested too late.  These connections are terminated when we get
2788	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2789	 * arrives the TID is either no longer used or it has been recycled.
2790	 */
2791	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2792discard:
2793		m_free(m);
2794		return (0);
2795	}
2796
2797	toep = (struct toepcb *)ctx;
2798
2799        /*
2800	 * Sometimes we've already closed the socket, e.g., a post-close
2801	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2802	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2803	 * but FW turns the ABORT_REQ into a regular one and so we get
2804	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2805	 */
2806	if (!toep)
2807		goto discard;
2808
2809	if (toep->tp_tp == NULL) {
2810		log(LOG_NOTICE, "removing tid for abort\n");
2811		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2812		if (toep->tp_l2t)
2813			l2t_release(L2DATA(cdev), toep->tp_l2t);
2814
2815		toepcb_release(toep);
2816		goto discard;
2817	}
2818
2819	log(LOG_NOTICE, "toep=%p\n", toep);
2820	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2821
2822	toepcb_hold(toep);
2823	process_abort_rpl(toep, m);
2824	toepcb_release(toep);
2825	return (0);
2826}
2827
2828/*
2829 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2830 * indicate whether RST should be sent in response.
2831 */
2832static int
2833abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2834{
2835	struct tcpcb *tp = so_sototcpcb(so);
2836
2837	switch (abort_reason) {
2838	case CPL_ERR_BAD_SYN:
2839#if 0
2840		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2841#endif
2842	case CPL_ERR_CONN_RESET:
2843		// XXX need to handle SYN_RECV due to crossed SYNs
2844		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2845	case CPL_ERR_XMIT_TIMEDOUT:
2846	case CPL_ERR_PERSIST_TIMEDOUT:
2847	case CPL_ERR_FINWAIT2_TIMEDOUT:
2848	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2849#if 0
2850		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2851#endif
2852		return (ETIMEDOUT);
2853	default:
2854		return (EIO);
2855	}
2856}
2857
2858static inline void
2859set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2860{
2861	struct cpl_abort_rpl *rpl = cplhdr(m);
2862
2863	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2864	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2865	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2866
2867	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2868	rpl->cmd = cmd;
2869}
2870
2871static void
2872send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2873{
2874	struct mbuf *reply_mbuf;
2875	struct cpl_abort_req_rss *req = cplhdr(m);
2876
2877	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2878	m_set_priority(m, CPL_PRIORITY_DATA);
2879	m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2880	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2881	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2882	m_free(m);
2883}
2884
2885/*
2886 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2887 */
2888static inline int
2889is_neg_adv_abort(unsigned int status)
2890{
2891	return status == CPL_ERR_RTX_NEG_ADVICE ||
2892	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2893}
2894
2895static void
2896send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2897{
2898	struct mbuf  *reply_mbuf;
2899	struct cpl_abort_req_rss *req = cplhdr(m);
2900
2901	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2902
2903	if (!reply_mbuf) {
2904		/* Defer the reply.  Stick rst_status into req->cmd. */
2905		req->status = rst_status;
2906		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2907		return;
2908	}
2909
2910	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2911	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2912	m_free(m);
2913
2914	/*
2915	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2916	 * these messages while ARP is pending.  For other connection states
2917	 * it's not a problem.
2918	 */
2919	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2920}
2921
2922#ifdef notyet
2923static void
2924cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2925{
2926	CXGB_UNIMPLEMENTED();
2927#ifdef notyet
2928	struct request_sock *req = child->sk_user_data;
2929
2930	inet_csk_reqsk_queue_removed(parent, req);
2931	synq_remove(tcp_sk(child));
2932	__reqsk_free(req);
2933	child->sk_user_data = NULL;
2934#endif
2935}
2936
2937
2938/*
2939 * Performs the actual work to abort a SYN_RECV connection.
2940 */
2941static void
2942do_abort_syn_rcv(struct socket *child, struct socket *parent)
2943{
2944	struct tcpcb *parenttp = so_sototcpcb(parent);
2945	struct tcpcb *childtp = so_sototcpcb(child);
2946
2947	/*
2948	 * If the server is still open we clean up the child connection,
2949	 * otherwise the server already did the clean up as it was purging
2950	 * its SYN queue and the skb was just sitting in its backlog.
2951	 */
2952	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2953		cleanup_syn_rcv_conn(child, parent);
2954		inp_wlock(childtp->t_inpcb);
2955		t3_release_offload_resources(childtp->t_toe);
2956		inp_wunlock(childtp->t_inpcb);
2957		tcp_offload_close(childtp);
2958	}
2959}
2960#endif
2961
2962/*
2963 * Handle abort requests for a SYN_RECV connection.  These need extra work
2964 * because the socket is on its parent's SYN queue.
2965 */
2966static int
2967abort_syn_rcv(struct socket *so, struct mbuf *m)
2968{
2969	CXGB_UNIMPLEMENTED();
2970#ifdef notyet
2971	struct socket *parent;
2972	struct toedev *tdev = toep->tp_toedev;
2973	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2974	struct socket *oreq = so->so_incomp;
2975	struct t3c_tid_entry *t3c_stid;
2976	struct tid_info *t;
2977
2978	if (!oreq)
2979		return -1;        /* somehow we are not on the SYN queue */
2980
2981	t = &(T3C_DATA(cdev))->tid_maps;
2982	t3c_stid = lookup_stid(t, oreq->ts_recent);
2983	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2984
2985	so_lock(parent);
2986	do_abort_syn_rcv(so, parent);
2987	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2988	so_unlock(parent);
2989#endif
2990	return (0);
2991}
2992
2993/*
2994 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2995 * request except that we need to reply to it.
2996 */
2997static void
2998process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2999{
3000	int rst_status = CPL_ABORT_NO_RST;
3001	const struct cpl_abort_req_rss *req = cplhdr(m);
3002	struct tcpcb *tp = toep->tp_tp;
3003	struct socket *so;
3004	int needclose = 0;
3005
3006	inp_wlock(tp->t_inpcb);
3007	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3008	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3009		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3010		m_free(m);
3011		goto skip;
3012	}
3013
3014	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3015	/*
3016	 * Three cases to consider:
3017	 * a) We haven't sent an abort_req; close the connection.
3018	 * b) We have sent a post-close abort_req that will get to TP too late
3019	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3020	 *    be ignored and the connection should be closed now.
3021	 * c) We have sent a regular abort_req that will get to TP too late.
3022	 *    That will generate an abort_rpl with status 0, wait for it.
3023	 */
3024	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3025	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3026		int error;
3027
3028		error = abort_status_to_errno(so, req->status,
3029		    &rst_status);
3030		so_error_set(so, error);
3031
3032		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3033			so_sorwakeup(so);
3034		/*
3035		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3036		 * returns 0 is has taken care of the abort.
3037		 */
3038		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3039			goto skip;
3040
3041		t3_release_offload_resources(toep);
3042		needclose = 1;
3043	}
3044	inp_wunlock(tp->t_inpcb);
3045
3046	if (needclose)
3047		tcp_offload_close(tp);
3048
3049	send_abort_rpl(m, tdev, rst_status);
3050	return;
3051skip:
3052	inp_wunlock(tp->t_inpcb);
3053}
3054
3055/*
3056 * Handle an ABORT_REQ_RSS CPL message.
3057 */
3058static int
3059do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3060{
3061	const struct cpl_abort_req_rss *req = cplhdr(m);
3062	struct toepcb *toep = (struct toepcb *)ctx;
3063
3064	if (is_neg_adv_abort(req->status)) {
3065		m_free(m);
3066		return (0);
3067	}
3068
3069	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3070
3071	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3072		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3073		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3074
3075		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3076		if (toep->tp_l2t)
3077			l2t_release(L2DATA(cdev), toep->tp_l2t);
3078
3079		/*
3080		 *  Unhook
3081		 */
3082		toep->tp_tp->t_toe = NULL;
3083		toep->tp_tp->t_flags &= ~TF_TOE;
3084		toep->tp_tp = NULL;
3085		/*
3086		 * XXX need to call syncache_chkrst - but we don't
3087		 * have a way of doing that yet
3088		 */
3089		toepcb_release(toep);
3090		log(LOG_ERR, "abort for unestablished connection :-(\n");
3091		return (0);
3092	}
3093	if (toep->tp_tp == NULL) {
3094		log(LOG_NOTICE, "disconnected toepcb\n");
3095		/* should be freed momentarily */
3096		return (0);
3097	}
3098
3099
3100	toepcb_hold(toep);
3101	process_abort_req(toep, m, toep->tp_toedev);
3102	toepcb_release(toep);
3103	return (0);
3104}
3105#ifdef notyet
3106static void
3107pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3108{
3109	struct toedev *tdev = TOE_DEV(parent);
3110
3111	do_abort_syn_rcv(child, parent);
3112	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3113		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3114
3115		rpl->opt0h = htonl(F_TCAM_BYPASS);
3116		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3117		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3118	} else
3119		m_free(m);
3120}
3121#endif
3122static void
3123handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3124{
3125	CXGB_UNIMPLEMENTED();
3126
3127#ifdef notyet
3128	struct t3cdev *cdev;
3129	struct socket *parent;
3130	struct socket *oreq;
3131	struct t3c_tid_entry *t3c_stid;
3132	struct tid_info *t;
3133	struct tcpcb *otp, *tp = so_sototcpcb(so);
3134	struct toepcb *toep = tp->t_toe;
3135
3136	/*
3137	 * If the connection is being aborted due to the parent listening
3138	 * socket going away there's nothing to do, the ABORT_REQ will close
3139	 * the connection.
3140	 */
3141	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3142		m_free(m);
3143		return;
3144	}
3145
3146	oreq = so->so_incomp;
3147	otp = so_sototcpcb(oreq);
3148
3149	cdev = T3C_DEV(so);
3150	t = &(T3C_DATA(cdev))->tid_maps;
3151	t3c_stid = lookup_stid(t, otp->ts_recent);
3152	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3153
3154	so_lock(parent);
3155	pass_open_abort(so, parent, m);
3156	so_unlock(parent);
3157#endif
3158}
3159
3160/*
3161 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3162 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3163 * connection.
3164 */
3165static void
3166pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3167{
3168
3169#ifdef notyet
3170	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3171	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3172#endif
3173	handle_pass_open_arp_failure(m_get_socket(m), m);
3174}
3175
3176/*
3177 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3178 */
3179static void
3180mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3181{
3182	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3183	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3184	unsigned int tid = GET_TID(req);
3185
3186	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3187	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3188	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3189	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3190	rpl->opt0h = htonl(F_TCAM_BYPASS);
3191	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3192	rpl->opt2 = 0;
3193	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3194}
3195
3196/*
3197 * Send a deferred reject to an accept request.
3198 */
3199static void
3200reject_pass_request(struct toedev *tdev, struct mbuf *m)
3201{
3202	struct mbuf *reply_mbuf;
3203
3204	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3205	mk_pass_accept_rpl(reply_mbuf, m);
3206	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3207	m_free(m);
3208}
3209
3210static void
3211handle_syncache_event(int event, void *arg)
3212{
3213	struct toepcb *toep = arg;
3214
3215	switch (event) {
3216	case TOE_SC_ENTRY_PRESENT:
3217		/*
3218		 * entry already exists - free toepcb
3219		 * and l2t
3220		 */
3221		printf("syncache entry present\n");
3222		toepcb_release(toep);
3223		break;
3224	case TOE_SC_DROP:
3225		/*
3226		 * The syncache has given up on this entry
3227		 * either it timed out, or it was evicted
3228		 * we need to explicitly release the tid
3229		 */
3230		printf("syncache entry dropped\n");
3231		toepcb_release(toep);
3232		break;
3233	default:
3234		log(LOG_ERR, "unknown syncache event %d\n", event);
3235		break;
3236	}
3237}
3238
3239static void
3240syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3241{
3242	struct in_conninfo inc;
3243	struct tcpopt to;
3244	struct tcphdr th;
3245	struct inpcb *inp;
3246	int mss, wsf, sack, ts;
3247	uint32_t rcv_isn = ntohl(req->rcv_isn);
3248
3249	bzero(&to, sizeof(struct tcpopt));
3250	inp = so_sotoinpcb(lso);
3251
3252	/*
3253	 * Fill out information for entering us into the syncache
3254	 */
3255	inc.inc_fport = th.th_sport = req->peer_port;
3256	inc.inc_lport = th.th_dport = req->local_port;
3257	th.th_seq = req->rcv_isn;
3258	th.th_flags = TH_SYN;
3259
3260	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3261
3262
3263	inc.inc_isipv6 = 0;
3264	inc.inc_len = 0;
3265	inc.inc_faddr.s_addr = req->peer_ip;
3266	inc.inc_laddr.s_addr = req->local_ip;
3267
3268	DPRINTF("syncache add of %d:%d %d:%d\n",
3269	    ntohl(req->local_ip), ntohs(req->local_port),
3270	    ntohl(req->peer_ip), ntohs(req->peer_port));
3271
3272	mss = req->tcp_options.mss;
3273	wsf = req->tcp_options.wsf;
3274	ts = req->tcp_options.tstamp;
3275	sack = req->tcp_options.sack;
3276	to.to_mss = mss;
3277	to.to_wscale = wsf;
3278	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3279	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3280}
3281
3282
3283/*
3284 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3285 * lock held.  Note that the sock here is a listening socket that is not owned
3286 * by the TOE.
3287 */
3288static void
3289process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3290    struct listen_ctx *lctx)
3291{
3292	int rt_flags;
3293	struct l2t_entry *e;
3294	struct iff_mac tim;
3295	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3296	struct cpl_pass_accept_rpl *rpl;
3297	struct cpl_pass_accept_req *req = cplhdr(m);
3298	unsigned int tid = GET_TID(req);
3299	struct tom_data *d = TOM_DATA(tdev);
3300	struct t3cdev *cdev = d->cdev;
3301	struct tcpcb *tp = so_sototcpcb(so);
3302	struct toepcb *newtoep;
3303	struct rtentry *dst;
3304	struct sockaddr_in nam;
3305	struct t3c_data *td = T3C_DATA(cdev);
3306
3307	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3308	if (__predict_false(reply_mbuf == NULL)) {
3309		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3310			t3_defer_reply(m, tdev, reject_pass_request);
3311		else {
3312			cxgb_queue_tid_release(cdev, tid);
3313			m_free(m);
3314		}
3315		DPRINTF("failed to get reply_mbuf\n");
3316
3317		goto out;
3318	}
3319
3320	if (tp->t_state != TCPS_LISTEN) {
3321		DPRINTF("socket not in listen state\n");
3322
3323		goto reject;
3324	}
3325
3326	tim.mac_addr = req->dst_mac;
3327	tim.vlan_tag = ntohs(req->vlan_tag);
3328	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3329		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3330		goto reject;
3331	}
3332
3333#ifdef notyet
3334	/*
3335	 * XXX do route lookup to confirm that we're still listening on this
3336	 * address
3337	 */
3338	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3339			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3340		goto reject;
3341	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3342		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3343	dst_release(skb->dst);	// done with the input route, release it
3344	skb->dst = NULL;
3345
3346	if ((rt_flags & RTF_LOCAL) == 0)
3347		goto reject;
3348#endif
3349	/*
3350	 * XXX
3351	 */
3352	rt_flags = RTF_LOCAL;
3353	if ((rt_flags & RTF_LOCAL) == 0)
3354		goto reject;
3355
3356	/*
3357	 * Calculate values and add to syncache
3358	 */
3359
3360	newtoep = toepcb_alloc();
3361	if (newtoep == NULL)
3362		goto reject;
3363
3364	bzero(&nam, sizeof(struct sockaddr_in));
3365
3366	nam.sin_len = sizeof(struct sockaddr_in);
3367	nam.sin_family = AF_INET;
3368	nam.sin_addr.s_addr =req->peer_ip;
3369	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3370
3371	if (dst == NULL) {
3372		printf("failed to find route\n");
3373		goto reject;
3374	}
3375	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3376	    (struct sockaddr *)&nam);
3377	if (e == NULL) {
3378		DPRINTF("failed to get l2t\n");
3379	}
3380	/*
3381	 * Point to our listen socket until accept
3382	 */
3383	newtoep->tp_tp = tp;
3384	newtoep->tp_flags = TP_SYN_RCVD;
3385	newtoep->tp_tid = tid;
3386	newtoep->tp_toedev = tdev;
3387	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3388
3389	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3390	so_lock(so);
3391	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3392	so_unlock(so);
3393
3394	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3395		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3396
3397	if (newtoep->tp_ulp_mode) {
3398		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3399
3400		if (ddp_mbuf == NULL)
3401			newtoep->tp_ulp_mode = 0;
3402	}
3403
3404	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3405	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3406	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3407	/*
3408	 * XXX workaround for lack of syncache drop
3409	 */
3410	toepcb_hold(newtoep);
3411	syncache_add_accept_req(req, so, newtoep);
3412
3413	rpl = cplhdr(reply_mbuf);
3414	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3415	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3416	rpl->wr.wr_lo = 0;
3417	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3418	rpl->opt2 = htonl(calc_opt2(so, tdev));
3419	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3420	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3421
3422	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3423	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3424	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3425				  CPL_PASS_OPEN_ACCEPT);
3426
3427	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3428
3429	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3430
3431	l2t_send(cdev, reply_mbuf, e);
3432	m_free(m);
3433	if (newtoep->tp_ulp_mode) {
3434		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3435				V_TF_DDP_OFF(1) |
3436				TP_DDP_TIMER_WORKAROUND_MASK,
3437				V_TF_DDP_OFF(1) |
3438		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3439	} else
3440		printf("not offloading\n");
3441
3442
3443
3444	return;
3445reject:
3446	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3447		mk_pass_accept_rpl(reply_mbuf, m);
3448	else
3449		mk_tid_release(reply_mbuf, newtoep, tid);
3450	cxgb_ofld_send(cdev, reply_mbuf);
3451	m_free(m);
3452out:
3453#if 0
3454	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3455#else
3456	return;
3457#endif
3458}
3459
3460/*
3461 * Handle a CPL_PASS_ACCEPT_REQ message.
3462 */
3463static int
3464do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3465{
3466	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3467	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3468	struct tom_data *d = listen_ctx->tom_data;
3469
3470#if VALIDATE_TID
3471	struct cpl_pass_accept_req *req = cplhdr(m);
3472	unsigned int tid = GET_TID(req);
3473	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3474
3475	if (unlikely(!lsk)) {
3476		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3477		       cdev->name,
3478		       (unsigned long)((union listen_entry *)ctx -
3479					t->stid_tab));
3480		return CPL_RET_BUF_DONE;
3481	}
3482	if (unlikely(tid >= t->ntids)) {
3483		printk(KERN_ERR "%s: passive open TID %u too large\n",
3484		       cdev->name, tid);
3485		return CPL_RET_BUF_DONE;
3486	}
3487	/*
3488	 * For T3A the current user of the TID may have closed but its last
3489	 * message(s) may have been backlogged so the TID appears to be still
3490	 * in use.  Just take the TID away, the connection can close at its
3491	 * own leisure.  For T3B this situation is a bug.
3492	 */
3493	if (!valid_new_tid(t, tid) &&
3494	    cdev->type != T3A) {
3495		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3496		       cdev->name, tid);
3497		return CPL_RET_BUF_DONE;
3498	}
3499#endif
3500
3501	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3502	return (0);
3503}
3504
3505/*
3506 * Called when a connection is established to translate the TCP options
3507 * reported by HW to FreeBSD's native format.
3508 */
3509static void
3510assign_rxopt(struct socket *so, unsigned int opt)
3511{
3512	struct tcpcb *tp = so_sototcpcb(so);
3513	struct toepcb *toep = tp->t_toe;
3514	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3515
3516	inp_lock_assert(tp->t_inpcb);
3517
3518	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3519	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3520	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3521	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3522	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3523	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3524		tp->rcv_scale = tp->request_r_scale;
3525}
3526
3527/*
3528 * Completes some final bits of initialization for just established connections
3529 * and changes their state to TCP_ESTABLISHED.
3530 *
3531 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3532 */
3533static void
3534make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3535{
3536	struct tcpcb *tp = so_sototcpcb(so);
3537	struct toepcb *toep = tp->t_toe;
3538
3539	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3540	assign_rxopt(so, opt);
3541
3542	/*
3543	 *XXXXXXXXXXX
3544	 *
3545	 */
3546#ifdef notyet
3547	so->so_proto->pr_ctloutput = t3_ctloutput;
3548#endif
3549
3550#if 0
3551	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3552#endif
3553	/*
3554	 * XXX not clear what rcv_wup maps to
3555	 */
3556	/*
3557	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3558	 * pass through opt0.
3559	 */
3560	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3561		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3562
3563	dump_toepcb(toep);
3564
3565#ifdef notyet
3566/*
3567 * no clean interface for marking ARP up to date
3568 */
3569	dst_confirm(sk->sk_dst_cache);
3570#endif
3571	tp->t_starttime = ticks;
3572	tp->t_state = TCPS_ESTABLISHED;
3573	soisconnected(so);
3574}
3575
3576static int
3577syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3578{
3579
3580	struct in_conninfo inc;
3581	struct tcpopt to;
3582	struct tcphdr th;
3583	int mss, wsf, sack, ts;
3584	struct mbuf *m = NULL;
3585	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3586	unsigned int opt;
3587
3588#ifdef MAC
3589#error	"no MAC support"
3590#endif
3591
3592	opt = ntohs(req->tcp_opt);
3593
3594	bzero(&to, sizeof(struct tcpopt));
3595
3596	/*
3597	 * Fill out information for entering us into the syncache
3598	 */
3599	inc.inc_fport = th.th_sport = req->peer_port;
3600	inc.inc_lport = th.th_dport = req->local_port;
3601	th.th_seq = req->rcv_isn;
3602	th.th_flags = TH_ACK;
3603
3604	inc.inc_isipv6 = 0;
3605	inc.inc_len = 0;
3606	inc.inc_faddr.s_addr = req->peer_ip;
3607	inc.inc_laddr.s_addr = req->local_ip;
3608
3609	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3610	wsf  = G_TCPOPT_WSCALE_OK(opt);
3611	ts   = G_TCPOPT_TSTAMP(opt);
3612	sack = G_TCPOPT_SACK(opt);
3613
3614	to.to_mss = mss;
3615	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3616	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3617
3618	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3619	    ntohl(req->local_ip), ntohs(req->local_port),
3620	    ntohl(req->peer_ip), ntohs(req->peer_port),
3621	    mss, wsf, ts, sack);
3622	return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3623}
3624
3625
3626/*
3627 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3628 * if we are in TCP_SYN_RECV due to crossed SYNs
3629 */
3630static int
3631do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3632{
3633	struct cpl_pass_establish *req = cplhdr(m);
3634	struct toepcb *toep = (struct toepcb *)ctx;
3635	struct tcpcb *tp = toep->tp_tp;
3636	struct socket *so, *lso;
3637	struct t3c_data *td = T3C_DATA(cdev);
3638	struct sockbuf *snd, *rcv;
3639
3640	// Complete socket initialization now that we have the SND_ISN
3641
3642	struct toedev *tdev;
3643
3644
3645	tdev = toep->tp_toedev;
3646
3647	inp_wlock(tp->t_inpcb);
3648
3649	/*
3650	 *
3651	 * XXX need to add reference while we're manipulating
3652	 */
3653	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3654
3655	inp_wunlock(tp->t_inpcb);
3656
3657	so_lock(so);
3658	LIST_REMOVE(toep, synq_entry);
3659	so_unlock(so);
3660
3661	if (!syncache_expand_establish_req(req, &so, toep)) {
3662		/*
3663		 * No entry
3664		 */
3665		CXGB_UNIMPLEMENTED();
3666	}
3667	if (so == NULL) {
3668		/*
3669		 * Couldn't create the socket
3670		 */
3671		CXGB_UNIMPLEMENTED();
3672	}
3673
3674	tp = so_sototcpcb(so);
3675	inp_wlock(tp->t_inpcb);
3676
3677	snd = so_sockbuf_snd(so);
3678	rcv = so_sockbuf_rcv(so);
3679
3680	snd->sb_flags |= SB_NOCOALESCE;
3681	rcv->sb_flags |= SB_NOCOALESCE;
3682
3683	toep->tp_tp = tp;
3684	toep->tp_flags = 0;
3685	tp->t_toe = toep;
3686	reset_wr_list(toep);
3687	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3688	tp->rcv_nxt = toep->tp_copied_seq;
3689	install_offload_ops(so);
3690
3691	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3692	toep->tp_wr_unacked = 0;
3693	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3694	toep->tp_qset_idx = 0;
3695	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3696
3697	/*
3698	 * XXX Cancel any keep alive timer
3699	 */
3700
3701	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3702
3703	/*
3704	 * XXX workaround for lack of syncache drop
3705	 */
3706	toepcb_release(toep);
3707	inp_wunlock(tp->t_inpcb);
3708
3709	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3710	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3711#ifdef notyet
3712	/*
3713	 * XXX not sure how these checks map to us
3714	 */
3715	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3716		sk->sk_state_change(sk);
3717		sk_wake_async(so, 0, POLL_OUT);
3718	}
3719	/*
3720	 * The state for the new connection is now up to date.
3721	 * Next check if we should add the connection to the parent's
3722	 * accept queue.  When the parent closes it resets connections
3723	 * on its SYN queue, so check if we are being reset.  If so we
3724	 * don't need to do anything more, the coming ABORT_RPL will
3725	 * destroy this socket.  Otherwise move the connection to the
3726	 * accept queue.
3727	 *
3728	 * Note that we reset the synq before closing the server so if
3729	 * we are not being reset the stid is still open.
3730	 */
3731	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3732		__kfree_skb(skb);
3733		goto unlock;
3734	}
3735#endif
3736	m_free(m);
3737
3738	return (0);
3739}
3740
3741/*
3742 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3743 * and send them to the TOE.
3744 */
3745static void
3746fixup_and_send_ofo(struct toepcb *toep)
3747{
3748	struct mbuf *m;
3749	struct toedev *tdev = toep->tp_toedev;
3750	struct tcpcb *tp = toep->tp_tp;
3751	unsigned int tid = toep->tp_tid;
3752
3753	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3754
3755	inp_lock_assert(tp->t_inpcb);
3756	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3757		/*
3758		 * A variety of messages can be waiting but the fields we'll
3759		 * be touching are common to all so any message type will do.
3760		 */
3761		struct cpl_close_con_req *p = cplhdr(m);
3762
3763		p->wr.wr_lo = htonl(V_WR_TID(tid));
3764		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3765		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3766	}
3767}
3768
3769/*
3770 * Updates socket state from an active establish CPL message.  Runs with the
3771 * socket lock held.
3772 */
3773static void
3774socket_act_establish(struct socket *so, struct mbuf *m)
3775{
3776	struct cpl_act_establish *req = cplhdr(m);
3777	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3778	struct tcpcb *tp = so_sototcpcb(so);
3779	struct toepcb *toep = tp->t_toe;
3780
3781	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3782		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3783		    toep->tp_tid, tp->t_state);
3784
3785	tp->ts_recent_age = ticks;
3786	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3787	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3788
3789	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3790
3791	/*
3792	 * Now that we finally have a TID send any CPL messages that we had to
3793	 * defer for lack of a TID.
3794	 */
3795	if (mbufq_len(&toep->out_of_order_queue))
3796		fixup_and_send_ofo(toep);
3797
3798	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3799		/*
3800		 * XXX does this even make sense?
3801		 */
3802		so_sorwakeup(so);
3803	}
3804	m_free(m);
3805#ifdef notyet
3806/*
3807 * XXX assume no write requests permitted while socket connection is
3808 * incomplete
3809 */
3810	/*
3811	 * Currently the send queue must be empty at this point because the
3812	 * socket layer does not send anything before a connection is
3813	 * established.  To be future proof though we handle the possibility
3814	 * that there are pending buffers to send (either TX_DATA or
3815	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3816	 * buffers according to the just learned write_seq, and then we send
3817	 * them on their way.
3818	 */
3819	fixup_pending_writeq_buffers(sk);
3820	if (t3_push_frames(so, 1))
3821		sk->sk_write_space(sk);
3822#endif
3823
3824	toep->tp_state = tp->t_state;
3825	V_tcpstat.tcps_connects++;
3826
3827}
3828
3829/*
3830 * Process a CPL_ACT_ESTABLISH message.
3831 */
3832static int
3833do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3834{
3835	struct cpl_act_establish *req = cplhdr(m);
3836	unsigned int tid = GET_TID(req);
3837	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3838	struct toepcb *toep = (struct toepcb *)ctx;
3839	struct tcpcb *tp = toep->tp_tp;
3840	struct socket *so;
3841	struct toedev *tdev;
3842	struct tom_data *d;
3843
3844	if (tp == NULL) {
3845		free_atid(cdev, atid);
3846		return (0);
3847	}
3848	inp_wlock(tp->t_inpcb);
3849
3850	/*
3851	 * XXX
3852	 */
3853	so = inp_inpcbtosocket(tp->t_inpcb);
3854	tdev = toep->tp_toedev; /* blow up here if link was down */
3855	d = TOM_DATA(tdev);
3856
3857	/*
3858	 * It's OK if the TID is currently in use, the owning socket may have
3859	 * backlogged its last CPL message(s).  Just take it away.
3860	 */
3861	toep->tp_tid = tid;
3862	toep->tp_tp = tp;
3863	so_insert_tid(d, toep, tid);
3864	free_atid(cdev, atid);
3865	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3866
3867	socket_act_establish(so, m);
3868	inp_wunlock(tp->t_inpcb);
3869	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3870	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3871
3872	return (0);
3873}
3874
3875/*
3876 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3877 * next batch of work requests from the write queue.
3878 */
3879static void
3880wr_ack(struct toepcb *toep, struct mbuf *m)
3881{
3882	struct tcpcb *tp = toep->tp_tp;
3883	struct cpl_wr_ack *hdr = cplhdr(m);
3884	struct socket *so;
3885	unsigned int credits = ntohs(hdr->credits);
3886	u32 snd_una = ntohl(hdr->snd_una);
3887	int bytes = 0;
3888	struct sockbuf *snd;
3889
3890	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3891
3892	inp_wlock(tp->t_inpcb);
3893	so = inp_inpcbtosocket(tp->t_inpcb);
3894	toep->tp_wr_avail += credits;
3895	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3896		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3897
3898	while (credits) {
3899		struct mbuf *p = peek_wr(toep);
3900
3901		if (__predict_false(!p)) {
3902			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3903			    "nothing pending, state %u wr_avail=%u\n",
3904			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3905			break;
3906		}
3907		CTR2(KTR_TOM,
3908			"wr_ack: p->credits=%d p->bytes=%d",
3909		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3910		KASSERT(p->m_pkthdr.csum_data != 0,
3911		    ("empty request still on list"));
3912
3913		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3914
3915#if DEBUG_WR > 1
3916			struct tx_data_wr *w = cplhdr(p);
3917			log(LOG_ERR,
3918			       "TID %u got %u WR credits, need %u, len %u, "
3919			       "main body %u, frags %u, seq # %u, ACK una %u,"
3920			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3921			       toep->tp_tid, credits, p->csum, p->len,
3922			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3923			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3924			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3925#endif
3926			p->m_pkthdr.csum_data -= credits;
3927			break;
3928		} else {
3929			dequeue_wr(toep);
3930			credits -= p->m_pkthdr.csum_data;
3931			bytes += p->m_pkthdr.len;
3932			CTR3(KTR_TOM,
3933			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3934			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3935
3936			m_free(p);
3937		}
3938	}
3939
3940#if DEBUG_WR
3941	check_wr_invariants(tp);
3942#endif
3943
3944	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3945#if VALIDATE_SEQ
3946		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3947
3948		log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3949		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3950		    toep->tp_tid, tp->snd_una);
3951#endif
3952		goto out_free;
3953	}
3954
3955	if (tp->snd_una != snd_una) {
3956		tp->snd_una = snd_una;
3957		tp->ts_recent_age = ticks;
3958#ifdef notyet
3959		/*
3960		 * Keep ARP entry "minty fresh"
3961		 */
3962		dst_confirm(sk->sk_dst_cache);
3963#endif
3964		if (tp->snd_una == tp->snd_nxt)
3965			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3966	}
3967
3968	snd = so_sockbuf_snd(so);
3969	if (bytes) {
3970		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3971		snd = so_sockbuf_snd(so);
3972		sockbuf_lock(snd);
3973		sbdrop_locked(snd, bytes);
3974		so_sowwakeup_locked(so);
3975	}
3976
3977	if (snd->sb_sndptroff < snd->sb_cc)
3978		t3_push_frames(so, 0);
3979
3980out_free:
3981	inp_wunlock(tp->t_inpcb);
3982	m_free(m);
3983}
3984
3985/*
3986 * Handler for TX_DATA_ACK CPL messages.
3987 */
3988static int
3989do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3990{
3991	struct toepcb *toep = (struct toepcb *)ctx;
3992
3993	VALIDATE_SOCK(so);
3994
3995	wr_ack(toep, m);
3996	return 0;
3997}
3998
3999/*
4000 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
4001 */
4002static int
4003do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4004{
4005	m_freem(m);
4006	return 0;
4007}
4008
4009/*
4010 * Reset a connection that is on a listener's SYN queue or accept queue,
4011 * i.e., one that has not had a struct socket associated with it.
4012 * Must be called from process context.
4013 *
4014 * Modeled after code in inet_csk_listen_stop().
4015 */
4016static void
4017t3_reset_listen_child(struct socket *child)
4018{
4019	struct tcpcb *tp = so_sototcpcb(child);
4020
4021	t3_send_reset(tp->t_toe);
4022}
4023
4024
4025static void
4026t3_child_disconnect(struct socket *so, void *arg)
4027{
4028	struct tcpcb *tp = so_sototcpcb(so);
4029
4030	if (tp->t_flags & TF_TOE) {
4031		inp_wlock(tp->t_inpcb);
4032		t3_reset_listen_child(so);
4033		inp_wunlock(tp->t_inpcb);
4034	}
4035}
4036
4037/*
4038 * Disconnect offloaded established but not yet accepted connections sitting
4039 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4040 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4041 */
4042void
4043t3_disconnect_acceptq(struct socket *listen_so)
4044{
4045
4046	so_lock(listen_so);
4047	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4048	so_unlock(listen_so);
4049}
4050
4051/*
4052 * Reset offloaded connections sitting on a server's syn queue.  As above
4053 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4054 */
4055
4056void
4057t3_reset_synq(struct listen_ctx *lctx)
4058{
4059	struct toepcb *toep;
4060
4061	so_lock(lctx->lso);
4062	while (!LIST_EMPTY(&lctx->synq_head)) {
4063		toep = LIST_FIRST(&lctx->synq_head);
4064		LIST_REMOVE(toep, synq_entry);
4065		toep->tp_tp = NULL;
4066		t3_send_reset(toep);
4067		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4068		toepcb_release(toep);
4069	}
4070	so_unlock(lctx->lso);
4071}
4072
4073
4074int
4075t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4076		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4077		   unsigned int pg_off, unsigned int color)
4078{
4079	unsigned int i, j, pidx;
4080	struct pagepod *p;
4081	struct mbuf *m;
4082	struct ulp_mem_io *req;
4083	unsigned int tid = toep->tp_tid;
4084	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4085	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4086
4087	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4088	    gl, nppods, tag, maxoff, pg_off, color);
4089
4090	for (i = 0; i < nppods; ++i) {
4091		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4092		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4093		req = mtod(m, struct ulp_mem_io *);
4094		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4095		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4096		req->wr.wr_lo = 0;
4097		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4098					   V_ULPTX_CMD(ULP_MEM_WRITE));
4099		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4100				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4101
4102		p = (struct pagepod *)(req + 1);
4103		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4104			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4105			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4106						  V_PPOD_COLOR(color));
4107			p->pp_max_offset = htonl(maxoff);
4108			p->pp_page_offset = htonl(pg_off);
4109			p->pp_rsvd = 0;
4110			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4111				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4112				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4113		} else
4114			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4115		send_or_defer(toep, m, 0);
4116		ppod_addr += PPOD_SIZE;
4117	}
4118	return (0);
4119}
4120
4121/*
4122 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4123 */
4124static inline void
4125mk_cpl_barrier_ulp(struct cpl_barrier *b)
4126{
4127	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4128
4129	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4130	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4131	b->opcode = CPL_BARRIER;
4132}
4133
4134/*
4135 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4136 */
4137static inline void
4138mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4139{
4140	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4141
4142	txpkt = (struct ulp_txpkt *)req;
4143	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4144	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4145	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4146	req->cpuno = htons(cpuno);
4147}
4148
4149/*
4150 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4151 */
4152static inline void
4153mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4154                     unsigned int word, uint64_t mask, uint64_t val)
4155{
4156	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4157
4158	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4159	    tid, word, mask, val);
4160
4161	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4162	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4163	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4164	req->reply = V_NO_REPLY(1);
4165	req->cpu_idx = 0;
4166	req->word = htons(word);
4167	req->mask = htobe64(mask);
4168	req->val = htobe64(val);
4169}
4170
4171/*
4172 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4173 */
4174static void
4175mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4176    unsigned int tid, unsigned int credits)
4177{
4178	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4179
4180	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4181	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4182	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4183	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4184	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4185				 V_RX_CREDITS(credits));
4186}
4187
4188void
4189t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4190{
4191	unsigned int wrlen;
4192	struct mbuf *m;
4193	struct work_request_hdr *wr;
4194	struct cpl_barrier *lock;
4195	struct cpl_set_tcb_field *req;
4196	struct cpl_get_tcb *getreq;
4197	struct ddp_state *p = &toep->tp_ddp_state;
4198
4199#if 0
4200	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4201#endif
4202	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4203		sizeof(*getreq);
4204	m = m_gethdr_nofail(wrlen);
4205	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4206	wr = mtod(m, struct work_request_hdr *);
4207	bzero(wr, wrlen);
4208
4209	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4210	m->m_pkthdr.len = m->m_len = wrlen;
4211
4212	lock = (struct cpl_barrier *)(wr + 1);
4213	mk_cpl_barrier_ulp(lock);
4214
4215	req = (struct cpl_set_tcb_field *)(lock + 1);
4216
4217	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4218
4219	/* Hmmm, not sure if this actually a good thing: reactivating
4220	 * the other buffer might be an issue if it has been completed
4221	 * already. However, that is unlikely, since the fact that the UBUF
4222	 * is not completed indicates that there is no oustanding data.
4223	 */
4224	if (bufidx == 0)
4225		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4226				     V_TF_DDP_ACTIVE_BUF(1) |
4227				     V_TF_DDP_BUF0_VALID(1),
4228				     V_TF_DDP_ACTIVE_BUF(1));
4229	else
4230		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4231				     V_TF_DDP_ACTIVE_BUF(1) |
4232				     V_TF_DDP_BUF1_VALID(1), 0);
4233
4234	getreq = (struct cpl_get_tcb *)(req + 1);
4235	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4236
4237	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4238
4239	/* Keep track of the number of oustanding CPL_GET_TCB requests
4240	 */
4241	p->get_tcb_count++;
4242
4243#ifdef T3_TRACE
4244	T3_TRACE1(TIDTB(so),
4245		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4246#endif
4247	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4248}
4249
4250/**
4251 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4252 * @sk: the socket associated with the buffers
4253 * @bufidx: index of HW DDP buffer (0 or 1)
4254 * @tag0: new tag for HW buffer 0
4255 * @tag1: new tag for HW buffer 1
4256 * @len: new length for HW buf @bufidx
4257 *
4258 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4259 * buffer by changing the buffer tag and length and setting the valid and
4260 * active flag accordingly.  The caller must ensure the new buffer is at
4261 * least as big as the existing one.  Since we typically reprogram both HW
4262 * buffers this function sets both tags for convenience. Read the TCB to
4263 * determine how made data was written into the buffer before the overlay
4264 * took place.
4265 */
4266void
4267t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4268	 	       unsigned int tag1, unsigned int len)
4269{
4270	unsigned int wrlen;
4271	struct mbuf *m;
4272	struct work_request_hdr *wr;
4273	struct cpl_get_tcb *getreq;
4274	struct cpl_set_tcb_field *req;
4275	struct ddp_state *p = &toep->tp_ddp_state;
4276
4277	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4278	    bufidx, tag0, tag1, len);
4279#if 0
4280	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4281#endif
4282	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4283	m = m_gethdr_nofail(wrlen);
4284	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4285	wr = mtod(m, struct work_request_hdr *);
4286	m->m_pkthdr.len = m->m_len = wrlen;
4287	bzero(wr, wrlen);
4288
4289
4290	/* Set the ATOMIC flag to make sure that TP processes the following
4291	 * CPLs in an atomic manner and no wire segments can be interleaved.
4292	 */
4293	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4294	req = (struct cpl_set_tcb_field *)(wr + 1);
4295	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4296			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4297			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4298			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4299			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4300	req++;
4301	if (bufidx == 0) {
4302		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4303			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4304			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4305		req++;
4306		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4307			    V_TF_DDP_PUSH_DISABLE_0(1) |
4308			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4309			    V_TF_DDP_PUSH_DISABLE_0(0) |
4310			    V_TF_DDP_BUF0_VALID(1));
4311	} else {
4312		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4313			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4314			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4315		req++;
4316		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4317			    V_TF_DDP_PUSH_DISABLE_1(1) |
4318			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4319			    V_TF_DDP_PUSH_DISABLE_1(0) |
4320			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4321	}
4322
4323	getreq = (struct cpl_get_tcb *)(req + 1);
4324	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4325
4326	/* Keep track of the number of oustanding CPL_GET_TCB requests
4327	 */
4328	p->get_tcb_count++;
4329
4330#ifdef T3_TRACE
4331	T3_TRACE4(TIDTB(sk),
4332		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4333		  "len %d",
4334		  bufidx, tag0, tag1, len);
4335#endif
4336	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4337}
4338
4339/*
4340 * Sends a compound WR containing all the CPL messages needed to program the
4341 * two HW DDP buffers, namely optionally setting up the length and offset of
4342 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4343 */
4344void
4345t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4346		      unsigned int len1, unsigned int offset1,
4347                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4348{
4349	unsigned int wrlen;
4350	struct mbuf *m;
4351	struct work_request_hdr *wr;
4352	struct cpl_set_tcb_field *req;
4353
4354	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4355	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4356
4357#if 0
4358	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4359#endif
4360	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4361		(len1 ? sizeof(*req) : 0) +
4362		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4363	m = m_gethdr_nofail(wrlen);
4364	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4365	wr = mtod(m, struct work_request_hdr *);
4366	bzero(wr, wrlen);
4367
4368	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4369	m->m_pkthdr.len = m->m_len = wrlen;
4370
4371	req = (struct cpl_set_tcb_field *)(wr + 1);
4372	if (len0) {                  /* program buffer 0 offset and length */
4373		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4374			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4375			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4376			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4377			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4378		req++;
4379	}
4380	if (len1) {                  /* program buffer 1 offset and length */
4381		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4382			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4383			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4384			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4385			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4386		req++;
4387	}
4388
4389	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4390			     ddp_flags);
4391
4392	if (modulate) {
4393		mk_rx_data_ack_ulp(toep,
4394		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4395		    toep->tp_copied_seq - toep->tp_rcv_wup);
4396		toep->tp_rcv_wup = toep->tp_copied_seq;
4397	}
4398
4399#ifdef T3_TRACE
4400	T3_TRACE5(TIDTB(sk),
4401		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4402		  "modulate %d",
4403		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4404		  modulate);
4405#endif
4406
4407	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4408}
4409
4410void
4411t3_init_wr_tab(unsigned int wr_len)
4412{
4413	int i;
4414
4415	if (mbuf_wrs[1])     /* already initialized */
4416		return;
4417
4418	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4419		int sgl_len = (3 * i) / 2 + (i & 1);
4420
4421		sgl_len += 3;
4422		mbuf_wrs[i] = sgl_len <= wr_len ?
4423		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4424	}
4425
4426	wrlen = wr_len * 8;
4427}
4428
4429int
4430t3_init_cpl_io(void)
4431{
4432#ifdef notyet
4433	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4434	if (!tcphdr_skb) {
4435		log(LOG_ERR,
4436		       "Chelsio TCP offload: can't allocate sk_buff\n");
4437		return -1;
4438	}
4439	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4440	tcphdr_skb->h.raw = tcphdr_skb->data;
4441	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4442#endif
4443
4444	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4445	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4446	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4447	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4448	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4449	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4450	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4451	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4452	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4453	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4454	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4455	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4456	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4457	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4458	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4459	return (0);
4460}
4461
4462