cxgb_cpl_io.c revision 178302
/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 178302 2008-04-19 03:22:43Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
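
/*
 * Example (assuming the submodes are the usual iSCSI digest bits):
 * submode 1 adds a 4-byte header digest, submode 2 a 4-byte data digest,
 * and submode 3 both, so a 1000-byte PDU sent with submode 3 occupies
 * 1008 bytes of TCP sequence space.
 */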

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	sbappend_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}
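
/*
 * The 0xffffffff m_next checks above are poisoned-pointer guards: a value
 * of all ones is never a valid mbuf pointer, so tripping one of these
 * assertions indicates a freed or corrupted chain (presumably poisoned by
 * the driver's mbuf debugging) rather than normal data.
 */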

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}
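
/*
 * Worked example for the V_TX_SNDBUF encoding above: the field is in
 * 32KB units, so a 256KB send buffer maps to 256KB >> 15 = 8.
 */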

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}
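
/*
 * Note on the accounting above: each frame's WR credit cost is stashed in
 * m_pkthdr.csum_data before the mbuf is queued with enqueue_wr(), so the
 * completion path can later hand exactly that many credits back to
 * tp_wr_avail.  A single frame consumes between 1 and 4 credits, hence
 * the KASSERT range check.
 */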

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
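
/*
 * Example of the policy above, assuming a 64KB receive window and an
 * rx_credit_thres of 15KB: once the application has consumed 20KB,
 * credits (20480) exceeds the threshold and an RX_DATA_ACK is sent,
 * even though must_send is false (20480 + 16384 < 65536).
 */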

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	/* Check for NULL before dereferencing toep for the tcpcb. */
	if (toep == NULL)
		return;

	tp = toep->tp_tp;
	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))
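
/*
 * In other words, the workaround value above parks buffer 0 at offset 1
 * of a 2-byte buffer, i.e., a partially placed DDP buffer, which is
 * enough to keep TP's push timer armed.
 */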

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return (EINVAL);
#endif
	return (0);
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
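
/*
 * Example: for a path MTU of 1500 the code subtracts the 40 bytes of
 * IP + TCP headers, clamps against the smallest table entry, and then
 * picks the largest MTU table entry not exceeding 1500 (typically 1500
 * itself), leaving t_maxseg at 1460.
 */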

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}
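
/*
 * Example: a 256KB target window yields wscale 3, since
 * 262144 >> 3 = 32768 fits the 16-bit window field while
 * 262144 >> 2 = 65536 does not.
 */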

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}
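
/*
 * Example for the PR 5138 clamp above, assuming a 4KB rx_page_size on a
 * pre-T3C part: the window is limited to 23 * 4KB = 92KB no matter how
 * large the socket buffer is.
 */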

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return (V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx));
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}
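
/*
 * Example for the V_RCV_BUFSIZ encoding above: the field is in 1KB units
 * (rcv_wnd >> 10), so a 48KB receive window is encoded as 48, and anything
 * beyond M_RCV_BUFSIZ KB is clamped by the min().
 */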

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return (status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS);
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	inp_lock_assert(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	so_lock(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	so_unlock(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(toep, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
		return (EPERM);

	inp = so_sotoinpcb(so);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		tp = inp_inpcbtotcpcb(inp);

		inp_wlock(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(tp->t_toe);
	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * mbuf is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}
1940
1941/*
1942 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1943 * in that case they are similar to DDP completions.
1944 */
1945static int
1946do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1947{
1948	struct toepcb *toep = (struct toepcb *)ctx;
1949
1950	/* OK if socket doesn't exist */
1951	if (toep == NULL) {
1952		printf("null toep in do_get_tcb_rpl\n");
1953		return (CPL_RET_BUF_DONE);
1954	}
1955
1956	inp_wlock(toep->tp_tp->t_inpcb);
1957	tcb_rpl_as_ddp_complete(toep, m);
1958	inp_wunlock(toep->tp_tp->t_inpcb);
1959
1960	return (0);
1961}
1962
1963static void
1964handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1965{
1966	struct tcpcb *tp = toep->tp_tp;
1967	struct socket *so;
1968	struct ddp_state *q;
1969	struct ddp_buf_state *bsp;
1970	struct cpl_rx_data *hdr = cplhdr(m);
1971	unsigned int rcv_nxt = ntohl(hdr->seq);
1972	struct sockbuf *rcv;
1973
1974	if (tp->rcv_nxt == rcv_nxt)
1975		return;
1976
1977	inp_lock_assert(tp->t_inpcb);
1978	so  = inp_inpcbtosocket(tp->t_inpcb);
1979	rcv = so_sockbuf_rcv(so);
1980	sockbuf_lock(rcv);
1981
1982	q = &toep->tp_ddp_state;
1983	bsp = &q->buf_state[q->cur_buf];
1984	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x%08x",
1985		tp->rcv_nxt, rcv_nxt));
1986	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1987	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1988	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1989	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1990
1991#ifdef T3_TRACE
1992	if ((int)m->m_pkthdr.len < 0) {
1993		t3_ddp_error(so, "handle_ddp_data: neg len");
1994	}
1995#endif
1996	m->m_ddp_gl = (unsigned char *)bsp->gl;
1997	m->m_flags |= M_DDP;
1998	m->m_cur_offset = bsp->cur_offset;
1999	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2000	if (bsp->flags & DDP_BF_NOCOPY)
2001		bsp->flags &= ~DDP_BF_NOCOPY;
2002
2003	m->m_seq = tp->rcv_nxt;
2004	tp->rcv_nxt = rcv_nxt;
2005	bsp->cur_offset += m->m_pkthdr.len;
2006	if (!(bsp->flags & DDP_BF_NOFLIP))
2007		q->cur_buf ^= 1;
2008	/*
2009	 * For now, don't re-enable DDP after a connection fell out of DDP
2010	 * mode.
2011	 */
2012	q->ubuf_ddp_ready = 0;
2013	sockbuf_unlock(rcv);
2014}
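
/*
 * Worked example for the length recovery in handle_ddp_data(): if SW last
 * advanced tp->rcv_nxt to 0x1000 and the CPL_RX_DATA header carries seq
 * 0x1400, the 0x400 bytes in between were placed directly by the DDP engine,
 * so a zero-copy mbuf of that length is synthesized here, pointing at the
 * posted buffer (bsp->gl) rather than carrying the payload itself.
 */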
2015
2016/*
2017 * Process new data received for a connection.
2018 */
2019static void
2020new_rx_data(struct toepcb *toep, struct mbuf *m)
2021{
2022	struct cpl_rx_data *hdr = cplhdr(m);
2023	struct tcpcb *tp = toep->tp_tp;
2024	struct socket *so;
2025	struct sockbuf *rcv;
2026	int state;
2027	int len = be16toh(hdr->len);
2028
2029	inp_wlock(tp->t_inpcb);
2030
2031	so  = inp_inpcbtosocket(tp->t_inpcb);
2032
2033	if (__predict_false(so_no_receive(so))) {
2034		handle_excess_rx(toep, m);
2035		inp_wunlock(tp->t_inpcb);
2036		TRACE_EXIT;
2037		return;
2038	}
2039
2040	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2041		handle_ddp_data(toep, m);
2042
2043	m->m_seq = ntohl(hdr->seq);
2044	m->m_ulp_mode = 0;                    /* for iSCSI */
2045
2046#if VALIDATE_SEQ
2047	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2048		log(LOG_ERR,
2049		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2050		    toep->tp_toedev->tod_name, toep->tp_tid, m->m_seq,
2051		       tp->rcv_nxt);
2052		m_freem(m);
2053		inp_wunlock(tp->t_inpcb);
2054		return;
2055	}
2056#endif
2057	m_adj(m, sizeof(*hdr));
2058
2059#ifdef URGENT_DATA_SUPPORTED
2060	/*
2061	 * We don't handle urgent data yet
2062	 */
2063	if (__predict_false(hdr->urg))
2064		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2065	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2066		     tp->urg_seq - tp->rcv_nxt < skb->len))
2067		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2068							 tp->rcv_nxt];
2069#endif
2070	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2071		toep->tp_delack_mode = hdr->dack_mode;
2072		toep->tp_delack_seq = tp->rcv_nxt;
2073	}
2074	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2075	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2076
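	/*
	 * The length from the CPL header is authoritative; trim the mbuf if
	 * the frame handed up is longer (trailing pad, presumably).
	 */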
2077	if (len < m->m_pkthdr.len)
2078		m->m_pkthdr.len = m->m_len = len;
2079
2080	tp->rcv_nxt += m->m_pkthdr.len;
2081	tp->t_rcvtime = ticks;
2082	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2083	CTR2(KTR_TOM,
2084	    "new_rx_data: seq 0x%x len %u",
2085	    m->m_seq, m->m_pkthdr.len);
2086	inp_wunlock(tp->t_inpcb);
2087	rcv = so_sockbuf_rcv(so);
2088	sockbuf_lock(rcv);
2089#if 0
2090	if (sb_notify(rcv))
2091		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2092#endif
2093	SBAPPEND(rcv, m);
2094
2095#ifdef notyet
2096	/*
2097	 * We appear to give too many credits to the card; this check is
2098	 * disabled for now so data keeps moving.
2099	 */
2100	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2101	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2102		so, rcv->sb_cc, rcv->sb_mbmax));
2104#endif
2105
2106
2107	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2108	    rcv->sb_cc, rcv->sb_mbcnt);
2109
2110	state = so_state_get(so);
2111	if (__predict_true((state & SS_NOFDREF) == 0))
2112		so_sorwakeup_locked(so);
2113	else
2114		sockbuf_unlock(rcv);
2115}
2116
2117/*
2118 * Handler for RX_DATA CPL messages.
2119 */
2120static int
2121do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2122{
2123	struct toepcb *toep = (struct toepcb *)ctx;
2124
2125	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2126
2127	new_rx_data(toep, m);
2128
2129	return (0);
2130}
2131
2132static void
2133new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2134{
2135	struct tcpcb *tp;
2136	struct ddp_state *q;
2137	struct ddp_buf_state *bsp;
2138	struct cpl_rx_data_ddp *hdr;
2139	struct socket *so;
2140	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2141	int nomoredata = 0;
2142	unsigned int delack_mode;
2143	struct sockbuf *rcv;
2144
2145	tp = toep->tp_tp;
2146	inp_wlock(tp->t_inpcb);
2147	so = inp_inpcbtosocket(tp->t_inpcb);
2148
2149	if (__predict_false(so_no_receive(so))) {
2150
2151		handle_excess_rx(toep, m);
2152		inp_wunlock(tp->t_inpcb);
2153		return;
2154	}
2155
2156	q = &toep->tp_ddp_state;
2157	hdr = cplhdr(m);
2158	ddp_report = ntohl(hdr->u.ddp_report);
2159	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2160	bsp = &q->buf_state[buf_idx];
2161
2162	CTR4(KTR_TOM,
2163	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2164	    "hdr seq 0x%x len %u",
2165	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2166	    ntohs(hdr->len));
2167	CTR3(KTR_TOM,
2168	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2169	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2170
2171	ddp_len = ntohs(hdr->len);
2172	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2173
2174	delack_mode = G_DDP_DACK_MODE(ddp_report);
2175	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2176		toep->tp_delack_mode = delack_mode;
2177		toep->tp_delack_seq = tp->rcv_nxt;
2178	}
2179
2180	m->m_seq = tp->rcv_nxt;
2181	tp->rcv_nxt = rcv_nxt;
2182
2183	tp->t_rcvtime = ticks;
2184	/*
2185	 * Store the length in m->m_len.  We are changing the meaning of
2186	 * m->m_len here, we need to be very careful that nothing from now on
2187	 * interprets ->len of this packet the usual way.
2188	 */
2189	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2190	inp_wunlock(tp->t_inpcb);
2191	CTR3(KTR_TOM,
2192	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2193	    m->m_len, rcv_nxt, m->m_seq);
2194	/*
2195	 * Figure out where the new data was placed in the buffer and store it
2196	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the
2197	 * consumer needs to account for the page pod's pg_offset.
2198	 */
2199	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2200	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2201
2202	rcv = so_sockbuf_rcv(so);
2203	sockbuf_lock(rcv);
2204
2205	m->m_ddp_gl = (unsigned char *)bsp->gl;
2206	m->m_flags |= M_DDP;
2207	bsp->cur_offset = end_offset;
2208	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2209
2210	/*
2211	 * Length is only meaningful for kbuf
2212	 */
2213	if (!(bsp->flags & DDP_BF_NOCOPY))
2214		KASSERT(m->m_len <= bsp->gl->dgl_length,
2215		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2216			m->m_len, bsp->gl->dgl_length));
2217
2218	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2219	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2220	/*
2221	 * Bit 0 of flags stores whether the DDP buffer is completed.
2222	 * Note that other parts of the code depend on this being in bit 0.
2223	 */
2224	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2225		panic("spurious ddp completion");
2226	} else {
2227		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2228		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2229			q->cur_buf ^= 1;                     /* flip buffers */
2230	}
2231
2232	if (bsp->flags & DDP_BF_NOCOPY) {
2233		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2234		bsp->flags &= ~DDP_BF_NOCOPY;
2235	}
2236
2237	if (ddp_report & F_DDP_PSH)
2238		m->m_ddp_flags |= DDP_BF_PSH;
2239	if (nomoredata)
2240		m->m_ddp_flags |= DDP_BF_NODATA;
2241
2242#ifdef notyet
2243	skb_reset_transport_header(skb);
2244	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2245#endif
2246	SBAPPEND(rcv, m);
2247
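	/*
	 * Wake the receiver if the placement carried a PSH, if a user
	 * (NOCOPY) buffer just completed, or if this was ordinary
	 * kernel-buffer data; the only deferred case is a partially filled
	 * user buffer without PSH, where more placements are expected first.
	 */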
2248	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2249	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2250		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2251		so_sorwakeup_locked(so);
2252	else
2253		sockbuf_unlock(rcv);
2254}
2255
2256#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2257		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2258		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2259		 F_DDP_INVALID_PPOD)
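
/*
 * Any of the bits above marks a placement as fatal.  Note the mask must be
 * applied to the host-order word, i.e. ntohl(hdr->ddpvld_status) & DDP_ERR,
 * as do_rx_data_ddp() does below.
 */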
2260
2261/*
2262 * Handler for RX_DATA_DDP CPL messages.
2263 */
2264static int
2265do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2266{
2267	struct toepcb *toep = ctx;
2268	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2269
2270	VALIDATE_SOCK(so);
2271
2272	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2273		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2274		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2275		return (CPL_RET_BUF_DONE);
2276	}
2277#if 0
2278	skb->h.th = tcphdr_skb->h.th;
2279#endif
2280	new_rx_data_ddp(toep, m);
2281	return (0);
2282}
2283
2284static void
2285process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2286{
2287	struct tcpcb *tp = toep->tp_tp;
2288	struct socket *so;
2289	struct ddp_state *q;
2290	struct ddp_buf_state *bsp;
2291	struct cpl_rx_ddp_complete *hdr;
2292	unsigned int ddp_report, buf_idx, when, delack_mode;
2293	int nomoredata = 0;
2294	struct sockbuf *rcv;
2295
2296	inp_wlock(tp->t_inpcb);
2297	so = inp_inpcbtosocket(tp->t_inpcb);
2298
2299	if (__predict_false(so_no_receive(so))) {
2300		struct inpcb *inp = so_sotoinpcb(so);
2301
2302		handle_excess_rx(toep, m);
2303		inp_wunlock(inp);
2304		return;
2305	}
2306	q = &toep->tp_ddp_state;
2307	hdr = cplhdr(m);
2308	ddp_report = ntohl(hdr->ddp_report);
2309	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2310	m->m_pkthdr.csum_data = tp->rcv_nxt;
2311
2312	rcv = so_sockbuf_rcv(so);
2313	sockbuf_lock(rcv);
2314
2315	bsp = &q->buf_state[buf_idx];
2316	when = bsp->cur_offset;
2317	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2318	tp->rcv_nxt += m->m_len;
2319	tp->t_rcvtime = ticks;
2320
2321	delack_mode = G_DDP_DACK_MODE(ddp_report);
2322	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2323		toep->tp_delack_mode = delack_mode;
2324		toep->tp_delack_seq = tp->rcv_nxt;
2325	}
2326#ifdef notyet
2327	skb_reset_transport_header(skb);
2328	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2329#endif
2330	inp_wunlock(tp->t_inpcb);
2331
2332	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2333	CTR5(KTR_TOM,
2334		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2335		  "ddp_report 0x%x offset %u, len %u",
2336		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2337		   G_DDP_OFFSET(ddp_report), m->m_len);
2338
2339	bsp->cur_offset += m->m_len;
2340
2341	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2342		q->cur_buf ^= 1;                     /* flip buffers */
2343		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2344			nomoredata = 1;
2345	}
2346
2347	CTR4(KTR_TOM,
2348		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2349		  "ddp_report %u offset %u",
2350		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2351		   G_DDP_OFFSET(ddp_report));
2352
2353	m->m_ddp_gl = (unsigned char *)bsp->gl;
2354	m->m_flags |= M_DDP;
2355	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2356	if (bsp->flags & DDP_BF_NOCOPY)
2357		bsp->flags &= ~DDP_BF_NOCOPY;
2358	if (nomoredata)
2359		m->m_ddp_flags |= DDP_BF_NODATA;
2360
2361	SBAPPEND(rcv, m);
2362	if ((so_state_get(so) & SS_NOFDREF) == 0)
2363		so_sorwakeup_locked(so);
2364	else
2365		sockbuf_unlock(rcv);
2366}
2367
2368/*
2369 * Handler for RX_DDP_COMPLETE CPL messages.
2370 */
2371static int
2372do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2373{
2374	struct toepcb *toep = ctx;
2375
2376	VALIDATE_SOCK(so);
2377#if 0
2378	skb->h.th = tcphdr_skb->h.th;
2379#endif
2380	process_ddp_complete(toep, m);
2381	return (0);
2382}
2383
2384/*
2385 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2386 * socket state before calling tcp_time_wait to comply with its expectations.
2387 */
2388static void
2389enter_timewait(struct tcpcb *tp)
2390{
2391	/*
2392	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2393	 * process peer_close because we don't want to carry the peer FIN in
2394	 * the socket's receive queue and if we increment rcv_nxt without
2395	 * having the FIN in the receive queue we'll confuse facilities such
2396	 * as SIOCINQ.
2397	 */
2398	inp_wlock(tp->t_inpcb);
2399	tp->rcv_nxt++;
2400
2401	tp->ts_recent_age = 0;	     /* defeat recycling */
2402	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2403	inp_wunlock(tp->t_inpcb);
2404	tcp_offload_twstart(tp);
2405}
2406
2407static void
2408enter_timewait_disconnect(struct tcpcb *tp)
2409{
2410	/*
2411	 * Same rcv_nxt/metrics adjustments as enter_timewait(), but enter
2412	 * TIME_WAIT via the disconnecting variant of the transition.
2413	 */
2417	inp_wlock(tp->t_inpcb);
2418	tp->rcv_nxt++;
2419
2420	tp->ts_recent_age = 0;	     /* defeat recycling */
2421	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2422	inp_wunlock(tp->t_inpcb);
2423	tcp_offload_twstart_disconnect(tp);
2424}
2425
2426/*
2427 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2428 * function deals with the data that may be reported along with the FIN.
2429 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2430 * perform normal FIN-related processing.  In the latter case 1 indicates that
2431 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
2432 * that the mbuf can be freed.
2433 */
2434static int
2435handle_peer_close_data(struct socket *so, struct mbuf *m)
2436{
2437	struct tcpcb *tp = so_sototcpcb(so);
2438	struct toepcb *toep = tp->t_toe;
2439	struct ddp_state *q;
2440	struct ddp_buf_state *bsp;
2441	struct cpl_peer_close *req = cplhdr(m);
2442	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2443	struct sockbuf *rcv;
2444
2445	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2446		return (0);
2447
2448	CTR0(KTR_TOM, "handle_peer_close_data");
2449	if (__predict_false(so_no_receive(so))) {
2450		handle_excess_rx(toep, m);
2451
2452		/*
2453		 * Although we discard the data we want to process the FIN so
2454		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2455		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2456		 * may be what will close the connection.  We return 1 because
2457		 * handle_excess_rx() already freed the packet.
2458		 */
2459		return (1);
2460	}
2461
2462	inp_lock_assert(tp->t_inpcb);
2463	q = &toep->tp_ddp_state;
2464	rcv = so_sockbuf_rcv(so);
2465	sockbuf_lock(rcv);
2466
2467	bsp = &q->buf_state[q->cur_buf];
2468	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2469	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2470	m->m_ddp_gl = (unsigned char *)bsp->gl;
2471	m->m_flags |= M_DDP;
2472	m->m_cur_offset = bsp->cur_offset;
2473	m->m_ddp_flags =
2474	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2475	m->m_seq = tp->rcv_nxt;
2476	tp->rcv_nxt = rcv_nxt;
2477	bsp->cur_offset += m->m_pkthdr.len;
2478	if (!(bsp->flags & DDP_BF_NOFLIP))
2479		q->cur_buf ^= 1;
2480#ifdef notyet
2481	skb_reset_transport_header(skb);
2482	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2483#endif
2484	tp->t_rcvtime = ticks;
2485	SBAPPEND(rcv, m);
2486	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2487		so_sorwakeup_locked(so);
2488	else
2489		sockbuf_unlock(rcv);
2490
2491	return (1);
2492}
2493
2494/*
2495 * Handle a peer FIN.
2496 */
2497static void
2498do_peer_fin(struct toepcb *toep, struct mbuf *m)
2499{
2500	struct socket *so;
2501	struct tcpcb *tp = toep->tp_tp;
2502	int keep, action;
2503
2504	action = keep = 0;
2505	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2506	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2507		printf("abort_pending set\n");
2508
2509		goto out;
2510	}
2511	inp_wlock(tp->t_inpcb);
2512	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2513	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2514		keep = handle_peer_close_data(so, m);
2515		if (keep < 0) {
2516			inp_wunlock(tp->t_inpcb);
2517			return;
2518		}
2519	}
2520	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2521		socantrcvmore(so);
2522		/*
2523		 * If connection is half-synchronized
2524		 * (ie NEEDSYN flag on) then delay ACK,
2525		 * so it may be piggybacked when SYN is sent.
2526		 * Otherwise, since we received a FIN then no
2527		 * more input can be expected, send ACK now.
2528		 */
2529		if (tp->t_flags & TF_NEEDSYN)
2530			tp->t_flags |= TF_DELACK;
2531		else
2532			tp->t_flags |= TF_ACKNOW;
2533		tp->rcv_nxt++;
2534	}
2535
2536	switch (tp->t_state) {
2537	case TCPS_SYN_RECEIVED:
2538		tp->t_starttime = ticks;
2539	/* FALLTHROUGH */
2540	case TCPS_ESTABLISHED:
2541		tp->t_state = TCPS_CLOSE_WAIT;
2542		break;
2543	case TCPS_FIN_WAIT_1:
2544		tp->t_state = TCPS_CLOSING;
2545		break;
2546	case TCPS_FIN_WAIT_2:
2547		/*
2548		 * If we've sent an abort_req we must have sent it too late,
2549		 * HW will send us a reply telling us so, and this peer_close
2550		 * is really the last message for this connection and needs to
2551		 * be treated as an abort_rpl, i.e., transition the connection
2552		 * to TCP_CLOSE (note that the host stack does this at the
2553		 * time of generating the RST but we must wait for HW).
2554		 * Otherwise we enter TIME_WAIT.
2555		 */
2556		t3_release_offload_resources(toep);
2557		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2558			action = TCP_CLOSE;
2559		} else {
2560			action = TCP_TIMEWAIT;
2561		}
2562		break;
2563	default:
2564		log(LOG_ERR,
2565		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2566		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2567	}
2568	inp_wunlock(tp->t_inpcb);
2569
2570	DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so,
2571	    sb_notify(so_sockbuf_rcv(so)), so_sockbuf_rcv(so)->sb_flags);
2572
2573	if (action == TCP_TIMEWAIT) {
2574		enter_timewait(tp);
2575	} else if (action == TCP_DROP) {
2576		tcp_offload_drop(tp, 0);
2577	} else if (action == TCP_CLOSE) {
2578		tcp_offload_close(tp);
2579	}
2580
2581#ifdef notyet
2582	/* Do not send POLL_HUP for half duplex close. */
2583	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2584	    sk->sk_state == TCP_CLOSE)
2585		sk_wake_async(so, 1, POLL_HUP);
2586	else
2587		sk_wake_async(so, 1, POLL_IN);
2588#endif
2589
2590out:
2591	if (!keep)
2592		m_free(m);
2593}
2594
2595/*
2596 * Handler for PEER_CLOSE CPL messages.
2597 */
2598static int
2599do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2600{
2601	struct toepcb *toep = (struct toepcb *)ctx;
2602
2603	VALIDATE_SOCK(so);
2604
2605	do_peer_fin(toep, m);
2606	return (0);
2607}
2608
2609static void
2610process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2611{
2612	struct cpl_close_con_rpl *rpl = cplhdr(m);
2613	struct tcpcb *tp = toep->tp_tp;
2614	struct socket *so;
2615	int action = 0;
2616	struct sockbuf *rcv;
2617
2618	inp_wlock(tp->t_inpcb);
2619	so = inp_inpcbtosocket(tp->t_inpcb);
2620
2621	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2622
2623	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2624		inp_wunlock(tp->t_inpcb);
2625		goto out;
2626	}
2627
2628	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2629	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2630
2631	switch (tp->t_state) {
2632	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2633		t3_release_offload_resources(toep);
2634		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2635			action = TCP_CLOSE;
2636
2637		} else {
2638			action = TCP_TIMEWAIT;
2639		}
2640		break;
2641	case TCPS_LAST_ACK:
2642		/*
2643		 * In this state we don't care about pending abort_rpl.
2644		 * If we've sent abort_req it was post-close and was sent too
2645		 * late, this close_con_rpl is the actual last message.
2646		 */
2647		t3_release_offload_resources(toep);
2648		action = TCP_CLOSE;
2649		break;
2650	case TCPS_FIN_WAIT_1:
2651		/*
2652		 * If we can't receive any more
2653		 * data, then closing user can proceed.
2654		 * Starting the timer is contrary to the
2655		 * specification, but if we don't get a FIN
2656		 * we'll hang forever.
2657		 *
2658		 * XXXjl:
2659		 * we should release the tp also, and use a
2660		 * compressed state.
2661		 */
2662		if (so)
2663			rcv = so_sockbuf_rcv(so);
2664		else
2665			break;
2666
2667		if (rcv->sb_state & SBS_CANTRCVMORE) {
2668			int timeout;
2669
2670			if (so)
2671				soisdisconnected(so);
2672			timeout = (tcp_fast_finwait2_recycle) ?
2673			    tcp_finwait2_timeout : tcp_maxidle;
2674			tcp_timer_activate(tp, TT_2MSL, timeout);
2675		}
2676		tp->t_state = TCPS_FIN_WAIT_2;
2677		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2678		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2679			action = TCP_DROP;
2680		}
2681
2682		break;
2683	default:
2684		log(LOG_ERR,
2685		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2686		       toep->tp_toedev->tod_name, toep->tp_tid,
2687		       tp->t_state);
2688	}
2689	inp_wunlock(tp->t_inpcb);
2690
2691
2692	if (action == TCP_TIMEWAIT) {
2693		enter_timewait_disconnect(tp);
2694	} else if (action == TCP_DROP) {
2695		tcp_offload_drop(tp, 0);
2696	} else if (action == TCP_CLOSE) {
2697		tcp_offload_close(tp);
2698	}
2699out:
2700	m_freem(m);
2701}
2702
2703/*
2704 * Handler for CLOSE_CON_RPL CPL messages.
2705 */
2706static int
2707do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2708			    void *ctx)
2709{
2710	struct toepcb *toep = (struct toepcb *)ctx;
2711
2712	process_close_con_rpl(toep, m);
2713	return (0);
2714}
2715
2716/*
2717 * Process abort replies.  We only process these messages if we anticipate
2718 * them as the coordination between SW and HW in this area is somewhat lacking
2719 * and sometimes we get ABORT_RPLs after we are done with the connection that
2720 * originated the ABORT_REQ.
2721 */
2722static void
2723process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2724{
2725	struct tcpcb *tp = toep->tp_tp;
2726	struct socket *so;
2727	int needclose = 0;
2728
2729#ifdef T3_TRACE
2730	T3_TRACE1(TIDTB(toep->tp_tp),
2731		  "process_abort_rpl: GTS rpl pending %d",
2732		  !!(toep->tp_flags & TP_ABORT_RPL_PENDING));
2733#endif
2734
2735	inp_wlock(tp->t_inpcb);
2736	so = inp_inpcbtosocket(tp->t_inpcb);
2737
2738	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2739		/*
2740		 * XXX panic on tcpdrop
2741		 */
2742		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2743			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2744		else {
2745			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2746			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2747			    !is_t3a(toep->tp_toedev)) {
2748				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2749					panic("TP_ABORT_REQ_RCVD set");
2750				t3_release_offload_resources(toep);
2751				needclose = 1;
2752			}
2753		}
2754	}
2755	inp_wunlock(tp->t_inpcb);
2756
2757	if (needclose)
2758		tcp_offload_close(tp);
2759
2760	m_free(m);
2761}
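
/*
 * Note on the handshake above: on non-T3A parts the first ABORT_RPL is only
 * recorded (TP_ABORT_RPL_RCVD) and a second reply completes the teardown; on
 * T3A a single reply suffices, with the release skipped if an ABORT_REQ was
 * also received.
 */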
2762
2763/*
2764 * Handle an ABORT_RPL_RSS CPL message.
2765 */
2766static int
2767do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2768{
2769	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2770	struct toepcb *toep;
2771
2772	/*
2773	 * Ignore replies to post-close aborts indicating that the abort was
2774	 * requested too late.  These connections are terminated when we get
2775	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2776	 * arrives the TID is either no longer used or it has been recycled.
2777	 */
2778	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2779discard:
2780		m_free(m);
2781		return (0);
2782	}
2783
2784	toep = (struct toepcb *)ctx;
2785
2786        /*
2787	 * Sometimes we've already closed the socket, e.g., a post-close
2788	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2789	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2790	 * but FW turns the ABORT_REQ into a regular one and so we get
2791	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2792	 */
2793	if (!toep)
2794		goto discard;
2795
2796	if (toep->tp_tp == NULL) {
2797		log(LOG_NOTICE, "removing tid for abort\n");
2798		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2799		if (toep->tp_l2t)
2800			l2t_release(L2DATA(cdev), toep->tp_l2t);
2801
2802		toepcb_release(toep);
2803		goto discard;
2804	}
2805
2806	log(LOG_NOTICE, "toep=%p\n", toep);
2807	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2808
2809	toepcb_hold(toep);
2810	process_abort_rpl(toep, m);
2811	toepcb_release(toep);
2812	return (0);
2813}
2814
2815/*
2816 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2817 * indicate whether RST should be sent in response.
2818 */
2819static int
2820abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2821{
2822	struct tcpcb *tp = so_sototcpcb(so);
2823
2824	switch (abort_reason) {
2825	case CPL_ERR_BAD_SYN:
2826#if 0
2827		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2828#endif
2829	case CPL_ERR_CONN_RESET:
2830		// XXX need to handle SYN_RECV due to crossed SYNs
2831		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2832	case CPL_ERR_XMIT_TIMEDOUT:
2833	case CPL_ERR_PERSIST_TIMEDOUT:
2834	case CPL_ERR_FINWAIT2_TIMEDOUT:
2835	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2836#if 0
2837		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2838#endif
2839		return (ETIMEDOUT);
2840	default:
2841		return (EIO);
2842	}
2843}
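
/*
 * Summary of the mapping above:
 *
 *	CPL_ERR_BAD_SYN, CPL_ERR_CONN_RESET  -> ECONNRESET
 *	    (EPIPE instead if already in CLOSE_WAIT)
 *	CPL_ERR_*_TIMEDOUT                   -> ETIMEDOUT
 *	anything else                        -> EIO
 */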
2844
2845static inline void
2846set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2847{
2848	struct cpl_abort_rpl *rpl = cplhdr(m);
2849
2850	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2851	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2852	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2853
2854	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2855	rpl->cmd = cmd;
2856}
2857
2858static void
2859send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2860{
2861	struct mbuf *reply_mbuf;
2862	struct cpl_abort_req_rss *req = cplhdr(m);
2863
2864	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2865	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2866	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2867	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2868	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2869	m_free(m);
2870}
2871
2872/*
2873 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2874 */
2875static inline int
2876is_neg_adv_abort(unsigned int status)
2877{
2878	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2879	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2880}
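
/*
 * Negative advice (retransmit/persist trouble on the wire) is informational
 * only; do_abort_req() below drops such messages instead of tearing the
 * connection down.
 */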
2881
2882static void
2883send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2884{
2885	struct mbuf  *reply_mbuf;
2886	struct cpl_abort_req_rss *req = cplhdr(m);
2887
2888	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2889
2890	if (!reply_mbuf) {
2891		/* Defer the reply.  Stick rst_status into req->status. */
2892		req->status = rst_status;
2893		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2894		return;
2895	}
2896
2897	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2898	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2899	m_free(m);
2900
2901	/*
2902	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2903	 * these messages while ARP is pending.  For other connection states
2904	 * it's not a problem.
2905	 */
2906	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2907}
2908
2909#ifdef notyet
2910static void
2911cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2912{
2913	CXGB_UNIMPLEMENTED();
2914#ifdef notyet
2915	struct request_sock *req = child->sk_user_data;
2916
2917	inet_csk_reqsk_queue_removed(parent, req);
2918	synq_remove(tcp_sk(child));
2919	__reqsk_free(req);
2920	child->sk_user_data = NULL;
2921#endif
2922}
2923
2924
2925/*
2926 * Performs the actual work to abort a SYN_RECV connection.
2927 */
2928static void
2929do_abort_syn_rcv(struct socket *child, struct socket *parent)
2930{
2931	struct tcpcb *parenttp = so_sototcpcb(parent);
2932	struct tcpcb *childtp = so_sototcpcb(child);
2933
2934	/*
2935	 * If the server is still open we clean up the child connection,
2936	 * otherwise the server already did the clean up as it was purging
2937	 * its SYN queue and the skb was just sitting in its backlog.
2938	 */
2939	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2940		cleanup_syn_rcv_conn(child, parent);
2941		inp_wlock(childtp->t_inpcb);
2942		t3_release_offload_resources(childtp->t_toe);
2943		inp_wunlock(childtp->t_inpcb);
2944		tcp_offload_close(childtp);
2945	}
2946}
2947#endif
2948
2949/*
2950 * Handle abort requests for a SYN_RECV connection.  These need extra work
2951 * because the socket is on its parent's SYN queue.
2952 */
2953static int
2954abort_syn_rcv(struct socket *so, struct mbuf *m)
2955{
2956	CXGB_UNIMPLEMENTED();
2957#ifdef notyet
2958	struct socket *parent;
2959	struct toedev *tdev = toep->tp_toedev;
2960	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2961	struct socket *oreq = so->so_incomp;
2962	struct t3c_tid_entry *t3c_stid;
2963	struct tid_info *t;
2964
2965	if (!oreq)
2966		return -1;        /* somehow we are not on the SYN queue */
2967
2968	t = &(T3C_DATA(cdev))->tid_maps;
2969	t3c_stid = lookup_stid(t, oreq->ts_recent);
2970	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2971
2972	so_lock(parent);
2973	do_abort_syn_rcv(so, parent);
2974	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2975	so_unlock(parent);
2976#endif
2977	return (0);
2978}
2979
2980/*
2981 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2982 * request except that we need to reply to it.
2983 */
2984static void
2985process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2986{
2987	int rst_status = CPL_ABORT_NO_RST;
2988	const struct cpl_abort_req_rss *req = cplhdr(m);
2989	struct tcpcb *tp = toep->tp_tp;
2990	struct socket *so;
2991	int needclose = 0;
2992
2993	inp_wlock(tp->t_inpcb);
2994	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2995	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2996		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2997		m_free(m);
2998		goto skip;
2999	}
3000
3001	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3002	/*
3003	 * Three cases to consider:
3004	 * a) We haven't sent an abort_req; close the connection.
3005	 * b) We have sent a post-close abort_req that will get to TP too late
3006	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3007	 *    be ignored and the connection should be closed now.
3008	 * c) We have sent a regular abort_req that will get to TP too late.
3009	 *    That will generate an abort_rpl with status 0, wait for it.
3010	 */
3011	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3012	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3013		int error;
3014
3015		error = abort_status_to_errno(so, req->status,
3016		    &rst_status);
3017		so_error_set(so, error);
3018
3019		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3020			so_sorwakeup(so);
3021		/*
3022		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3023		 * returns 0 it has taken care of the abort.
3024		 */
3025		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3026			goto skip;
3027
3028		t3_release_offload_resources(toep);
3029		needclose = 1;
3030	}
3031	inp_wunlock(tp->t_inpcb);
3032
3033	if (needclose)
3034		tcp_offload_close(tp);
3035
3036	send_abort_rpl(m, tdev, rst_status);
3037	return;
3038skip:
3039	inp_wunlock(tp->t_inpcb);
3040}
3041
3042/*
3043 * Handle an ABORT_REQ_RSS CPL message.
3044 */
3045static int
3046do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3047{
3048	const struct cpl_abort_req_rss *req = cplhdr(m);
3049	struct toepcb *toep = (struct toepcb *)ctx;
3050
3051	if (is_neg_adv_abort(req->status)) {
3052		m_free(m);
3053		return (0);
3054	}
3055
3056	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3057
3058	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3059		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3060		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3061
3062		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3063		if (toep->tp_l2t)
3064			l2t_release(L2DATA(cdev), toep->tp_l2t);
3065
3066		/*
3067		 *  Unhook
3068		 */
3069		toep->tp_tp->t_toe = NULL;
3070		toep->tp_tp->t_flags &= ~TF_TOE;
3071		toep->tp_tp = NULL;
3072		/*
3073		 * XXX need to call syncache_chkrst - but we don't
3074		 * have a way of doing that yet
3075		 */
3076		toepcb_release(toep);
3077		log(LOG_ERR, "abort for unestablished connection :-(\n");
3078		return (0);
3079	}
3080	if (toep->tp_tp == NULL) {
3081		log(LOG_NOTICE, "disconnected toepcb\n");
3082		/* should be freed momentarily */
3083		return (0);
3084	}
3085
3086
3087	toepcb_hold(toep);
3088	process_abort_req(toep, m, toep->tp_toedev);
3089	toepcb_release(toep);
3090	return (0);
3091}
3092#ifdef notyet
3093static void
3094pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3095{
3096	struct toedev *tdev = TOE_DEV(parent);
3097
3098	do_abort_syn_rcv(child, parent);
3099	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3100		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3101
3102		rpl->opt0h = htonl(F_TCAM_BYPASS);
3103		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3104		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3105	} else
3106		m_free(m);
3107}
3108#endif
3109static void
3110handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3111{
3112	CXGB_UNIMPLEMENTED();
3113
3114#ifdef notyet
3115	struct t3cdev *cdev;
3116	struct socket *parent;
3117	struct socket *oreq;
3118	struct t3c_tid_entry *t3c_stid;
3119	struct tid_info *t;
3120	struct tcpcb *otp, *tp = so_sototcpcb(so);
3121	struct toepcb *toep = tp->t_toe;
3122
3123	/*
3124	 * If the connection is being aborted due to the parent listening
3125	 * socket going away there's nothing to do, the ABORT_REQ will close
3126	 * the connection.
3127	 */
3128	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3129		m_free(m);
3130		return;
3131	}
3132
3133	oreq = so->so_incomp;
3134	otp = so_sototcpcb(oreq);
3135
3136	cdev = T3C_DEV(so);
3137	t = &(T3C_DATA(cdev))->tid_maps;
3138	t3c_stid = lookup_stid(t, otp->ts_recent);
3139	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3140
3141	so_lock(parent);
3142	pass_open_abort(so, parent, m);
3143	so_unlock(parent);
3144#endif
3145}
3146
3147/*
3148 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3149 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3150 * connection.
3151 */
3152static void
3153pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3154{
3155
3156#ifdef notyet
3157	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3158	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3159#endif
3160	handle_pass_open_arp_failure(m_get_socket(m), m);
3161}
3162
3163/*
3164 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3165 */
3166static void
3167mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3168{
3169	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3170	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3171	unsigned int tid = GET_TID(req);
3172
3173	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3174	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3175	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3176	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3177	rpl->opt0h = htonl(F_TCAM_BYPASS);
3178	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3179	rpl->opt2 = 0;
3180	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3181}
3182
3183/*
3184 * Send a deferred reject to an accept request.
3185 */
3186static void
3187reject_pass_request(struct toedev *tdev, struct mbuf *m)
3188{
3189	struct mbuf *reply_mbuf;
3190
3191	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3192	mk_pass_accept_rpl(reply_mbuf, m);
3193	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3194	m_free(m);
3195}
3196
3197static void
3198handle_syncache_event(int event, void *arg)
3199{
3200	struct toepcb *toep = arg;
3201
3202	switch (event) {
3203	case TOE_SC_ENTRY_PRESENT:
3204		/*
3205		 * entry already exists - free toepcb
3206		 * and l2t
3207		 */
3208		printf("syncache entry present\n");
3209		toepcb_release(toep);
3210		break;
3211	case TOE_SC_DROP:
3212		/*
3213		 * The syncache has given up on this entry:
3214		 * either it timed out or it was evicted.
3215		 * We need to explicitly release the tid.
3216		 */
3217		printf("syncache entry dropped\n");
3218		toepcb_release(toep);
3219		break;
3220	default:
3221		log(LOG_ERR, "unknown syncache event %d\n", event);
3222		break;
3223	}
3224}
3225
3226static void
3227syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3228{
3229	struct in_conninfo inc;
3230	struct tcpopt to;
3231	struct tcphdr th;
3232	struct inpcb *inp;
3233	int mss, wsf, sack, ts;
3234	uint32_t rcv_isn = ntohl(req->rcv_isn);
3235
3236	bzero(&to, sizeof(struct tcpopt));
3237	inp = so_sotoinpcb(lso);
3238
3239	/*
3240	 * Fill out information for entering us into the syncache
3241	 */
3242	inc.inc_fport = th.th_sport = req->peer_port;
3243	inc.inc_lport = th.th_dport = req->local_port;
3244	th.th_seq = req->rcv_isn;
3245	th.th_flags = TH_SYN;
3246
3247	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3248
3249
3250	inc.inc_isipv6 = 0;
3251	inc.inc_len = 0;
3252	inc.inc_faddr.s_addr = req->peer_ip;
3253	inc.inc_laddr.s_addr = req->local_ip;
3254
3255	DPRINTF("syncache add of %d:%d %d:%d\n",
3256	    ntohl(req->local_ip), ntohs(req->local_port),
3257	    ntohl(req->peer_ip), ntohs(req->peer_port));
3258
3259	mss = req->tcp_options.mss;
3260	wsf = req->tcp_options.wsf;
3261	ts = req->tcp_options.tstamp;
3262	sack = req->tcp_options.sack;
3263	to.to_mss = mss;
3264	to.to_wscale = wsf;
3265	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3266	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3267}
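
/*
 * Example of the option translation above: a SYN that arrived with MSS 1460,
 * window scale 7, timestamps and SACK-permitted shows up as
 * tcp_options.{mss,wsf,tstamp,sack} = {1460, 7, 1, 1} and is handed to the
 * syncache as to_mss 1460, to_wscale 7, and
 * to_flags = TOF_MSS | TOF_SCALE | TOF_TS | TOF_SACKPERM.
 */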
3268
3269
3270/*
3271 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3272 * lock held.  Note that the sock here is a listening socket that is not owned
3273 * by the TOE.
3274 */
3275static void
3276process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3277    struct listen_ctx *lctx)
3278{
3279	int rt_flags;
3280	struct l2t_entry *e;
3281	struct iff_mac tim;
3282	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3283	struct cpl_pass_accept_rpl *rpl;
3284	struct cpl_pass_accept_req *req = cplhdr(m);
3285	unsigned int tid = GET_TID(req);
3286	struct tom_data *d = TOM_DATA(tdev);
3287	struct t3cdev *cdev = d->cdev;
3288	struct tcpcb *tp = so_sototcpcb(so);
3289	struct toepcb *newtoep;
3290	struct rtentry *dst;
3291	struct sockaddr_in nam;
3292	struct t3c_data *td = T3C_DATA(cdev);
3293
3294	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3295	if (__predict_false(reply_mbuf == NULL)) {
3296		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3297			t3_defer_reply(m, tdev, reject_pass_request);
3298		else {
3299			cxgb_queue_tid_release(cdev, tid);
3300			m_free(m);
3301		}
3302		DPRINTF("failed to get reply_mbuf\n");
3303
3304		goto out;
3305	}
3306
3307	if (tp->t_state != TCPS_LISTEN) {
3308		DPRINTF("socket not in listen state\n");
3309
3310		goto reject;
3311	}
3312
3313	tim.mac_addr = req->dst_mac;
3314	tim.vlan_tag = ntohs(req->vlan_tag);
3315	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3316		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3317		goto reject;
3318	}
3319
3320#ifdef notyet
3321	/*
3322	 * XXX do route lookup to confirm that we're still listening on this
3323	 * address
3324	 */
3325	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3326			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3327		goto reject;
3328	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3329		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3330	dst_release(skb->dst);	// done with the input route, release it
3331	skb->dst = NULL;
3332
3333	if ((rt_flags & RTF_LOCAL) == 0)
3334		goto reject;
3335#endif
3336	/*
3337	 * XXX
3338	 */
3339	rt_flags = RTF_LOCAL;
3340	if ((rt_flags & RTF_LOCAL) == 0)
3341		goto reject;
3342
3343	/*
3344	 * Calculate values and add to syncache
3345	 */
3346
3347	newtoep = toepcb_alloc();
3348	if (newtoep == NULL)
3349		goto reject;
3350
3351	bzero(&nam, sizeof(struct sockaddr_in));
3352
3353	nam.sin_len = sizeof(struct sockaddr_in);
3354	nam.sin_family = AF_INET;
3355	nam.sin_addr.s_addr = req->peer_ip;
3356	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3357
3358	if (dst == NULL) {
3359		printf("failed to find route\n");
3360		goto reject;
3361	}
3362	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3363	    (struct sockaddr *)&nam);
3364	if (e == NULL) {
3365		DPRINTF("failed to get l2t\n");
3366		goto reject;
3367	}
3367	/*
3368	 * Point to our listen socket until accept
3369	 */
3370	newtoep->tp_tp = tp;
3371	newtoep->tp_flags = TP_SYN_RCVD;
3372	newtoep->tp_tid = tid;
3373	newtoep->tp_toedev = tdev;
3374	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3375
3376	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3377	so_lock(so);
3378	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3379	so_unlock(so);
3380
3381	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3382		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3383
3384	if (newtoep->tp_ulp_mode) {
3385		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3386
3387		if (ddp_mbuf == NULL)
3388			newtoep->tp_ulp_mode = 0;
3389	}
3390
3391	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3392	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3393	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3394	/*
3395	 * XXX workaround for lack of syncache drop
3396	 */
3397	toepcb_hold(newtoep);
3398	syncache_add_accept_req(req, so, newtoep);
3399
3400	rpl = cplhdr(reply_mbuf);
3401	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3402	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3403	rpl->wr.wr_lo = 0;
3404	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3405	rpl->opt2 = htonl(calc_opt2(so, tdev));
3406	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3407	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3408
3409	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3410	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3411	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3412				  CPL_PASS_OPEN_ACCEPT);
3413
3414	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3415
3416	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3417
3418	l2t_send(cdev, reply_mbuf, e);
3419	m_free(m);
3420	if (newtoep->tp_ulp_mode) {
3421		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3422				V_TF_DDP_OFF(1) |
3423				TP_DDP_TIMER_WORKAROUND_MASK,
3424				V_TF_DDP_OFF(1) |
3425		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3426	} else
3427		printf("not offloading\n");
3428
3431	return;
3432reject:
3433	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3434		mk_pass_accept_rpl(reply_mbuf, m);
3435	else
3436		mk_tid_release(reply_mbuf, newtoep, tid);
3437	cxgb_ofld_send(cdev, reply_mbuf);
3438	m_free(m);
3439out:
3440#if 0
3441	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3442#else
3443	return;
3444#endif
3445}
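
/*
 * Passive-open flow (sketch): HW delivers CPL_PASS_ACCEPT_REQ, we allocate a
 * toepcb, enter the embryonic connection into the syncache, and answer with
 * CPL_PASS_ACCEPT_RPL (accept or reject).  When the 3-way handshake
 * completes, HW sends CPL_PASS_ESTABLISH and do_pass_establish() expands the
 * syncache entry into a full socket.
 */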
3446
3447/*
3448 * Handle a CPL_PASS_ACCEPT_REQ message.
3449 */
3450static int
3451do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3452{
3453	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3454	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3455	struct tom_data *d = listen_ctx->tom_data;
3456
3457#if VALIDATE_TID
3458	struct cpl_pass_accept_req *req = cplhdr(m);
3459	unsigned int tid = GET_TID(req);
3460	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3461
3462	if (unlikely(!lsk)) {
3463		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3464		       cdev->name,
3465		       (unsigned long)((union listen_entry *)ctx -
3466					t->stid_tab));
3467		return CPL_RET_BUF_DONE;
3468	}
3469	if (unlikely(tid >= t->ntids)) {
3470		printk(KERN_ERR "%s: passive open TID %u too large\n",
3471		       cdev->name, tid);
3472		return CPL_RET_BUF_DONE;
3473	}
3474	/*
3475	 * For T3A the current user of the TID may have closed but its last
3476	 * message(s) may have been backlogged so the TID appears to be still
3477	 * in use.  Just take the TID away, the connection can close at its
3478	 * own leisure.  For T3B this situation is a bug.
3479	 */
3480	if (!valid_new_tid(t, tid) &&
3481	    cdev->type != T3A) {
3482		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3483		       cdev->name, tid);
3484		return CPL_RET_BUF_DONE;
3485	}
3486#endif
3487
3488	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3489	return (0);
3490}
3491
3492/*
3493 * Called when a connection is established to translate the TCP options
3494 * reported by HW to FreeBSD's native format.
3495 */
3496static void
3497assign_rxopt(struct socket *so, unsigned int opt)
3498{
3499	struct tcpcb *tp = so_sototcpcb(so);
3500	struct toepcb *toep = tp->t_toe;
3501	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3502
3503	inp_lock_assert(tp->t_inpcb);
3504
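	/*
	 * The opt word carries an MTU-table index; subtract the fixed 40
	 * bytes of IP + TCP headers to recover the MSS clamp.
	 */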
3505	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3506	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3507	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3508	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3509	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3510	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3511		tp->rcv_scale = tp->request_r_scale;
3512}
3513
3514/*
3515 * Completes some final bits of initialization for just established connections
3516 * and changes their state to TCP_ESTABLISHED.
3517 *
3518 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3519 */
3520static void
3521make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3522{
3523	struct tcpcb *tp = so_sototcpcb(so);
3524	struct toepcb *toep = tp->t_toe;
3525
3526	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3527	assign_rxopt(so, opt);
3528
3529	/*
3530	 * XXX pr_ctloutput is not replaced with t3_ctloutput yet.
3531	 */
3533#ifdef notyet
3534	so->so_proto->pr_ctloutput = t3_ctloutput;
3535#endif
3536
3537#if 0
3538	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3539#endif
3540	/*
3541	 * XXX not clear what rcv_wup maps to
3542	 */
3543	/*
3544	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3545	 * pass through opt0.
3546	 */
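	/*
	 * Illustration (assuming the opt0 RCV_BUFSIZ field counts KB, hence
	 * the << 10): if select_rcv_wnd() chose a window larger than
	 * (M_RCV_BUFSIZ << 10) bytes, the excess is pre-subtracted from
	 * tp_rcv_wup so the first RX_DATA_ACK returns it as credits.
	 */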
3547	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3548		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3549
3550	dump_toepcb(toep);
3551
3552#ifdef notyet
3553/*
3554 * no clean interface for marking ARP up to date
3555 */
3556	dst_confirm(sk->sk_dst_cache);
3557#endif
3558	tp->t_starttime = ticks;
3559	tp->t_state = TCPS_ESTABLISHED;
3560	soisconnected(so);
3561}
3562
3563static int
3564syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3565{
3566
3567	struct in_conninfo inc;
3568	struct tcpopt to;
3569	struct tcphdr th;
3570	int mss, wsf, sack, ts;
3571	struct mbuf *m = NULL;
3572	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3573	unsigned int opt;
3574
3575#ifdef MAC
3576#error	"no MAC support"
3577#endif
3578
3579	opt = ntohs(req->tcp_opt);
3580
3581	bzero(&to, sizeof(struct tcpopt));
3582
3583	/*
3584	 * Fill out information for entering us into the syncache
3585	 */
3586	inc.inc_fport = th.th_sport = req->peer_port;
3587	inc.inc_lport = th.th_dport = req->local_port;
3588	th.th_seq = req->rcv_isn;
3589	th.th_flags = TH_ACK;
3590
3591	inc.inc_isipv6 = 0;
3592	inc.inc_len = 0;
3593	inc.inc_faddr.s_addr = req->peer_ip;
3594	inc.inc_laddr.s_addr = req->local_ip;
3595
3596	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3597	wsf  = G_TCPOPT_WSCALE_OK(opt);
3598	ts   = G_TCPOPT_TSTAMP(opt);
3599	sack = G_TCPOPT_SACK(opt);
3600
3601	to.to_mss = mss;
3602	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3603	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3604
3605	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3606	    ntohl(req->local_ip), ntohs(req->local_port),
3607	    ntohl(req->peer_ip), ntohs(req->peer_port),
3608	    mss, wsf, ts, sack);
3609	return syncache_offload_expand(&inc, &to, &th, so, m);
3610}
3611
3612
3613/*
3614 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3615 * if we are in TCP_SYN_RECV due to crossed SYNs
3616 */
3617static int
3618do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3619{
3620	struct cpl_pass_establish *req = cplhdr(m);
3621	struct toepcb *toep = (struct toepcb *)ctx;
3622	struct tcpcb *tp = toep->tp_tp;
3623	struct socket *so, *lso;
3624	struct t3c_data *td = T3C_DATA(cdev);
3625	struct sockbuf *snd, *rcv;
3626
3627
3628	struct toedev *tdev;
3629
3630	/* Complete socket initialization now that we have the SND_ISN. */
3631	tdev = toep->tp_toedev;
3634	inp_wlock(tp->t_inpcb);
3635
3636	/*
3637	 *
3638	 * XXX need to add reference while we're manipulating
3639	 */
3640	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3641
3642	inp_wunlock(tp->t_inpcb);
3643
3648	so_lock(so);
3649	LIST_REMOVE(toep, synq_entry);
3650	so_unlock(so);
3651
3652	if (!syncache_expand_establish_req(req, &so, toep)) {
3653		/*
3654		 * No entry
3655		 */
3656		CXGB_UNIMPLEMENTED();
3657	}
3658	if (so == NULL) {
3659		/*
3660		 * Couldn't create the socket
3661		 */
3662		CXGB_UNIMPLEMENTED();
3663	}
3664
3665	tp = so_sototcpcb(so);
3666	inp_wlock(tp->t_inpcb);
3667
3668	/* Fetch the sockbufs from the socket the syncache handed back. */
3669	snd = so_sockbuf_snd(so);
3670	rcv = so_sockbuf_rcv(so);
3671	snd->sb_flags |= SB_NOCOALESCE;
3672	rcv->sb_flags |= SB_NOCOALESCE;
3671
3672	toep->tp_tp = tp;
3673	toep->tp_flags = 0;
3674	tp->t_toe = toep;
3675	reset_wr_list(toep);
3676	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3677	tp->rcv_nxt = toep->tp_copied_seq;
3678	install_offload_ops(so);
3679
3680	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3681	toep->tp_wr_unacked = 0;
3682	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3683	toep->tp_qset_idx = 0;
3684	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3685
3686	/*
3687	 * XXX Cancel any keep alive timer
3688	 */
3689
3690	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3691
3692	/*
3693	 * XXX workaround for lack of syncache drop
3694	 */
3695	toepcb_release(toep);
3696	inp_wunlock(tp->t_inpcb);
3697
3698	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3699	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3700#ifdef notyet
3701	/*
3702	 * XXX not sure how these checks map to us
3703	 */
3704	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3705		sk->sk_state_change(sk);
3706		sk_wake_async(so, 0, POLL_OUT);
3707	}
3708	/*
3709	 * The state for the new connection is now up to date.
3710	 * Next check if we should add the connection to the parent's
3711	 * accept queue.  When the parent closes it resets connections
3712	 * on its SYN queue, so check if we are being reset.  If so we
3713	 * don't need to do anything more, the coming ABORT_RPL will
3714	 * destroy this socket.  Otherwise move the connection to the
3715	 * accept queue.
3716	 *
3717	 * Note that we reset the synq before closing the server so if
3718	 * we are not being reset the stid is still open.
3719	 */
3720	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3721		__kfree_skb(skb);
3722		goto unlock;
3723	}
3724#endif
3725	m_free(m);
3726
3727	return (0);
3728}
3729
3730/*
3731 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3732 * and send them to the TOE.
3733 */
3734static void
3735fixup_and_send_ofo(struct toepcb *toep)
3736{
3737	struct mbuf *m;
3738	struct toedev *tdev = toep->tp_toedev;
3739	struct tcpcb *tp = toep->tp_tp;
3740	unsigned int tid = toep->tp_tid;
3741
3742	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3743
3744	inp_lock_assert(tp->t_inpcb);
3745	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3746		/*
3747		 * A variety of messages can be waiting but the fields we'll
3748		 * be touching are common to all so any message type will do.
3749		 */
3750		struct cpl_close_con_req *p = cplhdr(m);
3751
3752		p->wr.wr_lo = htonl(V_WR_TID(tid));
3753		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3754		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3755	}
3756}
3757
3758/*
3759 * Updates socket state from an active establish CPL message.  Runs with the
3760 * socket lock held.
3761 */
3762static void
3763socket_act_establish(struct socket *so, struct mbuf *m)
3764{
3765	struct cpl_act_establish *req = cplhdr(m);
3766	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3767	struct tcpcb *tp = so_sototcpcb(so);
3768	struct toepcb *toep = tp->t_toe;
3769
3770	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3771		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3772		    toep->tp_tid, tp->t_state);
3773
3774	tp->ts_recent_age = ticks;
3775	tp->irs = tp->rcv_nxt = rcv_isn;
3776	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3777
3778	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3779
3780	/*
3781	 * Now that we finally have a TID send any CPL messages that we had to
3782	 * defer for lack of a TID.
3783	 */
3784	if (mbufq_len(&toep->out_of_order_queue))
3785		fixup_and_send_ofo(toep);
3786
3787	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3788		/*
3789		 * XXX does this even make sense?
3790		 */
3791		so_sorwakeup(so);
3792	}
3793	m_free(m);
3794#ifdef notyet
3795/*
3796 * XXX assume no write requests permitted while socket connection is
3797 * incomplete
3798 */
3799	/*
3800	 * Currently the send queue must be empty at this point because the
3801	 * socket layer does not send anything before a connection is
3802	 * established.  To be future proof though we handle the possibility
3803	 * that there are pending buffers to send (either TX_DATA or
3804	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3805	 * buffers according to the just learned write_seq, and then we send
3806	 * them on their way.
3807	 */
3808	fixup_pending_writeq_buffers(sk);
3809	if (t3_push_frames(so, 1))
3810		sk->sk_write_space(sk);
3811#endif
3812
3813	toep->tp_state = tp->t_state;
3814	tcpstat.tcps_connects++;
3815
3816}
3817
3818/*
3819 * Process a CPL_ACT_ESTABLISH message.
3820 */
3821static int
3822do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3823{
3824	struct cpl_act_establish *req = cplhdr(m);
3825	unsigned int tid = GET_TID(req);
3826	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3827	struct toepcb *toep = (struct toepcb *)ctx;
3828	struct tcpcb *tp = toep->tp_tp;
3829	struct socket *so;
3830	struct toedev *tdev;
3831	struct tom_data *d;
3832
3833	if (tp == NULL) {
3834		free_atid(cdev, atid);
3835		return (0);
3836	}
3837	inp_wlock(tp->t_inpcb);
3838
3839	/*
3840	 * XXX
3841	 */
3842	so = inp_inpcbtosocket(tp->t_inpcb);
3843	tdev = toep->tp_toedev; /* blow up here if link was down */
3844	d = TOM_DATA(tdev);
3845
3846	/*
3847	 * It's OK if the TID is currently in use, the owning socket may have
3848	 * backlogged its last CPL message(s).  Just take it away.
3849	 */
3850	toep->tp_tid = tid;
3851	toep->tp_tp = tp;
3852	so_insert_tid(d, toep, tid);
3853	free_atid(cdev, atid);
3854	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3855
3856	socket_act_establish(so, m);
3857	inp_wunlock(tp->t_inpcb);
3858	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3859	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3860
3861	return (0);
3862}
3863
3864/*
3865 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3866 * next batch of work requests from the write queue.
3867 */
3868static void
3869wr_ack(struct toepcb *toep, struct mbuf *m)
3870{
3871	struct tcpcb *tp = toep->tp_tp;
3872	struct cpl_wr_ack *hdr = cplhdr(m);
3873	struct socket *so;
3874	unsigned int credits = ntohs(hdr->credits);
3875	u32 snd_una = ntohl(hdr->snd_una);
3876	int bytes = 0;
3877	struct sockbuf *snd;
3878
3879	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3880
3881	inp_wlock(tp->t_inpcb);
3882	so = inp_inpcbtosocket(tp->t_inpcb);
3883	toep->tp_wr_avail += credits;
3884	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3885		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3886
3887	while (credits) {
3888		struct mbuf *p = peek_wr(toep);
3889
3890		if (__predict_false(!p)) {
3891			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3892			    "nothing pending, state %u wr_avail=%u\n",
3893			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3894			break;
3895		}
		CTR2(KTR_TOM, "wr_ack: p->credits=%d p->bytes=%d",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3899		KASSERT(p->m_pkthdr.csum_data != 0,
3900		    ("empty request still on list"));
3901
		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
#if DEBUG_WR > 1
			struct tx_data_wr *w = cplhdr(p);

			log(LOG_ERR,
			    "TID %u got %u WR credits, need %u, len %u, "
			    "seq # %u, ACK una %u, ACK nxt %u, WR_AVAIL %u, "
			    "WRs pending %u\n",
			    toep->tp_tid, credits, p->m_pkthdr.csum_data,
			    p->m_pkthdr.len, ntohl(w->sndseq), snd_una,
			    ntohl(hdr->snd_nxt), toep->tp_wr_avail,
			    count_pending_wrs(tp) - credits);
#endif
3915			p->m_pkthdr.csum_data -= credits;
3916			break;
3917		} else {
3918			dequeue_wr(toep);
3919			credits -= p->m_pkthdr.csum_data;
3920			bytes += p->m_pkthdr.len;
			CTR3(KTR_TOM,
			    "wr_ack: retired WR of %d bytes, remaining credits=%d wr credits=%d",
			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3924
3925			m_free(p);
3926		}
3927	}
3928
3929#if DEBUG_WR
3930	check_wr_invariants(tp);
3931#endif
3932
3933	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3934#if VALIDATE_SEQ
		struct tom_data *d = TOM_DATA(TOE_DEV(so));

		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", d->tdev.name, snd_una,
		    toep->tp_tid, tp->snd_una);
3940#endif
3941		goto out_free;
3942	}
3943
3944	if (tp->snd_una != snd_una) {
3945		tp->snd_una = snd_una;
3946		tp->ts_recent_age = ticks;
3947#ifdef notyet
3948		/*
3949		 * Keep ARP entry "minty fresh"
3950		 */
3951		dst_confirm(sk->sk_dst_cache);
3952#endif
3953		if (tp->snd_una == tp->snd_nxt)
3954			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3955	}
3956
	snd = so_sockbuf_snd(so);
	if (bytes) {
		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
		sockbuf_lock(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);
	}
3965
3966	if (snd->sb_sndptroff < snd->sb_cc)
3967		t3_push_frames(so, 0);
3968
3969out_free:
3970	inp_wunlock(tp->t_inpcb);
3971	m_free(m);
3972}
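
/*
 * A minimal sketch of the bookkeeping invariant wr_ack() preserves,
 * assuming the count_pending_wrs() helper used in the DEBUG_WR code
 * above: credits held by WRs still pending at the HW plus credits
 * available to the host always add up to the configured maximum.
 * check_wr_invariants() is the real check compiled under DEBUG_WR.
 */
#if 0
static void
assert_wr_credit_balance(struct toepcb *toep, struct tcpcb *tp)
{
	KASSERT(toep->tp_wr_avail + count_pending_wrs(tp) == toep->tp_wr_max,
	    ("WR credit leak: avail %u pending %u max %u",
	    toep->tp_wr_avail, count_pending_wrs(tp), toep->tp_wr_max));
}
#endif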
3973
3974/*
3975 * Handler for TX_DATA_ACK CPL messages.
3976 */
3977static int
3978do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3979{
3980	struct toepcb *toep = (struct toepcb *)ctx;
3981
	wr_ack(toep, m);
	return (0);
3986}
3987
3988/*
3989 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
3990 */
3991static int
3992do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
3993{
	m_freem(m);
	return (0);
3996}
3997
3998/*
3999 * Reset a connection that is on a listener's SYN queue or accept queue,
4000 * i.e., one that has not had a struct socket associated with it.
4001 * Must be called from process context.
4002 *
4003 * Modeled after code in inet_csk_listen_stop().
4004 */
4005static void
4006t3_reset_listen_child(struct socket *child)
4007{
4008	struct tcpcb *tp = so_sototcpcb(child);
4009
4010	t3_send_reset(tp->t_toe);
4011}
4012
4013
4014static void
4015t3_child_disconnect(struct socket *so, void *arg)
4016{
4017	struct tcpcb *tp = so_sototcpcb(so);
4018
4019	if (tp->t_flags & TF_TOE) {
4020		inp_wlock(tp->t_inpcb);
4021		t3_reset_listen_child(so);
4022		inp_wunlock(tp->t_inpcb);
4023	}
4024}
4025
4026/*
4027 * Disconnect offloaded established but not yet accepted connections sitting
4028 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4029 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4030 */
4031void
4032t3_disconnect_acceptq(struct socket *listen_so)
4033{
4034
4035	so_lock(listen_so);
4036	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4037	so_unlock(listen_so);
4038}
4039
/*
 * Reset offloaded connections sitting on a server's SYN queue.  As above,
 * we send an ABORT_REQ and finish off when we get the ABORT_RPL.
 */
void
4046t3_reset_synq(struct listen_ctx *lctx)
4047{
4048	struct toepcb *toep;
4049
4050	so_lock(lctx->lso);
4051	while (!LIST_EMPTY(&lctx->synq_head)) {
4052		toep = LIST_FIRST(&lctx->synq_head);
4053		LIST_REMOVE(toep, synq_entry);
4054		toep->tp_tp = NULL;
4055		t3_send_reset(toep);
4056		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4057		toepcb_release(toep);
4058	}
4059	so_unlock(lctx->lso);
4060}
4061
4062
4063int
4064t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4065		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4066		   unsigned int pg_off, unsigned int color)
4067{
4068	unsigned int i, j, pidx;
4069	struct pagepod *p;
4070	struct mbuf *m;
4071	struct ulp_mem_io *req;
4072	unsigned int tid = toep->tp_tid;
4073	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4074	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4075
4076	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4077	    gl, nppods, tag, maxoff, pg_off, color);
4078
4079	for (i = 0; i < nppods; ++i) {
4080		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4081		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4082		req = mtod(m, struct ulp_mem_io *);
4083		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4084		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4085		req->wr.wr_lo = 0;
4086		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4087					   V_ULPTX_CMD(ULP_MEM_WRITE));
4088		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4089				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4090
4091		p = (struct pagepod *)(req + 1);
		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
4093			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4094			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4095						  V_PPOD_COLOR(color));
4096			p->pp_max_offset = htonl(maxoff);
4097			p->pp_page_offset = htonl(pg_off);
4098			p->pp_rsvd = 0;
4099			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4100				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4101				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4102		} else
4103			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4104		send_or_defer(toep, m, 0);
4105		ppod_addr += PPOD_SIZE;
4106	}
4107	return (0);
4108}
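
/*
 * Illustrative arithmetic (a sketch; constants per the cxgb_t3_ddp.h
 * usage above): each page pod written above publishes 4 new page
 * addresses (pidx advances by 4 per pod, with a 5th overlapping entry),
 * and the set is terminated by NUM_SENTINEL_PPODS invalid pods.
 * Assuming PPOD_PAGES is that per-pod stride, a gather list of npages
 * pages needs:
 */
#if 0
static inline unsigned int
nppods_for_gl(unsigned int npages)
{
	return ((npages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS);
}
#endif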
4109
4110/*
4111 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4112 */
4113static inline void
4114mk_cpl_barrier_ulp(struct cpl_barrier *b)
4115{
4116	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4117
4118	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4119	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4120	b->opcode = CPL_BARRIER;
4121}
4122
4123/*
4124 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4125 */
4126static inline void
4127mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4128{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

4132	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4133	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4134	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4135	req->cpuno = htons(cpuno);
4136}
4137
4138/*
4139 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4140 */
4141static inline void
4142mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4143                     unsigned int word, uint64_t mask, uint64_t val)
4144{
4145	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4146
	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
	    tid, word, mask, val);
4149
4150	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4151	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4152	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4153	req->reply = V_NO_REPLY(1);
4154	req->cpu_idx = 0;
4155	req->word = htons(word);
4156	req->mask = htobe64(mask);
4157	req->val = htobe64(val);
4158}
4159
4160/*
4161 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4162 */
4163static void
4164mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4165    unsigned int tid, unsigned int credits)
4166{
4167	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4168
4169	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4170	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4171	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4172	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4173	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4174				 V_RX_CREDITS(credits));
4175}
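
/*
 * The mk_*_ulp() builders above each wrap one CPL in a ULP_TXPKT header
 * so that several can ride in a single BYPASS work request.  A minimal
 * sketch of the composition pattern (t3_cancel_ddpbuf() below is a real
 * instance; the buffer layout and sizing are the caller's job):
 */
#if 0
static void
send_compound_example(struct toepcb *toep, struct mbuf *m,
    struct work_request_hdr *wr)
{
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	mk_cpl_barrier_ulp((struct cpl_barrier *)(wr + 1));
	/*
	 * Further CPLs (CPL_SET_TCB_FIELD, CPL_GET_TCB, ...) are laid out
	 * back to back after the barrier, each built by its mk_*_ulp()
	 * helper, before the mbuf is handed to the offload queue.
	 */
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
#endif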
4176
4177void
4178t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4179{
4180	unsigned int wrlen;
4181	struct mbuf *m;
4182	struct work_request_hdr *wr;
4183	struct cpl_barrier *lock;
4184	struct cpl_set_tcb_field *req;
4185	struct cpl_get_tcb *getreq;
4186	struct ddp_state *p = &toep->tp_ddp_state;
4187
4188#if 0
4189	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4190#endif
4191	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4192		sizeof(*getreq);
4193	m = m_gethdr_nofail(wrlen);
4194	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4195	wr = mtod(m, struct work_request_hdr *);
4196	bzero(wr, wrlen);
4197
4198	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4199	m->m_pkthdr.len = m->m_len = wrlen;
4200
4201	lock = (struct cpl_barrier *)(wr + 1);
4202	mk_cpl_barrier_ulp(lock);
4203
4204	req = (struct cpl_set_tcb_field *)(lock + 1);
4205
4206	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4207
	/*
	 * Hmmm, not sure if this is actually a good thing: reactivating the
	 * other buffer might be an issue if it has been completed already.
	 * However, that is unlikely, since the fact that the UBUF is not
	 * completed indicates that there is no outstanding data.
	 */
4213	if (bufidx == 0)
4214		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4215				     V_TF_DDP_ACTIVE_BUF(1) |
4216				     V_TF_DDP_BUF0_VALID(1),
4217				     V_TF_DDP_ACTIVE_BUF(1));
4218	else
4219		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4220				     V_TF_DDP_ACTIVE_BUF(1) |
4221				     V_TF_DDP_BUF1_VALID(1), 0);
4222
4223	getreq = (struct cpl_get_tcb *)(req + 1);
4224	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4225
4226	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4227
	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4230	p->get_tcb_count++;
4231
4232#ifdef T3_TRACE
4233	T3_TRACE1(TIDTB(so),
4234		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4235#endif
4236	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4237}
4238
/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the offload control block for the connection
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
 */
4255void
4256t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4257	 	       unsigned int tag1, unsigned int len)
4258{
4259	unsigned int wrlen;
4260	struct mbuf *m;
4261	struct work_request_hdr *wr;
4262	struct cpl_get_tcb *getreq;
4263	struct cpl_set_tcb_field *req;
4264	struct ddp_state *p = &toep->tp_ddp_state;
4265
	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
	    bufidx, tag0, tag1, len);
4268#if 0
4269	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4270#endif
4271	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4272	m = m_gethdr_nofail(wrlen);
4273	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4274	wr = mtod(m, struct work_request_hdr *);
4275	m->m_pkthdr.len = m->m_len = wrlen;
	bzero(wr, wrlen);

	/*
	 * Set the ATOMIC flag to make sure that TP processes the following
	 * CPLs in an atomic manner and no wire segments can be interleaved.
	 */
4282	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4283	req = (struct cpl_set_tcb_field *)(wr + 1);
4284	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4285			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4286			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4287			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4288			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4289	req++;
4290	if (bufidx == 0) {
4291		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4292			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4293			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4294		req++;
4295		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4296			    V_TF_DDP_PUSH_DISABLE_0(1) |
4297			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4298			    V_TF_DDP_PUSH_DISABLE_0(0) |
4299			    V_TF_DDP_BUF0_VALID(1));
4300	} else {
4301		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4302			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4303			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4304		req++;
4305		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4306			    V_TF_DDP_PUSH_DISABLE_1(1) |
4307			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4308			    V_TF_DDP_PUSH_DISABLE_1(0) |
4309			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4310	}
4311
4312	getreq = (struct cpl_get_tcb *)(req + 1);
4313	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4314
	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4317	p->get_tcb_count++;
4318
4319#ifdef T3_TRACE
4320	T3_TRACE4(TIDTB(sk),
4321		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4322		  "len %d",
4323		  bufidx, tag0, tag1, len);
4324#endif
4325	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4326}
4327
4328/*
4329 * Sends a compound WR containing all the CPL messages needed to program the
4330 * two HW DDP buffers, namely optionally setting up the length and offset of
4331 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4332 */
4333void
4334t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4335		      unsigned int len1, unsigned int offset1,
4336                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4337{
4338	unsigned int wrlen;
4339	struct mbuf *m;
4340	struct work_request_hdr *wr;
4341	struct cpl_set_tcb_field *req;
4342
	CTR6(KTR_TCB,
	    "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
	    len0, offset0, len1, offset1, (unsigned int)(ddp_flags >> 32),
	    (unsigned int)ddp_flags);
4345
4346#if 0
4347	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4348#endif
4349	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4350		(len1 ? sizeof(*req) : 0) +
4351		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4352	m = m_gethdr_nofail(wrlen);
4353	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4354	wr = mtod(m, struct work_request_hdr *);
4355	bzero(wr, wrlen);
4356
4357	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4358	m->m_pkthdr.len = m->m_len = wrlen;
4359
4360	req = (struct cpl_set_tcb_field *)(wr + 1);
4361	if (len0) {                  /* program buffer 0 offset and length */
4362		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4363			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4364			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4365			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4366			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4367		req++;
4368	}
4369	if (len1) {                  /* program buffer 1 offset and length */
4370		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4371			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4372			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4373			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4374			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4375		req++;
4376	}
4377
4378	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4379			     ddp_flags);
4380
4381	if (modulate) {
4382		mk_rx_data_ack_ulp(toep,
4383		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4384		    toep->tp_copied_seq - toep->tp_rcv_wup);
4385		toep->tp_rcv_wup = toep->tp_copied_seq;
4386	}
4387
4388#ifdef T3_TRACE
4389	T3_TRACE5(TIDTB(sk),
4390		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4391		  "modulate %d",
4392		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4393		  modulate);
4394#endif
4395
4396	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4397}
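
/*
 * Illustrative call (a sketch with hypothetical values, not a caller
 * from this driver): program HW buffer 0 with a fresh length, leave
 * buffer 1 untouched, mark buffer 0 valid and active, and piggyback an
 * RX_DATA_ACK to return receive credits.
 */
#if 0
static void
ddp_program_buf0_example(struct toepcb *toep, unsigned int len0)
{
	t3_setup_ddpbufs(toep, len0, 0, 0, 0,
	    V_TF_DDP_BUF0_VALID(1),		/* new flag values */
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), /* mask */
	    1);					/* send RX_DATA_ACK */
}
#endif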
4398
4399void
4400t3_init_wr_tab(unsigned int wr_len)
4401{
4402	int i;
4403
4404	if (mbuf_wrs[1])     /* already initialized */
4405		return;
4406
4407	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4408		int sgl_len = (3 * i) / 2 + (i & 1);
4409
4410		sgl_len += 3;
4411		mbuf_wrs[i] = sgl_len <= wr_len ?
4412		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4413	}
4414
4415	wrlen = wr_len * 8;
4416}
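
/*
 * Worked example of the table fill above (illustrative numbers): with
 * wr_len = 8 flits and i = 5 fragments, sgl_len = (3 * 5) / 2 + (5 & 1)
 * + 3 = 11 flits, which exceeds a single WR, so mbuf_wrs[5] =
 * 1 + (11 - 2) / (8 - 1) = 2 work requests.
 */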
4417
4418int
4419t3_init_cpl_io(void)
4420{
4421#ifdef notyet
4422	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4423	if (!tcphdr_skb) {
4424		log(LOG_ERR,
4425		       "Chelsio TCP offload: can't allocate sk_buff\n");
4426		return -1;
4427	}
4428	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4429	tcphdr_skb->h.raw = tcphdr_skb->data;
4430	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4431#endif
4432
4433	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4434	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4435	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4436	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4437	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4438	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4439	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4440	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4441	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4442	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4443	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4444	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4445	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4446	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4447	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4448	return (0);
4449}
4450
4451