cxgb_cpl_io.c revision 178767
/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 178767 2008-05-05 01:41:53Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
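
/*
 * Illustrative sketch (not part of the original file): a Tx path that
 * tracks sequence numbers would compensate for the HW-added ULP bytes
 * roughly as below, assuming the packet's two-bit ULP submode is at hand.
 */
#if 0
static inline unsigned int
ulp_extra_len_example(unsigned int ulp_submode)
{
	/* submodes 1 and 2 each add a 4-byte digest, submode 3 adds both */
	return (t3_ulp_extra_len[ulp_submode & 3]);
}
#endif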

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * number of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  /* defer */
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  /* via L2T */
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          /* directly */
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && (snd->sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits in-line, then
		 * build an immediate-data WR.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);

#ifdef KTR
{
		int i;

		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
				i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}
		}
}
#endif
		/*
		 * Remember the credits used; csum_data is repurposed to
		 * carry the WR credit count for this dispatch.
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;                                 /* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
		(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;
	struct sockbuf *rcv;

	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2))) {
		if (copied) {
			sockbuf_lock(rcv);
			toep->tp_copied_seq += copied;
			sockbuf_unlock(rcv);
		}

		return;
	}

	inp_lock_assert(tp->t_inpcb);

	sockbuf_lock(rcv);
	if (copied)
		toep->tp_copied_seq += copied;
	else {
		read = toep->tp_enqueued_bytes - rcv->sb_cc;
		toep->tp_copied_seq += read;
	}
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	toep->tp_enqueued_bytes = rcv->sb_cc;
	sockbuf_unlock(rcv);

	if (credits > rcv->sb_mbmax) {
		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
		credits = rcv->sb_mbmax;
	}

	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (is_delack_mode_valid(dev, toep)) {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	} else
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{

	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp;

	if (toep == NULL)
		return;
	tp = toep->tp_tp;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(tos));
}

951/*
952 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
953 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
954 * set the PSH bit in the last segment, which would trigger delivery.]
955 * We work around the issue by setting a DDP buffer in a partial placed state,
956 * which guarantees that TP will schedule a timer.
957 */
958#define TP_DDP_TIMER_WORKAROUND_MASK\
959    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
960     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
961       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
962#define TP_DDP_TIMER_WORKAROUND_VAL\
963    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
964     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
965      32))

static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;	/* 40: IP + TCP header bytes */
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					/* we have a TID */
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
1520/*
1521 * Send an active open request.
1522 */
1523int
1524t3_connect(struct toedev *tdev, struct socket *so,
1525    struct rtentry *rt, struct sockaddr *nam)
1526{
1527	struct mbuf *m;
1528	struct l2t_entry *e;
1529	struct tom_data *d = TOM_DATA(tdev);
1530	struct inpcb *inp = so_sotoinpcb(so);
1531	struct tcpcb *tp = intotcpcb(inp);
1532	struct toepcb *toep; /* allocated by init_offload_socket */
1533
1534	int atid;
1535
1536	toep = toepcb_alloc();
1537	if (toep == NULL)
1538		goto out_err;
1539
1540	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1541		goto out_err;
1542
1543	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1544	if (!e)
1545		goto free_tid;
1546
1547	inp_lock_assert(inp);
1548	m = m_gethdr(MT_DATA, M_WAITOK);
1549
1550#if 0
1551	m->m_toe.mt_toepcb = tp->t_toe;
1552	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1553#endif
1554	so_lock(so);
1555
1556	init_offload_socket(so, tdev, atid, e, rt, toep);
1557
1558	install_offload_ops(so);
1559
1560	mk_act_open_req(so, m, atid, e);
1561	so_unlock(so);
1562
1563	soisconnecting(so);
1564	toep = tp->t_toe;
1565	m_set_toep(m, tp->t_toe);
1566
1567	toep->tp_state = TCPS_SYN_SENT;
1568	l2t_send(d->cdev, (struct mbuf *)m, e);
1569
1570	if (toep->tp_ulp_mode)
1571		t3_enable_ddp(toep, 0);
1572	return 	(0);
1573
1574free_tid:
1575	printf("failing connect - free atid\n");
1576
1577	free_atid(d->cdev, atid);
1578out_err:
1579	printf("return ENOMEM\n");
1580       return (ENOMEM);
1581}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so) {
		snd = so_sockbuf_snd(so);
		sbflush(snd);
	}
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		tp = inp_inpcbtotcpcb(inp);

		inp_wlock(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);
	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/*
	 * Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/*
	 * It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx, in which case no further processing of
	 * this mbuf is required.  However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		 q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		sockbuf_unlock(rcv);
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/*
		 * It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done.  If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/*
		 * This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}
1949
1950/*
1951 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1952 * in that case they are similar to DDP completions.
1953 */
1954static int
1955do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1956{
1957	struct toepcb *toep = (struct toepcb *)ctx;
1958
1959	/* OK if socket doesn't exist */
1960	if (toep == NULL) {
1961		printf("null toep in do_get_tcb_rpl\n");
1962		return (CPL_RET_BUF_DONE);
1963	}
1964
1965	inp_wlock(toep->tp_tp->t_inpcb);
1966	tcb_rpl_as_ddp_complete(toep, m);
1967	inp_wunlock(toep->tp_tp->t_inpcb);
1968
1969	return (0);
1970}
1971
1972static void
1973handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1974{
1975	struct tcpcb *tp = toep->tp_tp;
1976	struct socket *so;
1977	struct ddp_state *q;
1978	struct ddp_buf_state *bsp;
1979	struct cpl_rx_data *hdr = cplhdr(m);
1980	unsigned int rcv_nxt = ntohl(hdr->seq);
1981	struct sockbuf *rcv;
1982
1983	if (tp->rcv_nxt == rcv_nxt)
1984		return;
1985
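	/*
	 * The CPL's sequence number is ahead of tp->rcv_nxt, meaning HW has
	 * already placed that span of payload in the current DDP buffer.
	 * Dress the mbuf up as a DDP completion covering those bytes.
	 */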
1986	inp_lock_assert(tp->t_inpcb);
1987	so  = inp_inpcbtosocket(tp->t_inpcb);
1988	rcv = so_sockbuf_rcv(so);
1989	sockbuf_lock(rcv);
1990
1991	q = &toep->tp_ddp_state;
1992	bsp = &q->buf_state[q->cur_buf];
1993	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not ahead of tp->rcv_nxt=0x%08x",
1994		rcv_nxt, tp->rcv_nxt));
1995	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1996	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1997	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1998	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1999
2000#ifdef T3_TRACE
2001	if ((int)m->m_pkthdr.len < 0) {
2002		t3_ddp_error(so, "handle_ddp_data: neg len");
2003	}
2004#endif
2005	m->m_ddp_gl = (unsigned char *)bsp->gl;
2006	m->m_flags |= M_DDP;
2007	m->m_cur_offset = bsp->cur_offset;
2008	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2009	if (bsp->flags & DDP_BF_NOCOPY)
2010		bsp->flags &= ~DDP_BF_NOCOPY;
2011
2012	m->m_seq = tp->rcv_nxt;
2013	tp->rcv_nxt = rcv_nxt;
2014	bsp->cur_offset += m->m_pkthdr.len;
2015	if (!(bsp->flags & DDP_BF_NOFLIP))
2016		q->cur_buf ^= 1;
2017	/*
2018	 * For now, don't re-enable DDP after a connection fell out of DDP
2019	 * mode.
2020	 */
2021	q->ubuf_ddp_ready = 0;
2022	sockbuf_unlock(rcv);
2023}
2024
2025/*
2026 * Process new data received for a connection.
2027 */
2028static void
2029new_rx_data(struct toepcb *toep, struct mbuf *m)
2030{
2031	struct cpl_rx_data *hdr = cplhdr(m);
2032	struct tcpcb *tp = toep->tp_tp;
2033	struct socket *so;
2034	struct sockbuf *rcv;
2035	int state;
2036	int len = be16toh(hdr->len);
2037
2038	inp_wlock(tp->t_inpcb);
2039
2040	so  = inp_inpcbtosocket(tp->t_inpcb);
2041
2042	if (__predict_false(so_no_receive(so))) {
2043		handle_excess_rx(toep, m);
2044		inp_wunlock(tp->t_inpcb);
2045		TRACE_EXIT;
2046		return;
2047	}
2048
2049	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2050		handle_ddp_data(toep, m);
2051
2052	m->m_seq = ntohl(hdr->seq);
2053	m->m_ulp_mode = 0;                    /* for iSCSI */
2054
2055#if VALIDATE_SEQ
2056	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2057		log(LOG_ERR,
2058		       "%s: TID %u: Bad sequence number %u, expected %u\n",
2059		    toep->tp_toedev->tod_name, toep->tp_tid, m->m_seq,
2060		       tp->rcv_nxt);
2061		m_freem(m);
2062		inp_wunlock(tp->t_inpcb);
2063		return;
2064	}
2065#endif
2066	m_adj(m, sizeof(*hdr));
2067
2068#ifdef URGENT_DATA_SUPPORTED
2069	/*
2070	 * We don't handle urgent data yet
2071	 */
2072	if (__predict_false(hdr->urg))
2073		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2074	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2075		     tp->urg_seq - tp->rcv_nxt < skb->len))
2076		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2077							 tp->rcv_nxt];
2078#endif
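	/*
	 * HW reports the delayed-ACK mode it is applying; remember it along
	 * with the sequence number at which it changed.
	 */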
2079	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2080		toep->tp_delack_mode = hdr->dack_mode;
2081		toep->tp_delack_seq = tp->rcv_nxt;
2082	}
2083	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2084	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2085
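	/*
	 * The CPL-reported payload length is authoritative; trim the mbuf
	 * if it arrived carrying more than that.
	 */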
2086	if (len < m->m_pkthdr.len)
2087		m->m_pkthdr.len = m->m_len = len;
2088
2089	tp->rcv_nxt += m->m_pkthdr.len;
2090	tp->t_rcvtime = ticks;
2091	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2092	CTR2(KTR_TOM,
2093	    "new_rx_data: seq 0x%x len %u",
2094	    m->m_seq, m->m_pkthdr.len);
2095	inp_wunlock(tp->t_inpcb);
2096	rcv = so_sockbuf_rcv(so);
2097	sockbuf_lock(rcv);
2098#if 0
2099	if (sb_notify(rcv))
2100		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2101#endif
2102	SBAPPEND(rcv, m);
2103
2104#ifdef notyet
2105	/*
2106	 * We're giving too many credits to the card - but disable this
2107	 * check so we can keep on moving :-|
2108	 */
2109	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2110
2111	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2112		so, rcv->sb_cc, rcv->sb_mbmax));
2113#endif
2114
2115
2116	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2117	    rcv->sb_cc, rcv->sb_mbcnt);
2118
2119	state = so_state_get(so);
2120	if (__predict_true((state & SS_NOFDREF) == 0))
2121		so_sorwakeup_locked(so);
2122	else
2123		sockbuf_unlock(rcv);
2124}
2125
2126/*
2127 * Handler for RX_DATA CPL messages.
2128 */
2129static int
2130do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2131{
2132	struct toepcb *toep = (struct toepcb *)ctx;
2133
2134	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2135
2136	new_rx_data(toep, m);
2137
2138	return (0);
2139}
2140
2141static void
2142new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2143{
2144	struct tcpcb *tp;
2145	struct ddp_state *q;
2146	struct ddp_buf_state *bsp;
2147	struct cpl_rx_data_ddp *hdr;
2148	struct socket *so;
2149	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2150	int nomoredata = 0;
2151	unsigned int delack_mode;
2152	struct sockbuf *rcv;
2153
2154	tp = toep->tp_tp;
2155	inp_wlock(tp->t_inpcb);
2156	so = inp_inpcbtosocket(tp->t_inpcb);
2157
2158	if (__predict_false(so_no_receive(so))) {
2159
2160		handle_excess_rx(toep, m);
2161		inp_wunlock(tp->t_inpcb);
2162		return;
2163	}
2164
2165	q = &toep->tp_ddp_state;
2166	hdr = cplhdr(m);
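	/*
	 * The DDP report says which of the two ping-pong HW buffers the
	 * payload landed in and how far into it placement got; everything
	 * below is relative to that buffer's state.
	 */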
2167	ddp_report = ntohl(hdr->u.ddp_report);
2168	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2169	bsp = &q->buf_state[buf_idx];
2170
2171	CTR4(KTR_TOM,
2172	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2173	    "hdr seq 0x%x len %u",
2174	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2175	    ntohs(hdr->len));
2176	CTR3(KTR_TOM,
2177	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2178	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2179
2180	ddp_len = ntohs(hdr->len);
2181	rcv_nxt = ntohl(hdr->seq) + ddp_len;
2182
2183	delack_mode = G_DDP_DACK_MODE(ddp_report);
2184	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2185		toep->tp_delack_mode = delack_mode;
2186		toep->tp_delack_seq = tp->rcv_nxt;
2187	}
2188
2189	m->m_seq = tp->rcv_nxt;
2190	tp->rcv_nxt = rcv_nxt;
2191
2192	tp->t_rcvtime = ticks;
2193	/*
2194	 * Store the length in m->m_len.  We are changing the meaning of
2195	 * m->m_len here, so we need to be very careful that nothing from now
2196	 * on interprets the length of this packet the usual way.
2197	 */
2198	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2199	inp_wunlock(tp->t_inpcb);
2200	CTR3(KTR_TOM,
2201	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2202	    m->m_len, rcv_nxt, m->m_seq);
2203	/*
2204	 * Figure out where the new data was placed in the buffer and store it
2205	 * in m->m_cur_offset.  Assumes the buffer offset starts at 0; the
2206	 * consumer needs to account for the page pod's pg_offset.
2207	 */
2208	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2209	m->m_cur_offset = end_offset - m->m_pkthdr.len;
2210
2211	rcv = so_sockbuf_rcv(so);
2212	sockbuf_lock(rcv);
2213
2214	m->m_ddp_gl = (unsigned char *)bsp->gl;
2215	m->m_flags |= M_DDP;
2216	bsp->cur_offset = end_offset;
2217	toep->tp_enqueued_bytes += m->m_pkthdr.len;
2218
2219	/*
2220	 * Length is only meaningful for kbuf
2221	 */
2222	if (!(bsp->flags & DDP_BF_NOCOPY))
2223		KASSERT(m->m_len <= bsp->gl->dgl_length,
2224		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
2225			m->m_len, bsp->gl->dgl_length));
2226
2227	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2228	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2229        /*
2230	 * Bit 0 of flags stores whether the DDP buffer is completed.
2231	 * Note that other parts of the code depend on this being in bit 0.
2232	 */
2233	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2234		panic("spurious ddp completion");
2235	} else {
2236		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2237		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2238			q->cur_buf ^= 1;                     /* flip buffers */
2239	}
2240
2241	if (bsp->flags & DDP_BF_NOCOPY) {
2242		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2243		bsp->flags &= ~DDP_BF_NOCOPY;
2244	}
2245
2246	if (ddp_report & F_DDP_PSH)
2247		m->m_ddp_flags |= DDP_BF_PSH;
2248	if (nomoredata)
2249		m->m_ddp_flags |= DDP_BF_NODATA;
2250
2251#ifdef notyet
2252	skb_reset_transport_header(skb);
2253	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2254#endif
2255	SBAPPEND(rcv, m);
2256
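	/*
	 * Wake the reader if the peer pushed, if a posted user buffer just
	 * completed, or if this is ordinary kernel-buffer data; an
	 * incomplete user-buffer placement can wait for more.
	 */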
2257	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2258	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2259		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2260		so_sorwakeup_locked(so);
2261	else
2262		sockbuf_unlock(rcv);
2263}
2264
2265#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2266		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2267		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2268		 F_DDP_INVALID_PPOD)
2269
2270/*
2271 * Handler for RX_DATA_DDP CPL messages.
2272 */
2273static int
2274do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2275{
2276	struct toepcb *toep = ctx;
2277	const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2278
2279	VALIDATE_SOCK(so);
2280
2281	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2282		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2283		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2284		return (CPL_RET_BUF_DONE);
2285	}
2286#if 0
2287	skb->h.th = tcphdr_skb->h.th;
2288#endif
2289	new_rx_data_ddp(toep, m);
2290	return (0);
2291}
2292
2293static void
2294process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2295{
2296	struct tcpcb *tp = toep->tp_tp;
2297	struct socket *so;
2298	struct ddp_state *q;
2299	struct ddp_buf_state *bsp;
2300	struct cpl_rx_ddp_complete *hdr;
2301	unsigned int ddp_report, buf_idx, when, delack_mode;
2302	int nomoredata = 0;
2303	struct sockbuf *rcv;
2304
2305	inp_wlock(tp->t_inpcb);
2306	so = inp_inpcbtosocket(tp->t_inpcb);
2307
2308	if (__predict_false(so_no_receive(so))) {
2309		struct inpcb *inp = so_sotoinpcb(so);
2310
2311		handle_excess_rx(toep, m);
2312		inp_wunlock(inp);
2313		return;
2314	}
2315	q = &toep->tp_ddp_state;
2316	hdr = cplhdr(m);
2317	ddp_report = ntohl(hdr->ddp_report);
2318	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2319	m->m_pkthdr.csum_data = tp->rcv_nxt;
2320
2321	rcv = so_sockbuf_rcv(so);
2322	sockbuf_lock(rcv);
2323
2324	bsp = &q->buf_state[buf_idx];
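	/*
	 * The completion reports the final offset reached within the
	 * buffer; the freshly placed byte count is the delta from the
	 * offset recorded at the previous completion.
	 */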
2325	when = bsp->cur_offset;
2326	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2327	tp->rcv_nxt += m->m_len;
2328	tp->t_rcvtime = ticks;
2329
2330	delack_mode = G_DDP_DACK_MODE(ddp_report);
2331	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2332		toep->tp_delack_mode = delack_mode;
2333		toep->tp_delack_seq = tp->rcv_nxt;
2334	}
2335#ifdef notyet
2336	skb_reset_transport_header(skb);
2337	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2338#endif
2339	inp_wunlock(tp->t_inpcb);
2340
2341	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2342	CTR5(KTR_TOM,
2343		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2344		  "ddp_report 0x%x offset %u, len %u",
2345		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2346		   G_DDP_OFFSET(ddp_report), m->m_len);
2347
2348	m->m_cur_offset = bsp->cur_offset;
2349	bsp->cur_offset += m->m_len;
2350
2351	if (!(bsp->flags & DDP_BF_NOFLIP)) {
2352		q->cur_buf ^= 1;                     /* flip buffers */
2353		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2354			nomoredata = 1;
2355	}
2356
2357	CTR4(KTR_TOM,
2358		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2359		  "ddp_report %u offset %u",
2360		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
2361		   G_DDP_OFFSET(ddp_report));
2362
2363	m->m_ddp_gl = (unsigned char *)bsp->gl;
2364	m->m_flags |= M_DDP;
2365	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2366	if (bsp->flags & DDP_BF_NOCOPY)
2367		bsp->flags &= ~DDP_BF_NOCOPY;
2368	if (nomoredata)
2369		m->m_ddp_flags |= DDP_BF_NODATA;
2370
2371	SBAPPEND(rcv, m);
2372	if ((so_state_get(so) & SS_NOFDREF) == 0)
2373		so_sorwakeup_locked(so);
2374	else
2375		sockbuf_unlock(rcv);
2376}
2377
2378/*
2379 * Handler for RX_DDP_COMPLETE CPL messages.
2380 */
2381static int
2382do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2383{
2384	struct toepcb *toep = ctx;
2385
2386	VALIDATE_SOCK(so);
2387#if 0
2388	skb->h.th = tcphdr_skb->h.th;
2389#endif
2390	process_ddp_complete(toep, m);
2391	return (0);
2392}
2393
2394/*
2395 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2396 * socket state before calling tcp_time_wait to comply with its expectations.
2397 */
2398static void
2399enter_timewait(struct tcpcb *tp)
2400{
2401	/*
2402	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2403	 * process peer_close because we don't want to carry the peer FIN in
2404	 * the socket's receive queue and if we increment rcv_nxt without
2405	 * having the FIN in the receive queue we'll confuse facilities such
2406	 * as SIOCINQ.
2407	 */
2408	inp_wlock(tp->t_inpcb);
2409	tp->rcv_nxt++;
2410
2411	tp->ts_recent_age = 0;	     /* defeat recycling */
2412	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2413	inp_wunlock(tp->t_inpcb);
2414	tcp_offload_twstart(tp);
2415}
2416
2417static void
2418enter_timewait_disconnect(struct tcpcb *tp)
2419{
2420	/*
2421	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2422	 * process peer_close because we don't want to carry the peer FIN in
2423	 * the socket's receive queue and if we increment rcv_nxt without
2424	 * having the FIN in the receive queue we'll confuse facilities such
2425	 * as SIOCINQ.
2426	 */
2427	inp_wlock(tp->t_inpcb);
2428	tp->rcv_nxt++;
2429
2430	tp->ts_recent_age = 0;	     /* defeat recycling */
2431	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2432	inp_wunlock(tp->t_inpcb);
2433	tcp_offload_twstart_disconnect(tp);
2434}
2435
2436/*
2437 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2438 * function deals with the data that may be reported along with the FIN.
2439 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2440 * perform normal FIN-related processing.  In the latter case 1 indicates that
2441 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2442 * skb can be freed.
2443 */
2444static int
2445handle_peer_close_data(struct socket *so, struct mbuf *m)
2446{
2447	struct tcpcb *tp = so_sototcpcb(so);
2448	struct toepcb *toep = tp->t_toe;
2449	struct ddp_state *q;
2450	struct ddp_buf_state *bsp;
2451	struct cpl_peer_close *req = cplhdr(m);
2452	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2453	struct sockbuf *rcv;
2454
2455	if (tp->rcv_nxt == rcv_nxt)			/* no data */
2456		return (0);
2457
2458	CTR0(KTR_TOM, "handle_peer_close_data");
2459	if (__predict_false(so_no_receive(so))) {
2460		handle_excess_rx(toep, m);
2461
2462		/*
2463		 * Although we discard the data we want to process the FIN so
2464		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2465		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
2466		 * may be what will close the connection.  We return 1 because
2467		 * handle_excess_rx() already freed the packet.
2468		 */
2469		return (1);
2470	}
2471
2472	inp_lock_assert(tp->t_inpcb);
2473	q = &toep->tp_ddp_state;
2474	rcv = so_sockbuf_rcv(so);
2475	sockbuf_lock(rcv);
2476
2477	bsp = &q->buf_state[q->cur_buf];
2478	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2479	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2480	m->m_ddp_gl = (unsigned char *)bsp->gl;
2481	m->m_flags |= M_DDP;
2482	m->m_cur_offset = bsp->cur_offset;
2483	m->m_ddp_flags =
2484	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2485	m->m_seq = tp->rcv_nxt;
2486	tp->rcv_nxt = rcv_nxt;
2487	bsp->cur_offset += m->m_pkthdr.len;
2488	if (!(bsp->flags & DDP_BF_NOFLIP))
2489		q->cur_buf ^= 1;
2490#ifdef notyet
2491	skb_reset_transport_header(skb);
2492	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2493#endif
2494	tp->t_rcvtime = ticks;
2495	SBAPPEND(rcv, m);
2496	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2497		so_sorwakeup_locked(so);
2498	else
2499		sockbuf_unlock(rcv);
2500
2501	return (1);
2502}
2503
2504/*
2505 * Handle a peer FIN.
2506 */
2507static void
2508do_peer_fin(struct toepcb *toep, struct mbuf *m)
2509{
2510	struct socket *so;
2511	struct tcpcb *tp = toep->tp_tp;
2512	int keep, action;
2513
2514	action = keep = 0;
2515	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2516	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2517		printf("abort_pending set\n");
2518
2519		goto out;
2520	}
2521	inp_wlock(tp->t_inpcb);
2522	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2523	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2524		keep = handle_peer_close_data(so, m);
2525		if (keep < 0) {
2526			inp_wunlock(tp->t_inpcb);
2527			return;
2528		}
2529	}
2530	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2531		CTR1(KTR_TOM,
2532		    "waking up waiters for cantrcvmore on %p ", so);
2533		socantrcvmore(so);
2534
2535		/*
2536		 * If connection is half-synchronized
2537		 * (ie NEEDSYN flag on) then delay ACK,
2538		 * so it may be piggybacked when SYN is sent.
2539		 * Otherwise, since we received a FIN then no
2540		 * more input can be expected, send ACK now.
2541		 */
2542		if (tp->t_flags & TF_NEEDSYN)
2543			tp->t_flags |= TF_DELACK;
2544		else
2545			tp->t_flags |= TF_ACKNOW;
2546		tp->rcv_nxt++;
2547	}
2548
2549	switch (tp->t_state) {
2550	case TCPS_SYN_RECEIVED:
2551	    tp->t_starttime = ticks;
2552	/* FALLTHROUGH */
2553	case TCPS_ESTABLISHED:
2554		tp->t_state = TCPS_CLOSE_WAIT;
2555		break;
2556	case TCPS_FIN_WAIT_1:
2557		tp->t_state = TCPS_CLOSING;
2558		break;
2559	case TCPS_FIN_WAIT_2:
2560		/*
2561		 * If we've sent an abort_req we must have sent it too late,
2562		 * HW will send us a reply telling us so, and this peer_close
2563		 * is really the last message for this connection and needs to
2564		 * be treated as an abort_rpl, i.e., transition the connection
2565		 * to TCP_CLOSE (note that the host stack does this at the
2566		 * time of generating the RST but we must wait for HW).
2567		 * Otherwise we enter TIME_WAIT.
2568		 */
2569		t3_release_offload_resources(toep);
2570		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2571			action = TCP_CLOSE;
2572		} else {
2573			action = TCP_TIMEWAIT;
2574		}
2575		break;
2576	default:
2577		log(LOG_ERR,
2578		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
2579		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2580	}
2581	inp_wunlock(tp->t_inpcb);
2582
2583	if (action == TCP_TIMEWAIT) {
2584		enter_timewait(tp);
2585	} else if (action == TCP_DROP) {
2586		tcp_offload_drop(tp, 0);
2587	} else if (action == TCP_CLOSE) {
2588		tcp_offload_close(tp);
2589	}
2590
2591#ifdef notyet
2592	/* Do not send POLL_HUP for half duplex close. */
2593	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2594	    sk->sk_state == TCP_CLOSE)
2595		sk_wake_async(so, 1, POLL_HUP);
2596	else
2597		sk_wake_async(so, 1, POLL_IN);
2598#endif
2599
2600out:
2601	if (!keep)
2602		m_free(m);
2603}
2604
2605/*
2606 * Handler for PEER_CLOSE CPL messages.
2607 */
2608static int
2609do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2610{
2611	struct toepcb *toep = (struct toepcb *)ctx;
2612
2613	VALIDATE_SOCK(so);
2614
2615	do_peer_fin(toep, m);
2616	return (0);
2617}
2618
2619static void
2620process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2621{
2622	struct cpl_close_con_rpl *rpl = cplhdr(m);
2623	struct tcpcb *tp = toep->tp_tp;
2624	struct socket *so;
2625	int action = 0;
2626	struct sockbuf *rcv;
2627
2628	inp_wlock(tp->t_inpcb);
2629	so = inp_inpcbtosocket(tp->t_inpcb);
2630
2631	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2632
2633	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2634		inp_wunlock(tp->t_inpcb);
2635		goto out;
2636	}
2637
2638	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2639	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2640
2641	switch (tp->t_state) {
2642	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2643		t3_release_offload_resources(toep);
2644		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2645			action = TCP_CLOSE;
2646
2647		} else {
2648			action = TCP_TIMEWAIT;
2649		}
2650		break;
2651	case TCPS_LAST_ACK:
2652		/*
2653		 * In this state we don't care about pending abort_rpl.
2654		 * If we've sent abort_req it was post-close and was sent too
2655		 * late, this close_con_rpl is the actual last message.
2656		 */
2657		t3_release_offload_resources(toep);
2658		action = TCP_CLOSE;
2659		break;
2660	case TCPS_FIN_WAIT_1:
2661		/*
2662		 * If we can't receive any more
2663		 * data, then closing user can proceed.
2664		 * Starting the timer is contrary to the
2665		 * specification, but if we don't get a FIN
2666		 * we'll hang forever.
2667		 *
2668		 * XXXjl:
2669		 * we should release the tp also, and use a
2670		 * compressed state.
2671		 */
2672		if (so)
2673			rcv = so_sockbuf_rcv(so);
2674		else
2675			break;
2676
2677		if (rcv->sb_state & SBS_CANTRCVMORE) {
2678			int timeout;
2679
2680			if (so)
2681				soisdisconnected(so);
2682			timeout = (tcp_fast_finwait2_recycle) ?
2683			    tcp_finwait2_timeout : tcp_maxidle;
2684			tcp_timer_activate(tp, TT_2MSL, timeout);
2685		}
2686		tp->t_state = TCPS_FIN_WAIT_2;
2687		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2688		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2689			action = TCP_DROP;
2690		}
2691
2692		break;
2693	default:
2694		log(LOG_ERR,
2695		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2696		       toep->tp_toedev->tod_name, toep->tp_tid,
2697		       tp->t_state);
2698	}
2699	inp_wunlock(tp->t_inpcb);
2700
2701
2702	if (action == TCP_TIMEWAIT) {
2703		enter_timewait_disconnect(tp);
2704	} else if (action == TCP_DROP) {
2705		tcp_offload_drop(tp, 0);
2706	} else if (action == TCP_CLOSE) {
2707		tcp_offload_close(tp);
2708	}
2709out:
2710	m_freem(m);
2711}
2712
2713/*
2714 * Handler for CLOSE_CON_RPL CPL messages.
2715 */
2716static int
2717do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2718			    void *ctx)
2719{
2720	struct toepcb *toep = (struct toepcb *)ctx;
2721
2722	process_close_con_rpl(toep, m);
2723	return (0);
2724}
2725
2726/*
2727 * Process abort replies.  We only process these messages if we anticipate
2728 * them as the coordination between SW and HW in this area is somewhat lacking
2729 * and sometimes we get ABORT_RPLs after we are done with the connection that
2730 * originated the ABORT_REQ.
2731 */
2732static void
2733process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2734{
2735	struct tcpcb *tp = toep->tp_tp;
2736	struct socket *so;
2737	int needclose = 0;
2738
2739#ifdef T3_TRACE
2740	T3_TRACE1(TIDTB(sk),
2741		  "process_abort_rpl: GTS rpl pending %d",
2742		  sock_flag(sk, ABORT_RPL_PENDING));
2743#endif
2744
2745	inp_wlock(tp->t_inpcb);
2746	so = inp_inpcbtosocket(tp->t_inpcb);
2747
2748	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2749		/*
2750		 * XXX panic on tcpdrop
2751		 */
2752		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2753			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2754		else {
2755			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2756			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2757			    !is_t3a(toep->tp_toedev)) {
2758				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2759					panic("TP_ABORT_REQ_RCVD set");
2760				t3_release_offload_resources(toep);
2761				needclose = 1;
2762			}
2763		}
2764	}
2765	inp_wunlock(tp->t_inpcb);
2766
2767	if (needclose)
2768		tcp_offload_close(tp);
2769
2770	m_free(m);
2771}
2772
2773/*
2774 * Handle an ABORT_RPL_RSS CPL message.
2775 */
2776static int
2777do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2778{
2779	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2780	struct toepcb *toep;
2781
2782	/*
2783	 * Ignore replies to post-close aborts indicating that the abort was
2784	 * requested too late.  These connections are terminated when we get
2785	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2786	 * arrives the TID is either no longer used or it has been recycled.
2787	 */
2788	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2789discard:
2790		m_free(m);
2791		return (0);
2792	}
2793
2794	toep = (struct toepcb *)ctx;
2795
2796        /*
2797	 * Sometimes we've already closed the socket, e.g., a post-close
2798	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2799	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2800	 * but FW turns the ABORT_REQ into a regular one and so we get
2801	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2802	 */
2803	if (!toep)
2804		goto discard;
2805
2806	if (toep->tp_tp == NULL) {
2807		log(LOG_NOTICE, "removing tid for abort\n");
2808		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2809		if (toep->tp_l2t)
2810			l2t_release(L2DATA(cdev), toep->tp_l2t);
2811
2812		toepcb_release(toep);
2813		goto discard;
2814	}
2815
2816	log(LOG_NOTICE, "toep=%p\n", toep);
2817	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2818
2819	toepcb_hold(toep);
2820	process_abort_rpl(toep, m);
2821	toepcb_release(toep);
2822	return (0);
2823}
2824
2825/*
2826 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2827 * indicate whether RST should be sent in response.
2828 */
2829static int
2830abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2831{
2832	struct tcpcb *tp = so_sototcpcb(so);
2833
2834	switch (abort_reason) {
2835	case CPL_ERR_BAD_SYN:
2836#if 0
2837		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2838#endif
2839	case CPL_ERR_CONN_RESET:
2840		// XXX need to handle SYN_RECV due to crossed SYNs
2841		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2842	case CPL_ERR_XMIT_TIMEDOUT:
2843	case CPL_ERR_PERSIST_TIMEDOUT:
2844	case CPL_ERR_FINWAIT2_TIMEDOUT:
2845	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2846#if 0
2847		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2848#endif
2849		return (ETIMEDOUT);
2850	default:
2851		return (EIO);
2852	}
2853}
2854
2855static inline void
2856set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2857{
2858	struct cpl_abort_rpl *rpl = cplhdr(m);
2859
2860	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2861	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2862	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2863
2864	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2865	rpl->cmd = cmd;
2866}
2867
2868static void
2869send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2870{
2871	struct mbuf *reply_mbuf;
2872	struct cpl_abort_req_rss *req = cplhdr(m);
2873
2874	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2875	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2877	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2878	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2879	m_free(m);
2880}
2881
2882/*
2883 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2884 */
2885static inline int
2886is_neg_adv_abort(unsigned int status)
2887{
2888	return (status == CPL_ERR_RTX_NEG_ADVICE ||
2889	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2890}
2891
2892static void
2893send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2894{
2895	struct mbuf  *reply_mbuf;
2896	struct cpl_abort_req_rss *req = cplhdr(m);
2897
2898	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2899
2900	if (!reply_mbuf) {
2901		/* Defer the reply.  Stick rst_status into req->status. */
2902		req->status = rst_status;
2903		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2904		return;
2905	}
2906
2907	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2908	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2909	m_free(m);
2910
2911	/*
2912	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2913	 * these messages while ARP is pending.  For other connection states
2914	 * it's not a problem.
2915	 */
2916	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2917}
2918
2919#ifdef notyet
2920static void
2921cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2922{
2923	CXGB_UNIMPLEMENTED();
2924#ifdef notyet
2925	struct request_sock *req = child->sk_user_data;
2926
2927	inet_csk_reqsk_queue_removed(parent, req);
2928	synq_remove(tcp_sk(child));
2929	__reqsk_free(req);
2930	child->sk_user_data = NULL;
2931#endif
2932}
2933
2934
2935/*
2936 * Performs the actual work to abort a SYN_RECV connection.
2937 */
2938static void
2939do_abort_syn_rcv(struct socket *child, struct socket *parent)
2940{
2941	struct tcpcb *parenttp = so_sototcpcb(parent);
2942	struct tcpcb *childtp = so_sototcpcb(child);
2943
2944	/*
2945	 * If the server is still open we clean up the child connection,
2946	 * otherwise the server already did the clean up as it was purging
2947	 * its SYN queue and the skb was just sitting in its backlog.
2948	 */
2949	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2950		cleanup_syn_rcv_conn(child, parent);
2951		inp_wlock(childtp->t_inpcb);
2952		t3_release_offload_resources(childtp->t_toe);
2953		inp_wunlock(childtp->t_inpcb);
2954		tcp_offload_close(childtp);
2955	}
2956}
2957#endif
2958
2959/*
2960 * Handle abort requests for a SYN_RECV connection.  These need extra work
2961 * because the socket is on its parent's SYN queue.
2962 */
2963static int
2964abort_syn_rcv(struct socket *so, struct mbuf *m)
2965{
2966	CXGB_UNIMPLEMENTED();
2967#ifdef notyet
2968	struct socket *parent;
2969	struct toedev *tdev = toep->tp_toedev;
2970	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2971	struct socket *oreq = so->so_incomp;
2972	struct t3c_tid_entry *t3c_stid;
2973	struct tid_info *t;
2974
2975	if (!oreq)
2976		return -1;        /* somehow we are not on the SYN queue */
2977
2978	t = &(T3C_DATA(cdev))->tid_maps;
2979	t3c_stid = lookup_stid(t, oreq->ts_recent);
2980	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2981
2982	so_lock(parent);
2983	do_abort_syn_rcv(so, parent);
2984	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2985	so_unlock(parent);
2986#endif
2987	return (0);
2988}
2989
2990/*
2991 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2992 * request except that we need to reply to it.
2993 */
2994static void
2995process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2996{
2997	int rst_status = CPL_ABORT_NO_RST;
2998	const struct cpl_abort_req_rss *req = cplhdr(m);
2999	struct tcpcb *tp = toep->tp_tp;
3000	struct socket *so;
3001	int needclose = 0;
3002
3003	inp_wlock(tp->t_inpcb);
3004	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
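	/*
	 * The first copy of an ABORT_REQ is only noted and freed here; the
	 * real processing below runs when the request shows up a second
	 * time with TP_ABORT_REQ_RCVD already set.
	 */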
3005	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3006		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3007		m_free(m);
3008		goto skip;
3009	}
3010
3011	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3012	/*
3013	 * Three cases to consider:
3014	 * a) We haven't sent an abort_req; close the connection.
3015	 * b) We have sent a post-close abort_req that will get to TP too late
3016	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3017	 *    be ignored and the connection should be closed now.
3018	 * c) We have sent a regular abort_req that will get to TP too late.
3019	 *    That will generate an abort_rpl with status 0, wait for it.
3020	 */
3021	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3022	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3023		int error;
3024
3025		error = abort_status_to_errno(so, req->status,
3026		    &rst_status);
3027		so_error_set(so, error);
3028
3029		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3030			so_sorwakeup(so);
3031		/*
3032		 * SYN_RECV needs special processing.  If abort_syn_rcv()
3033		 * returns 0 it has taken care of the abort.
3034		 */
3035		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3036			goto skip;
3037
3038		t3_release_offload_resources(toep);
3039		needclose = 1;
3040	}
3041	inp_wunlock(tp->t_inpcb);
3042
3043	if (needclose)
3044		tcp_offload_close(tp);
3045
3046	send_abort_rpl(m, tdev, rst_status);
3047	return;
3048skip:
3049	inp_wunlock(tp->t_inpcb);
3050}
3051
3052/*
3053 * Handle an ABORT_REQ_RSS CPL message.
3054 */
3055static int
3056do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3057{
3058	const struct cpl_abort_req_rss *req = cplhdr(m);
3059	struct toepcb *toep = (struct toepcb *)ctx;
3060
3061	if (is_neg_adv_abort(req->status)) {
3062		m_free(m);
3063		return (0);
3064	}
3065
3066	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3067
3068	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3069		cxgb_remove_tid(cdev, toep, toep->tp_tid);
3070		toep->tp_flags |= TP_ABORT_REQ_RCVD;
3071
3072		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3073		if (toep->tp_l2t)
3074			l2t_release(L2DATA(cdev), toep->tp_l2t);
3075
3076		/*
3077		 *  Unhook
3078		 */
3079		toep->tp_tp->t_toe = NULL;
3080		toep->tp_tp->t_flags &= ~TF_TOE;
3081		toep->tp_tp = NULL;
3082		/*
3083		 * XXX need to call syncache_chkrst - but we don't
3084		 * have a way of doing that yet
3085		 */
3086		toepcb_release(toep);
3087		log(LOG_ERR, "abort for unestablished connection :-(\n");
3088		return (0);
3089	}
3090	if (toep->tp_tp == NULL) {
3091		log(LOG_NOTICE, "disconnected toepcb\n");
3092		/* should be freed momentarily */
3093		return (0);
3094	}
3095
3096
3097	toepcb_hold(toep);
3098	process_abort_req(toep, m, toep->tp_toedev);
3099	toepcb_release(toep);
3100	return (0);
3101}
3102#ifdef notyet
3103static void
3104pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3105{
3106	struct toedev *tdev = TOE_DEV(parent);
3107
3108	do_abort_syn_rcv(child, parent);
3109	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3110		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3111
3112		rpl->opt0h = htonl(F_TCAM_BYPASS);
3113		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3114		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3115	} else
3116		m_free(m);
3117}
3118#endif
3119static void
3120handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3121{
3122	CXGB_UNIMPLEMENTED();
3123
3124#ifdef notyet
3125	struct t3cdev *cdev;
3126	struct socket *parent;
3127	struct socket *oreq;
3128	struct t3c_tid_entry *t3c_stid;
3129	struct tid_info *t;
3130	struct tcpcb *otp, *tp = so_sototcpcb(so);
3131	struct toepcb *toep = tp->t_toe;
3132
3133	/*
3134	 * If the connection is being aborted due to the parent listening
3135	 * socket going away there's nothing to do, the ABORT_REQ will close
3136	 * the connection.
3137	 */
3138	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3139		m_free(m);
3140		return;
3141	}
3142
3143	oreq = so->so_incomp;
3144	otp = so_sototcpcb(oreq);
3145
3146	cdev = T3C_DEV(so);
3147	t = &(T3C_DATA(cdev))->tid_maps;
3148	t3c_stid = lookup_stid(t, otp->ts_recent);
3149	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3150
3151	so_lock(parent);
3152	pass_open_abort(so, parent, m);
3153	so_unlock(parent);
3154#endif
3155}
3156
3157/*
3158 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
3159 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3160 * connection.
3161 */
3162static void
3163pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3164{
3165
3166#ifdef notyet
3167	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3168	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3169#endif
3170	handle_pass_open_arp_failure(m_get_socket(m), m);
3171}
3172
3173/*
3174 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3175 */
3176static void
3177mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3178{
3179	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3180	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3181	unsigned int tid = GET_TID(req);
3182
3183	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3184	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3185	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3186	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3187	rpl->opt0h = htonl(F_TCAM_BYPASS);
3188	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3189	rpl->opt2 = 0;
3190	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3191}
3192
3193/*
3194 * Send a deferred reject to an accept request.
3195 */
3196static void
3197reject_pass_request(struct toedev *tdev, struct mbuf *m)
3198{
3199	struct mbuf *reply_mbuf;
3200
3201	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3202	mk_pass_accept_rpl(reply_mbuf, m);
3203	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3204	m_free(m);
3205}
3206
3207static void
3208handle_syncache_event(int event, void *arg)
3209{
3210	struct toepcb *toep = arg;
3211
3212	switch (event) {
3213	case TOE_SC_ENTRY_PRESENT:
3214		/*
3215		 * entry already exists - free toepcb
3216		 * and l2t
3217		 */
3218		printf("syncache entry present\n");
3219		toepcb_release(toep);
3220		break;
3221	case TOE_SC_DROP:
3222		/*
3223		 * The syncache has given up on this entry:
3224		 * either it timed out or it was evicted.
3225		 * We need to explicitly release the tid.
3226		 */
3227		printf("syncache entry dropped\n");
3228		toepcb_release(toep);
3229		break;
3230	default:
3231		log(LOG_ERR, "unknown syncache event %d\n", event);
3232		break;
3233	}
3234}
3235
3236static void
3237syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3238{
3239	struct in_conninfo inc;
3240	struct tcpopt to;
3241	struct tcphdr th;
3242	struct inpcb *inp;
3243	int mss, wsf, sack, ts;
3244	uint32_t rcv_isn = ntohl(req->rcv_isn);
3245
3246	bzero(&to, sizeof(struct tcpopt));
3247	inp = so_sotoinpcb(lso);
3248
3249	/*
3250	 * Fill out information for entering us into the syncache
3251	 */
3252	inc.inc_fport = th.th_sport = req->peer_port;
3253	inc.inc_lport = th.th_dport = req->local_port;
3254	th.th_seq = req->rcv_isn;
3255	th.th_flags = TH_SYN;
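	/*
	 * Initialize the connection's sequence markers just past the
	 * peer's ISN, accounting for the sequence space the SYN consumes.
	 */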
3256
3257	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3258
3259
3260	inc.inc_isipv6 = 0;
3261	inc.inc_len = 0;
3262	inc.inc_faddr.s_addr = req->peer_ip;
3263	inc.inc_laddr.s_addr = req->local_ip;
3264
3265	DPRINTF("syncache add of %d:%d %d:%d\n",
3266	    ntohl(req->local_ip), ntohs(req->local_port),
3267	    ntohl(req->peer_ip), ntohs(req->peer_port));
3268
3269	mss = req->tcp_options.mss;
3270	wsf = req->tcp_options.wsf;
3271	ts = req->tcp_options.tstamp;
3272	sack = req->tcp_options.sack;
3273	to.to_mss = mss;
3274	to.to_wscale = wsf;
3275	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3276	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3277}
3278
3279
3280/*
3281 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3282 * lock held.  Note that the sock here is a listening socket that is not owned
3283 * by the TOE.
3284 */
3285static void
3286process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3287    struct listen_ctx *lctx)
3288{
3289	int rt_flags;
3290	struct l2t_entry *e;
3291	struct iff_mac tim;
3292	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3293	struct cpl_pass_accept_rpl *rpl;
3294	struct cpl_pass_accept_req *req = cplhdr(m);
3295	unsigned int tid = GET_TID(req);
3296	struct tom_data *d = TOM_DATA(tdev);
3297	struct t3cdev *cdev = d->cdev;
3298	struct tcpcb *tp = so_sototcpcb(so);
3299	struct toepcb *newtoep;
3300	struct rtentry *dst;
3301	struct sockaddr_in nam;
3302	struct t3c_data *td = T3C_DATA(cdev);
3303
3304	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3305	if (__predict_false(reply_mbuf == NULL)) {
3306		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3307			t3_defer_reply(m, tdev, reject_pass_request);
3308		else {
3309			cxgb_queue_tid_release(cdev, tid);
3310			m_free(m);
3311		}
3312		DPRINTF("failed to get reply_mbuf\n");
3313
3314		goto out;
3315	}
3316
3317	if (tp->t_state != TCPS_LISTEN) {
3318		DPRINTF("socket not in listen state\n");
3319
3320		goto reject;
3321	}
3322
3323	tim.mac_addr = req->dst_mac;
3324	tim.vlan_tag = ntohs(req->vlan_tag);
3325	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3326		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3327		goto reject;
3328	}
3329
3330#ifdef notyet
3331	/*
3332	 * XXX do route lookup to confirm that we're still listening on this
3333	 * address
3334	 */
3335	if (ip_route_input(skb, req->local_ip, req->peer_ip,
3336			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3337		goto reject;
3338	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3339		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3340	dst_release(skb->dst);	// done with the input route, release it
3341	skb->dst = NULL;
3342
3343	if ((rt_flags & RTF_LOCAL) == 0)
3344		goto reject;
3345#endif
3346	/*
3347	 * XXX
3348	 */
3349	rt_flags = RTF_LOCAL;
3350	if ((rt_flags & RTF_LOCAL) == 0)
3351		goto reject;
3352
3353	/*
3354	 * Calculate values and add to syncache
3355	 */
3356
3357	newtoep = toepcb_alloc();
3358	if (newtoep == NULL)
3359		goto reject;
3360
3361	bzero(&nam, sizeof(struct sockaddr_in));
3362
3363	nam.sin_len = sizeof(struct sockaddr_in);
3364	nam.sin_family = AF_INET;
3365	nam.sin_addr.s_addr = req->peer_ip;
3366	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3367
3368	if (dst == NULL) {
3369		printf("failed to find route\n");
3370		goto reject;
3371	}
3372	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3373	    (struct sockaddr *)&nam);
3374	if (e == NULL) {
3375		DPRINTF("failed to get l2t\n");
		goto reject;
3376	}
3377	/*
3378	 * Point to our listen socket until accept
3379	 */
3380	newtoep->tp_tp = tp;
3381	newtoep->tp_flags = TP_SYN_RCVD;
3382	newtoep->tp_tid = tid;
3383	newtoep->tp_toedev = tdev;
3384	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3385
3386	cxgb_insert_tid(cdev, d->client, newtoep, tid);
3387	so_lock(so);
3388	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3389	so_unlock(so);
3390
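	/*
	 * Enable DDP for the new connection only if the tunable allows it,
	 * the listener hasn't opted out via SO_NO_DDP, and the receive
	 * window is large enough for zero-copy placement to be worthwhile.
	 */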
3391	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3392		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3393
3394	if (newtoep->tp_ulp_mode) {
3395		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3396
3397		if (ddp_mbuf == NULL)
3398			newtoep->tp_ulp_mode = 0;
3399	}
3400
3401	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3402	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3403	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3404	/*
3405	 * XXX workaround for lack of syncache drop
3406	 */
3407	toepcb_hold(newtoep);
3408	syncache_add_accept_req(req, so, newtoep);
3409
3410	rpl = cplhdr(reply_mbuf);
3411	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3412	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3413	rpl->wr.wr_lo = 0;
3414	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3415	rpl->opt2 = htonl(calc_opt2(so, tdev));
3416	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3417	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
3418
3419	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3420	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3421	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3422				  CPL_PASS_OPEN_ACCEPT);
3423
3424	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3425
3426	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3427
3428	l2t_send(cdev, reply_mbuf, e);
3429	m_free(m);
3430	if (newtoep->tp_ulp_mode) {
3431		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3432				V_TF_DDP_OFF(1) |
3433				TP_DDP_TIMER_WORKAROUND_MASK,
3434				V_TF_DDP_OFF(1) |
3435		    TP_DDP_TIMER_WORKAROUND_VAL, 1);
3436	} else
3437		printf("not offloading\n");
3438
3441	return;
3442reject:
3443	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3444		mk_pass_accept_rpl(reply_mbuf, m);
3445	else
3446		mk_tid_release(reply_mbuf, newtoep, tid);
3447	cxgb_ofld_send(cdev, reply_mbuf);
3448	m_free(m);
3449out:
3450#if 0
3451	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3452#else
3453	return;
3454#endif
3455}
3456
3457/*
3458 * Handle a CPL_PASS_ACCEPT_REQ message.
3459 */
3460static int
3461do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3462{
3463	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3464	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3465	struct tom_data *d = listen_ctx->tom_data;
3466
3467#if VALIDATE_TID
3468	struct cpl_pass_accept_req *req = cplhdr(m);
3469	unsigned int tid = GET_TID(req);
3470	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3471
3472	if (unlikely(!lsk)) {
3473		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3474		       cdev->name,
3475		       (unsigned long)((union listen_entry *)ctx -
3476					t->stid_tab));
3477		return CPL_RET_BUF_DONE;
3478	}
3479	if (unlikely(tid >= t->ntids)) {
3480		printk(KERN_ERR "%s: passive open TID %u too large\n",
3481		       cdev->name, tid);
3482		return CPL_RET_BUF_DONE;
3483	}
3484	/*
3485	 * For T3A the current user of the TID may have closed but its last
3486	 * message(s) may have been backlogged so the TID appears to be still
3487	 * in use.  Just take the TID away, the connection can close at its
3488	 * own leisure.  For T3B this situation is a bug.
3489	 */
3490	if (!valid_new_tid(t, tid) &&
3491	    cdev->type != T3A) {
3492		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3493		       cdev->name, tid);
3494		return CPL_RET_BUF_DONE;
3495	}
3496#endif
3497
3498	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3499	return (0);
3500}
3501
3502/*
3503 * Called when a connection is established to translate the TCP options
3504 * reported by HW to FreeBSD's native format.
3505 */
3506static void
3507assign_rxopt(struct socket *so, unsigned int opt)
3508{
3509	struct tcpcb *tp = so_sototcpcb(so);
3510	struct toepcb *toep = tp->t_toe;
3511	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3512
3513	inp_lock_assert(tp->t_inpcb);
3514
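	/*
	 * G_TCPOPT_MSS() is an index into the adapter's MTU table; the
	 * resulting MSS is that MTU less 40 bytes of IP and TCP headers.
	 */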
3515	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3516	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3517	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3518	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3519	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3520	    (TF_RCVD_SCALE|TF_REQ_SCALE))
3521		tp->rcv_scale = tp->request_r_scale;
3522}
3523
3524/*
3525 * Completes some final bits of initialization for just established connections
3526 * and changes their state to TCP_ESTABLISHED.
3527 *
3528 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3529 */
3530static void
3531make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3532{
3533	struct tcpcb *tp = so_sototcpcb(so);
3534	struct toepcb *toep = tp->t_toe;
3535
3536	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3537	assign_rxopt(so, opt);
3538
3539	/*
3540	 * XXX
3542	 */
3543#ifdef notyet
3544	so->so_proto->pr_ctloutput = t3_ctloutput;
3545#endif
3546
3547#if 0
3548	inet_sk(sk)->id = tp->write_seq ^ jiffies;
3549#endif
3550	/*
3551	 * XXX not clear what rcv_wup maps to
3552	 */
3553	/*
3554	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3555	 * pass through opt0.
3556	 */
3557	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3558		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3559
3560	dump_toepcb(toep);
3561
3562#ifdef notyet
3563/*
3564 * no clean interface for marking ARP up to date
3565 */
3566	dst_confirm(sk->sk_dst_cache);
3567#endif
3568	tp->t_starttime = ticks;
3569	tp->t_state = TCPS_ESTABLISHED;
3570	soisconnected(so);
3571}
3572
3573static int
3574syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3575{
3576
3577	struct in_conninfo inc;
3578	struct tcpopt to;
3579	struct tcphdr th;
3580	int mss, wsf, sack, ts;
3581	struct mbuf *m = NULL;
3582	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3583	unsigned int opt;
3584
3585#ifdef MAC
3586#error	"no MAC support"
3587#endif
3588
3589	opt = ntohs(req->tcp_opt);
3590
3591	bzero(&to, sizeof(struct tcpopt));
3592
3593	/*
3594	 * Fill out information for entering us into the syncache
3595	 */
3596	inc.inc_fport = th.th_sport = req->peer_port;
3597	inc.inc_lport = th.th_dport = req->local_port;
3598	th.th_seq = req->rcv_isn;
3599	th.th_flags = TH_ACK;
3600
3601	inc.inc_isipv6 = 0;
3602	inc.inc_len = 0;
3603	inc.inc_faddr.s_addr = req->peer_ip;
3604	inc.inc_laddr.s_addr = req->local_ip;
3605
3606	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3607	wsf  = G_TCPOPT_WSCALE_OK(opt);
3608	ts   = G_TCPOPT_TSTAMP(opt);
3609	sack = G_TCPOPT_SACK(opt);
3610
3611	to.to_mss = mss;
3612	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3613	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3614
3615	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3616	    ntohl(req->local_ip), ntohs(req->local_port),
3617	    ntohl(req->peer_ip), ntohs(req->peer_port),
3618	    mss, wsf, ts, sack);
3619	return (syncache_offload_expand(&inc, &to, &th, so, m));
3620}
3621
3622
3623/*
3624 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3625 * if we are in TCP_SYN_RECV due to crossed SYNs
3626 */
3627static int
3628do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3629{
3630	struct cpl_pass_establish *req = cplhdr(m);
3631	struct toepcb *toep = (struct toepcb *)ctx;
3632	struct tcpcb *tp = toep->tp_tp;
3633	struct socket *so, *lso;
3634	struct t3c_data *td = T3C_DATA(cdev);
3635	struct sockbuf *snd, *rcv;
3636	struct toedev *tdev;
3637
3638	// Complete socket initialization now that we have the SND_ISN
3639	tdev = toep->tp_toedev;
3643
3644	inp_wlock(tp->t_inpcb);
3645
3646	/*
3647	 *
3648	 * XXX need to add reference while we're manipulating
3649	 */
3650	so = lso = inp_inpcbtosocket(tp->t_inpcb);
3651
3652	inp_wunlock(tp->t_inpcb);
3653
3654	so_lock(so);
3655	LIST_REMOVE(toep, synq_entry);
3656	so_unlock(so);
3657
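	/*
	 * Expand the syncache entry added at PASS_ACCEPT_REQ time into a
	 * fully connected socket for this TID.
	 */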
3658	if (!syncache_expand_establish_req(req, &so, toep)) {
3659		/*
3660		 * No entry
3661		 */
3662		CXGB_UNIMPLEMENTED();
3663	}
3664	if (so == NULL) {
3665		/*
3666		 * Couldn't create the socket
3667		 */
3668		CXGB_UNIMPLEMENTED();
3669	}
3670
3671	tp = so_sototcpcb(so);
3672	inp_wlock(tp->t_inpcb);
3673
3674	snd = so_sockbuf_snd(so);
3675	rcv = so_sockbuf_rcv(so);
3676
3677	snd->sb_flags |= SB_NOCOALESCE;
3678	rcv->sb_flags |= SB_NOCOALESCE;
3679
3680	toep->tp_tp = tp;
3681	toep->tp_flags = 0;
3682	tp->t_toe = toep;
3683	reset_wr_list(toep);
3684	tp->rcv_wnd = select_rcv_wnd(tdev, so);
3685	tp->rcv_nxt = toep->tp_copied_seq;
3686	install_offload_ops(so);
3687
3688	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3689	toep->tp_wr_unacked = 0;
3690	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3691	toep->tp_qset_idx = 0;
3692	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3693
3694	/*
3695	 * XXX Cancel any keep alive timer
3696	 */
3697
3698	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3699
3700	/*
3701	 * XXX workaround for lack of syncache drop
3702	 */
3703	toepcb_release(toep);
3704	inp_wunlock(tp->t_inpcb);
3705
3706	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3707	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3708#ifdef notyet
3709	/*
3710	 * XXX not sure how these checks map to us
3711	 */
3712	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3713		sk->sk_state_change(sk);
3714		sk_wake_async(so, 0, POLL_OUT);
3715	}
3716	/*
3717	 * The state for the new connection is now up to date.
3718	 * Next check if we should add the connection to the parent's
3719	 * accept queue.  When the parent closes it resets connections
3720	 * on its SYN queue, so check if we are being reset.  If so we
3721	 * don't need to do anything more, the coming ABORT_RPL will
3722	 * destroy this socket.  Otherwise move the connection to the
3723	 * accept queue.
3724	 *
3725	 * Note that we reset the synq before closing the server so if
3726	 * we are not being reset the stid is still open.
3727	 */
3728	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3729		__kfree_skb(skb);
3730		goto unlock;
3731	}
3732#endif
3733	m_free(m);
3734
3735	return (0);
3736}
3737
3738/*
3739 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3740 * and send them to the TOE.
3741 */
3742static void
3743fixup_and_send_ofo(struct toepcb *toep)
3744{
3745	struct mbuf *m;
3746	struct toedev *tdev = toep->tp_toedev;
3747	struct tcpcb *tp = toep->tp_tp;
3748	unsigned int tid = toep->tp_tid;
3749
3750	log(LOG_NOTICE, "fixup_and_send_ofo\n");
3751
3752	inp_lock_assert(tp->t_inpcb);
3753	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3754		/*
3755		 * A variety of messages can be waiting but the fields we'll
3756		 * be touching are common to all so any message type will do.
3757		 */
3758		struct cpl_close_con_req *p = cplhdr(m);
3759
3760		p->wr.wr_lo = htonl(V_WR_TID(tid));
3761		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3762		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3763	}
3764}
3765
3766/*
3767 * Updates socket state from an active establish CPL message.  Runs with the
3768 * socket lock held.
3769 */
3770static void
3771socket_act_establish(struct socket *so, struct mbuf *m)
3772{
3773	struct cpl_act_establish *req = cplhdr(m);
3774	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3775	struct tcpcb *tp = so_sototcpcb(so);
3776	struct toepcb *toep = tp->t_toe;
3777
3778	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3779		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3780		    toep->tp_tid, tp->t_state);
3781
3782	tp->ts_recent_age = ticks;
3783	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3784	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3785
3786	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3787
3788	/*
3789	 * Now that we finally have a TID send any CPL messages that we had to
3790	 * defer for lack of a TID.
3791	 */
3792	if (mbufq_len(&toep->out_of_order_queue))
3793		fixup_and_send_ofo(toep);
3794
3795	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3796		/*
3797		 * XXX does this even make sense?
3798		 */
3799		so_sorwakeup(so);
3800	}
3801	m_free(m);
3802#ifdef notyet
3803/*
3804 * XXX assume no write requests permitted while socket connection is
3805 * incomplete
3806 */
3807	/*
3808	 * Currently the send queue must be empty at this point because the
3809	 * socket layer does not send anything before a connection is
3810	 * established.  To be future proof though we handle the possibility
3811	 * that there are pending buffers to send (either TX_DATA or
3812	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3813	 * buffers according to the just learned write_seq, and then we send
3814	 * them on their way.
3815	 */
3816	fixup_pending_writeq_buffers(sk);
3817	if (t3_push_frames(so, 1))
3818		sk->sk_write_space(sk);
3819#endif
3820
3821	toep->tp_state = tp->t_state;
3822	tcpstat.tcps_connects++;
3823
3824}
3825
3826/*
3827 * Process a CPL_ACT_ESTABLISH message.
3828 */
3829static int
3830do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3831{
3832	struct cpl_act_establish *req = cplhdr(m);
3833	unsigned int tid = GET_TID(req);
3834	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3835	struct toepcb *toep = (struct toepcb *)ctx;
3836	struct tcpcb *tp = toep->tp_tp;
3837	struct socket *so;
3838	struct toedev *tdev;
3839	struct tom_data *d;
3840
3841	if (tp == NULL) {
3842		free_atid(cdev, atid);
3843		return (0);
3844	}
3845	inp_wlock(tp->t_inpcb);
3846
3847	/*
3848	 * XXX
3849	 */
3850	so = inp_inpcbtosocket(tp->t_inpcb);
3851	tdev = toep->tp_toedev; /* blow up here if link was down */
3852	d = TOM_DATA(tdev);
3853
3854	/*
3855	 * It's OK if the TID is currently in use, the owning socket may have
3856	 * backlogged its last CPL message(s).  Just take it away.
3857	 */
3858	toep->tp_tid = tid;
3859	toep->tp_tp = tp;
3860	so_insert_tid(d, toep, tid);
3861	free_atid(cdev, atid);
3862	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3863
3864	socket_act_establish(so, m);
3865	inp_wunlock(tp->t_inpcb);
3866	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3867	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3868
3869	return (0);
3870}
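
/*
 * Editor's note on the TID hand-off above: an active open runs under a
 * temporary atid until the connection is established; CPL_ACT_ESTABLISH
 * carries both the atid (in tos_tid) and the permanent tid (in the
 * header), decoded as
 *
 *	tid  = GET_TID(req);
 *	atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
 *
 * so the handler must move the toepcb into the tid table and release the
 * atid before any further CPLs for this connection are dispatched.
 */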
3871
3872/*
3873 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3874 * next batch of work requests from the write queue.
3875 */
3876static void
3877wr_ack(struct toepcb *toep, struct mbuf *m)
3878{
3879	struct tcpcb *tp = toep->tp_tp;
3880	struct cpl_wr_ack *hdr = cplhdr(m);
3881	struct socket *so;
3882	unsigned int credits = ntohs(hdr->credits);
3883	u32 snd_una = ntohl(hdr->snd_una);
3884	int bytes = 0;
3885	struct sockbuf *snd;
3886
3887	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3888
3889	inp_wlock(tp->t_inpcb);
3890	so = inp_inpcbtosocket(tp->t_inpcb);
3891	toep->tp_wr_avail += credits;
3892	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3893		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3894
3895	while (credits) {
3896		struct mbuf *p = peek_wr(toep);
3897
3898		if (__predict_false(!p)) {
3899			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3900			    "nothing pending, state %u wr_avail=%u\n",
3901			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3902			break;
3903		}
3904		CTR2(KTR_TOM,
3905			"wr_ack: p->credits=%d p->bytes=%d",
3906		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3907		KASSERT(p->m_pkthdr.csum_data != 0,
3908		    ("empty request still on list"));
3909
3910		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3911
3912	#if DEBUG_WR > 1
3913				struct tx_data_wr *w = cplhdr(p);
3914				log(LOG_ERR,
3915				    "TID %u got %u WR credits, need %u, len %u, "
3916				    "seq # %u, ACK una %u, ACK nxt %u, "
3917				    "WR_AVAIL %u, WRs pending %u\n",
3918				    toep->tp_tid, credits, p->m_pkthdr.csum_data,
3919				    p->m_pkthdr.len, ntohl(w->sndseq), snd_una,
3920				    ntohl(hdr->snd_nxt), toep->tp_wr_avail,
3921				    count_pending_wrs(tp) - credits);
3922	#endif
3923			p->m_pkthdr.csum_data -= credits;
3924			break;
3925		} else {
3926			dequeue_wr(toep);
3927			credits -= p->m_pkthdr.csum_data;
3928			bytes += p->m_pkthdr.len;
3929			CTR3(KTR_TOM,
3930			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3931			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3932
3933			m_free(p);
3934		}
3935	}
3936
3937#if DEBUG_WR
3938	check_wr_invariants(tp);
3939#endif
3940
3941	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3942#if VALIDATE_SEQ
3943		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3944
3945		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3946		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3947		    toep->tp_tid, tp->snd_una);
3948#endif
3949		goto out_free;
3950	}
3951
3952	if (tp->snd_una != snd_una) {
3953		tp->snd_una = snd_una;
3954		tp->ts_recent_age = ticks;
3955#ifdef notyet
3956		/*
3957		 * Keep ARP entry "minty fresh"
3958		 */
3959		dst_confirm(sk->sk_dst_cache);
3960#endif
3961		if (tp->snd_una == tp->snd_nxt)
3962			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3963	}
3964
3965	snd = so_sockbuf_snd(so);
3966	if (bytes) {
3967		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3968			/* so_sowwakeup_locked() drops the sockbuf lock. */
3969		sockbuf_lock(snd);
3970		sbdrop_locked(snd, bytes);
3971		so_sowwakeup_locked(so);
3972	}
3973
3974	if (snd->sb_sndptroff < snd->sb_cc)
3975		t3_push_frames(so, 0);
3976
3977out_free:
3978	inp_wunlock(tp->t_inpcb);
3979	m_free(m);
3980}
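
/*
 * Worked example (illustrative): suppose the write queue holds two WRs
 * that consumed 2 and 3 credits (recorded in m_pkthdr.csum_data) for
 * 1460 and 2920 bytes of payload.  A CPL_WR_ACK returning credits = 4
 * fully retires the first WR (credits 4 -> 2, bytes = 1460), then only
 * partially acks the second: 2 < 3, so its csum_data drops to 1 and the
 * loop breaks.  Just the 1460 fully-acked bytes are dropped from the
 * send buffer; the rest waits for the next WR_ACK.
 */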
3981
3982/*
3983	 * Handler for CPL_TX_DMA_ACK (transmit WR completion) messages.
3984 */
3985static int
3986do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3987{
3988	struct toepcb *toep = (struct toepcb *)ctx;
3989
3990	KASSERT(toep != NULL, ("%s: NULL toepcb", __func__));
3991
3992	wr_ack(toep, m);
3993	return (0);
3994}
3995
3996/*
3997 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
3998 */
3999static int
4000do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4001{
4002	m_freem(m);
4003	return (0);
4004}
4005
4006/*
4007 * Reset a connection that is on a listener's SYN queue or accept queue,
4008 * i.e., one that has not had a struct socket associated with it.
4009 * Must be called from process context.
4010 *
4011 * Modeled after code in inet_csk_listen_stop().
4012 */
4013static void
4014t3_reset_listen_child(struct socket *child)
4015{
4016	struct tcpcb *tp = so_sototcpcb(child);
4017
4018	t3_send_reset(tp->t_toe);
4019}
4020
4021
4022static void
4023t3_child_disconnect(struct socket *so, void *arg)
4024{
4025	struct tcpcb *tp = so_sototcpcb(so);
4026
4027	if (tp->t_flags & TF_TOE) {
4028		inp_wlock(tp->t_inpcb);
4029		t3_reset_listen_child(so);
4030		inp_wunlock(tp->t_inpcb);
4031	}
4032}
4033
4034/*
4035 * Disconnect offloaded established but not yet accepted connections sitting
4036 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4037 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4038 */
4039void
4040t3_disconnect_acceptq(struct socket *listen_so)
4041{
4042
4043	so_lock(listen_so);
4044	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4045	so_unlock(listen_so);
4046}
4047
4048/*
4049 * Reset offloaded connections sitting on a server's syn queue.  As above
4050 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4051 */
4052
4053void
4054t3_reset_synq(struct listen_ctx *lctx)
4055{
4056	struct toepcb *toep;
4057
4058	so_lock(lctx->lso);
4059	while (!LIST_EMPTY(&lctx->synq_head)) {
4060		toep = LIST_FIRST(&lctx->synq_head);
4061		LIST_REMOVE(toep, synq_entry);
4062		toep->tp_tp = NULL;
4063		t3_send_reset(toep);
4064		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4065		toepcb_release(toep);
4066	}
4067	so_unlock(lctx->lso);
4068}
4069
4070
4071int
4072t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4073		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
4074		   unsigned int pg_off, unsigned int color)
4075{
4076	unsigned int i, j, pidx;
4077	struct pagepod *p;
4078	struct mbuf *m;
4079	struct ulp_mem_io *req;
4080	unsigned int tid = toep->tp_tid;
4081	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4082	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4083
4084	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4085	    gl, nppods, tag, maxoff, pg_off, color);
4086
4087	for (i = 0; i < nppods; ++i) {
4088		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4089		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4090		req = mtod(m, struct ulp_mem_io *);
4091		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4092		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4093		req->wr.wr_lo = 0;
4094		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4095					   V_ULPTX_CMD(ULP_MEM_WRITE));
4096		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4097				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4098
4099		p = (struct pagepod *)(req + 1);
4100		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
4101			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4102			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4103						  V_PPOD_COLOR(color));
4104			p->pp_max_offset = htonl(maxoff);
4105			p->pp_page_offset = htonl(pg_off);
4106			p->pp_rsvd = 0;
4107			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4108				p->pp_addr[j] = pidx < gl->dgl_nelem ?
4109				    htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4110		} else
4111			p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4112		send_or_defer(toep, m, 0);
4113		ppod_addr += PPOD_SIZE;
4114	}
4115	return (0);
4116}
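
/*
 * Page-pod arithmetic (editor's illustration): each pod above advances
 * pidx by 4 pages but writes 5 addresses (j = 0..4), so pod i maps pages
 * [4i, 4i + 4] and consecutive pods overlap by one page.  Assuming
 * NUM_SENTINEL_PPODS is 1, a 10-page gather list needs nppods = 4:
 * pods 0-2 cover pages 0-4, 4-8 and 8-9 (entries past dgl_nelem are
 * zeroed) and pod 3 is written with pp_vld_tid = 0 so the hardware
 * stops there.
 */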
4117
4118/*
4119 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4120 */
4121static inline void
4122mk_cpl_barrier_ulp(struct cpl_barrier *b)
4123{
4124	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4125
4126	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4127	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4128	b->opcode = CPL_BARRIER;
4129}
4130
4131/*
4132 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4133 */
4134static inline void
4135mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4136{
4137	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4138
4140	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4141	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4142	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4143	req->cpuno = htons(cpuno);
4144}
4145
4146/*
4147 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4148 */
4149static inline void
4150mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4151                     unsigned int word, uint64_t mask, uint64_t val)
4152{
4153	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4154
4155	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4156	    tid, word, mask, val);
4157
4158	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4159	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4160	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4161	req->reply = V_NO_REPLY(1);
4162	req->cpu_idx = 0;
4163	req->word = htons(word);
4164	req->mask = htobe64(mask);
4165	req->val = htobe64(val);
4166}
4167
4168/*
4169 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4170 */
4171static void
4172mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4173    unsigned int tid, unsigned int credits)
4174{
4175	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4176
4177	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4178	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4179	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4180	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4181	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4182				 V_RX_CREDITS(credits));
4183}
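
/*
 * Compound-WR layout (editor's sketch): each mk_*_ulp() helper above
 * emits a ULP_TXPKT header plus its CPL, sized in 8-byte flits, so that
 * several control CPLs can ride in a single FW_WROPCODE_BYPASS work
 * request laid out back to back in one mbuf:
 *
 *	struct work_request_hdr   wr;       (BYPASS opcode)
 *	struct cpl_barrier        barrier;  (mk_cpl_barrier_ulp())
 *	struct cpl_set_tcb_field  field;    (mk_set_tcb_field_ulp())
 *	struct cpl_get_tcb        get_tcb;  (mk_get_tcb_ulp())
 *
 * t3_cancel_ddpbuf() and t3_overlay_ddpbuf() below build exactly this
 * sort of message.
 */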
4184
4185void
4186t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4187{
4188	unsigned int wrlen;
4189	struct mbuf *m;
4190	struct work_request_hdr *wr;
4191	struct cpl_barrier *lock;
4192	struct cpl_set_tcb_field *req;
4193	struct cpl_get_tcb *getreq;
4194	struct ddp_state *p = &toep->tp_ddp_state;
4195
4196#if 0
4197	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4198#endif
4199	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4200		sizeof(*getreq);
4201	m = m_gethdr_nofail(wrlen);
4202	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4203	wr = mtod(m, struct work_request_hdr *);
4204	bzero(wr, wrlen);
4205
4206	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4207	m->m_pkthdr.len = m->m_len = wrlen;
4208
4209	lock = (struct cpl_barrier *)(wr + 1);
4210	mk_cpl_barrier_ulp(lock);
4211
4212	req = (struct cpl_set_tcb_field *)(lock + 1);
4213
4214	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4215
4216	/* XXX Not sure this is actually a good thing: reactivating the
4217	 * other buffer might be an issue if it has already been completed.
4218	 * However, that is unlikely, since the fact that the UBUF is not
4219	 * completed indicates that there is no outstanding data.
4220	 */
4221	if (bufidx == 0)
4222		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4223				     V_TF_DDP_ACTIVE_BUF(1) |
4224				     V_TF_DDP_BUF0_VALID(1),
4225				     V_TF_DDP_ACTIVE_BUF(1));
4226	else
4227		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4228				     V_TF_DDP_ACTIVE_BUF(1) |
4229				     V_TF_DDP_BUF1_VALID(1), 0);
4230
4231	getreq = (struct cpl_get_tcb *)(req + 1);
4232	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4233
4234	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4235
4236	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4238	p->get_tcb_count++;
4239
4240#ifdef T3_TRACE
4241	T3_TRACE1(TIDTB(so),
4242		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
4243#endif
4244	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4245}
4246
4247/**
4248 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4249 * @sk: the socket associated with the buffers
4250 * @bufidx: index of HW DDP buffer (0 or 1)
4251 * @tag0: new tag for HW buffer 0
4252 * @tag1: new tag for HW buffer 1
4253 * @len: new length for HW buf @bufidx
4254 *
4255 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4256 * buffer by changing the buffer tag and length and setting the valid and
4257 * active flag accordingly.  The caller must ensure the new buffer is at
4258 * least as big as the existing one.  Since we typically reprogram both HW
4259	 * buffers, this function sets both tags for convenience.  Read the TCB to
4260	 * determine how much data was written into the buffer before the overlay
4261	 * took place.
4262 */
4263void
4264t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4265	 	       unsigned int tag1, unsigned int len)
4266{
4267	unsigned int wrlen;
4268	struct mbuf *m;
4269	struct work_request_hdr *wr;
4270	struct cpl_get_tcb *getreq;
4271	struct cpl_set_tcb_field *req;
4272	struct ddp_state *p = &toep->tp_ddp_state;
4273
4274	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4275	    bufidx, tag0, tag1, len);
4276#if 0
4277	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4278#endif
4279	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4280	m = m_gethdr_nofail(wrlen);
4281	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4282	wr = mtod(m, struct work_request_hdr *);
4283	m->m_pkthdr.len = m->m_len = wrlen;
4284	bzero(wr, wrlen);
4285
4286
4287	/* Set the ATOMIC flag to make sure that TP processes the following
4288	 * CPLs in an atomic manner and no wire segments can be interleaved.
4289	 */
4290	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4291	req = (struct cpl_set_tcb_field *)(wr + 1);
4292	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4293			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4294			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4295			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
4296			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4297	req++;
4298	if (bufidx == 0) {
4299		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4300			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4301			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4302		req++;
4303		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4304			    V_TF_DDP_PUSH_DISABLE_0(1) |
4305			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4306			    V_TF_DDP_PUSH_DISABLE_0(0) |
4307			    V_TF_DDP_BUF0_VALID(1));
4308	} else {
4309		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4310			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4311			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4312		req++;
4313		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4314			    V_TF_DDP_PUSH_DISABLE_1(1) |
4315			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4316			    V_TF_DDP_PUSH_DISABLE_1(0) |
4317			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4318	}
4319
4320	getreq = (struct cpl_get_tcb *)(req + 1);
4321	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4322
4323	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
4325	p->get_tcb_count++;
4326
4327#ifdef T3_TRACE
4328	T3_TRACE4(TIDTB(sk),
4329		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4330		  "len %d",
4331		  bufidx, tag0, tag1, len);
4332#endif
4333	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4334}
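
/*
 * Packing note (editor's illustration): the first SET_TCB_FIELD above
 * writes both tags at once; BUF0_TAG sits in the low 32 bits of the
 * 64-bit value and BUF1_TAG in the high 32 bits, hence the shifts by 32.
 * For example, with tag0 = 5 and tag1 = 9:
 *
 *	val = V_TCB_RX_DDP_BUF0_TAG(5) |
 *	      V_TCB_RX_DDP_BUF1_TAG((uint64_t)9) << 32;
 *
 * The (uint64_t) cast keeps the shift from overflowing a 32-bit type.
 */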
4335
4336/*
4337 * Sends a compound WR containing all the CPL messages needed to program the
4338 * two HW DDP buffers, namely optionally setting up the length and offset of
4339 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4340 */
4341void
4342t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4343		      unsigned int len1, unsigned int offset1,
4344                      uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4345{
4346	unsigned int wrlen;
4347	struct mbuf *m;
4348	struct work_request_hdr *wr;
4349	struct cpl_set_tcb_field *req;
4350
4351	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4352	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4353
4354#if 0
4355	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4356#endif
4357	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4358		(len1 ? sizeof(*req) : 0) +
4359		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4360	m = m_gethdr_nofail(wrlen);
4361	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4362	wr = mtod(m, struct work_request_hdr *);
4363	bzero(wr, wrlen);
4364
4365	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4366	m->m_pkthdr.len = m->m_len = wrlen;
4367
4368	req = (struct cpl_set_tcb_field *)(wr + 1);
4369	if (len0) {                  /* program buffer 0 offset and length */
4370		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4371			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4372			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4373			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4374			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4375		req++;
4376	}
4377	if (len1) {                  /* program buffer 1 offset and length */
4378		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4379			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4380			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4381			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4382			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4383		req++;
4384	}
4385
4386	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4387			     ddp_flags);
4388
4389	if (modulate) {
4390		mk_rx_data_ack_ulp(toep,
4391		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4392		    toep->tp_copied_seq - toep->tp_rcv_wup);
4393		toep->tp_rcv_wup = toep->tp_copied_seq;
4394	}
4395
4396#ifdef T3_TRACE
4397	T3_TRACE5(TIDTB(sk),
4398		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4399		  "modulate %d",
4400		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4401		  modulate);
4402#endif
4403
4404	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4405}
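
/*
 * Usage sketch (illustrative, hypothetical values): to program only HW
 * buffer 0 and piggy-back a receive-credit return, a caller might issue
 *
 *	t3_setup_ddpbufs(toep, len0, 0, 0, 0,
 *	    V_TF_DDP_BUF0_VALID(1),
 *	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 1);
 *
 * Here len1 = 0 suppresses the buffer-1 SET_TCB_FIELD and modulate = 1
 * appends the RX_DATA_ACK built by mk_rx_data_ack_ulp().
 */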
4406
4407void
4408t3_init_wr_tab(unsigned int wr_len)
4409{
4410	int i;
4411
4412	if (mbuf_wrs[1])     /* already initialized */
4413		return;
4414
4415	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4416		int sgl_len = (3 * i) / 2 + (i & 1);
4417
4418		sgl_len += 3;
4419		mbuf_wrs[i] = sgl_len <= wr_len ?
4420		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
4421	}
4422
4423	wrlen = wr_len * 8;
4424}
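
/*
 * Worked example (illustrative): an mbuf chain of i = 4 buffers needs an
 * SGL of (3 * 4) / 2 + (4 & 1) = 6 flits, plus 3 flits of header
 * overhead, so sgl_len = 9.  With wr_len = 9 flits per work request,
 * mbuf_wrs[4] = 1; with wr_len = 8 it becomes
 * 1 + (9 - 2) / (8 - 1) = 2 work requests.
 */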
4425
4426int
4427t3_init_cpl_io(void)
4428{
4429#ifdef notyet
4430	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4431	if (!tcphdr_skb) {
4432		log(LOG_ERR,
4433		       "Chelsio TCP offload: can't allocate sk_buff\n");
4434		return -1;
4435	}
4436	skb_put(tcphdr_skb, sizeof(struct tcphdr));
4437	tcphdr_skb->h.raw = tcphdr_skb->data;
4438	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4439#endif
4440
4441	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4442	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4443	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4444	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4445	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4446	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4447	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4448	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4449	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4450	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4451	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4452	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4453	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4454	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4455	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4456	return (0);
4457}
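
/*
 * Handler pattern (editor's sketch): every t3tom_register_cpl_handler()
 * entry above follows the do_trace_pkt() contract: the handler gets the
 * t3cdev, the mbuf holding the CPL, and the per-connection context
 * (typically a toepcb), and must consume the mbuf.  A minimal sink for
 * some hypothetical CPL_FOO would be:
 *
 *	static int
 *	do_foo(struct t3cdev *dev, struct mbuf *m, void *ctx)
 *	{
 *		m_freem(m);
 *		return (0);
 *	}
 */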
4458
4459