/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 174712 2007-12-17 10:02:29Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
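
/*
 * Illustrative sketch (not code used by this file): a sender accounting for
 * a ULP packet's sequence-space consumption would add the compensation, e.g.
 *
 *	tx_len = m->m_pkthdr.len + t3_ulp_extra_len[ulp_submode & 3];
 *
 * where ulp_submode is a hypothetical variable holding the packet's ULP
 * submode.
 */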

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf chain depends on the number of
 * segments in the chain and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf chain into
 * the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

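/*
 * Wrapper around rtalloc1() that returns the route entry unlocked:
 * rtalloc1() hands back a locked rtentry, but callers here only need the
 * reference, so drop the lock before returning.
 */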
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
{
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(T3C_DEV(so), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(T3C_DEV(so), m);          // send directly
}

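/*
 * Compute the priority value for a CPL message.  The socket argument is
 * currently unused; presumably a placeholder for future per-socket
 * adjustments.
 */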
static inline unsigned int
mkprio(unsigned int cntrl, const struct socket *so)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Send buffer is in units of 32KB. */
		if (tcp_do_autosndbuf && (so->so_snd.sb_flags & SB_AUTOSIZE))
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

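/*
 * Transmit as much pending send-buffer data as the available WR credits
 * allow.  Data is sent as a gather list of mbuf segments wrapped in TX_DATA
 * work requests; the socket buffer's sndptr tracks how far we have
 * transmitted.  Returns the number of bytes handed to the HW.
 */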
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;

	segp = segs;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");
		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);

	SOCKBUF_LOCK(&so->so_snd);

	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/* bounds-check count + 1 before indexing mbuf_wrs[] */
		while ((tail != NULL) && (count + 1 < TX_MAX_SEGS) &&
		    (mbuf_wrs[count + 1] <= toep->tp_wr_avail)) {
			bytes += tail->m_len;
			count++;
			last = tail;
			/*
			 * technically an abuse to be using this for a VA
			 * but less gross than defining my own structure
			 * or calling pmap_kextract from here :-|
			 */
			segp->ds_addr = (bus_addr_t)tail->m_data;
			segp->ds_len = tail->m_len;
			DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
			    count, mbuf_wrs[count], tail->m_data, tail->m_len);

			segp++;
			tail = tail->m_next;
		}
		DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;

		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * XXX can drop socket buffer lock here
		 */

		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		make_tx_data_wr(so, m0, bytes, tail);
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
		m_set_sgl(m0, segs);
		m_set_sgllen(m0, count);
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);

		l2t_send(cdev, m0, toep->tp_l2t);
		if (toep->tp_wr_avail && (tail != NULL))
			SOCKBUF_LOCK(&so->so_snd);
	}

	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
	return (total_bytes);
}
385
386/*
387 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
388 * under any circumstances.  We take the easy way out and always queue the
389 * message to the write_queue.  We can optimize the case where the queue is
390 * already empty though the optimization is probably not worth it.
391 */
392static void
393close_conn(struct socket *so)
394{
395	struct mbuf *m;
396	struct cpl_close_con_req *req;
397	struct tom_data *d;
398	struct inpcb *inp = sotoinpcb(so);
399	struct tcpcb *tp;
400	struct toepcb *toep;
401	unsigned int tid;
402
403
404	INP_LOCK(inp);
405	tp = sototcpcb(so);
406	toep = tp->t_toe;
407
408	if (tp->t_state != TCPS_SYN_SENT)
409		t3_push_frames(so, 1);
410
411	if (toep->tp_flags & TP_FIN_SENT) {
412		INP_UNLOCK(inp);
413		return;
414	}
415
416	tid = toep->tp_tid;
417
418	d = TOM_DATA(toep->tp_toedev);
419
420	m = m_gethdr_nofail(sizeof(*req));
421
422	toep->tp_flags |= TP_FIN_SENT;
423	req = mtod(m, struct cpl_close_con_req *);
424
425	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
426	req->wr.wr_lo = htonl(V_WR_TID(tid));
427	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
428	req->rsvd = htonl(toep->tp_write_seq);
429	INP_UNLOCK(inp);
430	/*
431	 * XXX - need to defer shutdown while there is still data in the queue
432	 *
433	 */
434	cxgb_ofld_send(d->cdev, m);
435
436}
437
438/*
439 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
440 * and send it along.
441 */
442static void
443abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
444{
445	struct cpl_abort_req *req = cplhdr(m);
446
447	req->cmd = CPL_ABORT_NO_RST;
448	cxgb_ofld_send(cdev, m);
449}
450
451/*
452 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
453 * permitted to return without sending the message in case we cannot allocate
454 * an sk_buff.  Returns the number of credits sent.
455 */
456uint32_t
457t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
458{
459	struct mbuf *m;
460	struct cpl_rx_data_ack *req;
461	struct toepcb *toep = tp->t_toe;
462	struct toedev *tdev = toep->tp_toedev;
463
464	m = m_gethdr_nofail(sizeof(*req));
465
466	DPRINTF("returning %u credits to HW\n", credits);
467
468	req = mtod(m, struct cpl_rx_data_ack *);
469	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
470	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
471	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
472	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
473	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
474	return (credits);
475}
476
477
478/*
479 * Set of states for which we should return RX credits.
480 */
481#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
482
483/*
484 * Called after some received data has been read.  It returns RX credits
485 * to the HW for the amount of data processed.
486 */
487void
488t3_cleanup_rbuf(struct tcpcb *tp)
489{
490	struct toepcb *toep = tp->t_toe;
491	struct socket *so;
492	struct toedev *dev;
493	int dack_mode, must_send, read;
494	u32 thres, credits, dack = 0;
495
496	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
497		(tp->t_state == TCPS_FIN_WAIT_2)))
498		return;
499	INP_LOCK_ASSERT(tp->t_inpcb);
500
501	so = tp->t_inpcb->inp_socket;
502	SOCKBUF_LOCK(&so->so_rcv);
503	read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
504	toep->tp_copied_seq += read;
505	toep->tp_enqueued_bytes -= read;
506	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
507	SOCKBUF_UNLOCK(&so->so_rcv);
508
509	if (credits > so->so_rcv.sb_mbmax)
510	    printf("copied_seq=%u rcv_wup=%u credits=%u\n",
511		toep->tp_copied_seq, toep->tp_rcv_wup, credits);
512	/*
513	 * XXX this won't accurately reflect credit return - we need
514	 * to look at the difference between the amount that has been
515	 * put in the recv sockbuf and what is there now
516	 */
517
518	if (__predict_false(!credits))
519		return;
520
521	dev = toep->tp_toedev;
522	thres = TOM_TUNABLE(dev, rx_credit_thres);
523
524	if (__predict_false(thres == 0))
525		return;
526
527	if (toep->tp_ulp_mode)
528		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
529	else {
530		dack_mode = TOM_TUNABLE(dev, delack);
531		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
532			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
533
534			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
535				dack = F_RX_DACK_CHANGE |
536				       V_RX_DACK_MODE(dack_mode);
537		}
538	}
539
540	/*
541	 * For coalescing to work effectively ensure the receive window has
542	 * at least 16KB left.
543	 */
544	must_send = credits + 16384 >= tp->rcv_wnd;
545
546	if (must_send || credits >= thres)
547		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
548}
549
550static int
551cxgb_toe_disconnect(struct tcpcb *tp)
552{
553	struct socket *so;
554
555	DPRINTF("cxgb_toe_disconnect\n");
556
557	so = tp->t_inpcb->inp_socket;
558	close_conn(so);
559	return (0);
560}
561
562static int
563cxgb_toe_reset(struct tcpcb *tp)
564{
565	struct toepcb *toep = tp->t_toe;
566
567
568	t3_send_reset(toep);
569
570	/*
571	 * unhook from socket
572	 */
573	tp->t_flags &= ~TF_TOE;
574	toep->tp_tp = NULL;
575	tp->t_toe = NULL;
576	return (0);
577}
578
579static int
580cxgb_toe_send(struct tcpcb *tp)
581{
582	struct socket *so;
583
584	DPRINTF("cxgb_toe_send\n");
585	dump_toepcb(tp->t_toe);
586
587	so = tp->t_inpcb->inp_socket;
588	t3_push_frames(so, 1);
589	return (0);
590}
591
592static int
593cxgb_toe_rcvd(struct tcpcb *tp)
594{
595	INP_LOCK_ASSERT(tp->t_inpcb);
596	t3_cleanup_rbuf(tp);
597
598	return (0);
599}
600
601static void
602cxgb_toe_detach(struct tcpcb *tp)
603{
604	struct toepcb *toep;
605	/*
606	 * XXX how do we handle teardown in the SYN_SENT state?
607	 *
608	 */
609	INP_INFO_WLOCK(&tcbinfo);
610	toep = tp->t_toe;
611	toep->tp_tp = NULL;
612
613	/*
614	 * unhook from socket
615	 */
616	tp->t_flags &= ~TF_TOE;
617	tp->t_toe = NULL;
618	INP_INFO_WUNLOCK(&tcbinfo);
619}
620
621
622static struct toe_usrreqs cxgb_toe_usrreqs = {
623	.tu_disconnect = cxgb_toe_disconnect,
624	.tu_reset = cxgb_toe_reset,
625	.tu_send = cxgb_toe_send,
626	.tu_rcvd = cxgb_toe_rcvd,
627	.tu_detach = cxgb_toe_detach,
628	.tu_detach = cxgb_toe_detach,
629	.tu_syncache_event = handle_syncache_event,
630};
631
632
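/*
 * Build and send a CPL_SET_TCB_FIELD work request updating one word of a
 * connection's TCB: mask selects the bits to change within the word and
 * val supplies their new values.
 */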
static void
__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	send_or_defer(so, tp, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
		return;

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(so, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on)
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)M_TCB_RX_DDP_BUF1_LEN) << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef notyet
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return (0);
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

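/*
 * Register the connection's TID with the TOE device so incoming CPL
 * messages can be demultiplexed to this toepcb.  A reference is taken on
 * the toepcb on behalf of the TID table.
 */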
static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

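/*
 * Pick the MTU-table entry to use for a connection, given the path MTU.
 * The constant 40 below is the assumed fixed IP + TCP header overhead;
 * t_maxseg is clamped so that it corresponds to a table entry that does
 * not exceed the path MTU.
 */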
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

void
t3_release_ddp_resources(struct toepcb *toep)
{
	/*
	 * This is a no-op until we have DDP support
	 */
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	printf("setting toep->tp_tp to NULL\n");

	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

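/*
 * Switch a socket over to the offload path: mark the tcpcb as offloaded
 * (TF_TOE) and point its usrreqs at the TOE implementations above.
 */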
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned int
select_rcv_wnd(struct socket *so)
{
	struct toedev *dev = TOE_DEV(so);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = sbspace(&so->so_rcv);

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(so);
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	DPRINTF("connect smt_idx=%d\n", e->smt_idx);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

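/*
 * Clean up after a failed active open: release all offload resources and
 * drop the connection with the supplied errno.
 */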
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		cxgb_tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif

/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */
	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	printf("sending off request\n");

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

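/*
 * IP-level socket-option handler for offloaded connections.  Only IP_TOS
 * is handled here; the new TOS value is pushed to the TCB so the HW uses
 * it on transmitted segments.
 */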
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef notyet
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		INP_LOCK(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(so);
	}

	return (0);
}

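/*
 * Socket-option entry point for offloaded connections.  Options we
 * intercept are applied to the TCB; everything else falls through to the
 * stack's tcp_ctloutput().
 */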
static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	int len = be16toh(hdr->len);

	INP_LOCK(tp->t_inpcb);

#ifdef notyet
	if (__predict_false(sk_no_receive(sk))) {
		handle_excess_rx(so, skb);
		return;
	}

	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
		handle_ddp_data(so, skb);

	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
	TCP_SKB_CB(skb)->flags = 0;
	skb_ulp_mode(skb) = 0;                    /* for iSCSI */
#endif
#if VALIDATE_SEQ
	if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
		printk(KERN_ERR
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		       TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
		       tp->rcv_nxt);
		__kfree_skb(skb);
		return;
	}
#endif
	m_adj(m, sizeof(*hdr));

#ifdef notyet
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
	T3_TRACE2(TIDTB(sk),
		  "new_rx_data: seq 0x%x len %u",
		  TCP_SKB_CB(skb)->seq, skb->len);
#endif
	SOCKBUF_LOCK(&so->so_rcv);
	if (sb_notify(&so->so_rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);

	sbappend_locked(&so->so_rcv, m);
	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,
	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));

	INP_UNLOCK(tp->t_inpcb);
	DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);

	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

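/*
 * Process new data received on a DDP connection: account for the
 * directly-placed bytes in sequence space and append the (payload-less)
 * mbuf, whose length and csum fields are overloaded to carry DDP state,
 * to the receive buffer.
 */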
static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store old rcv_next
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in 'when' (see the notyet block below).  Assumes the buffer offset
	 * starts at 0, consumer needs to account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(so, m);
	return (0);
}

static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	when = bsp->cur_offset;
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));
#endif
#if 0
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);

	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);

#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}

#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
		socantrcvmore(so);
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);

		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	do_peer_fin(so, m);
	return (0);
}

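/*
 * Process a CLOSE_CON_RPL message: the HW has completed our close request,
 * so advance the connection through the CLOSING / LAST_ACK / FIN_WAIT_1
 * transitions accordingly.
 */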
1910static void
1911process_close_con_rpl(struct socket *so, struct mbuf *m)
1912{
1913	struct tcpcb *tp = sototcpcb(so);
1914	struct cpl_close_con_rpl *rpl = cplhdr(m);
1915	struct toepcb *toep = tp->t_toe;
1916
1917	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
1918
1919	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
1920	    !!(so->so_state & SS_NOFDREF));
1921	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
1922		goto out;
1923
1924	INP_INFO_WLOCK(&tcbinfo);
1925	INP_LOCK(tp->t_inpcb);
1926	switch (tp->t_state) {
1927	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
1928		t3_release_offload_resources(toep);
1929		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
1930			tp = tcp_close(tp);
1931
1932		} else
1933			enter_timewait(so);
1934		break;
1935	case TCPS_LAST_ACK:
1936		/*
1937		 * In this state we don't care about pending abort_rpl.
1938		 * If we've sent abort_req it was post-close and was sent too
1939		 * late, this close_con_rpl is the actual last message.
1940		 */
1941		t3_release_offload_resources(toep);
1942		tp = tcp_close(tp);
1943		break;
1944	case TCPS_FIN_WAIT_1:
1945#ifdef notyet
1946		dst_confirm(sk->sk_dst_cache);
1947#endif
1948		soisdisconnecting(so);
1949
1950		if ((so->so_state & SS_NOFDREF) == 0) {
1951			/*
1952			 * Wake up lingering close
1953			 */
1954			sowwakeup(so);
1955			sorwakeup(so);
1956			wakeup(&so->so_timeo);
1957		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
1958		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
1959			tp = cxgb_tcp_drop(tp, 0);
1960		}
1961
1962		break;
1963	default:
1964		log(LOG_ERR,
1965		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
1966		       TOE_DEV(so)->tod_name, toep->tp_tid,
1967		       tp->t_state);
1968	}
1969	INP_INFO_WUNLOCK(&tcbinfo);
1970	if (tp)
1971		INP_UNLOCK(tp->t_inpcb);
1972out:
1973	m_free(m);
1974}
1975
1976/*
1977 * Handler for CLOSE_CON_RPL CPL messages.
1978 */
1979static int
1980do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
1981			    void *ctx)
1982{
1983	struct toepcb *toep = (struct toepcb *)ctx;
1984	struct socket *so = toeptoso(toep);
1985
1986	VALIDATE_SOCK(so);
1987
1988	process_close_con_rpl(so, m);
1989	return (0);
1990}
1991
1992/*
1993 * Process abort replies.  We only process these messages if we anticipate
1994 * them as the coordination between SW and HW in this area is somewhat lacking
1995 * and sometimes we get ABORT_RPLs after we are done with the connection that
1996 * originated the ABORT_REQ.
1997 */
1998static void
1999process_abort_rpl(struct socket *so, struct mbuf *m)
2000{
2001	struct tcpcb *tp = sototcpcb(so);
2002	struct toepcb *toep = tp->t_toe;
2003
2004#ifdef T3_TRACE
2005	T3_TRACE1(TIDTB(sk),
2006		  "process_abort_rpl: GTS rpl pending %d",
2007		  sock_flag(sk, ABORT_RPL_PENDING));
2008#endif
2009	INP_LOCK(tp->t_inpcb);
2010
2011	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2012		/*
2013		 * XXX panic on tcpdrop
2014		 */
2015		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2016			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2017		else {
2018			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2019			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2020			    !is_t3a(TOE_DEV(so))) {
2021				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2022					panic("TP_ABORT_REQ_RCVD set");
2023				INP_INFO_WLOCK(&tcbinfo);
2024				INP_LOCK(tp->t_inpcb);
2025				t3_release_offload_resources(toep);
2026				tp = tcp_close(tp);
2027				INP_INFO_WUNLOCK(&tcbinfo);
2028			}
2029		}
2030	}
2031	if (tp)
2032		INP_UNLOCK(tp->t_inpcb);
2033
2034	m_free(m);
2035}
2036
2037/*
2038 * Handle an ABORT_RPL_RSS CPL message.
2039 */
2040static int
2041do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2042{
2043	struct socket *so;
2044	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2045	struct toepcb *toep;
2046
2047	/*
2048	 * Ignore replies to post-close aborts indicating that the abort was
2049	 * requested too late.  These connections are terminated when we get
2050	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2051	 * arrives the TID is either no longer used or it has been recycled.
2052	 */
2053	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2054discard:
2055		m_free(m);
2056		return (0);
2057	}
2058
2059	toep = (struct toepcb *)ctx;
2060
2061        /*
2062	 * Sometimes we've already closed the socket, e.g., a post-close
2063	 * abort races with ABORT_REQ_RSS, the latter frees the socket
2064	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2065	 * but FW turns the ABORT_REQ into a regular one and so we get
2066	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2067	 */
2068	if (!toep)
2069		goto discard;
2070
2071	if (toep->tp_tp == NULL) {
2072		printf("removing tid for abort\n");
2073		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2074		if (toep->tp_l2t)
2075			l2t_release(L2DATA(cdev), toep->tp_l2t);
2076
2077		toepcb_release(toep);
2078		goto discard;
2079	}
2080
2081	printf("toep=%p\n", toep);
2082	printf("tp=%p\n", toep->tp_tp);
2083
2084	so = toeptoso(toep); /* <- XXX panic */
2085	toepcb_hold(toep);
2086	process_abort_rpl(so, m);
2087	toepcb_release(toep);
2088	return (0);
2089}
2090
/*
 * Convert the status code of an ABORT_REQ into an errno value.  The
 * need_rst out-parameter is meant to indicate whether an RST should be
 * sent in response; XXX nothing writes it yet, so callers rely on the
 * CPL_ABORT_NO_RST default they pass in.
 */
2095static int
2096abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2097{
2098	struct tcpcb *tp = sototcpcb(so);
2099
2100	switch (abort_reason) {
2101	case CPL_ERR_BAD_SYN:
2102#if 0
2103		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2104#endif
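		/* FALLTHROUGH */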
2105	case CPL_ERR_CONN_RESET:
2106		// XXX need to handle SYN_RECV due to crossed SYNs
2107		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2108	case CPL_ERR_XMIT_TIMEDOUT:
2109	case CPL_ERR_PERSIST_TIMEDOUT:
2110	case CPL_ERR_FINWAIT2_TIMEDOUT:
2111	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2112#if 0
2113		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2114#endif
2115		return (ETIMEDOUT);
2116	default:
2117		return (EIO);
2118	}
2119}
2120
2121static inline void
2122set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2123{
2124	struct cpl_abort_rpl *rpl = cplhdr(m);
2125
2126	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2127	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2128	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2129
2130	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2131	rpl->cmd = cmd;
2132}
2133
2134static void
2135send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2136{
2137	struct mbuf *reply_mbuf;
2138	struct cpl_abort_req_rss *req = cplhdr(m);
2139
	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2143	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2144	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2145	m_free(m);
2146}
2147
2148/*
2149 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2150 */
2151static inline int
2152is_neg_adv_abort(unsigned int status)
2153{
	return (status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2156}
2157
2158static void
2159send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2160{
2161	struct mbuf  *reply_mbuf;
2162	struct cpl_abort_req_rss *req = cplhdr(m);
2163
2164	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2165
	if (!reply_mbuf) {
		/* Defer the reply.  Stick rst_status into req->status. */
2168		req->status = rst_status;
2169		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2170		return;
2171	}
2172
2173	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2174	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2175	m_free(m);
2176
2177	/*
2178	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2179	 * these messages while ARP is pending.  For other connection states
2180	 * it's not a problem.
2181	 */
2182	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2183}
2184
2185#ifdef notyet
2186static void
2187cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2188{
2189	UNIMPLEMENTED();
2190#ifdef notyet
2191	struct request_sock *req = child->sk_user_data;
2192
2193	inet_csk_reqsk_queue_removed(parent, req);
2194	synq_remove(tcp_sk(child));
2195	__reqsk_free(req);
2196	child->sk_user_data = NULL;
2197#endif
2198}
2199
2200
2201/*
2202 * Performs the actual work to abort a SYN_RECV connection.
2203 */
2204static void
2205do_abort_syn_rcv(struct socket *child, struct socket *parent)
2206{
2207	struct tcpcb *parenttp = sototcpcb(parent);
2208	struct tcpcb *childtp = sototcpcb(child);
2209
2210	/*
2211	 * If the server is still open we clean up the child connection,
2212	 * otherwise the server already did the clean up as it was purging
2213	 * its SYN queue and the skb was just sitting in its backlog.
2214	 */
2215	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2216		cleanup_syn_rcv_conn(child, parent);
2217		INP_INFO_WLOCK(&tcbinfo);
2218		INP_LOCK(childtp->t_inpcb);
2219		t3_release_offload_resources(childtp->t_toe);
2220		childtp = tcp_close(childtp);
2221		INP_INFO_WUNLOCK(&tcbinfo);
2222		if (childtp)
2223			INP_UNLOCK(childtp->t_inpcb);
2224	}
2225}
2226#endif
2227
2228/*
2229 * Handle abort requests for a SYN_RECV connection.  These need extra work
2230 * because the socket is on its parent's SYN queue.
2231 */
2232static int
2233abort_syn_rcv(struct socket *so, struct mbuf *m)
2234{
2235	UNIMPLEMENTED();
2236#ifdef notyet
2237	struct socket *parent;
2238	struct toedev *tdev = TOE_DEV(so);
2239	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2240	struct socket *oreq = so->so_incomp;
2241	struct t3c_tid_entry *t3c_stid;
2242	struct tid_info *t;
2243
2244	if (!oreq)
2245		return -1;        /* somehow we are not on the SYN queue */
2246
2247	t = &(T3C_DATA(cdev))->tid_maps;
2248	t3c_stid = lookup_stid(t, oreq->ts_recent);
2249	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2250
2251	SOCK_LOCK(parent);
2252	do_abort_syn_rcv(so, parent);
2253	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2254	SOCK_UNLOCK(parent);
2255#endif
2256	return (0);
2257}
2258
2259/*
2260 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2261 * request except that we need to reply to it.
2262 */
2263static void
2264process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2265{
2266	int rst_status = CPL_ABORT_NO_RST;
2267	const struct cpl_abort_req_rss *req = cplhdr(m);
2268	struct tcpcb *tp = sototcpcb(so);
2269	struct toepcb *toep = tp->t_toe;
2270
2271	INP_LOCK(tp->t_inpcb);
2272	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2273		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2274		m_free(m);
2275		goto skip;
2276	}
2277
2278	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2279	/*
2280	 * Three cases to consider:
2281	 * a) We haven't sent an abort_req; close the connection.
2282	 * b) We have sent a post-close abort_req that will get to TP too late
2283	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2284	 *    be ignored and the connection should be closed now.
2285	 * c) We have sent a regular abort_req that will get to TP too late.
2286	 *    That will generate an abort_rpl with status 0, wait for it.
2287	 */
2288	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2289	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2290		so->so_error = abort_status_to_errno(so, req->status,
2291		    &rst_status);
2292#if 0
2293		if (!sock_flag(sk, SOCK_DEAD))
2294			sk->sk_error_report(sk);
2295#endif
2296		/*
2297		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
2299		 */
2300		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2301			goto skip;
2302
2303		t3_release_offload_resources(toep);
2304		tp = tcp_close(tp);
2305	}
2306	if (tp)
2307		INP_UNLOCK(tp->t_inpcb);
2308	send_abort_rpl(m, tdev, rst_status);
2309	return;
2310
2311skip:
2312	INP_UNLOCK(tp->t_inpcb);
2313}
2314
2315/*
2316 * Handle an ABORT_REQ_RSS CPL message.
2317 */
2318static int
2319do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2320{
2321	const struct cpl_abort_req_rss *req = cplhdr(m);
2322	struct toepcb *toep = (struct toepcb *)ctx;
2323	struct socket *so;
2324	struct inpcb *inp;
2325
2326	if (is_neg_adv_abort(req->status)) {
2327		m_free(m);
2328		return (0);
2329	}
2330
2331	printf("aborting tid=%d\n", toep->tp_tid);
2332
2333	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2334		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2335		toep->tp_flags |= TP_ABORT_REQ_RCVD;
2336		printf("sending abort rpl\n");
2337
2338		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2339		printf("sent\n");
2340		if (toep->tp_l2t)
2341			l2t_release(L2DATA(cdev), toep->tp_l2t);
2342
2343		/*
2344		 *  Unhook
2345		 */
2346		toep->tp_tp->t_toe = NULL;
2347		toep->tp_tp->t_flags &= ~TF_TOE;
2348		toep->tp_tp = NULL;
2349		/*
2350		 * XXX need to call syncache_chkrst - but we don't
2351		 * have a way of doing that yet
2352		 */
2353		toepcb_release(toep);
2354		printf("abort for unestablished connection :-(\n");
2355		return (0);
2356	}
2357	if (toep->tp_tp == NULL) {
2358		printf("disconnected toepcb\n");
2359		/* should be freed momentarily */
2360		return (0);
2361	}
2362
2363	so = toeptoso(toep);
2364	inp = sotoinpcb(so);
2365
2366	VALIDATE_SOCK(so);
2367	toepcb_hold(toep);
2368	INP_INFO_WLOCK(&tcbinfo);
2369	process_abort_req(so, m, TOE_DEV(so));
2370	INP_INFO_WUNLOCK(&tcbinfo);
2371	toepcb_release(toep);
2372	return (0);
2373}
2374#ifdef notyet
2375static void
2376pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2377{
2378	struct toedev *tdev = TOE_DEV(parent);
2379
2380	do_abort_syn_rcv(child, parent);
2381	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2382		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2383
2384		rpl->opt0h = htonl(F_TCAM_BYPASS);
2385		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2386		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2387	} else
2388		m_free(m);
2389}
2390#endif
2391static void
2392handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
2393{
2394	UNIMPLEMENTED();
2395
2396#ifdef notyet
2397	struct t3cdev *cdev;
2398	struct socket *parent;
2399	struct socket *oreq;
2400	struct t3c_tid_entry *t3c_stid;
2401	struct tid_info *t;
2402	struct tcpcb *otp, *tp = sototcpcb(so);
2403	struct toepcb *toep = tp->t_toe;
2404
2405	/*
2406	 * If the connection is being aborted due to the parent listening
2407	 * socket going away there's nothing to do, the ABORT_REQ will close
2408	 * the connection.
2409	 */
2410	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2411		m_free(m);
2412		return;
2413	}
2414
2415	oreq = so->so_incomp;
2416	otp = sototcpcb(oreq);
2417
2418	cdev = T3C_DEV(so);
2419	t = &(T3C_DATA(cdev))->tid_maps;
2420	t3c_stid = lookup_stid(t, otp->ts_recent);
2421	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2422
2423	SOCK_LOCK(parent);
2424	pass_open_abort(so, parent, m);
2425	SOCK_UNLOCK(parent);
2426#endif
2427}
2428
2429/*
2430 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
2431 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
2432 * connection.
2433 */
2434static void
2435pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
2436{
2437
2438#ifdef notyet
2439	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2440	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
2441#endif
2442	handle_pass_open_arp_failure(m_get_socket(m), m);
2443}
2444
2445/*
2446 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
2447 */
2448static void
2449mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
2450{
2451	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
2452	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
2453	unsigned int tid = GET_TID(req);
2454
2455	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
2456	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
2457	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
2458	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
2459	rpl->opt0h = htonl(F_TCAM_BYPASS);
2460	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2461	rpl->opt2 = 0;
2462	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
2463}
2464
2465/*
2466 * Send a deferred reject to an accept request.
2467 */
2468static void
2469reject_pass_request(struct toedev *tdev, struct mbuf *m)
2470{
2471	struct mbuf *reply_mbuf;
2472
2473	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
2474	mk_pass_accept_rpl(reply_mbuf, m);
2475	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2476	m_free(m);
2477}
2478
2479static void
2480handle_syncache_event(int event, void *arg)
2481{
2482	struct toepcb *toep = arg;
2483
2484	switch (event) {
2485	case TOE_SC_ENTRY_PRESENT:
2486		/*
2487		 * entry already exists - free toepcb
2488		 * and l2t
2489		 */
2490		printf("syncache entry present\n");
2491		toepcb_release(toep);
2492		break;
2493	case TOE_SC_DROP:
2494		/*
2495		 * The syncache has given up on this entry
2496		 * either it timed out, or it was evicted
2497		 * we need to explicitly release the tid
2498		 */
2499		printf("syncache entry dropped\n");
2500		toepcb_release(toep);
2501		break;
2502	default:
2503		log(LOG_ERR, "unknown syncache event %d\n", event);
2504		break;
2505	}
2506}
2507
2508static void
2509syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
2510{
2511	struct in_conninfo inc;
2512	struct tcpopt to;
2513	struct tcphdr th;
2514	struct inpcb *inp;
2515	int mss, wsf, sack, ts;
2516
	bzero(&to, sizeof(struct tcpopt));
	bzero(&th, sizeof(struct tcphdr));
	bzero(&inc, sizeof(struct in_conninfo));
	inp = sotoinpcb(lso);
2519
2520	/*
2521	 * Fill out information for entering us into the syncache
2522	 */
2523	inc.inc_fport = th.th_sport = req->peer_port;
2524	inc.inc_lport = th.th_dport = req->local_port;
2525	toep->tp_iss = th.th_seq = req->rcv_isn;
2526	th.th_flags = TH_SYN;
2527
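	/* Start every receive-side sequence tracker at the peer's ISN. */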
2528	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
2529
2530	inc.inc_isipv6 = 0;
2531	inc.inc_len = 0;
2532	inc.inc_faddr.s_addr = req->peer_ip;
2533	inc.inc_laddr.s_addr = req->local_ip;
2534
2535	DPRINTF("syncache add of %d:%d %d:%d\n",
2536	    ntohl(req->local_ip), ntohs(req->local_port),
2537	    ntohl(req->peer_ip), ntohs(req->peer_port));
2538
2539	mss = req->tcp_options.mss;
2540	wsf = req->tcp_options.wsf;
2541	ts = req->tcp_options.tstamp;
2542	sack = req->tcp_options.sack;
2543	to.to_mss = mss;
2544	to.to_wscale = wsf;
2545	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
2546
2547	INP_INFO_WLOCK(&tcbinfo);
2548	INP_LOCK(inp);
2549	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
2550}
2551
2552
2553/*
2554 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
2555 * lock held.  Note that the sock here is a listening socket that is not owned
2556 * by the TOE.
2557 */
2558static void
2559process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
2560    struct listen_ctx *lctx)
2561{
2562	int rt_flags;
2563	struct l2t_entry *e;
2564	struct iff_mac tim;
2565	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
2566	struct cpl_pass_accept_rpl *rpl;
2567	struct cpl_pass_accept_req *req = cplhdr(m);
2568	unsigned int tid = GET_TID(req);
2569	struct tom_data *d = TOM_DATA(tdev);
2570	struct t3cdev *cdev = d->cdev;
2571	struct tcpcb *tp = sototcpcb(so);
2572	struct toepcb *newtoep;
2573	struct rtentry *dst;
2574	struct sockaddr_in nam;
2575	struct t3c_data *td = T3C_DATA(cdev);
2576
2577	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2578	if (__predict_false(reply_mbuf == NULL)) {
2579		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
2580			t3_defer_reply(m, tdev, reject_pass_request);
2581		else {
2582			cxgb_queue_tid_release(cdev, tid);
2583			m_free(m);
2584		}
2585		DPRINTF("failed to get reply_mbuf\n");
2586
2587		goto out;
2588	}
2589
2590	if (tp->t_state != TCPS_LISTEN) {
2591		DPRINTF("socket not in listen state\n");
2592
2593		goto reject;
2594	}
2595
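	/* Map the frame's destination MAC/VLAN to the ingress interface. */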
2596	tim.mac_addr = req->dst_mac;
2597	tim.vlan_tag = ntohs(req->vlan_tag);
2598	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
2599		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
2600		goto reject;
2601	}
2602
2603#ifdef notyet
2604	/*
2605	 * XXX do route lookup to confirm that we're still listening on this
2606	 * address
2607	 */
2608	if (ip_route_input(skb, req->local_ip, req->peer_ip,
2609			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
2610		goto reject;
2611	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
2612		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
2613	dst_release(skb->dst);	// done with the input route, release it
2614	skb->dst = NULL;
2615
2616	if ((rt_flags & RTF_LOCAL) == 0)
2617		goto reject;
2618#endif
	/*
	 * XXX route lookup is not yet implemented on the offload path;
	 * assume the destination is local until the notyet block above
	 * is ported.
	 */
	rt_flags = RTF_LOCAL;
	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;
2625
2626	/*
2627	 * Calculate values and add to syncache
2628	 */
2629
2630	newtoep = toepcb_alloc();
2631	if (newtoep == NULL)
2632		goto reject;
2633
2634	bzero(&nam, sizeof(struct sockaddr_in));
2635
2636	nam.sin_len = sizeof(struct sockaddr_in);
2637	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr = req->peer_ip;
2639	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
2640
2641	if (dst == NULL) {
2642		printf("failed to find route\n");
2643		goto reject;
2644	}
2645	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
2646	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
		/* XXX newtoep is leaked on this path */
		goto reject;
	}
2650	/*
2651	 * Point to our listen socket until accept
2652	 */
2653	newtoep->tp_tp = tp;
2654	newtoep->tp_flags = TP_SYN_RCVD;
2655	newtoep->tp_tid = tid;
2656	newtoep->tp_toedev = tdev;
2657
2658	printf("inserting tid=%d\n", tid);
2659	cxgb_insert_tid(cdev, d->client, newtoep, tid);
2660	SOCK_LOCK(so);
2661	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
	SOCK_UNLOCK(so);

2665	if (lctx->ulp_mode) {
2666		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2667
2668		if (!ddp_mbuf)
2669			newtoep->tp_ulp_mode = 0;
2670		else
2671			newtoep->tp_ulp_mode = lctx->ulp_mode;
2672	}
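
	/*
	 * XXX when ulp_mode is set, ddp_mbuf is consumed only by the
	 * "notyet" set_tcb_field path below and is otherwise leaked.
	 */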
2673
2674	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
2675
2676	DPRINTF("adding request to syn cache\n");
2677
2678	/*
2679	 * XXX workaround for lack of syncache drop
2680	 */
2681	toepcb_hold(newtoep);
	syncache_add_accept_req(req, so, newtoep);

2686	rpl = cplhdr(reply_mbuf);
2687	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
2688	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
2689	rpl->wr.wr_lo = 0;
2690	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
2691	rpl->opt2 = htonl(calc_opt2(so, tdev));
2692	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
2693	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
2694
2695	DPRINTF("accept smt_idx=%d\n", e->smt_idx);
2696
2697	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
2698	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
2699	rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
2700				  CPL_PASS_OPEN_ACCEPT);
2701
2702	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
2703
2704	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
2705
2706#ifdef DEBUG_PRINT
	{
		uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
		int i;

		DPRINTF("rpl:\n");
2712
2713		for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
2714			DPRINTF("[%d] %08x\n", i, rplbuf[i]);
2715	}
#endif

2719	l2t_send(cdev, reply_mbuf, e);
2720	m_free(m);
2721#ifdef notyet
2722	/*
2723	 * XXX this call path has to be converted to not depend on sockets
2724	 */
2725	if (newtoep->tp_ulp_mode)
2726		__set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
2727				V_TF_DDP_OFF(1) |
2728				TP_DDP_TIMER_WORKAROUND_MASK,
2729				V_TF_DDP_OFF(1) |
2730				TP_DDP_TIMER_WORKAROUND_VAL, 1);
2731
2732#endif
2733	return;
2734reject:
2735	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
2736		mk_pass_accept_rpl(reply_mbuf, m);
2737	else
2738		mk_tid_release(reply_mbuf, NULL, tid);
2739	cxgb_ofld_send(cdev, reply_mbuf);
2740	m_free(m);
2741out:
2742#if 0
2743	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2744#else
2745	return;
2746#endif
2747}
2748
2749/*
2750 * Handle a CPL_PASS_ACCEPT_REQ message.
2751 */
2752static int
2753do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2754{
2755	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
2756	struct socket *lso = listen_ctx->lso;
2757	struct tom_data *d = listen_ctx->tom_data;
2758
2759#if VALIDATE_TID
2760	struct cpl_pass_accept_req *req = cplhdr(m);
2761	unsigned int tid = GET_TID(req);
2762	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
2763
	if (__predict_false(lso == NULL)) {
		log(LOG_ERR, "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return (CPL_RET_BUF_DONE);
	}
	if (__predict_false(tid >= t->ntids)) {
		log(LOG_ERR, "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return (CPL_RET_BUF_DONE);
	}
2776	/*
2777	 * For T3A the current user of the TID may have closed but its last
2778	 * message(s) may have been backlogged so the TID appears to be still
2779	 * in use.  Just take the TID away, the connection can close at its
2780	 * own leisure.  For T3B this situation is a bug.
2781	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		log(LOG_ERR, "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return (CPL_RET_BUF_DONE);
	}
2788#endif
2789
2790	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
2791	return (0);
2792}
2793
/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to the host stack's native format.
 */
2798static void
2799assign_rxopt(struct socket *so, unsigned int opt)
2800{
2801	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
2802	struct tcpcb *tp = sototcpcb(so);
2803	struct toepcb *toep = tp->t_toe;
2804
2805	INP_LOCK_ASSERT(tp->t_inpcb);
2806
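	/* The MSS excludes 40 bytes of IPv4 + TCP headers. */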
2807	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
2808	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
2809	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
2810	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
2811	if (tp->t_flags & TF_RCVD_SCALE)
2812		tp->rcv_scale = 0;
2813}
2814
2815/*
2816 * Completes some final bits of initialization for just established connections
2817 * and changes their state to TCP_ESTABLISHED.
2818 *
2819 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
2820 */
2821static void
2822make_established(struct socket *so, u32 snd_isn, unsigned int opt)
2823{
2824	struct tcpcb *tp = sototcpcb(so);
2825	struct toepcb *toep = tp->t_toe;
2826
2827	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
2828	assign_rxopt(so, opt);
2829	so->so_proto->pr_ctloutput = t3_ctloutput;
2830
2831#if 0
2832	inet_sk(sk)->id = tp->write_seq ^ jiffies;
2833#endif
2834
2835
2836	/*
2837	 * XXX not clear what rcv_wup maps to
2838	 */
2839	/*
2840	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
2841	 * pass through opt0.
2842	 */
2843	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
2844		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
2845
2846	dump_toepcb(toep);
2847
2848#ifdef notyet
2849/*
2850 * no clean interface for marking ARP up to date
2851 */
2852	dst_confirm(sk->sk_dst_cache);
2853#endif
2854	tp->t_state = TCPS_ESTABLISHED;
2855}
2856
2857static int
2858syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
2859{
2860
2861	struct in_conninfo inc;
2862	struct tcpopt to;
2863	struct tcphdr th;
2864	int mss, wsf, sack, ts;
2865	struct mbuf *m = NULL;
2866	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
2867	unsigned int opt;
2868
2869#ifdef MAC
2870#error	"no MAC support"
2871#endif
2872
2873	opt = ntohs(req->tcp_opt);
2874
2875	bzero(&to, sizeof(struct tcpopt));
2876
2877	/*
2878	 * Fill out information for entering us into the syncache
2879	 */
2880	inc.inc_fport = th.th_sport = req->peer_port;
2881	inc.inc_lport = th.th_dport = req->local_port;
2882	th.th_seq = req->rcv_isn;
2883	th.th_flags = TH_ACK;
2884
2885	inc.inc_isipv6 = 0;
2886	inc.inc_len = 0;
2887	inc.inc_faddr.s_addr = req->peer_ip;
2888	inc.inc_laddr.s_addr = req->local_ip;
2889
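	/* As in assign_rxopt(), MSS excludes 40 bytes of IPv4 + TCP headers. */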
2890	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
2891	wsf  = G_TCPOPT_WSCALE_OK(opt);
2892	ts   = G_TCPOPT_TSTAMP(opt);
2893	sack = G_TCPOPT_SACK(opt);
2894
2895	to.to_mss = mss;
2896	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
2897	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
2898
2899	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
2900	    ntohl(req->local_ip), ntohs(req->local_port),
2901	    ntohl(req->peer_ip), ntohs(req->peer_port),
2902	    mss, wsf, ts, sack);
	return (syncache_expand(&inc, &to, &th, so, m));
2904}
2905
2906
2907/*
2908 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
2909 * if we are in TCP_SYN_RECV due to crossed SYNs
2910 */
2911static int
2912do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2913{
2914	struct cpl_pass_establish *req = cplhdr(m);
2915	struct toepcb *toep = (struct toepcb *)ctx;
2916	struct tcpcb *tp;
2917	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct toedev *tdev;

	/*
	 * Complete the socket initialization now that we have the SND_ISN.
	 */
	so = lso = toeptoso(toep);
2924	tdev = toep->tp_toedev;
2925
2926	SOCK_LOCK(so);
2927	LIST_REMOVE(toep, synq_entry);
2928	SOCK_UNLOCK(so);
2929
2930	INP_INFO_WLOCK(&tcbinfo);
2931	if (!syncache_expand_establish_req(req, &so, toep)) {
2932		/*
2933		 * No entry
2934		 */
2935		UNIMPLEMENTED();
2936	}
2937	if (so == NULL) {
2938		/*
2939		 * Couldn't create the socket
2940		 */
2941		UNIMPLEMENTED();
2942	}
2943
2944	/*
2945	 * XXX workaround for lack of syncache drop
2946	 */
2947	toepcb_release(toep);
2948
2949	tp = sototcpcb(so);
2950	INP_LOCK(tp->t_inpcb);
2951#ifdef notyet
2952	so->so_snd.sb_flags |= SB_TOE;
2953	so->so_rcv.sb_flags |= SB_TOE;
2954#endif
2955	toep->tp_tp = tp;
2956	toep->tp_flags = 0;
2957	tp->t_toe = toep;
2958	reset_wr_list(toep);
2959	tp->rcv_wnd = select_rcv_wnd(so);
	DPRINTF("rcv_wnd=%lu\n", tp->rcv_wnd);
2961	install_offload_ops(so);
2962
2963	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
2964	toep->tp_wr_unacked = 0;
2965	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
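	/*
	 * Use DDP only when the tunable enables it, the socket allows it,
	 * and the receive window is large enough to benefit.
	 */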
	toep->tp_ulp_mode = (TOM_TUNABLE(tdev, ddp) &&
	    !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN) ? ULP_MODE_TCPDDP : 0;
2968	toep->tp_qset_idx = 0;
2969	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
2970
2971	/*
2972	 * XXX Cancel any keep alive timer
2973	 */
2974
2975	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
2976	INP_INFO_WUNLOCK(&tcbinfo);
2977	INP_UNLOCK(tp->t_inpcb);
2978	soisconnected(so);
2979
2980#ifdef notyet
2981	/*
2982	 * XXX not sure how these checks map to us
2983	 */
2984	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
2985		sk->sk_state_change(sk);
2986		sk_wake_async(so, 0, POLL_OUT);
2987	}
2988	/*
2989	 * The state for the new connection is now up to date.
2990	 * Next check if we should add the connection to the parent's
2991	 * accept queue.  When the parent closes it resets connections
2992	 * on its SYN queue, so check if we are being reset.  If so we
2993	 * don't need to do anything more, the coming ABORT_RPL will
2994	 * destroy this socket.  Otherwise move the connection to the
2995	 * accept queue.
2996	 *
2997	 * Note that we reset the synq before closing the server so if
2998	 * we are not being reset the stid is still open.
2999	 */
3000	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3001		__kfree_skb(skb);
3002		goto unlock;
3003	}
3004#endif
3005	m_free(m);
3006
3007	return (0);
3008}
3009
3010/*
3011 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3012 * and send them to the TOE.
3013 */
3014static void
3015fixup_and_send_ofo(struct socket *so)
3016{
3017	struct mbuf *m;
3018	struct toedev *tdev = TOE_DEV(so);
3019	struct tcpcb *tp = sototcpcb(so);
3020	struct toepcb *toep = tp->t_toe;
3021	unsigned int tid = toep->tp_tid;
3022
3023	printf("fixup_and_send_ofo\n");
3024
3025	INP_LOCK_ASSERT(tp->t_inpcb);
3026	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3027		/*
3028		 * A variety of messages can be waiting but the fields we'll
3029		 * be touching are common to all so any message type will do.
3030		 */
3031		struct cpl_close_con_req *p = cplhdr(m);
3032
3033		p->wr.wr_lo = htonl(V_WR_TID(tid));
3034		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3035		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3036	}
3037}
3038
3039/*
3040 * Updates socket state from an active establish CPL message.  Runs with the
3041 * socket lock held.
3042 */
3043static void
3044socket_act_establish(struct socket *so, struct mbuf *m)
3045{
3046	struct cpl_act_establish *req = cplhdr(m);
3047	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3048	struct tcpcb *tp = sototcpcb(so);
3049	struct toepcb *toep = tp->t_toe;
3050
3051	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3052		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3053		    toep->tp_tid, tp->t_state);
3054
3055	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_nxt = rcv_isn;
3057	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3058
3059	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3060
3061	/*
3062	 * Now that we finally have a TID send any CPL messages that we had to
3063	 * defer for lack of a TID.
3064	 */
3065	if (mbufq_len(&toep->out_of_order_queue))
3066		fixup_and_send_ofo(so);
3067
3068	if (__predict_false(so->so_state & SS_NOFDREF)) {
3069#ifdef notyet
3070		/*
3071		 * XXX 	not clear what should be done here
3072		 * appears to correspond to sorwakeup_locked
3073		 */
3074		sk->sk_state_change(sk);
3075		sk_wake_async(so, 0, POLL_OUT);
3076#endif
3077	}
3078	m_free(m);
3079#ifdef notyet
3080/*
3081 * XXX assume no write requests permitted while socket connection is
3082 * incomplete
3083 */
3084	/*
3085	 * Currently the send queue must be empty at this point because the
3086	 * socket layer does not send anything before a connection is
3087	 * established.  To be future proof though we handle the possibility
3088	 * that there are pending buffers to send (either TX_DATA or
3089	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3090	 * buffers according to the just learned write_seq, and then we send
3091	 * them on their way.
3092	 */
3093	fixup_pending_writeq_buffers(sk);
3094	if (t3_push_frames(so, 1))
3095		sk->sk_write_space(sk);
3096#endif
3097
3098	soisconnected(so);
3099	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
3100	tcpstat.tcps_connects++;
3101
3102}
3103
3104/*
3105 * Process a CPL_ACT_ESTABLISH message.
3106 */
3107static int
3108do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3109{
3110	struct cpl_act_establish *req = cplhdr(m);
3111	unsigned int tid = GET_TID(req);
3112	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3113	struct toepcb *toep = (struct toepcb *)ctx;
3114	struct tcpcb *tp = toep->tp_tp;
3115	struct socket *so;
3116	struct toedev *tdev;
3117	struct tom_data *d;
3118
3119	if (tp == NULL) {
3120		free_atid(cdev, atid);
3121		return (0);
3122	}
3123
3124	so = toeptoso(toep);
3125	tdev = TOE_DEV(so); /* blow up here if link was down */
3126	d = TOM_DATA(tdev);
3127
3128	INP_LOCK(tp->t_inpcb);
3129
3130	/*
3131	 * It's OK if the TID is currently in use, the owning socket may have
3132	 * backlogged its last CPL message(s).  Just take it away.
3133	 */
3134	toep->tp_tid = tid;
3135	toep->tp_tp = tp;
3136	so_insert_tid(d, so, tid);
3137	free_atid(cdev, atid);
3138	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3139
3140	socket_act_establish(so, m);
3141	INP_UNLOCK(tp->t_inpcb);
3142	return (0);
3143}
3144
3145/*
3146 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3147 * next batch of work requests from the write queue.
3148 */
3149static void
3150wr_ack(struct toepcb *toep, struct mbuf *m)
3151{
3152	struct tcpcb *tp = toep->tp_tp;
3153	struct cpl_wr_ack *hdr = cplhdr(m);
3154	struct socket *so = toeptoso(toep);
3155	unsigned int credits = ntohs(hdr->credits);
3156	u32 snd_una = ntohl(hdr->snd_una);
3157	int bytes = 0;
3158
3159	DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);
3160
3161	INP_LOCK(tp->t_inpcb);
3162
3163	toep->tp_wr_avail += credits;
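	/* Unacked WRs can be at most those still outstanding in HW. */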
3164	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3165		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3166
	while (credits) {
		struct mbuf *p = peek_wr(toep);

		if (__predict_false(!p)) {
			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u\n",
			       credits, toep->tp_tid, tp->t_state);
			break;
		}
		DPRINTF("p->credits=%d p->bytes=%d\n",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3177		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3178#if DEBUG_WR > 1
3179			struct tx_data_wr *w = cplhdr(p);
3180#ifdef notyet
3181			log(LOG_ERR,
3182			       "TID %u got %u WR credits, need %u, len %u, "
3183			       "main body %u, frags %u, seq # %u, ACK una %u,"
3184			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3185			       toep->tp_tid, credits, p->csum, p->len,
3186			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3187			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3188			       WR_AVAIL(tp), count_pending_wrs(tp) - credits);
3189#endif
3190#endif
3191			p->m_pkthdr.csum_data -= credits;
3192			break;
3193		} else {
3194			dequeue_wr(toep);
3195			credits -= p->m_pkthdr.csum_data;
3196			bytes += p->m_pkthdr.len;
3197			DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
3198
3199			m_free(p);
3200		}
3201	}
3202
3203#if DEBUG_WR
3204	check_wr_invariants(tp);
3205#endif
3206
3207	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3208#if VALIDATE_SEQ
3209		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3210
		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", d->tdev.tod_name, snd_una,
		    toep->tp_tid, tp->snd_una);
3214#endif
3215		goto out_free;
3216	}
3217
3218	if (tp->snd_una != snd_una) {
3219		tp->snd_una = snd_una;
3220		tp->ts_recent_age = ticks;
3221#ifdef notyet
3222		/*
3223		 * Keep ARP entry "minty fresh"
3224		 */
3225		dst_confirm(sk->sk_dst_cache);
3226#endif
3227		if (tp->snd_una == tp->snd_nxt)
3228			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3229	}
3230	if (bytes) {
3231		DPRINTF("sbdrop(%d)\n", bytes);
3232		SOCKBUF_LOCK(&so->so_snd);
3233		sbdrop_locked(&so->so_snd, bytes);
3234		sowwakeup_locked(so);
3235	}
3236
3237	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3238		t3_push_frames(so, 0);
3239
3240out_free:
3241	INP_UNLOCK(tp->t_inpcb);
3242	m_free(m);
3243}
3244
3245/*
3246 * Handler for TX_DATA_ACK CPL messages.
3247 */
3248static int
3249do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3250{
3251	struct toepcb *toep = (struct toepcb *)ctx;
3252
3253	DPRINTF("do_wr_ack\n");
3254	dump_toepcb(toep);
3255
	VALIDATE_SOCK(toeptoso(toep));

	wr_ack(toep, m);
	return (0);
3260}
3261
3262
3263/*
3264 * Reset a connection that is on a listener's SYN queue or accept queue,
3265 * i.e., one that has not had a struct socket associated with it.
3266 * Must be called from process context.
3267 *
3268 * Modeled after code in inet_csk_listen_stop().
3269 */
3270static void
3271t3_reset_listen_child(struct socket *child)
3272{
3273	struct tcpcb *tp = sototcpcb(child);
3274
3275	t3_send_reset(tp->t_toe);
3276}
3277
3278/*
3279 * Disconnect offloaded established but not yet accepted connections sitting
3280 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
3281 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3282 */
3283void
3284t3_disconnect_acceptq(struct socket *listen_so)
3285{
3286	struct socket *so;
3287	struct tcpcb *tp;
3288
3289	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3290		tp = sototcpcb(so);
3291
3292		if (tp->t_flags & TF_TOE) {
3293			INP_LOCK(tp->t_inpcb);
3294			t3_reset_listen_child(so);
3295			INP_UNLOCK(tp->t_inpcb);
		}
	}
3299}
3300
3301/*
3302 * Reset offloaded connections sitting on a server's syn queue.  As above
3303 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */
void
3307t3_reset_synq(struct listen_ctx *lctx)
3308{
3309	struct toepcb *toep;
3310
3311	SOCK_LOCK(lctx->lso);
3312	while (!LIST_EMPTY(&lctx->synq_head)) {
3313		toep = LIST_FIRST(&lctx->synq_head);
3314		LIST_REMOVE(toep, synq_entry);
3315		toep->tp_tp = NULL;
3316		t3_send_reset(toep);
3317		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3318		toepcb_release(toep);
3319	}
3320	SOCK_UNLOCK(lctx->lso);
3321}
3322
3323void
3324t3_init_wr_tab(unsigned int wr_len)
3325{
3326	int i;
3327
3328	if (mbuf_wrs[1])     /* already initialized */
3329		return;
3330
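	/*
	 * A sketch of the layout this table assumes: each pair of SG
	 * entries packs into 3 flits (a lone entry takes 2), plus 3
	 * flits of WR/CPL header; mbuf_wrs[i] is then the number of
	 * wr_len-flit WRs needed for an i-buffer payload.
	 */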
3331	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
3332		int sgl_len = (3 * i) / 2 + (i & 1);
3333
3334		sgl_len += 3;
3335		mbuf_wrs[i] = sgl_len <= wr_len ?
3336		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
3337	}
3338
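	/* Cache the WR size in bytes (wr_len presumably counts 8-byte flits). */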
3339	wrlen = wr_len * 8;
3340}
3341
3342int
3343t3_init_cpl_io(void)
3344{
3345#ifdef notyet
3346	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
3347	if (!tcphdr_skb) {
3348		log(LOG_ERR,
3349		       "Chelsio TCP offload: can't allocate sk_buff\n");
3350		return -1;
3351	}
3352	skb_put(tcphdr_skb, sizeof(struct tcphdr));
3353	tcphdr_skb->h.raw = tcphdr_skb->data;
3354	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
3355#endif
3356
3357
3358	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
3359	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
3360	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
3361	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
3362	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
3363	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
3364	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
3365	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
3366	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
3367	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
3368	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
3369	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
3370#ifdef notyet
3371	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
3372	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
3373	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
3374#endif
3375	return (0);
3376}
3377
3378