cxgb_cpl_io.c revision 174708
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 174708 2007-12-17 08:17:51Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
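
#if 0
/*
 * Illustrative sketch (not part of the original file): how the
 * compensation table above would be consulted when accounting for a ULP
 * packet's sequence space.  "submode" is assumed to be the 2-bit ULP
 * submode carried with the packet; the table index is exactly that
 * submode (0 = none, 1 = header digest, 2 = data digest, 3 = both).
 */
static inline unsigned int
ulp_extra_len_example(unsigned int submode)
{
	return (t3_ulp_extra_len[submode & 3]);
}
#endif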

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in the mbuf and whether it has any payload in its main body.
 * This maps the length of the gather list represented by an mbuf into the
 * # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;
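
#if 0
/*
 * Sketch of how mbuf_wrs[] could be derived from wrlen; the actual
 * initialization happens in the TOM setup path, not in this file, and
 * "sgl_per_wr" is a hypothetical count of gather-list entries that fit
 * in one WR.  A gather list of n segments then needs
 * howmany(n, sgl_per_wr) work requests.
 */
static void
init_mbuf_wrs_example(unsigned int sgl_per_wr)
{
	unsigned int i;

	for (i = 1; i < TX_MAX_SEGS; i++)
		mbuf_wrs[i] = (i + sgl_per_wr - 1) / sgl_per_wr;
}
#endif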

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
{
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(T3C_DEV(so), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(T3C_DEV(so), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct socket *so)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
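
#if 0
/*
 * Illustrative caller (not in the original file): allocate a correctly
 * sized mbuf with m_gethdr_nofail(), as the rest of this file does,
 * fill in the TID_RELEASE WR, and hand it to the offload queue directly.
 */
static void
send_tid_release_example(struct t3cdev *cdev, struct socket *so,
    unsigned int tid)
{
	struct mbuf *m = m_gethdr_nofail(sizeof(struct cpl_tid_release));

	mk_tid_release(m, so, tid);
	cxgb_ofld_send(cdev, m);
}
#endif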

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/* Send buffer is in units of 32KB. */
		if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	segp = segs;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);

	SOCKBUF_LOCK(&so->so_snd);

	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/* Check count first so mbuf_wrs[] is never indexed past its end. */
		while ((count < TX_MAX_SEGS) &&
		    (mbuf_wrs[count + 1] <= toep->tp_wr_avail) && (tail != NULL)) {
			bytes += tail->m_len;
			count++;
			last = tail;
			/*
			 * technically an abuse to be using this for a VA
			 * but less gross than defining my own structure
			 * or calling pmap_kextract from here :-|
			 */
			segp->ds_addr = (bus_addr_t)tail->m_data;
			segp->ds_len = tail->m_len;
			DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
			    count, mbuf_wrs[count], tail->m_data, tail->m_len);

			segp++;
			tail = tail->m_next;
		}
		DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;

		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * XXX can drop socket buffer lock here
		 */

		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		make_tx_data_wr(so, m0, bytes, tail);
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
		m_set_sgl(m0, segs);
		m_set_sgllen(m0, count);
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);

		l2t_send(cdev, m0, toep->tp_l2t);
		if (toep->tp_wr_avail && (tail != NULL))
			SOCKBUF_LOCK(&so->so_snd);
	}

	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2)))
		return;
	INP_LOCK_ASSERT(tp->t_inpcb);

	so = tp->t_inpcb->inp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
	toep->tp_copied_seq += read;
	toep->tp_enqueued_bytes -= read;
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax)
		printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (toep->tp_ulp_mode)
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
	else {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	}

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
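
#if 0
/*
 * Worked example for the credit accounting above (numbers illustrative):
 * if 48KB has been copied out since the last update
 * (tp_copied_seq - tp_rcv_wup == 48K), rx_credit_thres is 16K and
 * rcv_wnd is 256KB, then credits (48K) >= thres, so an RX_DATA_ACK
 * returning 48K credits is sent and tp_rcv_wup advances by 48K.
 * must_send would only force an update once less than 16KB of the
 * window remained unreturned.
 */
#endif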

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = tp->t_inpcb->inp_socket;
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = tp->t_inpcb->inp_socket;
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
	INP_LOCK_ASSERT(tp->t_inpcb);
	t3_cleanup_rbuf(tp);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;
	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	INP_INFO_WLOCK(&tcbinfo);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
	INP_INFO_WUNLOCK(&tcbinfo);
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	send_or_defer(so, tp, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
		return;

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(so, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	/* Widen val before shifting so bit positions above 31 work. */
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to the freelist).  [Note that the peer
 * should set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on)
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef notyet
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return (0);
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);

	toepcb_hold(toep);
	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
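
#if 0
/*
 * Worked example with a hypothetical table mtus[] = { 576, 1500, 9000 }:
 * find_best_mtu(d, 1500) returns index 1, while find_best_mtu(d, 1400)
 * returns index 0, since an entry may not exceed the target MTU.
 */
#endif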

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}
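
#if 0
/*
 * Worked example: with pmtu = 1500 and a first table entry of 576,
 * t_maxseg is first clamped to 1500 - 40 = 1460 (40 bytes of IP + TCP
 * headers), find_best_mtu() is then asked for 1460 + 40 = 1500, and
 * t_maxseg is finally rewritten from the chosen table entry, e.g.
 * 1500 - 40 = 1460 again if the table contains 1500 exactly.
 */
#endif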

void
t3_release_ddp_resources(struct toepcb *toep)
{
	/*
	 * This is a no-op until we have DDP support
	 */
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	printf("setting toep->tp_tp to NULL\n");

	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {	// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
	return (wscale);
}
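
#if 0
/*
 * Worked example: for a 256KB target window select_rcv_wscale()
 * returns 3, since 262144 >> 2 = 65536 is still > 65535 while
 * 262144 >> 3 = 32768 fits in the 16-bit window field.
 */
#endif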

/*
 * Determine the receive window size for a socket.
 */
static unsigned int
select_rcv_wnd(struct socket *so)
{
	struct toedev *dev = TOE_DEV(so);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = sbspace(&so->so_rcv);

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
#ifdef notyet
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_rcv.sb_flags |= SB_TOE;
	so->so_snd.sb_flags |= SB_TOE;
#endif
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(so);
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}
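
#if 0
/*
 * Worked example: a 256KB receive window is encoded in option 0 as
 * V_RCV_BUFSIZ(262144 >> 10) = V_RCV_BUFSIZ(256), subject to the
 * M_RCV_BUFSIZ clamp applied above.
 */
#endif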

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	DPRINTF("connect smt_idx=%d\n", e->smt_idx);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		cxgb_tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.  XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif

/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */
	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	printf("sending off request\n");

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	/* suser() returns 0 when the thread is privileged */
	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef notyet
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		INP_LOCK(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(so);
	}

	return (0);
}

static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	int len = be16toh(hdr->len);

	INP_LOCK(tp->t_inpcb);

#ifdef notyet
	if (__predict_false(sk_no_receive(sk))) {
		handle_excess_rx(so, skb);
		return;
	}

	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
		handle_ddp_data(so, skb);

	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
	TCP_SKB_CB(skb)->flags = 0;
	skb_ulp_mode(skb) = 0;                    /* for iSCSI */
#endif
#if VALIDATE_SEQ
	if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
		printk(KERN_ERR
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		       TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
		       tp->rcv_nxt);
		__kfree_skb(skb);
		return;
	}
#endif
	m_adj(m, sizeof(*hdr));

#ifdef notyet
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
	T3_TRACE2(TIDTB(sk),
		  "new_rx_data: seq 0x%x len %u",
		  TCP_SKB_CB(skb)->seq, skb->len);
#endif
	SOCKBUF_LOCK(&so->so_rcv);
	if (sb_notify(&so->so_rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);

	sbappend_locked(&so->so_rcv, m);
	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,
	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
	    so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));

	INP_UNLOCK(tp->t_inpcb);
	DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);

	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	tp = sototcpcb(so);
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store the old rcv_nxt
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in 'when'.  Assumes the buffer offset starts at 0; the consumer
	 * needs to account for the page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(so, m);
	return (0);
}

static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	when = bsp->cur_offset;
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));
#endif
#if 0
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_twstart to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;		/* defeat recycling */
	tp->t_srtt = 0;			/* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);

	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);

#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}

#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
		socantrcvmore(so);
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);

		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	do_peer_fin(so, m);
	return (0);
}

static void
process_close_con_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct toepcb *toep = tp->t_toe;

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
	    !!(so->so_state & SS_NOFDREF));
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
		goto out;

	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
		break;
	case TCPS_FIN_WAIT_1:
#ifdef notyet
		dst_confirm(sk->sk_dst_cache);
#endif
		soisdisconnecting(so);

		if ((so->so_state & SS_NOFDREF) == 0) {
			/*
			 * Wake up lingering close
			 */
			sowwakeup(so);
			sorwakeup(so);
			wakeup(&so->so_timeo);
		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			tp = cxgb_tcp_drop(tp, 0);
		}

		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid,
		       tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
out:
	m_free(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	process_close_con_rpl(so, m);
	return (0);
}
1993
1994/*
1995 * Process abort replies.  We only process these messages if we anticipate
1996 * them as the coordination between SW and HW in this area is somewhat lacking
1997 * and sometimes we get ABORT_RPLs after we are done with the connection that
1998 * originated the ABORT_REQ.
1999 */
2000static void
2001process_abort_rpl(struct socket *so, struct mbuf *m)
2002{
2003	struct tcpcb *tp = sototcpcb(so);
2004	struct toepcb *toep = tp->t_toe;
2005
2006#ifdef T3_TRACE
2007	T3_TRACE1(TIDTB(sk),
2008		  "process_abort_rpl: GTS rpl pending %d",
2009		  sock_flag(sk, ABORT_RPL_PENDING));
2010#endif
2011	INP_LOCK(tp->t_inpcb);
2012
2013	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2014		/*
2015		 * XXX panic on tcpdrop
2016		 */
2017		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2018			toep->tp_flags |= TP_ABORT_RPL_RCVD;
2019		else {
2020			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2021			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2022			    !is_t3a(TOE_DEV(so))) {
2023				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2024					panic("TP_ABORT_REQ_RCVD set");
2025				INP_INFO_WLOCK(&tcbinfo);
2026				INP_LOCK(tp->t_inpcb);
2027				t3_release_offload_resources(toep);
2028				tp = tcp_close(tp);
2029				INP_INFO_WUNLOCK(&tcbinfo);
2030			}
2031		}
2032	}
2033	if (tp)
2034		INP_UNLOCK(tp->t_inpcb);
2035
2036	m_free(m);
2037}
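
/*
 * Illustrative sketch (guarded out, not compiled): process_abort_rpl()
 * above may see two ABORT_RPLs for a single ABORT_REQ.  The first copy
 * is merely recorded via TP_ABORT_RPL_RCVD and teardown happens on the
 * second, except on T3A where one reply suffices.  The hypothetical
 * helper below distills that flag dance; "t3a" stands in for
 * is_t3a(TOE_DEV(so)).
 */
#if 0
static int
abort_rpl_completes_teardown(unsigned int *flags, int t3a)
{
	if ((*flags & TP_ABORT_RPL_PENDING) == 0)
		return (0);		/* unsolicited reply, ignore it */
	if (!(*flags & TP_ABORT_RPL_RCVD) && !t3a) {
		*flags |= TP_ABORT_RPL_RCVD;	/* wait for the second copy */
		return (0);
	}
	*flags &= ~(TP_ABORT_RPL_RCVD | TP_ABORT_RPL_PENDING);
	return (1);			/* safe to tear the connection down */
}
#endif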
2038
2039/*
2040 * Handle an ABORT_RPL_RSS CPL message.
2041 */
2042static int
2043do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2044{
2045	struct socket *so;
2046	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2047	struct toepcb *toep;
2048
2049	/*
2050	 * Ignore replies to post-close aborts indicating that the abort was
2051	 * requested too late.  These connections are terminated when we get
2052	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2053	 * arrives the TID is either no longer used or it has been recycled.
2054	 */
2055	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2056discard:
2057		m_free(m);
2058		return (0);
2059	}
2060
2061	toep = (struct toepcb *)ctx;
2062
	/*
	 * Sometimes we've already closed the socket, e.g., a post-close
	 * abort races with ABORT_REQ_RSS.  The latter frees the socket
	 * expecting the ABORT_REQ to fail with CPL_ERR_ABORT_FAILED, but
	 * FW turns the ABORT_REQ into a regular one, so we get
	 * ABORT_RPL_RSS with status 0 and no socket.  This happens only
	 * on T3A.
	 */
2070	if (!toep)
2071		goto discard;
2072
2073	if (toep->tp_tp == NULL) {
2074		printf("removing tid for abort\n");
2075		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2076		if (toep->tp_l2t)
2077			l2t_release(L2DATA(cdev), toep->tp_l2t);
2078
2079		toepcb_release(toep);
2080		goto discard;
2081	}
2082
2083	printf("toep=%p\n", toep);
2084	printf("tp=%p\n", toep->tp_tp);
2085
2086	so = toeptoso(toep); /* <- XXX panic */
2087	toepcb_hold(toep);
2088	process_abort_rpl(so, m);
2089	toepcb_release(toep);
2090	return (0);
2091}
2092
2093/*
 * Convert the status code of an ABORT_REQ into an errno value.  Also
2095 * indicate whether RST should be sent in response.
2096 */
2097static int
2098abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2099{
2100	struct tcpcb *tp = sototcpcb(so);
2101
2102	switch (abort_reason) {
2103	case CPL_ERR_BAD_SYN:
2104#if 0
2105		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2106#endif
2107	case CPL_ERR_CONN_RESET:
2108		// XXX need to handle SYN_RECV due to crossed SYNs
2109		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2110	case CPL_ERR_XMIT_TIMEDOUT:
2111	case CPL_ERR_PERSIST_TIMEDOUT:
2112	case CPL_ERR_FINWAIT2_TIMEDOUT:
2113	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2114#if 0
2115		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2116#endif
2117		return (ETIMEDOUT);
2118	default:
2119		return (EIO);
2120	}
2121}
2122
2123static inline void
2124set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2125{
2126	struct cpl_abort_rpl *rpl = cplhdr(m);
2127
2128	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2129	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2130	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2131
2132	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2133	rpl->cmd = cmd;
2134}
2135
2136static void
2137send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2138{
2139	struct mbuf *reply_mbuf;
2140	struct cpl_abort_req_rss *req = cplhdr(m);
2141
2142	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2145	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2146	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2147	m_free(m);
2148}
2149
2150/*
2151 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2152 */
2153static inline int
2154is_neg_adv_abort(unsigned int status)
2155{
	return (status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE);
2158}
2159
2160static void
2161send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2162{
2163	struct mbuf  *reply_mbuf;
2164	struct cpl_abort_req_rss *req = cplhdr(m);
2165
2166	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2167
2168	if (!reply_mbuf) {
		/* Defer the reply.  Stash rst_status in req->status. */
2170		req->status = rst_status;
2171		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2172		return;
2173	}
2174
2175	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2176	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2177	m_free(m);
2178
2179	/*
2180	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2181	 * these messages while ARP is pending.  For other connection states
2182	 * it's not a problem.
2183	 */
2184	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2185}
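
/*
 * Sketch of the deferral round trip above (guarded out, not compiled):
 * when no mbuf is available the original request is parked via
 * t3_defer_reply() with the desired reply command stashed in
 * req->status, and send_deferred_abort_rpl() reads it back from there.
 */
#if 0
	req->status = rst_status;	/* stash the reply command */
	t3_defer_reply(m, tdev, send_deferred_abort_rpl);
	/* ...later, from send_deferred_abort_rpl():... */
	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
#endif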
2186
2187#ifdef notyet
2188static void
2189cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2190{
2191	UNIMPLEMENTED();
2192#ifdef notyet
2193	struct request_sock *req = child->sk_user_data;
2194
2195	inet_csk_reqsk_queue_removed(parent, req);
2196	synq_remove(tcp_sk(child));
2197	__reqsk_free(req);
2198	child->sk_user_data = NULL;
2199#endif
2200}
2201
2202
2203/*
2204 * Performs the actual work to abort a SYN_RECV connection.
2205 */
2206static void
2207do_abort_syn_rcv(struct socket *child, struct socket *parent)
2208{
2209	struct tcpcb *parenttp = sototcpcb(parent);
2210	struct tcpcb *childtp = sototcpcb(child);
2211
2212	/*
2213	 * If the server is still open we clean up the child connection,
	 * otherwise the server already did the cleanup as it was purging
	 * its SYN queue and the mbuf was just sitting in its backlog.
2216	 */
2217	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2218		cleanup_syn_rcv_conn(child, parent);
2219		INP_INFO_WLOCK(&tcbinfo);
2220		INP_LOCK(childtp->t_inpcb);
2221		t3_release_offload_resources(childtp->t_toe);
2222		childtp = tcp_close(childtp);
2223		INP_INFO_WUNLOCK(&tcbinfo);
2224		if (childtp)
2225			INP_UNLOCK(childtp->t_inpcb);
2226	}
2227}
2228#endif
2229
2230/*
2231 * Handle abort requests for a SYN_RECV connection.  These need extra work
2232 * because the socket is on its parent's SYN queue.
2233 */
2234static int
2235abort_syn_rcv(struct socket *so, struct mbuf *m)
2236{
2237	UNIMPLEMENTED();
2238#ifdef notyet
2239	struct socket *parent;
2240	struct toedev *tdev = TOE_DEV(so);
2241	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2242	struct socket *oreq = so->so_incomp;
2243	struct t3c_tid_entry *t3c_stid;
2244	struct tid_info *t;
2245
2246	if (!oreq)
2247		return -1;        /* somehow we are not on the SYN queue */
2248
2249	t = &(T3C_DATA(cdev))->tid_maps;
2250	t3c_stid = lookup_stid(t, oreq->ts_recent);
2251	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2252
2253	SOCK_LOCK(parent);
2254	do_abort_syn_rcv(so, parent);
2255	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2256	SOCK_UNLOCK(parent);
2257#endif
2258	return (0);
2259}
2260
2261/*
2262 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2263 * request except that we need to reply to it.
2264 */
2265static void
2266process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2267{
2268	int rst_status = CPL_ABORT_NO_RST;
2269	const struct cpl_abort_req_rss *req = cplhdr(m);
2270	struct tcpcb *tp = sototcpcb(so);
2271	struct toepcb *toep = tp->t_toe;
2272
2273	INP_LOCK(tp->t_inpcb);
2274	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2275		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2276		m_free(m);
2277		goto skip;
2278	}
2279
2280	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2281	/*
2282	 * Three cases to consider:
2283	 * a) We haven't sent an abort_req; close the connection.
2284	 * b) We have sent a post-close abort_req that will get to TP too late
2285	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2286	 *    be ignored and the connection should be closed now.
2287	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0; wait for it.
2289	 */
2290	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2291	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2292		so->so_error = abort_status_to_errno(so, req->status,
2293		    &rst_status);
2294#if 0
2295		if (!sock_flag(sk, SOCK_DEAD))
2296			sk->sk_error_report(sk);
2297#endif
2298		/*
2299		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
2301		 */
2302		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2303			goto skip;
2304
2305		t3_release_offload_resources(toep);
2306		tp = tcp_close(tp);
2307	}
2308	if (tp)
2309		INP_UNLOCK(tp->t_inpcb);
2310	send_abort_rpl(m, tdev, rst_status);
2311	return;
2312
2313skip:
2314	INP_UNLOCK(tp->t_inpcb);
2315}
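
/*
 * Illustrative decision table for the three cases above (guarded out,
 * not compiled).  The hypothetical helper returns 1 when the peer's
 * ABORT_REQ should close the connection now (cases a and b) and 0 when
 * we keep waiting for our own abort_rpl (case c); "t3a" stands in for
 * is_t3a(TOE_DEV(so)).
 */
#if 0
static int
abort_req_closes_conn(unsigned int flags, int t3a)
{
	if ((flags & TP_ABORT_RPL_PENDING) == 0)
		return (1);	/* case a: no abort_req of our own */
	if (t3a && (flags & TP_CLOSE_CON_REQUESTED))
		return (1);	/* case b: post-close abort_req, reply ignored */
	return (0);		/* case c: wait for the abort_rpl */
}
#endif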
2316
2317/*
2318 * Handle an ABORT_REQ_RSS CPL message.
2319 */
2320static int
2321do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2322{
2323	const struct cpl_abort_req_rss *req = cplhdr(m);
2324	struct toepcb *toep = (struct toepcb *)ctx;
2325	struct socket *so;
2326	struct inpcb *inp;
2327
2328	if (is_neg_adv_abort(req->status)) {
2329		m_free(m);
2330		return (0);
2331	}
2332
2333	printf("aborting tid=%d\n", toep->tp_tid);
2334
2335	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2336		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2337		toep->tp_flags |= TP_ABORT_REQ_RCVD;
2338		printf("sending abort rpl\n");
2339
2340		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2341		printf("sent\n");
2342		if (toep->tp_l2t)
2343			l2t_release(L2DATA(cdev), toep->tp_l2t);
2344
2345		/*
2346		 *  Unhook
2347		 */
2348		toep->tp_tp->t_toe = NULL;
2349		toep->tp_tp->t_flags &= ~TF_TOE;
2350		toep->tp_tp = NULL;
2351		/*
2352		 * XXX need to call syncache_chkrst - but we don't
2353		 * have a way of doing that yet
2354		 */
2355		toepcb_release(toep);
2356		printf("abort for unestablished connection :-(\n");
2357		return (0);
2358	}
2359	if (toep->tp_tp == NULL) {
2360		printf("disconnected toepcb\n");
2361		/* should be freed momentarily */
2362		return (0);
2363	}
2364
2365	so = toeptoso(toep);
2366	inp = sotoinpcb(so);
2367
2368	VALIDATE_SOCK(so);
2369	toepcb_hold(toep);
2370	INP_INFO_WLOCK(&tcbinfo);
2371	process_abort_req(so, m, TOE_DEV(so));
2372	INP_INFO_WUNLOCK(&tcbinfo);
2373	toepcb_release(toep);
2374	return (0);
2375}
2376#ifdef notyet
2377static void
2378pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2379{
2380	struct toedev *tdev = TOE_DEV(parent);
2381
2382	do_abort_syn_rcv(child, parent);
2383	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2384		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2385
2386		rpl->opt0h = htonl(F_TCAM_BYPASS);
2387		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2388		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2389	} else
2390		m_free(m);
2391}
2392#endif
2393static void
2394handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
2395{
2396	UNIMPLEMENTED();
2397
2398#ifdef notyet
2399	struct t3cdev *cdev;
2400	struct socket *parent;
2401	struct socket *oreq;
2402	struct t3c_tid_entry *t3c_stid;
2403	struct tid_info *t;
2404	struct tcpcb *otp, *tp = sototcpcb(so);
2405	struct toepcb *toep = tp->t_toe;
2406
2407	/*
2408	 * If the connection is being aborted due to the parent listening
2409	 * socket going away there's nothing to do, the ABORT_REQ will close
2410	 * the connection.
2411	 */
2412	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2413		m_free(m);
2414		return;
2415	}
2416
2417	oreq = so->so_incomp;
2418	otp = sototcpcb(oreq);
2419
2420	cdev = T3C_DEV(so);
2421	t = &(T3C_DATA(cdev))->tid_maps;
2422	t3c_stid = lookup_stid(t, otp->ts_recent);
2423	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2424
2425	SOCK_LOCK(parent);
2426	pass_open_abort(so, parent, m);
2427	SOCK_UNLOCK(parent);
2428#endif
2429}
2430
2431/*
2432 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
2433 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
2434 * connection.
2435 */
2436static void
2437pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
2438{
2439
2440#ifdef notyet
2441	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2442	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
2443#endif
2444	handle_pass_open_arp_failure(m_get_socket(m), m);
2445}
2446
2447/*
2448 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
2449 */
2450static void
2451mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
2452{
2453	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
2454	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
2455	unsigned int tid = GET_TID(req);
2456
2457	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
2458	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
2459	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
2460	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
2461	rpl->opt0h = htonl(F_TCAM_BYPASS);
2462	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2463	rpl->opt2 = 0;
2464	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
2465}
2466
2467/*
2468 * Send a deferred reject to an accept request.
2469 */
2470static void
2471reject_pass_request(struct toedev *tdev, struct mbuf *m)
2472{
2473	struct mbuf *reply_mbuf;
2474
2475	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
2476	mk_pass_accept_rpl(reply_mbuf, m);
2477	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2478	m_free(m);
2479}
2480
2481static void
2482handle_syncache_event(int event, void *arg)
2483{
2484	struct toepcb *toep = arg;
2485
2486	switch (event) {
2487	case TOE_SC_ENTRY_PRESENT:
2488		/*
2489		 * entry already exists - free toepcb
2490		 * and l2t
2491		 */
2492		printf("syncache entry present\n");
2493		toepcb_release(toep);
2494		break;
2495	case TOE_SC_DROP:
2496		/*
		 * The syncache has given up on this entry:
		 * either it timed out or it was evicted.
		 * We need to explicitly release the TID.
2500		 */
2501		printf("syncache entry dropped\n");
2502		toepcb_release(toep);
2503		break;
2504	default:
2505		log(LOG_ERR, "unknown syncache event %d\n", event);
2506		break;
2507	}
2508}
2509
2510static void
2511syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
2512{
2513	struct in_conninfo inc;
2514	struct tcpopt to;
2515	struct tcphdr th;
2516	struct inpcb *inp;
2517	int mss, wsf, sack, ts;
2518
2519	bzero(&to, sizeof(struct tcpopt));
2520	inp = sotoinpcb(lso);
2521
2522	/*
2523	 * Fill out information for entering us into the syncache
2524	 */
2525	inc.inc_fport = th.th_sport = req->peer_port;
2526	inc.inc_lport = th.th_dport = req->local_port;
2527	toep->tp_iss = th.th_seq = req->rcv_isn;
2528	th.th_flags = TH_SYN;
2529
2530	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
2531
2532	inc.inc_isipv6 = 0;
2533	inc.inc_len = 0;
2534	inc.inc_faddr.s_addr = req->peer_ip;
2535	inc.inc_laddr.s_addr = req->local_ip;
2536
2537	DPRINTF("syncache add of %d:%d %d:%d\n",
2538	    ntohl(req->local_ip), ntohs(req->local_port),
2539	    ntohl(req->peer_ip), ntohs(req->peer_port));
2540
2541	mss = req->tcp_options.mss;
2542	wsf = req->tcp_options.wsf;
2543	ts = req->tcp_options.tstamp;
2544	sack = req->tcp_options.sack;
2545	to.to_mss = mss;
2546	to.to_wscale = wsf;
2547	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
2548
2549	INP_INFO_WLOCK(&tcbinfo);
2550	INP_LOCK(inp);
2551	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
2552}
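
/*
 * Example (sketch): a SYN advertising MSS 1460 with window scaling and
 * timestamps but no SACK yields to_flags = TOF_MSS | TOF_SCALE |
 * TOF_TS, and the syncache replays exactly those options when the
 * entry is expanded on CPL_PASS_ESTABLISH.
 */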
2553
2554
2555/*
2556 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
2557 * lock held.  Note that the sock here is a listening socket that is not owned
2558 * by the TOE.
2559 */
2560static void
2561process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
2562    struct listen_ctx *lctx)
2563{
2564	int rt_flags;
2565	struct l2t_entry *e;
2566	struct iff_mac tim;
2567	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
2568	struct cpl_pass_accept_rpl *rpl;
2569	struct cpl_pass_accept_req *req = cplhdr(m);
2570	unsigned int tid = GET_TID(req);
2571	struct tom_data *d = TOM_DATA(tdev);
2572	struct t3cdev *cdev = d->cdev;
2573	struct tcpcb *tp = sototcpcb(so);
2574	struct toepcb *newtoep;
2575	struct rtentry *dst;
2576	struct sockaddr_in nam;
2577	struct t3c_data *td = T3C_DATA(cdev);
2578
2579	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2580	if (__predict_false(reply_mbuf == NULL)) {
2581		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
2582			t3_defer_reply(m, tdev, reject_pass_request);
2583		else {
2584			cxgb_queue_tid_release(cdev, tid);
2585			m_free(m);
2586		}
2587		DPRINTF("failed to get reply_mbuf\n");
2588
2589		goto out;
2590	}
2591
2592	if (tp->t_state != TCPS_LISTEN) {
2593		DPRINTF("socket not in listen state\n");
2594
2595		goto reject;
2596	}
2597
2598	tim.mac_addr = req->dst_mac;
2599	tim.vlan_tag = ntohs(req->vlan_tag);
2600	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
2601		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
2602		goto reject;
2603	}
2604
2605#ifdef notyet
2606	/*
2607	 * XXX do route lookup to confirm that we're still listening on this
2608	 * address
2609	 */
2610	if (ip_route_input(skb, req->local_ip, req->peer_ip,
2611			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
2612		goto reject;
2613	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
2614		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
2615	dst_release(skb->dst);	// done with the input route, release it
2616	skb->dst = NULL;
2617
2618	if ((rt_flags & RTF_LOCAL) == 0)
2619		goto reject;
2620#endif
2621	/*
2622	 * XXX
2623	 */
2624	rt_flags = RTF_LOCAL;
2625	if ((rt_flags & RTF_LOCAL) == 0)
2626		goto reject;
2627
2628	/*
2629	 * Calculate values and add to syncache
2630	 */
2631
2632	newtoep = toepcb_alloc();
2633	if (newtoep == NULL)
2634		goto reject;
2635
2636	bzero(&nam, sizeof(struct sockaddr_in));
2637
2638	nam.sin_len = sizeof(struct sockaddr_in);
2639	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr = req->peer_ip;
2641	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
2642
2643	if (dst == NULL) {
2644		printf("failed to find route\n");
2645		goto reject;
2646	}
2647	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
2648	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
		goto reject;
	}
2652	/*
2653	 * Point to our listen socket until accept
2654	 */
2655	newtoep->tp_tp = tp;
2656	newtoep->tp_flags = TP_SYN_RCVD;
2657	newtoep->tp_tid = tid;
2658	newtoep->tp_toedev = tdev;
2659
2660	printf("inserting tid=%d\n", tid);
2661	cxgb_insert_tid(cdev, d->client, newtoep, tid);
2662	SOCK_LOCK(so);
2663	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
2664	SOCK_UNLOCK(so);
2665
2666
2667	if (lctx->ulp_mode) {
2668		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2669
2670		if (!ddp_mbuf)
2671			newtoep->tp_ulp_mode = 0;
2672		else
2673			newtoep->tp_ulp_mode = lctx->ulp_mode;
2674	}
2675
2676	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
2677
2678	DPRINTF("adding request to syn cache\n");
2679
2680	/*
2681	 * XXX workaround for lack of syncache drop
2682	 */
2683	toepcb_hold(newtoep);
2684	syncache_add_accept_req(req, so, newtoep);
2685
2688	rpl = cplhdr(reply_mbuf);
2689	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
2690	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
2691	rpl->wr.wr_lo = 0;
2692	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
2693	rpl->opt2 = htonl(calc_opt2(so, tdev));
2694	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
2695	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
2696
2697	DPRINTF("accept smt_idx=%d\n", e->smt_idx);
2698
2699	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
2700	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
2701	rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
2702				  CPL_PASS_OPEN_ACCEPT);
2703
2704	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
2705
2706	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
2707
2708#ifdef DEBUG_PRINT
2709	{
2710		int i;
2711
2712		DPRINTF("rpl:\n");
2713		uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
2714
2715		for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
2716			DPRINTF("[%d] %08x\n", i, rplbuf[i]);
2717	}
2718#endif
2719
2720
2721	l2t_send(cdev, reply_mbuf, e);
2722	m_free(m);
2723#ifdef notyet
2724	/*
2725	 * XXX this call path has to be converted to not depend on sockets
2726	 */
2727	if (newtoep->tp_ulp_mode)
2728		__set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
2729				V_TF_DDP_OFF(1) |
2730				TP_DDP_TIMER_WORKAROUND_MASK,
2731				V_TF_DDP_OFF(1) |
2732				TP_DDP_TIMER_WORKAROUND_VAL, 1);
2733
2734#endif
2735	return;
2736reject:
2737	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
2738		mk_pass_accept_rpl(reply_mbuf, m);
2739	else
2740		mk_tid_release(reply_mbuf, NULL, tid);
2741	cxgb_ofld_send(cdev, reply_mbuf);
2742	m_free(m);
2743out:
#if 0
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
	return;
2749}
2750
2751/*
2752 * Handle a CPL_PASS_ACCEPT_REQ message.
2753 */
2754static int
2755do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2756{
2757	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
2758	struct socket *lso = listen_ctx->lso;
2759	struct tom_data *d = listen_ctx->tom_data;
2760
2761#if VALIDATE_TID
2762	struct cpl_pass_accept_req *req = cplhdr(m);
2763	unsigned int tid = GET_TID(req);
2764	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
2765
2766	if (unlikely(!lsk)) {
2767		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
2768		       cdev->name,
2769		       (unsigned long)((union listen_entry *)ctx -
2770					t->stid_tab));
2771		return CPL_RET_BUF_DONE;
2772	}
2773	if (unlikely(tid >= t->ntids)) {
2774		printk(KERN_ERR "%s: passive open TID %u too large\n",
2775		       cdev->name, tid);
2776		return CPL_RET_BUF_DONE;
2777	}
2778	/*
2779	 * For T3A the current user of the TID may have closed but its last
2780	 * message(s) may have been backlogged so the TID appears to be still
2781	 * in use.  Just take the TID away, the connection can close at its
2782	 * own leisure.  For T3B this situation is a bug.
2783	 */
2784	if (!valid_new_tid(t, tid) &&
2785	    cdev->type != T3A) {
2786		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
2787		       cdev->name, tid);
2788		return CPL_RET_BUF_DONE;
2789	}
2790#endif
2791
2792	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
2793	return (0);
2794}
2795
2796/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to the host stack's native format.
2799 */
2800static void
2801assign_rxopt(struct socket *so, unsigned int opt)
2802{
2803	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
2804	struct tcpcb *tp = sototcpcb(so);
2805	struct toepcb *toep = tp->t_toe;
2806
2807	INP_LOCK_ASSERT(tp->t_inpcb);
2808
2809	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
2810	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
2811	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
2812	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
2813	if (tp->t_flags & TF_RCVD_SCALE)
2814		tp->rcv_scale = 0;
2815}
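
/*
 * Worked example for the MSS clamp above: if the HW-reported MTU index
 * selects a 1500-byte entry in td->mtus[], tp_mss_clamp becomes
 * 1500 - 40 = 1460, i.e. the MTU less the 20-byte IP and 20-byte TCP
 * headers (options excluded).
 */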
2816
2817/*
 * Completes some final bits of initialization for just-established
 * connections and changes their state to TCPS_ESTABLISHED.
2820 *
2821 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
2822 */
2823static void
2824make_established(struct socket *so, u32 snd_isn, unsigned int opt)
2825{
2826	struct tcpcb *tp = sototcpcb(so);
2827	struct toepcb *toep = tp->t_toe;
2828
2829	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
2830	assign_rxopt(so, opt);
2831	so->so_proto->pr_ctloutput = t3_ctloutput;
2832
2833#if 0
2834	inet_sk(sk)->id = tp->write_seq ^ jiffies;
2835#endif
2836
2837
2838	/*
2839	 * XXX not clear what rcv_wup maps to
2840	 */
2841	/*
2842	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
2843	 * pass through opt0.
2844	 */
2845	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
2846		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
2847
2848	dump_toepcb(toep);
2849
2850#ifdef notyet
2851/*
2852 * no clean interface for marking ARP up to date
2853 */
2854	dst_confirm(sk->sk_dst_cache);
2855#endif
2856	tp->t_state = TCPS_ESTABLISHED;
2857}
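
/*
 * Worked example for the Rx-credit clamp above (hypothetical numbers):
 * with a 1 MB rcv_wnd but an opt0 limit of 256 KB, tp_rcv_wup starts
 * 768 KB behind, so the first RX_DATA_ACK immediately returns the
 * remaining 768 KB of window to the HW as Rx credits.
 */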
2858
2859static int
2860syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
2861{
2862
2863	struct in_conninfo inc;
2864	struct tcpopt to;
2865	struct tcphdr th;
2866	int mss, wsf, sack, ts;
2867	struct mbuf *m = NULL;
2868	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
2869	unsigned int opt;
2870
2871#ifdef MAC
2872#error	"no MAC support"
2873#endif
2874
2875	opt = ntohs(req->tcp_opt);
2876
2877	bzero(&to, sizeof(struct tcpopt));
2878
2879	/*
2880	 * Fill out information for entering us into the syncache
2881	 */
2882	inc.inc_fport = th.th_sport = req->peer_port;
2883	inc.inc_lport = th.th_dport = req->local_port;
2884	th.th_seq = req->rcv_isn;
2885	th.th_flags = TH_ACK;
2886
2887	inc.inc_isipv6 = 0;
2888	inc.inc_len = 0;
2889	inc.inc_faddr.s_addr = req->peer_ip;
2890	inc.inc_laddr.s_addr = req->local_ip;
2891
2892	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
2893	wsf  = G_TCPOPT_WSCALE_OK(opt);
2894	ts   = G_TCPOPT_TSTAMP(opt);
2895	sack = G_TCPOPT_SACK(opt);
2896
2897	to.to_mss = mss;
2898	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
2899	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
2900
2901	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
2902	    ntohl(req->local_ip), ntohs(req->local_port),
2903	    ntohl(req->peer_ip), ntohs(req->peer_port),
2904	    mss, wsf, ts, sack);
	return (syncache_expand(&inc, &to, &th, so, m));
2906}
2907
2908
2909/*
 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
 * if we are in TCPS_SYN_RECEIVED due to crossed SYNs.
2912 */
2913static int
2914do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2915{
2916	struct cpl_pass_establish *req = cplhdr(m);
2917	struct toepcb *toep = (struct toepcb *)ctx;
2918	struct tcpcb *tp;
2919	struct socket *so, *lso;
2920	struct t3c_data *td = T3C_DATA(cdev);
	struct toedev *tdev;

	/* Complete socket initialization now that we have the SND_ISN. */
2924
2925	so = lso = toeptoso(toep);
2926	tdev = toep->tp_toedev;
2927
2928	SOCK_LOCK(so);
2929	LIST_REMOVE(toep, synq_entry);
2930	SOCK_UNLOCK(so);
2931
2932	INP_INFO_WLOCK(&tcbinfo);
2933	if (!syncache_expand_establish_req(req, &so, toep)) {
2934		/*
2935		 * No entry
2936		 */
2937		UNIMPLEMENTED();
2938	}
2939	if (so == NULL) {
2940		/*
2941		 * Couldn't create the socket
2942		 */
2943		UNIMPLEMENTED();
2944	}
2945
2946	/*
2947	 * XXX workaround for lack of syncache drop
2948	 */
2949	toepcb_release(toep);
2950
2951	tp = sototcpcb(so);
2952	INP_LOCK(tp->t_inpcb);
2953#ifdef notyet
2954	so->so_snd.sb_flags |= SB_TOE;
2955	so->so_rcv.sb_flags |= SB_TOE;
2956#endif
2957	toep->tp_tp = tp;
2958	toep->tp_flags = 0;
2959	tp->t_toe = toep;
2960	reset_wr_list(toep);
2961	tp->rcv_wnd = select_rcv_wnd(so);
2962	DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
2963	install_offload_ops(so);
2964
2965	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
2966	toep->tp_wr_unacked = 0;
2967	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
2968	toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
2969	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
2970	toep->tp_qset_idx = 0;
2971	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
2972
2973	/*
2974	 * XXX Cancel any keep alive timer
2975	 */
2976
2977	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
2978	INP_INFO_WUNLOCK(&tcbinfo);
2979	INP_UNLOCK(tp->t_inpcb);
2980	soisconnected(so);
2981
2982#ifdef notyet
2983	/*
2984	 * XXX not sure how these checks map to us
2985	 */
2986	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
2987		sk->sk_state_change(sk);
2988		sk_wake_async(so, 0, POLL_OUT);
2989	}
2990	/*
2991	 * The state for the new connection is now up to date.
2992	 * Next check if we should add the connection to the parent's
2993	 * accept queue.  When the parent closes it resets connections
2994	 * on its SYN queue, so check if we are being reset.  If so we
2995	 * don't need to do anything more, the coming ABORT_RPL will
2996	 * destroy this socket.  Otherwise move the connection to the
2997	 * accept queue.
2998	 *
2999	 * Note that we reset the synq before closing the server so if
3000	 * we are not being reset the stid is still open.
3001	 */
3002	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3003		__kfree_skb(skb);
3004		goto unlock;
3005	}
3006#endif
3007	m_free(m);
3008
3009	return (0);
3010}
3011
3012/*
3013 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3014 * and send them to the TOE.
3015 */
3016static void
3017fixup_and_send_ofo(struct socket *so)
3018{
3019	struct mbuf *m;
3020	struct toedev *tdev = TOE_DEV(so);
3021	struct tcpcb *tp = sototcpcb(so);
3022	struct toepcb *toep = tp->t_toe;
3023	unsigned int tid = toep->tp_tid;
3024
3025	printf("fixup_and_send_ofo\n");
3026
3027	INP_LOCK_ASSERT(tp->t_inpcb);
3028	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3029		/*
3030		 * A variety of messages can be waiting but the fields we'll
3031		 * be touching are common to all so any message type will do.
3032		 */
3033		struct cpl_close_con_req *p = cplhdr(m);
3034
3035		p->wr.wr_lo = htonl(V_WR_TID(tid));
3036		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3037		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3038	}
3039}
3040
3041/*
3042 * Updates socket state from an active establish CPL message.  Runs with the
3043 * socket lock held.
3044 */
3045static void
3046socket_act_establish(struct socket *so, struct mbuf *m)
3047{
3048	struct cpl_act_establish *req = cplhdr(m);
3049	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3050	struct tcpcb *tp = sototcpcb(so);
3051	struct toepcb *toep = tp->t_toe;
3052
3053	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3054		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3055		    toep->tp_tid, tp->t_state);
3056
3057	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_nxt = rcv_isn;
3059	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3060
3061	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3062
3063	/*
3064	 * Now that we finally have a TID send any CPL messages that we had to
3065	 * defer for lack of a TID.
3066	 */
3067	if (mbufq_len(&toep->out_of_order_queue))
3068		fixup_and_send_ofo(so);
3069
3070	if (__predict_false(so->so_state & SS_NOFDREF)) {
3071#ifdef notyet
3072		/*
3073		 * XXX 	not clear what should be done here
3074		 * appears to correspond to sorwakeup_locked
3075		 */
3076		sk->sk_state_change(sk);
3077		sk_wake_async(so, 0, POLL_OUT);
3078#endif
3079	}
3080	m_free(m);
3081#ifdef notyet
3082/*
3083 * XXX assume no write requests permitted while socket connection is
3084 * incomplete
3085 */
3086	/*
3087	 * Currently the send queue must be empty at this point because the
3088	 * socket layer does not send anything before a connection is
3089	 * established.  To be future proof though we handle the possibility
3090	 * that there are pending buffers to send (either TX_DATA or
3091	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3092	 * buffers according to the just learned write_seq, and then we send
3093	 * them on their way.
3094	 */
3095	fixup_pending_writeq_buffers(sk);
3096	if (t3_push_frames(so, 1))
3097		sk->sk_write_space(sk);
3098#endif
3099
3100	soisconnected(so);
3101	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
3102	tcpstat.tcps_connects++;
3103
3104}
3105
3106/*
3107 * Process a CPL_ACT_ESTABLISH message.
3108 */
3109static int
3110do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3111{
3112	struct cpl_act_establish *req = cplhdr(m);
3113	unsigned int tid = GET_TID(req);
3114	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3115	struct toepcb *toep = (struct toepcb *)ctx;
3116	struct tcpcb *tp = toep->tp_tp;
3117	struct socket *so;
3118	struct toedev *tdev;
3119	struct tom_data *d;
3120
3121	if (tp == NULL) {
3122		free_atid(cdev, atid);
3123		return (0);
3124	}
3125
3126	so = toeptoso(toep);
3127	tdev = TOE_DEV(so); /* blow up here if link was down */
3128	d = TOM_DATA(tdev);
3129
3130	INP_LOCK(tp->t_inpcb);
3131
3132	/*
3133	 * It's OK if the TID is currently in use, the owning socket may have
3134	 * backlogged its last CPL message(s).  Just take it away.
3135	 */
3136	toep->tp_tid = tid;
3137	toep->tp_tp = tp;
3138	so_insert_tid(d, so, tid);
3139	free_atid(cdev, atid);
3140	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3141
3142	socket_act_establish(so, m);
3143	INP_UNLOCK(tp->t_inpcb);
3144	return (0);
3145}
3146
3147/*
3148 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3149 * next batch of work requests from the write queue.
3150 */
3151static void
3152wr_ack(struct toepcb *toep, struct mbuf *m)
3153{
3154	struct tcpcb *tp = toep->tp_tp;
3155	struct cpl_wr_ack *hdr = cplhdr(m);
3156	struct socket *so = toeptoso(toep);
3157	unsigned int credits = ntohs(hdr->credits);
3158	u32 snd_una = ntohl(hdr->snd_una);
3159	int bytes = 0;
3160
3161	DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);
3162
3163	INP_LOCK(tp->t_inpcb);
3164
3165	toep->tp_wr_avail += credits;
3166	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3167		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3168
3169	while (credits) {
		struct mbuf *p = peek_wr(toep);

		if (__predict_false(!p)) {
			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u\n",
			    credits, toep->tp_tid, tp->t_state);
			break;
		}
		DPRINTF("p->credits=%d p->bytes=%d\n",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3179		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3180#if DEBUG_WR > 1
3181			struct tx_data_wr *w = cplhdr(p);
3182#ifdef notyet
3183			log(LOG_ERR,
3184			       "TID %u got %u WR credits, need %u, len %u, "
3185			       "main body %u, frags %u, seq # %u, ACK una %u,"
3186			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3187			       toep->tp_tid, credits, p->csum, p->len,
3188			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3189			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3190			       WR_AVAIL(tp), count_pending_wrs(tp) - credits);
3191#endif
3192#endif
3193			p->m_pkthdr.csum_data -= credits;
3194			break;
3195		} else {
3196			dequeue_wr(toep);
3197			credits -= p->m_pkthdr.csum_data;
3198			bytes += p->m_pkthdr.len;
3199			DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
3200
3201			m_free(p);
3202		}
3203	}
3204
3205#if DEBUG_WR
3206	check_wr_invariants(tp);
3207#endif
3208
3209	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3210#if VALIDATE_SEQ
3211		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3212
		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", (&d->tdev)->tod_name, snd_una,
3215		    toep->tp_tid, tp->snd_una);
3216#endif
3217		goto out_free;
3218	}
3219
3220	if (tp->snd_una != snd_una) {
3221		tp->snd_una = snd_una;
3222		tp->ts_recent_age = ticks;
3223#ifdef notyet
3224		/*
3225		 * Keep ARP entry "minty fresh"
3226		 */
3227		dst_confirm(sk->sk_dst_cache);
3228#endif
3229		if (tp->snd_una == tp->snd_nxt)
3230			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3231	}
3232	if (bytes) {
3233		DPRINTF("sbdrop(%d)\n", bytes);
3234		SOCKBUF_LOCK(&so->so_snd);
3235		sbdrop_locked(&so->so_snd, bytes);
3236		sowwakeup_locked(so);
3237	}
3238
3239	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3240		t3_push_frames(so, 0);
3241
3242out_free:
3243	INP_UNLOCK(tp->t_inpcb);
3244	m_free(m);
3245}
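
/*
 * Worked example for the partial-ACK case above: a pending WR that
 * cost 3 credits, acknowledged by a WR_ACK carrying only 2, stays at
 * the head of the list with m_pkthdr.csum_data reduced to 1; its bytes
 * are only dropped from so_snd once the WR is fully acknowledged.
 */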
3246
3247/*
3248 * Handler for TX_DATA_ACK CPL messages.
3249 */
3250static int
3251do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3252{
3253	struct toepcb *toep = (struct toepcb *)ctx;
3254
3255	DPRINTF("do_wr_ack\n");
3256	dump_toepcb(toep);
3257
3258	VALIDATE_SOCK(so);
3259
3260	wr_ack(toep, m);
	return (0);
3262}
3263
3264
3265/*
3266 * Reset a connection that is on a listener's SYN queue or accept queue,
3267 * i.e., one that has not had a struct socket associated with it.
3268 * Must be called from process context.
3269 *
3270 * Modeled after code in inet_csk_listen_stop().
3271 */
3272static void
3273t3_reset_listen_child(struct socket *child)
3274{
3275	struct tcpcb *tp = sototcpcb(child);
3276
3277	t3_send_reset(tp->t_toe);
3278}
3279
3280/*
3281 * Disconnect offloaded established but not yet accepted connections sitting
3282 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
3283 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3284 */
3285void
3286t3_disconnect_acceptq(struct socket *listen_so)
3287{
3288	struct socket *so;
3289	struct tcpcb *tp;
3290
3291	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3292		tp = sototcpcb(so);
3293
3294		if (tp->t_flags & TF_TOE) {
3295			INP_LOCK(tp->t_inpcb);
3296			t3_reset_listen_child(so);
3297			INP_UNLOCK(tp->t_inpcb);
3298		}
3299
3300	}
3301}
3302
3303/*
3304 * Reset offloaded connections sitting on a server's syn queue.  As above
3305 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3306 */
3307
3308void
3309t3_reset_synq(struct listen_ctx *lctx)
3310{
3311	struct toepcb *toep;
3312
3313	SOCK_LOCK(lctx->lso);
3314	while (!LIST_EMPTY(&lctx->synq_head)) {
3315		toep = LIST_FIRST(&lctx->synq_head);
3316		LIST_REMOVE(toep, synq_entry);
3317		toep->tp_tp = NULL;
3318		t3_send_reset(toep);
3319		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3320		toepcb_release(toep);
3321	}
3322	SOCK_UNLOCK(lctx->lso);
3323}
3324
3325void
3326t3_init_wr_tab(unsigned int wr_len)
3327{
3328	int i;
3329
3330	if (mbuf_wrs[1])     /* already initialized */
3331		return;
3332
3333	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
3334		int sgl_len = (3 * i) / 2 + (i & 1);
3335
3336		sgl_len += 3;
3337		mbuf_wrs[i] = sgl_len <= wr_len ?
3338		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
3339	}
3340
3341	wrlen = wr_len * 8;
3342}
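
/*
 * Worked example (hypothetical wr_len of 16): a chain of 10 buffers
 * needs sgl_len = (3 * 10) / 2 + (10 & 1) + 3 = 18 units of work
 * request space.  Since 18 > 16 it does not fit in a single WR and is
 * split into 1 + (18 - 2) / (16 - 1) = 2 work requests.
 */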
3343
3344int
3345t3_init_cpl_io(void)
3346{
3347#ifdef notyet
3348	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
3349	if (!tcphdr_skb) {
3350		log(LOG_ERR,
3351		       "Chelsio TCP offload: can't allocate sk_buff\n");
3352		return -1;
3353	}
3354	skb_put(tcphdr_skb, sizeof(struct tcphdr));
3355	tcphdr_skb->h.raw = tcphdr_skb->data;
3356	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
3357#endif
3358
3359
3360	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
3361	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
3362	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
3363	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
3364	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
3365	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
3366	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
3367	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
3368	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
3369	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
3370	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
3371	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
3372#ifdef notyet
3373	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
3374	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
3375	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
3376#endif
3377	return (0);
3378}
3379
3380