/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 174641 2007-12-16 05:27:26Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_ofld.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
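/*
 * Example (hypothetical numbers): a ULP segment sent with submode 1 carries
 * a 4-byte HW-generated digest, so sequence-space bookkeeping on the Tx side
 * would advance by the payload length plus the compensation, i.e.
 *
 *	tx_seq += len + t3_ulp_extra_len[ulp_submode];
 */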

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf depends on the number of page
 * fragments in it and whether it has any payload in its main body.  This
 * maps the length of the gather list represented by an mbuf into the # of
 * necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
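
/*
 * SO_TOS extracts the TOS/DSCP bits from the inpcb: the >> 2 drops the two
 * low-order ECN bits of the IP TOS byte and M_TOS masks the result down to
 * the width of the TOS field in the CPL options.
 */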
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
{
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(T3C_DEV(so), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(T3C_DEV(so), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct socket *so)
{
	return (cntrl);
}

209 * Populate a TID_RELEASE WR.  The skb must be already propely sized.
210 */
211static inline void
212mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
213{
214	struct cpl_tid_release *req;
215
216	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
217	m->m_pkthdr.len = m->m_len = sizeof(*req);
218	req = mtod(m, struct cpl_tid_release *);
219	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
220	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
221}
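
/*
 * A typical caller (a sketch, not a function in this file) allocates an mbuf
 * of sizeof(struct cpl_tid_release), runs mk_tid_release() on it, and hands
 * the result to cxgb_ofld_send(); the WR carries no payload beyond the CPL
 * header built above.
 */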

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
				    V_TX_CPU_IDX(toep->tp_qset));

		/*
		 * The send buffer size is given to HW in units of 32KB,
		 * hence the >> 15 below.
		 */
		if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp = segs;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");
		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);

	SOCKBUF_LOCK(&so->so_snd);

	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
		    && (tail != NULL) && (count < TX_MAX_SEGS)) {
			bytes += tail->m_len;
			count++;
			last = tail;
			/*
			 * technically an abuse to be using this for a VA
			 * but less gross than defining my own structure
			 * or calling pmap_kextract from here :-|
			 */
			segp->ds_addr = (bus_addr_t)tail->m_data;
			segp->ds_len = tail->m_len;
			DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
			    count, mbuf_wrs[count], tail->m_data, tail->m_len);

			segp++;
			tail = tail->m_next;
		}
		DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;

		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * XXX can drop socket buffer lock here
		 */

		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		make_tx_data_wr(so, m0, bytes, tail);
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
		m_set_sgl(m0, segs);
		m_set_sgllen(m0, count);
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);

		l2t_send(cdev, m0, toep->tp_l2t);
		if (toep->tp_wr_avail && (tail != NULL))
			SOCKBUF_LOCK(&so->so_snd);
	}

	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
	return (total_bytes);
}
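
/*
 * Sketch of the accounting above: a pass that gathers `count' segments
 * consumes mbuf_wrs[count] WR credits.  That value is stashed in
 * m_pkthdr.csum_data ("remember credits used") so completion processing can
 * return exactly that many credits when the WR is acked; F_WR_COMPL is
 * requested either when the caller asks for a completion on the only
 * outstanding WR or once half of tp_wr_max is unacked.
 */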

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 */
	cxgb_ofld_send(d->cdev, m);
}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2)))
		return;
	INP_LOCK_ASSERT(tp->t_inpcb);

	so = tp->t_inpcb->inp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
	toep->tp_copied_seq += read;
	toep->tp_enqueued_bytes -= read;
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax)
		printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (toep->tp_ulp_mode)
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
	else {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	}

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
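
/*
 * Worked example, assuming a 256KB receive window and an rx_credit_thres
 * tunable of 32KB: after the application consumes 64KB, credits = 65536 >=
 * thres, so an RX_DATA_ACK returns the space to HW and tp_rcv_wup catches up
 * to tp_copied_seq.  Independently, must_send forces a return whenever the
 * un-returned credits leave less than 16KB of the window open.
 */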

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = tp->t_inpcb->inp_socket;
	close_conn(so);
	return (0);
}

static int
cxgb_toe_abort(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = tp->t_inpcb->inp_socket;
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
	INP_LOCK_ASSERT(tp->t_inpcb);
	t3_cleanup_rbuf(tp);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;
	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	INP_INFO_WLOCK(&tcbinfo);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
	INP_INFO_WUNLOCK(&tcbinfo);
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_abort = cxgb_toe_abort,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	send_or_defer(so, tp, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
		return;

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(so, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos,
	    (uint64_t)val << bit_pos);
}
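
/*
 * Example: set_tcb_tflag(so, S_TF_KEEPALIVE, 1) writes mask and value
 * 1ULL << S_TF_KEEPALIVE into TCB word W_TCB_T_FLAGS1, which is exactly how
 * t3_set_keepalive() below enables HW keepalive probes for a connection.
 */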

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}

/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partially placed
 * state, which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on)
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
}

void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef notyet
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return (0);
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return (0);
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);
	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}
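
/*
 * Example: given a hypothetical MTU table {1500, 4420, 9000},
 * find_best_mtu(d, 4500) returns index 1 (4420), the largest entry that
 * does not exceed the target.
 */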

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

void
t3_release_ddp_resources(struct toepcb *toep)
{
	/*
	 * This is a no-op until we have DDP support
	 */
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	printf("setting toep->tp_tp to NULL\n");

	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale)
			;
	return (wscale);
}
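
/*
 * Example: a 256KB target window yields wscale 3, since 262144 must be
 * halved three times before it fits the 16-bit TCP window field.
 */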

/*
 * Determine the receive window size for a socket.
 */
static unsigned int
select_rcv_wnd(struct socket *so)
{
	struct toedev *dev = TOE_DEV(so);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = sbspace(&so->so_rcv);

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return (min(wnd, max_rcv_wnd));
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
#ifdef notyet
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_rcv.sb_flags |= SB_TOE;
	so->so_snd.sb_flags |= SB_TOE;
#endif
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 */
	tp->rcv_wnd = select_rcv_wnd(so);
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	DPRINTF("connect smt_idx=%d\n", e->smt_idx);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif

/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */
	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	m = m_gethdr(M_WAITOK, MT_DATA);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	printf("sending off request\n");

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return (0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef notyet
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		INP_LOCK(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);

		if (oldval != tp->t_flags)
			t3_set_nagle(so);
	}

	return (0);
}

static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	int len = be16toh(hdr->len);

	INP_LOCK(tp->t_inpcb);

#ifdef notyet
	if (__predict_false(sk_no_receive(sk))) {
		handle_excess_rx(so, skb);
		return;
	}

	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
		handle_ddp_data(so, skb);

	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
	TCP_SKB_CB(skb)->flags = 0;
	skb_ulp_mode(skb) = 0;                    /* for iSCSI */
#endif
#if VALIDATE_SEQ
	if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
		printk(KERN_ERR
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		       TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
		       tp->rcv_nxt);
		__kfree_skb(skb);
		return;
	}
#endif
	m_adj(m, sizeof(*hdr));

#ifdef notyet
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
	T3_TRACE2(TIDTB(sk),
		  "new_rx_data: seq 0x%x len %u",
		  TCP_SKB_CB(skb)->seq, skb->len);
#endif
	SOCKBUF_LOCK(&so->so_rcv);
	if (sb_notify(&so->so_rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);

	sbappend_locked(&so->so_rcv, m);
	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,
	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));

	INP_UNLOCK(tp->t_inpcb);
	DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);

	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	tp = sototcpcb(so);
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store old rcv_next
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, so we need to be very careful that nothing from now
	 * on interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in 'when'.  Assumes the buffer offset starts at 0, consumer needs
	 * to account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(so, m);
	return (0);
}

static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	when = bsp->cur_offset;
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));
#endif
#if 0
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);

	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);

#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}

#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
		socantrcvmore(so);
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);

		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	do_peer_fin(so, m);
	return (0);
}

static void
process_close_con_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct toepcb *toep = tp->t_toe;

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
	    !!(so->so_state & SS_NOFDREF));
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
		goto out;

	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
		break;
	case TCPS_FIN_WAIT_1:
#ifdef notyet
		dst_confirm(sk->sk_dst_cache);
#endif
		soisdisconnecting(so);

		if ((so->so_state & SS_NOFDREF) == 0) {
			/*
			 * Wake up lingering close
			 */
			sowwakeup(so);
			sorwakeup(so);
			wakeup(&so->so_timeo);
		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			tp = tcp_drop(tp, 0);
		}

		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid,
		       tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
out:
	m_free(m);
}
1976
1977/*
1978 * Handler for CLOSE_CON_RPL CPL messages.
1979 */
1980static int
1981do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
1982			    void *ctx)
1983{
1984	struct toepcb *toep = (struct toepcb *)ctx;
1985	struct socket *so = toeptoso(toep);
1986
1987	VALIDATE_SOCK(so);
1988
1989	process_close_con_rpl(so, m);
1990	return (0);
1991}
1992
1993/*
1994 * Process abort replies.  We only process these messages if we anticipate
1995 * them as the coordination between SW and HW in this area is somewhat lacking
1996 * and sometimes we get ABORT_RPLs after we are done with the connection that
1997 * originated the ABORT_REQ.
1998 */
1999static void
2000process_abort_rpl(struct socket *so, struct mbuf *m)
2001{
2002	struct tcpcb *tp = sototcpcb(so);
2003	struct toepcb *toep = tp->t_toe;
2004
2005#ifdef T3_TRACE
2006	T3_TRACE1(TIDTB(sk),
2007		  "process_abort_rpl: GTS rpl pending %d",
2008		  sock_flag(sk, ABORT_RPL_PENDING));
2009#endif
	/*
	 * Take the pcbinfo lock before the inpcb lock: tcp_close() below
	 * requires it, and acquiring it after INP_LOCK would both recurse
	 * on the inpcb lock and invert the lock order.
	 */
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		/*
		 * XXX panic on tcpdrop
		 */
		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
			toep->tp_flags |= TP_ABORT_RPL_RCVD;
		else {
			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
			    !is_t3a(TOE_DEV(so))) {
				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
					panic("TP_ABORT_REQ_RCVD set");
				t3_release_offload_resources(toep);
				tp = tcp_close(tp);
			}
		}
	}
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
	INP_INFO_WUNLOCK(&tcbinfo);
2034
2035	m_free(m);
2036}
2037
2038/*
2039 * Handle an ABORT_RPL_RSS CPL message.
2040 */
2041static int
2042do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2043{
2044	struct socket *so;
2045	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2046	struct toepcb *toep;
2047
2048	/*
2049	 * Ignore replies to post-close aborts indicating that the abort was
2050	 * requested too late.  These connections are terminated when we get
2051	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2052	 * arrives the TID is either no longer used or it has been recycled.
2053	 */
2054	if (rpl->status == CPL_ERR_ABORT_FAILED) {
2055discard:
2056		m_free(m);
2057		return (0);
2058	}
2059
2060	toep = (struct toepcb *)ctx;
2061
	/*
	 * Sometimes we've already closed the socket, e.g., a post-close
	 * abort races with ABORT_REQ_RSS: the latter frees the socket
	 * expecting the ABORT_REQ to fail with CPL_ERR_ABORT_FAILED, but
	 * FW turns the ABORT_REQ into a regular abort and so we get an
	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
	 */
2069	if (!toep)
2070		goto discard;
2071
	if (toep->tp_tp == NULL) {
		DPRINTF("removing tid %u for abort\n", toep->tp_tid);
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		toepcb_release(toep);
		goto discard;
	}

	DPRINTF("toep=%p tp=%p\n", toep, toep->tp_tp);
2084
2085	so = toeptoso(toep); /* <- XXX panic */
2086	toepcb_hold(toep);
2087	process_abort_rpl(so, m);
2088	toepcb_release(toep);
2089	return (0);
2090}
2091
/*
 * Convert the status code of an ABORT_REQ into an errno value.  Also
 * indicate whether an RST should be sent in response.
 */
2096static int
2097abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2098{
2099	struct tcpcb *tp = sototcpcb(so);
2100
2101	switch (abort_reason) {
2102	case CPL_ERR_BAD_SYN:
2103#if 0
2104		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
2105#endif
2106	case CPL_ERR_CONN_RESET:
2107		// XXX need to handle SYN_RECV due to crossed SYNs
2108		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2109	case CPL_ERR_XMIT_TIMEDOUT:
2110	case CPL_ERR_PERSIST_TIMEDOUT:
2111	case CPL_ERR_FINWAIT2_TIMEDOUT:
2112	case CPL_ERR_KEEPALIVE_TIMEDOUT:
2113#if 0
2114		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2115#endif
2116		return (ETIMEDOUT);
2117	default:
2118		return (EIO);
2119	}
2120}
2121
2122static inline void
2123set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2124{
2125	struct cpl_abort_rpl *rpl = cplhdr(m);
2126
2127	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2128	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2129	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2130
2131	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2132	rpl->cmd = cmd;
2133}
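
/*
 * A worked example of the header packing above (illustrative only,
 * assuming the usual T3 layout from cxgb_t3_cpl.h where the 8-bit CPL
 * opcode occupies bits 31:24 of opcode_tid and the TID the low 24 bits):
 *
 *	MK_OPCODE_TID(0x10, 5) == (0x10 << 24) | 5 == 0x10000005
 *
 * After the htonl() the opcode is the first byte on the wire, which is
 * where HW looks for it when demultiplexing CPL messages.
 */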
2134
2135static void
2136send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2137{
2138	struct mbuf *reply_mbuf;
2139	struct cpl_abort_req_rss *req = cplhdr(m);
2140
	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2144	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2145	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2146	m_free(m);
2147}
2148
2149/*
2150 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2151 */
2152static inline int
2153is_neg_adv_abort(unsigned int status)
2154{
2155	return status == CPL_ERR_RTX_NEG_ADVICE ||
2156	    status == CPL_ERR_PERSIST_NEG_ADVICE;
2157}
2158
2159static void
2160send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2161{
2162	struct mbuf  *reply_mbuf;
2163	struct cpl_abort_req_rss *req = cplhdr(m);
2164
2165	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2166
2167	if (!reply_mbuf) {
		/* Defer the reply.  Stash rst_status in req->status for the deferred handler. */
2169		req->status = rst_status;
2170		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2171		return;
2172	}
2173
2174	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2175	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2176	m_free(m);
2177
2178	/*
2179	 * XXX need to sync with ARP as for SYN_RECV connections we can send
2180	 * these messages while ARP is pending.  For other connection states
2181	 * it's not a problem.
2182	 */
2183	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2184}
2185
2186#ifdef notyet
2187static void
2188cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2189{
2190	UNIMPLEMENTED();
2191#ifdef notyet
2192	struct request_sock *req = child->sk_user_data;
2193
2194	inet_csk_reqsk_queue_removed(parent, req);
2195	synq_remove(tcp_sk(child));
2196	__reqsk_free(req);
2197	child->sk_user_data = NULL;
2198#endif
2199}
2200
2201
2202/*
2203 * Performs the actual work to abort a SYN_RECV connection.
2204 */
2205static void
2206do_abort_syn_rcv(struct socket *child, struct socket *parent)
2207{
2208	struct tcpcb *parenttp = sototcpcb(parent);
2209	struct tcpcb *childtp = sototcpcb(child);
2210
	/*
	 * If the server is still open we clean up the child connection;
	 * otherwise the server already did the cleanup while purging its
	 * SYN queue, and the mbuf was just sitting in its backlog.
	 */
2216	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2217		cleanup_syn_rcv_conn(child, parent);
2218		INP_INFO_WLOCK(&tcbinfo);
2219		INP_LOCK(childtp->t_inpcb);
2220		t3_release_offload_resources(childtp->t_toe);
2221		childtp = tcp_close(childtp);
2222		INP_INFO_WUNLOCK(&tcbinfo);
2223		if (childtp)
2224			INP_UNLOCK(childtp->t_inpcb);
2225	}
2226}
2227#endif
2228
2229/*
2230 * Handle abort requests for a SYN_RECV connection.  These need extra work
2231 * because the socket is on its parent's SYN queue.
2232 */
2233static int
2234abort_syn_rcv(struct socket *so, struct mbuf *m)
2235{
2236	UNIMPLEMENTED();
2237#ifdef notyet
2238	struct socket *parent;
2239	struct toedev *tdev = TOE_DEV(so);
2240	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2241	struct socket *oreq = so->so_incomp;
2242	struct t3c_tid_entry *t3c_stid;
2243	struct tid_info *t;
2244
2245	if (!oreq)
2246		return -1;        /* somehow we are not on the SYN queue */
2247
2248	t = &(T3C_DATA(cdev))->tid_maps;
2249	t3c_stid = lookup_stid(t, oreq->ts_recent);
2250	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2251
2252	SOCK_LOCK(parent);
2253	do_abort_syn_rcv(so, parent);
2254	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2255	SOCK_UNLOCK(parent);
2256#endif
2257	return (0);
2258}
2259
2260/*
2261 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2262 * request except that we need to reply to it.
2263 */
2264static void
2265process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2266{
2267	int rst_status = CPL_ABORT_NO_RST;
2268	const struct cpl_abort_req_rss *req = cplhdr(m);
2269	struct tcpcb *tp = sototcpcb(so);
2270	struct toepcb *toep = tp->t_toe;
2271
2272	INP_LOCK(tp->t_inpcb);
2273	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2274		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2275		m_free(m);
2276		goto skip;
2277	}
2278
2279	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2280	/*
2281	 * Three cases to consider:
2282	 * a) We haven't sent an abort_req; close the connection.
2283	 * b) We have sent a post-close abort_req that will get to TP too late
2284	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
2285	 *    be ignored and the connection should be closed now.
2286	 * c) We have sent a regular abort_req that will get to TP too late.
2287	 *    That will generate an abort_rpl with status 0, wait for it.
2288	 */
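	/*
	 * The same cases as a small decision table (informational only,
	 * derived from the checks below):
	 *
	 *	TP_ABORT_RPL_PENDING	T3A && TP_CLOSE_CON_REQUESTED	action
	 *	--------------------	-----------------------------	-------------------
	 *	clear			-				close now (case a)
	 *	set			true				close now (case b)
	 *	set			false				wait for rpl (case c)
	 */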
2289	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2290	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2291		so->so_error = abort_status_to_errno(so, req->status,
2292		    &rst_status);
2293#if 0
2294		if (!sock_flag(sk, SOCK_DEAD))
2295			sk->sk_error_report(sk);
2296#endif
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
		 */
2301		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2302			goto skip;
2303
2304		t3_release_offload_resources(toep);
2305		tp = tcp_close(tp);
2306	}
2307	if (tp)
2308		INP_UNLOCK(tp->t_inpcb);
2309	send_abort_rpl(m, tdev, rst_status);
2310	return;
2311
2312skip:
2313	INP_UNLOCK(tp->t_inpcb);
2314}
2315
2316/*
2317 * Handle an ABORT_REQ_RSS CPL message.
2318 */
2319static int
2320do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2321{
2322	const struct cpl_abort_req_rss *req = cplhdr(m);
2323	struct toepcb *toep = (struct toepcb *)ctx;
2324	struct socket *so;
2325	struct inpcb *inp;
2326
2327	if (is_neg_adv_abort(req->status)) {
2328		m_free(m);
2329		return (0);
2330	}
2331
	DPRINTF("aborting tid=%d\n", toep->tp_tid);
2333
2334	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2335		cxgb_remove_tid(cdev, toep, toep->tp_tid);
2336		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		DPRINTF("sending abort rpl for tid %u\n", toep->tp_tid);
		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2341		if (toep->tp_l2t)
2342			l2t_release(L2DATA(cdev), toep->tp_l2t);
2343
2344		/*
2345		 *  Unhook
2346		 */
2347		toep->tp_tp->t_toe = NULL;
2348		toep->tp_tp->t_flags &= ~TF_TOE;
2349		toep->tp_tp = NULL;
2350		/*
2351		 * XXX need to call syncache_chkrst - but we don't
2352		 * have a way of doing that yet
2353		 */
2354		toepcb_release(toep);
		DPRINTF("abort for unestablished connection\n");
2356		return (0);
2357	}
2358	if (toep->tp_tp == NULL) {
		DPRINTF("disconnected toepcb\n");
2360		/* should be freed momentarily */
2361		return (0);
2362	}
2363
2364	so = toeptoso(toep);
2365	inp = sotoinpcb(so);
2366
2367	VALIDATE_SOCK(so);
2368	toepcb_hold(toep);
2369	INP_INFO_WLOCK(&tcbinfo);
2370	process_abort_req(so, m, TOE_DEV(so));
2371	INP_INFO_WUNLOCK(&tcbinfo);
2372	toepcb_release(toep);
2373	return (0);
2374}
2375#ifdef notyet
2376static void
2377pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2378{
2379	struct toedev *tdev = TOE_DEV(parent);
2380
2381	do_abort_syn_rcv(child, parent);
2382	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2383		struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2384
2385		rpl->opt0h = htonl(F_TCAM_BYPASS);
2386		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2387		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2388	} else
2389		m_free(m);
2390}
2391#endif
2392static void
2393handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
2394{
2395	UNIMPLEMENTED();
2396
2397#ifdef notyet
2398	struct t3cdev *cdev;
2399	struct socket *parent;
2400	struct socket *oreq;
2401	struct t3c_tid_entry *t3c_stid;
2402	struct tid_info *t;
2403	struct tcpcb *otp, *tp = sototcpcb(so);
2404	struct toepcb *toep = tp->t_toe;
2405
2406	/*
2407	 * If the connection is being aborted due to the parent listening
2408	 * socket going away there's nothing to do, the ABORT_REQ will close
2409	 * the connection.
2410	 */
2411	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2412		m_free(m);
2413		return;
2414	}
2415
2416	oreq = so->so_incomp;
2417	otp = sototcpcb(oreq);
2418
2419	cdev = T3C_DEV(so);
2420	t = &(T3C_DATA(cdev))->tid_maps;
2421	t3c_stid = lookup_stid(t, otp->ts_recent);
2422	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2423
2424	SOCK_LOCK(parent);
2425	pass_open_abort(so, parent, m);
2426	SOCK_UNLOCK(parent);
2427#endif
2428}
2429
2430/*
2431 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
2432 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
2433 * connection.
2434 */
2435static void
2436pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
2437{
2438
2439#ifdef notyet
2440	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2441	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
2442#endif
2443	handle_pass_open_arp_failure(m_get_socket(m), m);
2444}
2445
2446/*
2447 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
2448 */
2449static void
2450mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
2451{
2452	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
2453	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
2454	unsigned int tid = GET_TID(req);
2455
2456	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
2457	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
2458	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
2459	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
2460	rpl->opt0h = htonl(F_TCAM_BYPASS);
2461	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2462	rpl->opt2 = 0;
2463	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
2464}
2465
2466/*
2467 * Send a deferred reject to an accept request.
2468 */
2469static void
2470reject_pass_request(struct toedev *tdev, struct mbuf *m)
2471{
2472	struct mbuf *reply_mbuf;
2473
2474	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
2475	mk_pass_accept_rpl(reply_mbuf, m);
2476	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2477	m_free(m);
2478}
2479
2480static void
2481handle_syncache_event(int event, void *arg)
2482{
2483	struct toepcb *toep = arg;
2484
2485	switch (event) {
2486	case SC_ENTRY_PRESENT:
2487		/*
2488		 * entry already exists - free toepcb
2489		 * and l2t
2490		 */
		DPRINTF("syncache entry present\n");
2492		toepcb_release(toep);
2493		break;
2494	case SC_DROP:
2495		/*
2496		 * The syncache has given up on this entry
2497		 * either it timed out, or it was evicted
2498		 * we need to explicitly release the tid
2499		 */
		DPRINTF("syncache entry dropped\n");
2501		toepcb_release(toep);
2502		break;
2503	default:
2504		log(LOG_ERR, "unknown syncache event %d\n", event);
2505		break;
2506	}
2507}
2508
2509static void
2510syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
2511{
2512	struct in_conninfo inc;
2513	struct tcpopt to;
2514	struct tcphdr th;
2515	struct inpcb *inp;
2516	int mss, wsf, sack, ts;
2517
2518	bzero(&to, sizeof(struct tcpopt));
2519	inp = sotoinpcb(lso);
2520
2521	/*
2522	 * Fill out information for entering us into the syncache
2523	 */
2524	inc.inc_fport = th.th_sport = req->peer_port;
2525	inc.inc_lport = th.th_dport = req->local_port;
2526	toep->tp_iss = th.th_seq = req->rcv_isn;
2527	th.th_flags = TH_SYN;
2528
2529	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
2530
2531	inc.inc_isipv6 = 0;
2532	inc.inc_len = 0;
2533	inc.inc_faddr.s_addr = req->peer_ip;
2534	inc.inc_laddr.s_addr = req->local_ip;
2535
2536	DPRINTF("syncache add of %d:%d %d:%d\n",
2537	    ntohl(req->local_ip), ntohs(req->local_port),
2538	    ntohl(req->peer_ip), ntohs(req->peer_port));
2539
2540	mss = req->tcp_options.mss;
2541	wsf = req->tcp_options.wsf;
2542	ts = req->tcp_options.tstamp;
2543	sack = req->tcp_options.sack;
2544	to.to_mss = mss;
2545	to.to_wscale = wsf;
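	/* Record only the options the peer's SYN actually carried. */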
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) |
	    (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
2547
2548	INP_INFO_WLOCK(&tcbinfo);
2549	INP_LOCK(inp);
2550	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
2551}
2552
2553
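/*
 * For orientation, the passive-open exchange handled below, as implemented
 * by this file's handlers (a summary, not a normative protocol description):
 *
 *	peer SYN  -> HW -> CPL_PASS_ACCEPT_REQ   (do_pass_accept_req)
 *	  host: allocate toepcb + L2T entry, enter the syncache, reply
 *	        with CPL_PASS_ACCEPT_RPL (accept or reject)
 *	peer ACK  -> HW -> CPL_PASS_ESTABLISH    (do_pass_establish)
 *	  host: expand the syncache entry into a full socket and finish
 *	        initialization in make_established()
 */
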
/*
 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
 * lock held.  Note that the socket here is a listening socket that is not
 * owned by the TOE.
 */
2559static void
2560process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
2561    struct listen_ctx *lctx)
2562{
2563	int rt_flags;
2564	struct l2t_entry *e;
2565	struct iff_mac tim;
2566	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
2567	struct cpl_pass_accept_rpl *rpl;
2568	struct cpl_pass_accept_req *req = cplhdr(m);
2569	unsigned int tid = GET_TID(req);
2570	struct tom_data *d = TOM_DATA(tdev);
2571	struct t3cdev *cdev = d->cdev;
2572	struct tcpcb *tp = sototcpcb(so);
2573	struct toepcb *newtoep;
2574	struct rtentry *dst;
2575	struct sockaddr_in nam;
2576	struct t3c_data *td = T3C_DATA(cdev);
2577
2578	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2579	if (__predict_false(reply_mbuf == NULL)) {
2580		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
2581			t3_defer_reply(m, tdev, reject_pass_request);
2582		else {
2583			cxgb_queue_tid_release(cdev, tid);
2584			m_free(m);
2585		}
2586		DPRINTF("failed to get reply_mbuf\n");
2587
2588		goto out;
2589	}
2590
2591	if (tp->t_state != TCPS_LISTEN) {
2592		DPRINTF("socket not in listen state\n");
2593
2594		goto reject;
2595	}
2596
2597	tim.mac_addr = req->dst_mac;
2598	tim.vlan_tag = ntohs(req->vlan_tag);
2599	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
2600		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
2601		goto reject;
2602	}
2603
2604#ifdef notyet
2605	/*
2606	 * XXX do route lookup to confirm that we're still listening on this
2607	 * address
2608	 */
2609	if (ip_route_input(skb, req->local_ip, req->peer_ip,
2610			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
2611		goto reject;
2612	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
2613		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
2614	dst_release(skb->dst);	// done with the input route, release it
2615	skb->dst = NULL;
2616
2617	if ((rt_flags & RTF_LOCAL) == 0)
2618		goto reject;
2619#endif
2620	/*
2621	 * XXX
2622	 */
2623	rt_flags = RTF_LOCAL;
2624	if ((rt_flags & RTF_LOCAL) == 0)
2625		goto reject;
2626
2627	/*
2628	 * Calculate values and add to syncache
2629	 */
2630
2631	newtoep = toepcb_alloc();
2632	if (newtoep == NULL)
2633		goto reject;
2634
2635	bzero(&nam, sizeof(struct sockaddr_in));
2636
2637	nam.sin_len = sizeof(struct sockaddr_in);
2638	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr = req->peer_ip;
2640	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
2641
	if (dst == NULL) {
		DPRINTF("failed to find route\n");
		goto reject;
	}
2646	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
2647	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
		/* reject rather than dereference a NULL L2T entry below */
		goto reject;
	}
2651	/*
2652	 * Point to our listen socket until accept
2653	 */
2654	newtoep->tp_tp = tp;
2655	newtoep->tp_flags = TP_SYN_RCVD;
2656	newtoep->tp_tid = tid;
2657	newtoep->tp_toedev = tdev;
2658
	DPRINTF("inserting tid=%d\n", tid);
2660	cxgb_insert_tid(cdev, d->client, newtoep, tid);
2661	SOCK_LOCK(so);
2662	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
2663	SOCK_UNLOCK(so);
2664
2665
2666	if (lctx->ulp_mode) {
2667		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2668
2669		if (!ddp_mbuf)
2670			newtoep->tp_ulp_mode = 0;
2671		else
2672			newtoep->tp_ulp_mode = lctx->ulp_mode;
2673	}
2674
2675	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
2676
2677	DPRINTF("adding request to syn cache\n");
2678
2679	/*
2680	 * XXX workaround for lack of syncache drop
2681	 */
2682	toepcb_hold(newtoep);
2683	syncache_add_accept_req(req, so, newtoep);
2684
2685
2686
2687	rpl = cplhdr(reply_mbuf);
2688	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
2689	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
2690	rpl->wr.wr_lo = 0;
2691	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
2692	rpl->opt2 = htonl(calc_opt2(so, tdev));
2693	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
2694	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten
2695
2696	DPRINTF("accept smt_idx=%d\n", e->smt_idx);
2697
2698	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
2699	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
2700	rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
2701				  CPL_PASS_OPEN_ACCEPT);
2702
2703	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
2704
2705	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
2706
2707#ifdef DEBUG_PRINT
2708	{
2709		int i;
2710
2711		DPRINTF("rpl:\n");
2712		uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
2713
2714		for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
2715			DPRINTF("[%d] %08x\n", i, rplbuf[i]);
2716	}
2717#endif
2718
2719
2720	l2t_send(cdev, reply_mbuf, e);
2721	m_free(m);
2722#ifdef notyet
2723	/*
2724	 * XXX this call path has to be converted to not depend on sockets
2725	 */
2726	if (newtoep->tp_ulp_mode)
2727		__set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
2728				V_TF_DDP_OFF(1) |
2729				TP_DDP_TIMER_WORKAROUND_MASK,
2730				V_TF_DDP_OFF(1) |
2731				TP_DDP_TIMER_WORKAROUND_VAL, 1);
2732
2733#endif
2734	return;
2735reject:
2736	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
2737		mk_pass_accept_rpl(reply_mbuf, m);
2738	else
2739		mk_tid_release(reply_mbuf, NULL, tid);
2740	cxgb_ofld_send(cdev, reply_mbuf);
2741	m_free(m);
2742out:
2743#if 0
2744	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2745#else
2746	return;
2747#endif
2748}
2749
2750/*
2751 * Handle a CPL_PASS_ACCEPT_REQ message.
2752 */
2753static int
2754do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2755{
2756	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
2757	struct socket *lso = listen_ctx->lso;
2758	struct tom_data *d = listen_ctx->tom_data;
2759
#if VALIDATE_TID
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (__predict_false(lso == NULL)) {
		log(LOG_ERR, "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		    cdev->name,
		    (unsigned long)((union listen_entry *)ctx - t->stid_tab));
		return (CPL_RET_BUF_DONE);
	}
	if (__predict_false(tid >= t->ntids)) {
		log(LOG_ERR, "%s: passive open TID %u too large\n",
		    cdev->name, tid);
		return (CPL_RET_BUF_DONE);
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		log(LOG_ERR, "%s: passive open uses existing TID %u\n",
		    cdev->name, tid);
		return (CPL_RET_BUF_DONE);
	}
#endif
2790
2791	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
2792	return (0);
2793}
2794
/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to the host stack's native format.
 */
2799static void
2800assign_rxopt(struct socket *so, unsigned int opt)
2801{
2802	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
2803	struct tcpcb *tp = sototcpcb(so);
2804	struct toepcb *toep = tp->t_toe;
2805
2806	INP_LOCK_ASSERT(tp->t_inpcb);
2807
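	/*
	 * HW reports the MSS as an index into the MTU table; 40 is the
	 * size of the IP + TCP headers.
	 */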
2808	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
2809	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
2810	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
2811	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
2812	if (tp->t_flags & TF_RCVD_SCALE)
2813		tp->rcv_scale = 0;
2814}
2815
/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
2822static void
2823make_established(struct socket *so, u32 snd_isn, unsigned int opt)
2824{
2825	struct tcpcb *tp = sototcpcb(so);
2826	struct toepcb *toep = tp->t_toe;
2827
2828	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
2829	assign_rxopt(so, opt);
2830	so->so_proto->pr_ctloutput = t3_ctloutput;
2831
2832#if 0
2833	inet_sk(sk)->id = tp->write_seq ^ jiffies;
2834#endif
2835
2836
2837	/*
2838	 * XXX not clear what rcv_wup maps to
2839	 */
2840	/*
2841	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
2842	 * pass through opt0.
2843	 */
2844	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
2845		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
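
	/*
	 * Worked example for the adjustment above (illustrative numbers;
	 * the real limit depends on the width of the RCV_BUFSIZ field in
	 * opt0): if rcv_wnd is 256KB but opt0 can only express, say, 16KB,
	 * tp_rcv_wup is pulled back by 240KB so that the first RX_DATA_ACK
	 * immediately returns the 240KB of credits we couldn't advertise.
	 */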
2846
2847	dump_toepcb(toep);
2848
2849#ifdef notyet
2850/*
2851 * no clean interface for marking ARP up to date
2852 */
2853	dst_confirm(sk->sk_dst_cache);
2854#endif
2855	tp->t_state = TCPS_ESTABLISHED;
2856}
2857
2858static int
2859syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
2860{
2861
2862	struct in_conninfo inc;
2863	struct tcpopt to;
2864	struct tcphdr th;
2865	int mss, wsf, sack, ts;
2866	struct mbuf *m = NULL;
2867	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
2868	unsigned int opt;
2869
2870#ifdef MAC
2871#error	"no MAC support"
2872#endif
2873
2874	opt = ntohs(req->tcp_opt);
2875
2876	bzero(&to, sizeof(struct tcpopt));
2877
2878	/*
2879	 * Fill out information for entering us into the syncache
2880	 */
2881	inc.inc_fport = th.th_sport = req->peer_port;
2882	inc.inc_lport = th.th_dport = req->local_port;
2883	th.th_seq = req->rcv_isn;
2884	th.th_flags = TH_ACK;
2885
2886	inc.inc_isipv6 = 0;
2887	inc.inc_len = 0;
2888	inc.inc_faddr.s_addr = req->peer_ip;
2889	inc.inc_laddr.s_addr = req->local_ip;
2890
2891	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
2892	wsf  = G_TCPOPT_WSCALE_OK(opt);
2893	ts   = G_TCPOPT_TSTAMP(opt);
2894	sack = G_TCPOPT_SACK(opt);
2895
2896	to.to_mss = mss;
2897	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) |
	    (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
2899
2900	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
2901	    ntohl(req->local_ip), ntohs(req->local_port),
2902	    ntohl(req->peer_ip), ntohs(req->peer_port),
2903	    mss, wsf, ts, sack);
	return (syncache_expand(&inc, &to, &th, so, m));
2905}
2906
2907
/*
 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
 * if we are in TCPS_SYN_RECEIVED due to crossed SYNs.
 */
2912static int
2913do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2914{
2915	struct cpl_pass_establish *req = cplhdr(m);
2916	struct toepcb *toep = (struct toepcb *)ctx;
2917	struct tcpcb *tp;
2918	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct toedev *tdev;

	/* Complete socket initialization now that we have the SND_ISN. */
	so = lso = toeptoso(toep);
2925	tdev = toep->tp_toedev;
2926
2927	SOCK_LOCK(so);
2928	LIST_REMOVE(toep, synq_entry);
2929	SOCK_UNLOCK(so);
2930
2931	INP_INFO_WLOCK(&tcbinfo);
2932	if (!syncache_expand_establish_req(req, &so, toep)) {
2933		/*
2934		 * No entry
2935		 */
2936		UNIMPLEMENTED();
2937	}
2938	if (so == NULL) {
2939		/*
2940		 * Couldn't create the socket
2941		 */
2942		UNIMPLEMENTED();
2943	}
2944
2945	/*
2946	 * XXX workaround for lack of syncache drop
2947	 */
2948	toepcb_release(toep);
2949
2950	tp = sototcpcb(so);
2951	INP_LOCK(tp->t_inpcb);
2952#ifdef notyet
2953	so->so_snd.sb_flags |= SB_TOE;
2954	so->so_rcv.sb_flags |= SB_TOE;
2955#endif
2956	toep->tp_tp = tp;
2957	toep->tp_flags = 0;
2958	tp->t_toe = toep;
2959	reset_wr_list(toep);
2960	tp->rcv_wnd = select_rcv_wnd(so);
2961	DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
2962	install_offload_ops(so);
2963
2964	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
2965	toep->tp_wr_unacked = 0;
2966	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
2967	toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
2968	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
2969	toep->tp_qset_idx = 0;
2970	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
2971
2972	/*
2973	 * XXX Cancel any keep alive timer
2974	 */
2975
2976	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
2977	INP_INFO_WUNLOCK(&tcbinfo);
2978	INP_UNLOCK(tp->t_inpcb);
2979	soisconnected(so);
2980
2981#ifdef notyet
2982	/*
2983	 * XXX not sure how these checks map to us
2984	 */
2985	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
2986		sk->sk_state_change(sk);
2987		sk_wake_async(so, 0, POLL_OUT);
2988	}
2989	/*
2990	 * The state for the new connection is now up to date.
2991	 * Next check if we should add the connection to the parent's
2992	 * accept queue.  When the parent closes it resets connections
2993	 * on its SYN queue, so check if we are being reset.  If so we
2994	 * don't need to do anything more, the coming ABORT_RPL will
2995	 * destroy this socket.  Otherwise move the connection to the
2996	 * accept queue.
2997	 *
2998	 * Note that we reset the synq before closing the server so if
2999	 * we are not being reset the stid is still open.
3000	 */
3001	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3002		__kfree_skb(skb);
3003		goto unlock;
3004	}
3005#endif
3006	m_free(m);
3007
3008	return (0);
3009}
3010
3011/*
3012 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3013 * and send them to the TOE.
3014 */
3015static void
3016fixup_and_send_ofo(struct socket *so)
3017{
3018	struct mbuf *m;
3019	struct toedev *tdev = TOE_DEV(so);
3020	struct tcpcb *tp = sototcpcb(so);
3021	struct toepcb *toep = tp->t_toe;
3022	unsigned int tid = toep->tp_tid;
3023
	DPRINTF("fixup_and_send_ofo\n");
3025
3026	INP_LOCK_ASSERT(tp->t_inpcb);
3027	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3028		/*
3029		 * A variety of messages can be waiting but the fields we'll
3030		 * be touching are common to all so any message type will do.
3031		 */
3032		struct cpl_close_con_req *p = cplhdr(m);
3033
3034		p->wr.wr_lo = htonl(V_WR_TID(tid));
3035		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3036		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3037	}
3038}
3039
3040/*
3041 * Updates socket state from an active establish CPL message.  Runs with the
3042 * socket lock held.
3043 */
3044static void
3045socket_act_establish(struct socket *so, struct mbuf *m)
3046{
3047	struct cpl_act_establish *req = cplhdr(m);
3048	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
3049	struct tcpcb *tp = sototcpcb(so);
3050	struct toepcb *toep = tp->t_toe;
3051
3052	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3053		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3054		    toep->tp_tid, tp->t_state);
3055
3056	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_nxt = rcv_isn;
3058	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3059
3060	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3061
3062	/*
3063	 * Now that we finally have a TID send any CPL messages that we had to
3064	 * defer for lack of a TID.
3065	 */
3066	if (mbufq_len(&toep->out_of_order_queue))
3067		fixup_and_send_ofo(so);
3068
3069	if (__predict_false(so->so_state & SS_NOFDREF)) {
3070#ifdef notyet
3071		/*
3072		 * XXX 	not clear what should be done here
3073		 * appears to correspond to sorwakeup_locked
3074		 */
3075		sk->sk_state_change(sk);
3076		sk_wake_async(so, 0, POLL_OUT);
3077#endif
3078	}
3079	m_free(m);
3080#ifdef notyet
3081/*
3082 * XXX assume no write requests permitted while socket connection is
3083 * incomplete
3084 */
3085	/*
3086	 * Currently the send queue must be empty at this point because the
3087	 * socket layer does not send anything before a connection is
3088	 * established.  To be future proof though we handle the possibility
3089	 * that there are pending buffers to send (either TX_DATA or
3090	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3091	 * buffers according to the just learned write_seq, and then we send
3092	 * them on their way.
3093	 */
3094	fixup_pending_writeq_buffers(sk);
3095	if (t3_push_frames(so, 1))
3096		sk->sk_write_space(sk);
3097#endif
3098
3099	soisconnected(so);
3100	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
3101	tcpstat.tcps_connects++;
3102
3103}
3104
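/*
 * For orientation, the active-open exchange: the host sends an ACT_OPEN_REQ
 * carrying an atid; HW answers either with CPL_ACT_ESTABLISH carrying the
 * real TID once the handshake completes (handled below), or with
 * CPL_ACT_OPEN_RPL (do_act_open_rpl) on failure.  Again this is a summary
 * of this file's handlers, not a normative protocol description.
 */
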
3105/*
3106 * Process a CPL_ACT_ESTABLISH message.
3107 */
3108static int
3109do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3110{
3111	struct cpl_act_establish *req = cplhdr(m);
3112	unsigned int tid = GET_TID(req);
3113	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3114	struct toepcb *toep = (struct toepcb *)ctx;
3115	struct tcpcb *tp = toep->tp_tp;
3116	struct socket *so;
3117	struct toedev *tdev;
3118	struct tom_data *d;
3119
3120	if (tp == NULL) {
3121		free_atid(cdev, atid);
3122		return (0);
3123	}
3124
3125	so = toeptoso(toep);
3126	tdev = TOE_DEV(so); /* blow up here if link was down */
3127	d = TOM_DATA(tdev);
3128
3129	INP_LOCK(tp->t_inpcb);
3130
3131	/*
3132	 * It's OK if the TID is currently in use, the owning socket may have
3133	 * backlogged its last CPL message(s).  Just take it away.
3134	 */
3135	toep->tp_tid = tid;
3136	toep->tp_tp = tp;
3137	so_insert_tid(d, so, tid);
3138	free_atid(cdev, atid);
3139	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3140
3141	socket_act_establish(so, m);
3142	INP_UNLOCK(tp->t_inpcb);
3143	return (0);
3144}
3145
3146/*
3147 * Process an acknowledgment of WR completion.  Advance snd_una and send the
3148 * next batch of work requests from the write queue.
3149 */
3150static void
3151wr_ack(struct toepcb *toep, struct mbuf *m)
3152{
3153	struct tcpcb *tp = toep->tp_tp;
3154	struct cpl_wr_ack *hdr = cplhdr(m);
3155	struct socket *so = toeptoso(toep);
3156	unsigned int credits = ntohs(hdr->credits);
3157	u32 snd_una = ntohl(hdr->snd_una);
3158	int bytes = 0;
3159
3160	DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);
3161
3162	INP_LOCK(tp->t_inpcb);
3163
3164	toep->tp_wr_avail += credits;
3165	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3166		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3167
	while (credits) {
		struct mbuf *p = peek_wr(toep);

		if (__predict_false(!p)) {
			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u\n",
			    credits, toep->tp_tid, tp->t_state);
			break;
		}
		DPRINTF("p->credits=%d p->bytes=%d\n",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
3178		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3179#if DEBUG_WR > 1
3180			struct tx_data_wr *w = cplhdr(p);
3181#ifdef notyet
3182			log(LOG_ERR,
3183			       "TID %u got %u WR credits, need %u, len %u, "
3184			       "main body %u, frags %u, seq # %u, ACK una %u,"
3185			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3186			       toep->tp_tid, credits, p->csum, p->len,
3187			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
3188			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3189			       WR_AVAIL(tp), count_pending_wrs(tp) - credits);
3190#endif
3191#endif
3192			p->m_pkthdr.csum_data -= credits;
3193			break;
3194		} else {
3195			dequeue_wr(toep);
3196			credits -= p->m_pkthdr.csum_data;
3197			bytes += p->m_pkthdr.len;
3198			DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
3199
3200			m_free(p);
3201		}
3202	}
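
	/*
	 * Worked example of the loop above (illustrative numbers): with
	 * credits == 3 and two pending WRs costing 2 credits each
	 * (m_pkthdr.csum_data == 2), the first WR is dequeued and freed
	 * (credits 3 -> 1), the second is only partially acknowledged so
	 * its remaining cost drops to 1 and the loop exits; 'bytes' then
	 * counts only the first WR's payload for the sbdrop() below.
	 */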
3203
3204#if DEBUG_WR
3205	check_wr_invariants(tp);
3206#endif
3207
3208	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3209#if VALIDATE_SEQ
3210		struct tom_data *d = TOM_DATA(TOE_DEV(so));
3211
		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
		    toep->tp_tid, tp->snd_una);
3215#endif
3216		goto out_free;
3217	}
3218
3219	if (tp->snd_una != snd_una) {
3220		tp->snd_una = snd_una;
3221		tp->ts_recent_age = ticks;
3222#ifdef notyet
3223		/*
3224		 * Keep ARP entry "minty fresh"
3225		 */
3226		dst_confirm(sk->sk_dst_cache);
3227#endif
3228		if (tp->snd_una == tp->snd_nxt)
3229			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3230	}
3231	if (bytes) {
3232		DPRINTF("sbdrop(%d)\n", bytes);
3233		SOCKBUF_LOCK(&so->so_snd);
3234		sbdrop_locked(&so->so_snd, bytes);
3235		sowwakeup_locked(so);
3236	}
3237
3238	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3239		t3_push_frames(so, 0);
3240
3241out_free:
3242	INP_UNLOCK(tp->t_inpcb);
3243	m_free(m);
3244}
3245
/*
 * Handler for CPL_TX_DMA_ACK (WR completion) messages.
 */
3249static int
3250do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3251{
3252	struct toepcb *toep = (struct toepcb *)ctx;
3253
	DPRINTF("do_wr_ack\n");
	dump_toepcb(toep);

	wr_ack(toep, m);
	return (0);
3261}
3262
3263
3264/*
3265 * Reset a connection that is on a listener's SYN queue or accept queue,
3266 * i.e., one that has not had a struct socket associated with it.
3267 * Must be called from process context.
3268 *
3269 * Modeled after code in inet_csk_listen_stop().
3270 */
3271static void
3272t3_reset_listen_child(struct socket *child)
3273{
3274	struct tcpcb *tp = sototcpcb(child);
3275
3276	t3_send_reset(tp->t_toe);
3277}
3278
3279/*
3280 * Disconnect offloaded established but not yet accepted connections sitting
3281 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
3282 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3283 */
3284void
3285t3_disconnect_acceptq(struct socket *listen_so)
3286{
3287	struct socket *so;
3288	struct tcpcb *tp;
3289
3290	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3291		tp = sototcpcb(so);
3292
3293		if (tp->t_flags & TF_TOE) {
3294			INP_LOCK(tp->t_inpcb);
3295			t3_reset_listen_child(so);
3296			INP_UNLOCK(tp->t_inpcb);
3297		}
3298
3299	}
3300}
3301
3302/*
3303 * Reset offloaded connections sitting on a server's syn queue.  As above
3304 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3305 */
3306
3307void
3308t3_reset_synq(struct listen_ctx *lctx)
3309{
3310	struct toepcb *toep;
3311
3312	SOCK_LOCK(lctx->lso);
3313	while (!LIST_EMPTY(&lctx->synq_head)) {
3314		toep = LIST_FIRST(&lctx->synq_head);
3315		LIST_REMOVE(toep, synq_entry);
3316		toep->tp_tp = NULL;
3317		t3_send_reset(toep);
3318		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3319		toepcb_release(toep);
3320	}
3321	SOCK_UNLOCK(lctx->lso);
3322}
3323
3324void
3325t3_init_wr_tab(unsigned int wr_len)
3326{
3327	int i;
3328
3329	if (mbuf_wrs[1])     /* already initialized */
3330		return;
3331
3332	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
3333		int sgl_len = (3 * i) / 2 + (i & 1);
3334
3335		sgl_len += 3;
3336		mbuf_wrs[i] = sgl_len <= wr_len ?
3337		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
3338	}
3339
3340	wrlen = wr_len * 8;
3341}
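
/*
 * A worked sizing example for the table above (a sketch; it assumes each
 * SGL entry takes 12 bytes, i.e. 1.5 8-byte flits, plus 3 flits of WR
 * header, with wr_len given in flits): for i == 10 entries,
 * sgl_len = (3 * 10) / 2 + (10 & 1) + 3 = 18 flits; with wr_len == 9
 * that does not fit in a single WR, so
 * mbuf_wrs[10] = 1 + (18 - 2) / (9 - 1) = 3 work requests.
 */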
3342
3343int
3344t3_init_cpl_io(void)
3345{
3346#ifdef notyet
3347	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
3348	if (!tcphdr_skb) {
3349		log(LOG_ERR,
3350		       "Chelsio TCP offload: can't allocate sk_buff\n");
3351		return -1;
3352	}
3353	skb_put(tcphdr_skb, sizeof(struct tcphdr));
3354	tcphdr_skb->h.raw = tcphdr_skb->data;
3355	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
3356#endif
3357
3358
3359	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
3360	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
3361	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
3362	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
3363	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
3364	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
3365	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
3366	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
3367	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
3368	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
3369	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
3370	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
3371#ifdef notyet
3372	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
3373	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
3374	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
3375#endif
3376	return (0);
3377}
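
#if 0
/*
 * Illustrative only: the shape of a CPL handler as registered above.
 * "do_example" is hypothetical; a real handler either consumes the mbuf
 * itself and returns 0, or returns CPL_RET_BUF_DONE to have the caller
 * free it.
 */
static int
do_example(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;	/* per-TID context */

	(void)toep;		/* a real handler would act on the state here */
	m_free(m);		/* this handler consumes the message */
	return (0);
}
#endif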
3378
3379