cxgb_cpl_io.c (176507 → 177340)
1/**************************************************************************
2
3Copyright (c) 2007, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 176507 2008-02-24 07:19:31Z kmacy $");
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 177340 2008-03-18 03:55:12Z kmacy $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/socket.h>
43#include <sys/sysctl.h>
44#include <sys/syslog.h>
45#include <sys/socketvar.h>
46#include <sys/protosw.h>
47#include <sys/priv.h>
48
49#include <net/if.h>
50#include <net/route.h>
51
52#include <netinet/in.h>
53#include <netinet/in_pcb.h>
54#include <netinet/in_systm.h>
55#include <netinet/in_var.h>
56
57
58#include <dev/cxgb/cxgb_osdep.h>
59#include <dev/cxgb/sys/mbufq.h>
60
61#include <netinet/ip.h>
62#include <netinet/tcp_var.h>
63#include <netinet/tcp_fsm.h>
64#include <netinet/tcp_offload.h>
65#include <netinet/tcp_seq.h>
66#include <netinet/tcp_syncache.h>
67#include <netinet/tcp_timer.h>
68#include <net/route.h>
69
70#include <dev/cxgb/t3cdev.h>
71#include <dev/cxgb/common/cxgb_firmware_exports.h>
72#include <dev/cxgb/common/cxgb_t3_cpl.h>
73#include <dev/cxgb/common/cxgb_tcb.h>
74#include <dev/cxgb/common/cxgb_ctl_defs.h>
75#include <dev/cxgb/cxgb_l2t.h>
76#include <dev/cxgb/cxgb_offload.h>
77#include <vm/vm.h>
78#include <vm/pmap.h>
79#include <machine/bus.h>
80#include <dev/cxgb/sys/mvec.h>
81#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
82#include <dev/cxgb/ulp/tom/cxgb_defs.h>
83#include <dev/cxgb/ulp/tom/cxgb_tom.h>
84#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
85#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
86#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
87
88/*
89 * For ULP connections HW may add headers, e.g., for digests, that aren't part
90 * of the messages sent by the host but that are part of the TCP payload and
91 * therefore consume TCP sequence space. Tx connection parameters that
92 * operate in TCP sequence space are affected by the HW additions and need to
93 * compensate for them to accurately track TCP sequence numbers. This array
94 * contains the compensating extra lengths for ULP packets. It is indexed by
95 * a packet's ULP submode.
96 */
97const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
98
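/*
 * Editorial sketch (not part of the original file): how the table above
 * would be applied when advancing a connection's Tx sequence number for
 * a ULP packet.  "ulp_tx_seq_len" and "ulp_submode" are hypothetical
 * names used only for illustration.
 */
static inline unsigned int
ulp_tx_seq_len(unsigned int payload_len, unsigned int ulp_submode)
{
	/* HW-inserted ULP bytes consume TCP sequence space as well. */
	return (payload_len + t3_ulp_extra_len[ulp_submode & 3]);
}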
99#ifdef notyet
100/*
101 * This sk_buff holds a fake header-only TCP segment that we use whenever we
102 * need to exploit SW TCP functionality that expects TCP headers, such as
103 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
104 * CPUs without locking.
105 */
106static struct mbuf *tcphdr_mbuf __read_mostly;
107#endif
108
109/*
110 * Size of WRs in bytes. Note that we assume all devices we are handling have
111 * the same WR size.
112 */
113static unsigned int wrlen __read_mostly;
114
115/*
116 * The number of WRs needed for an skb depends on the number of page fragments
117 * in the skb and whether it has any payload in its main body. This maps the
118 * length of the gather list represented by an skb into the # of necessary WRs.
119 */
120static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
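/*
 * Editorial sketch (an assumption, not the driver's actual init code):
 * a table like mbuf_wrs[] would typically be filled at attach time from
 * the device's WR size, with "sgl_per_wr" derived from wrlen, roughly:
 *
 *	for (i = 1; i <= TX_MAX_SEGS; i++)
 *		mbuf_wrs[i] = howmany(i, sgl_per_wr);
 */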
121
122/*
123 * Max receive window supported by HW in bytes. Only a small part of it can
124 * be set through option0, the rest needs to be set through RX_DATA_ACK.
125 */
126#define MAX_RCV_WND ((1U << 27) - 1)
127
128/*
129 * Min receive window. We want it to be large enough to accommodate receive
130 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
131 */
132#define MIN_RCV_WND (24 * 1024U)
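/*
 * Editorial sketch (not in the original file): clamping a requested
 * receive window into the range the hardware supports.
 */
static inline unsigned int
clamp_rcv_wnd(unsigned int wnd)
{
	if (wnd < MIN_RCV_WND)
		return (MIN_RCV_WND);
	return (wnd > MAX_RCV_WND ? MAX_RCV_WND : wnd);
}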
133#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
134
135#define VALIDATE_SEQ 0
136#define VALIDATE_SOCK(so)
137#define DEBUG_WR 0
138
139extern int tcp_do_autorcvbuf;
140extern int tcp_do_autosndbuf;
141extern int tcp_autorcvbuf_max;
142extern int tcp_autosndbuf_max;
143
144static void t3_send_reset(struct toepcb *toep);
145static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
146static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
147static void handle_syncache_event(int event, void *arg);
148
149static inline void
150SBAPPEND(struct sockbuf *sb, struct mbuf *n)
151{
152 struct mbuf * m;
153
154 m = sb->sb_mb;
155 while (m) {
156 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
157 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
158 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
159 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
160 m->m_next, m->m_nextpkt, m->m_flags));
161 m = m->m_next;
162 }
163 m = n;
164 while (m) {
165 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
166 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
167 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
168 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
169 m->m_next, m->m_nextpkt, m->m_flags));
170 m = m->m_next;
171 }
172 sbappend_locked(sb, n);
173 m = sb->sb_mb;
174 while (m) {
175 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
176 m->m_next, m->m_nextpkt, m->m_flags));
177 m = m->m_next;
178 }
179}
180
181static inline int
182is_t3a(const struct toedev *dev)
183{
184 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
185}
186
187static void
188dump_toepcb(struct toepcb *toep)
189{
190 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
191 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
192 toep->tp_mtu_idx, toep->tp_tid);
193
194 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
195 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
196 toep->tp_mss_clamp, toep->tp_flags);
197}
198
199#ifndef RTALLOC2_DEFINED
200static struct rtentry *
201rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
202{
203 struct rtentry *rt = NULL;
204
205 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
206 RT_UNLOCK(rt);
207
208 return (rt);
209}
210#endif
211/*
212 * Determine whether to send a CPL message now or defer it. A message is
213 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
214 * For connections in other states the message is sent immediately.
215 * If through_l2t is set the message is subject to ARP processing, otherwise
216 * it is sent directly.
217 */
218static inline void
219send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
220{
221 struct tcpcb *tp = toep->tp_tp;
222
223 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
224 INP_LOCK(tp->t_inpcb);
225 mbufq_tail(&toep->out_of_order_queue, m); // defer
226 INP_UNLOCK(tp->t_inpcb);
227 } else if (through_l2t)
228 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
229 else
230 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
231}
232
233static inline unsigned int
234mkprio(unsigned int cntrl, const struct toepcb *toep)
235{
236 return (cntrl);
237}
238
239/*
240 * Populate a TID_RELEASE WR. The mbuf must already be properly sized.
241 */
242static inline void
243mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
244{
245 struct cpl_tid_release *req;
246
247 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
248 m->m_pkthdr.len = m->m_len = sizeof(*req);
249 req = mtod(m, struct cpl_tid_release *);
250 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
251 req->wr.wr_lo = 0;
252 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
253}
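/*
 * Illustrative usage (editorial sketch, not in the original file):
 * releasing a TID outside the normal teardown path would look roughly
 * like
 *
 *	struct mbuf *m = m_gethdr_nofail(sizeof(struct cpl_tid_release));
 *	mk_tid_release(m, toep, toep->tp_tid);
 *	cxgb_ofld_send(cdev, m);
 */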
254
255static inline void
256make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
257{
258 struct tcpcb *tp = sototcpcb(so);
259 struct toepcb *toep = tp->t_toe;
260 struct tx_data_wr *req;
261
262 INP_LOCK_ASSERT(tp->t_inpcb);
263
264 req = mtod(m, struct tx_data_wr *);
265 m->m_len = sizeof(*req);
266 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
267 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
268 /* len includes the length of any HW ULP additions */
269 req->len = htonl(len);
270 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
271 /* V_TX_ULP_SUBMODE sets both the mode and submode */
272 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
273 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
274 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
275 (tail ? 0 : 1))));
276 req->sndseq = htonl(tp->snd_nxt);
277 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
278 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
279 V_TX_CPU_IDX(toep->tp_qset));
280
281 /* Sendbuffer is in units of 32KB.
282 */
283 if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
284 req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
285 else
286 req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
287 toep->tp_flags |= TP_DATASENT;
288 }
289}
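/*
 * Worked example (editorial): V_TX_SNDBUF is expressed in 32KB units,
 * hence the ">> 15" above; e.g. a 256KB send buffer high-water mark
 * yields 262144 >> 15 == 8.
 */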
290
291#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
292
293int
294t3_push_frames(struct socket *so, int req_completion)
295{
296 struct tcpcb *tp = sototcpcb(so);
297 struct toepcb *toep = tp->t_toe;
298
299 struct mbuf *tail, *m0, *last;
300 struct t3cdev *cdev;
301 struct tom_data *d;
302 int i, bytes, count, total_bytes;
303 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
304
305 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
306 DPRINTF("tcp state=%d\n", tp->t_state);
307 return (0);
308 }
309
310 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
311 DPRINTF("disconnecting\n");
312
313 return (0);
314 }
315
316
317 INP_LOCK_ASSERT(tp->t_inpcb);
318 SOCKBUF_LOCK(&so->so_snd);
319 d = TOM_DATA(TOE_DEV(so));
320 cdev = d->cdev;
321 last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
322 total_bytes = 0;
323 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
324 toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);
325
326 if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
327 KASSERT(tail, ("sbdrop error"));
328 last = tail = tail->m_next;
329 }
330
331 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
332 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
333 SOCKBUF_UNLOCK(&so->so_snd);
334 return (0);
335 }
336
337 toep->tp_m_last = NULL;
338 while (toep->tp_wr_avail && (tail != NULL)) {
339 count = bytes = 0;
340 segp = segs;
341 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
342 SOCKBUF_UNLOCK(&so->so_snd);
343 return (0);
344 }
345 /*
346 * If the data in tail fits in-line,
347 * build an immediate-data WR.
348 */
349 if (tail->m_len <= IMM_LEN) {
350 count = 1;
351 bytes = tail->m_len;
352 last = tail;
353 tail = tail->m_next;
354 m_set_sgl(m0, NULL);
355 m_set_sgllen(m0, 0);
356 make_tx_data_wr(so, m0, bytes, tail);
357 m_append(m0, bytes, mtod(last, caddr_t));
358 KASSERT(!m0->m_next, ("bad append"));
359 } else {
360 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
361 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
362 bytes += tail->m_len;
363 last = tail;
364 count++;
365 /*
366 * technically an abuse to be using this for a VA
367 * but less gross than defining my own structure
368 * or calling pmap_kextract from here :-|
369 */
370 segp->ds_addr = (bus_addr_t)tail->m_data;
371 segp->ds_len = tail->m_len;
372 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
373 count, mbuf_wrs[count], tail->m_data, tail->m_len);
374 segp++;
375 tail = tail->m_next;
376 }
377 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
378 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
379
380 m_set_sgl(m0, segs);
381 m_set_sgllen(m0, count);
382 make_tx_data_wr(so, m0, bytes, tail);
383 }
384 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
385
386 if (tail) {
387 so->so_snd.sb_sndptr = tail;
388 toep->tp_m_last = NULL;
389 } else
390 toep->tp_m_last = so->so_snd.sb_sndptr = last;
391
392
393 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
394
395 so->so_snd.sb_sndptroff += bytes;
396 total_bytes += bytes;
397 toep->tp_write_seq += bytes;
398 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
399 toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff);
400 if (tail)
401 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
402 total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
403 else
404 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
405 total_bytes, toep->tp_m_last, tp->snd_una);
406
407
408 i = 0;
409 while (i < count && m_get_sgllen(m0)) {
410 if ((count - i) >= 3) {
411 CTR6(KTR_TOM,
412 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
413 segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
414 segs[i + 2].ds_addr, segs[i + 2].ds_len);
415 i += 3;
416 } else if ((count - i) == 2) {
417 CTR4(KTR_TOM,
418 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
419 segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
420 i += 2;
421 } else {
422 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
423 segs[i].ds_addr, segs[i].ds_len);
424 i++;
425 }
426
427 }
428
429 /*
430 * remember credits used
431 */
432 m0->m_pkthdr.csum_data = mbuf_wrs[count];
433 m0->m_pkthdr.len = bytes;
434 toep->tp_wr_avail -= mbuf_wrs[count];
435 toep->tp_wr_unacked += mbuf_wrs[count];
436
437 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
438 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
439 struct work_request_hdr *wr = cplhdr(m0);
440
441 wr->wr_hi |= htonl(F_WR_COMPL);
442 toep->tp_wr_unacked = 0;
443 }
444 KASSERT((m0->m_pkthdr.csum_data > 0) &&
445 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
446 m0->m_pkthdr.csum_data));
447 m0->m_type = MT_DONTFREE;
448 enqueue_wr(toep, m0);
449 DPRINTF("sending offload tx with %d bytes in %d segments\n",
450 bytes, count);
451 l2t_send(cdev, m0, toep->tp_l2t);
452 }
453 SOCKBUF_UNLOCK(&so->so_snd);
454 return (total_bytes);
455}
456
457/*
458 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
459 * under any circumstances. We take the easy way out and always queue the
460 * message to the write_queue. We can optimize the case where the queue is
461 * already empty though the optimization is probably not worth it.
462 */
463static void
464close_conn(struct socket *so)
465{
466 struct mbuf *m;
467 struct cpl_close_con_req *req;
468 struct tom_data *d;
469 struct inpcb *inp = sotoinpcb(so);
470 struct tcpcb *tp;
471 struct toepcb *toep;
472 unsigned int tid;
473
474
475 INP_LOCK(inp);
476 tp = sototcpcb(so);
477 toep = tp->t_toe;
478
479 if (tp->t_state != TCPS_SYN_SENT)
480 t3_push_frames(so, 1);
481
482 if (toep->tp_flags & TP_FIN_SENT) {
483 INP_UNLOCK(inp);
484 return;
485 }
486
487 tid = toep->tp_tid;
488
489 d = TOM_DATA(toep->tp_toedev);
490
491 m = m_gethdr_nofail(sizeof(*req));
492
493 toep->tp_flags |= TP_FIN_SENT;
494 req = mtod(m, struct cpl_close_con_req *);
495
496 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
497 req->wr.wr_lo = htonl(V_WR_TID(tid));
498 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
499 req->rsvd = htonl(toep->tp_write_seq);
500 INP_UNLOCK(inp);
501 /*
502 * XXX - need to defer shutdown while there is still data in the queue
503 *
504 */
505 cxgb_ofld_send(d->cdev, m);
506
507}
508
509/*
510 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
511 * and send it along.
512 */
513static void
514abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
515{
516 struct cpl_abort_req *req = cplhdr(m);
517
518 req->cmd = CPL_ABORT_NO_RST;
519 cxgb_ofld_send(cdev, m);
520}
521
522/*
523 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
524 * permitted to return without sending the message in case we cannot allocate
525 * an sk_buff. Returns the number of credits sent.
526 */
527uint32_t
528t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
529{
530 struct mbuf *m;
531 struct cpl_rx_data_ack *req;
532 struct toepcb *toep = tp->t_toe;
533 struct toedev *tdev = toep->tp_toedev;
534
535 m = m_gethdr_nofail(sizeof(*req));
536
537 DPRINTF("returning %u credits to HW\n", credits);
538
539 req = mtod(m, struct cpl_rx_data_ack *);
540 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
541 req->wr.wr_lo = 0;
542 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
543 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
544 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
545 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
546 return (credits);
547}
548
549/*
550 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
551 * This is only used in DDP mode, so we take the opportunity to also set the
552 * DACK mode and flush any Rx credits.
553 */
554void
555t3_send_rx_modulate(struct toepcb *toep)
556{
557 struct mbuf *m;
558 struct cpl_rx_data_ack *req;
559
560 m = m_gethdr_nofail(sizeof(*req));
561
562 req = mtod(m, struct cpl_rx_data_ack *);
563 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
564 req->wr.wr_lo = 0;
565 m->m_pkthdr.len = m->m_len = sizeof(*req);
566
567 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
568 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
569 V_RX_DACK_MODE(1) |
570 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
571 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
572 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
573 toep->tp_rcv_wup = toep->tp_copied_seq;
574}
575
576/*
577 * Handle receipt of an urgent pointer.
578 */
579static void
580handle_urg_ptr(struct socket *so, uint32_t urg_seq)
581{
582#ifdef URGENT_DATA_SUPPORTED
583 struct tcpcb *tp = sototcpcb(so);
584
585 urg_seq--; /* initially points past the urgent data, per BSD */
586
587 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
588 return; /* duplicate pointer */
589 sk_send_sigurg(sk);
590 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
591 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
592 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
593
594 tp->copied_seq++;
595 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
596 tom_eat_skb(sk, skb, 0);
597 }
598 tp->urg_data = TCP_URG_NOTYET;
599 tp->urg_seq = urg_seq;
600#endif
601}
602
603/*
604 * Returns true if a socket cannot accept new Rx data.
605 */
606static inline int
607so_no_receive(const struct socket *so)
608{
609 return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
610}
611
612/*
613 * Process an urgent data notification.
614 */
615static void
616rx_urg_notify(struct toepcb *toep, struct mbuf *m)
617{
618 struct cpl_rx_urg_notify *hdr = cplhdr(m);
619 struct socket *so = toeptoso(toep);
620
621 VALIDATE_SOCK(so);
622
623 if (!so_no_receive(so))
624 handle_urg_ptr(so, ntohl(hdr->seq));
625
626 m_freem(m);
627}
628
629/*
630 * Handler for RX_URG_NOTIFY CPL messages.
631 */
632static int
633do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
634{
635 struct toepcb *toep = (struct toepcb *)ctx;
636
637 rx_urg_notify(toep, m);
638 return (0);
639}
640
641static __inline int
642is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
643{
644 return (toep->tp_ulp_mode ||
645 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
646 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
647}
648
649/*
650 * Set of states for which we should return RX credits.
651 */
652#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
653
654/*
655 * Called after some received data has been read. It returns RX credits
656 * to the HW for the amount of data processed.
657 */
658void
659t3_cleanup_rbuf(struct tcpcb *tp, int copied)
660{
661 struct toepcb *toep = tp->t_toe;
662 struct socket *so;
663 struct toedev *dev;
664 int dack_mode, must_send, read;
665 u32 thres, credits, dack = 0;
666
667 so = tp->t_inpcb->inp_socket;
668 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
669 (tp->t_state == TCPS_FIN_WAIT_2))) {
670 if (copied) {
671 SOCKBUF_LOCK(&so->so_rcv);
672 toep->tp_copied_seq += copied;
673 SOCKBUF_UNLOCK(&so->so_rcv);
674 }
675
676 return;
677 }
678
679 INP_LOCK_ASSERT(tp->t_inpcb);
680 SOCKBUF_LOCK(&so->so_rcv);
681 if (copied)
682 toep->tp_copied_seq += copied;
683 else {
684 read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
685 toep->tp_copied_seq += read;
686 }
687 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
688 toep->tp_enqueued_bytes = so->so_rcv.sb_cc;
689 SOCKBUF_UNLOCK(&so->so_rcv);
690
691 if (credits > so->so_rcv.sb_mbmax) {
692 printf("copied_seq=%u rcv_wup=%u credits=%u\n",
693 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
694 credits = so->so_rcv.sb_mbmax;
695 }
696
697
698 /*
699 * XXX this won't accurately reflect credit return - we need
700 * to look at the difference between the amount that has been
701 * put in the recv sockbuf and what is there now
702 */
703
704 if (__predict_false(!credits))
705 return;
706
707 dev = toep->tp_toedev;
708 thres = TOM_TUNABLE(dev, rx_credit_thres);
709
710 if (__predict_false(thres == 0))
711 return;
712
713 if (is_delack_mode_valid(dev, toep)) {
714 dack_mode = TOM_TUNABLE(dev, delack);
715 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
716 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
717
718 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
719 dack = F_RX_DACK_CHANGE |
720 V_RX_DACK_MODE(dack_mode);
721 }
722 } else
723 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
724
725 /*
726 * For coalescing to work effectively ensure the receive window has
727 * at least 16KB left.
728 */
729 must_send = credits + 16384 >= tp->rcv_wnd;
730
731 if (must_send || credits >= thres)
732 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
733}
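/*
 * Worked example (editorial): with a 64KB receive window, must_send
 * forces a credit return once less than 16KB of window remains, i.e.
 * when credits >= 65536 - 16384 == 49152, regardless of the
 * rx_credit_thres tunable.
 */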
734
735static int
736cxgb_toe_disconnect(struct tcpcb *tp)
737{
738 struct socket *so;
739
740 DPRINTF("cxgb_toe_disconnect\n");
741
742 so = tp->t_inpcb->inp_socket;
743 close_conn(so);
744 return (0);
745}
746
747static int
748cxgb_toe_reset(struct tcpcb *tp)
749{
750 struct toepcb *toep = tp->t_toe;
751
752
753 t3_send_reset(toep);
754
755 /*
756 * unhook from socket
757 */
758 tp->t_flags &= ~TF_TOE;
759 toep->tp_tp = NULL;
760 tp->t_toe = NULL;
761 return (0);
762}
763
764static int
765cxgb_toe_send(struct tcpcb *tp)
766{
767 struct socket *so;
768
769 DPRINTF("cxgb_toe_send\n");
770 dump_toepcb(tp->t_toe);
771
772 so = tp->t_inpcb->inp_socket;
773 t3_push_frames(so, 1);
774 return (0);
775}
776
777static int
778cxgb_toe_rcvd(struct tcpcb *tp)
779{
780 INP_LOCK_ASSERT(tp->t_inpcb);
781 t3_cleanup_rbuf(tp, 0);
782
783 return (0);
784}
785
786static void
787cxgb_toe_detach(struct tcpcb *tp)
788{
789 struct toepcb *toep;
790 /*
791 * XXX how do we handle teardown in the SYN_SENT state?
792 *
793 */
794 INP_INFO_WLOCK(&tcbinfo);
795 toep = tp->t_toe;
796 toep->tp_tp = NULL;
797
798 /*
799 * unhook from socket
800 */
801 tp->t_flags &= ~TF_TOE;
802 tp->t_toe = NULL;
803 INP_INFO_WUNLOCK(&tcbinfo);
804}
805
806
807static struct toe_usrreqs cxgb_toe_usrreqs = {
808 .tu_disconnect = cxgb_toe_disconnect,
809 .tu_reset = cxgb_toe_reset,
810 .tu_send = cxgb_toe_send,
811 .tu_rcvd = cxgb_toe_rcvd,
812 .tu_detach = cxgb_toe_detach,
814 .tu_syncache_event = handle_syncache_event,
815};
816
817
818static void
819__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
820 uint64_t mask, uint64_t val, int no_reply)
821{
822 struct cpl_set_tcb_field *req;
823
824 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
825 toep->tp_tid, word, mask, val);
826
827 req = mtod(m, struct cpl_set_tcb_field *);
828 m->m_pkthdr.len = m->m_len = sizeof(*req);
829 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
830 req->wr.wr_lo = 0;
831 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
832 req->reply = V_NO_REPLY(no_reply);
833 req->cpu_idx = 0;
834 req->word = htons(word);
835 req->mask = htobe64(mask);
836 req->val = htobe64(val);
837
838 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
839 send_or_defer(toep, m, 0);
840}
841
842static void
843t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
844{
845 struct mbuf *m;
846 struct tcpcb *tp = sototcpcb(so);
847 struct toepcb *toep = tp->t_toe;
848
849 if (toep == NULL)
850 return;
851
852 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
853 printf("not setting field\n");
854 return;
855 }
856
857 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
858
859 __set_tcb_field(toep, m, word, mask, val, 1);
860}
861
862/*
863 * Set one of the t_flags bits in the TCB.
864 */
865static void
866set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
867{
868 t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, (uint64_t)val << bit_pos);
869}
870
871/*
872 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
873 */
874static void
875t3_set_nagle(struct socket *so)
876{
877 struct tcpcb *tp = sototcpcb(so);
878
879 set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
880}
881
882/*
883 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
884 */
885void
886t3_set_keepalive(struct socket *so, int on_off)
887{
888 set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
889}
890
891void
892t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
893{
894 set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
895}
896
897void
898t3_set_dack_mss(struct socket *so, int on_off)
899{
900 set_tcb_tflag(so, S_TF_DACK_MSS, on_off);
901}
902
890/*
891 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
892 */
893static void
894t3_set_tos(struct socket *so)
895{
896 t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
897 V_TCB_TOS(SO_TOS(so)));
898}
899
900
901/*
902 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
903 * DDP is disabled (data is delivered to the freelist). [Note that the peer should
904 * set the PSH bit in the last segment, which would trigger delivery.]
905 * We work around the issue by setting a DDP buffer in a partial placed state,
906 * which guarantees that TP will schedule a timer.
907 */
908#define TP_DDP_TIMER_WORKAROUND_MASK\
909 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
910 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
911 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
912#define TP_DDP_TIMER_WORKAROUND_VAL\
913 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
914 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
915 32))
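/*
 * Editorial note: the VAL above marks DDP buffer 0 valid and inactive
 * with offset 1 and length 2, i.e. a partially placed buffer, which is
 * what guarantees TP keeps its flush timer scheduled.
 */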
916
917static void
918t3_enable_ddp(struct socket *so, int on)
919{
920 if (on) {
921
922 t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
923 V_TF_DDP_OFF(0));
924 } else
925 t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
926 V_TF_DDP_OFF(1) |
927 TP_DDP_TIMER_WORKAROUND_MASK,
928 V_TF_DDP_OFF(1) |
929 TP_DDP_TIMER_WORKAROUND_VAL);
930
931}
932
933void
934t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
935{
936 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
937 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
938 tag_color);
939}
940
941void
942t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
943 unsigned int len)
944{
945 if (buf_idx == 0)
946 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
947 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
948 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
949 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
950 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
951 else
952 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
953 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
954 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
955 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
956 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
957}
958
959static int
960t3_set_cong_control(struct socket *so, const char *name)
961{
962#ifdef CONGESTION_CONTROL_SUPPORTED
963 int cong_algo;
964
965 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
966 if (!strcmp(name, t3_cong_ops[cong_algo].name))
967 break;
968
969 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
970 return -EINVAL;
971#endif
972 return 0;
973}
974
975int
976t3_get_tcb(struct socket *so)
977{
978 struct cpl_get_tcb *req;
979 struct tcpcb *tp = sototcpcb(so);
980 struct toepcb *toep = tp->t_toe;
981 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
982
983 if (!m)
984 return (ENOMEM);
985
986 INP_LOCK_ASSERT(tp->t_inpcb);
987 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
988 req = mtod(m, struct cpl_get_tcb *);
989 m->m_pkthdr.len = m->m_len = sizeof(*req);
990 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
991 req->wr.wr_lo = 0;
992 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
993 req->cpuno = htons(toep->tp_qset);
994 req->rsvd = 0;
995 if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
996		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
997 else
998 cxgb_ofld_send(T3C_DEV(so), m);
999	return (0);
1000}
1001
1002static inline void
1003so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
1004{
1005 struct toepcb *toep = sototoep(so);
1006 toepcb_hold(toep);
1007
1008 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1009}
1010
1011/**
1012 * find_best_mtu - find the entry in the MTU table closest to an MTU
1013 * @d: TOM state
1014 * @mtu: the target MTU
1015 *
1016 * Returns the index of the value in the MTU table that is closest to but
1017 * does not exceed the target MTU.
1018 */
1019static unsigned int
1020find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1021{
1022 int i = 0;
1023
1024 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1025 ++i;
1026 return (i);
1027}
1028
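For concreteness, here is a standalone rendition of find_best_mtu()'s closest-without-exceeding search over a made-up ascending MTU table (the driver's real table is read out of the adapter into t3c_data):

#include <stdio.h>

/* Hypothetical ascending MTU table; the real one lives in t3c_data. */
static const unsigned short mtus[] = { 576, 1492, 1500, 4420, 9000 };
static const int nmtus = sizeof(mtus) / sizeof(mtus[0]);

static unsigned int
best_mtu_idx(unsigned short mtu)
{
	int i = 0;

	while (i < nmtus - 1 && mtus[i + 1] <= mtu)
		++i;
	return (i);
}

int
main(void)
{
	/* Prints "2 3": 1500 maps to entry 1500, 8000 to entry 4420. */
	printf("%u %u\n", best_mtu_idx(1500), best_mtu_idx(8000));
	return (0);
}

Note the boundary behavior: a target below mtus[0] still returns index 0, so the first entry can exceed the target; select_mss() below guards against this by clamping t_maxseg to at least mtus[0] - 40 first.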
1029static unsigned int
1030select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1031{
1032 unsigned int idx;
1033
1034#ifdef notyet
1035 struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
1036#endif
1037 if (tp) {
1038 tp->t_maxseg = pmtu - 40;
1039 if (tp->t_maxseg < td->mtus[0] - 40)
1040 tp->t_maxseg = td->mtus[0] - 40;
1041 idx = find_best_mtu(td, tp->t_maxseg + 40);
1042
1043 tp->t_maxseg = td->mtus[idx] - 40;
1044 } else
1045 idx = find_best_mtu(td, pmtu);
1046
1047 return (idx);
1048}
1049
1050static inline void
1051free_atid(struct t3cdev *cdev, unsigned int tid)
1052{
1053 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1054
1055 if (toep)
1056 toepcb_release(toep);
1057}
1058
1059/*
1060 * Release resources held by an offload connection (TID, L2T entry, etc.)
1061 */
1062static void
1063t3_release_offload_resources(struct toepcb *toep)
1064{
1065 struct tcpcb *tp = toep->tp_tp;
1066 struct toedev *tdev = toep->tp_toedev;
1067 struct t3cdev *cdev;
1068 unsigned int tid = toep->tp_tid;
1069
1070 if (!tdev)
1071 return;
1072
1073 cdev = TOEP_T3C_DEV(toep);
1074 if (!cdev)
1075 return;
1076
1077 toep->tp_qset = 0;
1078 t3_release_ddp_resources(toep);
1079
1080#ifdef CTRL_SKB_CACHE
1081 kfree_skb(CTRL_SKB_CACHE(tp));
1082 CTRL_SKB_CACHE(tp) = NULL;
1083#endif
1084
1085 if (toep->tp_wr_avail != toep->tp_wr_max) {
1086 purge_wr_queue(toep);
1087 reset_wr_list(toep);
1088 }
1089
1090 if (toep->tp_l2t) {
1091 l2t_release(L2DATA(cdev), toep->tp_l2t);
1092 toep->tp_l2t = NULL;
1093 }
1094 toep->tp_tp = NULL;
1095 if (tp) {
1096 INP_LOCK_ASSERT(tp->t_inpcb);
1097 tp->t_toe = NULL;
1098 tp->t_flags &= ~TF_TOE;
1099 }
1100
1101 if (toep->tp_state == TCPS_SYN_SENT) {
1102 free_atid(cdev, tid);
1103#ifdef notyet
1104 __skb_queue_purge(&tp->out_of_order_queue);
1105#endif
1106	} else {	/* we have TID */
1107 cxgb_remove_tid(cdev, toep, tid);
1108 toepcb_release(toep);
1109 }
1110#if 0
1111 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1112#endif
1113}
1114
1115static void
1116install_offload_ops(struct socket *so)
1117{
1118 struct tcpcb *tp = sototcpcb(so);
1119
1120 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1121
1122 t3_install_socket_ops(so);
1123 tp->t_flags |= TF_TOE;
1124 tp->t_tu = &cxgb_toe_usrreqs;
1125}
1126
1127/*
1128 * Determine the receive window scaling factor given a target max
1129 * receive window.
1130 */
1131static __inline int
1132select_rcv_wscale(int space)
1133{
1134 int wscale = 0;
1135
1136 if (space > MAX_RCV_WND)
1137 space = MAX_RCV_WND;
1138
1139 if (tcp_do_rfc1323)
1140 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1141
1142 return (wscale);
1143}
1144
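A worked example of the loop: with RFC 1323 enabled, a 256 KB target window is halved until it fits the 16-bit window field, so 262144 -> 131072 -> 65536 -> 32768 leaves wscale at 3. The sketch below mirrors select_rcv_wscale() minus the MAX_RCV_WND clamp:

#include <stdio.h>

/* Mirror of the scaling loop above, without the MAX_RCV_WND clamp. */
static int
rcv_wscale(int space)
{
	int wscale = 0;

	while (space > 65535 && wscale < 14) {
		space >>= 1;
		++wscale;
	}
	return (wscale);
}

int
main(void)
{
	printf("%d\n", rcv_wscale(256 * 1024));	/* prints 3 */
	return (0);
}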
1145/*
1146 * Determine the receive window size for a socket.
1147 */
1148static unsigned long
1149select_rcv_wnd(struct toedev *dev, struct socket *so)
1150{
1151 struct tom_data *d = TOM_DATA(dev);
1152 unsigned int wnd;
1153 unsigned int max_rcv_wnd;
1154
1155 if (tcp_do_autorcvbuf)
1156 wnd = tcp_autorcvbuf_max;
1157 else
1158 wnd = so->so_rcv.sb_hiwat;
1159
1160
1161
1162 /* XXX
1163 * For receive coalescing to work effectively we need a receive window
1164	 * that can accommodate a coalesced segment.
1165 */
1166 if (wnd < MIN_RCV_WND)
1167 wnd = MIN_RCV_WND;
1168
1169 /* PR 5138 */
1170 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1171 (uint32_t)d->rx_page_size * 23 :
1172 MAX_RCV_WND);
1173
1174 return min(wnd, max_rcv_wnd);
1175}
1176
1177/*
1178 * Assign offload parameters to some socket fields. This code is used by
1179 * both active and passive opens.
1180 */
1181static inline void
1182init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1183 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1184{
1185 struct tcpcb *tp = sototcpcb(so);
1186 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1187
1188 SOCK_LOCK_ASSERT(so);
1189
1190 printf("initializing offload socket\n");
1191 /*
1192 * We either need to fix push frames to work with sbcompress
1193 * or we need to add this
1194 */
1195 so->so_snd.sb_flags |= SB_NOCOALESCE;
1196 so->so_rcv.sb_flags |= SB_NOCOALESCE;
1197
1198 tp->t_toe = toep;
1199 toep->tp_tp = tp;
1200 toep->tp_toedev = dev;
1201
1202 toep->tp_tid = tid;
1203 toep->tp_l2t = e;
1204 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1205 toep->tp_wr_unacked = 0;
1206 toep->tp_delack_mode = 0;
1207
1208 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1209 /*
1210 * XXX broken
1211 *
1212 */
1213 tp->rcv_wnd = select_rcv_wnd(dev, so);
1214
1215 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
1216 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1217 toep->tp_qset_idx = 0;
1218
1219 reset_wr_list(toep);
1220 DPRINTF("initialization done\n");
1221}
1222
1223/*
1224 * The next two functions calculate the option 0 value for a socket.
1225 */
1226static inline unsigned int
1227calc_opt0h(struct socket *so, int mtu_idx)
1228{
1229 struct tcpcb *tp = sototcpcb(so);
1230 int wscale = select_rcv_wscale(tp->rcv_wnd);
1231
1232 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1233 V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1234 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1235}
1236
1237static inline unsigned int
1238calc_opt0l(struct socket *so, int ulp_mode)
1239{
1240 struct tcpcb *tp = sototcpcb(so);
1241 unsigned int val;
1242
1243 val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
1244 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1245
1246 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
1247 return (val);
1248}
1249
1250static inline unsigned int
1251calc_opt2(const struct socket *so, struct toedev *dev)
1252{
1253 int flv_valid;
1254
1255 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1256
1257 return (V_FLAVORS_VALID(flv_valid) |
1258 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1259}
1260
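The option words above are assembled purely from the S_/M_/V_ macro triples generated for each hardware field: S_FOO is the field's bit offset, M_FOO its width mask, and V_FOO(x) shifts a value into position. A self-contained sketch of the convention with an invented 4-bit field (the real offsets and masks come from the adapter's CPL/TCB headers):

#include <stdio.h>

/* Invented field, for illustration only. */
#define S_FOO		8
#define M_FOO		0xf
#define V_FOO(x)	((x) << S_FOO)
#define G_FOO(w)	(((w) >> S_FOO) & M_FOO)	/* extract */

int
main(void)
{
	unsigned int word = V_FOO(0x5) | 0x3;	/* pack FOO beside low bits */

	/* Prints "word=0x503 foo=0x5". */
	printf("word=0x%x foo=0x%x\n", word, G_FOO(word));
	return (0);
}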
1261#if DEBUG_WR > 1
1262static int
1263count_pending_wrs(const struct toepcb *toep)
1264{
1265 const struct mbuf *m;
1266 int n = 0;
1267
1268 wr_queue_walk(toep, m)
1269 n += m->m_pkthdr.csum_data;
1270 return (n);
1271}
1272#endif
1273
1274#if 0
1275(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1276#endif
1277
1278static void
1279mk_act_open_req(struct socket *so, struct mbuf *m,
1280 unsigned int atid, const struct l2t_entry *e)
1281{
1282 struct cpl_act_open_req *req;
1283 struct inpcb *inp = sotoinpcb(so);
1284 struct tcpcb *tp = intotcpcb(inp);
1285 struct toepcb *toep = tp->t_toe;
1286 struct toedev *tdev = TOE_DEV(so);
1287
1288 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1289
1290 req = mtod(m, struct cpl_act_open_req *);
1291 m->m_pkthdr.len = m->m_len = sizeof(*req);
1292
1293 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1294 req->wr.wr_lo = 0;
1295 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1296 req->local_port = inp->inp_lport;
1297 req->peer_port = inp->inp_fport;
1298 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1299 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1300 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1301 V_TX_CHANNEL(e->smt_idx));
1302 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1303 req->params = 0;
1304 req->opt2 = htonl(calc_opt2(so, tdev));
1305}
1306
1307
1308/*
1309 * Convert an ACT_OPEN_RPL status to an errno.
1310 */
1311static int
1312act_open_rpl_status_to_errno(int status)
1313{
1314 switch (status) {
1315 case CPL_ERR_CONN_RESET:
1316 return (ECONNREFUSED);
1317 case CPL_ERR_ARP_MISS:
1318 return (EHOSTUNREACH);
1319 case CPL_ERR_CONN_TIMEDOUT:
1320 return (ETIMEDOUT);
1321 case CPL_ERR_TCAM_FULL:
1322 return (ENOMEM);
1323 case CPL_ERR_CONN_EXIST:
1324 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1325 return (EADDRINUSE);
1326 default:
1327 return (EIO);
1328 }
1329}
1330
1331static void
1332fail_act_open(struct toepcb *toep, int errno)
1333{
1334 struct tcpcb *tp = toep->tp_tp;
1335
1336 t3_release_offload_resources(toep);
1337 if (tp) {
1338 INP_LOCK_ASSERT(tp->t_inpcb);
1339 tcp_drop(tp, errno);
1340 }
1341
1342#ifdef notyet
1343 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1344#endif
1345}
1346
1347/*
1348 * Handle active open failures.
1349 */
1350static void
1351active_open_failed(struct toepcb *toep, struct mbuf *m)
1352{
1353 struct cpl_act_open_rpl *rpl = cplhdr(m);
1354 struct inpcb *inp;
1355
1356 INP_INFO_WLOCK(&tcbinfo);
1357 if (toep->tp_tp == NULL)
1358 goto done;
1359
1360 inp = toep->tp_tp->t_inpcb;
1361 INP_LOCK(inp);
1362
1363/*
1364 * Don't handle connection retry for now
1365 */
1366#ifdef notyet
1367 struct inet_connection_sock *icsk = inet_csk(sk);
1368
1369 if (rpl->status == CPL_ERR_CONN_EXIST &&
1370 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1371 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1372 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1373 jiffies + HZ / 2);
1374 } else
1375#endif
1376 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1377 INP_UNLOCK(inp);
1378done:
1379 INP_INFO_WUNLOCK(&tcbinfo);
1380
1381 m_free(m);
1382}
1383
1384/*
1385 * Return whether a failed active open has allocated a TID
1386 */
1387static inline int
1388act_open_has_tid(int status)
1389{
1390 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1391 status != CPL_ERR_ARP_MISS;
1392}
1393
1394/*
1395 * Process an ACT_OPEN_RPL CPL message.
1396 */
1397static int
1398do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1399{
1400 struct toepcb *toep = (struct toepcb *)ctx;
1401 struct cpl_act_open_rpl *rpl = cplhdr(m);
1402
1403 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1404 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1405
1406 active_open_failed(toep, m);
1407 return (0);
1408}
1409
1410/*
1411 * Handle an ARP failure for an active open. XXX purge ofo queue
1412 *
1413 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1414 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1415 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1416 * free the atid. Hmm.
1417 */
1418#ifdef notyet
1419static void
1420act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1421{
1422 struct toepcb *toep = m_get_toep(m);
1423 struct tcpcb *tp = toep->tp_tp;
1424 struct inpcb *inp = tp->t_inpcb;
1425 struct socket *so = toeptoso(toep);
1426
1427 INP_LOCK(inp);
1428 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1429 fail_act_open(so, EHOSTUNREACH);
1430 printf("freeing %p\n", m);
1431
1432 m_free(m);
1433 }
1434 INP_UNLOCK(inp);
1435}
1436#endif
1437/*
1438 * Send an active open request.
1439 */
1440int
1441t3_connect(struct toedev *tdev, struct socket *so,
1442 struct rtentry *rt, struct sockaddr *nam)
1443{
1444 struct mbuf *m;
1445 struct l2t_entry *e;
1446 struct tom_data *d = TOM_DATA(tdev);
1447 struct inpcb *inp = sotoinpcb(so);
1448 struct tcpcb *tp = intotcpcb(inp);
1449 struct toepcb *toep; /* allocated by init_offload_socket */
1450
1451 int atid;
1452
1453 toep = toepcb_alloc();
1454 if (toep == NULL)
1455 goto out_err;
1456
1457 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1458 goto out_err;
1459
1460 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1461 if (!e)
1462 goto free_tid;
1463
1464 INP_LOCK_ASSERT(inp);
1465	m = m_gethdr(M_WAITOK, MT_DATA);
1466
1467#if 0
1468 m->m_toe.mt_toepcb = tp->t_toe;
1469 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1470#endif
1471 SOCK_LOCK(so);
1472
1473 init_offload_socket(so, tdev, atid, e, rt, toep);
1474
1475 install_offload_ops(so);
1476
1477 mk_act_open_req(so, m, atid, e);
1478 SOCK_UNLOCK(so);
1479
1480 soisconnecting(so);
1481 toep = tp->t_toe;
1482 m_set_toep(m, tp->t_toe);
1483
1484 toep->tp_state = TCPS_SYN_SENT;
1485 l2t_send(d->cdev, (struct mbuf *)m, e);
1486
1487 if (toep->tp_ulp_mode)
1488 t3_enable_ddp(so, 0);
1489 return (0);
1490
1491free_tid:
1492 printf("failing connect - free atid\n");
1493
1494 free_atid(d->cdev, atid);
1495out_err:
1496 printf("return ENOMEM\n");
1497 return (ENOMEM);
1498}
1499
1500/*
1501 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1502 * not send multiple ABORT_REQs for the same connection and also that we do
1503 * not try to send a message after the connection has closed. Requests for
1504 * connections still in SYN_SENT are deferred on the out-of-order queue.
1505 */
1506static void
1507t3_send_reset(struct toepcb *toep)
1508{
1509
1510 struct cpl_abort_req *req;
1511 unsigned int tid = toep->tp_tid;
1512 int mode = CPL_ABORT_SEND_RST;
1513 struct tcpcb *tp = toep->tp_tp;
1514 struct toedev *tdev = toep->tp_toedev;
1515 struct socket *so = NULL;
1516 struct mbuf *m;
1517
1518 if (tp) {
1519 INP_LOCK_ASSERT(tp->t_inpcb);
1520 so = toeptoso(toep);
1521 }
1522
1523 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1524 tdev == NULL))
1525 return;
1526 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1527
1528 /* Purge the send queue so we don't send anything after an abort. */
1529 if (so)
1530 sbflush(&so->so_snd);
1531 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1532 mode |= CPL_ABORT_POST_CLOSE_REQ;
1533
1534 m = m_gethdr_nofail(sizeof(*req));
1535 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1536 set_arp_failure_handler(m, abort_arp_failure);
1537
1538 req = mtod(m, struct cpl_abort_req *);
1539 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1540 req->wr.wr_lo = htonl(V_WR_TID(tid));
1541 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1542 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1543 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1544 req->cmd = mode;
1545 if (tp && (tp->t_state == TCPS_SYN_SENT))
1546		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
1547 else
1548 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1549}
1550
1551static int
1552t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1553{
1554 struct inpcb *inp;
1555 int error, optval;
1556
1557 if (sopt->sopt_name == IP_OPTIONS)
1558 return (ENOPROTOOPT);
1559
1560 if (sopt->sopt_name != IP_TOS)
1561 return (EOPNOTSUPP);
1562
1563 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1564
1565 if (error)
1566 return (error);
1567
1568 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1569 return (EPERM);
1570
1571 inp = sotoinpcb(so);
1572 inp->inp_ip_tos = optval;
1573
1574 t3_set_tos(so);
1575
1576 return (0);
1577}
1578
1579static int
1580t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1581{
1582 int err = 0;
1583 size_t copied;
1584
1585 if (sopt->sopt_name != TCP_CONGESTION &&
1586 sopt->sopt_name != TCP_NODELAY)
1587 return (EOPNOTSUPP);
1588
1589 if (sopt->sopt_name == TCP_CONGESTION) {
1590 char name[TCP_CA_NAME_MAX];
1591 int optlen = sopt->sopt_valsize;
1592 struct tcpcb *tp;
1593
1594 if (optlen < 1)
1595 return (EINVAL);
1596
1597 err = copyinstr(sopt->sopt_val, name,
1598 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1599 if (err)
1600 return (err);
1601 if (copied < 1)
1602 return (EINVAL);
1603
1604 tp = sototcpcb(so);
1605 /*
1606 * XXX I need to revisit this
1607 */
1608 if ((err = t3_set_cong_control(so, name)) == 0) {
1609#ifdef CONGESTION_CONTROL_SUPPORTED
1610 tp->t_cong_control = strdup(name, M_CXGB);
1611#endif
1612 } else
1613 return (err);
1614 } else {
1615 int optval, oldval;
1616 struct inpcb *inp;
1617 struct tcpcb *tp;
1618
1619 err = sooptcopyin(sopt, &optval, sizeof optval,
1620 sizeof optval);
1621
1622 if (err)
1623 return (err);
1624
1625 inp = sotoinpcb(so);
1626 tp = intotcpcb(inp);
1627
1628 INP_LOCK(inp);
1629
1630 oldval = tp->t_flags;
1631 if (optval)
1632 tp->t_flags |= TF_NODELAY;
1633 else
1634 tp->t_flags &= ~TF_NODELAY;
1635 INP_UNLOCK(inp);
1636
1637 if (oldval != tp->t_flags)
1638 t3_set_nagle(so);
1639
1640 }
1641
1642 return (0);
1643}
1644
1645static int
1646t3_ctloutput(struct socket *so, struct sockopt *sopt)
1647{
1648 int err;
1649
1650 if (sopt->sopt_level != IPPROTO_TCP)
1651 err = t3_ip_ctloutput(so, sopt);
1652 else
1653 err = t3_tcp_ctloutput(so, sopt);
1654
1655 if (err != EOPNOTSUPP)
1656 return (err);
1657
1658 return (tcp_ctloutput(so, sopt));
1659}
1660
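From userland, these handlers sit underneath ordinary setsockopt(2) calls. A minimal sketch exercising the two TCP-level options t3_tcp_ctloutput() intercepts (error handling omitted for brevity, and "newreno" is only an assumed-available congestion-control name):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	char cc[TCP_CA_NAME_MAX] = "newreno";

	/* Reaches the TCP_NODELAY branch, toggling TF_NODELAY. */
	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
	/* Reaches the TCP_CONGESTION branch, naming an algorithm. */
	setsockopt(s, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc) + 1);
	return (0);
}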
1661/*
1662 * Returns true if we need to explicitly request RST when we receive new data
1663 * on an RX-closed connection.
1664 */
1665static inline int
1666need_rst_on_excess_rx(const struct toepcb *toep)
1667{
1668 return (1);
1669}
1670
1671/*
1672 * Handles Rx data that arrives in a state where the socket isn't accepting
1673 * new data.
1674 */
1675static void
1676handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1677{
1678
1679 if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1680 t3_send_reset(toep);
1681 m_freem(m);
1682}
1683
1684/*
1685 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1686 * by getting the DDP offset from the TCB.
1687 */
1688static void
1689tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1690{
1691 struct ddp_state *q = &toep->tp_ddp_state;
1692 struct ddp_buf_state *bsp;
1693 struct cpl_get_tcb_rpl *hdr;
1694 unsigned int ddp_offset;
1695 struct socket *so;
1696 struct tcpcb *tp;
1697
1698 uint64_t t;
1699 __be64 *tcb;
1700
1701 so = toeptoso(toep);
1702 tp = toep->tp_tp;
1703
1704 INP_LOCK_ASSERT(tp->t_inpcb);
1705 SOCKBUF_LOCK(&so->so_rcv);
1706
1707	/* Note that we only account for CPL_GET_TCB issued by the DDP code. We
1708 * really need a cookie in order to dispatch the RPLs.
1709 */
1710 q->get_tcb_count--;
1711
1712	/* It is possible that a previous CPL already invalidated UBUF DDP
1713	 * and moved the cur_buf idx, and hence no further processing of this
1714	 * mbuf is required. However, the app might be sleeping on
1715 * !q->get_tcb_count and we need to wake it up.
1716 */
1717 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1718 struct socket *so = toeptoso(toep);
1719
1720 m_freem(m);
1721 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
1722 sorwakeup_locked(so);
1723 else
1724 SOCKBUF_UNLOCK(&so->so_rcv);
1725 return;
1726 }
1727
1728 bsp = &q->buf_state[q->cur_buf];
1729 hdr = cplhdr(m);
1730 tcb = (__be64 *)(hdr + 1);
1731 if (q->cur_buf == 0) {
1732 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1733 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1734 } else {
1735 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1736 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1737 }
1738 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1739 m->m_cur_offset = bsp->cur_offset;
1740 bsp->cur_offset = ddp_offset;
1741 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1742
1743 CTR5(KTR_TOM,
1744 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1745 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1746 KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
1747 ddp_offset, m->m_cur_offset));
1748
1749#ifdef T3_TRACE
1750 T3_TRACE3(TIDTB(so),
1751 "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
1752 tp->rcv_nxt, q->cur_buf, ddp_offset);
1753#endif
1754
1755#if 0
1756{
1757 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1758
1759 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1760 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1761
1762 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1763 rcv_nxt = t >> S_TCB_RCV_NXT;
1764 rcv_nxt &= M_TCB_RCV_NXT;
1765
1766 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1767 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1768 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1769
1770 T3_TRACE2(TIDTB(sk),
1771 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1772 ddp_flags, rcv_nxt - rx_hdr_offset);
1773 T3_TRACE4(TB(q),
1774 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1775 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1776 T3_TRACE3(TB(q),
1777 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1778 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1779 T3_TRACE2(TB(q),
1780 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1781 q->buf_state[0].flags, q->buf_state[1].flags);
1782
1783}
1784#endif
1785 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1786 handle_excess_rx(toep, m);
1787 return;
1788 }
1789
1790#ifdef T3_TRACE
1791 if ((int)m->m_pkthdr.len < 0) {
1792 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1793 }
1794#endif
1795 if (bsp->flags & DDP_BF_NOCOPY) {
1796#ifdef T3_TRACE
1797 T3_TRACE0(TB(q),
1798 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1799
1800 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1801 printk("!cancel_ubuf");
1802 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1803 }
1804#endif
1805 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1806 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1807 q->cur_buf ^= 1;
1808 } else if (bsp->flags & DDP_BF_NOFLIP) {
1809
1810 m->m_ddp_flags = 1; /* always a kernel buffer */
1811
1812 /* now HW buffer carries a user buffer */
1813 bsp->flags &= ~DDP_BF_NOFLIP;
1814 bsp->flags |= DDP_BF_NOCOPY;
1815
1816 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1817 * any new data in which case we're done. If in addition the
1818 * offset is 0, then there wasn't a completion for the kbuf
1819 * and we need to decrement the posted count.
1820 */
1821 if (m->m_pkthdr.len == 0) {
1822 if (ddp_offset == 0) {
1823 q->kbuf_posted--;
1824 bsp->flags |= DDP_BF_NODATA;
1825 }
1826 SOCKBUF_UNLOCK(&so->so_rcv);
1827
1828 m_free(m);
1829 return;
1830 }
1831 } else {
1832 SOCKBUF_UNLOCK(&so->so_rcv);
1833 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1834 * but it got here way late and nobody cares anymore.
1835 */
1836 m_free(m);
1837 return;
1838 }
1839
1840 m->m_ddp_gl = (unsigned char *)bsp->gl;
1841 m->m_flags |= M_DDP;
1842 m->m_seq = tp->rcv_nxt;
1843 tp->rcv_nxt += m->m_pkthdr.len;
1844 tp->t_rcvtime = ticks;
1845#ifdef T3_TRACE
1846 T3_TRACE3(TB(q),
1847 "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
1848 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1849#endif
1850 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1851 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1852 if (m->m_pkthdr.len == 0)
1853 q->user_ddp_pending = 0;
1854 else
1855 SBAPPEND(&so->so_rcv, m);
1856 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
1857 sorwakeup_locked(so);
1858 else
1859 SOCKBUF_UNLOCK(&so->so_rcv);
1860}
1861
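The tcb[(31 - W) / 2] indexing above encodes how a GET_TCB reply lays out the 128-byte TCB: 32 32-bit words delivered in reverse word order and read back as 16 big-endian 64-bit slots, with each word landing in either the high or the low half of its slot (hence the 32 + S_... versus plain S_... shifts). A hedged standalone helper for the extraction (word number 23 below is an arbitrary placeholder, not a real TCB field):

#include <stdint.h>
#include <stdio.h>
#include <sys/endian.h>	/* be64toh()/htobe64() on FreeBSD */

/* Fetch 32-bit TCB word w from a reply; 'high' picks the slot half. */
static uint32_t
tcb_word(const uint64_t *tcb, unsigned int w, int high)
{
	uint64_t t = be64toh(tcb[(31 - w) / 2]);

	return ((uint32_t)(high ? (t >> 32) : t));
}

int
main(void)
{
	uint64_t tcb[16] = { 0 };

	tcb[4] = htobe64(0x1122334455667788ULL);	/* (31 - 23) / 2 == 4 */
	/* Prints "0x11223344 0x55667788". */
	printf("0x%x 0x%x\n", tcb_word(tcb, 23, 1), tcb_word(tcb, 23, 0));
	return (0);
}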
1862/*
1863 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1864 * in that case they are similar to DDP completions.
1865 */
1866static int
1867do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1868{
1869 struct toepcb *toep = (struct toepcb *)ctx;
1870
1871 /* OK if socket doesn't exist */
1872 if (toep == NULL) {
1873 printf("null toep in do_get_tcb_rpl\n");
1874 return (CPL_RET_BUF_DONE);
1875 }
1876
1877 INP_LOCK(toep->tp_tp->t_inpcb);
1878 tcb_rpl_as_ddp_complete(toep, m);
1879 INP_UNLOCK(toep->tp_tp->t_inpcb);
1880
1881 return (0);
1882}
1883
1884static void
1885handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1886{
1887 struct tcpcb *tp = toep->tp_tp;
1888 struct socket *so = toeptoso(toep);
1889 struct ddp_state *q;
1890 struct ddp_buf_state *bsp;
1891 struct cpl_rx_data *hdr = cplhdr(m);
1892 unsigned int rcv_nxt = ntohl(hdr->seq);
1893
1894 if (tp->rcv_nxt == rcv_nxt)
1895 return;
1896
1897 INP_LOCK_ASSERT(tp->t_inpcb);
1898 SOCKBUF_LOCK(&so->so_rcv);
1899 q = &toep->tp_ddp_state;
1900 bsp = &q->buf_state[q->cur_buf];
1901	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not ahead of tp->rcv_nxt=0x%08x",
1902	    rcv_nxt, tp->rcv_nxt));
1903 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1904 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1905 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1906 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1907
1908#ifdef T3_TRACE
1909 if ((int)m->m_pkthdr.len < 0) {
1910 t3_ddp_error(so, "handle_ddp_data: neg len");
1911 }
1912#endif
1913
1914 m->m_ddp_gl = (unsigned char *)bsp->gl;
1915 m->m_flags |= M_DDP;
1916 m->m_cur_offset = bsp->cur_offset;
1917 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
1918 if (bsp->flags & DDP_BF_NOCOPY)
1919 bsp->flags &= ~DDP_BF_NOCOPY;
1920
1921 m->m_seq = tp->rcv_nxt;
1922 tp->rcv_nxt = rcv_nxt;
1923 bsp->cur_offset += m->m_pkthdr.len;
1924 if (!(bsp->flags & DDP_BF_NOFLIP))
1925 q->cur_buf ^= 1;
1926 /*
1927 * For now, don't re-enable DDP after a connection fell out of DDP
1928 * mode.
1929 */
1930 q->ubuf_ddp_ready = 0;
1931 SOCKBUF_UNLOCK(&so->so_rcv);
1932}
1933
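The SEQ_GT() assertion above depends on TCP's modular sequence space: sequence numbers wrap at 2^32, so ordering is decided by a signed 32-bit subtraction rather than a plain comparison. The comparison macros boil down to the following (a standalone sketch in the style of netinet/tcp_seq.h):

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe TCP sequence comparisons, in the style of netinet/tcp_seq.h. */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

int
main(void)
{
	uint32_t seq_old = 0xfffffff0U;	/* just before wrap */
	uint32_t seq_new = 0x00000010U;	/* just after wrap */

	/* Prints "1 1": seq_new is still "greater" despite the wrap. */
	printf("%d %d\n", SEQ_GT(seq_new, seq_old), SEQ_LT(seq_old, seq_new));
	return (0);
}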
1934/*
1935 * Process new data received for a connection.
1936 */
1937static void
1938new_rx_data(struct toepcb *toep, struct mbuf *m)
1939{
1940 struct cpl_rx_data *hdr = cplhdr(m);
1941 struct tcpcb *tp = toep->tp_tp;
1942 struct socket *so = toeptoso(toep);
1943 int len = be16toh(hdr->len);
1944
1945 INP_LOCK(tp->t_inpcb);
1946
1947 if (__predict_false(so_no_receive(so))) {
1948 handle_excess_rx(toep, m);
1949 INP_UNLOCK(tp->t_inpcb);
1950 TRACE_EXIT;
1951 return;
1952 }
1953
1954 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
1955 handle_ddp_data(toep, m);
1956
1957 m->m_seq = ntohl(hdr->seq);
1958 m->m_ulp_mode = 0; /* for iSCSI */
1959
1960#if VALIDATE_SEQ
1961 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
1962 log(LOG_ERR,
1963 "%s: TID %u: Bad sequence number %u, expected %u\n",
1964 TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
1965 tp->rcv_nxt);
1966 m_freem(m);
1967 INP_UNLOCK(tp->t_inpcb);
1968 return;
1969 }
1970#endif
1971 m_adj(m, sizeof(*hdr));
1972
1973#ifdef URGENT_DATA_SUPPORTED
1974 /*
1975 * We don't handle urgent data yet
1976 */
1977 if (__predict_false(hdr->urg))
1978 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
1979 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
1980 tp->urg_seq - tp->rcv_nxt < skb->len))
1981 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
1982 tp->rcv_nxt];
1983#endif
1984 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
1985 toep->tp_delack_mode = hdr->dack_mode;
1986 toep->tp_delack_seq = tp->rcv_nxt;
1987 }
1988 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
1989 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
1990
1991 if (len < m->m_pkthdr.len)
1992 m->m_pkthdr.len = m->m_len = len;
1993
1994 tp->rcv_nxt += m->m_pkthdr.len;
1995 tp->t_rcvtime = ticks;
1996 toep->tp_enqueued_bytes += m->m_pkthdr.len;
1997#ifdef T3_TRACE
1998 T3_TRACE2(TIDTB(sk),
1999 "new_rx_data: seq 0x%x len %u",
2000 m->m_seq, m->m_pkthdr.len);
2001#endif
2002 INP_UNLOCK(tp->t_inpcb);
2003 SOCKBUF_LOCK(&so->so_rcv);
2004 if (sb_notify(&so->so_rcv))
2005 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
2006
2007 SBAPPEND(&so->so_rcv, m);
2008
2009#ifdef notyet
2010 /*
2011	 * We're giving too many credits to the card, but this check is disabled for now so we can keep moving.
2012 *
2013 */
2014 KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
2015
2016 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2017 so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
2018#endif
2019
2020
2021 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2022 so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
2023
2024 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2025 sorwakeup_locked(so);
2026 else
2027 SOCKBUF_UNLOCK(&so->so_rcv);
2028}
2029
2030/*
2031 * Handler for RX_DATA CPL messages.
2032 */
2033static int
2034do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2035{
2036 struct toepcb *toep = (struct toepcb *)ctx;
2037
2038 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2039
2040 new_rx_data(toep, m);
2041
2042 return (0);
2043}
2044
2045static void
2046new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2047{
2048 struct tcpcb *tp;
2049 struct ddp_state *q;
2050 struct ddp_buf_state *bsp;
2051 struct cpl_rx_data_ddp *hdr;
2052 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2053 struct socket *so = toeptoso(toep);
2054 int nomoredata = 0;
903/*
904 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
905 */
906static void
907t3_set_tos(struct socket *so)
908{
909 t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
910 V_TCB_TOS(SO_TOS(so)));
911}
912
913
914/*
915 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
916 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
917 * set the PSH bit in the last segment, which would trigger delivery.]
918 * We work around the issue by setting a DDP buffer in a partial placed state,
919 * which guarantees that TP will schedule a timer.
920 */
921#define TP_DDP_TIMER_WORKAROUND_MASK\
922 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
923 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
924 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
925#define TP_DDP_TIMER_WORKAROUND_VAL\
926 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
927 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
928 32))
929
930static void
931t3_enable_ddp(struct socket *so, int on)
932{
933 if (on) {
934
935 t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
936 V_TF_DDP_OFF(0));
937 } else
938 t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
939 V_TF_DDP_OFF(1) |
940 TP_DDP_TIMER_WORKAROUND_MASK,
941 V_TF_DDP_OFF(1) |
942 TP_DDP_TIMER_WORKAROUND_VAL);
943
944}
945
946void
947t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
948{
949 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
950 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
951 tag_color);
952}
953
954void
955t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
956 unsigned int len)
957{
958 if (buf_idx == 0)
959 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
960 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
961 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
962 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
963 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
964 else
965 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
966 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
967 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
968 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
969 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
970}
971
972static int
973t3_set_cong_control(struct socket *so, const char *name)
974{
975#ifdef CONGESTION_CONTROL_SUPPORTED
976 int cong_algo;
977
978 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
979 if (!strcmp(name, t3_cong_ops[cong_algo].name))
980 break;
981
982 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
983 return -EINVAL;
984#endif
985 return 0;
986}
987
988int
989t3_get_tcb(struct socket *so)
990{
991 struct cpl_get_tcb *req;
992 struct tcpcb *tp = sototcpcb(so);
993 struct toepcb *toep = tp->t_toe;
994 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
995
996 if (!m)
997 return (ENOMEM);
998
999 INP_LOCK_ASSERT(tp->t_inpcb);
1000 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1001 req = mtod(m, struct cpl_get_tcb *);
1002 m->m_pkthdr.len = m->m_len = sizeof(*req);
1003 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1004 req->wr.wr_lo = 0;
1005 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1006 req->cpuno = htons(toep->tp_qset);
1007 req->rsvd = 0;
1008 if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
1009 mbufq_tail(&toep->out_of_order_queue, m); // defer
1010 else
1011 cxgb_ofld_send(T3C_DEV(so), m);
1012 return 0;
1013}
1014
1015static inline void
1016so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
1017{
1018 struct toepcb *toep = sototoep(so);
1019 toepcb_hold(toep);
1020
1021 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1022}
1023
1024/**
1025 * find_best_mtu - find the entry in the MTU table closest to an MTU
1026 * @d: TOM state
1027 * @mtu: the target MTU
1028 *
1029 * Returns the index of the value in the MTU table that is closest to but
1030 * does not exceed the target MTU.
1031 */
1032static unsigned int
1033find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1034{
1035 int i = 0;
1036
1037 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1038 ++i;
1039 return (i);
1040}
1041
1042static unsigned int
1043select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1044{
1045 unsigned int idx;
1046
1047#ifdef notyet
1048 struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
1049#endif
1050 if (tp) {
1051 tp->t_maxseg = pmtu - 40;
1052 if (tp->t_maxseg < td->mtus[0] - 40)
1053 tp->t_maxseg = td->mtus[0] - 40;
1054 idx = find_best_mtu(td, tp->t_maxseg + 40);
1055
1056 tp->t_maxseg = td->mtus[idx] - 40;
1057 } else
1058 idx = find_best_mtu(td, pmtu);
1059
1060 return (idx);
1061}
1062
1063static inline void
1064free_atid(struct t3cdev *cdev, unsigned int tid)
1065{
1066 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1067
1068 if (toep)
1069 toepcb_release(toep);
1070}
1071
1072/*
1073 * Release resources held by an offload connection (TID, L2T entry, etc.)
1074 */
1075static void
1076t3_release_offload_resources(struct toepcb *toep)
1077{
1078 struct tcpcb *tp = toep->tp_tp;
1079 struct toedev *tdev = toep->tp_toedev;
1080 struct t3cdev *cdev;
1081 unsigned int tid = toep->tp_tid;
1082
1083 if (!tdev)
1084 return;
1085
1086 cdev = TOEP_T3C_DEV(toep);
1087 if (!cdev)
1088 return;
1089
1090 toep->tp_qset = 0;
1091 t3_release_ddp_resources(toep);
1092
1093#ifdef CTRL_SKB_CACHE
1094 kfree_skb(CTRL_SKB_CACHE(tp));
1095 CTRL_SKB_CACHE(tp) = NULL;
1096#endif
1097
1098 if (toep->tp_wr_avail != toep->tp_wr_max) {
1099 purge_wr_queue(toep);
1100 reset_wr_list(toep);
1101 }
1102
1103 if (toep->tp_l2t) {
1104 l2t_release(L2DATA(cdev), toep->tp_l2t);
1105 toep->tp_l2t = NULL;
1106 }
1107 toep->tp_tp = NULL;
1108 if (tp) {
1109 INP_LOCK_ASSERT(tp->t_inpcb);
1110 tp->t_toe = NULL;
1111 tp->t_flags &= ~TF_TOE;
1112 }
1113
1114 if (toep->tp_state == TCPS_SYN_SENT) {
1115 free_atid(cdev, tid);
1116#ifdef notyet
1117 __skb_queue_purge(&tp->out_of_order_queue);
1118#endif
1119 } else { // we have TID
1120 cxgb_remove_tid(cdev, toep, tid);
1121 toepcb_release(toep);
1122 }
1123#if 0
1124 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1125#endif
1126}
1127
1128static void
1129install_offload_ops(struct socket *so)
1130{
1131 struct tcpcb *tp = sototcpcb(so);
1132
1133 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1134
1135 t3_install_socket_ops(so);
1136 tp->t_flags |= TF_TOE;
1137 tp->t_tu = &cxgb_toe_usrreqs;
1138}
1139
1140/*
1141 * Determine the receive window scaling factor given a target max
1142 * receive window.
1143 */
1144static __inline int
1145select_rcv_wscale(int space)
1146{
1147 int wscale = 0;
1148
1149 if (space > MAX_RCV_WND)
1150 space = MAX_RCV_WND;
1151
1152 if (tcp_do_rfc1323)
1153 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1154
1155 return (wscale);
1156}
1157
1158/*
1159 * Determine the receive window size for a socket.
1160 */
1161static unsigned long
1162select_rcv_wnd(struct toedev *dev, struct socket *so)
1163{
1164 struct tom_data *d = TOM_DATA(dev);
1165 unsigned int wnd;
1166 unsigned int max_rcv_wnd;
1167
1168 if (tcp_do_autorcvbuf)
1169 wnd = tcp_autorcvbuf_max;
1170 else
1171 wnd = so->so_rcv.sb_hiwat;
1172
1173
1174
1175 /* XXX
1176 * For receive coalescing to work effectively we need a receive window
1177 * that can accomodate a coalesced segment.
1178 */
1179 if (wnd < MIN_RCV_WND)
1180 wnd = MIN_RCV_WND;
1181
1182 /* PR 5138 */
1183 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1184 (uint32_t)d->rx_page_size * 23 :
1185 MAX_RCV_WND);
1186
1187 return min(wnd, max_rcv_wnd);
1188}
1189
1190/*
1191 * Assign offload parameters to some socket fields. This code is used by
1192 * both active and passive opens.
1193 */
1194static inline void
1195init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1196 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1197{
1198 struct tcpcb *tp = sototcpcb(so);
1199 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1200
1201 SOCK_LOCK_ASSERT(so);
1202
1203 printf("initializing offload socket\n");
1204 /*
1205 * We either need to fix push frames to work with sbcompress
1206 * or we need to add this
1207 */
1208 so->so_snd.sb_flags |= SB_NOCOALESCE;
1209 so->so_rcv.sb_flags |= SB_NOCOALESCE;
1210
1211 tp->t_toe = toep;
1212 toep->tp_tp = tp;
1213 toep->tp_toedev = dev;
1214
1215 toep->tp_tid = tid;
1216 toep->tp_l2t = e;
1217 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1218 toep->tp_wr_unacked = 0;
1219 toep->tp_delack_mode = 0;
1220
1221 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1222 /*
1223 * XXX broken
1224 *
1225 */
1226 tp->rcv_wnd = select_rcv_wnd(dev, so);
1227
1228 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
1229 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1230 toep->tp_qset_idx = 0;
1231
1232 reset_wr_list(toep);
1233 DPRINTF("initialization done\n");
1234}
1235
1236/*
1237 * The next two functions calculate the option 0 value for a socket.
1238 */
1239static inline unsigned int
1240calc_opt0h(struct socket *so, int mtu_idx)
1241{
1242 struct tcpcb *tp = sototcpcb(so);
1243 int wscale = select_rcv_wscale(tp->rcv_wnd);
1244
1245 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1246 V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1247 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1248}
1249
1250static inline unsigned int
1251calc_opt0l(struct socket *so, int ulp_mode)
1252{
1253 struct tcpcb *tp = sototcpcb(so);
1254 unsigned int val;
1255
1256 val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
1257 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1258
1259 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
1260 return (val);
1261}
1262
1263static inline unsigned int
1264calc_opt2(const struct socket *so, struct toedev *dev)
1265{
1266 int flv_valid;
1267
1268 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1269
1270 return (V_FLAVORS_VALID(flv_valid) |
1271 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1272}
1273
1274#if DEBUG_WR > 1
1275static int
1276count_pending_wrs(const struct toepcb *toep)
1277{
1278 const struct mbuf *m;
1279 int n = 0;
1280
1281 wr_queue_walk(toep, m)
1282 n += m->m_pkthdr.csum_data;
1283 return (n);
1284}
1285#endif
1286
1287#if 0
1288(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1289#endif
1290
1291static void
1292mk_act_open_req(struct socket *so, struct mbuf *m,
1293 unsigned int atid, const struct l2t_entry *e)
1294{
1295 struct cpl_act_open_req *req;
1296 struct inpcb *inp = sotoinpcb(so);
1297 struct tcpcb *tp = intotcpcb(inp);
1298 struct toepcb *toep = tp->t_toe;
1299 struct toedev *tdev = TOE_DEV(so);
1300
1301 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1302
1303 req = mtod(m, struct cpl_act_open_req *);
1304 m->m_pkthdr.len = m->m_len = sizeof(*req);
1305
1306 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1307 req->wr.wr_lo = 0;
1308 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1309 req->local_port = inp->inp_lport;
1310 req->peer_port = inp->inp_fport;
1311 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1312 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1313 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1314 V_TX_CHANNEL(e->smt_idx));
1315 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1316 req->params = 0;
1317 req->opt2 = htonl(calc_opt2(so, tdev));
1318}
1319
1320
1321/*
1322 * Convert an ACT_OPEN_RPL status to an errno.
1323 */
1324static int
1325act_open_rpl_status_to_errno(int status)
1326{
1327 switch (status) {
1328 case CPL_ERR_CONN_RESET:
1329 return (ECONNREFUSED);
1330 case CPL_ERR_ARP_MISS:
1331 return (EHOSTUNREACH);
1332 case CPL_ERR_CONN_TIMEDOUT:
1333 return (ETIMEDOUT);
1334 case CPL_ERR_TCAM_FULL:
1335 return (ENOMEM);
1336 case CPL_ERR_CONN_EXIST:
1337 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1338 return (EADDRINUSE);
1339 default:
1340 return (EIO);
1341 }
1342}
1343
1344static void
1345fail_act_open(struct toepcb *toep, int errno)
1346{
1347 struct tcpcb *tp = toep->tp_tp;
1348
1349 t3_release_offload_resources(toep);
1350 if (tp) {
1351 INP_LOCK_ASSERT(tp->t_inpcb);
1352 tcp_drop(tp, errno);
1353 }
1354
1355#ifdef notyet
1356 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1357#endif
1358}
1359
1360/*
1361 * Handle active open failures.
1362 */
1363static void
1364active_open_failed(struct toepcb *toep, struct mbuf *m)
1365{
1366 struct cpl_act_open_rpl *rpl = cplhdr(m);
1367 struct inpcb *inp;
1368
1369 INP_INFO_WLOCK(&tcbinfo);
1370 if (toep->tp_tp == NULL)
1371 goto done;
1372
1373 inp = toep->tp_tp->t_inpcb;
1374 INP_LOCK(inp);
1375
1376/*
1377 * Don't handle connection retry for now
1378 */
1379#ifdef notyet
1380 struct inet_connection_sock *icsk = inet_csk(sk);
1381
1382 if (rpl->status == CPL_ERR_CONN_EXIST &&
1383 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1384 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1385 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1386 jiffies + HZ / 2);
1387 } else
1388#endif
1389 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1390 INP_UNLOCK(inp);
1391done:
1392 INP_INFO_WUNLOCK(&tcbinfo);
1393
1394 m_free(m);
1395}
1396
1397/*
1398 * Return whether a failed active open has allocated a TID
1399 */
1400static inline int
1401act_open_has_tid(int status)
1402{
1403 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1404 status != CPL_ERR_ARP_MISS;
1405}
1406
1407/*
1408 * Process an ACT_OPEN_RPL CPL message.
1409 */
1410static int
1411do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1412{
1413 struct toepcb *toep = (struct toepcb *)ctx;
1414 struct cpl_act_open_rpl *rpl = cplhdr(m);
1415
1416 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1417 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1418
1419 active_open_failed(toep, m);
1420 return (0);
1421}
1422
1423/*
1424 * Handle an ARP failure for an active open. XXX purge ofo queue
1425 *
1426 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1427 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1428 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1429 * free the atid. Hmm.
1430 */
1431#ifdef notyet
1432static void
1433act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1434{
1435 struct toepcb *toep = m_get_toep(m);
1436 struct tcpcb *tp = toep->tp_tp;
1437 struct inpcb *inp = tp->t_inpcb;
1438 struct socket *so = toeptoso(toep);
1439
1440 INP_LOCK(inp);
1441 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1442 fail_act_open(so, EHOSTUNREACH);
1443 printf("freeing %p\n", m);
1444
1445 m_free(m);
1446 }
1447 INP_UNLOCK(inp);
1448}
1449#endif
1450/*
1451 * Send an active open request.
1452 */
1453int
1454t3_connect(struct toedev *tdev, struct socket *so,
1455 struct rtentry *rt, struct sockaddr *nam)
1456{
1457 struct mbuf *m;
1458 struct l2t_entry *e;
1459 struct tom_data *d = TOM_DATA(tdev);
1460 struct inpcb *inp = sotoinpcb(so);
1461 struct tcpcb *tp = intotcpcb(inp);
1462 struct toepcb *toep; /* allocated by init_offload_socket */
1463
1464 int atid;
1465
1466 toep = toepcb_alloc();
1467 if (toep == NULL)
1468 goto out_err;
1469
1470 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1471 goto out_err;
1472
1473 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1474 if (!e)
1475 goto free_tid;
1476
1477 INP_LOCK_ASSERT(inp);
1478 m = m_gethdr(MT_DATA, M_WAITOK);
1479
1480#if 0
1481 m->m_toe.mt_toepcb = tp->t_toe;
1482 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1483#endif
1484 SOCK_LOCK(so);
1485
1486 init_offload_socket(so, tdev, atid, e, rt, toep);
1487
1488 install_offload_ops(so);
1489
1490 mk_act_open_req(so, m, atid, e);
1491 SOCK_UNLOCK(so);
1492
1493 soisconnecting(so);
1494 toep = tp->t_toe;
1495 m_set_toep(m, tp->t_toe);
1496
1497 toep->tp_state = TCPS_SYN_SENT;
1498 l2t_send(d->cdev, (struct mbuf *)m, e);
1499
1500 if (toep->tp_ulp_mode)
1501 t3_enable_ddp(so, 0);
1502 return (0);
1503
1504free_tid:
1505 printf("failing connect - free atid\n");
1506
1507 free_atid(d->cdev, atid);
1508out_err:
1509 printf("return ENOMEM\n");
1510 return (ENOMEM);
1511}
1512
1513/*
1514 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1515 * not send multiple ABORT_REQs for the same connection and also that we do
1516 * not try to send a message after the connection has closed. Returns 1 if
1517 * an ABORT_REQ wasn't generated after all, 0 otherwise.
1518 */
1519static void
1520t3_send_reset(struct toepcb *toep)
1521{
1522
1523 struct cpl_abort_req *req;
1524 unsigned int tid = toep->tp_tid;
1525 int mode = CPL_ABORT_SEND_RST;
1526 struct tcpcb *tp = toep->tp_tp;
1527 struct toedev *tdev = toep->tp_toedev;
1528 struct socket *so = NULL;
1529 struct mbuf *m;
1530
1531 if (tp) {
1532 INP_LOCK_ASSERT(tp->t_inpcb);
1533 so = toeptoso(toep);
1534 }
1535
1536 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1537 tdev == NULL))
1538 return;
1539 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1540
1541 /* Purge the send queue so we don't send anything after an abort. */
1542 if (so)
1543 sbflush(&so->so_snd);
1544 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1545 mode |= CPL_ABORT_POST_CLOSE_REQ;
1546
1547 m = m_gethdr_nofail(sizeof(*req));
1548 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1549 set_arp_failure_handler(m, abort_arp_failure);
1550
1551 req = mtod(m, struct cpl_abort_req *);
1552 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1553 req->wr.wr_lo = htonl(V_WR_TID(tid));
1554 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1555 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1556 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1557 req->cmd = mode;
1558 if (tp && (tp->t_state == TCPS_SYN_SENT))
1559 mbufq_tail(&toep->out_of_order_queue, m); // defer
1560 else
1561 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1562}
1563
1564static int
1565t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1566{
1567 struct inpcb *inp;
1568 int error, optval;
1569
1570 if (sopt->sopt_name == IP_OPTIONS)
1571 return (ENOPROTOOPT);
1572
1573 if (sopt->sopt_name != IP_TOS)
1574 return (EOPNOTSUPP);
1575
1576 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1577
1578 if (error)
1579 return (error);
1580
1581 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1582 return (EPERM);
1583
1584 inp = sotoinpcb(so);
1585 inp->inp_ip_tos = optval;
1586
1587 t3_set_tos(so);
1588
1589 return (0);
1590}
1591
1592static int
1593t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1594{
1595 int err = 0;
1596 size_t copied;
1597
1598 if (sopt->sopt_name != TCP_CONGESTION &&
1599 sopt->sopt_name != TCP_NODELAY)
1600 return (EOPNOTSUPP);
1601
1602 if (sopt->sopt_name == TCP_CONGESTION) {
1603 char name[TCP_CA_NAME_MAX];
1604 int optlen = sopt->sopt_valsize;
1605 struct tcpcb *tp;
1606
1607 if (optlen < 1)
1608 return (EINVAL);
1609
1610 err = copyinstr(sopt->sopt_val, name,
1611 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1612 if (err)
1613 return (err);
1614 if (copied < 1)
1615 return (EINVAL);
1616
1617 tp = sototcpcb(so);
1618 /*
1619 * XXX I need to revisit this
1620 */
1621 if ((err = t3_set_cong_control(so, name)) == 0) {
1622#ifdef CONGESTION_CONTROL_SUPPORTED
1623 tp->t_cong_control = strdup(name, M_CXGB);
1624#endif
1625 } else
1626 return (err);
1627 } else {
1628 int optval, oldval;
1629 struct inpcb *inp;
1630 struct tcpcb *tp;
1631
1632 err = sooptcopyin(sopt, &optval, sizeof optval,
1633 sizeof optval);
1634
1635 if (err)
1636 return (err);
1637
1638 inp = sotoinpcb(so);
1639 tp = intotcpcb(inp);
1640
1641 INP_LOCK(inp);
1642
1643 oldval = tp->t_flags;
1644 if (optval)
1645 tp->t_flags |= TF_NODELAY;
1646 else
1647 tp->t_flags &= ~TF_NODELAY;
1648 INP_UNLOCK(inp);
1649
1650 if (oldval != tp->t_flags)
1651 t3_set_nagle(so);
1652
1653 }
1654
1655 return (0);
1656}
1657
1658static int
1659t3_ctloutput(struct socket *so, struct sockopt *sopt)
1660{
1661 int err;
1662
1663 if (sopt->sopt_level != IPPROTO_TCP)
1664 err = t3_ip_ctloutput(so, sopt);
1665 else
1666 err = t3_tcp_ctloutput(so, sopt);
1667
1668 if (err != EOPNOTSUPP)
1669 return (err);
1670
1671 return (tcp_ctloutput(so, sopt));
1672}
1673
1674/*
1675 * Returns true if we need to explicitly request RST when we receive new data
1676 * on an RX-closed connection.
1677 */
1678static inline int
1679need_rst_on_excess_rx(const struct toepcb *toep)
1680{
1681 return (1);
1682}
1683
1684/*
1685 * Handles Rx data that arrives in a state where the socket isn't accepting
1686 * new data.
1687 */
1688static void
1689handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1690{
1691
1692 if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1693 t3_send_reset(toep);
1694 m_freem(m);
1695}
1696
1697/*
1698 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1699 * by getting the DDP offset from the TCB.
1700 */
1701static void
1702tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1703{
1704 struct ddp_state *q = &toep->tp_ddp_state;
1705 struct ddp_buf_state *bsp;
1706 struct cpl_get_tcb_rpl *hdr;
1707 unsigned int ddp_offset;
1708 struct socket *so;
1709 struct tcpcb *tp;
1710
1711 uint64_t t;
1712 __be64 *tcb;
1713
1714 so = toeptoso(toep);
1715 tp = toep->tp_tp;
1716
1717 INP_LOCK_ASSERT(tp->t_inpcb);
1718 SOCKBUF_LOCK(&so->so_rcv);
1719
1720 /* Note that we only accout for CPL_GET_TCB issued by the DDP code. We
1721 * really need a cookie in order to dispatch the RPLs.
1722 */
1723 q->get_tcb_count--;
1724
1725 /* It is a possible that a previous CPL already invalidated UBUF DDP
1726 * and moved the cur_buf idx and hence no further processing of this
1727 * skb is required. However, the app might be sleeping on
1728 * !q->get_tcb_count and we need to wake it up.
1729 */
1730 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1731 struct socket *so = toeptoso(toep);
1732
1733 m_freem(m);
1734 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
1735 sorwakeup_locked(so);
1736 else
1737 SOCKBUF_UNLOCK(&so->so_rcv);
1738 return;
1739 }
1740
1741 bsp = &q->buf_state[q->cur_buf];
1742 hdr = cplhdr(m);
1743 tcb = (__be64 *)(hdr + 1);
1744 if (q->cur_buf == 0) {
1745 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1746 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1747 } else {
1748 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1749 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1750 }
1751 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1752 m->m_cur_offset = bsp->cur_offset;
1753 bsp->cur_offset = ddp_offset;
1754 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1755
1756 CTR5(KTR_TOM,
1757 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1758 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1759 KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
1760 ddp_offset, m->m_cur_offset));
1761
1762#ifdef T3_TRACE
1763 T3_TRACE3(TIDTB(so),
1764 "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
1765 tp->rcv_nxt, q->cur_buf, ddp_offset);
1766#endif
1767
1768#if 0
1769{
1770 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1771
1772 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1773 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1774
1775 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1776 rcv_nxt = t >> S_TCB_RCV_NXT;
1777 rcv_nxt &= M_TCB_RCV_NXT;
1778
1779 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1780 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1781 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1782
1783 T3_TRACE2(TIDTB(sk),
1784 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1785 ddp_flags, rcv_nxt - rx_hdr_offset);
1786 T3_TRACE4(TB(q),
1787 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1788 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1789 T3_TRACE3(TB(q),
1790 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1791 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1792 T3_TRACE2(TB(q),
1793 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1794 q->buf_state[0].flags, q->buf_state[1].flags);
1795
1796}
1797#endif
1798 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1799 handle_excess_rx(toep, m);
1800 return;
1801 }
1802
1803#ifdef T3_TRACE
1804 if ((int)m->m_pkthdr.len < 0) {
1805 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1806 }
1807#endif
1808 if (bsp->flags & DDP_BF_NOCOPY) {
1809#ifdef T3_TRACE
1810 T3_TRACE0(TB(q),
1811 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1812
1813 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1814 printk("!cancel_ubuf");
1815 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1816 }
1817#endif
1818 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1819 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1820 q->cur_buf ^= 1;
1821 } else if (bsp->flags & DDP_BF_NOFLIP) {
1822
1823 m->m_ddp_flags = 1; /* always a kernel buffer */
1824
1825 /* now HW buffer carries a user buffer */
1826 bsp->flags &= ~DDP_BF_NOFLIP;
1827 bsp->flags |= DDP_BF_NOCOPY;
1828
1829 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1830 * any new data in which case we're done. If in addition the
1831 * offset is 0, then there wasn't a completion for the kbuf
1832 * and we need to decrement the posted count.
1833 */
1834 if (m->m_pkthdr.len == 0) {
1835 if (ddp_offset == 0) {
1836 q->kbuf_posted--;
1837 bsp->flags |= DDP_BF_NODATA;
1838 }
1839 SOCKBUF_UNLOCK(&so->so_rcv);
1840
1841 m_free(m);
1842 return;
1843 }
1844 } else {
1845 SOCKBUF_UNLOCK(&so->so_rcv);
1846 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1847 * but it got here way late and nobody cares anymore.
1848 */
1849 m_free(m);
1850 return;
1851 }
1852
1853 m->m_ddp_gl = (unsigned char *)bsp->gl;
1854 m->m_flags |= M_DDP;
1855 m->m_seq = tp->rcv_nxt;
1856 tp->rcv_nxt += m->m_pkthdr.len;
1857 tp->t_rcvtime = ticks;
1858#ifdef T3_TRACE
1859 T3_TRACE3(TB(q),
1860 "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
1861 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1862#endif
1863 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1864 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1865 if (m->m_pkthdr.len == 0)
1866 q->user_ddp_pending = 0;
1867 else
1868 SBAPPEND(&so->so_rcv, m);
1869 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
1870 sorwakeup_locked(so);
1871 else
1872 SOCKBUF_UNLOCK(&so->so_rcv);
1873}
1874
1875/*
1876 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1877 * in that case they are similar to DDP completions.
1878 */
1879static int
1880do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1881{
1882 struct toepcb *toep = (struct toepcb *)ctx;
1883
1884 /* OK if socket doesn't exist */
1885 if (toep == NULL) {
1886 printf("null toep in do_get_tcb_rpl\n");
1887 return (CPL_RET_BUF_DONE);
1888 }
1889
1890 INP_LOCK(toep->tp_tp->t_inpcb);
1891 tcb_rpl_as_ddp_complete(toep, m);
1892 INP_UNLOCK(toep->tp_tp->t_inpcb);
1893
1894 return (0);
1895}
1896
1897static void
1898handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1899{
1900 struct tcpcb *tp = toep->tp_tp;
1901 struct socket *so = toeptoso(toep);
1902 struct ddp_state *q;
1903 struct ddp_buf_state *bsp;
1904 struct cpl_rx_data *hdr = cplhdr(m);
1905 unsigned int rcv_nxt = ntohl(hdr->seq);
1906
1907 if (tp->rcv_nxt == rcv_nxt)
1908 return;
1909
1910 INP_LOCK_ASSERT(tp->t_inpcb);
1911 SOCKBUF_LOCK(&so->so_rcv);
1912 q = &toep->tp_ddp_state;
1913 bsp = &q->buf_state[q->cur_buf];
1914 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
1915 rcv_nxt, tp->rcv_nxt));
1916 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1917 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1918 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1919 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1920
1921#ifdef T3_TRACE
1922 if ((int)m->m_pkthdr.len < 0) {
1923 t3_ddp_error(so, "handle_ddp_data: neg len");
1924 }
1925#endif
1926
1927 m->m_ddp_gl = (unsigned char *)bsp->gl;
1928 m->m_flags |= M_DDP;
1929 m->m_cur_offset = bsp->cur_offset;
1930 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
1931 if (bsp->flags & DDP_BF_NOCOPY)
1932 bsp->flags &= ~DDP_BF_NOCOPY;
1933
1934 m->m_seq = tp->rcv_nxt;
1935 tp->rcv_nxt = rcv_nxt;
1936 bsp->cur_offset += m->m_pkthdr.len;
1937 if (!(bsp->flags & DDP_BF_NOFLIP))
1938 q->cur_buf ^= 1;
1939 /*
1940 * For now, don't re-enable DDP after a connection fell out of DDP
1941 * mode.
1942 */
1943 q->ubuf_ddp_ready = 0;
1944 SOCKBUF_UNLOCK(&so->so_rcv);
1945}
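/*
 * Aside (stand-alone sketch, invented name): the flag handoff used in
 * handle_ddp_data() above.  DDP_BF_NOCOPY travels from the buffer
 * state to the mbuf, telling the reader that the payload already sits
 * in user pages, and bit 0 of m_ddp_flags is always set to mark the
 * descriptor valid.
 */
static inline int
example_handoff_ddp_flags(int *bsp_flags)
{
	int mflags = DDP_BF_PSH | (*bsp_flags & DDP_BF_NOCOPY) | 1;

	*bsp_flags &= ~DDP_BF_NOCOPY;	/* consumed by this mbuf */
	return (mflags);
}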
1946
1947/*
1948 * Process new data received for a connection.
1949 */
1950static void
1951new_rx_data(struct toepcb *toep, struct mbuf *m)
1952{
1953 struct cpl_rx_data *hdr = cplhdr(m);
1954 struct tcpcb *tp = toep->tp_tp;
1955 struct socket *so = toeptoso(toep);
1956 int len = be16toh(hdr->len);
1957
1958 INP_LOCK(tp->t_inpcb);
1959
1960 if (__predict_false(so_no_receive(so))) {
1961 handle_excess_rx(toep, m);
1962 INP_UNLOCK(tp->t_inpcb);
1963 TRACE_EXIT;
1964 return;
1965 }
1966
1967 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
1968 handle_ddp_data(toep, m);
1969
1970 m->m_seq = ntohl(hdr->seq);
1971 m->m_ulp_mode = 0; /* for iSCSI */
1972
1973#if VALIDATE_SEQ
1974 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
1975 log(LOG_ERR,
1976 "%s: TID %u: Bad sequence number %u, expected %u\n",
1977 TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
1978 tp->rcv_nxt);
1979 m_freem(m);
1980 INP_UNLOCK(tp->t_inpcb);
1981 return;
1982 }
1983#endif
1984 m_adj(m, sizeof(*hdr));
1985
1986#ifdef URGENT_DATA_SUPPORTED
1987 /*
1988 * We don't handle urgent data yet
1989 */
1990 if (__predict_false(hdr->urg))
1991 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
1992 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
1993 tp->urg_seq - tp->rcv_nxt < skb->len))
1994 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
1995 tp->rcv_nxt];
1996#endif
1997 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
1998 toep->tp_delack_mode = hdr->dack_mode;
1999 toep->tp_delack_seq = tp->rcv_nxt;
2000 }
2001 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2002 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2003
2004 if (len < m->m_pkthdr.len)
2005 m->m_pkthdr.len = m->m_len = len;
2006
2007 tp->rcv_nxt += m->m_pkthdr.len;
2008 tp->t_rcvtime = ticks;
2009 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2010#ifdef T3_TRACE
2011 T3_TRACE2(TIDTB(sk),
2012 "new_rx_data: seq 0x%x len %u",
2013 m->m_seq, m->m_pkthdr.len);
2014#endif
2015 INP_UNLOCK(tp->t_inpcb);
2016 SOCKBUF_LOCK(&so->so_rcv);
2017 if (sb_notify(&so->so_rcv))
2018 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
2019
2020 SBAPPEND(&so->so_rcv, m);
2021
2022#ifdef notyet
2023 /*
2024 * We're giving too many credits to the card, so disable this check for now and keep moving :-|
2025 *
2026 */
2027 KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
2028
2029 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2030 so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
2031#endif
2032
2033
2034 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2035 so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
2036
2037 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2038 sorwakeup_locked(so);
2039 else
2040 SOCKBUF_UNLOCK(&so->so_rcv);
2041}
2042
2043/*
2044 * Handler for RX_DATA CPL messages.
2045 */
2046static int
2047do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2048{
2049 struct toepcb *toep = (struct toepcb *)ctx;
2050
2051 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2052
2053 new_rx_data(toep, m);
2054
2055 return (0);
2056}
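/*
 * Aside (stand-alone sketch): the sequence accounting performed by
 * new_rx_data() above.  After m_adj() strips the CPL header the mbuf
 * may still be longer than the length the CPL reported, so the payload
 * is clipped first and rcv_nxt then advances by exactly the bytes that
 * will be appended to the receive buffer.
 */
static inline uint32_t
example_advance_rcv_nxt(uint32_t rcv_nxt, int cpl_len, int *pktlen)
{
	if (cpl_len < *pktlen)		/* clip trailing slop */
		*pktlen = cpl_len;
	return (rcv_nxt + *pktlen);
}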
2057
2058static void
2059new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2060{
2061 struct tcpcb *tp;
2062 struct ddp_state *q;
2063 struct ddp_buf_state *bsp;
2064 struct cpl_rx_data_ddp *hdr;
2065 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2066 struct socket *so = toeptoso(toep);
2067 int nomoredata = 0;
2068 unsigned int delack_mode;
2069
2070 tp = sototcpcb(so);
2071
2072 INP_LOCK(tp->t_inpcb);
2073 if (__predict_false(so_no_receive(so))) {
2074
2075 handle_excess_rx(toep, m);
2076 INP_UNLOCK(tp->t_inpcb);
2077 return;
2078 }
2079
2080 q = &toep->tp_ddp_state;
2081 hdr = cplhdr(m);
2082 ddp_report = ntohl(hdr->u.ddp_report);
2083 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2084 bsp = &q->buf_state[buf_idx];
2085
2086#ifdef T3_TRACE
2087 T3_TRACE5(TIDTB(sk),
2088 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2089 "hdr seq 0x%x len %u offset %u",
2090 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2091 ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
2092 T3_TRACE1(TIDTB(sk),
2093 "new_rx_data_ddp: ddp_report 0x%x",
2094 ddp_report);
2095#endif
2096 CTR4(KTR_TOM,
2097 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2098 "hdr seq 0x%x len %u",
2099 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2100 ntohs(hdr->len));
2101 CTR3(KTR_TOM,
2102 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2103 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2104
2105 ddp_len = ntohs(hdr->len);
2106 rcv_nxt = ntohl(hdr->seq) + ddp_len;
2107
2108 delack_mode = G_DDP_DACK_MODE(ddp_report);
2109 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2110 toep->tp_delack_mode = delack_mode;
2111 toep->tp_delack_seq = tp->rcv_nxt;
2112 }
2113
2114 m->m_seq = tp->rcv_nxt;
2115 tp->rcv_nxt = rcv_nxt;
2116
2117 tp->t_rcvtime = ticks;
2118 /*
2119 * Store the length in m->m_len. We are changing the meaning of
2120 * m->m_len here, we need to be very careful that nothing from now on
2121 * interprets ->len of this packet the usual way.
2122 */
2123 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2124 INP_UNLOCK(tp->t_inpcb);
2125 CTR3(KTR_TOM,
2126 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2127 m->m_len, rcv_nxt, m->m_seq);
2128 /*
2129 * Figure out where the new data was placed in the buffer and store
2130 * that in m_cur_offset. Assumes the buffer offset starts at 0; the
2131 * consumer needs to account for the page pod's pg_offset.
2132 */
2133 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2134 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2135
2136 SOCKBUF_LOCK(&so->so_rcv);
2137 m->m_ddp_gl = (unsigned char *)bsp->gl;
2138 m->m_flags |= M_DDP;
2139 bsp->cur_offset = end_offset;
2140 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2141
2142 /*
2143 * Length is only meaningful for kbuf
2144 */
2145 if (!(bsp->flags & DDP_BF_NOCOPY))
2146 KASSERT(m->m_len <= bsp->gl->dgl_length,
2147 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2148 m->m_len, bsp->gl->dgl_length));
2149
2150 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2151 KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2152
2153
2154 /*
2155 * Bit 0 of flags stores whether the DDP buffer is completed.
2156 * Note that other parts of the code depend on this being in bit 0.
2157 */
2158 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2159 panic("spurious ddp completion");
2160 } else {
2161 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2162 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2163 q->cur_buf ^= 1; /* flip buffers */
2164 }
2165
2166 if (bsp->flags & DDP_BF_NOCOPY) {
2167 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2168 bsp->flags &= ~DDP_BF_NOCOPY;
2169 }
2170
2171 if (ddp_report & F_DDP_PSH)
2172 m->m_ddp_flags |= DDP_BF_PSH;
2173 if (nomoredata)
2174 m->m_ddp_flags |= DDP_BF_NODATA;
2175
2176#ifdef notyet
2177 skb_reset_transport_header(skb);
2178 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
2179#endif
2180 SBAPPEND(&so->so_rcv, m);
2181
2182 if ((so->so_state & SS_NOFDREF) == 0)
2183 sorwakeup_locked(so);
2184 else
2185 SOCKBUF_UNLOCK(&so->so_rcv);
2186}
2187
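/*
 * Aside (stand-alone sketch): the placement arithmetic used in
 * new_rx_data_ddp() above.  G_DDP_OFFSET() reports where this DDP
 * write started in the buffer; adding the DDP length gives the end,
 * and subtracting the bytes attributed to the mbuf recovers the
 * m_cur_offset recorded for the consumer.
 */
static inline unsigned int
example_ddp_start_offset(unsigned int ddp_report, unsigned int ddp_len,
    unsigned int pktlen)
{
	unsigned int end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;

	return (end_offset - pktlen);	/* becomes m_cur_offset */
}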
2188#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2189 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2190 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2191 F_DDP_INVALID_PPOD)
2192
2193/*
2194 * Handler for RX_DATA_DDP CPL messages.
2195 */
2196static int
2197do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2198{
2199 struct toepcb *toep = ctx;
2200 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2201
2202 VALIDATE_SOCK(so);
2203
2204 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2205 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2206 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2207 return (CPL_RET_BUF_DONE);
2208 }
2209#if 0
2210 skb->h.th = tcphdr_skb->h.th;
2211#endif
2212 new_rx_data_ddp(toep, m);
2213 return (0);
2214}
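/*
 * Aside (stand-alone sketch): do_rx_data_ddp() screens the report
 * before any connection state is touched; a single set bit from the
 * DDP_ERR mask disqualifies the whole message.
 */
static inline int
example_ddp_report_usable(uint32_t ddpvld_status)	/* host order */
{
	return ((ddpvld_status & DDP_ERR) == 0);
}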
2215
2216static void
2217process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2218{
2219 struct tcpcb *tp = toep->tp_tp;
2220 struct socket *so = toeptoso(toep);
2221 struct ddp_state *q;
2222 struct ddp_buf_state *bsp;
2223 struct cpl_rx_ddp_complete *hdr;
2224 unsigned int ddp_report, buf_idx, when, delack_mode;
2225 int nomoredata = 0;
2226
2227 INP_LOCK(tp->t_inpcb);
2228 if (__predict_false(so_no_receive(so))) {
2229 struct inpcb *inp = sotoinpcb(so);
2230
2231 handle_excess_rx(toep, m);
2232 INP_UNLOCK(inp);
2233 return;
2234 }
2235 q = &toep->tp_ddp_state;
2236 hdr = cplhdr(m);
2237 ddp_report = ntohl(hdr->ddp_report);
2238 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2239 m->m_pkthdr.csum_data = tp->rcv_nxt;
2240
2241
2242 SOCKBUF_LOCK(&so->so_rcv);
2243 bsp = &q->buf_state[buf_idx];
2244 when = bsp->cur_offset;
2245 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2246 tp->rcv_nxt += m->m_len;
2247 tp->t_rcvtime = ticks;
2248
2249 delack_mode = G_DDP_DACK_MODE(ddp_report);
2250 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2251 toep->tp_delack_mode = delack_mode;
2252 toep->tp_delack_seq = tp->rcv_nxt;
2253 }
2254#ifdef notyet
2255 skb_reset_transport_header(skb);
2256 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2257#endif
2258 INP_UNLOCK(tp->t_inpcb);
2259
2260 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2261 CTR5(KTR_TOM,
2262 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2263 "ddp_report 0x%x offset %u, len %u",
2264 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2265 G_DDP_OFFSET(ddp_report), m->m_len);
2266
2267 bsp->cur_offset += m->m_len;
2268
2269 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2270 q->cur_buf ^= 1; /* flip buffers */
2271 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2272 nomoredata = 1;
2273 }
2274
2275 CTR4(KTR_TOM,
2276 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2277 "ddp_report %u offset %u",
2278 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2279 G_DDP_OFFSET(ddp_report));
2280
2281 m->m_ddp_gl = (unsigned char *)bsp->gl;
2282 m->m_flags |= M_DDP;
2283 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2284 if (bsp->flags & DDP_BF_NOCOPY)
2285 bsp->flags &= ~DDP_BF_NOCOPY;
2286 if (nomoredata)
2287 m->m_ddp_flags |= DDP_BF_NODATA;
2288
2289
2290 SBAPPEND(&so->so_rcv, m);
2291
2292 if ((so->so_state & SS_NOFDREF) == 0)
2293 sorwakeup_locked(so);
2294 else
2295 SOCKBUF_UNLOCK(&so->so_rcv);
2296}
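/*
 * Aside (stand-alone sketch): the "no more data" decision made in
 * process_ddp_complete() above.  On a flipping buffer, a completion
 * whose offset falls short of the kernel buffer length means the
 * buffer was retired early, so DDP_BF_NODATA is raised.
 */
static inline int
example_kbuf_retired_early(unsigned int report_offset,
    unsigned int kbuf_len, int noflip)
{
	return (!noflip && report_offset < kbuf_len);
}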
2297
2298/*
2299 * Handler for RX_DDP_COMPLETE CPL messages.
2300 */
2301static int
2302do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2303{
2304 struct toepcb *toep = ctx;
2305
2306 VALIDATE_SOCK(so);
2307#if 0
2308 skb->h.th = tcphdr_skb->h.th;
2309#endif
2310 process_ddp_complete(toep, m);
2311 return (0);
2312}
2313
2314/*
2315 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2316 * socket state before calling tcp_twstart() to comply with its expectations.
2317 */
2318static void
2319enter_timewait(struct socket *so)
2320{
2321 struct tcpcb *tp = sototcpcb(so);
2322
2323 INP_LOCK_ASSERT(tp->t_inpcb);
2324 /*
2325 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2326 * process peer_close because we don't want to carry the peer FIN in
2327 * the socket's receive queue and if we increment rcv_nxt without
2328 * having the FIN in the receive queue we'll confuse facilities such
2329 * as SIOCINQ.
2330 */
2331 tp->rcv_nxt++;
2332
2333 tp->ts_recent_age = 0; /* defeat recycling */
2334 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2335 tcp_twstart(tp);
2336}
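/*
 * Aside: the rcv_nxt bump in enter_timewait() exists because a FIN
 * consumes one unit of sequence space.  Stand-alone illustration:
 */
static inline uint32_t
example_rcv_nxt_after_fin(uint32_t rcv_nxt)
{
	return (rcv_nxt + 1);	/* the ACK must cover the peer's FIN */
}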
2337
2338/*
2339 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2340 * function deals with the data that may be reported along with the FIN.
2341 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2342 * perform normal FIN-related processing. In the latter case 1 indicates that
2343 * there was an implicit RX_DDP_COMPLETE and the mbuf must not be freed,
2344 * and 0 that the mbuf can be freed.
2345 */
2346static int
2347handle_peer_close_data(struct socket *so, struct mbuf *m)
2348{
2349 struct tcpcb *tp = sototcpcb(so);
2350 struct toepcb *toep = tp->t_toe;
2351 struct ddp_state *q;
2352 struct ddp_buf_state *bsp;
2353 struct cpl_peer_close *req = cplhdr(m);
2354 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2355
2356 if (tp->rcv_nxt == rcv_nxt) /* no data */
2357 return (0);
2358
2359 if (__predict_false(so_no_receive(so))) {
2360 handle_excess_rx(toep, m);
2361
2362 /*
2363 * Although we discard the data we want to process the FIN so
2364 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2365 * PEER_CLOSE without data. In particular this PEER_CLOSE
2366 * may be what will close the connection. We return 1 because
2367 * handle_excess_rx() already freed the packet.
2368 */
2369 return (1);
2370 }
2371
2372 INP_LOCK_ASSERT(tp->t_inpcb);
2373 q = &toep->tp_ddp_state;
2374 SOCKBUF_LOCK(&so->so_rcv);
2375 bsp = &q->buf_state[q->cur_buf];
2376 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2377 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2378 m->m_ddp_gl = (unsigned char *)bsp->gl;
2379 m->m_flags |= M_DDP;
2380 m->m_cur_offset = bsp->cur_offset;
2381 m->m_ddp_flags =
2382 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2383 m->m_seq = tp->rcv_nxt;
2384 tp->rcv_nxt = rcv_nxt;
2385 bsp->cur_offset += m->m_pkthdr.len;
2386 if (!(bsp->flags & DDP_BF_NOFLIP))
2387 q->cur_buf ^= 1;
2388#ifdef notyet
2389 skb_reset_transport_header(skb);
2390 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2391#endif
2372 tp->t_rcvtime = ticks;
2373 SBAPPEND(&so->so_rcv, m);
2374 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2375 sorwakeup_locked(so);
2376 else
2377 SOCKBUF_UNLOCK(&so->so_rcv);
2378 return (1);
2379}
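/*
 * Aside (sketch, invented name): how a caller is expected to consume
 * the tri-state result documented above, mirroring do_peer_fin():
 */
static inline void
example_consume_peer_close_result(int keep, struct mbuf *m)
{
	if (keep < 0)
		return;			/* PEER_CLOSE fully handled */
	if (keep == 0)
		m_free(m);		/* plain FIN, mbuf unused */
	/* keep == 1: mbuf now queued as an implicit RX_DDP_COMPLETE */
}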
2380
2381/*
2382 * Handle a peer FIN.
2383 */
2384static void
2385do_peer_fin(struct socket *so, struct mbuf *m)
2386{
2387 struct tcpcb *tp = sototcpcb(so);
2388 struct toepcb *toep = tp->t_toe;
2389 int keep = 0;
2390 DPRINTF("do_peer_fin state=%d\n", tp->t_state);
2391
2392#ifdef T3_TRACE
2393 T3_TRACE0(TIDTB(sk),"do_peer_fin:");
2394#endif
2395
2396 if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2397 printf("abort_pending set\n");
2398
2399 goto out;
2400 }
2401 INP_INFO_WLOCK(&tcbinfo);
2402 INP_LOCK(tp->t_inpcb);
2403 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2404 keep = handle_peer_close_data(so, m);
2405 if (keep < 0) {
2406 INP_INFO_WUNLOCK(&tcbinfo);
2407 INP_UNLOCK(tp->t_inpcb);
2408 return;
2409 }
2410 }
2411 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2412 socantrcvmore(so);
2413 /*
2414 * If connection is half-synchronized
2415 * (ie NEEDSYN flag on) then delay ACK,
2416 * so it may be piggybacked when SYN is sent.
2417 * Otherwise, since we received a FIN then no
2418 * more input can be expected, send ACK now.
2419 */
2420 if (tp->t_flags & TF_NEEDSYN)
2421 tp->t_flags |= TF_DELACK;
2422 else
2423 tp->t_flags |= TF_ACKNOW;
2424 tp->rcv_nxt++;
2425 }
2426
2427 switch (tp->t_state) {
2428 case TCPS_SYN_RECEIVED:
2429 tp->t_starttime = ticks;
2430 /* FALLTHROUGH */
2431 case TCPS_ESTABLISHED:
2432 tp->t_state = TCPS_CLOSE_WAIT;
2433 break;
2434 case TCPS_FIN_WAIT_1:
2435 tp->t_state = TCPS_CLOSING;
2436 break;
2437 case TCPS_FIN_WAIT_2:
2438 /*
2439 * If we've sent an abort_req we must have sent it too late,
2440 * HW will send us a reply telling us so, and this peer_close
2441 * is really the last message for this connection and needs to
2442 * be treated as an abort_rpl, i.e., transition the connection
2443 * to TCP_CLOSE (note that the host stack does this at the
2444 * time of generating the RST but we must wait for HW).
2445 * Otherwise we enter TIME_WAIT.
2446 */
2447 t3_release_offload_resources(toep);
2448 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2449 tp = tcp_close(tp);
2450 } else {
2451 enter_timewait(so);
2452 }
2453 break;
2454 default:
2455 log(LOG_ERR,
2456 "%s: TID %u received PEER_CLOSE in bad state %d\n",
2457 TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
2458 }
2459 INP_INFO_WUNLOCK(&tcbinfo);
2460 if (tp)
2461 INP_UNLOCK(tp->t_inpcb);
2462
2463 DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
2464
2465#ifdef notyet
2466 /* Do not send POLL_HUP for half duplex close. */
2467 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2468 sk->sk_state == TCP_CLOSE)
2469 sk_wake_async(so, 1, POLL_HUP);
2470 else
2471 sk_wake_async(so, 1, POLL_IN);
2472#endif
2473
2474out:
2475 if (!keep)
2476 m_free(m);
2477}
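/*
 * Aside (condensed sketch): the state transitions do_peer_fin()
 * performs on a valid peer FIN, with resource release and TIME_WAIT
 * entry elided:
 */
static inline int
example_fin_next_state(int t_state)
{
	switch (t_state) {
	case TCPS_SYN_RECEIVED:		/* t_starttime also reset */
	case TCPS_ESTABLISHED:
		return (TCPS_CLOSE_WAIT);
	case TCPS_FIN_WAIT_1:
		return (TCPS_CLOSING);
	case TCPS_FIN_WAIT_2:
		return (TCPS_TIME_WAIT);	/* via enter_timewait() */
	default:
		return (t_state);		/* logged as an error */
	}
}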
2478
2479/*
2480 * Handler for PEER_CLOSE CPL messages.
2481 */
2482static int
2483do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2484{
2485 struct toepcb *toep = (struct toepcb *)ctx;
2486 struct socket *so = toeptoso(toep);
2487
2488 VALIDATE_SOCK(so);
2489
2490 do_peer_fin(so, m);
2491 return (0);
2492}
2493
2494static void
2495process_close_con_rpl(struct socket *so, struct mbuf *m)
2496{
2497 struct tcpcb *tp = sototcpcb(so);
2498 struct cpl_close_con_rpl *rpl = cplhdr(m);
2499 struct toepcb *toep = tp->t_toe;
2500
2501 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
2502
2503 DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
2504 !!(so->so_state & SS_NOFDREF));
2505 if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
2506 goto out;
2507
2508 INP_INFO_WLOCK(&tcbinfo);
2509 INP_LOCK(tp->t_inpcb);
2510 switch (tp->t_state) {
2511 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2512 t3_release_offload_resources(toep);
2513 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2514 tp = tcp_close(tp);
2515
2516 } else {
2517 enter_timewait(so);
2518 soisdisconnected(so);
2519 }
2520 break;
2521 case TCPS_LAST_ACK:
2522 /*
2523 * In this state we don't care about pending abort_rpl.
2524 * If we've sent abort_req it was post-close and was sent too
2525 * late, this close_con_rpl is the actual last message.
2526 */
2527 t3_release_offload_resources(toep);
2528 tp = tcp_close(tp);
2529 break;
2530 case TCPS_FIN_WAIT_1:
2531 /*
2532 * If we can't receive any more
2533 * data, then closing user can proceed.
2534 * Starting the timer is contrary to the
2535 * specification, but if we don't get a FIN
2536 * we'll hang forever.
2537 *
2538 * XXXjl:
2539 * we should release the tp also, and use a
2540 * compressed state.
2541 */
2542 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2543 int timeout;
2544
2545 soisdisconnected(so);
2546 timeout = (tcp_fast_finwait2_recycle) ?
2547 tcp_finwait2_timeout : tcp_maxidle;
2548 tcp_timer_activate(tp, TT_2MSL, timeout);
2549 }
2550 tp->t_state = TCPS_FIN_WAIT_2;
2551 if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
2552 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2553 tp = tcp_drop(tp, 0);
2554 }
2555
2556 break;
2557 default:
2558 log(LOG_ERR,
2559 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2560 TOE_DEV(so)->tod_name, toep->tp_tid,
2561 tp->t_state);
2562 }
2563 INP_INFO_WUNLOCK(&tcbinfo);
2564 if (tp)
2565 INP_UNLOCK(tp->t_inpcb);
2566out:
2567 m_freem(m);
2568}
2569
2570/*
2571 * Handler for CLOSE_CON_RPL CPL messages.
2572 */
2573static int
2574do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2575 void *ctx)
2576{
2577 struct toepcb *toep = (struct toepcb *)ctx;
2578 struct socket *so = toeptoso(toep);
2579
2580 VALIDATE_SOCK(so);
2581
2582 process_close_con_rpl(so, m);
2583 return (0);
2584}
2585
2586/*
2587 * Process abort replies. We only process these messages if we anticipate
2588 * them as the coordination between SW and HW in this area is somewhat lacking
2589 * and sometimes we get ABORT_RPLs after we are done with the connection that
2590 * originated the ABORT_REQ.
2591 */
2592static void
2593process_abort_rpl(struct socket *so, struct mbuf *m)
2594{
2595 struct tcpcb *tp = sototcpcb(so);
2596 struct toepcb *toep = tp->t_toe;
2597
2598#ifdef T3_TRACE
2599 T3_TRACE1(TIDTB(sk),
2600 "process_abort_rpl: GTS rpl pending %d",
2601 sock_flag(sk, ABORT_RPL_PENDING));
2602#endif
2603
2604 INP_INFO_WLOCK(&tcbinfo);
2605 INP_LOCK(tp->t_inpcb);
2606
2607 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2608 /*
2609 * XXX panic on tcpdrop
2610 */
2611 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2612 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2613 else {
2614 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2615 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2616 !is_t3a(TOE_DEV(so))) {
2617 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2618 panic("TP_ABORT_REQ_RCVD set");
2619 t3_release_offload_resources(toep);
2620 tp = tcp_close(tp);
2621 }
2622 }
2623 }
2624 if (tp)
2625 INP_UNLOCK(tp->t_inpcb);
2626 INP_INFO_WUNLOCK(&tcbinfo);
2627
2628 m_free(m);
2629}
2630
2631/*
2632 * Handle an ABORT_RPL_RSS CPL message.
2633 */
2634static int
2635do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2636{
2637 struct socket *so;
2638 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2639 struct toepcb *toep;
2640
2641 /*
2642 * Ignore replies to post-close aborts indicating that the abort was
2643 * requested too late. These connections are terminated when we get
2644 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2645 * arrives the TID is either no longer used or it has been recycled.
2646 */
2647 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2648discard:
2649 m_free(m);
2650 return (0);
2651 }
2652
2653 toep = (struct toepcb *)ctx;
2654
2655 /*
2656 * Sometimes we've already closed the socket, e.g., a post-close
2657 * abort races with ABORT_REQ_RSS, the latter frees the socket
2658 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2659 * but FW turns the ABORT_REQ into a regular one and so we get
2660 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
2661 */
2662 if (!toep)
2663 goto discard;
2664
2665 if (toep->tp_tp == NULL) {
2666 printf("removing tid for abort\n");
2667 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2668 if (toep->tp_l2t)
2669 l2t_release(L2DATA(cdev), toep->tp_l2t);
2670
2671 toepcb_release(toep);
2672 goto discard;
2673 }
2674
2675 printf("toep=%p\n", toep);
2676 printf("tp=%p\n", toep->tp_tp);
2677
2678 so = toeptoso(toep); /* <- XXX panic */
2679 toepcb_hold(toep);
2680 process_abort_rpl(so, m);
2681 toepcb_release(toep);
2682 return (0);
2683}
2684
2685/*
2686 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2687 * indicate whether RST should be sent in response.
2688 */
2689static int
2690abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2691{
2692 struct tcpcb *tp = sototcpcb(so);
2693
2694 switch (abort_reason) {
2695 case CPL_ERR_BAD_SYN:
2696#if 0
2697 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2698#endif
2699 case CPL_ERR_CONN_RESET:
2700 // XXX need to handle SYN_RECV due to crossed SYNs
2701 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2702 case CPL_ERR_XMIT_TIMEDOUT:
2703 case CPL_ERR_PERSIST_TIMEDOUT:
2704 case CPL_ERR_FINWAIT2_TIMEDOUT:
2705 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2706#if 0
2707 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2708#endif
2709 return (ETIMEDOUT);
2710 default:
2711 return (EIO);
2712 }
2713}
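/*
 * Aside (sketch): typical use of the mapping above, as in
 * process_abort_req().  Note that abort_status_to_errno() never
 * writes *need_rst in this revision, so the value the caller seeded
 * (CPL_ABORT_NO_RST) is what ultimately reaches send_abort_rpl().
 */
static inline void
example_apply_abort_status(struct socket *so, int status)
{
	int rst_status = CPL_ABORT_NO_RST;	/* left unchanged */

	so->so_error = abort_status_to_errno(so, status, &rst_status);
}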
2714
2715static inline void
2716set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2717{
2718 struct cpl_abort_rpl *rpl = cplhdr(m);
2719
2720 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2721 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2722 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2723
2724 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2725 rpl->cmd = cmd;
2726}
2727
2728static void
2729send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2730{
2731 struct mbuf *reply_mbuf;
2732 struct cpl_abort_req_rss *req = cplhdr(m);
2733
2734 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2735 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2736 reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2737 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2738 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2739 m_free(m);
2740}
2741
2742/*
2743 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2744 */
2745static inline int
2746is_neg_adv_abort(unsigned int status)
2747{
2748 return status == CPL_ERR_RTX_NEG_ADVICE ||
2749 status == CPL_ERR_PERSIST_NEG_ADVICE;
2750}
2751
2752static void
2753send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2754{
2755 struct mbuf *reply_mbuf;
2756 struct cpl_abort_req_rss *req = cplhdr(m);
2757
2758 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2759
2760 if (!reply_mbuf) {
2761 /* Defer the reply.  Stick rst_status into req->status. */
2762 req->status = rst_status;
2763 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2764 return;
2765 }
2766
2767 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2768 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2769 m_free(m);
2770
2771 /*
2772 * XXX need to sync with ARP as for SYN_RECV connections we can send
2773 * these messages while ARP is pending. For other connection states
2774 * it's not a problem.
2775 */
2776 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2777}
2778
2779#ifdef notyet
2780static void
2781cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2782{
2783 CXGB_UNIMPLEMENTED();
2784#ifdef notyet
2785 struct request_sock *req = child->sk_user_data;
2786
2787 inet_csk_reqsk_queue_removed(parent, req);
2788 synq_remove(tcp_sk(child));
2789 __reqsk_free(req);
2790 child->sk_user_data = NULL;
2791#endif
2792}
2793
2794
2795/*
2796 * Performs the actual work to abort a SYN_RECV connection.
2797 */
2798static void
2799do_abort_syn_rcv(struct socket *child, struct socket *parent)
2800{
2801 struct tcpcb *parenttp = sototcpcb(parent);
2802 struct tcpcb *childtp = sototcpcb(child);
2803
2804 /*
2805 * If the server is still open we clean up the child connection,
2806 * otherwise the server already did the clean up as it was purging
2807 * its SYN queue and the skb was just sitting in its backlog.
2808 */
2809 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2810 cleanup_syn_rcv_conn(child, parent);
2811 INP_INFO_WLOCK(&tcbinfo);
2812 INP_LOCK(childtp->t_inpcb);
2813 t3_release_offload_resources(childtp->t_toe);
2814 childtp = tcp_close(childtp);
2815 INP_INFO_WUNLOCK(&tcbinfo);
2816 if (childtp)
2817 INP_UNLOCK(childtp->t_inpcb);
2818 }
2819}
2820#endif
2821
2822/*
2823 * Handle abort requests for a SYN_RECV connection. These need extra work
2824 * because the socket is on its parent's SYN queue.
2825 */
2826static int
2827abort_syn_rcv(struct socket *so, struct mbuf *m)
2828{
2829 CXGB_UNIMPLEMENTED();
2830#ifdef notyet
2831 struct socket *parent;
2832 struct toedev *tdev = TOE_DEV(so);
2833 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2834 struct socket *oreq = so->so_incomp;
2835 struct t3c_tid_entry *t3c_stid;
2836 struct tid_info *t;
2837
2838 if (!oreq)
2839 return -1; /* somehow we are not on the SYN queue */
2840
2841 t = &(T3C_DATA(cdev))->tid_maps;
2842 t3c_stid = lookup_stid(t, oreq->ts_recent);
2843 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2844
2845 SOCK_LOCK(parent);
2846 do_abort_syn_rcv(so, parent);
2847 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2848 SOCK_UNLOCK(parent);
2849#endif
2850 return (0);
2851}
2852
2853/*
2854 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
2855 * request except that we need to reply to it.
2856 */
2857static void
2858process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2859{
2860 int rst_status = CPL_ABORT_NO_RST;
2861 const struct cpl_abort_req_rss *req = cplhdr(m);
2862 struct tcpcb *tp = sototcpcb(so);
2863 struct toepcb *toep = tp->t_toe;
2864
2865 INP_LOCK(tp->t_inpcb);
2866 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2867 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2868 m_free(m);
2869 goto skip;
2870 }
2871
2872 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2873 /*
2874 * Three cases to consider:
2875 * a) We haven't sent an abort_req; close the connection.
2876 * b) We have sent a post-close abort_req that will get to TP too late
2877 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
2878 * be ignored and the connection should be closed now.
2879 * c) We have sent a regular abort_req that will get to TP too late.
2880 * That will generate an abort_rpl with status 0, wait for it.
2881 */
2882 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2883 (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2884 so->so_error = abort_status_to_errno(so, req->status,
2885 &rst_status);
2886 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2887 sorwakeup(so);
2888 /*
2889 * SYN_RECV needs special processing. If abort_syn_rcv()
2890 * returns 0 it has taken care of the abort.
2891 */
2892 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2893 goto skip;
2894
2895 t3_release_offload_resources(toep);
2896 tp = tcp_close(tp);
2897 }
2898 if (tp)
2899 INP_UNLOCK(tp->t_inpcb);
2900 send_abort_rpl(m, tdev, rst_status);
2901 return;
2902
2903skip:
2904 INP_UNLOCK(tp->t_inpcb);
2905}
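/*
 * Aside (sketch): the three cases enumerated in the comment inside
 * process_abort_req() reduce to this predicate; when it returns
 * nonzero the connection is torn down now, otherwise we wait for the
 * pending abort_rpl (case c).
 */
static inline int
example_abort_closes_now(int tp_flags, int on_t3a)
{
	if ((tp_flags & TP_ABORT_RPL_PENDING) == 0)
		return (1);	/* case a: no abort_req of ours in flight */
	if (on_t3a && (tp_flags & TP_CLOSE_CON_REQUESTED))
		return (1);	/* case b: post-close abort_req, its reply
				 * will be ignored */
	return (0);		/* case c */
}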
2906
2907/*
2908 * Handle an ABORT_REQ_RSS CPL message.
2909 */
2910static int
2911do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2912{
2913 const struct cpl_abort_req_rss *req = cplhdr(m);
2914 struct toepcb *toep = (struct toepcb *)ctx;
2915 struct socket *so;
2916 struct inpcb *inp;
2917
2918 if (is_neg_adv_abort(req->status)) {
2919 m_free(m);
2920 return (0);
2921 }
2922
2923 printf("aborting tid=%d\n", toep->tp_tid);
2924
2925 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2926 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2927 toep->tp_flags |= TP_ABORT_REQ_RCVD;
2928 printf("sending abort rpl\n");
2929
2930 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2931 printf("sent\n");
2932 if (toep->tp_l2t)
2933 l2t_release(L2DATA(cdev), toep->tp_l2t);
2934
2935 /*
2936 * Unhook
2937 */
2938 toep->tp_tp->t_toe = NULL;
2939 toep->tp_tp->t_flags &= ~TF_TOE;
2940 toep->tp_tp = NULL;
2941 /*
2942 * XXX need to call syncache_chkrst - but we don't
2943 * have a way of doing that yet
2944 */
2945 toepcb_release(toep);
2946 printf("abort for unestablished connection :-(\n");
2947 return (0);
2948 }
2949 if (toep->tp_tp == NULL) {
2950 printf("disconnected toepcb\n");
2951 /* should be freed momentarily */
2952 return (0);
2953 }
2954
2955 so = toeptoso(toep);
2956 inp = sotoinpcb(so);
2957
2958 VALIDATE_SOCK(so);
2959 toepcb_hold(toep);
2960 INP_INFO_WLOCK(&tcbinfo);
2961 process_abort_req(so, m, TOE_DEV(so));
2962 INP_INFO_WUNLOCK(&tcbinfo);
2963 toepcb_release(toep);
2964 return (0);
2965}
2966#ifdef notyet
2967static void
2968pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2969{
2970 struct toedev *tdev = TOE_DEV(parent);
2971
2972 do_abort_syn_rcv(child, parent);
2973 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2974 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2975
2976 rpl->opt0h = htonl(F_TCAM_BYPASS);
2977 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2978 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2979 } else
2980 m_free(m);
2981}
2982#endif
2983static void
2984handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
2985{
2986 CXGB_UNIMPLEMENTED();
2987
2988#ifdef notyet
2989 struct t3cdev *cdev;
2990 struct socket *parent;
2991 struct socket *oreq;
2992 struct t3c_tid_entry *t3c_stid;
2993 struct tid_info *t;
2994 struct tcpcb *otp, *tp = sototcpcb(so);
2995 struct toepcb *toep = tp->t_toe;
2996
2997 /*
2998 * If the connection is being aborted due to the parent listening
2999 * socket going away there's nothing to do, the ABORT_REQ will close
3000 * the connection.
3001 */
3002 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3003 m_free(m);
3004 return;
3005 }
3006
3007 oreq = so->so_incomp;
3008 otp = sototcpcb(oreq);
3009
3010 cdev = T3C_DEV(so);
3011 t = &(T3C_DATA(cdev))->tid_maps;
3012 t3c_stid = lookup_stid(t, otp->ts_recent);
3013 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3014
3015 SOCK_LOCK(parent);
3016 pass_open_abort(so, parent, m);
3017 SOCK_UNLOCK(parent);
3018#endif
3019}
3020
3021/*
3022 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3023 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3024 * connection.
3025 */
3026static void
3027pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3028{
3029
3030#ifdef notyet
3031 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3032 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3033#endif
3034 handle_pass_open_arp_failure(m_get_socket(m), m);
3035}
3036
3037/*
3038 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3039 */
3040static void
3041mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3042{
3043 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3044 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3045 unsigned int tid = GET_TID(req);
3046
3047 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3048 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3049 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3050 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
3051 rpl->opt0h = htonl(F_TCAM_BYPASS);
3052 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3053 rpl->opt2 = 0;
3054 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3055}
3056
3057/*
3058 * Send a deferred reject to an accept request.
3059 */
3060static void
3061reject_pass_request(struct toedev *tdev, struct mbuf *m)
3062{
3063 struct mbuf *reply_mbuf;
3064
3065 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3066 mk_pass_accept_rpl(reply_mbuf, m);
3067 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3068 m_free(m);
3069}
3070
3071static void
3072handle_syncache_event(int event, void *arg)
3073{
3074 struct toepcb *toep = arg;
3075
3076 switch (event) {
3077 case TOE_SC_ENTRY_PRESENT:
3078 /*
3079 * entry already exists - free toepcb
3080 * and l2t
3081 */
3082 printf("syncache entry present\n");
3083 toepcb_release(toep);
3084 break;
3085 case TOE_SC_DROP:
3086 /*
3087 * The syncache has given up on this entry
3088 * either it timed out, or it was evicted
3089 * we need to explicitly release the tid
3090 */
3091 printf("syncache entry dropped\n");
3092 toepcb_release(toep);
3093 break;
3094 default:
3095 log(LOG_ERR, "unknown syncache event %d\n", event);
3096 break;
3097 }
3098}
3099
3100static void
3101syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3102{
3103 struct in_conninfo inc;
3104 struct tcpopt to;
3105 struct tcphdr th;
3106 struct inpcb *inp;
3107 int mss, wsf, sack, ts;
3108 uint32_t rcv_isn = ntohl(req->rcv_isn);
3109
3110 bzero(&to, sizeof(struct tcpopt));
3111 inp = sotoinpcb(lso);
3112
3113 /*
3114 * Fill out information for entering us into the syncache
3115 */
3116 inc.inc_fport = th.th_sport = req->peer_port;
3117 inc.inc_lport = th.th_dport = req->local_port;
3118 th.th_seq = req->rcv_isn;
3119 th.th_flags = TH_SYN;
3120
3121 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3122
3123
3124 inc.inc_isipv6 = 0;
3125 inc.inc_len = 0;
3126 inc.inc_faddr.s_addr = req->peer_ip;
3127 inc.inc_laddr.s_addr = req->local_ip;
3128
3129 DPRINTF("syncache add of %d:%d %d:%d\n",
3130 ntohl(req->local_ip), ntohs(req->local_port),
3131 ntohl(req->peer_ip), ntohs(req->peer_port));
3132
3133 mss = req->tcp_options.mss;
3134 wsf = req->tcp_options.wsf;
3135 ts = req->tcp_options.tstamp;
3136 sack = req->tcp_options.sack;
3137 to.to_mss = mss;
3138 to.to_wscale = wsf;
3139 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3140 INP_INFO_WLOCK(&tcbinfo);
3141 INP_LOCK(inp);
3142 syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3143}
3144
3145
3146/*
3147 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3148 * lock held. Note that the sock here is a listening socket that is not owned
3149 * by the TOE.
3150 */
3151static void
3152process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3153 struct listen_ctx *lctx)
3154{
3155 int rt_flags;
3156 struct l2t_entry *e;
3157 struct iff_mac tim;
3158 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3159 struct cpl_pass_accept_rpl *rpl;
3160 struct cpl_pass_accept_req *req = cplhdr(m);
3161 unsigned int tid = GET_TID(req);
3162 struct tom_data *d = TOM_DATA(tdev);
3163 struct t3cdev *cdev = d->cdev;
3164 struct tcpcb *tp = sototcpcb(so);
3165 struct toepcb *newtoep;
3166 struct rtentry *dst;
3167 struct sockaddr_in nam;
3168 struct t3c_data *td = T3C_DATA(cdev);
3169
3170 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3171 if (__predict_false(reply_mbuf == NULL)) {
3172 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3173 t3_defer_reply(m, tdev, reject_pass_request);
3174 else {
3175 cxgb_queue_tid_release(cdev, tid);
3176 m_free(m);
3177 }
3178 DPRINTF("failed to get reply_mbuf\n");
3179
3180 goto out;
3181 }
3182
3183 if (tp->t_state != TCPS_LISTEN) {
3184 DPRINTF("socket not in listen state\n");
3185
3186 goto reject;
3187 }
3188
3189 tim.mac_addr = req->dst_mac;
3190 tim.vlan_tag = ntohs(req->vlan_tag);
3191 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3192 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3193 goto reject;
3194 }
3195
3196#ifdef notyet
3197 /*
3198 * XXX do route lookup to confirm that we're still listening on this
3199 * address
3200 */
3201 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3202 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3203 goto reject;
3204 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3205 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3206 dst_release(skb->dst); // done with the input route, release it
3207 skb->dst = NULL;
3208
3209 if ((rt_flags & RTF_LOCAL) == 0)
3210 goto reject;
3211#endif
3212 /*
3213 * XXX
3214 */
3215 rt_flags = RTF_LOCAL;
3216 if ((rt_flags & RTF_LOCAL) == 0)
3217 goto reject;
3218
3219 /*
3220 * Calculate values and add to syncache
3221 */
3222
3223 newtoep = toepcb_alloc();
3224 if (newtoep == NULL)
3225 goto reject;
3226
3227 bzero(&nam, sizeof(struct sockaddr_in));
3228
3229 nam.sin_len = sizeof(struct sockaddr_in);
3230 nam.sin_family = AF_INET;
3231 nam.sin_addr.s_addr = req->peer_ip;
3232 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3233
3234 if (dst == NULL) {
3235 printf("failed to find route\n");
3236 goto reject;
3237 }
3238 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3239 (struct sockaddr *)&nam);
3240 if (e == NULL) {
3241 DPRINTF("failed to get l2t\n");
3242 goto reject;	/* rpl->opt0h and l2t_send() below dereference e */
	}
3243 /*
3244 * Point to our listen socket until accept
3245 */
3246 newtoep->tp_tp = tp;
3247 newtoep->tp_flags = TP_SYN_RCVD;
3248 newtoep->tp_tid = tid;
3249 newtoep->tp_toedev = tdev;
3250 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3251
3252 cxgb_insert_tid(cdev, d->client, newtoep, tid);
3253 SOCK_LOCK(so);
3254 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3255 SOCK_UNLOCK(so);
3256
3257 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
3258 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3259
3260 if (newtoep->tp_ulp_mode) {
3261 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3262
3263 if (ddp_mbuf == NULL)
3264 newtoep->tp_ulp_mode = 0;
3265 }
3266
3267 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3268 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3269 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3270 /*
3271 * XXX workaround for lack of syncache drop
3272 */
3273 toepcb_hold(newtoep);
3274 syncache_add_accept_req(req, so, newtoep);
3275
3276 rpl = cplhdr(reply_mbuf);
3277 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3278 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3279 rpl->wr.wr_lo = 0;
3280 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3281 rpl->opt2 = htonl(calc_opt2(so, tdev));
3282 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3283 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3284
3285 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3286 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3287 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3288 CPL_PASS_OPEN_ACCEPT);
3289
3290 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3291
3292 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3293
3294 l2t_send(cdev, reply_mbuf, e);
3295 m_free(m);
3296 if (newtoep->tp_ulp_mode) {
3297 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3298 V_TF_DDP_OFF(1) |
3299 TP_DDP_TIMER_WORKAROUND_MASK,
3300 V_TF_DDP_OFF(1) |
3301 TP_DDP_TIMER_WORKAROUND_VAL, 1);
3302 } else
3303 printf("not offloading\n");
3304
3305
3306
3307 return;
3308reject:
3309 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3310 mk_pass_accept_rpl(reply_mbuf, m);
3311 else
3312 mk_tid_release(reply_mbuf, newtoep, tid);
3313 cxgb_ofld_send(cdev, reply_mbuf);
3314 m_free(m);
3315out:
3316#if 0
3317 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3318#else
3319 return;
3320#endif
3321}
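/*
 * Aside (sketch): the ULP-mode selection buried in
 * process_pass_accept_req() above, as a predicate.  DDP is only worth
 * enabling when the tunable allows it, the socket has not opted out,
 * and the receive window is at least MIN_DDP_RCV_WIN.
 */
static inline int
example_select_ddp(int ddp_tunable, int so_options, unsigned long rcv_wnd)
{
	return (ddp_tunable && !(so_options & SO_NO_DDP) &&
	    rcv_wnd >= MIN_DDP_RCV_WIN);
}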
3322
3323/*
3324 * Handle a CPL_PASS_ACCEPT_REQ message.
3325 */
3326static int
3327do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3328{
3329 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3330 struct socket *lso = listen_ctx->lso;
3331 struct tom_data *d = listen_ctx->tom_data;
3332
3333#if VALIDATE_TID
3334 struct cpl_pass_accept_req *req = cplhdr(m);
3335 unsigned int tid = GET_TID(req);
3336 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3337
3338 if (unlikely(!lsk)) {
3339 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3340 cdev->name,
3341 (unsigned long)((union listen_entry *)ctx -
3342 t->stid_tab));
3343 return CPL_RET_BUF_DONE;
3344 }
3345 if (unlikely(tid >= t->ntids)) {
3346 printk(KERN_ERR "%s: passive open TID %u too large\n",
3347 cdev->name, tid);
3348 return CPL_RET_BUF_DONE;
3349 }
3350 /*
3351 * For T3A the current user of the TID may have closed but its last
3352 * message(s) may have been backlogged so the TID appears to be still
3353 * in use. Just take the TID away, the connection can close at its
3354 * own leisure. For T3B this situation is a bug.
3355 */
3356 if (!valid_new_tid(t, tid) &&
3357 cdev->type != T3A) {
3358 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3359 cdev->name, tid);
3360 return CPL_RET_BUF_DONE;
3361 }
3362#endif
3363
3364 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3365 return (0);
3366}
3367
3368/*
3369 * Called when a connection is established to translate the TCP options
3370 * reported by HW to FreeBSD's native format.
3371 */
3372static void
3373assign_rxopt(struct socket *so, unsigned int opt)
3374{
3375 const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
3376 struct tcpcb *tp = sototcpcb(so);
3377 struct toepcb *toep = tp->t_toe;
3378
3379 INP_LOCK_ASSERT(tp->t_inpcb);
3380
3381 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3382 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3383 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3384 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3385 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3386 (TF_RCVD_SCALE|TF_REQ_SCALE))
3387 tp->rcv_scale = tp->request_r_scale;
3388}
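/*
 * Aside (stand-alone sketch): the MSS computation in assign_rxopt().
 * The CPL option word carries an index into the adapter's MTU table,
 * and 40 bytes of IPv4 + TCP header are subtracted to obtain the MSS
 * clamp.  The element type of the MTU table is assumed here.
 */
static inline int
example_mss_clamp(const unsigned short *mtus, unsigned int opt)
{
	return ((int)mtus[G_TCPOPT_MSS(opt)] - 40);
}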
3389
3390/*
3391 * Completes some final bits of initialization for just established connections
3392 * and changes their state to TCP_ESTABLISHED.
3393 *
3394 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3395 */
3396static void
3397make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3398{
3399 struct tcpcb *tp = sototcpcb(so);
3400 struct toepcb *toep = tp->t_toe;
3401
3402 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3403 assign_rxopt(so, opt);
3404 so->so_proto->pr_ctloutput = t3_ctloutput;
3405
3406#if 0
3407 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3408#endif
3409 /*
3410 * XXX not clear what rcv_wup maps to
3411 */
3412 /*
3413 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3414 * pass through opt0.
3415 */
3416 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3417 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3418
3419 dump_toepcb(toep);
3420
3421#ifdef notyet
3422/*
3423 * no clean interface for marking ARP up to date
3424 */
3425 dst_confirm(sk->sk_dst_cache);
3426#endif
3427 tp->t_starttime = ticks;
3428 tp->t_state = TCPS_ESTABLISHED;
3429 soisconnected(so);
3430}
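/*
 * Aside (sketch): the credit withholding in make_established().
 * opt0 can only grant up to M_RCV_BUFSIZ kilobytes of initial Rx
 * credits, so any excess window is subtracted from tp_rcv_wup and
 * handed back through the first RX_DATA_ACK.
 */
static inline uint32_t
example_withhold_rx_credits(uint32_t rcv_wup, unsigned long rcv_wnd)
{
	if (rcv_wnd > (M_RCV_BUFSIZ << 10))
		rcv_wup -= rcv_wnd - (M_RCV_BUFSIZ << 10);
	return (rcv_wup);
}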
3431
3432static int
3433syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3434{
3435
3436 struct in_conninfo inc;
3437 struct tcpopt to;
3438 struct tcphdr th;
3439 int mss, wsf, sack, ts;
3440 struct mbuf *m = NULL;
3441 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3442 unsigned int opt;
3443
3444#ifdef MAC
3445#error "no MAC support"
3446#endif
3447
3448 opt = ntohs(req->tcp_opt);
3449
3450 bzero(&to, sizeof(struct tcpopt));
3451
3452 /*
3453 * Fill out information for entering us into the syncache
3454 */
3455 inc.inc_fport = th.th_sport = req->peer_port;
3456 inc.inc_lport = th.th_dport = req->local_port;
3457 th.th_seq = req->rcv_isn;
3458 th.th_flags = TH_ACK;
3459
3460 inc.inc_isipv6 = 0;
3461 inc.inc_len = 0;
3462 inc.inc_faddr.s_addr = req->peer_ip;
3463 inc.inc_laddr.s_addr = req->local_ip;
3464
3465 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3466 wsf = G_TCPOPT_WSCALE_OK(opt);
3467 ts = G_TCPOPT_TSTAMP(opt);
3468 sack = G_TCPOPT_SACK(opt);
3469
3470 to.to_mss = mss;
3471 to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3472 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3473
3474 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3475 ntohl(req->local_ip), ntohs(req->local_port),
3476 ntohl(req->peer_ip), ntohs(req->peer_port),
3477 mss, wsf, ts, sack);
3478 return syncache_expand(&inc, &to, &th, so, m);
3479}
3480
3481
3482/*
3483 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3484 * if we are in TCP_SYN_RECV due to crossed SYNs
3485 */
3486static int
3487do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3488{
3489 struct cpl_pass_establish *req = cplhdr(m);
3490 struct toepcb *toep = (struct toepcb *)ctx;
3491 struct tcpcb *tp;
3492 struct socket *so, *lso;
3493 struct t3c_data *td = T3C_DATA(cdev);
3494 // Complete socket initialization now that we have the SND_ISN
3495
3496 struct toedev *tdev;
3497
3498 so = lso = toeptoso(toep);
3499 tdev = toep->tp_toedev;
3500
3501 SOCK_LOCK(so);
3502 LIST_REMOVE(toep, synq_entry);
3503 SOCK_UNLOCK(so);
3504
3505 INP_INFO_WLOCK(&tcbinfo);
3506 if (!syncache_expand_establish_req(req, &so, toep)) {
3507 /*
3508 * No entry
3509 */
3510 CXGB_UNIMPLEMENTED();
3511 }
3512 if (so == NULL) {
3513 /*
3514 * Couldn't create the socket
3515 */
3516 CXGB_UNIMPLEMENTED();
3517 }
3518
3519 /*
3520 * XXX workaround for lack of syncache drop
3521 */
3522 toepcb_release(toep);
3523
3524 tp = sototcpcb(so);
3525 INP_LOCK(tp->t_inpcb);
3526
3527 so->so_snd.sb_flags |= SB_NOCOALESCE;
3528 so->so_rcv.sb_flags |= SB_NOCOALESCE;
3529
3530 toep->tp_tp = tp;
3531 toep->tp_flags = 0;
3532 tp->t_toe = toep;
3533 reset_wr_list(toep);
3534 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3535 tp->rcv_nxt = toep->tp_copied_seq;
3536 install_offload_ops(so);
3537
3538 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3539 toep->tp_wr_unacked = 0;
3540 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3541 toep->tp_qset_idx = 0;
3542 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3543
3544 /*
3545 * XXX Cancel any keep alive timer
3546 */
3547
3548 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3549 INP_INFO_WUNLOCK(&tcbinfo);
3550 INP_UNLOCK(tp->t_inpcb);
3551
3552 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3553 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3554#ifdef notyet
3555 /*
3556 * XXX not sure how these checks map to us
3557 */
3558 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3559 sk->sk_state_change(sk);
3560 sk_wake_async(so, 0, POLL_OUT);
3561 }
3562 /*
3563 * The state for the new connection is now up to date.
3564 * Next check if we should add the connection to the parent's
3565 * accept queue. When the parent closes it resets connections
3566 * on its SYN queue, so check if we are being reset. If so we
3567 * don't need to do anything more, the coming ABORT_RPL will
3568 * destroy this socket. Otherwise move the connection to the
3569 * accept queue.
3570 *
3571 * Note that we reset the synq before closing the server so if
3572 * we are not being reset the stid is still open.
3573 */
3574 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3575 __kfree_skb(skb);
3576 goto unlock;
3577 }
3578#endif
3579 m_free(m);
3580
3581 return (0);
3582}
3583
3584/*
3585 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3586 * and send them to the TOE.
3587 */
3588static void
3589fixup_and_send_ofo(struct socket *so)
3590{
3591 struct mbuf *m;
3592 struct toedev *tdev = TOE_DEV(so);
3593 struct tcpcb *tp = sototcpcb(so);
3594 struct toepcb *toep = tp->t_toe;
3595 unsigned int tid = toep->tp_tid;
3596
3597 printf("fixup_and_send_ofo\n");
3598
3599 INP_LOCK_ASSERT(tp->t_inpcb);
3600 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3601 /*
3602 * A variety of messages can be waiting but the fields we'll
3603 * be touching are common to all so any message type will do.
3604 */
3605 struct cpl_close_con_req *p = cplhdr(m);
3606
3607 p->wr.wr_lo = htonl(V_WR_TID(tid));
3608 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3609 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3610 }
3611}
3612
3613/*
3614 * Updates socket state from an active establish CPL message. Runs with the
3615 * socket lock held.
3616 */
3617static void
3618socket_act_establish(struct socket *so, struct mbuf *m)
3619{
3620 struct cpl_act_establish *req = cplhdr(m);
3621 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3622 struct tcpcb *tp = sototcpcb(so);
3623 struct toepcb *toep = tp->t_toe;
3624
3625 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3626 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3627 toep->tp_tid, tp->t_state);
3628
3629 tp->ts_recent_age = ticks;
3630 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3631 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3632
3633 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3634
3635 /*
3636 * Now that we finally have a TID send any CPL messages that we had to
3637 * defer for lack of a TID.
3638 */
3639 if (mbufq_len(&toep->out_of_order_queue))
3640 fixup_and_send_ofo(so);
3641
3642 if (__predict_false(so->so_state & SS_NOFDREF)) {
3643 /*
3644 * XXX does this even make sense?
3645 */
3646 sorwakeup(so);
3647 }
3648 m_free(m);
3649#ifdef notyet
3650/*
3651 * XXX assume no write requests permitted while socket connection is
3652 * incomplete
3653 */
3654 /*
3655 * Currently the send queue must be empty at this point because the
3656 * socket layer does not send anything before a connection is
3657 * established. To be future proof though we handle the possibility
3658 * that there are pending buffers to send (either TX_DATA or
3659 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3660 * buffers according to the just learned write_seq, and then we send
3661 * them on their way.
3662 */
3663 fixup_pending_writeq_buffers(sk);
3664 if (t3_push_frames(so, 1))
3665 sk->sk_write_space(sk);
3666#endif
3667
3668 toep->tp_state = tp->t_state;
3669 tcpstat.tcps_connects++;
3670
3671}
3672
3673/*
3674 * Process a CPL_ACT_ESTABLISH message.
3675 */
3676static int
3677do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3678{
3679 struct cpl_act_establish *req = cplhdr(m);
3680 unsigned int tid = GET_TID(req);
3681 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3682 struct toepcb *toep = (struct toepcb *)ctx;
3683 struct tcpcb *tp = toep->tp_tp;
3684 struct socket *so;
3685 struct toedev *tdev;
3686 struct tom_data *d;
3687
3688 if (tp == NULL) {
3689 free_atid(cdev, atid);
3690 return (0);
3691 }
3692
3693 so = toeptoso(toep);
3694 tdev = TOE_DEV(so); /* blow up here if link was down */
3695 d = TOM_DATA(tdev);
3696
3697 INP_LOCK(tp->t_inpcb);
3698
3699 /*
3700 * It's OK if the TID is currently in use, the owning socket may have
3701 * backlogged its last CPL message(s). Just take it away.
3702 */
3703 toep->tp_tid = tid;
3704 toep->tp_tp = tp;
3705 so_insert_tid(d, so, tid);
3706 free_atid(cdev, atid);
3707 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3708
3709 socket_act_establish(so, m);
3710 INP_UNLOCK(tp->t_inpcb);
3711 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3712 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3713
3714 return (0);
3715}
3716
3717/*
3718 * Process an acknowledgment of WR completion. Advance snd_una and send the
3719 * next batch of work requests from the write queue.
3720 */
3721static void
3722wr_ack(struct toepcb *toep, struct mbuf *m)
3723{
3724 struct tcpcb *tp = toep->tp_tp;
3725 struct cpl_wr_ack *hdr = cplhdr(m);
3726 struct socket *so = toeptoso(toep);
3727 unsigned int credits = ntohs(hdr->credits);
3728 u32 snd_una = ntohl(hdr->snd_una);
3729 int bytes = 0;
3730
3731 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3732
3733 INP_LOCK(tp->t_inpcb);
3734
3735 toep->tp_wr_avail += credits;
3736 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3737 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3738
3739 while (credits) {
3740 struct mbuf *p = peek_wr(toep);
3741
3742 if (__predict_false(!p)) {
3743 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3744 "nothing pending, state %u wr_avail=%u\n",
3745 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3746 break;
3747 }
3748 CTR2(KTR_TOM,
3749 "wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
3750
3751 KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
3752 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3753
3754#if DEBUG_WR > 1
3755 struct tx_data_wr *w = cplhdr(p);
3756 log(LOG_ERR,
3757 "TID %u got %u WR credits, need %u, len %u, "
3758 "main body %u, frags %u, seq # %u, ACK una %u,"
3759 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3760 toep->tp_tid, credits, p->csum, p->len,
3761 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3762 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3763 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3764#endif
3765 p->m_pkthdr.csum_data -= credits;
3766 break;
3767 } else {
3768 dequeue_wr(toep);
3769 credits -= p->m_pkthdr.csum_data;
3770 bytes += p->m_pkthdr.len;
3771 CTR3(KTR_TOM,
3772 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3773 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3774
3775 m_free(p);
3776 }
3777 }
3778
3779#if DEBUG_WR
3780 check_wr_invariants(tp);
3781#endif
3782
3783 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3784#if VALIDATE_SEQ
3785 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3786
3787 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3788 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3789 toep->tp_tid, tp->snd_una);
3790#endif
3791 goto out_free;
3792 }
3793
3794 if (tp->snd_una != snd_una) {
3795 tp->snd_una = snd_una;
3796 tp->ts_recent_age = ticks;
3797#ifdef notyet
3798 /*
3799 * Keep ARP entry "minty fresh"
3800 */
3801 dst_confirm(sk->sk_dst_cache);
3802#endif
3803 if (tp->snd_una == tp->snd_nxt)
3804 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3805 }
3806 if (bytes) {
3807 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3808 SOCKBUF_LOCK(&so->so_snd);
3809 sbdrop_locked(&so->so_snd, bytes);
3810 sowwakeup_locked(so);
3811 }
3812
3813 if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3814 t3_push_frames(so, 0);
3815
3816out_free:
3817 INP_UNLOCK(tp->t_inpcb);
3818 m_free(m);
3819}
3820
3821/*
3822 * Handler for TX_DATA_ACK CPL messages.
3823 */
3824static int
3825do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3826{
3827 struct toepcb *toep = (struct toepcb *)ctx;
3828
3829 VALIDATE_SOCK(so);
3830
3831 wr_ack(toep, m);
3832 return 0;
3833}
3834
3835/*
3836 * Handler for TRACE_PKT CPL messages. Just sink these packets.
3837 */
3838static int
3839do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
3840{
3841 m_freem(m);
3842 return 0;
3843}
3844
3845/*
3846 * Reset a connection that is on a listener's SYN queue or accept queue,
3847 * i.e., one that has not had a struct socket associated with it.
3848 * Must be called from process context.
3849 *
3850 * Modeled after code in inet_csk_listen_stop().
3851 */
3852static void
3853t3_reset_listen_child(struct socket *child)
3854{
3855 struct tcpcb *tp = sototcpcb(child);
3856
3857 t3_send_reset(tp->t_toe);
3858}
3859
3860/*
3861 * Disconnect offloaded established but not yet accepted connections sitting
3862 * on a server's accept_queue. We just send an ABORT_REQ at this point and
3863 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3864 */
3865void
3866t3_disconnect_acceptq(struct socket *listen_so)
3867{
3868 struct socket *so;
3869 struct tcpcb *tp;
3870
3871 TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3872 tp = sototcpcb(so);
3873
3874 if (tp->t_flags & TF_TOE) {
3875 INP_LOCK(tp->t_inpcb);
3876 t3_reset_listen_child(so);
3877 INP_UNLOCK(tp->t_inpcb);
3878 }
3879
3880 }
3881}
3882
3883/*
3884 * Reset offloaded connections sitting on a server's syn queue. As above
3885 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3886 */
3887
3888void
3889t3_reset_synq(struct listen_ctx *lctx)
3890{
3891 struct toepcb *toep;
3892
3893 SOCK_LOCK(lctx->lso);
3894 while (!LIST_EMPTY(&lctx->synq_head)) {
3895 toep = LIST_FIRST(&lctx->synq_head);
3896 LIST_REMOVE(toep, synq_entry);
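		/*
		 * Synq toepcbs still point at the listening socket's tcpcb
		 * (see process_pass_accept_req); sever that link before the
		 * toepcb is reset and released.
		 */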
3897 toep->tp_tp = NULL;
3898 t3_send_reset(toep);
3899 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3900 toepcb_release(toep);
3901 }
3902 SOCK_UNLOCK(lctx->lso);
3903}
3904
3905
3906int
3907t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
3908 unsigned int nppods, unsigned int tag, unsigned int maxoff,
3909 unsigned int pg_off, unsigned int color)
3910{
3911 unsigned int i, j, pidx;
3912 struct pagepod *p;
3913 struct mbuf *m;
3914 struct ulp_mem_io *req;
3915 struct tcpcb *tp = sototcpcb(so);
3916 struct toepcb *toep = tp->t_toe;
3917 unsigned int tid = toep->tp_tid;
3918 const struct tom_data *td = TOM_DATA(TOE_DEV(so));
3919 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
3920
3921 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
3922 gl, nppods, tag, maxoff, pg_off, color);
3923
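	/*
	 * Write one pagepod per ULP_MEM_WRITE work request.  Each live pod
	 * carries the TID, tag/color, offsets, and five page pointers; since
	 * the page index advances by four per pod, consecutive pods overlap
	 * by one page.  The trailing NUM_SENTINEL_PPODS pods are written with
	 * pp_vld_tid clear so the HW treats them as invalid.
	 */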
3924 for (i = 0; i < nppods; ++i) {
3925 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
3926 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
3927 req = mtod(m, struct ulp_mem_io *);
3928 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
3929 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
3930 req->wr.wr_lo = 0;
3931 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
3932 V_ULPTX_CMD(ULP_MEM_WRITE));
3933 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
3934 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
3935
3936 p = (struct pagepod *)(req + 1);
3937		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
3938 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
3939 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
3940 V_PPOD_COLOR(color));
3941 p->pp_max_offset = htonl(maxoff);
3942 p->pp_page_offset = htonl(pg_off);
3943 p->pp_rsvd = 0;
3944 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
3945 p->pp_addr[j] = pidx < gl->dgl_nelem ?
3946 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
3947 } else
3948 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
3949 send_or_defer(toep, m, 0);
3950 ppod_addr += PPOD_SIZE;
3951 }
3952 return (0);
3953}
3954
3955/*
3956 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
3957 */
3958static inline void
3959mk_cpl_barrier_ulp(struct cpl_barrier *b)
3960{
3961 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
3962
3963 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3964 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
3965 b->opcode = CPL_BARRIER;
3966}
3967
3968/*
3969 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
3970 */
3971static inline void
3972mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
3973{
3974 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3975
3977 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3978 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3979 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
3980 req->cpuno = htons(cpuno);
3981}
3982
3983/*
3984 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
3985 */
3986static inline void
3987mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
3988 unsigned int word, uint64_t mask, uint64_t val)
3989{
3990 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3991
3992	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
3993 tid, word, mask, val);
3994
3995 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3996 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3997 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
3998 req->reply = V_NO_REPLY(1);
3999 req->cpu_idx = 0;
4000 req->word = htons(word);
4001 req->mask = htobe64(mask);
4002 req->val = htobe64(val);
4003}
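/*
 * A minimal sketch (assumed usage, not taken from this file) of how the
 * mk_*_ulp builders compose: several CPLs are laid out back-to-back in one
 * buffer so they travel in a single ULP_TX work request, e.g.:
 *
 *	struct cpl_set_tcb_field *f = mtod(m, struct cpl_set_tcb_field *);
 *
 *	mk_set_tcb_field_ulp(f, tid, word, mask, val);
 *	mk_cpl_barrier_ulp((struct cpl_barrier *)(f + 1));
 */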
4004
4005/*
4006 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4007 */
4008static void
2392 tp->t_rcvtime = ticks;
2393 SBAPPEND(&so->so_rcv, m);
2394 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2395 sorwakeup_locked(so);
2396 else
2397 SOCKBUF_UNLOCK(&so->so_rcv);
2398 return (1);
2399}
2400
2401/*
2402 * Handle a peer FIN.
2403 */
2404static void
2405do_peer_fin(struct socket *so, struct mbuf *m)
2406{
2407 struct tcpcb *tp = sototcpcb(so);
2408 struct toepcb *toep = tp->t_toe;
2409 int keep = 0;
2410 DPRINTF("do_peer_fin state=%d\n", tp->t_state);
2411
2412#ifdef T3_TRACE
2413 T3_TRACE0(TIDTB(sk),"do_peer_fin:");
2414#endif
2415
2416 if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2417 printf("abort_pending set\n");
2418
2419 goto out;
2420 }
2421 INP_INFO_WLOCK(&tcbinfo);
2422 INP_LOCK(tp->t_inpcb);
2423 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2424 keep = handle_peer_close_data(so, m);
2425 if (keep < 0) {
2426 INP_INFO_WUNLOCK(&tcbinfo);
2427 INP_UNLOCK(tp->t_inpcb);
2428 return;
2429 }
2430 }
2431 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2432 socantrcvmore(so);
2433 /*
2434 * If connection is half-synchronized
2435 * (ie NEEDSYN flag on) then delay ACK,
2436 * so it may be piggybacked when SYN is sent.
2437 * Otherwise, since we received a FIN then no
2438 * more input can be expected, send ACK now.
2439 */
2440 if (tp->t_flags & TF_NEEDSYN)
2441 tp->t_flags |= TF_DELACK;
2442 else
2443 tp->t_flags |= TF_ACKNOW;
2444 tp->rcv_nxt++;
2445 }
2446
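	/*
	 * Advance the connection state exactly as the host stack's input
	 * path would on receipt of a FIN.
	 */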
2447 switch (tp->t_state) {
2448 case TCPS_SYN_RECEIVED:
2449 tp->t_starttime = ticks;
2450 /* FALLTHROUGH */
2451 case TCPS_ESTABLISHED:
2452 tp->t_state = TCPS_CLOSE_WAIT;
2453 break;
2454 case TCPS_FIN_WAIT_1:
2455 tp->t_state = TCPS_CLOSING;
2456 break;
2457 case TCPS_FIN_WAIT_2:
2458 /*
2459 * If we've sent an abort_req we must have sent it too late,
2460 * HW will send us a reply telling us so, and this peer_close
2461 * is really the last message for this connection and needs to
2462 * be treated as an abort_rpl, i.e., transition the connection
2463 * to TCP_CLOSE (note that the host stack does this at the
2464 * time of generating the RST but we must wait for HW).
2465 * Otherwise we enter TIME_WAIT.
2466 */
2467 t3_release_offload_resources(toep);
2468 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2469 tp = tcp_close(tp);
2470 } else {
2471 enter_timewait(so);
2472 }
2473 break;
2474 default:
2475 log(LOG_ERR,
2476 "%s: TID %u received PEER_CLOSE in bad state %d\n",
2477 TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
2478 }
2479 INP_INFO_WUNLOCK(&tcbinfo);
2480 if (tp)
2481 INP_UNLOCK(tp->t_inpcb);
2482
2483 DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
2484
2485#ifdef notyet
2486 /* Do not send POLL_HUP for half duplex close. */
2487 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2488 sk->sk_state == TCP_CLOSE)
2489 sk_wake_async(so, 1, POLL_HUP);
2490 else
2491 sk_wake_async(so, 1, POLL_IN);
2492#endif
2493
2494out:
2495 if (!keep)
2496 m_free(m);
2497}
2498
2499/*
2500 * Handler for PEER_CLOSE CPL messages.
2501 */
2502static int
2503do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2504{
2505 struct toepcb *toep = (struct toepcb *)ctx;
2506 struct socket *so = toeptoso(toep);
2507
2508 VALIDATE_SOCK(so);
2509
2510 do_peer_fin(so, m);
2511 return (0);
2512}
2513
2514static void
2515process_close_con_rpl(struct socket *so, struct mbuf *m)
2516{
2517 struct tcpcb *tp = sototcpcb(so);
2518 struct cpl_close_con_rpl *rpl = cplhdr(m);
2519 struct toepcb *toep = tp->t_toe;
2520
2521 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
2522
2523 DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
2524 !!(so->so_state & SS_NOFDREF));
2525 if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
2526 goto out;
2527
2528 INP_INFO_WLOCK(&tcbinfo);
2529 INP_LOCK(tp->t_inpcb);
2530 switch (tp->t_state) {
2531 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2532 t3_release_offload_resources(toep);
2533 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2534 tp = tcp_close(tp);
2535
2536 } else {
2537 enter_timewait(so);
2538 soisdisconnected(so);
2539 }
2540 break;
2541 case TCPS_LAST_ACK:
2542 /*
2543 * In this state we don't care about pending abort_rpl.
2544 * If we've sent abort_req it was post-close and was sent too
2545 * late, this close_con_rpl is the actual last message.
2546 */
2547 t3_release_offload_resources(toep);
2548 tp = tcp_close(tp);
2549 break;
2550 case TCPS_FIN_WAIT_1:
2551 /*
2552 * If we can't receive any more
2553 * data, then closing user can proceed.
2554 * Starting the timer is contrary to the
2555 * specification, but if we don't get a FIN
2556 * we'll hang forever.
2557 *
2558 * XXXjl:
2559 * we should release the tp also, and use a
2560 * compressed state.
2561 */
2562 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2563 int timeout;
2564
2565 soisdisconnected(so);
2566 timeout = (tcp_fast_finwait2_recycle) ?
2567 tcp_finwait2_timeout : tcp_maxidle;
2568 tcp_timer_activate(tp, TT_2MSL, timeout);
2569 }
2570 tp->t_state = TCPS_FIN_WAIT_2;
2571 if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
2572 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2573 tp = tcp_drop(tp, 0);
2574 }
2575
2576 break;
2577 default:
2578 log(LOG_ERR,
2579 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2580 TOE_DEV(so)->tod_name, toep->tp_tid,
2581 tp->t_state);
2582 }
2583 INP_INFO_WUNLOCK(&tcbinfo);
2584 if (tp)
2585 INP_UNLOCK(tp->t_inpcb);
2586out:
2587 m_freem(m);
2588}
2589
2590/*
2591 * Handler for CLOSE_CON_RPL CPL messages.
2592 */
2593static int
2594do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2595 void *ctx)
2596{
2597 struct toepcb *toep = (struct toepcb *)ctx;
2598 struct socket *so = toeptoso(toep);
2599
2600 VALIDATE_SOCK(so);
2601
2602 process_close_con_rpl(so, m);
2603 return (0);
2604}
2605
2606/*
2607 * Process abort replies. We only process these messages if we anticipate
2608 * them as the coordination between SW and HW in this area is somewhat lacking
2609 * and sometimes we get ABORT_RPLs after we are done with the connection that
2610 * originated the ABORT_REQ.
2611 */
2612static void
2613process_abort_rpl(struct socket *so, struct mbuf *m)
2614{
2615 struct tcpcb *tp = sototcpcb(so);
2616 struct toepcb *toep = tp->t_toe;
2617
2618#ifdef T3_TRACE
2619 T3_TRACE1(TIDTB(sk),
2620 "process_abort_rpl: GTS rpl pending %d",
2621 sock_flag(sk, ABORT_RPL_PENDING));
2622#endif
2623
2624 INP_INFO_WLOCK(&tcbinfo);
2625 INP_LOCK(tp->t_inpcb);
2626
2627 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2628 /*
2629 * XXX panic on tcpdrop
2630 */
2631 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
2632 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2633 else {
2634 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2635 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2636 !is_t3a(TOE_DEV(so))) {
2637 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2638 panic("TP_ABORT_REQ_RCVD set");
2639 t3_release_offload_resources(toep);
2640 tp = tcp_close(tp);
2641 }
2642 }
2643 }
2644 if (tp)
2645 INP_UNLOCK(tp->t_inpcb);
2646 INP_INFO_WUNLOCK(&tcbinfo);
2647
2648 m_free(m);
2649}
2650
2651/*
2652 * Handle an ABORT_RPL_RSS CPL message.
2653 */
2654static int
2655do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2656{
2657 struct socket *so;
2658 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2659 struct toepcb *toep;
2660
2661 /*
2662 * Ignore replies to post-close aborts indicating that the abort was
2663 * requested too late. These connections are terminated when we get
2664 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2665 * arrives the TID is either no longer used or it has been recycled.
2666 */
2667 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2668discard:
2669 m_free(m);
2670 return (0);
2671 }
2672
2673 toep = (struct toepcb *)ctx;
2674
2675 /*
2676 * Sometimes we've already closed the socket, e.g., a post-close
2677 * abort races with ABORT_REQ_RSS, the latter frees the socket
2678 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2679 * but FW turns the ABORT_REQ into a regular one and so we get
2680 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
2681 */
2682 if (!toep)
2683 goto discard;
2684
2685 if (toep->tp_tp == NULL) {
2686 printf("removing tid for abort\n");
2687 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2688 if (toep->tp_l2t)
2689 l2t_release(L2DATA(cdev), toep->tp_l2t);
2690
2691 toepcb_release(toep);
2692 goto discard;
2693 }
2694
2695 printf("toep=%p\n", toep);
2696 printf("tp=%p\n", toep->tp_tp);
2697
2698 so = toeptoso(toep); /* <- XXX panic */
2699 toepcb_hold(toep);
2700 process_abort_rpl(so, m);
2701 toepcb_release(toep);
2702 return (0);
2703}
2704
2705/*
2706 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2707 * indicate whether RST should be sent in response.
2708 */
2709static int
2710abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2711{
2712 struct tcpcb *tp = sototcpcb(so);
2713
2714 switch (abort_reason) {
2715 case CPL_ERR_BAD_SYN:
2716#if 0
2717 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2718#endif
2719 case CPL_ERR_CONN_RESET:
2720 // XXX need to handle SYN_RECV due to crossed SYNs
2721 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2722 case CPL_ERR_XMIT_TIMEDOUT:
2723 case CPL_ERR_PERSIST_TIMEDOUT:
2724 case CPL_ERR_FINWAIT2_TIMEDOUT:
2725 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2726#if 0
2727 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2728#endif
2729 return (ETIMEDOUT);
2730 default:
2731 return (EIO);
2732 }
2733}
2734
2735static inline void
2736set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2737{
2738 struct cpl_abort_rpl *rpl = cplhdr(m);
2739
2740 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2741 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2742 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2743
2744 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2745 rpl->cmd = cmd;
2746}
2747
2748static void
2749send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2750{
2751 struct mbuf *reply_mbuf;
2752 struct cpl_abort_req_rss *req = cplhdr(m);
2753
2754 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2755	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2756	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2757 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2758 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2759 m_free(m);
2760}
2761
2762/*
2763 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2764 */
2765static inline int
2766is_neg_adv_abort(unsigned int status)
2767{
2768 return status == CPL_ERR_RTX_NEG_ADVICE ||
2769 status == CPL_ERR_PERSIST_NEG_ADVICE;
2770}
2771
2772static void
2773send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2774{
2775 struct mbuf *reply_mbuf;
2776 struct cpl_abort_req_rss *req = cplhdr(m);
2777
2778 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2779
2780 if (!reply_mbuf) {
2781		/* Defer the reply.  Stick rst_status into req->status. */
2782 req->status = rst_status;
2783 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2784 return;
2785 }
2786
2787 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2788 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2789 m_free(m);
2790
2791 /*
2792 * XXX need to sync with ARP as for SYN_RECV connections we can send
2793 * these messages while ARP is pending. For other connection states
2794 * it's not a problem.
2795 */
2796 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2797}
2798
2799#ifdef notyet
2800static void
2801cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2802{
2803 CXGB_UNIMPLEMENTED();
2804#ifdef notyet
2805 struct request_sock *req = child->sk_user_data;
2806
2807 inet_csk_reqsk_queue_removed(parent, req);
2808 synq_remove(tcp_sk(child));
2809 __reqsk_free(req);
2810 child->sk_user_data = NULL;
2811#endif
2812}
2813
2814
2815/*
2816 * Performs the actual work to abort a SYN_RECV connection.
2817 */
2818static void
2819do_abort_syn_rcv(struct socket *child, struct socket *parent)
2820{
2821 struct tcpcb *parenttp = sototcpcb(parent);
2822 struct tcpcb *childtp = sototcpcb(child);
2823
2824 /*
2825 * If the server is still open we clean up the child connection,
2826 * otherwise the server already did the clean up as it was purging
2827 * its SYN queue and the skb was just sitting in its backlog.
2828 */
2829 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2830 cleanup_syn_rcv_conn(child, parent);
2831 INP_INFO_WLOCK(&tcbinfo);
2832 INP_LOCK(childtp->t_inpcb);
2833 t3_release_offload_resources(childtp->t_toe);
2834 childtp = tcp_close(childtp);
2835 INP_INFO_WUNLOCK(&tcbinfo);
2836 if (childtp)
2837 INP_UNLOCK(childtp->t_inpcb);
2838 }
2839}
2840#endif
2841
2842/*
2843 * Handle abort requests for a SYN_RECV connection. These need extra work
2844 * because the socket is on its parent's SYN queue.
2845 */
2846static int
2847abort_syn_rcv(struct socket *so, struct mbuf *m)
2848{
2849 CXGB_UNIMPLEMENTED();
2850#ifdef notyet
2851 struct socket *parent;
2852 struct toedev *tdev = TOE_DEV(so);
2853 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2854 struct socket *oreq = so->so_incomp;
2855 struct t3c_tid_entry *t3c_stid;
2856 struct tid_info *t;
2857
2858 if (!oreq)
2859 return -1; /* somehow we are not on the SYN queue */
2860
2861 t = &(T3C_DATA(cdev))->tid_maps;
2862 t3c_stid = lookup_stid(t, oreq->ts_recent);
2863 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2864
2865 SOCK_LOCK(parent);
2866 do_abort_syn_rcv(so, parent);
2867 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2868 SOCK_UNLOCK(parent);
2869#endif
2870 return (0);
2871}
2872
2873/*
2874 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
2875 * request except that we need to reply to it.
2876 */
2877static void
2878process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
2879{
2880 int rst_status = CPL_ABORT_NO_RST;
2881 const struct cpl_abort_req_rss *req = cplhdr(m);
2882 struct tcpcb *tp = sototcpcb(so);
2883 struct toepcb *toep = tp->t_toe;
2884
2885 INP_LOCK(tp->t_inpcb);
2886 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
2887 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
2888 m_free(m);
2889 goto skip;
2890 }
2891
2892 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
2893 /*
2894 * Three cases to consider:
2895 * a) We haven't sent an abort_req; close the connection.
2896 * b) We have sent a post-close abort_req that will get to TP too late
2897 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
2898 * be ignored and the connection should be closed now.
2899 * c) We have sent a regular abort_req that will get to TP too late.
2900 * That will generate an abort_rpl with status 0, wait for it.
2901 */
2902 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
2903 (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
2904 so->so_error = abort_status_to_errno(so, req->status,
2905 &rst_status);
2906 if (__predict_true((so->so_state & SS_NOFDREF) == 0))
2907 sorwakeup(so);
2908 /*
2909 * SYN_RECV needs special processing. If abort_syn_rcv()
2910		 * returns 0 it has taken care of the abort.
2911 */
2912 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
2913 goto skip;
2914
2915 t3_release_offload_resources(toep);
2916 tp = tcp_close(tp);
2917 }
2918 if (tp)
2919 INP_UNLOCK(tp->t_inpcb);
2920 send_abort_rpl(m, tdev, rst_status);
2921 return;
2922
2923skip:
2924 INP_UNLOCK(tp->t_inpcb);
2925}
2926
2927/*
2928 * Handle an ABORT_REQ_RSS CPL message.
2929 */
2930static int
2931do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2932{
2933 const struct cpl_abort_req_rss *req = cplhdr(m);
2934 struct toepcb *toep = (struct toepcb *)ctx;
2935 struct socket *so;
2936 struct inpcb *inp;
2937
2938 if (is_neg_adv_abort(req->status)) {
2939 m_free(m);
2940 return (0);
2941 }
2942
2943 printf("aborting tid=%d\n", toep->tp_tid);
2944
2945 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
2946 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2947 toep->tp_flags |= TP_ABORT_REQ_RCVD;
2948 printf("sending abort rpl\n");
2949
2950 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
2951 printf("sent\n");
2952 if (toep->tp_l2t)
2953 l2t_release(L2DATA(cdev), toep->tp_l2t);
2954
2955 /*
2956 * Unhook
2957 */
2958 toep->tp_tp->t_toe = NULL;
2959 toep->tp_tp->t_flags &= ~TF_TOE;
2960 toep->tp_tp = NULL;
2961 /*
2962 * XXX need to call syncache_chkrst - but we don't
2963 * have a way of doing that yet
2964 */
2965 toepcb_release(toep);
2966 printf("abort for unestablished connection :-(\n");
2967 return (0);
2968 }
2969 if (toep->tp_tp == NULL) {
2970 printf("disconnected toepcb\n");
2971 /* should be freed momentarily */
2972 return (0);
2973 }
2974
2975 so = toeptoso(toep);
2976 inp = sotoinpcb(so);
2977
2978 VALIDATE_SOCK(so);
2979 toepcb_hold(toep);
2980 INP_INFO_WLOCK(&tcbinfo);
2981 process_abort_req(so, m, TOE_DEV(so));
2982 INP_INFO_WUNLOCK(&tcbinfo);
2983 toepcb_release(toep);
2984 return (0);
2985}
2986#ifdef notyet
2987static void
2988pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
2989{
2990 struct toedev *tdev = TOE_DEV(parent);
2991
2992 do_abort_syn_rcv(child, parent);
2993 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
2994 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
2995
2996 rpl->opt0h = htonl(F_TCAM_BYPASS);
2997 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
2998 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
2999 } else
3000 m_free(m);
3001}
3002#endif
3003static void
3004handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3005{
3006 CXGB_UNIMPLEMENTED();
3007
3008#ifdef notyet
3009 struct t3cdev *cdev;
3010 struct socket *parent;
3011 struct socket *oreq;
3012 struct t3c_tid_entry *t3c_stid;
3013 struct tid_info *t;
3014 struct tcpcb *otp, *tp = sototcpcb(so);
3015 struct toepcb *toep = tp->t_toe;
3016
3017 /*
3018 * If the connection is being aborted due to the parent listening
3019 * socket going away there's nothing to do, the ABORT_REQ will close
3020 * the connection.
3021 */
3022 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3023 m_free(m);
3024 return;
3025 }
3026
3027 oreq = so->so_incomp;
3028 otp = sototcpcb(oreq);
3029
3030 cdev = T3C_DEV(so);
3031 t = &(T3C_DATA(cdev))->tid_maps;
3032 t3c_stid = lookup_stid(t, otp->ts_recent);
3033 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3034
3035 SOCK_LOCK(parent);
3036 pass_open_abort(so, parent, m);
3037 SOCK_UNLOCK(parent);
3038#endif
3039}
3040
3041/*
3042 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3043 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3044 * connection.
3045 */
3046static void
3047pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3048{
3049
3050#ifdef notyet
3051 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3052 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3053#endif
3054 handle_pass_open_arp_failure(m_get_socket(m), m);
3055}
3056
3057/*
3058 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3059 */
3060static void
3061mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3062{
3063 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3064 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3065 unsigned int tid = GET_TID(req);
3066
3067 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3068 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3069 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3070 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
3071 rpl->opt0h = htonl(F_TCAM_BYPASS);
3072 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3073 rpl->opt2 = 0;
3074 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3075}
3076
3077/*
3078 * Send a deferred reject to an accept request.
3079 */
3080static void
3081reject_pass_request(struct toedev *tdev, struct mbuf *m)
3082{
3083 struct mbuf *reply_mbuf;
3084
3085 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3086 mk_pass_accept_rpl(reply_mbuf, m);
3087 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3088 m_free(m);
3089}
3090
3091static void
3092handle_syncache_event(int event, void *arg)
3093{
3094 struct toepcb *toep = arg;
3095
3096 switch (event) {
3097 case TOE_SC_ENTRY_PRESENT:
3098 /*
3099 * entry already exists - free toepcb
3100 * and l2t
3101 */
3102 printf("syncache entry present\n");
3103 toepcb_release(toep);
3104 break;
3105 case TOE_SC_DROP:
3106 /*
3107 * The syncache has given up on this entry
3108 * either it timed out, or it was evicted
3109 * we need to explicitly release the tid
3110 */
3111 printf("syncache entry dropped\n");
3112 toepcb_release(toep);
3113 break;
3114 default:
3115 log(LOG_ERR, "unknown syncache event %d\n", event);
3116 break;
3117 }
3118}
3119
3120static void
3121syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3122{
3123 struct in_conninfo inc;
3124 struct tcpopt to;
3125 struct tcphdr th;
3126 struct inpcb *inp;
3127 int mss, wsf, sack, ts;
3128 uint32_t rcv_isn = ntohl(req->rcv_isn);
3129
3130	bzero(&inc, sizeof(struct in_conninfo));
	bzero(&to, sizeof(struct tcpopt));
3131 inp = sotoinpcb(lso);
3132
3133 /*
3134 * Fill out information for entering us into the syncache
3135 */
3136 inc.inc_fport = th.th_sport = req->peer_port;
3137 inc.inc_lport = th.th_dport = req->local_port;
3138 th.th_seq = req->rcv_isn;
3139 th.th_flags = TH_SYN;
3140
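	/* rcv_isn is the peer's ISN; the first in-sequence byte is rcv_isn + 1. */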
3141 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3142
3143
3144 inc.inc_isipv6 = 0;
3145 inc.inc_len = 0;
3146 inc.inc_faddr.s_addr = req->peer_ip;
3147 inc.inc_laddr.s_addr = req->local_ip;
3148
3149 DPRINTF("syncache add of %d:%d %d:%d\n",
3150 ntohl(req->local_ip), ntohs(req->local_port),
3151 ntohl(req->peer_ip), ntohs(req->peer_port));
3152
3153 mss = req->tcp_options.mss;
3154 wsf = req->tcp_options.wsf;
3155 ts = req->tcp_options.tstamp;
3156 sack = req->tcp_options.sack;
3157 to.to_mss = mss;
3158 to.to_wscale = wsf;
3159 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3160 INP_INFO_WLOCK(&tcbinfo);
3161 INP_LOCK(inp);
3162 syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3163}
3164
3165
3166/*
3167 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3168 * lock held. Note that the sock here is a listening socket that is not owned
3169 * by the TOE.
3170 */
3171static void
3172process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3173 struct listen_ctx *lctx)
3174{
3175 int rt_flags;
3176 struct l2t_entry *e;
3177 struct iff_mac tim;
3178 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3179 struct cpl_pass_accept_rpl *rpl;
3180 struct cpl_pass_accept_req *req = cplhdr(m);
3181 unsigned int tid = GET_TID(req);
3182 struct tom_data *d = TOM_DATA(tdev);
3183 struct t3cdev *cdev = d->cdev;
3184 struct tcpcb *tp = sototcpcb(so);
3185 struct toepcb *newtoep;
3186 struct rtentry *dst;
3187 struct sockaddr_in nam;
3188 struct t3c_data *td = T3C_DATA(cdev);
3189
3190 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3191 if (__predict_false(reply_mbuf == NULL)) {
3192 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3193 t3_defer_reply(m, tdev, reject_pass_request);
3194 else {
3195 cxgb_queue_tid_release(cdev, tid);
3196 m_free(m);
3197 }
3198 DPRINTF("failed to get reply_mbuf\n");
3199
3200 goto out;
3201 }
3202
3203 if (tp->t_state != TCPS_LISTEN) {
3204 DPRINTF("socket not in listen state\n");
3205
3206 goto reject;
3207 }
3208
3209 tim.mac_addr = req->dst_mac;
3210 tim.vlan_tag = ntohs(req->vlan_tag);
3211 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3212 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3213 goto reject;
3214 }
3215
3216#ifdef notyet
3217 /*
3218 * XXX do route lookup to confirm that we're still listening on this
3219 * address
3220 */
3221 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3222 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3223 goto reject;
3224 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3225 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3226 dst_release(skb->dst); // done with the input route, release it
3227 skb->dst = NULL;
3228
3229 if ((rt_flags & RTF_LOCAL) == 0)
3230 goto reject;
3231#endif
3232 /*
3233 * XXX
3234 */
3235 rt_flags = RTF_LOCAL;
3236 if ((rt_flags & RTF_LOCAL) == 0)
3237 goto reject;
3238
3239 /*
3240 * Calculate values and add to syncache
3241 */
3242
3243 newtoep = toepcb_alloc();
3244 if (newtoep == NULL)
3245 goto reject;
3246
3247 bzero(&nam, sizeof(struct sockaddr_in));
3248
3249 nam.sin_len = sizeof(struct sockaddr_in);
3250 nam.sin_family = AF_INET;
3251	nam.sin_addr.s_addr = req->peer_ip;
3252 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3253
3254 if (dst == NULL) {
3255 printf("failed to find route\n");
3256 goto reject;
3257 }
3258 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3259 (struct sockaddr *)&nam);
3260	if (e == NULL) {
3261		DPRINTF("failed to get l2t\n");
		goto reject;	/* opt0h below needs a valid L2 entry */
3262	}
3263 /*
3264 * Point to our listen socket until accept
3265 */
3266 newtoep->tp_tp = tp;
3267 newtoep->tp_flags = TP_SYN_RCVD;
3268 newtoep->tp_tid = tid;
3269 newtoep->tp_toedev = tdev;
3270 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3271
3272 cxgb_insert_tid(cdev, d->client, newtoep, tid);
3273 SOCK_LOCK(so);
3274 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3275 SOCK_UNLOCK(so);
3276
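	/*
	 * Use DDP ulp mode only when the tunable enables it, the socket has
	 * not opted out via SO_NO_DDP, and the receive window is large enough
	 * to be worth it.
	 */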
3277 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
3278 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3279
3280 if (newtoep->tp_ulp_mode) {
3281 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3282
3283 if (ddp_mbuf == NULL)
3284 newtoep->tp_ulp_mode = 0;
3285 }
3286
3287 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3288 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3289 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3290 /*
3291 * XXX workaround for lack of syncache drop
3292 */
3293 toepcb_hold(newtoep);
3294 syncache_add_accept_req(req, so, newtoep);
3295
3296 rpl = cplhdr(reply_mbuf);
3297 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3298 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3299 rpl->wr.wr_lo = 0;
3300 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3301 rpl->opt2 = htonl(calc_opt2(so, tdev));
3302 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3303 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3304
3305 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3306 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3307 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3308 CPL_PASS_OPEN_ACCEPT);
3309
3310 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3311
3312 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3313
3314 l2t_send(cdev, reply_mbuf, e);
3315 m_free(m);
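	/* Start with DDP off and the DDP timer workaround in place. */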
3316 if (newtoep->tp_ulp_mode) {
3317 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3318 V_TF_DDP_OFF(1) |
3319 TP_DDP_TIMER_WORKAROUND_MASK,
3320 V_TF_DDP_OFF(1) |
3321 TP_DDP_TIMER_WORKAROUND_VAL, 1);
3322 } else
3323 printf("not offloading\n");
3324
3327 return;
3328reject:
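	/* XXX newtoep is uninitialized or NULL on the early reject paths above */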
3329 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3330 mk_pass_accept_rpl(reply_mbuf, m);
3331 else
3332 mk_tid_release(reply_mbuf, newtoep, tid);
3333 cxgb_ofld_send(cdev, reply_mbuf);
3334 m_free(m);
3335out:
3336#if 0
3337 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3338#else
3339 return;
3340#endif
3341}
3342
3343/*
3344 * Handle a CPL_PASS_ACCEPT_REQ message.
3345 */
3346static int
3347do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3348{
3349 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3350 struct socket *lso = listen_ctx->lso;
3351 struct tom_data *d = listen_ctx->tom_data;
3352
3353#if VALIDATE_TID
3354 struct cpl_pass_accept_req *req = cplhdr(m);
3355 unsigned int tid = GET_TID(req);
3356 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3357
3358	if (unlikely(!lso)) {
3359 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3360 cdev->name,
3361 (unsigned long)((union listen_entry *)ctx -
3362 t->stid_tab));
3363 return CPL_RET_BUF_DONE;
3364 }
3365 if (unlikely(tid >= t->ntids)) {
3366 printk(KERN_ERR "%s: passive open TID %u too large\n",
3367 cdev->name, tid);
3368 return CPL_RET_BUF_DONE;
3369 }
3370 /*
3371 * For T3A the current user of the TID may have closed but its last
3372 * message(s) may have been backlogged so the TID appears to be still
3373 * in use. Just take the TID away, the connection can close at its
3374 * own leisure. For T3B this situation is a bug.
3375 */
3376 if (!valid_new_tid(t, tid) &&
3377 cdev->type != T3A) {
3378 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3379 cdev->name, tid);
3380 return CPL_RET_BUF_DONE;
3381 }
3382#endif
3383
3384 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3385 return (0);
3386}
3387
3388/*
3389 * Called when a connection is established to translate the TCP options
3390 * reported by HW to FreeBSD's native format.
3391 */
3392static void
3393assign_rxopt(struct socket *so, unsigned int opt)
3394{
3395 const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
3396 struct tcpcb *tp = sototcpcb(so);
3397 struct toepcb *toep = tp->t_toe;
3398
3399 INP_LOCK_ASSERT(tp->t_inpcb);
3400
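	/*
	 * G_TCPOPT_MSS(opt) is an index into the adapter's MTU table; the
	 * MSS clamp is that MTU less 40 bytes of IP and TCP headers.
	 */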
3401 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3402 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3403 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3404 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3405 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3406 (TF_RCVD_SCALE|TF_REQ_SCALE))
3407 tp->rcv_scale = tp->request_r_scale;
3408}
3409
3410/*
3411 * Completes some final bits of initialization for just established connections
3412 * and changes their state to TCP_ESTABLISHED.
3413 *
3414 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3415 */
3416static void
3417make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3418{
3419 struct tcpcb *tp = sototcpcb(so);
3420 struct toepcb *toep = tp->t_toe;
3421
3422 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3423 assign_rxopt(so, opt);
3424 so->so_proto->pr_ctloutput = t3_ctloutput;
3425
3426#if 0
3427 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3428#endif
3429 /*
3430 * XXX not clear what rcv_wup maps to
3431 */
3432 /*
3433 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3434 * pass through opt0.
3435 */
3436 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3437 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3438
3439 dump_toepcb(toep);
3440
3441#ifdef notyet
3442/*
3443 * no clean interface for marking ARP up to date
3444 */
3445 dst_confirm(sk->sk_dst_cache);
3446#endif
3447 tp->t_starttime = ticks;
3448 tp->t_state = TCPS_ESTABLISHED;
3449 soisconnected(so);
3450}
3451
3452static int
3453syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3454{
3455
3456 struct in_conninfo inc;
3457 struct tcpopt to;
3458 struct tcphdr th;
3459 int mss, wsf, sack, ts;
3460 struct mbuf *m = NULL;
3461 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3462 unsigned int opt;
3463
3464#ifdef MAC
3465#error "no MAC support"
3466#endif
3467
3468 opt = ntohs(req->tcp_opt);
3469
3470	bzero(&inc, sizeof(struct in_conninfo));
	bzero(&to, sizeof(struct tcpopt));
3471
3472 /*
3473 * Fill out information for entering us into the syncache
3474 */
3475 inc.inc_fport = th.th_sport = req->peer_port;
3476 inc.inc_lport = th.th_dport = req->local_port;
3477 th.th_seq = req->rcv_isn;
3478 th.th_flags = TH_ACK;
3479
3480 inc.inc_isipv6 = 0;
3481 inc.inc_len = 0;
3482 inc.inc_faddr.s_addr = req->peer_ip;
3483 inc.inc_laddr.s_addr = req->local_ip;
3484
3485 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3486 wsf = G_TCPOPT_WSCALE_OK(opt);
3487 ts = G_TCPOPT_TSTAMP(opt);
3488 sack = G_TCPOPT_SACK(opt);
3489
3490 to.to_mss = mss;
3491 to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3492 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3493
3494 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3495 ntohl(req->local_ip), ntohs(req->local_port),
3496 ntohl(req->peer_ip), ntohs(req->peer_port),
3497 mss, wsf, ts, sack);
3498 return syncache_expand(&inc, &to, &th, so, m);
3499}
3500
3501
3502/*
3503 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3504 * if we are in TCP_SYN_RECV due to crossed SYNs
3505 */
3506static int
3507do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3508{
3509 struct cpl_pass_establish *req = cplhdr(m);
3510 struct toepcb *toep = (struct toepcb *)ctx;
3511 struct tcpcb *tp;
3512 struct socket *so, *lso;
3513 struct t3c_data *td = T3C_DATA(cdev);
3514 // Complete socket initialization now that we have the SND_ISN
3515
3516 struct toedev *tdev;
3517
3518 so = lso = toeptoso(toep);
3519 tdev = toep->tp_toedev;
3520
3521 SOCK_LOCK(so);
3522 LIST_REMOVE(toep, synq_entry);
3523 SOCK_UNLOCK(so);
3524
3525 INP_INFO_WLOCK(&tcbinfo);
3526 if (!syncache_expand_establish_req(req, &so, toep)) {
3527 /*
3528 * No entry
3529 */
3530 CXGB_UNIMPLEMENTED();
3531 }
3532 if (so == NULL) {
3533 /*
3534 * Couldn't create the socket
3535 */
3536 CXGB_UNIMPLEMENTED();
3537 }
3538
3539	/*
3540	 * XXX workaround for lack of syncache drop: releases the hold taken
3541	 * on the toepcb before syncache_add_accept_req().
	 */
3542 toepcb_release(toep);
3543
3544 tp = sototcpcb(so);
3545 INP_LOCK(tp->t_inpcb);
3546
3547 so->so_snd.sb_flags |= SB_NOCOALESCE;
3548 so->so_rcv.sb_flags |= SB_NOCOALESCE;
3549
3550 toep->tp_tp = tp;
3551 toep->tp_flags = 0;
3552 tp->t_toe = toep;
3553 reset_wr_list(toep);
3554 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3555 tp->rcv_nxt = toep->tp_copied_seq;
3556 install_offload_ops(so);
3557
3558 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3559 toep->tp_wr_unacked = 0;
3560 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3561 toep->tp_qset_idx = 0;
3562 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3563
3564 /*
3565 * XXX Cancel any keep alive timer
3566 */
3567
3568 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3569 INP_INFO_WUNLOCK(&tcbinfo);
3570 INP_UNLOCK(tp->t_inpcb);
3571
3572 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3573 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3574#ifdef notyet
3575 /*
3576 * XXX not sure how these checks map to us
3577 */
3578 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3579 sk->sk_state_change(sk);
3580 sk_wake_async(so, 0, POLL_OUT);
3581 }
3582 /*
3583 * The state for the new connection is now up to date.
3584 * Next check if we should add the connection to the parent's
3585 * accept queue. When the parent closes it resets connections
3586 * on its SYN queue, so check if we are being reset. If so we
3587 * don't need to do anything more, the coming ABORT_RPL will
3588 * destroy this socket. Otherwise move the connection to the
3589 * accept queue.
3590 *
3591 * Note that we reset the synq before closing the server so if
3592 * we are not being reset the stid is still open.
3593 */
3594 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3595 __kfree_skb(skb);
3596 goto unlock;
3597 }
3598#endif
3599 m_free(m);
3600
3601 return (0);
3602}
3603
3604/*
3605 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3606 * and send them to the TOE.
3607 */
3608static void
3609fixup_and_send_ofo(struct socket *so)
3610{
3611 struct mbuf *m;
3612 struct toedev *tdev = TOE_DEV(so);
3613 struct tcpcb *tp = sototcpcb(so);
3614 struct toepcb *toep = tp->t_toe;
3615 unsigned int tid = toep->tp_tid;
3616
3617 printf("fixup_and_send_ofo\n");
3618
3619 INP_LOCK_ASSERT(tp->t_inpcb);
3620 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3621 /*
3622 * A variety of messages can be waiting but the fields we'll
3623 * be touching are common to all so any message type will do.
3624 */
3625 struct cpl_close_con_req *p = cplhdr(m);
3626
3627 p->wr.wr_lo = htonl(V_WR_TID(tid));
3628 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3629 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3630 }
3631}
3632
3633/*
3634 * Updates socket state from an active establish CPL message. Runs with the
3635 * socket lock held.
3636 */
3637static void
3638socket_act_establish(struct socket *so, struct mbuf *m)
3639{
3640 struct cpl_act_establish *req = cplhdr(m);
3641 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3642 struct tcpcb *tp = sototcpcb(so);
3643 struct toepcb *toep = tp->t_toe;
3644
3645 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3646 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3647 toep->tp_tid, tp->t_state);
3648
3649 tp->ts_recent_age = ticks;
3650	tp->irs = tp->rcv_nxt = rcv_isn;
3651 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3652
3653 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3654
3655 /*
3656 * Now that we finally have a TID send any CPL messages that we had to
3657 * defer for lack of a TID.
3658 */
3659 if (mbufq_len(&toep->out_of_order_queue))
3660 fixup_and_send_ofo(so);
3661
3662 if (__predict_false(so->so_state & SS_NOFDREF)) {
3663 /*
3664 * XXX does this even make sense?
3665 */
3666 sorwakeup(so);
3667 }
3668 m_free(m);
3669#ifdef notyet
3670/*
3671 * XXX assume no write requests permitted while socket connection is
3672 * incomplete
3673 */
3674 /*
3675 * Currently the send queue must be empty at this point because the
3676 * socket layer does not send anything before a connection is
3677 * established. To be future proof though we handle the possibility
3678 * that there are pending buffers to send (either TX_DATA or
3679 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3680 * buffers according to the just learned write_seq, and then we send
3681 * them on their way.
3682 */
3683 fixup_pending_writeq_buffers(sk);
3684 if (t3_push_frames(so, 1))
3685 sk->sk_write_space(sk);
3686#endif
3687
3688 toep->tp_state = tp->t_state;
3689 tcpstat.tcps_connects++;
3690
3691}
3692
3693/*
3694 * Process a CPL_ACT_ESTABLISH message.
3695 */
3696static int
3697do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3698{
3699 struct cpl_act_establish *req = cplhdr(m);
3700 unsigned int tid = GET_TID(req);
3701 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3702 struct toepcb *toep = (struct toepcb *)ctx;
3703 struct tcpcb *tp = toep->tp_tp;
3704 struct socket *so;
3705 struct toedev *tdev;
3706 struct tom_data *d;
3707
3708 if (tp == NULL) {
3709 free_atid(cdev, atid);
3710 return (0);
3711 }
3712
3713 so = toeptoso(toep);
3714 tdev = TOE_DEV(so); /* blow up here if link was down */
3715 d = TOM_DATA(tdev);
3716
3717 INP_LOCK(tp->t_inpcb);
3718
3719 /*
3720 * It's OK if the TID is currently in use, the owning socket may have
3721 * backlogged its last CPL message(s). Just take it away.
3722 */
3723 toep->tp_tid = tid;
3724 toep->tp_tp = tp;
3725 so_insert_tid(d, so, tid);
3726 free_atid(cdev, atid);
3727 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3728
3729 socket_act_establish(so, m);
3730 INP_UNLOCK(tp->t_inpcb);
3731 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3732 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3733
3734 return (0);
3735}
3736
3737/*
3738 * Process an acknowledgment of WR completion. Advance snd_una and send the
3739 * next batch of work requests from the write queue.
3740 */
3741static void
3742wr_ack(struct toepcb *toep, struct mbuf *m)
3743{
3744 struct tcpcb *tp = toep->tp_tp;
3745 struct cpl_wr_ack *hdr = cplhdr(m);
3746 struct socket *so = toeptoso(toep);
3747 unsigned int credits = ntohs(hdr->credits);
3748 u32 snd_una = ntohl(hdr->snd_una);
3749 int bytes = 0;
3750
3751 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3752
3753 INP_LOCK(tp->t_inpcb);
3754
3755 toep->tp_wr_avail += credits;
3756 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3757 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3758
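	/*
	 * Retire completed work requests.  Each queued WR mbuf caches the
	 * credits it consumed in m_pkthdr.csum_data and its payload length
	 * in m_pkthdr.len; a WR is dequeued and freed only once all of its
	 * credits have returned.
	 */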
3759 while (credits) {
3760 struct mbuf *p = peek_wr(toep);
3761
3762 if (__predict_false(!p)) {
3763 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3764 "nothing pending, state %u wr_avail=%u\n",
3765 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3766 break;
3767 }
3768 CTR2(KTR_TOM,
3769 "wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
3770
3771 KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
3772 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3773
3774#if DEBUG_WR > 1
3775 struct tx_data_wr *w = cplhdr(p);
3776 log(LOG_ERR,
3777 "TID %u got %u WR credits, need %u, len %u, "
3778 "main body %u, frags %u, seq # %u, ACK una %u,"
3779 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3780 toep->tp_tid, credits, p->csum, p->len,
3781 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3782 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3783 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3784#endif
3785 p->m_pkthdr.csum_data -= credits;
3786 break;
3787 } else {
3788 dequeue_wr(toep);
3789 credits -= p->m_pkthdr.csum_data;
3790 bytes += p->m_pkthdr.len;
3791 CTR3(KTR_TOM,
3792 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3793 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3794
3795 m_free(p);
3796 }
3797 }
3798
3799#if DEBUG_WR
3800 check_wr_invariants(tp);
3801#endif
3802
3803 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3804#if VALIDATE_SEQ
3805 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3806
3807 		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3808 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3809 toep->tp_tid, tp->snd_una);
3810#endif
3811 goto out_free;
3812 }
3813
3814 if (tp->snd_una != snd_una) {
3815 tp->snd_una = snd_una;
3816 tp->ts_recent_age = ticks;
3817#ifdef notyet
3818 /*
3819 * Keep ARP entry "minty fresh"
3820 */
3821 dst_confirm(sk->sk_dst_cache);
3822#endif
3823 if (tp->snd_una == tp->snd_nxt)
3824 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3825 }
3826 if (bytes) {
3827 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3828 SOCKBUF_LOCK(&so->so_snd);
3829 sbdrop_locked(&so->so_snd, bytes);
3830 sowwakeup_locked(so);
3831 }
3832
3833 if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
3834 t3_push_frames(so, 0);
3835
3836out_free:
3837 INP_UNLOCK(tp->t_inpcb);
3838 m_free(m);
3839}
3840
3841/*
3842  * Handler for TX_DMA_ACK CPL messages.
3843 */
3844static int
3845do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3846{
3847 struct toepcb *toep = (struct toepcb *)ctx;
3848
3849 VALIDATE_SOCK(so);
3850
3851 wr_ack(toep, m);
3852 	return (0);
3853}
3854
3855/*
3856 * Handler for TRACE_PKT CPL messages. Just sink these packets.
3857 */
3858static int
3859do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
3860{
3861 m_freem(m);
3862 	return (0);
3863}
3864
3865/*
3866 * Reset a connection that is on a listener's SYN queue or accept queue,
3867 * i.e., one that has not had a struct socket associated with it.
3868 * Must be called from process context.
3869 *
3870 * Modeled after code in inet_csk_listen_stop().
3871 */
3872static void
3873t3_reset_listen_child(struct socket *child)
3874{
3875 struct tcpcb *tp = sototcpcb(child);
3876
3877 t3_send_reset(tp->t_toe);
3878}
3879
3880/*
3881 * Disconnect offloaded established but not yet accepted connections sitting
3882 * on a server's accept_queue. We just send an ABORT_REQ at this point and
3883 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
3884 */
3885void
3886t3_disconnect_acceptq(struct socket *listen_so)
3887{
3888 struct socket *so;
3889 struct tcpcb *tp;
3890
3891 TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
3892 tp = sototcpcb(so);
3893
3894 if (tp->t_flags & TF_TOE) {
3895 INP_LOCK(tp->t_inpcb);
3896 t3_reset_listen_child(so);
3897 INP_UNLOCK(tp->t_inpcb);
3898 }
3899
3900 }
3901}
3902
3903/*
3904 * Reset offloaded connections sitting on a server's syn queue. As above
3905 * we send ABORT_REQ and finish off when we get ABORT_RPL.
3906 */
3907
3908void
3909t3_reset_synq(struct listen_ctx *lctx)
3910{
3911 struct toepcb *toep;
3912
3913 SOCK_LOCK(lctx->lso);
3914 while (!LIST_EMPTY(&lctx->synq_head)) {
3915 toep = LIST_FIRST(&lctx->synq_head);
3916 LIST_REMOVE(toep, synq_entry);
3917 toep->tp_tp = NULL;
3918 t3_send_reset(toep);
3919 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
3920 toepcb_release(toep);
3921 }
3922 SOCK_UNLOCK(lctx->lso);
3923}
3924
3925
3926int
3927t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
3928 unsigned int nppods, unsigned int tag, unsigned int maxoff,
3929 unsigned int pg_off, unsigned int color)
3930{
3931 unsigned int i, j, pidx;
3932 struct pagepod *p;
3933 struct mbuf *m;
3934 struct ulp_mem_io *req;
3935 struct tcpcb *tp = sototcpcb(so);
3936 struct toepcb *toep = tp->t_toe;
3937 unsigned int tid = toep->tp_tid;
3938 const struct tom_data *td = TOM_DATA(TOE_DEV(so));
3939 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
3940
3941 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
3942 gl, nppods, tag, maxoff, pg_off, color);
3943
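	/*
	 * Each iteration writes one page pod into adapter memory with a
	 * ULP_MEM_WRITE work request, advancing ppod_addr by PPOD_SIZE;
	 * the final NUM_SENTINEL_PPODS pods are written as invalid sentinels.
	 */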
3944 for (i = 0; i < nppods; ++i) {
3945 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
3946 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
3947 req = mtod(m, struct ulp_mem_io *);
3948 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
3949 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
3950 req->wr.wr_lo = 0;
3951 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
3952 V_ULPTX_CMD(ULP_MEM_WRITE));
3953 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
3954 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
3955
3956 p = (struct pagepod *)(req + 1);
3957 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
3958 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
3959 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
3960 V_PPOD_COLOR(color));
3961 p->pp_max_offset = htonl(maxoff);
3962 p->pp_page_offset = htonl(pg_off);
3963 p->pp_rsvd = 0;
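			/*
			 * A pod carries 5 page pointers but consecutive pods
			 * start only 4 pages apart (pidx = 4 * i), so
			 * adjacent pods overlap by one page.
			 */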
3964 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
3965 p->pp_addr[j] = pidx < gl->dgl_nelem ?
3966 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
3967 } else
3968 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
3969 send_or_defer(toep, m, 0);
3970 ppod_addr += PPOD_SIZE;
3971 }
3972 return (0);
3973}
3974
3975/*
3976 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
3977 */
3978static inline void
3979mk_cpl_barrier_ulp(struct cpl_barrier *b)
3980{
3981 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
3982
3983 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3984 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
3985 b->opcode = CPL_BARRIER;
3986}
3987
3988/*
3989 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
3990 */
3991static inline void
3992mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
3993{
3994 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
3995
3997 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
3998 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
3999 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4000 req->cpuno = htons(cpuno);
4001}
4002
4003/*
4004 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4005 */
4006static inline void
4007mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4008 unsigned int word, uint64_t mask, uint64_t val)
4009{
4010 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4011
4012 	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4013 tid, word, mask, val);
4014
4015 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4016 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4017 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4018 req->reply = V_NO_REPLY(1);
4019 req->cpu_idx = 0;
4020 req->word = htons(word);
4021 req->mask = htobe64(mask);
4022 req->val = htobe64(val);
4023}
4024
4025/*
4026 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4027 */
4028static void
4009mk_rx_data_ack_ulp(struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits)
4029mk_rx_data_ack_ulp(struct socket *so, struct cpl_rx_data_ack *ack,
4030 unsigned int tid, unsigned int credits)
4010{
4011 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4012
4013 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4014 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4015 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4016 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4017 V_RX_DACK_MODE(1) | V_RX_CREDITS(credits));
4038 V_RX_DACK_MODE(TOM_TUNABLE(TOE_DEV(so), delack)) |
4039 V_RX_CREDITS(credits));
4018}
4019
4020void
4021t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4022{
4023 unsigned int wrlen;
4024 struct mbuf *m;
4025 struct work_request_hdr *wr;
4026 struct cpl_barrier *lock;
4027 struct cpl_set_tcb_field *req;
4028 struct cpl_get_tcb *getreq;
4029 struct ddp_state *p = &toep->tp_ddp_state;
4030
4031 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4032 wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4033 sizeof(*getreq);
4034 m = m_gethdr_nofail(wrlen);
4035 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4036 wr = mtod(m, struct work_request_hdr *);
4037 bzero(wr, wrlen);
4038
4039 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4040 m->m_pkthdr.len = m->m_len = wrlen;
4041
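	/*
	 * Compound WR layout: CPL_BARRIER, CPL_SET_TCB_FIELD, CPL_GET_TCB,
	 * CPL_BARRIER.  The barriers bracket the flag update and TCB read,
	 * presumably so they are processed without reordering against
	 * surrounding traffic.
	 */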
4042 lock = (struct cpl_barrier *)(wr + 1);
4043 mk_cpl_barrier_ulp(lock);
4044
4045 req = (struct cpl_set_tcb_field *)(lock + 1);
4046
4047 CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4048
4049 	/* Hmmm, not sure if this is actually a good thing: reactivating
4050 	 * the other buffer might be an issue if it has been completed
4051 	 * already. However, that is unlikely, since the fact that the UBUF
4052 	 * is not completed indicates that there is no outstanding data.
4053 */
4054 if (bufidx == 0)
4055 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4056 V_TF_DDP_ACTIVE_BUF(1) |
4057 V_TF_DDP_BUF0_VALID(1),
4058 V_TF_DDP_ACTIVE_BUF(1));
4059 else
4060 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4061 V_TF_DDP_ACTIVE_BUF(1) |
4062 V_TF_DDP_BUF1_VALID(1), 0);
4063
4064 getreq = (struct cpl_get_tcb *)(req + 1);
4065 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4066
4067 mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4068
4069 	/* Keep track of the number of outstanding CPL_GET_TCB requests.
4070 	 */
4071 p->get_tcb_count++;
4072
4073#ifdef T3_TRACE
4074 T3_TRACE1(TIDTB(so),
4075 "t3_cancel_ddpbuf: bufidx %u", bufidx);
4076#endif
4077 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4078}
4079
4080/**
4081 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4082 * @sk: the socket associated with the buffers
4083 * @bufidx: index of HW DDP buffer (0 or 1)
4084 * @tag0: new tag for HW buffer 0
4085 * @tag1: new tag for HW buffer 1
4086 * @len: new length for HW buf @bufidx
4087 *
4088 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4089 * buffer by changing the buffer tag and length and setting the valid and
4090 * active flag accordingly. The caller must ensure the new buffer is at
4091 * least as big as the existing one. Since we typically reprogram both HW
4092 * buffers this function sets both tags for convenience. Read the TCB to
4093  * determine how much data was written into the buffer before the overlay
4094 * took place.
4095 */
4096void
4097t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4098 unsigned int tag1, unsigned int len)
4099{
4100 unsigned int wrlen;
4101 struct mbuf *m;
4102 struct work_request_hdr *wr;
4103 struct cpl_get_tcb *getreq;
4104 struct cpl_set_tcb_field *req;
4105 struct ddp_state *p = &toep->tp_ddp_state;
4106
4107 	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4108 bufidx, tag0, tag1, len);
4109 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4110 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4111 m = m_gethdr_nofail(wrlen);
4112 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4113 wr = mtod(m, struct work_request_hdr *);
4114 m->m_pkthdr.len = m->m_len = wrlen;
4115 bzero(wr, wrlen);
4116
4118 /* Set the ATOMIC flag to make sure that TP processes the following
4119 * CPLs in an atomic manner and no wire segments can be interleaved.
4120 */
4121 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4122 req = (struct cpl_set_tcb_field *)(wr + 1);
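	/*
	 * Both buffer tags live in one TCB word: buf0's tag in the low 32
	 * bits and buf1's in the high 32 bits, hence the << 32 shifts below.
	 */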
4123 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4124 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4125 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4126 V_TCB_RX_DDP_BUF0_TAG(tag0) |
4127 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4128 req++;
4129 if (bufidx == 0) {
4130 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4131 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4132 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4133 req++;
4134 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4135 V_TF_DDP_PUSH_DISABLE_0(1) |
4136 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4137 V_TF_DDP_PUSH_DISABLE_0(0) |
4138 V_TF_DDP_BUF0_VALID(1));
4139 } else {
4140 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4141 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4142 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4143 req++;
4144 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4145 V_TF_DDP_PUSH_DISABLE_1(1) |
4146 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4147 V_TF_DDP_PUSH_DISABLE_1(0) |
4148 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4149 }
4150
4151 getreq = (struct cpl_get_tcb *)(req + 1);
4152 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4153
4154 	/* Keep track of the number of outstanding CPL_GET_TCB requests.
4155 	 */
4156 p->get_tcb_count++;
4157
4158#ifdef T3_TRACE
4159 T3_TRACE4(TIDTB(sk),
4160 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4161 "len %d",
4162 bufidx, tag0, tag1, len);
4163#endif
4164 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4165}
4166
4167/*
4168 * Sends a compound WR containing all the CPL messages needed to program the
4169 * two HW DDP buffers, namely optionally setting up the length and offset of
4170 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4171 */
4172void
4173t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4174 unsigned int len1, unsigned int offset1,
4175 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4176{
4177 unsigned int wrlen;
4178 struct mbuf *m;
4179 struct work_request_hdr *wr;
4180 struct cpl_set_tcb_field *req;
4181
4182 	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4183 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4184
4185 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
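	/*
	 * The WR always carries the final DDP-flags update; the buffer 0/1
	 * programming CPLs and the trailing RX_DATA_ACK are included only
	 * when len0, len1, or modulate call for them, and wrlen is sized
	 * to match.
	 */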
4186 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4187 (len1 ? sizeof(*req) : 0) +
4188 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4189 m = m_gethdr_nofail(wrlen);
4190 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4191 wr = mtod(m, struct work_request_hdr *);
4192 bzero(wr, wrlen);
4193
4194 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4195 m->m_pkthdr.len = m->m_len = wrlen;
4196
4197 req = (struct cpl_set_tcb_field *)(wr + 1);
4198 if (len0) { /* program buffer 0 offset and length */
4199 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4200 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4201 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4202 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4203 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4204 req++;
4205 }
4206 if (len1) { /* program buffer 1 offset and length */
4207 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4208 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4209 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4210 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4211 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4212 req++;
4213 }
4214
4215 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4216 ddp_flags);
4217
4218 if (modulate) {
4219 mk_rx_data_ack_ulp((struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4220 toep->tp_copied_seq - toep->tp_rcv_wup);
4241 mk_rx_data_ack_ulp(toeptoso(toep),
4242 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4243 toep->tp_copied_seq - toep->tp_rcv_wup);
4221 toep->tp_rcv_wup = toep->tp_copied_seq;
4222 }
4223
4224#ifdef T3_TRACE
4225 T3_TRACE5(TIDTB(sk),
4226 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4227 "modulate %d",
4228 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4229 modulate);
4230#endif
4231
4232 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4233}
4234
4235void
4236t3_init_wr_tab(unsigned int wr_len)
4237{
4238 int i;
4239
4240 if (mbuf_wrs[1]) /* already initialized */
4241 return;
4242
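	/*
	 * A T3 SGL entry packs two address/length pairs into 3 flits, so a
	 * chain of i buffers needs roughly 3i/2 flits of SGL plus 3 flits
	 * of WR header.  mbuf_wrs[i] is then the number of work requests
	 * needed to carry such a chain, each continuation WR giving up one
	 * flit to its own header.
	 */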
4243 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4244 int sgl_len = (3 * i) / 2 + (i & 1);
4245
4246 sgl_len += 3;
4247 mbuf_wrs[i] = sgl_len <= wr_len ?
4248 1 : 1 + (sgl_len - 2) / (wr_len - 1);
4249 }
4250
4251 wrlen = wr_len * 8;
4252}
4253
4254int
4255t3_init_cpl_io(void)
4256{
4257#ifdef notyet
4258 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4259 if (!tcphdr_skb) {
4260 log(LOG_ERR,
4261 "Chelsio TCP offload: can't allocate sk_buff\n");
4262 return -1;
4263 }
4264 skb_put(tcphdr_skb, sizeof(struct tcphdr));
4265 tcphdr_skb->h.raw = tcphdr_skb->data;
4266 memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4267#endif
4268
4269 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4270 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4271 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4272 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4273 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4274 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4275 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4276 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4277 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4278 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4279 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4280 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4281 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4282 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4283 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4284 return (0);
4285}
4286