cxgb_cpl_io.c (196019) cxgb_cpl_io.c (196039)
1/**************************************************************************
2
3Copyright (c) 2007-2008, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 196019 2009-08-01 19:26:27Z rwatson $");
31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 196039 2009-08-02 19:43:32Z rwatson $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/sockbuf.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/protosw.h>
49#include <sys/priv.h>
50
51#if __FreeBSD_version < 800044
52#define V_tcp_do_autosndbuf tcp_do_autosndbuf
53#define V_tcp_autosndbuf_max tcp_autosndbuf_max
54#define V_tcp_do_rfc1323 tcp_do_rfc1323
55#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
56#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
57#define V_tcpstat tcpstat
58#endif
59
60#include <net/if.h>
61#include <net/route.h>
62
63#include <netinet/in.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_systm.h>
66#include <netinet/in_var.h>
67
68
69#include <cxgb_osdep.h>
70#include <sys/mbufq.h>
71
72#include <netinet/ip.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcp_fsm.h>
75#include <netinet/tcp_offload.h>
76#include <netinet/tcp_seq.h>
77#include <netinet/tcp_syncache.h>
78#include <netinet/tcp_timer.h>
79#include <net/route.h>
80
81#include <t3cdev.h>
82#include <common/cxgb_firmware_exports.h>
83#include <common/cxgb_t3_cpl.h>
84#include <common/cxgb_tcb.h>
85#include <common/cxgb_ctl_defs.h>
86#include <cxgb_offload.h>
87#include <vm/vm.h>
88#include <vm/pmap.h>
89#include <machine/bus.h>
90#include <sys/mvec.h>
91#include <ulp/toecore/cxgb_toedev.h>
92#include <ulp/tom/cxgb_l2t.h>
93#include <ulp/tom/cxgb_defs.h>
94#include <ulp/tom/cxgb_tom.h>
95#include <ulp/tom/cxgb_t3_ddp.h>
96#include <ulp/tom/cxgb_toepcb.h>
97#include <ulp/tom/cxgb_tcp.h>
98#include <ulp/tom/cxgb_tcp_offload.h>
99
100/*
101 * For ULP connections HW may add headers, e.g., for digests, that aren't part
102 * of the messages sent by the host but that are part of the TCP payload and
103 * therefore consume TCP sequence space. Tx connection parameters that
104 * operate in TCP sequence space are affected by the HW additions and need to
105 * compensate for them to accurately track TCP sequence numbers. This array
106 * contains the compensating extra lengths for ULP packets. It is indexed by
107 * a packet's ULP submode.
108 */
109const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
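/*
 * Example (assuming the two submode bits correspond to header and data
 * digests): submode 3 selects t3_ulp_extra_len[3] == 8, i.e. two 4-byte
 * CRCs appended by the hardware that still consume TCP sequence space.
 */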
110
111#ifdef notyet
112/*
113 * This mbuf holds a fake header-only TCP segment that we use whenever we
114 * need to exploit SW TCP functionality that expects TCP headers, such as
115 * tcp_create_openreq_child(). It is a read-only buffer that may be used by multiple
116 * CPUs without locking.
117 */
118static struct mbuf *tcphdr_mbuf __read_mostly;
119#endif
120
121/*
122 * Size of WRs in bytes. Note that we assume all devices we are handling have
123 * the same WR size.
124 */
125static unsigned int wrlen __read_mostly;
126
127/*
128 * The number of WRs needed for an mbuf depends on the number of page fragments
129 * in the mbuf chain and whether it has any payload in its main body. This maps the
130 * length of the gather list represented by an mbuf chain into the # of necessary WRs.
131 */
132static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
133
134/*
135 * Max receive window supported by HW in bytes. Only a small part of it can
136 * be set through option0, the rest needs to be set through RX_DATA_ACK.
137 */
138#define MAX_RCV_WND ((1U << 27) - 1)
139
140/*
141 * Min receive window. We want it to be large enough to accommodate receive
142 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
143 */
144#define MIN_RCV_WND (24 * 1024U)
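/*
 * INP_TOS extracts the upper bits of the inpcb's TOS byte: the >> 2 shift
 * presumably drops the two low-order ECN bits, keeping the DSCP portion,
 * before masking with M_TOS.
 */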
145#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
146
147#define VALIDATE_SEQ 0
148#define VALIDATE_SOCK(so)
149#define DEBUG_WR 0
150
151#define TCP_TIMEWAIT 1
152#define TCP_CLOSE 2
153#define TCP_DROP 3
154
155static void t3_send_reset(struct toepcb *toep);
156static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
157static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
158static void handle_syncache_event(int event, void *arg);
159
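/*
 * Debug wrapper around sbappendstream_locked(): walk the existing socket
 * buffer chain and the chain being appended, asserting that every mbuf is
 * either plain or an EXT_EXTREF external buffer and that no m_next pointer
 * has been poisoned, both before and after the append.
 */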
160static inline void
161SBAPPEND(struct sockbuf *sb, struct mbuf *n)
162{
163 struct mbuf *m;
164
165 m = sb->sb_mb;
166 while (m) {
167 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
168 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
169 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
170 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
171 m->m_next, m->m_nextpkt, m->m_flags));
172 m = m->m_next;
173 }
174 m = n;
175 while (m) {
176 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
177 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
178 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
179 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
180 m->m_next, m->m_nextpkt, m->m_flags));
181 m = m->m_next;
182 }
183 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
184 sbappendstream_locked(sb, n);
185 m = sb->sb_mb;
186
187 while (m) {
188 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
189 m->m_next, m->m_nextpkt, m->m_flags));
190 m = m->m_next;
191 }
192}
193
194static inline int
195is_t3a(const struct toedev *dev)
196{
197 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
198}
199
200static void
201dump_toepcb(struct toepcb *toep)
202{
203 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
204 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
205 toep->tp_mtu_idx, toep->tp_tid);
206
207 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
208 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
209 toep->tp_mss_clamp, toep->tp_flags);
210}
211
212#ifndef RTALLOC2_DEFINED
213static struct rtentry *
214rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
215{
216 struct rtentry *rt = NULL;
217
218 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
219 RT_UNLOCK(rt);
220
221 return (rt);
222}
223#endif
224
225/*
226 * Determine whether to send a CPL message now or defer it. A message is
227 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
228 * For connections in other states the message is sent immediately.
229 * If through_l2t is set the message is subject to ARP processing, otherwise
230 * it is sent directly.
231 */
232static inline void
233send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
234{
235 struct tcpcb *tp = toep->tp_tp;
236
237 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
238 inp_wlock(tp->t_inpcb);
239 mbufq_tail(&toep->out_of_order_queue, m); // defer
240 inp_wunlock(tp->t_inpcb);
241 } else if (through_l2t)
242 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
243 else
244 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
245}
246
247static inline unsigned int
248mkprio(unsigned int cntrl, const struct toepcb *toep)
249{
250 return (cntrl);
251}
252
253/*
254 * Populate a TID_RELEASE WR. The mbuf must already be properly sized.
255 */
256static inline void
257mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
258{
259 struct cpl_tid_release *req;
260
261 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
262 m->m_pkthdr.len = m->m_len = sizeof(*req);
263 req = mtod(m, struct cpl_tid_release *);
264 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
265 req->wr.wr_lo = 0;
266 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
267}
268
269static inline void
270make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
271{
272 struct tcpcb *tp = so_sototcpcb(so);
273 struct toepcb *toep = tp->t_toe;
274 struct tx_data_wr *req;
275 struct sockbuf *snd;
276
277 inp_lock_assert(tp->t_inpcb);
278 snd = so_sockbuf_snd(so);
279
280 req = mtod(m, struct tx_data_wr *);
281 m->m_len = sizeof(*req);
282 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
283 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
284 /* len includes the length of any HW ULP additions */
285 req->len = htonl(len);
286 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
287 /* V_TX_ULP_SUBMODE sets both the mode and submode */
288 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
289 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
290 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
291 (tail ? 0 : 1))));
292 req->sndseq = htonl(tp->snd_nxt);
293 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
294 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
295 V_TX_CPU_IDX(toep->tp_qset));
296
297		/* Send buffer size is reported to the card in 32KB units
298		 * (hence the >> 15 shift below). */
299 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
300 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
301 else {
302 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
303 }
304
305 toep->tp_flags |= TP_DATASENT;
306 }
307}
308
309#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
310
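/*
 * Push as much pending send-buffer data as the available WR credits allow.
 * Each loop iteration builds one TX_DATA work request: a payload of at most
 * IMM_LEN bytes is copied inline into the WR, while larger mbuf runs are
 * described by a gather list of up to TX_MAX_SEGS - 1 segments.  Returns
 * the number of payload bytes handed to the hardware.
 */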
311int
312t3_push_frames(struct socket *so, int req_completion)
313{
314 struct tcpcb *tp = so_sototcpcb(so);
315 struct toepcb *toep = tp->t_toe;
316
317 struct mbuf *tail, *m0, *last;
318 struct t3cdev *cdev;
319 struct tom_data *d;
320 int state, bytes, count, total_bytes;
321 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
322 struct sockbuf *snd;
323
324 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
325 DPRINTF("tcp state=%d\n", tp->t_state);
326 return (0);
327 }
328
329 state = so_state_get(so);
330
331 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
332 DPRINTF("disconnecting\n");
333
334 return (0);
335 }
336
337 inp_lock_assert(tp->t_inpcb);
338
339 snd = so_sockbuf_snd(so);
340 sockbuf_lock(snd);
341
342 d = TOM_DATA(toep->tp_toedev);
343 cdev = d->cdev;
344
345 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
346
347 total_bytes = 0;
348 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
349 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
350
351 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
352 KASSERT(tail, ("sbdrop error"));
353 last = tail = tail->m_next;
354 }
355
356 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
357 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
358 sockbuf_unlock(snd);
359
360 return (0);
361 }
362
363 toep->tp_m_last = NULL;
364 while (toep->tp_wr_avail && (tail != NULL)) {
365 count = bytes = 0;
366 segp = segs;
367 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
368 sockbuf_unlock(snd);
369 return (0);
370 }
371 /*
372		 * If the data in tail fits in-line (at most IMM_LEN bytes),
373		 * then build an immediate-data WR.
374 */
375 if (tail->m_len <= IMM_LEN) {
376 count = 1;
377 bytes = tail->m_len;
378 last = tail;
379 tail = tail->m_next;
380 m_set_sgl(m0, NULL);
381 m_set_sgllen(m0, 0);
382 make_tx_data_wr(so, m0, bytes, tail);
383 m_append(m0, bytes, mtod(last, caddr_t));
384 KASSERT(!m0->m_next, ("bad append"));
385 } else {
386 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
387 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
388 bytes += tail->m_len;
389 last = tail;
390 count++;
391 /*
392 * technically an abuse to be using this for a VA
393 * but less gross than defining my own structure
394 * or calling pmap_kextract from here :-|
395 */
396 segp->ds_addr = (bus_addr_t)tail->m_data;
397 segp->ds_len = tail->m_len;
398 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
399 count, mbuf_wrs[count], tail->m_data, tail->m_len);
400 segp++;
401 tail = tail->m_next;
402 }
403 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
404 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
405
406 m_set_sgl(m0, segs);
407 m_set_sgllen(m0, count);
408 make_tx_data_wr(so, m0, bytes, tail);
409 }
410 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
411
412 if (tail) {
413 snd->sb_sndptr = tail;
414 toep->tp_m_last = NULL;
415 } else
416 toep->tp_m_last = snd->sb_sndptr = last;
417
418
419 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
420
421 snd->sb_sndptroff += bytes;
422 total_bytes += bytes;
423 toep->tp_write_seq += bytes;
424 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
425 " tail=%p sndptr=%p sndptroff=%d",
426 toep->tp_wr_avail, count, mbuf_wrs[count],
427 tail, snd->sb_sndptr, snd->sb_sndptroff);
428 if (tail)
429 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
430 " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
431 total_bytes, toep->tp_m_last, tail->m_data,
432 tp->snd_una);
433 else
434 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
435 " tp_m_last=%p snd_una=0x%08x",
436 total_bytes, toep->tp_m_last, tp->snd_una);
437
438
439#ifdef KTR
440{
441 int i;
442
443 i = 0;
444 while (i < count && m_get_sgllen(m0)) {
445 if ((count - i) >= 3) {
446 CTR6(KTR_TOM,
447 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
448 " len=%d pa=0x%zx len=%d",
449 segs[i].ds_addr, segs[i].ds_len,
450 segs[i + 1].ds_addr, segs[i + 1].ds_len,
451 segs[i + 2].ds_addr, segs[i + 2].ds_len);
452 i += 3;
453 } else if ((count - i) == 2) {
454 CTR4(KTR_TOM,
455 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
456 " len=%d",
457 segs[i].ds_addr, segs[i].ds_len,
458 segs[i + 1].ds_addr, segs[i + 1].ds_len);
459 i += 2;
460 } else {
461 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
462 segs[i].ds_addr, segs[i].ds_len);
463 i++;
464 }
465
466 }
467}
468#endif
469 /*
470 * remember credits used
471 */
472 m0->m_pkthdr.csum_data = mbuf_wrs[count];
473 m0->m_pkthdr.len = bytes;
474 toep->tp_wr_avail -= mbuf_wrs[count];
475 toep->tp_wr_unacked += mbuf_wrs[count];
476
477 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
478 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
479 struct work_request_hdr *wr = cplhdr(m0);
480
481 wr->wr_hi |= htonl(F_WR_COMPL);
482 toep->tp_wr_unacked = 0;
483 }
484 KASSERT((m0->m_pkthdr.csum_data > 0) &&
485 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
486 m0->m_pkthdr.csum_data));
487 m0->m_type = MT_DONTFREE;
488 enqueue_wr(toep, m0);
489 DPRINTF("sending offload tx with %d bytes in %d segments\n",
490 bytes, count);
491 l2t_send(cdev, m0, toep->tp_l2t);
492 }
493 sockbuf_unlock(snd);
494 return (total_bytes);
495}
496
497/*
498 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
499 * under any circumstances. We take the easy way out and always queue the
500 * message to the write_queue. We can optimize the case where the queue is
501 * already empty, though the optimization is probably not worth it.
502 */
503static void
504close_conn(struct socket *so)
505{
506 struct mbuf *m;
507 struct cpl_close_con_req *req;
508 struct tom_data *d;
509 struct inpcb *inp = so_sotoinpcb(so);
510 struct tcpcb *tp;
511 struct toepcb *toep;
512 unsigned int tid;
513
514
515 inp_wlock(inp);
516 tp = so_sototcpcb(so);
517 toep = tp->t_toe;
518
519 if (tp->t_state != TCPS_SYN_SENT)
520 t3_push_frames(so, 1);
521
522 if (toep->tp_flags & TP_FIN_SENT) {
523 inp_wunlock(inp);
524 return;
525 }
526
527 tid = toep->tp_tid;
528
529 d = TOM_DATA(toep->tp_toedev);
530
531 m = m_gethdr_nofail(sizeof(*req));
532 m_set_priority(m, CPL_PRIORITY_DATA);
533 m_set_sgl(m, NULL);
534 m_set_sgllen(m, 0);
535
536 toep->tp_flags |= TP_FIN_SENT;
537 req = mtod(m, struct cpl_close_con_req *);
538
539 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
540 req->wr.wr_lo = htonl(V_WR_TID(tid));
541 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
542 req->rsvd = 0;
543 inp_wunlock(inp);
544 /*
545 * XXX - need to defer shutdown while there is still data in the queue
546 *
547 */
548 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
549 cxgb_ofld_send(d->cdev, m);
550
551}
552
553/*
554 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no-RST variant
555 * and send it along.
556 */
557static void
558abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
559{
560 struct cpl_abort_req *req = cplhdr(m);
561
562 req->cmd = CPL_ABORT_NO_RST;
563 cxgb_ofld_send(cdev, m);
564}
565
566/*
567 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
568 * permitted to return without sending the message in case we cannot allocate
569 * an mbuf. Returns the number of credits sent.
570 */
571uint32_t
572t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
573{
574 struct mbuf *m;
575 struct cpl_rx_data_ack *req;
576 struct toepcb *toep = tp->t_toe;
577 struct toedev *tdev = toep->tp_toedev;
578
579 m = m_gethdr_nofail(sizeof(*req));
580
581 DPRINTF("returning %u credits to HW\n", credits);
582
583 req = mtod(m, struct cpl_rx_data_ack *);
584 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
585 req->wr.wr_lo = 0;
586 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
587 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
588 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
589 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
590 return (credits);
591}
592
593/*
594 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
595 * This is only used in DDP mode, so we take the opportunity to also set the
596 * DACK mode and flush any Rx credits.
597 */
598void
599t3_send_rx_modulate(struct toepcb *toep)
600{
601 struct mbuf *m;
602 struct cpl_rx_data_ack *req;
603
604 m = m_gethdr_nofail(sizeof(*req));
605
606 req = mtod(m, struct cpl_rx_data_ack *);
607 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
608 req->wr.wr_lo = 0;
609 m->m_pkthdr.len = m->m_len = sizeof(*req);
610
611 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
612 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
613 V_RX_DACK_MODE(1) |
614 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
615 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
616 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
617 toep->tp_rcv_wup = toep->tp_copied_seq;
618}
619
620/*
621 * Handle receipt of an urgent pointer.
622 */
623static void
624handle_urg_ptr(struct socket *so, uint32_t urg_seq)
625{
626#ifdef URGENT_DATA_SUPPORTED
627 struct tcpcb *tp = so_sototcpcb(so);
628
629 urg_seq--; /* initially points past the urgent data, per BSD */
630
631 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
632 return; /* duplicate pointer */
633 sk_send_sigurg(sk);
634 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
635 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
636 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
637
638 tp->copied_seq++;
639 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
640 tom_eat_skb(sk, skb, 0);
641 }
642 tp->urg_data = TCP_URG_NOTYET;
643 tp->urg_seq = urg_seq;
644#endif
645}
646
647/*
648 * Returns true if a socket cannot accept new Rx data.
649 */
650static inline int
651so_no_receive(const struct socket *so)
652{
653 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
654}
655
656/*
657 * Process an urgent data notification.
658 */
659static void
660rx_urg_notify(struct toepcb *toep, struct mbuf *m)
661{
662 struct cpl_rx_urg_notify *hdr = cplhdr(m);
663 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
664
665 VALIDATE_SOCK(so);
666
667 if (!so_no_receive(so))
668 handle_urg_ptr(so, ntohl(hdr->seq));
669
670 m_freem(m);
671}
672
673/*
674 * Handler for RX_URG_NOTIFY CPL messages.
675 */
676static int
677do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
678{
679 struct toepcb *toep = (struct toepcb *)ctx;
680
681 rx_urg_notify(toep, m);
682 return (0);
683}
684
685static __inline int
686is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
687{
688 return (toep->tp_ulp_mode ||
689 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
690 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
691}
692
693/*
694 * Set of states for which we should return RX credits.
695 */
696#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
697
698/*
699 * Called after some received data has been read. It returns RX credits
700 * to the HW for the amount of data processed.
701 */
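/*
 * Sketch of the credit math below: credits = tp_copied_seq - tp_rcv_wup is
 * the amount of data the application has consumed but that the card has not
 * yet been told about.  Credits are returned once they reach the
 * rx_credit_thres tunable, or unconditionally when the remaining window
 * (rcv_wnd - credits) would otherwise drop below 16KB.
 */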
702void
703t3_cleanup_rbuf(struct tcpcb *tp, int copied)
704{
705 struct toepcb *toep = tp->t_toe;
706 struct socket *so;
707 struct toedev *dev;
708 int dack_mode, must_send, read;
709 u32 thres, credits, dack = 0;
710 struct sockbuf *rcv;
711
712 so = inp_inpcbtosocket(tp->t_inpcb);
713 rcv = so_sockbuf_rcv(so);
714
715 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
716 (tp->t_state == TCPS_FIN_WAIT_2))) {
717 if (copied) {
718 sockbuf_lock(rcv);
719 toep->tp_copied_seq += copied;
720 sockbuf_unlock(rcv);
721 }
722
723 return;
724 }
725
726 inp_lock_assert(tp->t_inpcb);
727
728 sockbuf_lock(rcv);
729 if (copied)
730 toep->tp_copied_seq += copied;
731 else {
732 read = toep->tp_enqueued_bytes - rcv->sb_cc;
733 toep->tp_copied_seq += read;
734 }
735 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
736 toep->tp_enqueued_bytes = rcv->sb_cc;
737 sockbuf_unlock(rcv);
738
739 if (credits > rcv->sb_mbmax) {
740 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
741 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
742 credits = rcv->sb_mbmax;
743 }
744
745
746 /*
747 * XXX this won't accurately reflect credit return - we need
748 * to look at the difference between the amount that has been
749 * put in the recv sockbuf and what is there now
750 */
751
752 if (__predict_false(!credits))
753 return;
754
755 dev = toep->tp_toedev;
756 thres = TOM_TUNABLE(dev, rx_credit_thres);
757
758 if (__predict_false(thres == 0))
759 return;
760
761 if (is_delack_mode_valid(dev, toep)) {
762 dack_mode = TOM_TUNABLE(dev, delack);
763 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
764 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
765
766 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
767 dack = F_RX_DACK_CHANGE |
768 V_RX_DACK_MODE(dack_mode);
769 }
770 } else
771 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
772
773 /*
774 * For coalescing to work effectively ensure the receive window has
775 * at least 16KB left.
776 */
777 must_send = credits + 16384 >= tp->rcv_wnd;
778
779 if (must_send || credits >= thres)
780 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
781}
782
783static int
784cxgb_toe_disconnect(struct tcpcb *tp)
785{
786 struct socket *so;
787
788 DPRINTF("cxgb_toe_disconnect\n");
789
790 so = inp_inpcbtosocket(tp->t_inpcb);
791 close_conn(so);
792 return (0);
793}
794
795static int
796cxgb_toe_reset(struct tcpcb *tp)
797{
798 struct toepcb *toep = tp->t_toe;
799
800 t3_send_reset(toep);
801
802 /*
803 * unhook from socket
804 */
805 tp->t_flags &= ~TF_TOE;
806 toep->tp_tp = NULL;
807 tp->t_toe = NULL;
808 return (0);
809}
810
811static int
812cxgb_toe_send(struct tcpcb *tp)
813{
814 struct socket *so;
815
816 DPRINTF("cxgb_toe_send\n");
817 dump_toepcb(tp->t_toe);
818
819 so = inp_inpcbtosocket(tp->t_inpcb);
820 t3_push_frames(so, 1);
821 return (0);
822}
823
824static int
825cxgb_toe_rcvd(struct tcpcb *tp)
826{
827
828 inp_lock_assert(tp->t_inpcb);
829
830 t3_cleanup_rbuf(tp, 0);
831
832 return (0);
833}
834
835static void
836cxgb_toe_detach(struct tcpcb *tp)
837{
838 struct toepcb *toep;
839
840 /*
841 * XXX how do we handle teardown in the SYN_SENT state?
842 *
843 */
844 inp_lock_assert(tp->t_inpcb);
845 toep = tp->t_toe;
846 toep->tp_tp = NULL;
847
848 /*
849 * unhook from socket
850 */
851 tp->t_flags &= ~TF_TOE;
852 tp->t_toe = NULL;
853}
854
855
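/*
 * TOE user-request switch installed on offloaded connections.  Once
 * install_offload_ops() sets TF_TOE and points tp->t_tu here, the stack's
 * offload hooks dispatch disconnect/send/rcvd/detach requests through this
 * table instead of the regular output path.
 */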
856static struct toe_usrreqs cxgb_toe_usrreqs = {
857 .tu_disconnect = cxgb_toe_disconnect,
858 .tu_reset = cxgb_toe_reset,
859 .tu_send = cxgb_toe_send,
860 .tu_rcvd = cxgb_toe_rcvd,
861 .tu_detach = cxgb_toe_detach,
863 .tu_syncache_event = handle_syncache_event,
864};
865
866
867static void
868__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
869 uint64_t mask, uint64_t val, int no_reply)
870{
871 struct cpl_set_tcb_field *req;
872
873 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
874 toep->tp_tid, word, mask, val);
875
876 req = mtod(m, struct cpl_set_tcb_field *);
877 m->m_pkthdr.len = m->m_len = sizeof(*req);
878 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
879 req->wr.wr_lo = 0;
880 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
881 req->reply = V_NO_REPLY(no_reply);
882 req->cpu_idx = 0;
883 req->word = htons(word);
884 req->mask = htobe64(mask);
885 req->val = htobe64(val);
886
887 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
888 send_or_defer(toep, m, 0);
889}
890
891static void
892t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
893{
894	struct mbuf *m;
895	struct tcpcb *tp;
896
897	if (toep == NULL)
898		return;
899	tp = toep->tp_tp;
900 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
901		printf("not setting field\n");
902 return;
903 }
904
905 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
906
907 __set_tcb_field(toep, m, word, mask, val, 1);
908}
909
910/*
911 * Set one of the t_flags bits in the TCB.
912 */
913static void
914set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
915{
916
917 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
918}
919
920/*
921 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
922 */
923static void
924t3_set_nagle(struct toepcb *toep)
925{
926 struct tcpcb *tp = toep->tp_tp;
927
928 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
929}
930
931/*
932 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
933 */
934void
935t3_set_keepalive(struct toepcb *toep, int on_off)
936{
937
938 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
939}
940
941void
942t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
943{
944 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
945}
946
947void
948t3_set_dack_mss(struct toepcb *toep, int on_off)
949{
950
951 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
952}
953
954/*
955 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
956 */
957static void
958t3_set_tos(struct toepcb *toep)
959{
960 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
961
962 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
963 V_TCB_TOS(tos));
964}
965
966
967/*
968 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
969 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
970 * DDP is disabled (data is delivered to freelist). [Note that the peer should
971 * We work around the issue by setting a DDP buffer in a partial placed state,
972 * which guarantees that TP will schedule a timer.
973 */
974#define TP_DDP_TIMER_WORKAROUND_MASK\
975 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
976 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
977 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
978#define TP_DDP_TIMER_WORKAROUND_VAL\
979 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
980 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
981 32))
982
983static void
984t3_enable_ddp(struct toepcb *toep, int on)
985{
986 if (on) {
987
988 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
989 V_TF_DDP_OFF(0));
990 } else
991 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
992 V_TF_DDP_OFF(1) |
993 TP_DDP_TIMER_WORKAROUND_MASK,
994 V_TF_DDP_OFF(1) |
995 TP_DDP_TIMER_WORKAROUND_VAL);
996
997}
998
999void
1000t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1001{
1002 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1003 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1004 tag_color);
1005}
1006
1007void
1008t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1009 unsigned int len)
1010{
1011 if (buf_idx == 0)
1012 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1013 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1014 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1015 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1016 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1017 else
1018 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1019 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1020 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1021 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1022 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1023}
1024
1025static int
1026t3_set_cong_control(struct socket *so, const char *name)
1027{
1028#ifdef CONGESTION_CONTROL_SUPPORTED
1029 int cong_algo;
1030
1031 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1032 if (!strcmp(name, t3_cong_ops[cong_algo].name))
1033 break;
1034
1035 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1036		return (EINVAL);
1037#endif
1038 return 0;
1039}
1040
1041int
1042t3_get_tcb(struct toepcb *toep)
1043{
1044 struct cpl_get_tcb *req;
1045 struct tcpcb *tp = toep->tp_tp;
1046 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1047
1048 if (!m)
1049 return (ENOMEM);
1050
1051 inp_lock_assert(tp->t_inpcb);
1052 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1053 req = mtod(m, struct cpl_get_tcb *);
1054 m->m_pkthdr.len = m->m_len = sizeof(*req);
1055 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1056 req->wr.wr_lo = 0;
1057 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1058 req->cpuno = htons(toep->tp_qset);
1059 req->rsvd = 0;
1060 if (tp->t_state == TCPS_SYN_SENT)
1061 mbufq_tail(&toep->out_of_order_queue, m); // defer
1062 else
1063 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1064 return 0;
1065}
1066
1067static inline void
1068so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1069{
1070
1071 toepcb_hold(toep);
1072
1073 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1074}
1075
1076/**
1077 * find_best_mtu - find the entry in the MTU table closest to an MTU
1078 * @d: TOM state
1079 * @mtu: the target MTU
1080 *
1081 * Returns the index of the value in the MTU table that is closest to but
1082 * does not exceed the target MTU.
1083 */
1084static unsigned int
1085find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1086{
1087 int i = 0;
1088
1089 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1090 ++i;
1091 return (i);
1092}
1093
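/*
 * Pick an MTU table index for a connection.  The constant 40 below is the
 * size of a bare IPv4 + TCP header (20 + 20 bytes), used to convert between
 * path MTU and MSS.
 */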
1094static unsigned int
1095select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1096{
1097 unsigned int idx;
1098
1099#ifdef notyet
1100 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1101#endif
1102 if (tp) {
1103 tp->t_maxseg = pmtu - 40;
1104 if (tp->t_maxseg < td->mtus[0] - 40)
1105 tp->t_maxseg = td->mtus[0] - 40;
1106 idx = find_best_mtu(td, tp->t_maxseg + 40);
1107
1108 tp->t_maxseg = td->mtus[idx] - 40;
1109 } else
1110 idx = find_best_mtu(td, pmtu);
1111
1112 return (idx);
1113}
1114
1115static inline void
1116free_atid(struct t3cdev *cdev, unsigned int tid)
1117{
1118 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1119
1120 if (toep)
1121 toepcb_release(toep);
1122}
1123
1124/*
1125 * Release resources held by an offload connection (TID, L2T entry, etc.)
1126 */
1127static void
1128t3_release_offload_resources(struct toepcb *toep)
1129{
1130 struct tcpcb *tp = toep->tp_tp;
1131 struct toedev *tdev = toep->tp_toedev;
1132 struct t3cdev *cdev;
1133 struct socket *so;
1134 unsigned int tid = toep->tp_tid;
1135 struct sockbuf *rcv;
1136
1137 CTR0(KTR_TOM, "t3_release_offload_resources");
1138
1139 if (!tdev)
1140 return;
1141
1142 cdev = TOEP_T3C_DEV(toep);
1143 if (!cdev)
1144 return;
1145
1146 toep->tp_qset = 0;
1147 t3_release_ddp_resources(toep);
1148
1149#ifdef CTRL_SKB_CACHE
1150 kfree_skb(CTRL_SKB_CACHE(tp));
1151 CTRL_SKB_CACHE(tp) = NULL;
1152#endif
1153
1154 if (toep->tp_wr_avail != toep->tp_wr_max) {
1155 purge_wr_queue(toep);
1156 reset_wr_list(toep);
1157 }
1158
1159 if (toep->tp_l2t) {
1160 l2t_release(L2DATA(cdev), toep->tp_l2t);
1161 toep->tp_l2t = NULL;
1162 }
1163 toep->tp_tp = NULL;
1164 if (tp) {
1165 inp_lock_assert(tp->t_inpcb);
1166 so = inp_inpcbtosocket(tp->t_inpcb);
1167 rcv = so_sockbuf_rcv(so);
1168 /*
1169 * cancel any offloaded reads
1170 *
1171 */
1172 sockbuf_lock(rcv);
1173 tp->t_toe = NULL;
1174 tp->t_flags &= ~TF_TOE;
1175 if (toep->tp_ddp_state.user_ddp_pending) {
1176 t3_cancel_ubuf(toep, rcv);
1177 toep->tp_ddp_state.user_ddp_pending = 0;
1178 }
1179 so_sorwakeup_locked(so);
1180
1181 }
1182
1183 if (toep->tp_state == TCPS_SYN_SENT) {
1184 free_atid(cdev, tid);
1185#ifdef notyet
1186 __skb_queue_purge(&tp->out_of_order_queue);
1187#endif
1188 } else { // we have TID
1189 cxgb_remove_tid(cdev, toep, tid);
1190 toepcb_release(toep);
1191 }
1192#if 0
1193 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1194#endif
1195}
1196
1197static void
1198install_offload_ops(struct socket *so)
1199{
1200 struct tcpcb *tp = so_sototcpcb(so);
1201
1202 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1203
1204 t3_install_socket_ops(so);
1205 tp->t_flags |= TF_TOE;
1206 tp->t_tu = &cxgb_toe_usrreqs;
1207}
1208
1209/*
1210 * Determine the receive window scaling factor given a target max
1211 * receive window.
1212 */
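/*
 * Worked example: a 256KB receive buffer needs three halvings to fit in a
 * 16-bit window field (256K -> 128K -> 64K -> 32K), so it yields wscale 3.
 */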
1213static __inline int
1214select_rcv_wscale(int space, struct vnet *vnet)
1215{
1216 int wscale = 0;
1217
1218 if (space > MAX_RCV_WND)
1219 space = MAX_RCV_WND;
1220
1221 if (V_tcp_do_rfc1323)
1222 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1223
1224 return (wscale);
1225}
1226
1227/*
1228 * Determine the receive window size for a socket.
1229 */
1230static unsigned long
1231select_rcv_wnd(struct toedev *dev, struct socket *so)
1232{
1233 struct tom_data *d = TOM_DATA(dev);
1234 unsigned int wnd;
1235 unsigned int max_rcv_wnd;
1236 struct sockbuf *rcv;
1237
1238 rcv = so_sockbuf_rcv(so);
1239
1240 if (V_tcp_do_autorcvbuf)
1241 wnd = V_tcp_autorcvbuf_max;
1242 else
1243 wnd = rcv->sb_hiwat;
1244
1245
1246
1247 /* XXX
1248 * For receive coalescing to work effectively we need a receive window
1249	 * that can accommodate a coalesced segment.
1250 */
1251 if (wnd < MIN_RCV_WND)
1252 wnd = MIN_RCV_WND;
1253
1254 /* PR 5138 */
1255 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1256 (uint32_t)d->rx_page_size * 23 :
1257 MAX_RCV_WND);
1258
1259 return min(wnd, max_rcv_wnd);
1260}
1261
1262/*
1263 * Assign offload parameters to some socket fields. This code is used by
1264 * both active and passive opens.
1265 */
1266static inline void
1267init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1268 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1269{
1270 struct tcpcb *tp = so_sototcpcb(so);
1271 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1272 struct sockbuf *snd, *rcv;
1273
1274#ifdef notyet
1275 SOCK_LOCK_ASSERT(so);
1276#endif
1277
1278 snd = so_sockbuf_snd(so);
1279 rcv = so_sockbuf_rcv(so);
1280
1281 log(LOG_INFO, "initializing offload socket\n");
1282 /*
1283 * We either need to fix push frames to work with sbcompress
1284 * or we need to add this
1285 */
1286 snd->sb_flags |= SB_NOCOALESCE;
1287 rcv->sb_flags |= SB_NOCOALESCE;
1288
1289 tp->t_toe = toep;
1290 toep->tp_tp = tp;
1291 toep->tp_toedev = dev;
1292
1293 toep->tp_tid = tid;
1294 toep->tp_l2t = e;
1295 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1296 toep->tp_wr_unacked = 0;
1297 toep->tp_delack_mode = 0;
1298
1299 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1300 /*
1301 * XXX broken
1302 *
1303 */
1304 tp->rcv_wnd = select_rcv_wnd(dev, so);
1305
1306 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1307 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1308 toep->tp_qset_idx = 0;
1309
1310 reset_wr_list(toep);
1311 DPRINTF("initialization done\n");
1312}
1313
1314/*
1315 * The next two functions calculate the option 0 value for a socket.
1316 */
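/*
 * opt0h carries the Nagle, keepalive, window-scale and MSS-index settings;
 * opt0l carries the TOS, ULP mode and the initial receive buffer size, which
 * is reported to the card in 1KB units (hence the >> 10 below) and clamped
 * to M_RCV_BUFSIZ.
 */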
1317static inline unsigned int
1318calc_opt0h(struct socket *so, int mtu_idx)
1319{
1320 struct tcpcb *tp = so_sototcpcb(so);
1321 int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);
1322
1323 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1324 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1325 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1326}
1327
1328static inline unsigned int
1329calc_opt0l(struct socket *so, int ulp_mode)
1330{
1331 struct tcpcb *tp = so_sototcpcb(so);
1332 unsigned int val;
1333
1334 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1335 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1336
1337 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1338 return (val);
1339}
1340
1341static inline unsigned int
1342calc_opt2(const struct socket *so, struct toedev *dev)
1343{
1344 int flv_valid;
1345
1346 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1347
1348 return (V_FLAVORS_VALID(flv_valid) |
1349 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1350}
1351
1352#if DEBUG_WR > 1
1353static int
1354count_pending_wrs(const struct toepcb *toep)
1355{
1356 const struct mbuf *m;
1357 int n = 0;
1358
1359 wr_queue_walk(toep, m)
1360 n += m->m_pkthdr.csum_data;
1361 return (n);
1362}
1363#endif
1364
1365#if 0
1366(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1367#endif
1368
1369static void
1370mk_act_open_req(struct socket *so, struct mbuf *m,
1371 unsigned int atid, const struct l2t_entry *e)
1372{
1373 struct cpl_act_open_req *req;
1374 struct inpcb *inp = so_sotoinpcb(so);
1375 struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1376 struct toepcb *toep = tp->t_toe;
1377 struct toedev *tdev = toep->tp_toedev;
1378
1379 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1380
1381 req = mtod(m, struct cpl_act_open_req *);
1382 m->m_pkthdr.len = m->m_len = sizeof(*req);
1383
1384 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1385 req->wr.wr_lo = 0;
1386 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1387 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1388#if 0
1389 req->local_port = inp->inp_lport;
1390 req->peer_port = inp->inp_fport;
1391 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1392 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1393#endif
1394 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1395 V_TX_CHANNEL(e->smt_idx));
1396 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1397 req->params = 0;
1398 req->opt2 = htonl(calc_opt2(so, tdev));
1399}
1400
1401
1402/*
1403 * Convert an ACT_OPEN_RPL status to an errno.
1404 */
1405static int
1406act_open_rpl_status_to_errno(int status)
1407{
1408 switch (status) {
1409 case CPL_ERR_CONN_RESET:
1410 return (ECONNREFUSED);
1411 case CPL_ERR_ARP_MISS:
1412 return (EHOSTUNREACH);
1413 case CPL_ERR_CONN_TIMEDOUT:
1414 return (ETIMEDOUT);
1415 case CPL_ERR_TCAM_FULL:
1416 return (ENOMEM);
1417 case CPL_ERR_CONN_EXIST:
1418 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1419 return (EADDRINUSE);
1420 default:
1421 return (EIO);
1422 }
1423}
1424
1425static void
1426fail_act_open(struct toepcb *toep, int errno)
1427{
1428 struct tcpcb *tp = toep->tp_tp;
1429
1430 t3_release_offload_resources(toep);
1431 if (tp) {
1432 inp_wunlock(tp->t_inpcb);
1433 tcp_offload_drop(tp, errno);
1434 }
1435
1436#ifdef notyet
1437 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1438#endif
1439}
1440
1441/*
1442 * Handle active open failures.
1443 */
1444static void
1445active_open_failed(struct toepcb *toep, struct mbuf *m)
1446{
1447 struct cpl_act_open_rpl *rpl = cplhdr(m);
1448 struct inpcb *inp;
1449
1450 if (toep->tp_tp == NULL)
1451 goto done;
1452
1453 inp = toep->tp_tp->t_inpcb;
1454
1455/*
1456 * Don't handle connection retry for now
1457 */
1458#ifdef notyet
1459 struct inet_connection_sock *icsk = inet_csk(sk);
1460
1461 if (rpl->status == CPL_ERR_CONN_EXIST &&
1462 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1463 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1464 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1465 jiffies + HZ / 2);
1466 } else
1467#endif
1468 {
1469 inp_wlock(inp);
1470 /*
1471 * drops the inpcb lock
1472 */
1473 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1474 }
1475
1476 done:
1477 m_free(m);
1478}
1479
1480/*
1481 * Return whether a failed active open has allocated a TID
1482 */
1483static inline int
1484act_open_has_tid(int status)
1485{
1486 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1487 status != CPL_ERR_ARP_MISS;
1488}
1489
1490/*
1491 * Process an ACT_OPEN_RPL CPL message.
1492 */
1493static int
1494do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1495{
1496 struct toepcb *toep = (struct toepcb *)ctx;
1497 struct cpl_act_open_rpl *rpl = cplhdr(m);
1498
1499 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1500 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1501
1502 active_open_failed(toep, m);
1503 return (0);
1504}
1505
1506/*
1507 * Handle an ARP failure for an active open. XXX purge ofo queue
1508 *
1509 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1510 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1511 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1512 * free the atid. Hmm.
1513 */
1514#ifdef notyet
1515static void
1516act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1517{
1518 struct toepcb *toep = m_get_toep(m);
1519 struct tcpcb *tp = toep->tp_tp;
1520 struct inpcb *inp = tp->t_inpcb;
1521 struct socket *so;
1522
1523 inp_wlock(inp);
1524 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1525 /*
1526 * drops the inpcb lock
1527 */
1528		fail_act_open(toep, EHOSTUNREACH);
1529 printf("freeing %p\n", m);
1530
1531 m_free(m);
1532 } else
1533 inp_wunlock(inp);
1534}
1535#endif
1536/*
1537 * Send an active open request.
1538 */
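/*
 * Roughly: allocate a toepcb and an ATID, resolve an L2T entry for the
 * route, switch the socket over to the offload ops, and hand an
 * ACT_OPEN_REQ to the L2T layer, which transmits it once the L2 entry is
 * resolved.
 */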
1539int
1540t3_connect(struct toedev *tdev, struct socket *so,
1541 struct rtentry *rt, struct sockaddr *nam)
1542{
1543 struct mbuf *m;
1544 struct l2t_entry *e;
1545 struct tom_data *d = TOM_DATA(tdev);
1546 struct inpcb *inp = so_sotoinpcb(so);
1547 struct tcpcb *tp = intotcpcb(inp);
1548 struct toepcb *toep; /* allocated by init_offload_socket */
1549
1550 int atid;
1551
1552 toep = toepcb_alloc();
1553 if (toep == NULL)
1554 goto out_err;
1555
1556 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1557 goto out_err;
1558
1559 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1560 if (!e)
1561 goto free_tid;
1562
1563 inp_lock_assert(inp);
1564	m = m_gethdr(M_WAITOK, MT_DATA);
1565
1566#if 0
1567 m->m_toe.mt_toepcb = tp->t_toe;
1568 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1569#endif
1570 so_lock(so);
1571
1572 init_offload_socket(so, tdev, atid, e, rt, toep);
1573
1574 install_offload_ops(so);
1575
1576 mk_act_open_req(so, m, atid, e);
1577 so_unlock(so);
1578
1579 soisconnecting(so);
1580 toep = tp->t_toe;
1581 m_set_toep(m, tp->t_toe);
1582
1583 toep->tp_state = TCPS_SYN_SENT;
1584 l2t_send(d->cdev, (struct mbuf *)m, e);
1585
1586 if (toep->tp_ulp_mode)
1587 t3_enable_ddp(toep, 0);
1588 return (0);
1589
1590free_tid:
1591 printf("failing connect - free atid\n");
1592
1593 free_atid(d->cdev, atid);
1594out_err:
1595 printf("return ENOMEM\n");
1596 return (ENOMEM);
1597}
1598
1599/*
1600 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1601 * not send multiple ABORT_REQs for the same connection and also that we do
1602 * not try to send a message after the connection has closed. Returns 1 if
1603 * an ABORT_REQ wasn't generated after all, 0 otherwise.
1604 */
1605static void
1606t3_send_reset(struct toepcb *toep)
1607{
1608
1609 struct cpl_abort_req *req;
1610 unsigned int tid = toep->tp_tid;
1611 int mode = CPL_ABORT_SEND_RST;
1612 struct tcpcb *tp = toep->tp_tp;
1613 struct toedev *tdev = toep->tp_toedev;
1614 struct socket *so = NULL;
1615 struct mbuf *m;
1616 struct sockbuf *snd;
1617
1618 if (tp) {
1619 inp_lock_assert(tp->t_inpcb);
1620 so = inp_inpcbtosocket(tp->t_inpcb);
1621 }
1622
1623 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1624 tdev == NULL))
1625 return;
1626 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1627
1628	snd = (so != NULL) ? so_sockbuf_snd(so) : NULL;
1629	/* Purge the send queue so we don't send anything after an abort. */
1630	if (so)
1631		sbflush(snd);
1632 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1633 mode |= CPL_ABORT_POST_CLOSE_REQ;
1634
1635 m = m_gethdr_nofail(sizeof(*req));
1636 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1637 set_arp_failure_handler(m, abort_arp_failure);
1638
1639 req = mtod(m, struct cpl_abort_req *);
1640 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1641 req->wr.wr_lo = htonl(V_WR_TID(tid));
1642 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1643 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1644 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1645 req->cmd = mode;
1646 if (tp && (tp->t_state == TCPS_SYN_SENT))
1647 mbufq_tail(&toep->out_of_order_queue, m); // defer
1648 else
1649 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1650}
1651
1652static int
1653t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1654{
1655 struct inpcb *inp;
1656 int error, optval;
1657
1658 if (sopt->sopt_name == IP_OPTIONS)
1659 return (ENOPROTOOPT);
1660
1661 if (sopt->sopt_name != IP_TOS)
1662 return (EOPNOTSUPP);
1663
1664 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1665
1666 if (error)
1667 return (error);
1668
1669 if (optval > IPTOS_PREC_CRITIC_ECP)
1670 return (EINVAL);
1671
1672 inp = so_sotoinpcb(so);
1673 inp_wlock(inp);
1674 inp_ip_tos_set(inp, optval);
1675#if 0
1676 inp->inp_ip_tos = optval;
1677#endif
1678 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1679 inp_wunlock(inp);
1680
1681 return (0);
1682}
1683
1684static int
1685t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1686{
1687 int err = 0;
1688 size_t copied;
1689
1690 if (sopt->sopt_name != TCP_CONGESTION &&
1691 sopt->sopt_name != TCP_NODELAY)
1692 return (EOPNOTSUPP);
1693
1694 if (sopt->sopt_name == TCP_CONGESTION) {
1695 char name[TCP_CA_NAME_MAX];
1696 int optlen = sopt->sopt_valsize;
1697 struct tcpcb *tp;
1698
1699 if (sopt->sopt_dir == SOPT_GET) {
1700 KASSERT(0, ("unimplemented"));
1701 return (EOPNOTSUPP);
1702 }
1703
1704 if (optlen < 1)
1705 return (EINVAL);
1706
1707 err = copyinstr(sopt->sopt_val, name,
1708 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1709 if (err)
1710 return (err);
1711 if (copied < 1)
1712 return (EINVAL);
1713
1714 tp = so_sototcpcb(so);
1715 /*
1716 * XXX I need to revisit this
1717 */
1718 if ((err = t3_set_cong_control(so, name)) == 0) {
1719#ifdef CONGESTION_CONTROL_SUPPORTED
1720 tp->t_cong_control = strdup(name, M_CXGB);
1721#endif
1722 } else
1723 return (err);
1724 } else {
1725 int optval, oldval;
1726 struct inpcb *inp;
1727 struct tcpcb *tp;
1728
1729 if (sopt->sopt_dir == SOPT_GET)
1730 return (EOPNOTSUPP);
1731
1732 err = sooptcopyin(sopt, &optval, sizeof optval,
1733 sizeof optval);
1734
1735 if (err)
1736 return (err);
1737
1738 inp = so_sotoinpcb(so);
1739 inp_wlock(inp);
1740 tp = inp_inpcbtotcpcb(inp);
1741
1742 oldval = tp->t_flags;
1743 if (optval)
1744 tp->t_flags |= TF_NODELAY;
1745 else
1746 tp->t_flags &= ~TF_NODELAY;
1747 inp_wunlock(inp);
1748
1749
1750 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1751 t3_set_nagle(tp->t_toe);
1752
1753 }
1754
1755 return (0);
1756}
1757
1758int
1759t3_ctloutput(struct socket *so, struct sockopt *sopt)
1760{
1761 int err;
1762
1763 if (sopt->sopt_level != IPPROTO_TCP)
1764 err = t3_ip_ctloutput(so, sopt);
1765 else
1766 err = t3_tcp_ctloutput(so, sopt);
1767
1768 if (err != EOPNOTSUPP)
1769 return (err);
1770
1771 return (tcp_ctloutput(so, sopt));
1772}
1773
1774/*
1775 * Returns true if we need to explicitly request RST when we receive new data
1776 * on an RX-closed connection.
1777 */
1778static inline int
1779need_rst_on_excess_rx(const struct toepcb *toep)
1780{
1781 return (1);
1782}
1783
1784/*
1785 * Handles Rx data that arrives in a state where the socket isn't accepting
1786 * new data.
1787 */
1788static void
1789handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1790{
1791
1792 if (need_rst_on_excess_rx(toep) &&
1793 !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1794 t3_send_reset(toep);
1795 m_freem(m);
1796}
1797
1798/*
1799 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1800 * by getting the DDP offset from the TCB.
1801 */
1802static void
1803tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1804{
1805 struct ddp_state *q = &toep->tp_ddp_state;
1806 struct ddp_buf_state *bsp;
1807 struct cpl_get_tcb_rpl *hdr;
1808 unsigned int ddp_offset;
1809 struct socket *so;
1810 struct tcpcb *tp;
1811 struct sockbuf *rcv;
1812 int state;
1813
1814 uint64_t t;
1815 __be64 *tcb;
1816
1817 tp = toep->tp_tp;
1818 so = inp_inpcbtosocket(tp->t_inpcb);
1819
1820 inp_lock_assert(tp->t_inpcb);
1821 rcv = so_sockbuf_rcv(so);
1822 sockbuf_lock(rcv);
1823
1824	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
1825 * We really need a cookie in order to dispatch the RPLs.
1826 */
1827 q->get_tcb_count--;
1828
1829	/* It is possible that a previous CPL already invalidated UBUF DDP
1830	 * and moved the cur_buf idx, and hence no further processing of this
1831	 * mbuf is required. However, the app might be sleeping on
1832 * !q->get_tcb_count and we need to wake it up.
1833 */
1834 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1835 int state = so_state_get(so);
1836
1837 m_freem(m);
1838 if (__predict_true((state & SS_NOFDREF) == 0))
1839 so_sorwakeup_locked(so);
1840 else
1841 sockbuf_unlock(rcv);
1842
1843 return;
1844 }
1845
1846 bsp = &q->buf_state[q->cur_buf];
1847 hdr = cplhdr(m);
1848 tcb = (__be64 *)(hdr + 1);
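	/*
	 * The reply appears to carry a raw TCB snapshot: 32 32-bit words in
	 * reverse order, read here as big-endian 64-bit words.  That is why a
	 * 32-bit word index W maps to tcb[(31 - W) / 2], with an extra 32-bit
	 * shift when the field lives in the upper half of the pair.
	 */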
1849 if (q->cur_buf == 0) {
1850 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1851 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1852 } else {
1853 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1854 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1855 }
1856 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1857 m->m_cur_offset = bsp->cur_offset;
1858 bsp->cur_offset = ddp_offset;
1859 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1860
1861 CTR5(KTR_TOM,
1862 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1863 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1864 KASSERT(ddp_offset >= m->m_cur_offset,
1865 ("ddp_offset=%u less than cur_offset=%u",
1866 ddp_offset, m->m_cur_offset));
1867
1868#if 0
1869{
1870 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1871
1872 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1873 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1874
1875 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1876 rcv_nxt = t >> S_TCB_RCV_NXT;
1877 rcv_nxt &= M_TCB_RCV_NXT;
1878
1879 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1880 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1881 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1882
1883 T3_TRACE2(TIDTB(sk),
1884 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1885 ddp_flags, rcv_nxt - rx_hdr_offset);
1886 T3_TRACE4(TB(q),
1887 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1888 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1889 T3_TRACE3(TB(q),
1890 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1891 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1892 T3_TRACE2(TB(q),
1893 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1894 q->buf_state[0].flags, q->buf_state[1].flags);
1895
1896}
1897#endif
1898 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1899 handle_excess_rx(toep, m);
1900 return;
1901 }
1902
1903#ifdef T3_TRACE
1904 if ((int)m->m_pkthdr.len < 0) {
1905 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1906 }
1907#endif
1908 if (bsp->flags & DDP_BF_NOCOPY) {
1909#ifdef T3_TRACE
1910 T3_TRACE0(TB(q),
1911 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1912
1913 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1914 printk("!cancel_ubuf");
1915 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1916 }
1917#endif
1918 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1919 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1920 q->cur_buf ^= 1;
1921 } else if (bsp->flags & DDP_BF_NOFLIP) {
1922
1923 m->m_ddp_flags = 1; /* always a kernel buffer */
1924
1925 /* now HW buffer carries a user buffer */
1926 bsp->flags &= ~DDP_BF_NOFLIP;
1927 bsp->flags |= DDP_BF_NOCOPY;
1928
1929 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1930 * any new data in which case we're done. If in addition the
1931 * offset is 0, then there wasn't a completion for the kbuf
1932 * and we need to decrement the posted count.
1933 */
1934 if (m->m_pkthdr.len == 0) {
1935 if (ddp_offset == 0) {
1936 q->kbuf_posted--;
1937 bsp->flags |= DDP_BF_NODATA;
1938 }
1939 sockbuf_unlock(rcv);
1940 m_free(m);
1941 return;
1942 }
1943 } else {
1944 sockbuf_unlock(rcv);
1945
1946 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1947 * but it got here way late and nobody cares anymore.
1948 */
1949 m_free(m);
1950 return;
1951 }
1952
1953 m->m_ddp_gl = (unsigned char *)bsp->gl;
1954 m->m_flags |= M_DDP;
1955 m->m_seq = tp->rcv_nxt;
1956 tp->rcv_nxt += m->m_pkthdr.len;
1957 tp->t_rcvtime = ticks;
1958 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1959 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1960 if (m->m_pkthdr.len == 0) {
1961 q->user_ddp_pending = 0;
1962 m_free(m);
1963 } else
1964 SBAPPEND(rcv, m);
1965
1966 state = so_state_get(so);
1967 if (__predict_true((state & SS_NOFDREF) == 0))
1968 so_sorwakeup_locked(so);
1969 else
1970 sockbuf_unlock(rcv);
1971}
1972
1973/*
1974 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1975 * in which case they are similar to DDP completions.
1976 */
1977static int
1978do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1979{
1980 struct toepcb *toep = (struct toepcb *)ctx;
1981
1982 /* OK if socket doesn't exist */
1983 if (toep == NULL) {
1984 printf("null toep in do_get_tcb_rpl\n");
1985 return (CPL_RET_BUF_DONE);
1986 }
1987
1988 inp_wlock(toep->tp_tp->t_inpcb);
1989 tcb_rpl_as_ddp_complete(toep, m);
1990 inp_wunlock(toep->tp_tp->t_inpcb);
1991
1992 return (0);
1993}
1994
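/*
 * Handle the DDP side of a CPL_RX_DATA received while the connection is in
 * DDP mode.  Any bytes between tp->rcv_nxt and the CPL's sequence number were
 * already placed in the current DDP buffer by the HW; account for them here by
 * pointing the mbuf at the DDP gather list, advancing rcv_nxt and the buffer
 * offset, and flipping buffers when allowed.
 */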
1995static void
1996handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1997{
1998 struct tcpcb *tp = toep->tp_tp;
1999 struct socket *so;
2000 struct ddp_state *q;
2001 struct ddp_buf_state *bsp;
2002 struct cpl_rx_data *hdr = cplhdr(m);
2003 unsigned int rcv_nxt = ntohl(hdr->seq);
2004 struct sockbuf *rcv;
2005
2006 if (tp->rcv_nxt == rcv_nxt)
2007 return;
2008
2009 inp_lock_assert(tp->t_inpcb);
2010 so = inp_inpcbtosocket(tp->t_inpcb);
2011 rcv = so_sockbuf_rcv(so);
2012 sockbuf_lock(rcv);
2013
2014 q = &toep->tp_ddp_state;
2015 bsp = &q->buf_state[q->cur_buf];
2016 	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not greater than tp->rcv_nxt=0x%08x",
2017 	    rcv_nxt, tp->rcv_nxt));
2018 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2019 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2020 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2021 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2022
2023#ifdef T3_TRACE
2024 if ((int)m->m_pkthdr.len < 0) {
2025 t3_ddp_error(so, "handle_ddp_data: neg len");
2026 }
2027#endif
2028 m->m_ddp_gl = (unsigned char *)bsp->gl;
2029 m->m_flags |= M_DDP;
2030 m->m_cur_offset = bsp->cur_offset;
2031 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2032 if (bsp->flags & DDP_BF_NOCOPY)
2033 bsp->flags &= ~DDP_BF_NOCOPY;
2034
2035 m->m_seq = tp->rcv_nxt;
2036 tp->rcv_nxt = rcv_nxt;
2037 bsp->cur_offset += m->m_pkthdr.len;
2038 if (!(bsp->flags & DDP_BF_NOFLIP))
2039 q->cur_buf ^= 1;
2040 /*
2041 * For now, don't re-enable DDP after a connection fell out of DDP
2042 * mode.
2043 */
2044 q->ubuf_ddp_ready = 0;
2045 sockbuf_unlock(rcv);
2046}
2047
2048/*
2049 * Process new data received for a connection.
2050 */
2051static void
2052new_rx_data(struct toepcb *toep, struct mbuf *m)
2053{
2054 struct cpl_rx_data *hdr = cplhdr(m);
2055 struct tcpcb *tp = toep->tp_tp;
2056 struct socket *so;
2057 struct sockbuf *rcv;
2058 int state;
2059 int len = be16toh(hdr->len);
2060
2061 inp_wlock(tp->t_inpcb);
2062
2063 so = inp_inpcbtosocket(tp->t_inpcb);
2064
2065 if (__predict_false(so_no_receive(so))) {
2066 handle_excess_rx(toep, m);
2067 inp_wunlock(tp->t_inpcb);
2068 TRACE_EXIT;
2069 return;
2070 }
2071
2072 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2073 handle_ddp_data(toep, m);
2074
2075 m->m_seq = ntohl(hdr->seq);
2076 m->m_ulp_mode = 0; /* for iSCSI */
2077
2078#if VALIDATE_SEQ
2079 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2080 log(LOG_ERR,
2081 "%s: TID %u: Bad sequence number %u, expected %u\n",
2082 toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2083 tp->rcv_nxt);
2084 m_freem(m);
2085 inp_wunlock(tp->t_inpcb);
2086 return;
2087 }
2088#endif
2089 m_adj(m, sizeof(*hdr));
2090
2091#ifdef URGENT_DATA_SUPPORTED
2092 /*
2093 * We don't handle urgent data yet
2094 */
2095 if (__predict_false(hdr->urg))
2096 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2097 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2098 tp->urg_seq - tp->rcv_nxt < skb->len))
2099 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2100 tp->rcv_nxt];
2101#endif
2102 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2103 toep->tp_delack_mode = hdr->dack_mode;
2104 toep->tp_delack_seq = tp->rcv_nxt;
2105 }
2106 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2107 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2108
2109 if (len < m->m_pkthdr.len)
2110 m->m_pkthdr.len = m->m_len = len;
2111
2112 tp->rcv_nxt += m->m_pkthdr.len;
2113 tp->t_rcvtime = ticks;
2114 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2115 CTR2(KTR_TOM,
2116 "new_rx_data: seq 0x%x len %u",
2117 m->m_seq, m->m_pkthdr.len);
2118 inp_wunlock(tp->t_inpcb);
2119 rcv = so_sockbuf_rcv(so);
2120 sockbuf_lock(rcv);
2121#if 0
2122 if (sb_notify(rcv))
2123 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2124#endif
2125 SBAPPEND(rcv, m);
2126
2127#ifdef notyet
2128 /*
2129 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2130 *
2131 */
2132 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2133
2134 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2135 so, rcv->sb_cc, rcv->sb_mbmax));
2136#endif
2137
2138
2139 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2140 rcv->sb_cc, rcv->sb_mbcnt);
2141
2142 state = so_state_get(so);
2143 if (__predict_true((state & SS_NOFDREF) == 0))
2144 so_sorwakeup_locked(so);
2145 else
2146 sockbuf_unlock(rcv);
2147}
2148
2149/*
2150 * Handler for RX_DATA CPL messages.
2151 */
2152static int
2153do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2154{
2155 struct toepcb *toep = (struct toepcb *)ctx;
2156
2157 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2158
2159 new_rx_data(toep, m);
2160
2161 return (0);
2162}
2163
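/*
 * Process new data delivered via a CPL_RX_DATA_DDP: the payload has already
 * been DMAed into the DDP buffer named by the ddp_report.  Work out where in
 * the buffer it landed, tag an mbuf with the gather list, advance rcv_nxt,
 * and append the mbuf to the receive buffer before waking the receiver.
 */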
2164static void
2165new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2166{
2167 struct tcpcb *tp;
2168 struct ddp_state *q;
2169 struct ddp_buf_state *bsp;
2170 struct cpl_rx_data_ddp *hdr;
2171 struct socket *so;
2172 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2173 int nomoredata = 0;
2174 unsigned int delack_mode;
2175 struct sockbuf *rcv;
2176
2177 tp = toep->tp_tp;
2178 inp_wlock(tp->t_inpcb);
2179 so = inp_inpcbtosocket(tp->t_inpcb);
2180
2181 if (__predict_false(so_no_receive(so))) {
2182
2183 handle_excess_rx(toep, m);
2184 inp_wunlock(tp->t_inpcb);
2185 return;
2186 }
2187
2188 q = &toep->tp_ddp_state;
2189 hdr = cplhdr(m);
2190 ddp_report = ntohl(hdr->u.ddp_report);
2191 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2192 bsp = &q->buf_state[buf_idx];
2193
2194 CTR4(KTR_TOM,
2195 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2196 "hdr seq 0x%x len %u",
2197 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2198 ntohs(hdr->len));
2199 CTR3(KTR_TOM,
2200 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2201 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2202
2203 ddp_len = ntohs(hdr->len);
2204 rcv_nxt = ntohl(hdr->seq) + ddp_len;
2205
2206 delack_mode = G_DDP_DACK_MODE(ddp_report);
2207 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2208 toep->tp_delack_mode = delack_mode;
2209 toep->tp_delack_seq = tp->rcv_nxt;
2210 }
2211
2212 m->m_seq = tp->rcv_nxt;
2213 tp->rcv_nxt = rcv_nxt;
2214
2215 tp->t_rcvtime = ticks;
2216 /*
2217 * Store the length in m->m_len. We are changing the meaning of
2218 	 * m->m_len here, so we need to be very careful that nothing from now on
2219 	 * interprets the length of this packet the usual way.
2220 */
2221 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2222 inp_wunlock(tp->t_inpcb);
2223 CTR3(KTR_TOM,
2224 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2225 m->m_len, rcv_nxt, m->m_seq);
2226 /*
2227 * Figure out where the new data was placed in the buffer and store it
2228 	 * in m_cur_offset.  Assumes the buffer offset starts at 0; the consumer
2229 	 * needs to account for the page pod's pg_offset.
2230 */
2231 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2232 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2233
2234 rcv = so_sockbuf_rcv(so);
2235 sockbuf_lock(rcv);
2236
2237 m->m_ddp_gl = (unsigned char *)bsp->gl;
2238 m->m_flags |= M_DDP;
2239 bsp->cur_offset = end_offset;
2240 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2241
2242 /*
2243 * Length is only meaningful for kbuf
2244 */
2245 if (!(bsp->flags & DDP_BF_NOCOPY))
2246 KASSERT(m->m_len <= bsp->gl->dgl_length,
2247 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2248 m->m_len, bsp->gl->dgl_length));
2249
2250 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2251 	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2252 /*
2253 * Bit 0 of flags stores whether the DDP buffer is completed.
2254 * Note that other parts of the code depend on this being in bit 0.
2255 */
2256 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2257 panic("spurious ddp completion");
2258 } else {
2259 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2260 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2261 q->cur_buf ^= 1; /* flip buffers */
2262 }
2263
2264 if (bsp->flags & DDP_BF_NOCOPY) {
2265 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2266 bsp->flags &= ~DDP_BF_NOCOPY;
2267 }
2268
2269 if (ddp_report & F_DDP_PSH)
2270 m->m_ddp_flags |= DDP_BF_PSH;
2271 if (nomoredata)
2272 m->m_ddp_flags |= DDP_BF_NODATA;
2273
2274#ifdef notyet
2275 skb_reset_transport_header(skb);
2276 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
2277#endif
2278 SBAPPEND(rcv, m);
2279
2280 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2281 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2282 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2283 so_sorwakeup_locked(so);
2284 else
2285 sockbuf_unlock(rcv);
2286}
2287
2288#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2289 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2290 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2291 F_DDP_INVALID_PPOD)
2292
2293/*
2294 * Handler for RX_DATA_DDP CPL messages.
2295 */
2296static int
2297do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2298{
2299 struct toepcb *toep = ctx;
2300 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2301
2302 VALIDATE_SOCK(so);
2303
2304 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2305 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2306 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2307 return (CPL_RET_BUF_DONE);
2308 }
2309#if 0
2310 skb->h.th = tcphdr_skb->h.th;
2311#endif
2312 new_rx_data_ddp(toep, m);
2313 return (0);
2314}
2315
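/*
 * Process a CPL_RX_DDP_COMPLETE: the HW reports how far it filled the
 * indicated DDP buffer.  The amount of new data is the reported DDP offset
 * minus the offset previously recorded for that buffer.
 */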
2316static void
2317process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2318{
2319 struct tcpcb *tp = toep->tp_tp;
2320 struct socket *so;
2321 struct ddp_state *q;
2322 struct ddp_buf_state *bsp;
2323 struct cpl_rx_ddp_complete *hdr;
2324 unsigned int ddp_report, buf_idx, when, delack_mode;
2325 int nomoredata = 0;
2326 struct sockbuf *rcv;
2327
2328 inp_wlock(tp->t_inpcb);
2329 so = inp_inpcbtosocket(tp->t_inpcb);
2330
2331 if (__predict_false(so_no_receive(so))) {
2332 struct inpcb *inp = so_sotoinpcb(so);
2333
2334 handle_excess_rx(toep, m);
2335 inp_wunlock(inp);
2336 return;
2337 }
2338 q = &toep->tp_ddp_state;
2339 hdr = cplhdr(m);
2340 ddp_report = ntohl(hdr->ddp_report);
2341 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2342 m->m_pkthdr.csum_data = tp->rcv_nxt;
2343
2344 rcv = so_sockbuf_rcv(so);
2345 sockbuf_lock(rcv);
2346
2347 bsp = &q->buf_state[buf_idx];
2348 when = bsp->cur_offset;
2349 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2350 tp->rcv_nxt += m->m_len;
2351 tp->t_rcvtime = ticks;
2352
2353 delack_mode = G_DDP_DACK_MODE(ddp_report);
2354 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2355 toep->tp_delack_mode = delack_mode;
2356 toep->tp_delack_seq = tp->rcv_nxt;
2357 }
2358#ifdef notyet
2359 skb_reset_transport_header(skb);
2360 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2361#endif
2362 inp_wunlock(tp->t_inpcb);
2363
2364 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2365 CTR5(KTR_TOM,
2366 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2367 "ddp_report 0x%x offset %u, len %u",
2368 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2369 G_DDP_OFFSET(ddp_report), m->m_len);
2370
2371 m->m_cur_offset = bsp->cur_offset;
2372 bsp->cur_offset += m->m_len;
2373
2374 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2375 q->cur_buf ^= 1; /* flip buffers */
2376 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2377 			nomoredata = 1;
2378 }
2379
2380 CTR4(KTR_TOM,
2381 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2382 "ddp_report %u offset %u",
2383 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2384 G_DDP_OFFSET(ddp_report));
2385
2386 m->m_ddp_gl = (unsigned char *)bsp->gl;
2387 m->m_flags |= M_DDP;
2388 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2389 if (bsp->flags & DDP_BF_NOCOPY)
2390 bsp->flags &= ~DDP_BF_NOCOPY;
2391 if (nomoredata)
2392 m->m_ddp_flags |= DDP_BF_NODATA;
2393
2394 SBAPPEND(rcv, m);
2395 if ((so_state_get(so) & SS_NOFDREF) == 0)
2396 so_sorwakeup_locked(so);
2397 else
2398 sockbuf_unlock(rcv);
2399}
2400
2401/*
2402 * Handler for RX_DDP_COMPLETE CPL messages.
2403 */
2404static int
2405do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2406{
2407 struct toepcb *toep = ctx;
2408
2409 VALIDATE_SOCK(so);
2410#if 0
2411 skb->h.th = tcphdr_skb->h.th;
2412#endif
2413 process_ddp_complete(toep, m);
2414 return (0);
2415}
2416
2417/*
2418 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2419 * socket state before calling tcp_offload_twstart() to meet its expectations.
2420 */
2421static void
2422enter_timewait(struct tcpcb *tp)
2423{
2424 /*
2425 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2426 * process peer_close because we don't want to carry the peer FIN in
2427 * the socket's receive queue and if we increment rcv_nxt without
2428 * having the FIN in the receive queue we'll confuse facilities such
2429 * as SIOCINQ.
2430 */
2431 inp_wlock(tp->t_inpcb);
2432 tp->rcv_nxt++;
2433
2434 tp->ts_recent_age = 0; /* defeat recycling */
2435 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2436 inp_wunlock(tp->t_inpcb);
2437 tcp_offload_twstart(tp);
2438}
2439
2440/*
2441 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2442 * function deals with the data that may be reported along with the FIN.
2443 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2444 * perform normal FIN-related processing.  In the latter case 1 indicates that
2445 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, and
2446 * 0 indicates that the mbuf can be freed.
2447 */
2448static int
2449handle_peer_close_data(struct socket *so, struct mbuf *m)
2450{
2451 struct tcpcb *tp = so_sototcpcb(so);
2452 struct toepcb *toep = tp->t_toe;
2453 struct ddp_state *q;
2454 struct ddp_buf_state *bsp;
2455 struct cpl_peer_close *req = cplhdr(m);
2456 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2457 struct sockbuf *rcv;
2458
2459 if (tp->rcv_nxt == rcv_nxt) /* no data */
2460 return (0);
2461
2462 CTR0(KTR_TOM, "handle_peer_close_data");
2463 if (__predict_false(so_no_receive(so))) {
2464 handle_excess_rx(toep, m);
2465
2466 /*
2467 * Although we discard the data we want to process the FIN so
2468 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2469 * PEER_CLOSE without data. In particular this PEER_CLOSE
2470 * may be what will close the connection. We return 1 because
2471 * handle_excess_rx() already freed the packet.
2472 */
2473 return (1);
2474 }
2475
2476 inp_lock_assert(tp->t_inpcb);
2477 q = &toep->tp_ddp_state;
2478 rcv = so_sockbuf_rcv(so);
2479 sockbuf_lock(rcv);
2480
2481 bsp = &q->buf_state[q->cur_buf];
2482 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2483 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2484 m->m_ddp_gl = (unsigned char *)bsp->gl;
2485 m->m_flags |= M_DDP;
2486 m->m_cur_offset = bsp->cur_offset;
2487 m->m_ddp_flags =
2488 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2489 m->m_seq = tp->rcv_nxt;
2490 tp->rcv_nxt = rcv_nxt;
2491 bsp->cur_offset += m->m_pkthdr.len;
2492 if (!(bsp->flags & DDP_BF_NOFLIP))
2493 q->cur_buf ^= 1;
2494#ifdef notyet
2495 skb_reset_transport_header(skb);
2496 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2497#endif
2498 tp->t_rcvtime = ticks;
2499 SBAPPEND(rcv, m);
2500 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2501 so_sorwakeup_locked(so);
2502 else
2503 sockbuf_unlock(rcv);
2504
2505 return (1);
2506}
2507
2508/*
2509 * Handle a peer FIN.
2510 */
2511static void
2512do_peer_fin(struct toepcb *toep, struct mbuf *m)
2513{
2514 struct socket *so;
2515 struct tcpcb *tp = toep->tp_tp;
2516 int keep, action;
2517
2518 action = keep = 0;
2519 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2520 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2521 printf("abort_pending set\n");
2522
2523 goto out;
2524 }
2525 inp_wlock(tp->t_inpcb);
2526 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2527 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2528 keep = handle_peer_close_data(so, m);
2529 if (keep < 0) {
2530 inp_wunlock(tp->t_inpcb);
2531 return;
2532 }
2533 }
2534 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2535 CTR1(KTR_TOM,
2536 "waking up waiters for cantrcvmore on %p ", so);
2537 socantrcvmore(so);
2538
2539 /*
2540 * If connection is half-synchronized
2541 * (ie NEEDSYN flag on) then delay ACK,
2542 * so it may be piggybacked when SYN is sent.
2543 * Otherwise, since we received a FIN then no
2544 * more input can be expected, send ACK now.
2545 */
2546 if (tp->t_flags & TF_NEEDSYN)
2547 tp->t_flags |= TF_DELACK;
2548 else
2549 tp->t_flags |= TF_ACKNOW;
2550 tp->rcv_nxt++;
2551 }
2552
2553 switch (tp->t_state) {
2554 case TCPS_SYN_RECEIVED:
2555 tp->t_starttime = ticks;
2556 /* FALLTHROUGH */
2557 case TCPS_ESTABLISHED:
2558 tp->t_state = TCPS_CLOSE_WAIT;
2559 break;
2560 case TCPS_FIN_WAIT_1:
2561 tp->t_state = TCPS_CLOSING;
2562 break;
2563 case TCPS_FIN_WAIT_2:
2564 /*
2565 * If we've sent an abort_req we must have sent it too late,
2566 * HW will send us a reply telling us so, and this peer_close
2567 * is really the last message for this connection and needs to
2568 * be treated as an abort_rpl, i.e., transition the connection
2569 * to TCP_CLOSE (note that the host stack does this at the
2570 * time of generating the RST but we must wait for HW).
2571 * Otherwise we enter TIME_WAIT.
2572 */
2573 t3_release_offload_resources(toep);
2574 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2575 action = TCP_CLOSE;
2576 } else {
2577 action = TCP_TIMEWAIT;
2578 }
2579 break;
2580 default:
2581 log(LOG_ERR,
2582 "%s: TID %u received PEER_CLOSE in bad state %d\n",
2583 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2584 }
2585 inp_wunlock(tp->t_inpcb);
2586
2587 if (action == TCP_TIMEWAIT) {
2588 enter_timewait(tp);
2589 } else if (action == TCP_DROP) {
2590 tcp_offload_drop(tp, 0);
2591 } else if (action == TCP_CLOSE) {
2592 tcp_offload_close(tp);
2593 }
2594
2595#ifdef notyet
2596 /* Do not send POLL_HUP for half duplex close. */
2597 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2598 sk->sk_state == TCP_CLOSE)
2599 sk_wake_async(so, 1, POLL_HUP);
2600 else
2601 sk_wake_async(so, 1, POLL_IN);
2602#endif
2603
2604out:
2605 if (!keep)
2606 m_free(m);
2607}
2608
2609/*
2610 * Handler for PEER_CLOSE CPL messages.
2611 */
2612static int
2613do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2614{
2615 struct toepcb *toep = (struct toepcb *)ctx;
2616
2617 VALIDATE_SOCK(so);
2618
2619 do_peer_fin(toep, m);
2620 return (0);
2621}
2622
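/*
 * Process a CPL_CLOSE_CON_RPL, the reply to our CLOSE_CON_REQ.  The reply
 * carries snd_nxt, from which snd_una is derived (excluding our FIN), and
 * drives the CLOSING/LAST_ACK/FIN_WAIT_1 state transitions.
 */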
2623static void
2624process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2625{
2626 struct cpl_close_con_rpl *rpl = cplhdr(m);
2627 struct tcpcb *tp = toep->tp_tp;
2628 struct socket *so;
2629 int action = 0;
2630 struct sockbuf *rcv;
2631
2632 inp_wlock(tp->t_inpcb);
2633 so = inp_inpcbtosocket(tp->t_inpcb);
2634
2635 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
2636
2637 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2638 inp_wunlock(tp->t_inpcb);
2639 goto out;
2640 }
2641
2642 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2643 tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2644
2645 switch (tp->t_state) {
2646 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2647 t3_release_offload_resources(toep);
2648 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2649 action = TCP_CLOSE;
2650
2651 } else {
2652 action = TCP_TIMEWAIT;
2653 }
2654 break;
2655 case TCPS_LAST_ACK:
2656 /*
2657 * In this state we don't care about pending abort_rpl.
2658 * If we've sent abort_req it was post-close and was sent too
2659 * late, this close_con_rpl is the actual last message.
2660 */
2661 t3_release_offload_resources(toep);
2662 action = TCP_CLOSE;
2663 break;
2664 case TCPS_FIN_WAIT_1:
2665 /*
2666 * If we can't receive any more
2667 * data, then closing user can proceed.
2668 * Starting the timer is contrary to the
2669 * specification, but if we don't get a FIN
2670 * we'll hang forever.
2671 *
2672 * XXXjl:
2673 * we should release the tp also, and use a
2674 * compressed state.
2675 */
2676 if (so)
2677 rcv = so_sockbuf_rcv(so);
2678 else
2679 break;
2680
2681 if (rcv->sb_state & SBS_CANTRCVMORE) {
2682 int timeout;
2683
2684 if (so)
2685 soisdisconnected(so);
2686 timeout = (tcp_fast_finwait2_recycle) ?
2687 tcp_finwait2_timeout : tcp_maxidle;
2688 tcp_timer_activate(tp, TT_2MSL, timeout);
2689 }
2690 tp->t_state = TCPS_FIN_WAIT_2;
2691 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2692 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2693 action = TCP_DROP;
2694 }
2695
2696 break;
2697 default:
2698 log(LOG_ERR,
2699 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2700 toep->tp_toedev->tod_name, toep->tp_tid,
2701 tp->t_state);
2702 }
2703 inp_wunlock(tp->t_inpcb);
2704
2705
2706 if (action == TCP_TIMEWAIT) {
2707 enter_timewait(tp);
2708 } else if (action == TCP_DROP) {
2709 tcp_offload_drop(tp, 0);
2710 } else if (action == TCP_CLOSE) {
2711 tcp_offload_close(tp);
2712 }
2713out:
2714 m_freem(m);
2715}
2716
2717/*
2718 * Handler for CLOSE_CON_RPL CPL messages.
2719 */
2720static int
2721do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2722 void *ctx)
2723{
2724 struct toepcb *toep = (struct toepcb *)ctx;
2725
2726 process_close_con_rpl(toep, m);
2727 return (0);
2728}
2729
2730/*
2731 * Process abort replies. We only process these messages if we anticipate
2732 * them, as the coordination between SW and HW in this area is somewhat lacking
2733 * and sometimes we get ABORT_RPLs after we are done with the connection that
2734 * originated the ABORT_REQ.
2735 */
2736static void
2737process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2738{
2739 struct tcpcb *tp = toep->tp_tp;
2740 struct socket *so;
2741 int needclose = 0;
2742
2743#ifdef T3_TRACE
2744 T3_TRACE1(TIDTB(sk),
2745 "process_abort_rpl: GTS rpl pending %d",
2746 sock_flag(sk, ABORT_RPL_PENDING));
2747#endif
2748
2749 inp_wlock(tp->t_inpcb);
2750 so = inp_inpcbtosocket(tp->t_inpcb);
2751
2752 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2753 /*
2754 * XXX panic on tcpdrop
2755 */
2756 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2757 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2758 else {
2759 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2760 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2761 !is_t3a(toep->tp_toedev)) {
2762 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2763 panic("TP_ABORT_REQ_RCVD set");
2764 t3_release_offload_resources(toep);
2765 needclose = 1;
2766 }
2767 }
2768 }
2769 inp_wunlock(tp->t_inpcb);
2770
2771 if (needclose)
2772 tcp_offload_close(tp);
2773
2774 m_free(m);
2775}
2776
2777/*
2778 * Handle an ABORT_RPL_RSS CPL message.
2779 */
2780static int
2781do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2782{
2783 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2784 struct toepcb *toep;
2785
2786 /*
2787 * Ignore replies to post-close aborts indicating that the abort was
2788 * requested too late. These connections are terminated when we get
2789 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2790 * arrives the TID is either no longer used or it has been recycled.
2791 */
2792 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2793discard:
2794 m_free(m);
2795 return (0);
2796 }
2797
2798 toep = (struct toepcb *)ctx;
2799
2800 /*
2801 * Sometimes we've already closed the socket, e.g., a post-close
2802 * abort races with ABORT_REQ_RSS, the latter frees the socket
2803 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2804 * but FW turns the ABORT_REQ into a regular one and so we get
2805 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
2806 */
2807 if (!toep)
2808 goto discard;
2809
2810 if (toep->tp_tp == NULL) {
2811 log(LOG_NOTICE, "removing tid for abort\n");
2812 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2813 if (toep->tp_l2t)
2814 l2t_release(L2DATA(cdev), toep->tp_l2t);
2815
2816 toepcb_release(toep);
2817 goto discard;
2818 }
2819
2820 log(LOG_NOTICE, "toep=%p\n", toep);
2821 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2822
2823 toepcb_hold(toep);
2824 process_abort_rpl(toep, m);
2825 toepcb_release(toep);
2826 return (0);
2827}
2828
2829/*
2830 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2831 * indicate whether RST should be sent in response.
2832 */
2833static int
2834abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2835{
2836 struct tcpcb *tp = so_sototcpcb(so);
2837
2838 switch (abort_reason) {
2839 case CPL_ERR_BAD_SYN:
2840#if 0
2841 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2842#endif
2843 case CPL_ERR_CONN_RESET:
2844 // XXX need to handle SYN_RECV due to crossed SYNs
2845 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2846 case CPL_ERR_XMIT_TIMEDOUT:
2847 case CPL_ERR_PERSIST_TIMEDOUT:
2848 case CPL_ERR_FINWAIT2_TIMEDOUT:
2849 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2850#if 0
2851 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2852#endif
2853 return (ETIMEDOUT);
2854 default:
2855 return (EIO);
2856 }
2857}
2858
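/*
 * Populate an ABORT_RPL work request for the given tid.  cmd tells the HW
 * whether a RST should be sent to the peer.
 */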
2859static inline void
2860set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2861{
2862 struct cpl_abort_rpl *rpl = cplhdr(m);
2863
2864 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2865 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2866 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2867
2868 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2869 rpl->cmd = cmd;
2870}
2871
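/*
 * Deferred handler used when send_abort_rpl() could not allocate a reply
 * mbuf: allocate one now, build the ABORT_RPL from the saved request (the
 * rst_status was stashed in req->status), and send it.
 */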
2872static void
2873send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2874{
2875 struct mbuf *reply_mbuf;
2876 struct cpl_abort_req_rss *req = cplhdr(m);
2877
2878 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2879 	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2880 	reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2881 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2882 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2883 m_free(m);
2884}
2885
2886/*
2887 * Returns whether an ABORT_REQ_RSS message is negative advice.
2888 */
2889static inline int
2890is_neg_adv_abort(unsigned int status)
2891{
2892 return status == CPL_ERR_RTX_NEG_ADVICE ||
2893 status == CPL_ERR_PERSIST_NEG_ADVICE;
2894}
2895
2896static void
2897send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2898{
2899 struct mbuf *reply_mbuf;
2900 struct cpl_abort_req_rss *req = cplhdr(m);
2901
2902 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2903
2904 if (!reply_mbuf) {
2905 		/* Defer the reply.  Stick rst_status into req->status. */
2906 req->status = rst_status;
2907 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2908 return;
2909 }
2910
2911 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2912 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2913 m_free(m);
2914
2915 /*
2916 * XXX need to sync with ARP as for SYN_RECV connections we can send
2917 * these messages while ARP is pending. For other connection states
2918 * it's not a problem.
2919 */
2920 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2921}
2922
2923#ifdef notyet
2924static void
2925cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2926{
2927 CXGB_UNIMPLEMENTED();
2928#ifdef notyet
2929 struct request_sock *req = child->sk_user_data;
2930
2931 inet_csk_reqsk_queue_removed(parent, req);
2932 synq_remove(tcp_sk(child));
2933 __reqsk_free(req);
2934 child->sk_user_data = NULL;
2935#endif
2936}
2937
2938
2939/*
2940 * Performs the actual work to abort a SYN_RECV connection.
2941 */
2942static void
2943do_abort_syn_rcv(struct socket *child, struct socket *parent)
2944{
2945 struct tcpcb *parenttp = so_sototcpcb(parent);
2946 struct tcpcb *childtp = so_sototcpcb(child);
2947
2948 /*
2949 * If the server is still open we clean up the child connection,
2950 * otherwise the server already did the clean up as it was purging
2951 	 * its SYN queue and the mbuf was just sitting in its backlog.
2952 */
2953 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2954 cleanup_syn_rcv_conn(child, parent);
2955 inp_wlock(childtp->t_inpcb);
2956 t3_release_offload_resources(childtp->t_toe);
2957 inp_wunlock(childtp->t_inpcb);
2958 tcp_offload_close(childtp);
2959 }
2960}
2961#endif
2962
2963/*
2964 * Handle abort requests for a SYN_RECV connection. These need extra work
2965 * because the socket is on its parent's SYN queue.
2966 */
2967static int
2968abort_syn_rcv(struct socket *so, struct mbuf *m)
2969{
2970 CXGB_UNIMPLEMENTED();
2971#ifdef notyet
2972 struct socket *parent;
2973 struct toedev *tdev = toep->tp_toedev;
2974 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2975 struct socket *oreq = so->so_incomp;
2976 struct t3c_tid_entry *t3c_stid;
2977 struct tid_info *t;
2978
2979 if (!oreq)
2980 return -1; /* somehow we are not on the SYN queue */
2981
2982 t = &(T3C_DATA(cdev))->tid_maps;
2983 t3c_stid = lookup_stid(t, oreq->ts_recent);
2984 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2985
2986 so_lock(parent);
2987 do_abort_syn_rcv(so, parent);
2988 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2989 so_unlock(parent);
2990#endif
2991 return (0);
2992}
2993
2994/*
2995 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
2996 * request except that we need to reply to it.
2997 */
2998static void
2999process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3000{
3001 int rst_status = CPL_ABORT_NO_RST;
3002 const struct cpl_abort_req_rss *req = cplhdr(m);
3003 struct tcpcb *tp = toep->tp_tp;
3004 struct socket *so;
3005 int needclose = 0;
3006
3007 inp_wlock(tp->t_inpcb);
3008 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3009 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3010 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3011 m_free(m);
3012 goto skip;
3013 }
3014
3015 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3016 /*
3017 * Three cases to consider:
3018 * a) We haven't sent an abort_req; close the connection.
3019 * b) We have sent a post-close abort_req that will get to TP too late
3020 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
3021 * be ignored and the connection should be closed now.
3022 * c) We have sent a regular abort_req that will get to TP too late.
3023 * That will generate an abort_rpl with status 0, wait for it.
3024 */
3025 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3026 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3027 int error;
3028
3029 error = abort_status_to_errno(so, req->status,
3030 &rst_status);
3031 so_error_set(so, error);
3032
3033 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3034 so_sorwakeup(so);
3035 /*
3036 * SYN_RECV needs special processing. If abort_syn_rcv()
3037 		 * returns 0 it has taken care of the abort.
3038 */
3039 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3040 goto skip;
3041
3042 t3_release_offload_resources(toep);
3043 needclose = 1;
3044 }
3045 inp_wunlock(tp->t_inpcb);
3046
3047 if (needclose)
3048 tcp_offload_close(tp);
3049
3050 send_abort_rpl(m, tdev, rst_status);
3051 return;
3052skip:
3053 inp_wunlock(tp->t_inpcb);
3054}
3055
3056/*
3057 * Handle an ABORT_REQ_RSS CPL message.
3058 */
3059static int
3060do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3061{
3062 const struct cpl_abort_req_rss *req = cplhdr(m);
3063 struct toepcb *toep = (struct toepcb *)ctx;
3064
3065 if (is_neg_adv_abort(req->status)) {
3066 m_free(m);
3067 return (0);
3068 }
3069
3070 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3071
3072 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3073 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3074 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3075
3076 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3077 if (toep->tp_l2t)
3078 l2t_release(L2DATA(cdev), toep->tp_l2t);
3079
3080 /*
3081 * Unhook
3082 */
3083 toep->tp_tp->t_toe = NULL;
3084 toep->tp_tp->t_flags &= ~TF_TOE;
3085 toep->tp_tp = NULL;
3086 /*
3087 * XXX need to call syncache_chkrst - but we don't
3088 * have a way of doing that yet
3089 */
3090 toepcb_release(toep);
3091 log(LOG_ERR, "abort for unestablished connection :-(\n");
3092 return (0);
3093 }
3094 if (toep->tp_tp == NULL) {
3095 log(LOG_NOTICE, "disconnected toepcb\n");
3096 /* should be freed momentarily */
3097 return (0);
3098 }
3099
3100
3101 toepcb_hold(toep);
3102 process_abort_req(toep, m, toep->tp_toedev);
3103 toepcb_release(toep);
3104 return (0);
3105}
3106#ifdef notyet
3107static void
3108pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3109{
3110 struct toedev *tdev = TOE_DEV(parent);
3111
3112 do_abort_syn_rcv(child, parent);
3113 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3114 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3115
3116 rpl->opt0h = htonl(F_TCAM_BYPASS);
3117 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3118 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3119 } else
3120 m_free(m);
3121}
3122#endif
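/*
 * Handle an ARP failure for the CPL_PASS_ACCEPT_RPL of an embryonic
 * connection.  Not implemented yet; the notyet block below sketches the
 * intended teardown via the parent listening socket.
 */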
3123static void
3124handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3125{
3126 CXGB_UNIMPLEMENTED();
3127
3128#ifdef notyet
3129 struct t3cdev *cdev;
3130 struct socket *parent;
3131 struct socket *oreq;
3132 struct t3c_tid_entry *t3c_stid;
3133 struct tid_info *t;
3134 struct tcpcb *otp, *tp = so_sototcpcb(so);
3135 struct toepcb *toep = tp->t_toe;
3136
3137 /*
3138 * If the connection is being aborted due to the parent listening
3139 * socket going away there's nothing to do, the ABORT_REQ will close
3140 * the connection.
3141 */
3142 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3143 m_free(m);
3144 return;
3145 }
3146
3147 oreq = so->so_incomp;
3148 otp = so_sototcpcb(oreq);
3149
3150 cdev = T3C_DEV(so);
3151 t = &(T3C_DATA(cdev))->tid_maps;
3152 t3c_stid = lookup_stid(t, otp->ts_recent);
3153 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3154
3155 so_lock(parent);
3156 pass_open_abort(so, parent, m);
3157 so_unlock(parent);
3158#endif
3159}
3160
3161/*
3162 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3163 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3164 * connection.
3165 */
3166static void
3167pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3168{
3169
3170#ifdef notyet
3171 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3172 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3173#endif
3174 handle_pass_open_arp_failure(m_get_socket(m), m);
3175}
3176
3177/*
3178 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3179 */
3180static void
3181mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3182{
3183 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3184 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3185 unsigned int tid = GET_TID(req);
3186
3187 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3188 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3189 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3190 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
3191 rpl->opt0h = htonl(F_TCAM_BYPASS);
3192 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3193 rpl->opt2 = 0;
3194 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3195}
3196
3197/*
3198 * Send a deferred reject to an accept request.
3199 */
3200static void
3201reject_pass_request(struct toedev *tdev, struct mbuf *m)
3202{
3203 struct mbuf *reply_mbuf;
3204
3205 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3206 mk_pass_accept_rpl(reply_mbuf, m);
3207 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3208 m_free(m);
3209}
3210
3211static void
3212handle_syncache_event(int event, void *arg)
3213{
3214 struct toepcb *toep = arg;
3215
3216 switch (event) {
3217 case TOE_SC_ENTRY_PRESENT:
3218 /*
3219 * entry already exists - free toepcb
3220 * and l2t
3221 */
3222 printf("syncache entry present\n");
3223 toepcb_release(toep);
3224 break;
3225 case TOE_SC_DROP:
3226 /*
3227 		 * The syncache has given up on this entry:
3228 		 * either it timed out or it was evicted.
3229 		 * We need to explicitly release the tid.
3230 */
3231 printf("syncache entry dropped\n");
3232 toepcb_release(toep);
3233 break;
3234 default:
3235 log(LOG_ERR, "unknown syncache event %d\n", event);
3236 break;
3237 }
3238}
3239
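/*
 * Translate a CPL_PASS_ACCEPT_REQ into the in_conninfo/toeopt/tcphdr form the
 * syncache expects and add an entry for the embryonic connection, passing the
 * toepcb along as its context.
 */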
3240static void
3241syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3242{
3243 struct in_conninfo inc;
3244 struct toeopt toeo;
3245 struct tcphdr th;
3246 struct inpcb *inp;
3247 int mss, wsf, sack, ts;
3248 uint32_t rcv_isn = ntohl(req->rcv_isn);
3249
3250 bzero(&toeo, sizeof(struct toeopt));
3251 inp = so_sotoinpcb(lso);
3252
3253 /*
3254 * Fill out information for entering us into the syncache
3255 */
3256 bzero(&inc, sizeof(inc));
3257 inc.inc_fport = th.th_sport = req->peer_port;
3258 inc.inc_lport = th.th_dport = req->local_port;
3259 th.th_seq = req->rcv_isn;
3260 th.th_flags = TH_SYN;
3261
3262 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3263
3264 inc.inc_len = 0;
3265 inc.inc_faddr.s_addr = req->peer_ip;
3266 inc.inc_laddr.s_addr = req->local_ip;
3267
3268 DPRINTF("syncache add of %d:%d %d:%d\n",
3269 ntohl(req->local_ip), ntohs(req->local_port),
3270 ntohl(req->peer_ip), ntohs(req->peer_port));
3271
3272 mss = req->tcp_options.mss;
3273 wsf = req->tcp_options.wsf;
3274 ts = req->tcp_options.tstamp;
3275 sack = req->tcp_options.sack;
3276 toeo.to_mss = mss;
3277 toeo.to_wscale = wsf;
3278 toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3279 	tcp_offload_syncache_add(&inc, &toeo, &th, inp, &lso, &cxgb_toe_usrreqs,
3280 	    toep);
3281}
3282
3283
3284/*
3285 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3286 * lock held.  Note that the socket here is a listening socket that is not owned
3287 * by the TOE.
3288 */
3289static void
3290process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3291 struct listen_ctx *lctx)
3292{
3293 int rt_flags;
3294 struct l2t_entry *e;
3295 struct iff_mac tim;
3296 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3297 struct cpl_pass_accept_rpl *rpl;
3298 struct cpl_pass_accept_req *req = cplhdr(m);
3299 unsigned int tid = GET_TID(req);
3300 struct tom_data *d = TOM_DATA(tdev);
3301 struct t3cdev *cdev = d->cdev;
3302 struct tcpcb *tp = so_sototcpcb(so);
3303 struct toepcb *newtoep;
3304 struct rtentry *dst;
3305 struct sockaddr_in nam;
3306 struct t3c_data *td = T3C_DATA(cdev);
3307
3308 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3309 if (__predict_false(reply_mbuf == NULL)) {
3310 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3311 t3_defer_reply(m, tdev, reject_pass_request);
3312 else {
3313 cxgb_queue_tid_release(cdev, tid);
3314 m_free(m);
3315 }
3316 DPRINTF("failed to get reply_mbuf\n");
3317
3318 goto out;
3319 }
3320
3321 if (tp->t_state != TCPS_LISTEN) {
3322 DPRINTF("socket not in listen state\n");
3323
3324 goto reject;
3325 }
3326
3327 tim.mac_addr = req->dst_mac;
3328 tim.vlan_tag = ntohs(req->vlan_tag);
3329 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3330 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3331 goto reject;
3332 }
3333
3334#ifdef notyet
3335 /*
3336 * XXX do route lookup to confirm that we're still listening on this
3337 * address
3338 */
3339 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3340 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3341 goto reject;
3342 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3343 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3344 dst_release(skb->dst); // done with the input route, release it
3345 skb->dst = NULL;
3346
3347 if ((rt_flags & RTF_LOCAL) == 0)
3348 goto reject;
3349#endif
3350 /*
3351 * XXX
3352 */
3353 rt_flags = RTF_LOCAL;
3354 if ((rt_flags & RTF_LOCAL) == 0)
3355 goto reject;
3356
3357 /*
3358 * Calculate values and add to syncache
3359 */
3360
3361 newtoep = toepcb_alloc();
3362 if (newtoep == NULL)
3363 goto reject;
3364
3365 bzero(&nam, sizeof(struct sockaddr_in));
3366
3367 nam.sin_len = sizeof(struct sockaddr_in);
3368 nam.sin_family = AF_INET;
3369 	nam.sin_addr.s_addr = req->peer_ip;
3370 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3371
3372 if (dst == NULL) {
3373 printf("failed to find route\n");
3374 goto reject;
3375 }
3376 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3377 (struct sockaddr *)&nam);
3378 if (e == NULL) {
3379 DPRINTF("failed to get l2t\n");
3380 }
3381 /*
3382 * Point to our listen socket until accept
3383 */
3384 newtoep->tp_tp = tp;
3385 newtoep->tp_flags = TP_SYN_RCVD;
3386 newtoep->tp_tid = tid;
3387 newtoep->tp_toedev = tdev;
3388 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3389
3390 cxgb_insert_tid(cdev, d->client, newtoep, tid);
3391 so_lock(so);
3392 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3393 so_unlock(so);
3394
3395 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3396 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3397
3398 if (newtoep->tp_ulp_mode) {
3399 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3400
3401 if (ddp_mbuf == NULL)
3402 newtoep->tp_ulp_mode = 0;
3403 }
3404
3405 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3406 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3407 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3408 /*
3409 * XXX workaround for lack of syncache drop
3410 */
3411 toepcb_hold(newtoep);
3412 syncache_add_accept_req(req, so, newtoep);
3413
3414 rpl = cplhdr(reply_mbuf);
3415 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3416 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3417 rpl->wr.wr_lo = 0;
3418 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3419 rpl->opt2 = htonl(calc_opt2(so, tdev));
3420 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3421 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3422
3423 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3424 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3425 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3426 CPL_PASS_OPEN_ACCEPT);
3427
3428 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3429
3430 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3431
3432 l2t_send(cdev, reply_mbuf, e);
3433 m_free(m);
3434 if (newtoep->tp_ulp_mode) {
3435 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3436 V_TF_DDP_OFF(1) |
3437 TP_DDP_TIMER_WORKAROUND_MASK,
3438 V_TF_DDP_OFF(1) |
3439 TP_DDP_TIMER_WORKAROUND_VAL, 1);
3440 } else
3441 DPRINTF("no DDP\n");
3442
3443 return;
3444reject:
3445 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3446 mk_pass_accept_rpl(reply_mbuf, m);
3447 else
3448 mk_tid_release(reply_mbuf, newtoep, tid);
3449 cxgb_ofld_send(cdev, reply_mbuf);
3450 m_free(m);
3451out:
3452#if 0
3453 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3454#else
3455 return;
3456#endif
3457}
3458
3459/*
3460 * Handle a CPL_PASS_ACCEPT_REQ message.
3461 */
3462static int
3463do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3464{
3465 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3466 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3467 struct tom_data *d = listen_ctx->tom_data;
3468
3469#if VALIDATE_TID
3470 struct cpl_pass_accept_req *req = cplhdr(m);
3471 unsigned int tid = GET_TID(req);
3472 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3473
3474 if (unlikely(!lsk)) {
3475 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3476 cdev->name,
3477 (unsigned long)((union listen_entry *)ctx -
3478 t->stid_tab));
3479 return CPL_RET_BUF_DONE;
3480 }
3481 if (unlikely(tid >= t->ntids)) {
3482 printk(KERN_ERR "%s: passive open TID %u too large\n",
3483 cdev->name, tid);
3484 return CPL_RET_BUF_DONE;
3485 }
3486 /*
3487 * For T3A the current user of the TID may have closed but its last
3488 * message(s) may have been backlogged so the TID appears to be still
3489 * in use. Just take the TID away, the connection can close at its
3490 * own leisure. For T3B this situation is a bug.
3491 */
3492 if (!valid_new_tid(t, tid) &&
3493 cdev->type != T3A) {
3494 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3495 cdev->name, tid);
3496 return CPL_RET_BUF_DONE;
3497 }
3498#endif
3499
3500 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3501 return (0);
3502}
3503
3504/*
3505 * Called when a connection is established to translate the TCP options
3506 * reported by HW to FreeBSD's native format.
3507 */
3508static void
3509assign_rxopt(struct socket *so, unsigned int opt)
3510{
3511 struct tcpcb *tp = so_sototcpcb(so);
3512 struct toepcb *toep = tp->t_toe;
3513 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3514
3515 inp_lock_assert(tp->t_inpcb);
3516
3517 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3518 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3519 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3520 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3521 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3522 (TF_RCVD_SCALE|TF_REQ_SCALE))
3523 tp->rcv_scale = tp->request_r_scale;
3524}
3525
3526/*
3527 * Completes some final bits of initialization for newly established connections
3528 * and changes their state to TCPS_ESTABLISHED.
3529 *
3530 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3531 */
3532static void
3533make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3534{
3535 struct tcpcb *tp = so_sototcpcb(so);
3536 struct toepcb *toep = tp->t_toe;
3537
3538 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3539 assign_rxopt(so, opt);
3540
3541 /*
3542 *XXXXXXXXXXX
3543 *
3544 */
3545#ifdef notyet
3546 so->so_proto->pr_ctloutput = t3_ctloutput;
3547#endif
3548
3549#if 0
3550 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3551#endif
3552 /*
3553 * XXX not clear what rcv_wup maps to
3554 */
3555 /*
3556 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3557 * pass through opt0.
3558 */
3559 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3560 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3561
3562 dump_toepcb(toep);
3563
3564#ifdef notyet
3565/*
3566 * no clean interface for marking ARP up to date
3567 */
3568 dst_confirm(sk->sk_dst_cache);
3569#endif
3570 tp->t_starttime = ticks;
3571 tp->t_state = TCPS_ESTABLISHED;
3572 soisconnected(so);
3573}
3574
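/*
 * Reconstruct the connection identifiers and negotiated TCP options from a
 * CPL_PASS_ESTABLISH and ask the syncache to expand the corresponding entry
 * into a full socket.
 */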
3575static int
3576syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3577{
3578
3579 struct in_conninfo inc;
3580 struct toeopt toeo;
3581 struct tcphdr th;
3582 int mss, wsf, sack, ts;
3583 struct mbuf *m = NULL;
3584 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3585 unsigned int opt;
3586
3587#ifdef MAC
3588#error "no MAC support"
3589#endif
3590
3591 opt = ntohs(req->tcp_opt);
3592
3593 bzero(&toeo, sizeof(struct toeopt));
3594
3595 /*
3596 	 * Fill out information for looking up our entry in the syncache
3597 */
3598 bzero(&inc, sizeof(inc));
3599 inc.inc_fport = th.th_sport = req->peer_port;
3600 inc.inc_lport = th.th_dport = req->local_port;
3601 th.th_seq = req->rcv_isn;
3602 th.th_flags = TH_ACK;
3603
3604 inc.inc_len = 0;
3605 inc.inc_faddr.s_addr = req->peer_ip;
3606 inc.inc_laddr.s_addr = req->local_ip;
3607
3608 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3609 wsf = G_TCPOPT_WSCALE_OK(opt);
3610 ts = G_TCPOPT_TSTAMP(opt);
3611 sack = G_TCPOPT_SACK(opt);
3612
3613 toeo.to_mss = mss;
3614 toeo.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3615 toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3616
3617 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3618 ntohl(req->local_ip), ntohs(req->local_port),
3619 ntohl(req->peer_ip), ntohs(req->peer_port),
3620 mss, wsf, ts, sack);
3621 return tcp_offload_syncache_expand(&inc, &toeo, &th, so, m);
3622}
3623
3624
3625/*
3626 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3627 * if we are in TCPS_SYN_RECEIVED due to crossed SYNs.
3628 */
3629static int
3630do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3631{
3632 struct cpl_pass_establish *req = cplhdr(m);
3633 struct toepcb *toep = (struct toepcb *)ctx;
3634 struct tcpcb *tp = toep->tp_tp;
3635 struct socket *so, *lso;
3636 struct t3c_data *td = T3C_DATA(cdev);
3637 struct sockbuf *snd, *rcv;
3638
3639 // Complete socket initialization now that we have the SND_ISN
3640
3641 struct toedev *tdev;
3642
3643
3644 tdev = toep->tp_toedev;
3645
3646 inp_wlock(tp->t_inpcb);
3647
3648 /*
3649 *
3650 * XXX need to add reference while we're manipulating
3651 */
3652 so = lso = inp_inpcbtosocket(tp->t_inpcb);
3653
3654 inp_wunlock(tp->t_inpcb);
3655
3656 so_lock(so);
3657 LIST_REMOVE(toep, synq_entry);
3658 so_unlock(so);
3659
3660 if (!syncache_expand_establish_req(req, &so, toep)) {
3661 /*
3662 * No entry
3663 */
3664 CXGB_UNIMPLEMENTED();
3665 }
3666 if (so == NULL) {
3667 /*
3668 * Couldn't create the socket
3669 */
3670 CXGB_UNIMPLEMENTED();
3671 }
3672
3673 tp = so_sototcpcb(so);
3674 inp_wlock(tp->t_inpcb);
3675
3676 snd = so_sockbuf_snd(so);
3677 rcv = so_sockbuf_rcv(so);
3678
3679 snd->sb_flags |= SB_NOCOALESCE;
3680 rcv->sb_flags |= SB_NOCOALESCE;
3681
3682 toep->tp_tp = tp;
3683 toep->tp_flags = 0;
3684 tp->t_toe = toep;
3685 reset_wr_list(toep);
3686 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3687 tp->rcv_nxt = toep->tp_copied_seq;
3688 install_offload_ops(so);
3689
3690 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3691 toep->tp_wr_unacked = 0;
3692 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3693 toep->tp_qset_idx = 0;
3694 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3695
3696 /*
3697 * XXX Cancel any keep alive timer
3698 */
3699
3700 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3701
3702 /*
3703 * XXX workaround for lack of syncache drop
3704 */
3705 toepcb_release(toep);
3706 inp_wunlock(tp->t_inpcb);
3707
3708 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3709 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3710#ifdef notyet
3711 /*
3712 * XXX not sure how these checks map to us
3713 */
3714 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3715 sk->sk_state_change(sk);
3716 sk_wake_async(so, 0, POLL_OUT);
3717 }
3718 /*
3719 * The state for the new connection is now up to date.
3720 * Next check if we should add the connection to the parent's
3721 * accept queue. When the parent closes it resets connections
3722 * on its SYN queue, so check if we are being reset. If so we
3723 * don't need to do anything more, the coming ABORT_RPL will
3724 * destroy this socket. Otherwise move the connection to the
3725 * accept queue.
3726 *
3727 * Note that we reset the synq before closing the server so if
3728 * we are not being reset the stid is still open.
3729 */
3730 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3731 __kfree_skb(skb);
3732 goto unlock;
3733 }
3734#endif
3735 m_free(m);
3736
3737 return (0);
3738}
3739
3740/*
3741 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3742 * and send them to the TOE.
3743 */
3744static void
3745fixup_and_send_ofo(struct toepcb *toep)
3746{
3747 struct mbuf *m;
3748 struct toedev *tdev = toep->tp_toedev;
3749 struct tcpcb *tp = toep->tp_tp;
3750 unsigned int tid = toep->tp_tid;
3751
3752 log(LOG_NOTICE, "fixup_and_send_ofo\n");
3753
3754 inp_lock_assert(tp->t_inpcb);
3755 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3756 /*
3757 		 * A variety of messages can be waiting, but the fields we'll
3758 		 * be touching are common to all, so any message type will do.
3759 */
3760 struct cpl_close_con_req *p = cplhdr(m);
3761
3762 p->wr.wr_lo = htonl(V_WR_TID(tid));
3763 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3764 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3765 }
3766}
3767
3768/*
3769 * Updates socket state from an active establish CPL message. Runs with the
3770 * socket lock held.
3771 */
3772static void
3773socket_act_establish(struct socket *so, struct mbuf *m)
3774{
3775 struct cpl_act_establish *req = cplhdr(m);
3776 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3777 struct tcpcb *tp = so_sototcpcb(so);
3778 struct toepcb *toep = tp->t_toe;
3779
3780 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3781 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3782 toep->tp_tid, tp->t_state);
3783
3784 tp->ts_recent_age = ticks;
3785 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3786 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3787
3788 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3789
3790 /*
3791 	 * Now that we finally have a TID, send any CPL messages that we had to
3792 * defer for lack of a TID.
3793 */
3794 if (mbufq_len(&toep->out_of_order_queue))
3795 fixup_and_send_ofo(toep);
3796
3797 if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3798 /*
3799 * XXX does this even make sense?
3800 */
3801 so_sorwakeup(so);
3802 }
3803 m_free(m);
3804#ifdef notyet
3805/*
3806 * XXX assume no write requests permitted while socket connection is
3807 * incomplete
3808 */
3809 /*
3810 * Currently the send queue must be empty at this point because the
3811 * socket layer does not send anything before a connection is
3812 * established. To be future proof though we handle the possibility
3813 * that there are pending buffers to send (either TX_DATA or
3814 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3815 * buffers according to the just learned write_seq, and then we send
3816 * them on their way.
3817 */
3818 fixup_pending_writeq_buffers(sk);
3819 if (t3_push_frames(so, 1))
3820 sk->sk_write_space(sk);
3821#endif
3822
3823 toep->tp_state = tp->t_state;
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/fcntl.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/ktr.h>
39#include <sys/lock.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/sockstate.h>
43#include <sys/sockopt.h>
44#include <sys/socket.h>
45#include <sys/sockbuf.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/protosw.h>
49#include <sys/priv.h>
50
51#if __FreeBSD_version < 800044
52#define V_tcp_do_autosndbuf tcp_do_autosndbuf
53#define V_tcp_autosndbuf_max tcp_autosndbuf_max
54#define V_tcp_do_rfc1323 tcp_do_rfc1323
55#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
56#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
57#define V_tcpstat tcpstat
58#endif
59
60#include <net/if.h>
61#include <net/route.h>
62
63#include <netinet/in.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_systm.h>
66#include <netinet/in_var.h>
67
68
69#include <cxgb_osdep.h>
70#include <sys/mbufq.h>
71
72#include <netinet/ip.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcp_fsm.h>
75#include <netinet/tcp_offload.h>
76#include <netinet/tcp_seq.h>
77#include <netinet/tcp_syncache.h>
78#include <netinet/tcp_timer.h>
79#include <net/route.h>
80
81#include <t3cdev.h>
82#include <common/cxgb_firmware_exports.h>
83#include <common/cxgb_t3_cpl.h>
84#include <common/cxgb_tcb.h>
85#include <common/cxgb_ctl_defs.h>
86#include <cxgb_offload.h>
87#include <vm/vm.h>
88#include <vm/pmap.h>
89#include <machine/bus.h>
90#include <sys/mvec.h>
91#include <ulp/toecore/cxgb_toedev.h>
92#include <ulp/tom/cxgb_l2t.h>
93#include <ulp/tom/cxgb_defs.h>
94#include <ulp/tom/cxgb_tom.h>
95#include <ulp/tom/cxgb_t3_ddp.h>
96#include <ulp/tom/cxgb_toepcb.h>
97#include <ulp/tom/cxgb_tcp.h>
98#include <ulp/tom/cxgb_tcp_offload.h>
99
100/*
101 * For ULP connections HW may add headers, e.g., for digests, that aren't part
102 * of the messages sent by the host but that are part of the TCP payload and
103 * therefore consume TCP sequence space. Tx connection parameters that
104 * operate in TCP sequence space are affected by the HW additions and need to
105 * compensate for them to accurately track TCP sequence numbers. This array
106 * contains the compensating extra lengths for ULP packets. It is indexed by
107 * a packet's ULP submode.
108 */
109const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
110
111#ifdef notyet
112/*
113 * This sk_buff holds a fake header-only TCP segment that we use whenever we
114 * need to exploit SW TCP functionality that expects TCP headers, such as
115 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
116 * CPUs without locking.
117 */
118static struct mbuf *tcphdr_mbuf __read_mostly;
119#endif
120
121/*
122 * Size of WRs in bytes. Note that we assume all devices we are handling have
123 * the same WR size.
124 */
125static unsigned int wrlen __read_mostly;
126
127/*
128 * The number of WRs needed for an mbuf depends on the number of page fragments
129 * in the mbuf and whether it has any payload in its main body. This maps the
130 * length of the gather list represented by an mbuf into the # of necessary WRs.
131 */
132static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
133
134/*
135 * Max receive window supported by HW in bytes. Only a small part of it can
136 * be set through option0, the rest needs to be set through RX_DATA_ACK.
137 */
138#define MAX_RCV_WND ((1U << 27) - 1)
139
140/*
141 * Min receive window. We want it to be large enough to accommodate receive
142 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
143 */
144#define MIN_RCV_WND (24 * 1024U)
145#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
146
147#define VALIDATE_SEQ 0
148#define VALIDATE_SOCK(so)
149#define DEBUG_WR 0
150
151#define TCP_TIMEWAIT 1
152#define TCP_CLOSE 2
153#define TCP_DROP 3
154
155static void t3_send_reset(struct toepcb *toep);
156static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
157static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
158static void handle_syncache_event(int event, void *arg);
159
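/*
 * Debug wrapper around sbappendstream_locked(): sanity-check the mbuf chains
 * already in the sockbuf and the chain being appended, append, then re-check.
 */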
160static inline void
161SBAPPEND(struct sockbuf *sb, struct mbuf *n)
162{
163 struct mbuf *m;
164
165 m = sb->sb_mb;
166 while (m) {
167 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
168 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
169 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
170 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
171 m->m_next, m->m_nextpkt, m->m_flags));
172 m = m->m_next;
173 }
174 m = n;
175 while (m) {
176 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
177 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
178 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
179 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
180 m->m_next, m->m_nextpkt, m->m_flags));
181 m = m->m_next;
182 }
183 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
184 sbappendstream_locked(sb, n);
185 m = sb->sb_mb;
186
187 while (m) {
188 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
189 m->m_next, m->m_nextpkt, m->m_flags));
190 m = m->m_next;
191 }
192}
193
194static inline int
195is_t3a(const struct toedev *dev)
196{
197 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
198}
199
200static void
201dump_toepcb(struct toepcb *toep)
202{
203 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
204 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
205 toep->tp_mtu_idx, toep->tp_tid);
206
207 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
208 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
209 toep->tp_mss_clamp, toep->tp_flags);
210}
211
212#ifndef RTALLOC2_DEFINED
213static struct rtentry *
214rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
215{
216 struct rtentry *rt = NULL;
217
218 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
219 RT_UNLOCK(rt);
220
221 return (rt);
222}
223#endif
224
225/*
226 * Determine whether to send a CPL message now or defer it. A message is
227 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
228 * For connections in other states the message is sent immediately.
229 * If through_l2t is set the message is subject to ARP processing, otherwise
230 * it is sent directly.
231 */
232static inline void
233send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
234{
235 struct tcpcb *tp = toep->tp_tp;
236
237 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
238 inp_wlock(tp->t_inpcb);
239 mbufq_tail(&toep->out_of_order_queue, m); // defer
240 inp_wunlock(tp->t_inpcb);
241 } else if (through_l2t)
242 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
243 else
244 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
245}
246
247static inline unsigned int
248mkprio(unsigned int cntrl, const struct toepcb *toep)
249{
250 return (cntrl);
251}
252
253/*
254 * Populate a TID_RELEASE WR. The mbuf must already be properly sized.
255 */
256static inline void
257mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
258{
259 struct cpl_tid_release *req;
260
261 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
262 m->m_pkthdr.len = m->m_len = sizeof(*req);
263 req = mtod(m, struct cpl_tid_release *);
264 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
265 req->wr.wr_lo = 0;
266 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
267}
268
269static inline void
270make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
271{
272 struct tcpcb *tp = so_sototcpcb(so);
273 struct toepcb *toep = tp->t_toe;
274 struct tx_data_wr *req;
275 struct sockbuf *snd;
276
277 inp_lock_assert(tp->t_inpcb);
278 snd = so_sockbuf_snd(so);
279
280 req = mtod(m, struct tx_data_wr *);
281 m->m_len = sizeof(*req);
282 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
283 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
284 /* len includes the length of any HW ULP additions */
285 req->len = htonl(len);
286 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
287 /* V_TX_ULP_SUBMODE sets both the mode and submode */
288 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
289 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
290 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
291 (tail ? 0 : 1))));
292 req->sndseq = htonl(tp->snd_nxt);
293 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
294 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
295 V_TX_CPU_IDX(toep->tp_qset));
296
297 /* Send buffer is in units of 32KB. */
299 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
300 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
301 else {
302 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
303 }
304
305 toep->tp_flags |= TP_DATASENT;
306 }
307}
308
309#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
310
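/*
 * Push pending data from the socket's send buffer to the hardware as
 * TX_DATA work requests (immediate data or gather lists), consuming WR
 * credits and advancing sb_sndptr.  Returns the number of bytes queued.
 */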
311int
312t3_push_frames(struct socket *so, int req_completion)
313{
314 struct tcpcb *tp = so_sototcpcb(so);
315 struct toepcb *toep = tp->t_toe;
316
317 struct mbuf *tail, *m0, *last;
318 struct t3cdev *cdev;
319 struct tom_data *d;
320 int state, bytes, count, total_bytes;
321 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
322 struct sockbuf *snd;
323
324 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
325 DPRINTF("tcp state=%d\n", tp->t_state);
326 return (0);
327 }
328
329 state = so_state_get(so);
330
331 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
332 DPRINTF("disconnecting\n");
333
334 return (0);
335 }
336
337 inp_lock_assert(tp->t_inpcb);
338
339 snd = so_sockbuf_snd(so);
340 sockbuf_lock(snd);
341
342 d = TOM_DATA(toep->tp_toedev);
343 cdev = d->cdev;
344
345 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
346
347 total_bytes = 0;
348 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
349 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
350
351 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
352 KASSERT(tail, ("sbdrop error"));
353 last = tail = tail->m_next;
354 }
355
356 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
357 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
358 sockbuf_unlock(snd);
359
360 return (0);
361 }
362
363 toep->tp_m_last = NULL;
364 while (toep->tp_wr_avail && (tail != NULL)) {
365 count = bytes = 0;
366 segp = segs;
367 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
368 sockbuf_unlock(snd);
369 return (0);
370 }
371 /*
372 * If the data in tail fits inline, then
373 * make an immediate-data WR.
374 */
375 if (tail->m_len <= IMM_LEN) {
376 count = 1;
377 bytes = tail->m_len;
378 last = tail;
379 tail = tail->m_next;
380 m_set_sgl(m0, NULL);
381 m_set_sgllen(m0, 0);
382 make_tx_data_wr(so, m0, bytes, tail);
383 m_append(m0, bytes, mtod(last, caddr_t));
384 KASSERT(!m0->m_next, ("bad append"));
385 } else {
386 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
387 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
388 bytes += tail->m_len;
389 last = tail;
390 count++;
391 /*
392 * technically an abuse to be using this for a VA
393 * but less gross than defining my own structure
394 * or calling pmap_kextract from here :-|
395 */
396 segp->ds_addr = (bus_addr_t)tail->m_data;
397 segp->ds_len = tail->m_len;
398 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
399 count, mbuf_wrs[count], tail->m_data, tail->m_len);
400 segp++;
401 tail = tail->m_next;
402 }
403 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
404 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
405
406 m_set_sgl(m0, segs);
407 m_set_sgllen(m0, count);
408 make_tx_data_wr(so, m0, bytes, tail);
409 }
410 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
411
412 if (tail) {
413 snd->sb_sndptr = tail;
414 toep->tp_m_last = NULL;
415 } else
416 toep->tp_m_last = snd->sb_sndptr = last;
417
418
419 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
420
421 snd->sb_sndptroff += bytes;
422 total_bytes += bytes;
423 toep->tp_write_seq += bytes;
424 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
425 " tail=%p sndptr=%p sndptroff=%d",
426 toep->tp_wr_avail, count, mbuf_wrs[count],
427 tail, snd->sb_sndptr, snd->sb_sndptroff);
428 if (tail)
429 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
430 " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
431 total_bytes, toep->tp_m_last, tail->m_data,
432 tp->snd_una);
433 else
434 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
435 " tp_m_last=%p snd_una=0x%08x",
436 total_bytes, toep->tp_m_last, tp->snd_una);
437
438
439#ifdef KTR
440{
441 int i;
442
443 i = 0;
444 while (i < count && m_get_sgllen(m0)) {
445 if ((count - i) >= 3) {
446 CTR6(KTR_TOM,
447 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
448 " len=%d pa=0x%zx len=%d",
449 segs[i].ds_addr, segs[i].ds_len,
450 segs[i + 1].ds_addr, segs[i + 1].ds_len,
451 segs[i + 2].ds_addr, segs[i + 2].ds_len);
452 i += 3;
453 } else if ((count - i) == 2) {
454 CTR4(KTR_TOM,
455 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
456 " len=%d",
457 segs[i].ds_addr, segs[i].ds_len,
458 segs[i + 1].ds_addr, segs[i + 1].ds_len);
459 i += 2;
460 } else {
461 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
462 segs[i].ds_addr, segs[i].ds_len);
463 i++;
464 }
465
466 }
467}
468#endif
469 /*
470 * remember credits used
471 */
472 m0->m_pkthdr.csum_data = mbuf_wrs[count];
473 m0->m_pkthdr.len = bytes;
474 toep->tp_wr_avail -= mbuf_wrs[count];
475 toep->tp_wr_unacked += mbuf_wrs[count];
476
477 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
478 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
479 struct work_request_hdr *wr = cplhdr(m0);
480
481 wr->wr_hi |= htonl(F_WR_COMPL);
482 toep->tp_wr_unacked = 0;
483 }
484 KASSERT((m0->m_pkthdr.csum_data > 0) &&
485 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
486 m0->m_pkthdr.csum_data));
487 m0->m_type = MT_DONTFREE;
488 enqueue_wr(toep, m0);
489 DPRINTF("sending offload tx with %d bytes in %d segments\n",
490 bytes, count);
491 l2t_send(cdev, m0, toep->tp_l2t);
492 }
493 sockbuf_unlock(snd);
494 return (total_bytes);
495}
496
497/*
498 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
499 * under any circumstances. We take the easy way out and always queue the
500 * message to the write_queue. We can optimize the case where the queue is
501 * already empty, though the optimization is probably not worth it.
502 */
503static void
504close_conn(struct socket *so)
505{
506 struct mbuf *m;
507 struct cpl_close_con_req *req;
508 struct tom_data *d;
509 struct inpcb *inp = so_sotoinpcb(so);
510 struct tcpcb *tp;
511 struct toepcb *toep;
512 unsigned int tid;
513
514
515 inp_wlock(inp);
516 tp = so_sototcpcb(so);
517 toep = tp->t_toe;
518
519 if (tp->t_state != TCPS_SYN_SENT)
520 t3_push_frames(so, 1);
521
522 if (toep->tp_flags & TP_FIN_SENT) {
523 inp_wunlock(inp);
524 return;
525 }
526
527 tid = toep->tp_tid;
528
529 d = TOM_DATA(toep->tp_toedev);
530
531 m = m_gethdr_nofail(sizeof(*req));
532 m_set_priority(m, CPL_PRIORITY_DATA);
533 m_set_sgl(m, NULL);
534 m_set_sgllen(m, 0);
535
536 toep->tp_flags |= TP_FIN_SENT;
537 req = mtod(m, struct cpl_close_con_req *);
538
539 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
540 req->wr.wr_lo = htonl(V_WR_TID(tid));
541 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
542 req->rsvd = 0;
543 inp_wunlock(inp);
544 /*
545 * XXX - need to defer shutdown while there is still data in the queue
546 *
547 */
548 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
549 cxgb_ofld_send(d->cdev, m);
550
551}
552
553/*
554 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no-RST variant
555 * and send it along.
556 */
557static void
558abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
559{
560 struct cpl_abort_req *req = cplhdr(m);
561
562 req->cmd = CPL_ABORT_NO_RST;
563 cxgb_ofld_send(cdev, m);
564}
565
566/*
567 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
568 * permitted to return without sending the message in case we cannot allocate
569 * an mbuf. Returns the number of credits sent.
570 */
571uint32_t
572t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
573{
574 struct mbuf *m;
575 struct cpl_rx_data_ack *req;
576 struct toepcb *toep = tp->t_toe;
577 struct toedev *tdev = toep->tp_toedev;
578
579 m = m_gethdr_nofail(sizeof(*req));
580
581 DPRINTF("returning %u credits to HW\n", credits);
582
583 req = mtod(m, struct cpl_rx_data_ack *);
584 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
585 req->wr.wr_lo = 0;
586 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
587 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
588 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
589 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
590 return (credits);
591}
592
593/*
594 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
595 * This is only used in DDP mode, so we take the opportunity to also set the
596 * DACK mode and flush any Rx credits.
597 */
598void
599t3_send_rx_modulate(struct toepcb *toep)
600{
601 struct mbuf *m;
602 struct cpl_rx_data_ack *req;
603
604 m = m_gethdr_nofail(sizeof(*req));
605
606 req = mtod(m, struct cpl_rx_data_ack *);
607 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
608 req->wr.wr_lo = 0;
609 m->m_pkthdr.len = m->m_len = sizeof(*req);
610
611 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
612 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
613 V_RX_DACK_MODE(1) |
614 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
615 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
616 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
617 toep->tp_rcv_wup = toep->tp_copied_seq;
618}
619
620/*
621 * Handle receipt of an urgent pointer.
622 */
623static void
624handle_urg_ptr(struct socket *so, uint32_t urg_seq)
625{
626#ifdef URGENT_DATA_SUPPORTED
627 struct tcpcb *tp = so_sototcpcb(so);
628
629 urg_seq--; /* initially points past the urgent data, per BSD */
630
631 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
632 return; /* duplicate pointer */
633 sk_send_sigurg(sk);
634 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
635 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
636 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
637
638 tp->copied_seq++;
639 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
640 tom_eat_skb(sk, skb, 0);
641 }
642 tp->urg_data = TCP_URG_NOTYET;
643 tp->urg_seq = urg_seq;
644#endif
645}
646
647/*
648 * Returns true if a socket cannot accept new Rx data.
649 */
650static inline int
651so_no_receive(const struct socket *so)
652{
653 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
654}
655
656/*
657 * Process an urgent data notification.
658 */
659static void
660rx_urg_notify(struct toepcb *toep, struct mbuf *m)
661{
662 struct cpl_rx_urg_notify *hdr = cplhdr(m);
663 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
664
665 VALIDATE_SOCK(so);
666
667 if (!so_no_receive(so))
668 handle_urg_ptr(so, ntohl(hdr->seq));
669
670 m_freem(m);
671}
672
673/*
674 * Handler for RX_URG_NOTIFY CPL messages.
675 */
676static int
677do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
678{
679 struct toepcb *toep = (struct toepcb *)ctx;
680
681 rx_urg_notify(toep, m);
682 return (0);
683}
684
685static __inline int
686is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
687{
688 return (toep->tp_ulp_mode ||
689 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
690 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
691}
692
693/*
694 * Set of states for which we should return RX credits.
695 */
696#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
697
698/*
699 * Called after some received data has been read. It returns RX credits
700 * to the HW for the amount of data processed.
701 */
702void
703t3_cleanup_rbuf(struct tcpcb *tp, int copied)
704{
705 struct toepcb *toep = tp->t_toe;
706 struct socket *so;
707 struct toedev *dev;
708 int dack_mode, must_send, read;
709 u32 thres, credits, dack = 0;
710 struct sockbuf *rcv;
711
712 so = inp_inpcbtosocket(tp->t_inpcb);
713 rcv = so_sockbuf_rcv(so);
714
715 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
716 (tp->t_state == TCPS_FIN_WAIT_2))) {
717 if (copied) {
718 sockbuf_lock(rcv);
719 toep->tp_copied_seq += copied;
720 sockbuf_unlock(rcv);
721 }
722
723 return;
724 }
725
726 inp_lock_assert(tp->t_inpcb);
727
728 sockbuf_lock(rcv);
729 if (copied)
730 toep->tp_copied_seq += copied;
731 else {
732 read = toep->tp_enqueued_bytes - rcv->sb_cc;
733 toep->tp_copied_seq += read;
734 }
735 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
736 toep->tp_enqueued_bytes = rcv->sb_cc;
737 sockbuf_unlock(rcv);
738
739 if (credits > rcv->sb_mbmax) {
740 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
741 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
742 credits = rcv->sb_mbmax;
743 }
744
745
746 /*
747 * XXX this won't accurately reflect credit return - we need
748 * to look at the difference between the amount that has been
749 * put in the recv sockbuf and what is there now
750 */
751
752 if (__predict_false(!credits))
753 return;
754
755 dev = toep->tp_toedev;
756 thres = TOM_TUNABLE(dev, rx_credit_thres);
757
758 if (__predict_false(thres == 0))
759 return;
760
761 if (is_delack_mode_valid(dev, toep)) {
762 dack_mode = TOM_TUNABLE(dev, delack);
763 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
764 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
765
766 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
767 dack = F_RX_DACK_CHANGE |
768 V_RX_DACK_MODE(dack_mode);
769 }
770 } else
771 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
772
773 /*
774 * For coalescing to work effectively ensure the receive window has
775 * at least 16KB left.
776 */
777 must_send = credits + 16384 >= tp->rcv_wnd;
778
779 if (must_send || credits >= thres)
780 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
781}
782
783static int
784cxgb_toe_disconnect(struct tcpcb *tp)
785{
786 struct socket *so;
787
788 DPRINTF("cxgb_toe_disconnect\n");
789
790 so = inp_inpcbtosocket(tp->t_inpcb);
791 close_conn(so);
792 return (0);
793}
794
795static int
796cxgb_toe_reset(struct tcpcb *tp)
797{
798 struct toepcb *toep = tp->t_toe;
799
800 t3_send_reset(toep);
801
802 /*
803 * unhook from socket
804 */
805 tp->t_flags &= ~TF_TOE;
806 toep->tp_tp = NULL;
807 tp->t_toe = NULL;
808 return (0);
809}
810
811static int
812cxgb_toe_send(struct tcpcb *tp)
813{
814 struct socket *so;
815
816 DPRINTF("cxgb_toe_send\n");
817 dump_toepcb(tp->t_toe);
818
819 so = inp_inpcbtosocket(tp->t_inpcb);
820 t3_push_frames(so, 1);
821 return (0);
822}
823
824static int
825cxgb_toe_rcvd(struct tcpcb *tp)
826{
827
828 inp_lock_assert(tp->t_inpcb);
829
830 t3_cleanup_rbuf(tp, 0);
831
832 return (0);
833}
834
835static void
836cxgb_toe_detach(struct tcpcb *tp)
837{
838 struct toepcb *toep;
839
840 /*
841 * XXX how do we handle teardown in the SYN_SENT state?
842 *
843 */
844 inp_lock_assert(tp->t_inpcb);
845 toep = tp->t_toe;
846 toep->tp_tp = NULL;
847
848 /*
849 * unhook from socket
850 */
851 tp->t_flags &= ~TF_TOE;
852 tp->t_toe = NULL;
853}
854
855
856static struct toe_usrreqs cxgb_toe_usrreqs = {
857 .tu_disconnect = cxgb_toe_disconnect,
858 .tu_reset = cxgb_toe_reset,
859 .tu_send = cxgb_toe_send,
860 .tu_rcvd = cxgb_toe_rcvd,
861 .tu_detach = cxgb_toe_detach,
863 .tu_syncache_event = handle_syncache_event,
864};
865
866
867static void
868__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
869 uint64_t mask, uint64_t val, int no_reply)
870{
871 struct cpl_set_tcb_field *req;
872
873 CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
874 toep->tp_tid, word, mask, val);
875
876 req = mtod(m, struct cpl_set_tcb_field *);
877 m->m_pkthdr.len = m->m_len = sizeof(*req);
878 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
879 req->wr.wr_lo = 0;
880 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
881 req->reply = V_NO_REPLY(no_reply);
882 req->cpu_idx = 0;
883 req->word = htons(word);
884 req->mask = htobe64(mask);
885 req->val = htobe64(val);
886
887 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
888 send_or_defer(toep, m, 0);
889}
890
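/*
 * Allocate an mbuf and issue a SET_TCB_FIELD update for this connection,
 * without requesting a reply.  No-op once the connection is closed or
 * being aborted.
 */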
891static void
892t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
893{
894 struct mbuf *m;
895 struct tcpcb *tp = toep->tp_tp;
896
897 if (toep == NULL)
898 return;
899
900 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
901 printf("not setting field\n");
902 return;
903 }
904
905 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
906
907 __set_tcb_field(toep, m, word, mask, val, 1);
908}
909
910/*
911 * Set one of the t_flags bits in the TCB.
912 */
913static void
914set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
915{
916
917 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
918}
919
920/*
921 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
922 */
923static void
924t3_set_nagle(struct toepcb *toep)
925{
926 struct tcpcb *tp = toep->tp_tp;
927
928 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
929}
930
931/*
932 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
933 */
934void
935t3_set_keepalive(struct toepcb *toep, int on_off)
936{
937
938 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
939}
940
941void
942t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
943{
944 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
945}
946
947void
948t3_set_dack_mss(struct toepcb *toep, int on_off)
949{
950
951 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
952}
953
954/*
955 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
956 */
957static void
958t3_set_tos(struct toepcb *toep)
959{
960 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
961
962 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
963 V_TCB_TOS(tos));
964}
965
966
967/*
968 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
969 * DDP is disabled (data is delivered to freelist). [Note that the peer should
970 * set the PSH bit in the last segment, which would trigger delivery.]
971 * We work around the issue by setting a DDP buffer in a partial placed state,
972 * which guarantees that TP will schedule a timer.
973 */
974#define TP_DDP_TIMER_WORKAROUND_MASK\
975 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
976 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
977 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
978#define TP_DDP_TIMER_WORKAROUND_VAL\
979 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
980 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
981 32))
982
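/*
 * Turn DDP on or off for a connection via the RX_DDP_FLAGS TCB word.  When
 * disabling, also apply the TP timer workaround described above.
 */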
983static void
984t3_enable_ddp(struct toepcb *toep, int on)
985{
986 if (on) {
987
988 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
989 V_TF_DDP_OFF(0));
990 } else
991 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
992 V_TF_DDP_OFF(1) |
993 TP_DDP_TIMER_WORKAROUND_MASK,
994 V_TF_DDP_OFF(1) |
995 TP_DDP_TIMER_WORKAROUND_VAL);
996
997}
998
999void
1000t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1001{
1002 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1003 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1004 tag_color);
1005}
1006
1007void
1008t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1009 unsigned int len)
1010{
1011 if (buf_idx == 0)
1012 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1013 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1014 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1015 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1016 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1017 else
1018 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1019 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1020 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1021 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1022 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1023}
1024
1025static int
1026t3_set_cong_control(struct socket *so, const char *name)
1027{
1028#ifdef CONGESTION_CONTROL_SUPPORTED
1029 int cong_algo;
1030
1031 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1032 if (!strcmp(name, t3_cong_ops[cong_algo].name))
1033 break;
1034
1035 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1036 return -EINVAL;
1037#endif
1038 return 0;
1039}
1040
1041int
1042t3_get_tcb(struct toepcb *toep)
1043{
1044 struct cpl_get_tcb *req;
1045 struct tcpcb *tp = toep->tp_tp;
1046 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1047
1048 if (!m)
1049 return (ENOMEM);
1050
1051 inp_lock_assert(tp->t_inpcb);
1052 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1053 req = mtod(m, struct cpl_get_tcb *);
1054 m->m_pkthdr.len = m->m_len = sizeof(*req);
1055 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1056 req->wr.wr_lo = 0;
1057 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1058 req->cpuno = htons(toep->tp_qset);
1059 req->rsvd = 0;
1060 if (tp->t_state == TCPS_SYN_SENT)
1061 mbufq_tail(&toep->out_of_order_queue, m); // defer
1062 else
1063 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1064 return 0;
1065}
1066
1067static inline void
1068so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1069{
1070
1071 toepcb_hold(toep);
1072
1073 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1074}
1075
1076/**
1077 * find_best_mtu - find the entry in the MTU table closest to an MTU
1078 * @d: TOM state
1079 * @mtu: the target MTU
1080 *
1081 * Returns the index of the value in the MTU table that is closest to but
1082 * does not exceed the target MTU.
1083 */
1084static unsigned int
1085find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1086{
1087 int i = 0;
1088
1089 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1090 ++i;
1091 return (i);
1092}
1093
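/*
 * Select an MTU table index for a connection: clamp t_maxseg to the path
 * MTU (less 40 bytes of IP + TCP headers) and to the smallest table entry,
 * then pick the closest entry that does not exceed it.
 */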
1094static unsigned int
1095select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1096{
1097 unsigned int idx;
1098
1099#ifdef notyet
1100 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1101#endif
1102 if (tp) {
1103 tp->t_maxseg = pmtu - 40;
1104 if (tp->t_maxseg < td->mtus[0] - 40)
1105 tp->t_maxseg = td->mtus[0] - 40;
1106 idx = find_best_mtu(td, tp->t_maxseg + 40);
1107
1108 tp->t_maxseg = td->mtus[idx] - 40;
1109 } else
1110 idx = find_best_mtu(td, pmtu);
1111
1112 return (idx);
1113}
1114
1115static inline void
1116free_atid(struct t3cdev *cdev, unsigned int tid)
1117{
1118 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1119
1120 if (toep)
1121 toepcb_release(toep);
1122}
1123
1124/*
1125 * Release resources held by an offload connection (TID, L2T entry, etc.)
1126 */
1127static void
1128t3_release_offload_resources(struct toepcb *toep)
1129{
1130 struct tcpcb *tp = toep->tp_tp;
1131 struct toedev *tdev = toep->tp_toedev;
1132 struct t3cdev *cdev;
1133 struct socket *so;
1134 unsigned int tid = toep->tp_tid;
1135 struct sockbuf *rcv;
1136
1137 CTR0(KTR_TOM, "t3_release_offload_resources");
1138
1139 if (!tdev)
1140 return;
1141
1142 cdev = TOEP_T3C_DEV(toep);
1143 if (!cdev)
1144 return;
1145
1146 toep->tp_qset = 0;
1147 t3_release_ddp_resources(toep);
1148
1149#ifdef CTRL_SKB_CACHE
1150 kfree_skb(CTRL_SKB_CACHE(tp));
1151 CTRL_SKB_CACHE(tp) = NULL;
1152#endif
1153
1154 if (toep->tp_wr_avail != toep->tp_wr_max) {
1155 purge_wr_queue(toep);
1156 reset_wr_list(toep);
1157 }
1158
1159 if (toep->tp_l2t) {
1160 l2t_release(L2DATA(cdev), toep->tp_l2t);
1161 toep->tp_l2t = NULL;
1162 }
1163 toep->tp_tp = NULL;
1164 if (tp) {
1165 inp_lock_assert(tp->t_inpcb);
1166 so = inp_inpcbtosocket(tp->t_inpcb);
1167 rcv = so_sockbuf_rcv(so);
1168 /*
1169 * cancel any offloaded reads
1170 *
1171 */
1172 sockbuf_lock(rcv);
1173 tp->t_toe = NULL;
1174 tp->t_flags &= ~TF_TOE;
1175 if (toep->tp_ddp_state.user_ddp_pending) {
1176 t3_cancel_ubuf(toep, rcv);
1177 toep->tp_ddp_state.user_ddp_pending = 0;
1178 }
1179 so_sorwakeup_locked(so);
1180
1181 }
1182
1183 if (toep->tp_state == TCPS_SYN_SENT) {
1184 free_atid(cdev, tid);
1185#ifdef notyet
1186 __skb_queue_purge(&tp->out_of_order_queue);
1187#endif
1188 } else { // we have TID
1189 cxgb_remove_tid(cdev, toep, tid);
1190 toepcb_release(toep);
1191 }
1192#if 0
1193 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1194#endif
1195}
1196
1197static void
1198install_offload_ops(struct socket *so)
1199{
1200 struct tcpcb *tp = so_sototcpcb(so);
1201
1202 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1203
1204 t3_install_socket_ops(so);
1205 tp->t_flags |= TF_TOE;
1206 tp->t_tu = &cxgb_toe_usrreqs;
1207}
1208
1209/*
1210 * Determine the receive window scaling factor given a target max
1211 * receive window.
1212 */
1213static __inline int
1214select_rcv_wscale(int space, struct vnet *vnet)
1215{
1216 int wscale = 0;
1217
1218 if (space > MAX_RCV_WND)
1219 space = MAX_RCV_WND;
1220
1221 if (V_tcp_do_rfc1323)
1222 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1223
1224 return (wscale);
1225}
1226
1227/*
1228 * Determine the receive window size for a socket.
1229 */
1230static unsigned long
1231select_rcv_wnd(struct toedev *dev, struct socket *so)
1232{
1233 struct tom_data *d = TOM_DATA(dev);
1234 unsigned int wnd;
1235 unsigned int max_rcv_wnd;
1236 struct sockbuf *rcv;
1237
1238 rcv = so_sockbuf_rcv(so);
1239
1240 if (V_tcp_do_autorcvbuf)
1241 wnd = V_tcp_autorcvbuf_max;
1242 else
1243 wnd = rcv->sb_hiwat;
1244
1245
1246
1247 /* XXX
1248 * For receive coalescing to work effectively we need a receive window
1249 * that can accommodate a coalesced segment.
1250 */
1251 if (wnd < MIN_RCV_WND)
1252 wnd = MIN_RCV_WND;
1253
1254 /* PR 5138 */
1255 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1256 (uint32_t)d->rx_page_size * 23 :
1257 MAX_RCV_WND);
1258
1259 return min(wnd, max_rcv_wnd);
1260}
1261
1262/*
1263 * Assign offload parameters to some socket fields. This code is used by
1264 * both active and passive opens.
1265 */
1266static inline void
1267init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1268 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1269{
1270 struct tcpcb *tp = so_sototcpcb(so);
1271 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1272 struct sockbuf *snd, *rcv;
1273
1274#ifdef notyet
1275 SOCK_LOCK_ASSERT(so);
1276#endif
1277
1278 snd = so_sockbuf_snd(so);
1279 rcv = so_sockbuf_rcv(so);
1280
1281 log(LOG_INFO, "initializing offload socket\n");
1282 /*
1283 * We either need to fix push frames to work with sbcompress
1284 * or we need to add this
1285 */
1286 snd->sb_flags |= SB_NOCOALESCE;
1287 rcv->sb_flags |= SB_NOCOALESCE;
1288
1289 tp->t_toe = toep;
1290 toep->tp_tp = tp;
1291 toep->tp_toedev = dev;
1292
1293 toep->tp_tid = tid;
1294 toep->tp_l2t = e;
1295 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1296 toep->tp_wr_unacked = 0;
1297 toep->tp_delack_mode = 0;
1298
1299 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1300 /*
1301 * XXX broken
1302 *
1303 */
1304 tp->rcv_wnd = select_rcv_wnd(dev, so);
1305
1306 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1307 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1308 toep->tp_qset_idx = 0;
1309
1310 reset_wr_list(toep);
1311 DPRINTF("initialization done\n");
1312}
1313
1314/*
1315 * The next two functions calculate the option 0 value for a socket.
1316 */
1317static inline unsigned int
1318calc_opt0h(struct socket *so, int mtu_idx)
1319{
1320 struct tcpcb *tp = so_sototcpcb(so);
1321 int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);
1322
1323 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1324 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1325 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1326}
1327
1328static inline unsigned int
1329calc_opt0l(struct socket *so, int ulp_mode)
1330{
1331 struct tcpcb *tp = so_sototcpcb(so);
1332 unsigned int val;
1333
1334 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1335 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1336
1337 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1338 return (val);
1339}
1340
1341static inline unsigned int
1342calc_opt2(const struct socket *so, struct toedev *dev)
1343{
1344 int flv_valid;
1345
1346 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1347
1348 return (V_FLAVORS_VALID(flv_valid) |
1349 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1350}
1351
1352#if DEBUG_WR > 1
1353static int
1354count_pending_wrs(const struct toepcb *toep)
1355{
1356 const struct mbuf *m;
1357 int n = 0;
1358
1359 wr_queue_walk(toep, m)
1360 n += m->m_pkthdr.csum_data;
1361 return (n);
1362}
1363#endif
1364
1365#if 0
1366(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1367#endif
1368
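/*
 * Build a CPL_ACT_OPEN_REQ for an active open from the connection's 4-tuple,
 * L2T entry, and the option 0/2 values computed above.
 */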
1369static void
1370mk_act_open_req(struct socket *so, struct mbuf *m,
1371 unsigned int atid, const struct l2t_entry *e)
1372{
1373 struct cpl_act_open_req *req;
1374 struct inpcb *inp = so_sotoinpcb(so);
1375 struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1376 struct toepcb *toep = tp->t_toe;
1377 struct toedev *tdev = toep->tp_toedev;
1378
1379 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1380
1381 req = mtod(m, struct cpl_act_open_req *);
1382 m->m_pkthdr.len = m->m_len = sizeof(*req);
1383
1384 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1385 req->wr.wr_lo = 0;
1386 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1387 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1388#if 0
1389 req->local_port = inp->inp_lport;
1390 req->peer_port = inp->inp_fport;
1391 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1392 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1393#endif
1394 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1395 V_TX_CHANNEL(e->smt_idx));
1396 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1397 req->params = 0;
1398 req->opt2 = htonl(calc_opt2(so, tdev));
1399}
1400
1401
1402/*
1403 * Convert an ACT_OPEN_RPL status to an errno.
1404 */
1405static int
1406act_open_rpl_status_to_errno(int status)
1407{
1408 switch (status) {
1409 case CPL_ERR_CONN_RESET:
1410 return (ECONNREFUSED);
1411 case CPL_ERR_ARP_MISS:
1412 return (EHOSTUNREACH);
1413 case CPL_ERR_CONN_TIMEDOUT:
1414 return (ETIMEDOUT);
1415 case CPL_ERR_TCAM_FULL:
1416 return (ENOMEM);
1417 case CPL_ERR_CONN_EXIST:
1418 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1419 return (EADDRINUSE);
1420 default:
1421 return (EIO);
1422 }
1423}
1424
1425static void
1426fail_act_open(struct toepcb *toep, int errno)
1427{
1428 struct tcpcb *tp = toep->tp_tp;
1429
1430 t3_release_offload_resources(toep);
1431 if (tp) {
1432 inp_wunlock(tp->t_inpcb);
1433 tcp_offload_drop(tp, errno);
1434 }
1435
1436#ifdef notyet
1437 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1438#endif
1439}
1440
1441/*
1442 * Handle active open failures.
1443 */
1444static void
1445active_open_failed(struct toepcb *toep, struct mbuf *m)
1446{
1447 struct cpl_act_open_rpl *rpl = cplhdr(m);
1448 struct inpcb *inp;
1449
1450 if (toep->tp_tp == NULL)
1451 goto done;
1452
1453 inp = toep->tp_tp->t_inpcb;
1454
1455/*
1456 * Don't handle connection retry for now
1457 */
1458#ifdef notyet
1459 struct inet_connection_sock *icsk = inet_csk(sk);
1460
1461 if (rpl->status == CPL_ERR_CONN_EXIST &&
1462 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1463 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1464 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1465 jiffies + HZ / 2);
1466 } else
1467#endif
1468 {
1469 inp_wlock(inp);
1470 /*
1471 * drops the inpcb lock
1472 */
1473 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1474 }
1475
1476 done:
1477 m_free(m);
1478}
1479
1480/*
1481 * Return whether a failed active open has allocated a TID
1482 */
1483static inline int
1484act_open_has_tid(int status)
1485{
1486 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1487 status != CPL_ERR_ARP_MISS;
1488}
1489
1490/*
1491 * Process an ACT_OPEN_RPL CPL message.
1492 */
1493static int
1494do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1495{
1496 struct toepcb *toep = (struct toepcb *)ctx;
1497 struct cpl_act_open_rpl *rpl = cplhdr(m);
1498
1499 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1500 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1501
1502 active_open_failed(toep, m);
1503 return (0);
1504}
1505
1506/*
1507 * Handle an ARP failure for an active open. XXX purge ofo queue
1508 *
1509 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1510 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1511 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1512 * free the atid. Hmm.
1513 */
1514#ifdef notyet
1515static void
1516act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1517{
1518 struct toepcb *toep = m_get_toep(m);
1519 struct tcpcb *tp = toep->tp_tp;
1520 struct inpcb *inp = tp->t_inpcb;
1521 struct socket *so;
1522
1523 inp_wlock(inp);
1524 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1525 /*
1526 * drops the inpcb lock
1527 */
1528 fail_act_open(so, EHOSTUNREACH);
1529 printf("freeing %p\n", m);
1530
1531 m_free(m);
1532 } else
1533 inp_wunlock(inp);
1534}
1535#endif
1536/*
1537 * Send an active open request.
1538 */
1539int
1540t3_connect(struct toedev *tdev, struct socket *so,
1541 struct rtentry *rt, struct sockaddr *nam)
1542{
1543 struct mbuf *m;
1544 struct l2t_entry *e;
1545 struct tom_data *d = TOM_DATA(tdev);
1546 struct inpcb *inp = so_sotoinpcb(so);
1547 struct tcpcb *tp = intotcpcb(inp);
1548 struct toepcb *toep; /* allocated by init_offload_socket */
1549
1550 int atid;
1551
1552 toep = toepcb_alloc();
1553 if (toep == NULL)
1554 goto out_err;
1555
1556 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1557 goto out_err;
1558
1559 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1560 if (!e)
1561 goto free_tid;
1562
1563 inp_lock_assert(inp);
1564 m = m_gethdr(M_WAITOK, MT_DATA);
1565
1566#if 0
1567 m->m_toe.mt_toepcb = tp->t_toe;
1568 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1569#endif
1570 so_lock(so);
1571
1572 init_offload_socket(so, tdev, atid, e, rt, toep);
1573
1574 install_offload_ops(so);
1575
1576 mk_act_open_req(so, m, atid, e);
1577 so_unlock(so);
1578
1579 soisconnecting(so);
1580 toep = tp->t_toe;
1581 m_set_toep(m, tp->t_toe);
1582
1583 toep->tp_state = TCPS_SYN_SENT;
1584 l2t_send(d->cdev, (struct mbuf *)m, e);
1585
1586 if (toep->tp_ulp_mode)
1587 t3_enable_ddp(toep, 0);
1588 return (0);
1589
1590free_tid:
1591 printf("failing connect - free atid\n");
1592
1593 free_atid(d->cdev, atid);
1594out_err:
1595 printf("return ENOMEM\n");
1596 return (ENOMEM);
1597}
1598
1599/*
1600 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1601 * not send multiple ABORT_REQs for the same connection and also that we do
1602 * not try to send a message after the connection has closed.
1604 */
1605static void
1606t3_send_reset(struct toepcb *toep)
1607{
1608
1609 struct cpl_abort_req *req;
1610 unsigned int tid = toep->tp_tid;
1611 int mode = CPL_ABORT_SEND_RST;
1612 struct tcpcb *tp = toep->tp_tp;
1613 struct toedev *tdev = toep->tp_toedev;
1614 struct socket *so = NULL;
1615 struct mbuf *m;
1616 struct sockbuf *snd;
1617
1618 if (tp) {
1619 inp_lock_assert(tp->t_inpcb);
1620 so = inp_inpcbtosocket(tp->t_inpcb);
1621 }
1622
1623 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1624 tdev == NULL))
1625 return;
1626 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1627
1628 snd = so_sockbuf_snd(so);
1629 /* Purge the send queue so we don't send anything after an abort. */
1630 if (so)
1631 sbflush(snd);
1632 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1633 mode |= CPL_ABORT_POST_CLOSE_REQ;
1634
1635 m = m_gethdr_nofail(sizeof(*req));
1636 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1637 set_arp_failure_handler(m, abort_arp_failure);
1638
1639 req = mtod(m, struct cpl_abort_req *);
1640 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1641 req->wr.wr_lo = htonl(V_WR_TID(tid));
1642 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1643 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1644 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1645 req->cmd = mode;
1646 if (tp && (tp->t_state == TCPS_SYN_SENT))
1647 mbufq_tail(&toep->out_of_order_queue, m); // defer
1648 else
1649 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1650}
1651
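/*
 * IP-level socket option handling for offloaded connections.  Only IP_TOS is
 * supported; the new value is pushed to the TCB via t3_set_tos().
 */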
1652static int
1653t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1654{
1655 struct inpcb *inp;
1656 int error, optval;
1657
1658 if (sopt->sopt_name == IP_OPTIONS)
1659 return (ENOPROTOOPT);
1660
1661 if (sopt->sopt_name != IP_TOS)
1662 return (EOPNOTSUPP);
1663
1664 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1665
1666 if (error)
1667 return (error);
1668
1669 if (optval > IPTOS_PREC_CRITIC_ECP)
1670 return (EINVAL);
1671
1672 inp = so_sotoinpcb(so);
1673 inp_wlock(inp);
1674 inp_ip_tos_set(inp, optval);
1675#if 0
1676 inp->inp_ip_tos = optval;
1677#endif
1678 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1679 inp_wunlock(inp);
1680
1681 return (0);
1682}
1683
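/*
 * TCP-level socket option handling for offloaded connections: TCP_CONGESTION
 * and TCP_NODELAY only, with the Nagle setting propagated to the TCB.
 */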
1684static int
1685t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1686{
1687 int err = 0;
1688 size_t copied;
1689
1690 if (sopt->sopt_name != TCP_CONGESTION &&
1691 sopt->sopt_name != TCP_NODELAY)
1692 return (EOPNOTSUPP);
1693
1694 if (sopt->sopt_name == TCP_CONGESTION) {
1695 char name[TCP_CA_NAME_MAX];
1696 int optlen = sopt->sopt_valsize;
1697 struct tcpcb *tp;
1698
1699 if (sopt->sopt_dir == SOPT_GET) {
1700 KASSERT(0, ("unimplemented"));
1701 return (EOPNOTSUPP);
1702 }
1703
1704 if (optlen < 1)
1705 return (EINVAL);
1706
1707 err = copyinstr(sopt->sopt_val, name,
1708 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1709 if (err)
1710 return (err);
1711 if (copied < 1)
1712 return (EINVAL);
1713
1714 tp = so_sototcpcb(so);
1715 /*
1716 * XXX I need to revisit this
1717 */
1718 if ((err = t3_set_cong_control(so, name)) == 0) {
1719#ifdef CONGESTION_CONTROL_SUPPORTED
1720 tp->t_cong_control = strdup(name, M_CXGB);
1721#endif
1722 } else
1723 return (err);
1724 } else {
1725 int optval, oldval;
1726 struct inpcb *inp;
1727 struct tcpcb *tp;
1728
1729 if (sopt->sopt_dir == SOPT_GET)
1730 return (EOPNOTSUPP);
1731
1732 err = sooptcopyin(sopt, &optval, sizeof optval,
1733 sizeof optval);
1734
1735 if (err)
1736 return (err);
1737
1738 inp = so_sotoinpcb(so);
1739 inp_wlock(inp);
1740 tp = inp_inpcbtotcpcb(inp);
1741
1742 oldval = tp->t_flags;
1743 if (optval)
1744 tp->t_flags |= TF_NODELAY;
1745 else
1746 tp->t_flags &= ~TF_NODELAY;
1747 inp_wunlock(inp);
1748
1749
1750 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1751 t3_set_nagle(tp->t_toe);
1752
1753 }
1754
1755 return (0);
1756}
1757
1758int
1759t3_ctloutput(struct socket *so, struct sockopt *sopt)
1760{
1761 int err;
1762
1763 if (sopt->sopt_level != IPPROTO_TCP)
1764 err = t3_ip_ctloutput(so, sopt);
1765 else
1766 err = t3_tcp_ctloutput(so, sopt);
1767
1768 if (err != EOPNOTSUPP)
1769 return (err);
1770
1771 return (tcp_ctloutput(so, sopt));
1772}
1773
1774/*
1775 * Returns true if we need to explicitly request RST when we receive new data
1776 * on an RX-closed connection.
1777 */
1778static inline int
1779need_rst_on_excess_rx(const struct toepcb *toep)
1780{
1781 return (1);
1782}
1783
1784/*
1785 * Handles Rx data that arrives in a state where the socket isn't accepting
1786 * new data.
1787 */
1788static void
1789handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1790{
1791
1792 if (need_rst_on_excess_rx(toep) &&
1793 !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1794 t3_send_reset(toep);
1795 m_freem(m);
1796}
1797
1798/*
1799 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1800 * by getting the DDP offset from the TCB.
1801 */
1802static void
1803tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1804{
1805 struct ddp_state *q = &toep->tp_ddp_state;
1806 struct ddp_buf_state *bsp;
1807 struct cpl_get_tcb_rpl *hdr;
1808 unsigned int ddp_offset;
1809 struct socket *so;
1810 struct tcpcb *tp;
1811 struct sockbuf *rcv;
1812 int state;
1813
1814 uint64_t t;
1815 __be64 *tcb;
1816
1817 tp = toep->tp_tp;
1818 so = inp_inpcbtosocket(tp->t_inpcb);
1819
1820 inp_lock_assert(tp->t_inpcb);
1821 rcv = so_sockbuf_rcv(so);
1822 sockbuf_lock(rcv);
1823
1824 /* Note that we only account for CPL_GET_TCB issued by the DDP code.
1825 * We really need a cookie in order to dispatch the RPLs.
1826 */
1827 q->get_tcb_count--;
1828
1829 /* It is possible that a previous CPL already invalidated UBUF DDP
1830 * and moved the cur_buf idx, and hence no further processing of this
1831 * mbuf is required. However, the app might be sleeping on
1832 * !q->get_tcb_count and we need to wake it up.
1833 */
1834 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1835 int state = so_state_get(so);
1836
1837 m_freem(m);
1838 if (__predict_true((state & SS_NOFDREF) == 0))
1839 so_sorwakeup_locked(so);
1840 else
1841 sockbuf_unlock(rcv);
1842
1843 return;
1844 }
1845
1846 bsp = &q->buf_state[q->cur_buf];
1847 hdr = cplhdr(m);
1848 tcb = (__be64 *)(hdr + 1);
1849 if (q->cur_buf == 0) {
1850 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1851 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1852 } else {
1853 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1854 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1855 }
1856 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1857 m->m_cur_offset = bsp->cur_offset;
1858 bsp->cur_offset = ddp_offset;
1859 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1860
1861 CTR5(KTR_TOM,
1862 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1863 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1864 KASSERT(ddp_offset >= m->m_cur_offset,
1865 ("ddp_offset=%u less than cur_offset=%u",
1866 ddp_offset, m->m_cur_offset));
1867
1868#if 0
1869{
1870 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1871
1872 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1873 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1874
1875 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1876 rcv_nxt = t >> S_TCB_RCV_NXT;
1877 rcv_nxt &= M_TCB_RCV_NXT;
1878
1879 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1880 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1881 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1882
1883 T3_TRACE2(TIDTB(sk),
1884 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1885 ddp_flags, rcv_nxt - rx_hdr_offset);
1886 T3_TRACE4(TB(q),
1887 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1888 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1889 T3_TRACE3(TB(q),
1890 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1891 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1892 T3_TRACE2(TB(q),
1893 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1894 q->buf_state[0].flags, q->buf_state[1].flags);
1895
1896}
1897#endif
1898 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1899 handle_excess_rx(toep, m);
1900 return;
1901 }
1902
1903#ifdef T3_TRACE
1904 if ((int)m->m_pkthdr.len < 0) {
1905 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1906 }
1907#endif
1908 if (bsp->flags & DDP_BF_NOCOPY) {
1909#ifdef T3_TRACE
1910 T3_TRACE0(TB(q),
1911 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1912
1913 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1914 printk("!cancel_ubuf");
1915 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1916 }
1917#endif
1918 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1919 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1920 q->cur_buf ^= 1;
1921 } else if (bsp->flags & DDP_BF_NOFLIP) {
1922
1923 m->m_ddp_flags = 1; /* always a kernel buffer */
1924
1925 /* now HW buffer carries a user buffer */
1926 bsp->flags &= ~DDP_BF_NOFLIP;
1927 bsp->flags |= DDP_BF_NOCOPY;
1928
1929 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1930 * any new data in which case we're done. If in addition the
1931 * offset is 0, then there wasn't a completion for the kbuf
1932 * and we need to decrement the posted count.
1933 */
1934 if (m->m_pkthdr.len == 0) {
1935 if (ddp_offset == 0) {
1936 q->kbuf_posted--;
1937 bsp->flags |= DDP_BF_NODATA;
1938 }
1939 sockbuf_unlock(rcv);
1940 m_free(m);
1941 return;
1942 }
1943 } else {
1944 sockbuf_unlock(rcv);
1945
1946 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1947 * but it got here way late and nobody cares anymore.
1948 */
1949 m_free(m);
1950 return;
1951 }
1952
1953 m->m_ddp_gl = (unsigned char *)bsp->gl;
1954 m->m_flags |= M_DDP;
1955 m->m_seq = tp->rcv_nxt;
1956 tp->rcv_nxt += m->m_pkthdr.len;
1957 tp->t_rcvtime = ticks;
1958 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1959 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1960 if (m->m_pkthdr.len == 0) {
1961 q->user_ddp_pending = 0;
1962 m_free(m);
1963 } else
1964 SBAPPEND(rcv, m);
1965
1966 state = so_state_get(so);
1967 if (__predict_true((state & SS_NOFDREF) == 0))
1968 so_sorwakeup_locked(so);
1969 else
1970 sockbuf_unlock(rcv);
1971}
1972
1973/*
1974 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1975 * in that case they are similar to DDP completions.
1976 */
1977static int
1978do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1979{
1980 struct toepcb *toep = (struct toepcb *)ctx;
1981
1982 /* OK if socket doesn't exist */
1983 if (toep == NULL) {
1984 printf("null toep in do_get_tcb_rpl\n");
1985 return (CPL_RET_BUF_DONE);
1986 }
1987
1988 inp_wlock(toep->tp_tp->t_inpcb);
1989 tcb_rpl_as_ddp_complete(toep, m);
1990 inp_wunlock(toep->tp_tp->t_inpcb);
1991
1992 return (0);
1993}
1994
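/*
 * Account for payload that the hardware placed directly into the current DDP
 * buffer: a gap between tp->rcv_nxt and the sequence number in this RX_DATA
 * header means that much data was DDPed, so turn the mbuf into an M_DDP
 * descriptor covering it.
 */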
1995static void
1996handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1997{
1998 struct tcpcb *tp = toep->tp_tp;
1999 struct socket *so;
2000 struct ddp_state *q;
2001 struct ddp_buf_state *bsp;
2002 struct cpl_rx_data *hdr = cplhdr(m);
2003 unsigned int rcv_nxt = ntohl(hdr->seq);
2004 struct sockbuf *rcv;
2005
2006 if (tp->rcv_nxt == rcv_nxt)
2007 return;
2008
2009 inp_lock_assert(tp->t_inpcb);
2010 so = inp_inpcbtosocket(tp->t_inpcb);
2011 rcv = so_sockbuf_rcv(so);
2012 sockbuf_lock(rcv);
2013
2014 q = &toep->tp_ddp_state;
2015 bsp = &q->buf_state[q->cur_buf];
2016 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x%08x",
2017 rcv_nxt, tp->rcv_nxt));
2018 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2019 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2020 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2021 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2022
2023#ifdef T3_TRACE
2024 if ((int)m->m_pkthdr.len < 0) {
2025 t3_ddp_error(so, "handle_ddp_data: neg len");
2026 }
2027#endif
2028 m->m_ddp_gl = (unsigned char *)bsp->gl;
2029 m->m_flags |= M_DDP;
2030 m->m_cur_offset = bsp->cur_offset;
2031 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2032 if (bsp->flags & DDP_BF_NOCOPY)
2033 bsp->flags &= ~DDP_BF_NOCOPY;
2034
2035 m->m_seq = tp->rcv_nxt;
2036 tp->rcv_nxt = rcv_nxt;
2037 bsp->cur_offset += m->m_pkthdr.len;
2038 if (!(bsp->flags & DDP_BF_NOFLIP))
2039 q->cur_buf ^= 1;
2040 /*
2041 * For now, don't re-enable DDP after a connection fell out of DDP
2042 * mode.
2043 */
2044 q->ubuf_ddp_ready = 0;
2045 sockbuf_unlock(rcv);
2046}
2047
2048/*
2049 * Process new data received for a connection.
2050 */
2051static void
2052new_rx_data(struct toepcb *toep, struct mbuf *m)
2053{
2054 struct cpl_rx_data *hdr = cplhdr(m);
2055 struct tcpcb *tp = toep->tp_tp;
2056 struct socket *so;
2057 struct sockbuf *rcv;
2058 int state;
2059 int len = be16toh(hdr->len);
2060
2061 inp_wlock(tp->t_inpcb);
2062
2063 so = inp_inpcbtosocket(tp->t_inpcb);
2064
2065 if (__predict_false(so_no_receive(so))) {
2066 handle_excess_rx(toep, m);
2067 inp_wunlock(tp->t_inpcb);
2068 TRACE_EXIT;
2069 return;
2070 }
2071
2072 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2073 handle_ddp_data(toep, m);
2074
2075 m->m_seq = ntohl(hdr->seq);
2076 m->m_ulp_mode = 0; /* for iSCSI */
2077
2078#if VALIDATE_SEQ
2079 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2080 log(LOG_ERR,
2081 "%s: TID %u: Bad sequence number %u, expected %u\n",
2082 toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2083 tp->rcv_nxt);
2084 m_freem(m);
2085 inp_wunlock(tp->t_inpcb);
2086 return;
2087 }
2088#endif
2089 m_adj(m, sizeof(*hdr));
2090
2091#ifdef URGENT_DATA_SUPPORTED
2092 /*
2093 * We don't handle urgent data yet
2094 */
2095 if (__predict_false(hdr->urg))
2096 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2097 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2098 tp->urg_seq - tp->rcv_nxt < skb->len))
2099 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2100 tp->rcv_nxt];
2101#endif
2102 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2103 toep->tp_delack_mode = hdr->dack_mode;
2104 toep->tp_delack_seq = tp->rcv_nxt;
2105 }
2106 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2107 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2108
2109 if (len < m->m_pkthdr.len)
2110 m->m_pkthdr.len = m->m_len = len;
2111
2112 tp->rcv_nxt += m->m_pkthdr.len;
2113 tp->t_rcvtime = ticks;
2114 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2115 CTR2(KTR_TOM,
2116 "new_rx_data: seq 0x%x len %u",
2117 m->m_seq, m->m_pkthdr.len);
2118 inp_wunlock(tp->t_inpcb);
2119 rcv = so_sockbuf_rcv(so);
2120 sockbuf_lock(rcv);
2121#if 0
2122 if (sb_notify(rcv))
2123 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2124#endif
2125 SBAPPEND(rcv, m);
2126
2127#ifdef notyet
2128 /*
2129 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2130 *
2131 */
2132 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2133
2134 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2135 so, rcv->sb_cc, rcv->sb_mbmax));
2136#endif
2137
2138
2139 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2140 rcv->sb_cc, rcv->sb_mbcnt);
2141
2142 state = so_state_get(so);
2143 if (__predict_true((state & SS_NOFDREF) == 0))
2144 so_sorwakeup_locked(so);
2145 else
2146 sockbuf_unlock(rcv);
2147}
2148
2149/*
2150 * Handler for RX_DATA CPL messages.
2151 */
2152static int
2153do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2154{
2155 struct toepcb *toep = (struct toepcb *)ctx;
2156
2157 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2158
2159 new_rx_data(toep, m);
2160
2161 return (0);
2162}
2163
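/*
 * Process new data received for a connection in DDP mode.  The payload has
 * already been placed in the posted DDP buffer by the hardware; here we only
 * build an mbuf describing the placement (buffer, offset, length) and append
 * it to the socket's receive buffer.
 */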
2164static void
2165new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2166{
2167 struct tcpcb *tp;
2168 struct ddp_state *q;
2169 struct ddp_buf_state *bsp;
2170 struct cpl_rx_data_ddp *hdr;
2171 struct socket *so;
2172 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2173 int nomoredata = 0;
2174 unsigned int delack_mode;
2175 struct sockbuf *rcv;
2176
2177 tp = toep->tp_tp;
2178 inp_wlock(tp->t_inpcb);
2179 so = inp_inpcbtosocket(tp->t_inpcb);
2180
2181 if (__predict_false(so_no_receive(so))) {
2182
2183 handle_excess_rx(toep, m);
2184 inp_wunlock(tp->t_inpcb);
2185 return;
2186 }
2187
2188 q = &toep->tp_ddp_state;
2189 hdr = cplhdr(m);
2190 ddp_report = ntohl(hdr->u.ddp_report);
2191 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2192 bsp = &q->buf_state[buf_idx];
2193
2194 CTR4(KTR_TOM,
2195 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2196 "hdr seq 0x%x len %u",
2197 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2198 ntohs(hdr->len));
2199 CTR3(KTR_TOM,
2200 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2201 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2202
2203 ddp_len = ntohs(hdr->len);
2204 rcv_nxt = ntohl(hdr->seq) + ddp_len;
2205
2206 delack_mode = G_DDP_DACK_MODE(ddp_report);
2207 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2208 toep->tp_delack_mode = delack_mode;
2209 toep->tp_delack_seq = tp->rcv_nxt;
2210 }
2211
2212 m->m_seq = tp->rcv_nxt;
2213 tp->rcv_nxt = rcv_nxt;
2214
2215 tp->t_rcvtime = ticks;
2216 /*
2217 * Store the length in m->m_len. We are changing the meaning of
2218 	 * m->m_len here; we need to be very careful that nothing from now on
2219 	 * interprets the length of this packet the usual way.
2220 */
2221 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2222 inp_wunlock(tp->t_inpcb);
2223 CTR3(KTR_TOM,
2224 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2225 m->m_len, rcv_nxt, m->m_seq);
2226 /*
2227 	 * Figure out where the new data was placed in the buffer and store
2228 	 * it in m_cur_offset.  Assumes the buffer offset starts at 0; the
2229 	 * consumer needs to account for the page pod's pg_offset.
2230 */
2231 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2232 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2233
2234 rcv = so_sockbuf_rcv(so);
2235 sockbuf_lock(rcv);
2236
2237 m->m_ddp_gl = (unsigned char *)bsp->gl;
2238 m->m_flags |= M_DDP;
2239 bsp->cur_offset = end_offset;
2240 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2241
2242 /*
2243 * Length is only meaningful for kbuf
2244 */
2245 if (!(bsp->flags & DDP_BF_NOCOPY))
2246 KASSERT(m->m_len <= bsp->gl->dgl_length,
2247 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2248 m->m_len, bsp->gl->dgl_length));
2249
2250 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2251 	KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
2252 /*
2253 * Bit 0 of flags stores whether the DDP buffer is completed.
2254 * Note that other parts of the code depend on this being in bit 0.
2255 */
2256 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2257 panic("spurious ddp completion");
2258 } else {
2259 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2260 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2261 q->cur_buf ^= 1; /* flip buffers */
2262 }
2263
2264 if (bsp->flags & DDP_BF_NOCOPY) {
2265 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2266 bsp->flags &= ~DDP_BF_NOCOPY;
2267 }
2268
2269 if (ddp_report & F_DDP_PSH)
2270 m->m_ddp_flags |= DDP_BF_PSH;
2271 if (nomoredata)
2272 m->m_ddp_flags |= DDP_BF_NODATA;
2273
2274#ifdef notyet
2275 skb_reset_transport_header(skb);
2276 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
2277#endif
2278 SBAPPEND(rcv, m);
2279
2280 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2281 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2282 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2283 so_sorwakeup_locked(so);
2284 else
2285 sockbuf_unlock(rcv);
2286}
2287
2288#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2289 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2290 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2291 F_DDP_INVALID_PPOD)
2292
2293/*
2294 * Handler for RX_DATA_DDP CPL messages.
2295 */
2296static int
2297do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2298{
2299 struct toepcb *toep = ctx;
2300 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2301
2302 VALIDATE_SOCK(so);
2303
2304 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2305 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2306 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2307 return (CPL_RET_BUF_DONE);
2308 }
2309#if 0
2310 skb->h.th = tcphdr_skb->h.th;
2311#endif
2312 new_rx_data_ddp(toep, m);
2313 return (0);
2314}
2315
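/*
 * Process notification that a posted DDP buffer has been completed.  Compute
 * how much data the hardware placed since the last report, advance rcv_nxt,
 * and append an mbuf describing the placement to the receive buffer.
 */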
2316static void
2317process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2318{
2319 struct tcpcb *tp = toep->tp_tp;
2320 struct socket *so;
2321 struct ddp_state *q;
2322 struct ddp_buf_state *bsp;
2323 struct cpl_rx_ddp_complete *hdr;
2324 unsigned int ddp_report, buf_idx, when, delack_mode;
2325 int nomoredata = 0;
2326 struct sockbuf *rcv;
2327
2328 inp_wlock(tp->t_inpcb);
2329 so = inp_inpcbtosocket(tp->t_inpcb);
2330
2331 if (__predict_false(so_no_receive(so))) {
2332 struct inpcb *inp = so_sotoinpcb(so);
2333
2334 handle_excess_rx(toep, m);
2335 inp_wunlock(inp);
2336 return;
2337 }
2338 q = &toep->tp_ddp_state;
2339 hdr = cplhdr(m);
2340 ddp_report = ntohl(hdr->ddp_report);
2341 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2342 m->m_pkthdr.csum_data = tp->rcv_nxt;
2343
2344 rcv = so_sockbuf_rcv(so);
2345 sockbuf_lock(rcv);
2346
2347 bsp = &q->buf_state[buf_idx];
2348 when = bsp->cur_offset;
2349 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2350 tp->rcv_nxt += m->m_len;
2351 tp->t_rcvtime = ticks;
2352
2353 delack_mode = G_DDP_DACK_MODE(ddp_report);
2354 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2355 toep->tp_delack_mode = delack_mode;
2356 toep->tp_delack_seq = tp->rcv_nxt;
2357 }
2358#ifdef notyet
2359 skb_reset_transport_header(skb);
2360 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2361#endif
2362 inp_wunlock(tp->t_inpcb);
2363
2364 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2365 CTR5(KTR_TOM,
2366 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2367 "ddp_report 0x%x offset %u, len %u",
2368 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2369 G_DDP_OFFSET(ddp_report), m->m_len);
2370
2371 m->m_cur_offset = bsp->cur_offset;
2372 bsp->cur_offset += m->m_len;
2373
2374 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2375 q->cur_buf ^= 1; /* flip buffers */
2376 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2377 			nomoredata = 1;
2378 }
2379
2380 CTR4(KTR_TOM,
2381 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2382 "ddp_report %u offset %u",
2383 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2384 G_DDP_OFFSET(ddp_report));
2385
2386 m->m_ddp_gl = (unsigned char *)bsp->gl;
2387 m->m_flags |= M_DDP;
2388 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2389 if (bsp->flags & DDP_BF_NOCOPY)
2390 bsp->flags &= ~DDP_BF_NOCOPY;
2391 if (nomoredata)
2392 m->m_ddp_flags |= DDP_BF_NODATA;
2393
2394 SBAPPEND(rcv, m);
2395 if ((so_state_get(so) & SS_NOFDREF) == 0)
2396 so_sorwakeup_locked(so);
2397 else
2398 sockbuf_unlock(rcv);
2399}
2400
2401/*
2402 * Handler for RX_DDP_COMPLETE CPL messages.
2403 */
2404static int
2405do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2406{
2407 struct toepcb *toep = ctx;
2408
2409 VALIDATE_SOCK(so);
2410#if 0
2411 skb->h.th = tcphdr_skb->h.th;
2412#endif
2413 process_ddp_complete(toep, m);
2414 return (0);
2415}
2416
2417/*
2418 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2419 * socket state before calling tcp_time_wait to comply with its expectations.
2420 */
2421static void
2422enter_timewait(struct tcpcb *tp)
2423{
2424 /*
2425 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2426 * process peer_close because we don't want to carry the peer FIN in
2427 * the socket's receive queue and if we increment rcv_nxt without
2428 * having the FIN in the receive queue we'll confuse facilities such
2429 * as SIOCINQ.
2430 */
2431 inp_wlock(tp->t_inpcb);
2432 tp->rcv_nxt++;
2433
2434 tp->ts_recent_age = 0; /* defeat recycling */
2435 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2436 inp_wunlock(tp->t_inpcb);
2437 tcp_offload_twstart(tp);
2438}
2439
2440/*
2441 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2442 * function deals with the data that may be reported along with the FIN.
2443 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2444 * perform normal FIN-related processing. In the latter case 1 indicates that
2445 * there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
2446 * that the mbuf can be freed.
2447 */
2448static int
2449handle_peer_close_data(struct socket *so, struct mbuf *m)
2450{
2451 struct tcpcb *tp = so_sototcpcb(so);
2452 struct toepcb *toep = tp->t_toe;
2453 struct ddp_state *q;
2454 struct ddp_buf_state *bsp;
2455 struct cpl_peer_close *req = cplhdr(m);
2456 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2457 struct sockbuf *rcv;
2458
2459 if (tp->rcv_nxt == rcv_nxt) /* no data */
2460 return (0);
2461
2462 CTR0(KTR_TOM, "handle_peer_close_data");
2463 if (__predict_false(so_no_receive(so))) {
2464 handle_excess_rx(toep, m);
2465
2466 /*
2467 * Although we discard the data we want to process the FIN so
2468 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2469 * PEER_CLOSE without data. In particular this PEER_CLOSE
2470 * may be what will close the connection. We return 1 because
2471 * handle_excess_rx() already freed the packet.
2472 */
2473 return (1);
2474 }
2475
2476 inp_lock_assert(tp->t_inpcb);
2477 q = &toep->tp_ddp_state;
2478 rcv = so_sockbuf_rcv(so);
2479 sockbuf_lock(rcv);
2480
2481 bsp = &q->buf_state[q->cur_buf];
2482 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2483 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2484 m->m_ddp_gl = (unsigned char *)bsp->gl;
2485 m->m_flags |= M_DDP;
2486 m->m_cur_offset = bsp->cur_offset;
2487 m->m_ddp_flags =
2488 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2489 m->m_seq = tp->rcv_nxt;
2490 tp->rcv_nxt = rcv_nxt;
2491 bsp->cur_offset += m->m_pkthdr.len;
2492 if (!(bsp->flags & DDP_BF_NOFLIP))
2493 q->cur_buf ^= 1;
2494#ifdef notyet
2495 skb_reset_transport_header(skb);
2496 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2497#endif
2498 tp->t_rcvtime = ticks;
2499 SBAPPEND(rcv, m);
2500 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2501 so_sorwakeup_locked(so);
2502 else
2503 sockbuf_unlock(rcv);
2504
2505 return (1);
2506}
2507
2508/*
2509 * Handle a peer FIN.
2510 */
2511static void
2512do_peer_fin(struct toepcb *toep, struct mbuf *m)
2513{
2514 struct socket *so;
2515 struct tcpcb *tp = toep->tp_tp;
2516 int keep, action;
2517
2518 action = keep = 0;
2519 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2520 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2521 printf("abort_pending set\n");
2522
2523 goto out;
2524 }
2525 inp_wlock(tp->t_inpcb);
2526 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2527 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2528 keep = handle_peer_close_data(so, m);
2529 if (keep < 0) {
2530 inp_wunlock(tp->t_inpcb);
2531 return;
2532 }
2533 }
2534 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2535 CTR1(KTR_TOM,
2536 "waking up waiters for cantrcvmore on %p ", so);
2537 socantrcvmore(so);
2538
2539 /*
2540 * If connection is half-synchronized
2541 * (ie NEEDSYN flag on) then delay ACK,
2542 * so it may be piggybacked when SYN is sent.
2543 * Otherwise, since we received a FIN then no
2544 * more input can be expected, send ACK now.
2545 */
2546 if (tp->t_flags & TF_NEEDSYN)
2547 tp->t_flags |= TF_DELACK;
2548 else
2549 tp->t_flags |= TF_ACKNOW;
2550 tp->rcv_nxt++;
2551 }
2552
2553 switch (tp->t_state) {
2554 case TCPS_SYN_RECEIVED:
2555 tp->t_starttime = ticks;
2556 /* FALLTHROUGH */
2557 case TCPS_ESTABLISHED:
2558 tp->t_state = TCPS_CLOSE_WAIT;
2559 break;
2560 case TCPS_FIN_WAIT_1:
2561 tp->t_state = TCPS_CLOSING;
2562 break;
2563 case TCPS_FIN_WAIT_2:
2564 /*
2565 * If we've sent an abort_req we must have sent it too late,
2566 * HW will send us a reply telling us so, and this peer_close
2567 * is really the last message for this connection and needs to
2568 * be treated as an abort_rpl, i.e., transition the connection
2569 * to TCP_CLOSE (note that the host stack does this at the
2570 * time of generating the RST but we must wait for HW).
2571 * Otherwise we enter TIME_WAIT.
2572 */
2573 t3_release_offload_resources(toep);
2574 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2575 action = TCP_CLOSE;
2576 } else {
2577 action = TCP_TIMEWAIT;
2578 }
2579 break;
2580 default:
2581 log(LOG_ERR,
2582 "%s: TID %u received PEER_CLOSE in bad state %d\n",
2583 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2584 }
2585 inp_wunlock(tp->t_inpcb);
2586
2587 if (action == TCP_TIMEWAIT) {
2588 enter_timewait(tp);
2589 } else if (action == TCP_DROP) {
2590 tcp_offload_drop(tp, 0);
2591 } else if (action == TCP_CLOSE) {
2592 tcp_offload_close(tp);
2593 }
2594
2595#ifdef notyet
2596 /* Do not send POLL_HUP for half duplex close. */
2597 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2598 sk->sk_state == TCP_CLOSE)
2599 sk_wake_async(so, 1, POLL_HUP);
2600 else
2601 sk_wake_async(so, 1, POLL_IN);
2602#endif
2603
2604out:
2605 if (!keep)
2606 m_free(m);
2607}
2608
2609/*
2610 * Handler for PEER_CLOSE CPL messages.
2611 */
2612static int
2613do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2614{
2615 struct toepcb *toep = (struct toepcb *)ctx;
2616
2617 VALIDATE_SOCK(so);
2618
2619 do_peer_fin(toep, m);
2620 return (0);
2621}
2622
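/*
 * Process a CLOSE_CON_RPL message, i.e., the hardware's acknowledgment of
 * our FIN, and advance the connection state machine accordingly.
 */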
2623static void
2624process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2625{
2626 struct cpl_close_con_rpl *rpl = cplhdr(m);
2627 struct tcpcb *tp = toep->tp_tp;
2628 struct socket *so;
2629 int action = 0;
2630 struct sockbuf *rcv;
2631
2632 inp_wlock(tp->t_inpcb);
2633 so = inp_inpcbtosocket(tp->t_inpcb);
2634
2635 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
2636
2637 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2638 inp_wunlock(tp->t_inpcb);
2639 goto out;
2640 }
2641
2642 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2643 tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2644
2645 switch (tp->t_state) {
2646 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2647 t3_release_offload_resources(toep);
2648 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2649 action = TCP_CLOSE;
2650
2651 } else {
2652 action = TCP_TIMEWAIT;
2653 }
2654 break;
2655 case TCPS_LAST_ACK:
2656 /*
2657 * In this state we don't care about pending abort_rpl.
2658 * If we've sent abort_req it was post-close and was sent too
2659 * late, this close_con_rpl is the actual last message.
2660 */
2661 t3_release_offload_resources(toep);
2662 action = TCP_CLOSE;
2663 break;
2664 case TCPS_FIN_WAIT_1:
2665 /*
2666 * If we can't receive any more
2667 * data, then closing user can proceed.
2668 * Starting the timer is contrary to the
2669 * specification, but if we don't get a FIN
2670 * we'll hang forever.
2671 *
2672 * XXXjl:
2673 * we should release the tp also, and use a
2674 * compressed state.
2675 */
2676 if (so)
2677 rcv = so_sockbuf_rcv(so);
2678 else
2679 break;
2680
2681 if (rcv->sb_state & SBS_CANTRCVMORE) {
2682 int timeout;
2683
2684 if (so)
2685 soisdisconnected(so);
2686 timeout = (tcp_fast_finwait2_recycle) ?
2687 tcp_finwait2_timeout : tcp_maxidle;
2688 tcp_timer_activate(tp, TT_2MSL, timeout);
2689 }
2690 tp->t_state = TCPS_FIN_WAIT_2;
2691 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2692 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2693 action = TCP_DROP;
2694 }
2695
2696 break;
2697 default:
2698 log(LOG_ERR,
2699 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2700 toep->tp_toedev->tod_name, toep->tp_tid,
2701 tp->t_state);
2702 }
2703 inp_wunlock(tp->t_inpcb);
2704
2705
2706 if (action == TCP_TIMEWAIT) {
2707 enter_timewait(tp);
2708 } else if (action == TCP_DROP) {
2709 tcp_offload_drop(tp, 0);
2710 } else if (action == TCP_CLOSE) {
2711 tcp_offload_close(tp);
2712 }
2713out:
2714 m_freem(m);
2715}
2716
2717/*
2718 * Handler for CLOSE_CON_RPL CPL messages.
2719 */
2720static int
2721do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2722 void *ctx)
2723{
2724 struct toepcb *toep = (struct toepcb *)ctx;
2725
2726 process_close_con_rpl(toep, m);
2727 return (0);
2728}
2729
2730/*
2731 * Process abort replies. We only process these messages if we anticipate
2732 * them as the coordination between SW and HW in this area is somewhat lacking
2733 * and sometimes we get ABORT_RPLs after we are done with the connection that
2734 * originated the ABORT_REQ.
2735 */
2736static void
2737process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2738{
2739 struct tcpcb *tp = toep->tp_tp;
2740 struct socket *so;
2741 int needclose = 0;
2742
2743#ifdef T3_TRACE
2744 T3_TRACE1(TIDTB(sk),
2745 "process_abort_rpl: GTS rpl pending %d",
2746 sock_flag(sk, ABORT_RPL_PENDING));
2747#endif
2748
2749 inp_wlock(tp->t_inpcb);
2750 so = inp_inpcbtosocket(tp->t_inpcb);
2751
2752 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2753 /*
2754 * XXX panic on tcpdrop
2755 */
2756 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2757 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2758 else {
2759 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2760 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2761 !is_t3a(toep->tp_toedev)) {
2762 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2763 panic("TP_ABORT_REQ_RCVD set");
2764 t3_release_offload_resources(toep);
2765 needclose = 1;
2766 }
2767 }
2768 }
2769 inp_wunlock(tp->t_inpcb);
2770
2771 if (needclose)
2772 tcp_offload_close(tp);
2773
2774 m_free(m);
2775}
2776
2777/*
2778 * Handle an ABORT_RPL_RSS CPL message.
2779 */
2780static int
2781do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2782{
2783 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2784 struct toepcb *toep;
2785
2786 /*
2787 * Ignore replies to post-close aborts indicating that the abort was
2788 * requested too late. These connections are terminated when we get
2789 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2790 * arrives the TID is either no longer used or it has been recycled.
2791 */
2792 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2793discard:
2794 m_free(m);
2795 return (0);
2796 }
2797
2798 toep = (struct toepcb *)ctx;
2799
2800 /*
2801 * Sometimes we've already closed the socket, e.g., a post-close
2802 * abort races with ABORT_REQ_RSS, the latter frees the socket
2803 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2804 * but FW turns the ABORT_REQ into a regular one and so we get
2805 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
2806 */
2807 if (!toep)
2808 goto discard;
2809
2810 if (toep->tp_tp == NULL) {
2811 log(LOG_NOTICE, "removing tid for abort\n");
2812 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2813 if (toep->tp_l2t)
2814 l2t_release(L2DATA(cdev), toep->tp_l2t);
2815
2816 toepcb_release(toep);
2817 goto discard;
2818 }
2819
2820 log(LOG_NOTICE, "toep=%p\n", toep);
2821 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2822
2823 toepcb_hold(toep);
2824 process_abort_rpl(toep, m);
2825 toepcb_release(toep);
2826 return (0);
2827}
2828
2829/*
2830 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2831 * indicate whether RST should be sent in response.
2832 */
2833static int
2834abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2835{
2836 struct tcpcb *tp = so_sototcpcb(so);
2837
2838 switch (abort_reason) {
2839 case CPL_ERR_BAD_SYN:
2840#if 0
2841 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2842#endif
2843 case CPL_ERR_CONN_RESET:
2844 // XXX need to handle SYN_RECV due to crossed SYNs
2845 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2846 case CPL_ERR_XMIT_TIMEDOUT:
2847 case CPL_ERR_PERSIST_TIMEDOUT:
2848 case CPL_ERR_FINWAIT2_TIMEDOUT:
2849 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2850#if 0
2851 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2852#endif
2853 return (ETIMEDOUT);
2854 default:
2855 return (EIO);
2856 }
2857}
2858
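/*
 * Fill in an ABORT_RPL work request for the given TID and abort command.
 */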
2859static inline void
2860set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2861{
2862 struct cpl_abort_rpl *rpl = cplhdr(m);
2863
2864 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2865 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2866 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2867
2868 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2869 rpl->cmd = cmd;
2870}
2871
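/*
 * Deferred-reply handler for ABORT_REQ_RSS: sends the ABORT_RPL that could
 * not be sent earlier because mbuf allocation failed.  The reply status was
 * stashed in req->status by send_abort_rpl().
 */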
2872static void
2873send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2874{
2875 struct mbuf *reply_mbuf;
2876 struct cpl_abort_req_rss *req = cplhdr(m);
2877
2878 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2879 m_set_priority(m, CPL_PRIORITY_DATA);
2880 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2881 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2882 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2883 m_free(m);
2884}
2885
2886/*
2887 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2888 */
2889static inline int
2890is_neg_adv_abort(unsigned int status)
2891{
2892 return status == CPL_ERR_RTX_NEG_ADVICE ||
2893 status == CPL_ERR_PERSIST_NEG_ADVICE;
2894}
2895
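/*
 * Reply to an ABORT_REQ_RSS with an ABORT_RPL carrying rst_status.  If no
 * mbuf is available the reply is deferred to process context.
 */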
2896static void
2897send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2898{
2899 struct mbuf *reply_mbuf;
2900 struct cpl_abort_req_rss *req = cplhdr(m);
2901
2902 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2903
2904 if (!reply_mbuf) {
2905 /* Defer the reply. Stick rst_status into req->cmd. */
2906 req->status = rst_status;
2907 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2908 return;
2909 }
2910
2911 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2912 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2913 m_free(m);
2914
2915 /*
2916 * XXX need to sync with ARP as for SYN_RECV connections we can send
2917 * these messages while ARP is pending. For other connection states
2918 * it's not a problem.
2919 */
2920 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2921}
2922
2923#ifdef notyet
2924static void
2925cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2926{
2927 CXGB_UNIMPLEMENTED();
2928#ifdef notyet
2929 struct request_sock *req = child->sk_user_data;
2930
2931 inet_csk_reqsk_queue_removed(parent, req);
2932 synq_remove(tcp_sk(child));
2933 __reqsk_free(req);
2934 child->sk_user_data = NULL;
2935#endif
2936}
2937
2938
2939/*
2940 * Performs the actual work to abort a SYN_RECV connection.
2941 */
2942static void
2943do_abort_syn_rcv(struct socket *child, struct socket *parent)
2944{
2945 struct tcpcb *parenttp = so_sototcpcb(parent);
2946 struct tcpcb *childtp = so_sototcpcb(child);
2947
2948 /*
2949 * If the server is still open we clean up the child connection,
2950 * otherwise the server already did the clean up as it was purging
2951 * its SYN queue and the skb was just sitting in its backlog.
2952 */
2953 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2954 cleanup_syn_rcv_conn(child, parent);
2955 inp_wlock(childtp->t_inpcb);
2956 t3_release_offload_resources(childtp->t_toe);
2957 inp_wunlock(childtp->t_inpcb);
2958 tcp_offload_close(childtp);
2959 }
2960}
2961#endif
2962
2963/*
2964 * Handle abort requests for a SYN_RECV connection. These need extra work
2965 * because the socket is on its parent's SYN queue.
2966 */
2967static int
2968abort_syn_rcv(struct socket *so, struct mbuf *m)
2969{
2970 CXGB_UNIMPLEMENTED();
2971#ifdef notyet
2972 struct socket *parent;
2973 struct toedev *tdev = toep->tp_toedev;
2974 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2975 struct socket *oreq = so->so_incomp;
2976 struct t3c_tid_entry *t3c_stid;
2977 struct tid_info *t;
2978
2979 if (!oreq)
2980 return -1; /* somehow we are not on the SYN queue */
2981
2982 t = &(T3C_DATA(cdev))->tid_maps;
2983 t3c_stid = lookup_stid(t, oreq->ts_recent);
2984 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2985
2986 so_lock(parent);
2987 do_abort_syn_rcv(so, parent);
2988 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2989 so_unlock(parent);
2990#endif
2991 return (0);
2992}
2993
2994/*
2995 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
2996 * request except that we need to reply to it.
2997 */
2998static void
2999process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3000{
3001 int rst_status = CPL_ABORT_NO_RST;
3002 const struct cpl_abort_req_rss *req = cplhdr(m);
3003 struct tcpcb *tp = toep->tp_tp;
3004 struct socket *so;
3005 int needclose = 0;
3006
3007 inp_wlock(tp->t_inpcb);
3008 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3009 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3010 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3011 m_free(m);
3012 goto skip;
3013 }
3014
3015 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3016 /*
3017 * Three cases to consider:
3018 * a) We haven't sent an abort_req; close the connection.
3019 * b) We have sent a post-close abort_req that will get to TP too late
3020 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
3021 * be ignored and the connection should be closed now.
3022 * c) We have sent a regular abort_req that will get to TP too late.
3023 * That will generate an abort_rpl with status 0, wait for it.
3024 */
3025 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3026 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3027 int error;
3028
3029 error = abort_status_to_errno(so, req->status,
3030 &rst_status);
3031 so_error_set(so, error);
3032
3033 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3034 so_sorwakeup(so);
3035 /*
3036 * SYN_RECV needs special processing. If abort_syn_rcv()
3037 		 * returns 0 it has taken care of the abort.
3038 */
3039 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3040 goto skip;
3041
3042 t3_release_offload_resources(toep);
3043 needclose = 1;
3044 }
3045 inp_wunlock(tp->t_inpcb);
3046
3047 if (needclose)
3048 tcp_offload_close(tp);
3049
3050 send_abort_rpl(m, tdev, rst_status);
3051 return;
3052skip:
3053 inp_wunlock(tp->t_inpcb);
3054}
3055
3056/*
3057 * Handle an ABORT_REQ_RSS CPL message.
3058 */
3059static int
3060do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3061{
3062 const struct cpl_abort_req_rss *req = cplhdr(m);
3063 struct toepcb *toep = (struct toepcb *)ctx;
3064
3065 if (is_neg_adv_abort(req->status)) {
3066 m_free(m);
3067 return (0);
3068 }
3069
3070 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3071
3072 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3073 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3074 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3075
3076 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3077 if (toep->tp_l2t)
3078 l2t_release(L2DATA(cdev), toep->tp_l2t);
3079
3080 /*
3081 * Unhook
3082 */
3083 toep->tp_tp->t_toe = NULL;
3084 toep->tp_tp->t_flags &= ~TF_TOE;
3085 toep->tp_tp = NULL;
3086 /*
3087 * XXX need to call syncache_chkrst - but we don't
3088 * have a way of doing that yet
3089 */
3090 toepcb_release(toep);
3091 log(LOG_ERR, "abort for unestablished connection :-(\n");
3092 return (0);
3093 }
3094 if (toep->tp_tp == NULL) {
3095 log(LOG_NOTICE, "disconnected toepcb\n");
3096 /* should be freed momentarily */
3097 return (0);
3098 }
3099
3100
3101 toepcb_hold(toep);
3102 process_abort_req(toep, m, toep->tp_toedev);
3103 toepcb_release(toep);
3104 return (0);
3105}
3106#ifdef notyet
3107static void
3108pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3109{
3110 struct toedev *tdev = TOE_DEV(parent);
3111
3112 do_abort_syn_rcv(child, parent);
3113 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3114 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3115
3116 rpl->opt0h = htonl(F_TCAM_BYPASS);
3117 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3118 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3119 } else
3120 m_free(m);
3121}
3122#endif
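/*
 * Handle an ARP failure for a passive open request.  Not implemented yet.
 */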
3123static void
3124handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3125{
3126 CXGB_UNIMPLEMENTED();
3127
3128#ifdef notyet
3129 struct t3cdev *cdev;
3130 struct socket *parent;
3131 struct socket *oreq;
3132 struct t3c_tid_entry *t3c_stid;
3133 struct tid_info *t;
3134 struct tcpcb *otp, *tp = so_sototcpcb(so);
3135 struct toepcb *toep = tp->t_toe;
3136
3137 /*
3138 * If the connection is being aborted due to the parent listening
3139 * socket going away there's nothing to do, the ABORT_REQ will close
3140 * the connection.
3141 */
3142 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3143 m_free(m);
3144 return;
3145 }
3146
3147 oreq = so->so_incomp;
3148 otp = so_sototcpcb(oreq);
3149
3150 cdev = T3C_DEV(so);
3151 t = &(T3C_DATA(cdev))->tid_maps;
3152 t3c_stid = lookup_stid(t, otp->ts_recent);
3153 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3154
3155 so_lock(parent);
3156 pass_open_abort(so, parent, m);
3157 so_unlock(parent);
3158#endif
3159}
3160
3161/*
3162 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3163 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3164 * connection.
3165 */
3166static void
3167pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3168{
3169
3170#ifdef notyet
3171 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3172 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3173#endif
3174 handle_pass_open_arp_failure(m_get_socket(m), m);
3175}
3176
3177/*
3178 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3179 */
3180static void
3181mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3182{
3183 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3184 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3185 unsigned int tid = GET_TID(req);
3186
3187 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3188 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3189 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3190 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
3191 rpl->opt0h = htonl(F_TCAM_BYPASS);
3192 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3193 rpl->opt2 = 0;
3194 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3195}
3196
3197/*
3198 * Send a deferred reject to an accept request.
3199 */
3200static void
3201reject_pass_request(struct toedev *tdev, struct mbuf *m)
3202{
3203 struct mbuf *reply_mbuf;
3204
3205 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3206 mk_pass_accept_rpl(reply_mbuf, m);
3207 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3208 m_free(m);
3209}
3210
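/*
 * Callback from the syncache reporting the fate of an entry we added for an
 * embryonic connection; in either case we drop the toepcb reference that was
 * taken when the entry was created.
 */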
3211static void
3212handle_syncache_event(int event, void *arg)
3213{
3214 struct toepcb *toep = arg;
3215
3216 switch (event) {
3217 case TOE_SC_ENTRY_PRESENT:
3218 /*
3219 * entry already exists - free toepcb
3220 * and l2t
3221 */
3222 printf("syncache entry present\n");
3223 toepcb_release(toep);
3224 break;
3225 case TOE_SC_DROP:
3226 /*
3227 * The syncache has given up on this entry
3228 * either it timed out, or it was evicted
3229 * we need to explicitly release the tid
3230 */
3231 printf("syncache entry dropped\n");
3232 toepcb_release(toep);
3233 break;
3234 default:
3235 log(LOG_ERR, "unknown syncache event %d\n", event);
3236 break;
3237 }
3238}
3239
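/*
 * Translate the connection info and TCP options carried by a PASS_ACCEPT_REQ
 * into FreeBSD form and enter the embryonic connection into the syncache.
 */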
3240static void
3241syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3242{
3243 struct in_conninfo inc;
3244 struct toeopt toeo;
3245 struct tcphdr th;
3246 struct inpcb *inp;
3247 int mss, wsf, sack, ts;
3248 uint32_t rcv_isn = ntohl(req->rcv_isn);
3249
3250 bzero(&toeo, sizeof(struct toeopt));
3251 inp = so_sotoinpcb(lso);
3252
3253 /*
3254 * Fill out information for entering us into the syncache
3255 */
3256 bzero(&inc, sizeof(inc));
3257 inc.inc_fport = th.th_sport = req->peer_port;
3258 inc.inc_lport = th.th_dport = req->local_port;
3259 th.th_seq = req->rcv_isn;
3260 th.th_flags = TH_SYN;
3261
3262 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3263
3264 inc.inc_len = 0;
3265 inc.inc_faddr.s_addr = req->peer_ip;
3266 inc.inc_laddr.s_addr = req->local_ip;
3267
3268 DPRINTF("syncache add of %d:%d %d:%d\n",
3269 ntohl(req->local_ip), ntohs(req->local_port),
3270 ntohl(req->peer_ip), ntohs(req->peer_port));
3271
3272 mss = req->tcp_options.mss;
3273 wsf = req->tcp_options.wsf;
3274 ts = req->tcp_options.tstamp;
3275 sack = req->tcp_options.sack;
3276 toeo.to_mss = mss;
3277 toeo.to_wscale = wsf;
3278 toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3279 tcp_offload_syncache_add(&inc, &toeo, &th, inp, &lso, &cxgb_toe_usrreqs,
3280	    toep);
3281}
3282
3283
3284/*
3285 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3286 * lock held. Note that the sock here is a listening socket that is not owned
3287 * by the TOE.
3288 */
3289static void
3290process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3291 struct listen_ctx *lctx)
3292{
3293 int rt_flags;
3294 struct l2t_entry *e;
3295 struct iff_mac tim;
3296 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3297 struct cpl_pass_accept_rpl *rpl;
3298 struct cpl_pass_accept_req *req = cplhdr(m);
3299 unsigned int tid = GET_TID(req);
3300 struct tom_data *d = TOM_DATA(tdev);
3301 struct t3cdev *cdev = d->cdev;
3302 struct tcpcb *tp = so_sototcpcb(so);
3303 struct toepcb *newtoep;
3304 struct rtentry *dst;
3305 struct sockaddr_in nam;
3306 struct t3c_data *td = T3C_DATA(cdev);
3307
3308 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3309 if (__predict_false(reply_mbuf == NULL)) {
3310 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3311 t3_defer_reply(m, tdev, reject_pass_request);
3312 else {
3313 cxgb_queue_tid_release(cdev, tid);
3314 m_free(m);
3315 }
3316 DPRINTF("failed to get reply_mbuf\n");
3317
3318 goto out;
3319 }
3320
3321 if (tp->t_state != TCPS_LISTEN) {
3322 DPRINTF("socket not in listen state\n");
3323
3324 goto reject;
3325 }
3326
3327 tim.mac_addr = req->dst_mac;
3328 tim.vlan_tag = ntohs(req->vlan_tag);
3329 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3330 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3331 goto reject;
3332 }
3333
3334#ifdef notyet
3335 /*
3336 * XXX do route lookup to confirm that we're still listening on this
3337 * address
3338 */
3339 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3340 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3341 goto reject;
3342 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3343 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3344 dst_release(skb->dst); // done with the input route, release it
3345 skb->dst = NULL;
3346
3347 if ((rt_flags & RTF_LOCAL) == 0)
3348 goto reject;
3349#endif
3350 /*
3351 * XXX
3352 */
3353 rt_flags = RTF_LOCAL;
3354 if ((rt_flags & RTF_LOCAL) == 0)
3355 goto reject;
3356
3357 /*
3358 * Calculate values and add to syncache
3359 */
3360
3361 newtoep = toepcb_alloc();
3362 if (newtoep == NULL)
3363 goto reject;
3364
3365 bzero(&nam, sizeof(struct sockaddr_in));
3366
3367 nam.sin_len = sizeof(struct sockaddr_in);
3368 nam.sin_family = AF_INET;
3369 	nam.sin_addr.s_addr = req->peer_ip;
3370 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3371
3372 if (dst == NULL) {
3373 printf("failed to find route\n");
3374 goto reject;
3375 }
3376 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3377 (struct sockaddr *)&nam);
3378 if (e == NULL) {
3379 DPRINTF("failed to get l2t\n");
3380 }
3381 /*
3382 * Point to our listen socket until accept
3383 */
3384 newtoep->tp_tp = tp;
3385 newtoep->tp_flags = TP_SYN_RCVD;
3386 newtoep->tp_tid = tid;
3387 newtoep->tp_toedev = tdev;
3388 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3389
3390 cxgb_insert_tid(cdev, d->client, newtoep, tid);
3391 so_lock(so);
3392 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3393 so_unlock(so);
3394
3395 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3396 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3397
3398 if (newtoep->tp_ulp_mode) {
3399 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3400
3401 if (ddp_mbuf == NULL)
3402 newtoep->tp_ulp_mode = 0;
3403 }
3404
3405 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3406 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3407 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3408 /*
3409 * XXX workaround for lack of syncache drop
3410 */
3411 toepcb_hold(newtoep);
3412 syncache_add_accept_req(req, so, newtoep);
3413
3414 rpl = cplhdr(reply_mbuf);
3415 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3416 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3417 rpl->wr.wr_lo = 0;
3418 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3419 rpl->opt2 = htonl(calc_opt2(so, tdev));
3420 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3421 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3422
3423 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3424 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3425 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3426 CPL_PASS_OPEN_ACCEPT);
3427
3428 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3429
3430 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3431
3432 l2t_send(cdev, reply_mbuf, e);
3433 m_free(m);
3434 if (newtoep->tp_ulp_mode) {
3435 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3436 V_TF_DDP_OFF(1) |
3437 TP_DDP_TIMER_WORKAROUND_MASK,
3438 V_TF_DDP_OFF(1) |
3439 TP_DDP_TIMER_WORKAROUND_VAL, 1);
3440 } else
3441 DPRINTF("no DDP\n");
3442
3443 return;
3444reject:
3445 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3446 mk_pass_accept_rpl(reply_mbuf, m);
3447 else
3448 mk_tid_release(reply_mbuf, newtoep, tid);
3449 cxgb_ofld_send(cdev, reply_mbuf);
3450 m_free(m);
3451out:
3452#if 0
3453 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3454#else
3455 return;
3456#endif
3457}
3458
3459/*
3460 * Handle a CPL_PASS_ACCEPT_REQ message.
3461 */
3462static int
3463do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3464{
3465 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3466 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3467 struct tom_data *d = listen_ctx->tom_data;
3468
3469#if VALIDATE_TID
3470 struct cpl_pass_accept_req *req = cplhdr(m);
3471 unsigned int tid = GET_TID(req);
3472 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3473
3474 if (unlikely(!lsk)) {
3475 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3476 cdev->name,
3477 (unsigned long)((union listen_entry *)ctx -
3478 t->stid_tab));
3479 return CPL_RET_BUF_DONE;
3480 }
3481 if (unlikely(tid >= t->ntids)) {
3482 printk(KERN_ERR "%s: passive open TID %u too large\n",
3483 cdev->name, tid);
3484 return CPL_RET_BUF_DONE;
3485 }
3486 /*
3487 * For T3A the current user of the TID may have closed but its last
3488 * message(s) may have been backlogged so the TID appears to be still
3489 * in use. Just take the TID away, the connection can close at its
3490 * own leisure. For T3B this situation is a bug.
3491 */
3492 if (!valid_new_tid(t, tid) &&
3493 cdev->type != T3A) {
3494 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3495 cdev->name, tid);
3496 return CPL_RET_BUF_DONE;
3497 }
3498#endif
3499
3500 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3501 return (0);
3502}
3503
3504/*
3505 * Called when a connection is established to translate the TCP options
3506 * reported by HW to FreeBSD's native format.
3507 */
3508static void
3509assign_rxopt(struct socket *so, unsigned int opt)
3510{
3511 struct tcpcb *tp = so_sototcpcb(so);
3512 struct toepcb *toep = tp->t_toe;
3513 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3514
3515 inp_lock_assert(tp->t_inpcb);
3516
3517 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3518 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3519 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3520 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3521 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3522 (TF_RCVD_SCALE|TF_REQ_SCALE))
3523 tp->rcv_scale = tp->request_r_scale;
3524}
3525
3526/*
3527 * Completes some final bits of initialization for just established connections
3528 * and changes their state to TCP_ESTABLISHED.
3529 *
3530 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3531 */
3532static void
3533make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3534{
3535 struct tcpcb *tp = so_sototcpcb(so);
3536 struct toepcb *toep = tp->t_toe;
3537
3538 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3539 assign_rxopt(so, opt);
3540
3541 /*
3542 *XXXXXXXXXXX
3543 *
3544 */
3545#ifdef notyet
3546 so->so_proto->pr_ctloutput = t3_ctloutput;
3547#endif
3548
3549#if 0
3550 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3551#endif
3552 /*
3553 * XXX not clear what rcv_wup maps to
3554 */
3555 /*
3556 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3557 * pass through opt0.
3558 */
3559 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3560 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3561
3562 dump_toepcb(toep);
3563
3564#ifdef notyet
3565/*
3566 * no clean interface for marking ARP up to date
3567 */
3568 dst_confirm(sk->sk_dst_cache);
3569#endif
3570 tp->t_starttime = ticks;
3571 tp->t_state = TCPS_ESTABLISHED;
3572 soisconnected(so);
3573}
3574
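/*
 * Expand the syncache entry matching a PASS_ESTABLISH message into a full
 * socket.  Returns the result of tcp_offload_syncache_expand().
 */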
3575static int
3576syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3577{
3578
3579 struct in_conninfo inc;
3580 struct toeopt toeo;
3581 struct tcphdr th;
3582 int mss, wsf, sack, ts;
3583 struct mbuf *m = NULL;
3584 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3585 unsigned int opt;
3586
3587#ifdef MAC
3588#error "no MAC support"
3589#endif
3590
3591 opt = ntohs(req->tcp_opt);
3592
3593 bzero(&toeo, sizeof(struct toeopt));
3594
3595 /*
3596 * Fill out information for entering us into the syncache
3597 */
3598 bzero(&inc, sizeof(inc));
3599 inc.inc_fport = th.th_sport = req->peer_port;
3600 inc.inc_lport = th.th_dport = req->local_port;
3601 th.th_seq = req->rcv_isn;
3602 th.th_flags = TH_ACK;
3603
3604 inc.inc_len = 0;
3605 inc.inc_faddr.s_addr = req->peer_ip;
3606 inc.inc_laddr.s_addr = req->local_ip;
3607
3608 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3609 wsf = G_TCPOPT_WSCALE_OK(opt);
3610 ts = G_TCPOPT_TSTAMP(opt);
3611 sack = G_TCPOPT_SACK(opt);
3612
3613 toeo.to_mss = mss;
3614 toeo.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3615 toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3616
3617 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3618 ntohl(req->local_ip), ntohs(req->local_port),
3619 ntohl(req->peer_ip), ntohs(req->peer_port),
3620 mss, wsf, ts, sack);
3621 return tcp_offload_syncache_expand(&inc, &toeo, &th, so, m);
3622}
3623
3624
3625/*
3626 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3627 * if we are in TCP_SYN_RECV due to crossed SYNs
3628 */
3629static int
3630do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3631{
3632 struct cpl_pass_establish *req = cplhdr(m);
3633 struct toepcb *toep = (struct toepcb *)ctx;
3634 struct tcpcb *tp = toep->tp_tp;
3635 struct socket *so, *lso;
3636 struct t3c_data *td = T3C_DATA(cdev);
3637 struct sockbuf *snd, *rcv;
3638
3639 // Complete socket initialization now that we have the SND_ISN
3640
3641 struct toedev *tdev;
3642
3643
3644 tdev = toep->tp_toedev;
3645
3646 inp_wlock(tp->t_inpcb);
3647
3648 /*
3649 *
3650 * XXX need to add reference while we're manipulating
3651 */
3652 so = lso = inp_inpcbtosocket(tp->t_inpcb);
3653
3654 inp_wunlock(tp->t_inpcb);
3655
3656 so_lock(so);
3657 LIST_REMOVE(toep, synq_entry);
3658 so_unlock(so);
3659
3660 if (!syncache_expand_establish_req(req, &so, toep)) {
3661 /*
3662 * No entry
3663 */
3664 CXGB_UNIMPLEMENTED();
3665 }
3666 if (so == NULL) {
3667 /*
3668 * Couldn't create the socket
3669 */
3670 CXGB_UNIMPLEMENTED();
3671 }
3672
3673 tp = so_sototcpcb(so);
3674 inp_wlock(tp->t_inpcb);
3675
3676 snd = so_sockbuf_snd(so);
3677 rcv = so_sockbuf_rcv(so);
3678
3679 snd->sb_flags |= SB_NOCOALESCE;
3680 rcv->sb_flags |= SB_NOCOALESCE;
3681
3682 toep->tp_tp = tp;
3683 toep->tp_flags = 0;
3684 tp->t_toe = toep;
3685 reset_wr_list(toep);
3686 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3687 tp->rcv_nxt = toep->tp_copied_seq;
3688 install_offload_ops(so);
3689
3690 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3691 toep->tp_wr_unacked = 0;
3692 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3693 toep->tp_qset_idx = 0;
3694 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3695
3696 /*
3697 * XXX Cancel any keep alive timer
3698 */
3699
3700 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3701
3702 /*
3703 * XXX workaround for lack of syncache drop
3704 */
3705 toepcb_release(toep);
3706 inp_wunlock(tp->t_inpcb);
3707
3708 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3709 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3710#ifdef notyet
3711 /*
3712 * XXX not sure how these checks map to us
3713 */
3714 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3715 sk->sk_state_change(sk);
3716 sk_wake_async(so, 0, POLL_OUT);
3717 }
3718 /*
3719 * The state for the new connection is now up to date.
3720 * Next check if we should add the connection to the parent's
3721 * accept queue. When the parent closes it resets connections
3722 * on its SYN queue, so check if we are being reset. If so we
3723 * don't need to do anything more, the coming ABORT_RPL will
3724 * destroy this socket. Otherwise move the connection to the
3725 * accept queue.
3726 *
3727 * Note that we reset the synq before closing the server so if
3728 * we are not being reset the stid is still open.
3729 */
3730 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3731 __kfree_skb(skb);
3732 goto unlock;
3733 }
3734#endif
3735 m_free(m);
3736
3737 return (0);
3738}
3739
3740/*
3741 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3742 * and send them to the TOE.
3743 */
3744static void
3745fixup_and_send_ofo(struct toepcb *toep)
3746{
3747 struct mbuf *m;
3748 struct toedev *tdev = toep->tp_toedev;
3749 struct tcpcb *tp = toep->tp_tp;
3750 unsigned int tid = toep->tp_tid;
3751
3752 log(LOG_NOTICE, "fixup_and_send_ofo\n");
3753
3754 inp_lock_assert(tp->t_inpcb);
3755 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3756 /*
3757 * A variety of messages can be waiting but the fields we'll
3758 * be touching are common to all so any message type will do.
3759 */
3760 struct cpl_close_con_req *p = cplhdr(m);
3761
3762 p->wr.wr_lo = htonl(V_WR_TID(tid));
3763 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3764 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3765 }
3766}
3767
3768/*
3769 * Updates socket state from an active establish CPL message. Runs with the
3770 * socket lock held.
3771 */
3772static void
3773socket_act_establish(struct socket *so, struct mbuf *m)
3774{
3775 struct cpl_act_establish *req = cplhdr(m);
3776 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3777 struct tcpcb *tp = so_sototcpcb(so);
3778 struct toepcb *toep = tp->t_toe;
3779
3780 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3781 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3782 toep->tp_tid, tp->t_state);
3783
3784 tp->ts_recent_age = ticks;
3785 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3786 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3787
3788 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3789
3790 /*
3791 * Now that we finally have a TID send any CPL messages that we had to
3792 * defer for lack of a TID.
3793 */
3794 if (mbufq_len(&toep->out_of_order_queue))
3795 fixup_and_send_ofo(toep);
3796
3797 if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3798 /*
3799 * XXX does this even make sense?
3800 */
3801 so_sorwakeup(so);
3802 }
3803 m_free(m);
3804#ifdef notyet
3805/*
3806 * XXX assume no write requests permitted while socket connection is
3807 * incomplete
3808 */
3809 /*
3810 * Currently the send queue must be empty at this point because the
3811 * socket layer does not send anything before a connection is
3812 * established. To be future proof though we handle the possibility
3813 * that there are pending buffers to send (either TX_DATA or
3814 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3815 * buffers according to the just learned write_seq, and then we send
3816 * them on their way.
3817 */
3818 fixup_pending_writeq_buffers(sk);
3819 if (t3_push_frames(so, 1))
3820 sk->sk_write_space(sk);
3821#endif
3822
3823 toep->tp_state = tp->t_state;
3824 TCPSTAT_INC(tcps_connects);
3824 KMOD_TCPSTAT_INC(tcps_connects);
3825
3826}
3827
3828/*
3829 * Process a CPL_ACT_ESTABLISH message.
3830 */
3831static int
3832do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3833{
3834 struct cpl_act_establish *req = cplhdr(m);
3835 unsigned int tid = GET_TID(req);
3836 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3837 struct toepcb *toep = (struct toepcb *)ctx;
3838 struct tcpcb *tp = toep->tp_tp;
3839 struct socket *so;
3840 struct toedev *tdev;
3841 struct tom_data *d;
3842
3843 if (tp == NULL) {
3844 free_atid(cdev, atid);
3845 return (0);
3846 }
3847 inp_wlock(tp->t_inpcb);
3848
3849 /*
3850 * XXX
3851 */
3852 so = inp_inpcbtosocket(tp->t_inpcb);
3853 tdev = toep->tp_toedev; /* blow up here if link was down */
3854 d = TOM_DATA(tdev);
3855
3856 /*
3857 * It's OK if the TID is currently in use, the owning socket may have
3858 * backlogged its last CPL message(s). Just take it away.
3859 */
3860 toep->tp_tid = tid;
3861 toep->tp_tp = tp;
3862 so_insert_tid(d, toep, tid);
3863 free_atid(cdev, atid);
3864 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3865
3866 socket_act_establish(so, m);
3867 inp_wunlock(tp->t_inpcb);
3868 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3869 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3870
3871 return (0);
3872}
3873
3874/*
3875 * Process an acknowledgment of WR completion. Advance snd_una and send the
3876 * next batch of work requests from the write queue.
3877 */
3878static void
3879wr_ack(struct toepcb *toep, struct mbuf *m)
3880{
3881 struct tcpcb *tp = toep->tp_tp;
3882 struct cpl_wr_ack *hdr = cplhdr(m);
3883 struct socket *so;
3884 unsigned int credits = ntohs(hdr->credits);
3885 u32 snd_una = ntohl(hdr->snd_una);
3886 int bytes = 0;
3887 struct sockbuf *snd;
3888
3889 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3890
3891 inp_wlock(tp->t_inpcb);
3892 so = inp_inpcbtosocket(tp->t_inpcb);
3893 toep->tp_wr_avail += credits;
3894 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3895 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3896
3897 while (credits) {
3898 struct mbuf *p = peek_wr(toep);
3899
3900 if (__predict_false(!p)) {
3901 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3902 "nothing pending, state %u wr_avail=%u\n",
3903 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3904 break;
3905 }
3906 CTR2(KTR_TOM,
3907 "wr_ack: p->credits=%d p->bytes=%d",
3908 p->m_pkthdr.csum_data, p->m_pkthdr.len);
3909 KASSERT(p->m_pkthdr.csum_data != 0,
3910 ("empty request still on list"));
3911
3912 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3913
3914#if DEBUG_WR > 1
3915 struct tx_data_wr *w = cplhdr(p);
3916 log(LOG_ERR,
3917 "TID %u got %u WR credits, need %u, len %u, "
3918 "main body %u, frags %u, seq # %u, ACK una %u,"
3919 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3920 toep->tp_tid, credits, p->csum, p->len,
3921 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3922 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3923 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3924#endif
3925 p->m_pkthdr.csum_data -= credits;
3926 break;
3927 } else {
3928 dequeue_wr(toep);
3929 credits -= p->m_pkthdr.csum_data;
3930 bytes += p->m_pkthdr.len;
3931 CTR3(KTR_TOM,
3932 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3933 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3934
3935 m_free(p);
3936 }
3937 }
3938
3939#if DEBUG_WR
3940 check_wr_invariants(tp);
3941#endif
3942
3943 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3944#if VALIDATE_SEQ
3945 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3946
3947 		log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3948 		    "for TID %u, snd_una %u\n", (&d->tdev)->tod_name, snd_una,
3949 toep->tp_tid, tp->snd_una);
3950#endif
3951 goto out_free;
3952 }
3953
3954 if (tp->snd_una != snd_una) {
3955 tp->snd_una = snd_una;
3956 tp->ts_recent_age = ticks;
3957#ifdef notyet
3958 /*
3959 * Keep ARP entry "minty fresh"
3960 */
3961 dst_confirm(sk->sk_dst_cache);
3962#endif
3963 if (tp->snd_una == tp->snd_nxt)
3964 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3965 }
3966
3967 snd = so_sockbuf_snd(so);
3968 if (bytes) {
3969 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3971 sockbuf_lock(snd);
3972 sbdrop_locked(snd, bytes);
3973 so_sowwakeup_locked(so);
3974 }
3975
3976 if (snd->sb_sndptroff < snd->sb_cc)
3977 t3_push_frames(so, 0);
3978
3979out_free:
3980 inp_wunlock(tp->t_inpcb);
3981 m_free(m);
3982}
3983
3984/*
3985 * Handler for TX_DATA_ACK CPL messages.
3986 */
3987static int
3988do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3989{
3990 struct toepcb *toep = (struct toepcb *)ctx;
3991
3992 VALIDATE_SOCK(so);
3993
3994 wr_ack(toep, m);
3995 return 0;
3996}
3997
3998/*
3999 * Handler for TRACE_PKT CPL messages. Just sink these packets.
4000 */
4001static int
4002do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4003{
4004 m_freem(m);
4005 return 0;
4006}
4007
4008/*
4009 * Reset a connection that is on a listener's SYN queue or accept queue,
4010 * i.e., one that has not had a struct socket associated with it.
4011 * Must be called from process context.
4012 *
4013 * Modeled after code in inet_csk_listen_stop().
4014 */
4015static void
4016t3_reset_listen_child(struct socket *child)
4017{
4018 struct tcpcb *tp = so_sototcpcb(child);
4019
4020 t3_send_reset(tp->t_toe);
4021}
4022
4023
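/*
 * so_listeners_apply_all() callback: reset an offloaded child connection
 * still sitting on the listener's accept queue.
 */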
4024static void
4025t3_child_disconnect(struct socket *so, void *arg)
4026{
4027 struct tcpcb *tp = so_sototcpcb(so);
4028
4029 if (tp->t_flags & TF_TOE) {
4030 inp_wlock(tp->t_inpcb);
4031 t3_reset_listen_child(so);
4032 inp_wunlock(tp->t_inpcb);
4033 }
4034}
4035
4036/*
4037 * Disconnect offloaded established but not yet accepted connections sitting
4038 * on a server's accept_queue. We just send an ABORT_REQ at this point and
4039 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4040 */
4041void
4042t3_disconnect_acceptq(struct socket *listen_so)
4043{
4044
4045 so_lock(listen_so);
4046 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4047 so_unlock(listen_so);
4048}
4049
4050/*
4051 * Reset offloaded connections sitting on a server's syn queue. As above
4052 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4053 */
4054
4055void
4056t3_reset_synq(struct listen_ctx *lctx)
4057{
4058 struct toepcb *toep;
4059
4060 so_lock(lctx->lso);
4061 while (!LIST_EMPTY(&lctx->synq_head)) {
4062 toep = LIST_FIRST(&lctx->synq_head);
4063 LIST_REMOVE(toep, synq_entry);
4064 toep->tp_tp = NULL;
4065 t3_send_reset(toep);
4066 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4067 toepcb_release(toep);
4068 }
4069 so_unlock(lctx->lso);
4070}
4071
4072
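/*
 * Write the page pods describing a DDP gather list into adapter memory at
 * the location implied by the DDP tag, issuing one ULP_MEM_WRITE work
 * request per pod.
 */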
4073int
4074t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4075 unsigned int nppods, unsigned int tag, unsigned int maxoff,
4076 unsigned int pg_off, unsigned int color)
4077{
4078 unsigned int i, j, pidx;
4079 struct pagepod *p;
4080 struct mbuf *m;
4081 struct ulp_mem_io *req;
4082 unsigned int tid = toep->tp_tid;
4083 const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4084 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4085
4086 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4087 gl, nppods, tag, maxoff, pg_off, color);
4088
4089 for (i = 0; i < nppods; ++i) {
4090 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4091 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4092 req = mtod(m, struct ulp_mem_io *);
4093 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4094 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4095 req->wr.wr_lo = 0;
4096 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4097 V_ULPTX_CMD(ULP_MEM_WRITE));
4098 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4099 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4100
4101 p = (struct pagepod *)(req + 1);
4102 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4103 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4104 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4105 V_PPOD_COLOR(color));
4106 p->pp_max_offset = htonl(maxoff);
4107 p->pp_page_offset = htonl(pg_off);
4108 p->pp_rsvd = 0;
4109 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4110 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4111 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4112 } else
4113 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
4114 send_or_defer(toep, m, 0);
4115 ppod_addr += PPOD_SIZE;
4116 }
4117 return (0);
4118}
4119
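/*
 * A small worked example of the page-pod layout above (illustrative
 * figures, assuming PPOD_SIZE is 64 bytes): with td->ddp_llimit = 0,
 * tag 3 places its first pod at ULP memory byte address 3 * 64 = 192,
 * encoded as 192 >> 5 = 6 because ULP_MEM_WRITE addresses memory in
 * 32-byte units (hence V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) = 2 per pod).
 * Each data pod advertises 5 page addresses starting at page 4 * i, so
 * consecutive pods overlap by one page and a 16-page gather list is
 * covered by 4 data pods followed by NUM_SENTINEL_PPODS invalid sentinel
 * pods.
 */
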
4120/*
4121 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4122 */
4123static inline void
4124mk_cpl_barrier_ulp(struct cpl_barrier *b)
4125{
4126 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4127
4128 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4129 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4130 b->opcode = CPL_BARRIER;
4131}
4132
4133/*
4134 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4135 */
4136static inline void
4137mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4138{
4139 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4140
4142 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4143 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4144 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4145 req->cpuno = htons(cpuno);
4146}
4147
4148/*
4149 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4150 */
4151static inline void
4152mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4153 unsigned int word, uint64_t mask, uint64_t val)
4154{
4155 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4156
4157	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
4158 tid, word, mask, val);
4159
4160 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4161 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4162 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4163 req->reply = V_NO_REPLY(1);
4164 req->cpu_idx = 0;
4165 req->word = htons(word);
4166 req->mask = htobe64(mask);
4167 req->val = htobe64(val);
4168}
4169
4170/*
4171 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4172 */
4173static void
4174mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4175 unsigned int tid, unsigned int credits)
4176{
4177 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4178
4179 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4180 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4181 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4182 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4183 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4184 V_RX_CREDITS(credits));
4185}
4186
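/*
 * Each builder above frames its CPL as a ULP_TX_PKT whose length is given
 * in 8-byte flits, so several CPLs can sit back to back behind a single
 * FW_WROPCODE_BYPASS work request header.  Below is a minimal sketch of
 * that pattern; the function name is made up for illustration, and the
 * real users are t3_cancel_ddpbuf() and the routines that follow it.
 */
#if 0
static void
example_compound_wr(struct toepcb *toep, unsigned int word, uint64_t mask,
    uint64_t val)
{
	unsigned int wrlen = sizeof(struct work_request_hdr) +
	    sizeof(struct cpl_set_tcb_field) + sizeof(struct cpl_get_tcb);
	struct mbuf *m = m_gethdr_nofail(wrlen);
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);
	m->m_pkthdr.len = m->m_len = wrlen;
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));

	/* First CPL: rewrite one TCB field. */
	req = (struct cpl_set_tcb_field *)(wr + 1);
	mk_set_tcb_field_ulp(req, toep->tp_tid, word, mask, val);

	/* Second CPL: read the TCB back and track the outstanding request. */
	mk_get_tcb_ulp((struct cpl_get_tcb *)(req + 1), toep->tp_tid,
	    toep->tp_qset);
	toep->tp_ddp_state.get_tcb_count++;

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
#endif
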
4187void
4188t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4189{
4190 unsigned int wrlen;
4191 struct mbuf *m;
4192 struct work_request_hdr *wr;
4193 struct cpl_barrier *lock;
4194 struct cpl_set_tcb_field *req;
4195 struct cpl_get_tcb *getreq;
4196 struct ddp_state *p = &toep->tp_ddp_state;
4197
4198#if 0
4199 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4200#endif
4201 wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4202 sizeof(*getreq);
4203 m = m_gethdr_nofail(wrlen);
4204 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4205 wr = mtod(m, struct work_request_hdr *);
4206 bzero(wr, wrlen);
4207
4208 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4209 m->m_pkthdr.len = m->m_len = wrlen;
4210
4211 lock = (struct cpl_barrier *)(wr + 1);
4212 mk_cpl_barrier_ulp(lock);
4213
4214 req = (struct cpl_set_tcb_field *)(lock + 1);
4215
4216 CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4217
4218	/* Hmmm, not sure if this is actually a good thing: reactivating
4219	 * the other buffer might be an issue if it has been completed
4220	 * already. However, that is unlikely, since the fact that the UBUF
4221	 * is not completed indicates that there is no outstanding data.
4222 */
4223 if (bufidx == 0)
4224 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4225 V_TF_DDP_ACTIVE_BUF(1) |
4226 V_TF_DDP_BUF0_VALID(1),
4227 V_TF_DDP_ACTIVE_BUF(1));
4228 else
4229 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4230 V_TF_DDP_ACTIVE_BUF(1) |
4231 V_TF_DDP_BUF1_VALID(1), 0);
4232
4233 getreq = (struct cpl_get_tcb *)(req + 1);
4234 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4235
4236 mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4237
4238	/* Keep track of the number of outstanding CPL_GET_TCB requests
4239 */
4240 p->get_tcb_count++;
4241
4242#ifdef T3_TRACE
4243 T3_TRACE1(TIDTB(so),
4244 "t3_cancel_ddpbuf: bufidx %u", bufidx);
4245#endif
4246 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4247}
4248
4249/**
4250 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4251 * @toep: the toepcb associated with the buffers
4252 * @bufidx: index of HW DDP buffer (0 or 1)
4253 * @tag0: new tag for HW buffer 0
4254 * @tag1: new tag for HW buffer 1
4255 * @len: new length for HW buf @bufidx
4256 *
4257 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4258 * buffer by changing the buffer tag and length and setting the valid and
4259 * active flag accordingly. The caller must ensure the new buffer is at
4260 * least as big as the existing one. Since we typically reprogram both HW
4261 * buffers this function sets both tags for convenience. Read the TCB to
4262 * determine how much data was written into the buffer before the overlay
4263 * took place.
4264 */
4265void
4266t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4267 unsigned int tag1, unsigned int len)
4268{
4269 unsigned int wrlen;
4270 struct mbuf *m;
4271 struct work_request_hdr *wr;
4272 struct cpl_get_tcb *getreq;
4273 struct cpl_set_tcb_field *req;
4274 struct ddp_state *p = &toep->tp_ddp_state;
4275
4276	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4277 bufidx, tag0, tag1, len);
4278#if 0
4279 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4280#endif
4281 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4282 m = m_gethdr_nofail(wrlen);
4283 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4284 wr = mtod(m, struct work_request_hdr *);
4285 m->m_pkthdr.len = m->m_len = wrlen;
4286 bzero(wr, wrlen);
4287
4288
4289 /* Set the ATOMIC flag to make sure that TP processes the following
4290 * CPLs in an atomic manner and no wire segments can be interleaved.
4291 */
4292 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4293 req = (struct cpl_set_tcb_field *)(wr + 1);
4294 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4295 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4296	    V_TCB_RX_DDP_BUF1_TAG((uint64_t)M_TCB_RX_DDP_BUF1_TAG) << 32,
4297 V_TCB_RX_DDP_BUF0_TAG(tag0) |
4298 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4299 req++;
4300 if (bufidx == 0) {
4301 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4302 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4303 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4304 req++;
4305 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4306 V_TF_DDP_PUSH_DISABLE_0(1) |
4307 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4308 V_TF_DDP_PUSH_DISABLE_0(0) |
4309 V_TF_DDP_BUF0_VALID(1));
4310 } else {
4311 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4312 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4313 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4314 req++;
4315 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316 V_TF_DDP_PUSH_DISABLE_1(1) |
4317 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318 V_TF_DDP_PUSH_DISABLE_1(0) |
4319 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4320 }
4321
4322 getreq = (struct cpl_get_tcb *)(req + 1);
4323 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4324
4325	/* Keep track of the number of outstanding CPL_GET_TCB requests
4326 */
4327 p->get_tcb_count++;
4328
4329#ifdef T3_TRACE
4330 T3_TRACE4(TIDTB(sk),
4331 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4332 "len %d",
4333 bufidx, tag0, tag1, len);
4334#endif
4335 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4336}
4337
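/*
 * Note on the readback above: the trailing CPL_GET_TCB makes the hardware
 * return the connection's TCB, which is how the caller learns how much
 * data had already landed in the buffer being replaced.  The reply comes
 * back as a CPL_GET_TCB_RPL (do_get_tcb_rpl, registered in
 * t3_init_cpl_io() below), and get_tcb_count tracks how many such
 * readbacks are still outstanding.
 */
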
4338/*
4339 * Sends a compound WR containing all the CPL messages needed to program the
4340 * two HW DDP buffers, namely optionally setting up the length and offset of
4341 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4342 */
4343void
4344t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4345 unsigned int len1, unsigned int offset1,
4346 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4347{
4348 unsigned int wrlen;
4349 struct mbuf *m;
4350 struct work_request_hdr *wr;
4351 struct cpl_set_tcb_field *req;
4352
4353	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
4354 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4355
4356#if 0
4357 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4358#endif
4359 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4360 (len1 ? sizeof(*req) : 0) +
4361 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4362 m = m_gethdr_nofail(wrlen);
4363 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4364 wr = mtod(m, struct work_request_hdr *);
4365 bzero(wr, wrlen);
4366
4367 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4368 m->m_pkthdr.len = m->m_len = wrlen;
4369
4370 req = (struct cpl_set_tcb_field *)(wr + 1);
4371 if (len0) { /* program buffer 0 offset and length */
4372 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4373 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4374 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4375 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4376 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4377 req++;
4378 }
4379 if (len1) { /* program buffer 1 offset and length */
4380 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4381 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4382	    V_TCB_RX_DDP_BUF1_LEN((uint64_t)M_TCB_RX_DDP_BUF1_LEN) << 32,
4383 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4384 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4385 req++;
4386 }
4387
4388 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4389 ddp_flags);
4390
4391 if (modulate) {
4392 mk_rx_data_ack_ulp(toep,
4393 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4394 toep->tp_copied_seq - toep->tp_rcv_wup);
4395 toep->tp_rcv_wup = toep->tp_copied_seq;
4396 }
4397
4398#ifdef T3_TRACE
4399 T3_TRACE5(TIDTB(sk),
4400 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4401 "modulate %d",
4402 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4403 modulate);
4404#endif
4405
4406 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4407}
4408
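/*
 * A minimal sketch of posting hardware buffer 0 with the routine above.
 * The helper name, buffer size, and flag policy are assumptions chosen
 * for illustration, not a recommended DDP configuration.
 */
#if 0
static void
example_post_ddp_buf0(struct toepcb *toep, unsigned int bufsize)
{
	/*
	 * Mark buffer 0 valid, select it as the active buffer (ACTIVE_BUF
	 * cleared), re-enable pushes for it, and piggyback an RX_DATA_ACK.
	 */
	t3_setup_ddpbufs(toep, bufsize, 0, 0, 0,
	    V_TF_DDP_BUF0_VALID(1),
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |
	    V_TF_DDP_PUSH_DISABLE_0(1), 1);
}
#endif
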
4409void
4410t3_init_wr_tab(unsigned int wr_len)
4411{
4412 int i;
4413
4414 if (mbuf_wrs[1]) /* already initialized */
4415 return;
4416
4417 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4418 int sgl_len = (3 * i) / 2 + (i & 1);
4419
4420 sgl_len += 3;
4421 mbuf_wrs[i] = sgl_len <= wr_len ?
4422 1 : 1 + (sgl_len - 2) / (wr_len - 1);
4423 }
4424
4425 wrlen = wr_len * 8;
4426}
4427
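/*
 * Worked example of the table above: each pair of fragments consumes one
 * 3-flit scatter/gather entry (two 32-bit lengths plus two 64-bit
 * addresses), i.e. 1.5 flits per fragment, and 3 flits are reserved for
 * the work request header.  With an illustrative wr_len of 36 flits, 22
 * fragments need (3 * 22) / 2 + 0 + 3 = 36 flits and fit in one WR, while
 * 23 fragments need 38 flits and take two: 1 + (38 - 2) / (36 - 1) = 2.
 */
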
4428int
4429t3_init_cpl_io(void)
4430{
4431#ifdef notyet
4432 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4433 if (!tcphdr_skb) {
4434 log(LOG_ERR,
4435 "Chelsio TCP offload: can't allocate sk_buff\n");
4436 return -1;
4437 }
4438 skb_put(tcphdr_skb, sizeof(struct tcphdr));
4439 tcphdr_skb->h.raw = tcphdr_skb->data;
4440 memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4441#endif
4442
4443 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4444 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4445 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4446 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4447 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4448 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4449 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4450 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4451 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4452 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4453 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4454 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4455 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4456 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4457 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4458 return (0);
4459}
4460