/* cxgb_cpl_io.c, FreeBSD revision 183550 */
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183550 2008-10-02 15:37:58Z zec $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/sockbuf.h> 46#include <sys/sysctl.h> 47#include <sys/syslog.h> 48#include <sys/protosw.h> 49#include <sys/priv.h> 50 51#if __FreeBSD_version >= 800044 52#include <sys/vimage.h> 53#else 54#define V_tcp_do_autosndbuf tcp_do_autosndbuf 55#define V_tcp_autosndbuf_max tcp_autosndbuf_max 56#define V_tcp_do_rfc1323 tcp_do_rfc1323 57#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf 58#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max 59#define V_tcpstat tcpstat 60#endif 61 62#include <net/if.h> 63#include <net/route.h> 64 65#include <netinet/in.h> 66#include <netinet/in_pcb.h> 67#include <netinet/in_systm.h> 68#include <netinet/in_var.h> 69 70 71#include <cxgb_osdep.h> 72#include <sys/mbufq.h> 73 74#include <netinet/ip.h> 75#include <netinet/tcp_var.h> 76#include <netinet/tcp_fsm.h> 77#include <netinet/tcp_offload.h> 78#include <netinet/tcp_seq.h> 79#include <netinet/tcp_syncache.h> 80#include <netinet/tcp_timer.h> 81#include <net/route.h> 82 83#include <t3cdev.h> 84#include <common/cxgb_firmware_exports.h> 85#include <common/cxgb_t3_cpl.h> 86#include <common/cxgb_tcb.h> 87#include <common/cxgb_ctl_defs.h> 88#include <cxgb_offload.h> 89#include <vm/vm.h> 90#include <vm/pmap.h> 91#include <machine/bus.h> 92#include <sys/mvec.h> 93#include <ulp/toecore/cxgb_toedev.h> 94#include <ulp/tom/cxgb_l2t.h> 95#include <ulp/tom/cxgb_defs.h> 96#include <ulp/tom/cxgb_tom.h> 97#include <ulp/tom/cxgb_t3_ddp.h> 98#include <ulp/tom/cxgb_toepcb.h> 99#include 
<ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
145 */ 146#define MIN_RCV_WND (24 * 1024U) 147#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) 148 149#define VALIDATE_SEQ 0 150#define VALIDATE_SOCK(so) 151#define DEBUG_WR 0 152 153#define TCP_TIMEWAIT 1 154#define TCP_CLOSE 2 155#define TCP_DROP 3 156 157extern int tcp_do_autorcvbuf; 158extern int tcp_do_autosndbuf; 159extern int tcp_autorcvbuf_max; 160extern int tcp_autosndbuf_max; 161 162static void t3_send_reset(struct toepcb *toep); 163static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); 164static inline void free_atid(struct t3cdev *cdev, unsigned int tid); 165static void handle_syncache_event(int event, void *arg); 166 167static inline void 168SBAPPEND(struct sockbuf *sb, struct mbuf *n) 169{ 170 struct mbuf *m; 171 172 m = sb->sb_mb; 173 while (m) { 174 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 175 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 176 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 177 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 178 m->m_next, m->m_nextpkt, m->m_flags)); 179 m = m->m_next; 180 } 181 m = n; 182 while (m) { 183 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 184 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 185 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 186 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 187 m->m_next, m->m_nextpkt, m->m_flags)); 188 m = m->m_next; 189 } 190 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); 191 sbappendstream_locked(sb, n); 192 m = sb->sb_mb; 193 194 while (m) { 195 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 196 m->m_next, m->m_nextpkt, m->m_flags)); 197 m = m->m_next; 198 } 199} 200 201static inline int 202is_t3a(const struct toedev *dev) 203{ 204 
return (dev->tod_ttid == TOE_ID_CHELSIO_T3); 205} 206 207static void 208dump_toepcb(struct toepcb *toep) 209{ 210 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", 211 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, 212 toep->tp_mtu_idx, toep->tp_tid); 213 214 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", 215 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 216 toep->tp_mss_clamp, toep->tp_flags); 217} 218 219#ifndef RTALLOC2_DEFINED 220static struct rtentry * 221rtalloc2(struct sockaddr *dst, int report, u_long ignflags) 222{ 223 struct rtentry *rt = NULL; 224 225 if ((rt = rtalloc1(dst, report, ignflags)) != NULL) 226 RT_UNLOCK(rt); 227 228 return (rt); 229} 230#endif 231 232/* 233 * Determine whether to send a CPL message now or defer it. A message is 234 * deferred if the connection is in SYN_SENT since we don't know the TID yet. 235 * For connections in other states the message is sent immediately. 236 * If through_l2t is set the message is subject to ARP processing, otherwise 237 * it is sent directly. 238 */ 239static inline void 240send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) 241{ 242 struct tcpcb *tp = toep->tp_tp; 243 244 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { 245 inp_wlock(tp->t_inpcb); 246 mbufq_tail(&toep->out_of_order_queue, m); // defer 247 inp_wunlock(tp->t_inpcb); 248 } else if (through_l2t) 249 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T 250 else 251 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly 252} 253 254static inline unsigned int 255mkprio(unsigned int cntrl, const struct toepcb *toep) 256{ 257 return (cntrl); 258} 259 260/* 261 * Populate a TID_RELEASE WR. The skb must be already propely sized. 
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	/* TID_RELEASE travels at setup priority; build the CPL in place. */
	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Fill in a TX_DATA work request header at the front of mbuf m.
 * len is the payload length (including any HW ULP additions); tail, when
 * non-NULL, indicates more data is queued behind this WR, which suppresses
 * the SHOVE hint.  Caller holds the inpcb lock.
 */
static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	INIT_VNET_INET(so->so_vnet);
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	    V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
		(tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		/* First TX_DATA on this connection: also carry init flags,
		 * the qset index, and the send-buffer size hint. */
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB.
		 */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

/*
 * Drain the socket send buffer into TX_DATA work requests, consuming WR
 * credits as it goes.  Small mbufs (<= IMM_LEN) are copied inline into the
 * WR; larger chains are described by a gather list of up to TX_MAX_SEGS-1
 * segments.  Returns the total number of payload bytes handed to the HW.
 * req_completion requests a completion notification on the first WR.
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	/* Nothing can be sent before the TID exists or after close. */
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	/* Resume from the send pointer, or the head of the buffer. */
	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	/* Skip the mbuf that was fully consumed by the previous push. */
	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			/* Gather as many mbufs as the remaining WR credits
			 * and the segment array allow. */
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		/* Remember where to resume; tp_m_last marks a fully
		 * consumed trailing mbuf. */
		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
		int i;

		/* Trace the gather list, three segments per CTR record. */
		i = 0;
		while (i < count && m_get_sgllen(m0)) {
			if ((count - i) >= 3) {
				CTR6(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
				    i += 3;
			} else if ((count - i) == 2) {
				CTR4(KTR_TOM,
				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
				    " len=%d",
				    segs[i].ds_addr, segs[i].ds_len,
				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
				    i += 2;
			} else {
				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
				    segs[i].ds_addr, segs[i].ds_len);
				i++;
			}

		}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		/* Request a completion when asked, or when half the WR
		 * credits are outstanding, so credits flow back. */
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
510 */ 511static void 512close_conn(struct socket *so) 513{ 514 struct mbuf *m; 515 struct cpl_close_con_req *req; 516 struct tom_data *d; 517 struct inpcb *inp = so_sotoinpcb(so); 518 struct tcpcb *tp; 519 struct toepcb *toep; 520 unsigned int tid; 521 522 523 inp_wlock(inp); 524 tp = so_sototcpcb(so); 525 toep = tp->t_toe; 526 527 if (tp->t_state != TCPS_SYN_SENT) 528 t3_push_frames(so, 1); 529 530 if (toep->tp_flags & TP_FIN_SENT) { 531 inp_wunlock(inp); 532 return; 533 } 534 535 tid = toep->tp_tid; 536 537 d = TOM_DATA(toep->tp_toedev); 538 539 m = m_gethdr_nofail(sizeof(*req)); 540 m_set_priority(m, CPL_PRIORITY_DATA); 541 m_set_sgl(m, NULL); 542 m_set_sgllen(m, 0); 543 544 toep->tp_flags |= TP_FIN_SENT; 545 req = mtod(m, struct cpl_close_con_req *); 546 547 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); 548 req->wr.wr_lo = htonl(V_WR_TID(tid)); 549 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 550 req->rsvd = 0; 551 inp_wunlock(inp); 552 /* 553 * XXX - need to defer shutdown while there is still data in the queue 554 * 555 */ 556 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); 557 cxgb_ofld_send(d->cdev, m); 558 559} 560 561/* 562 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant 563 * and send it along. 564 */ 565static void 566abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) 567{ 568 struct cpl_abort_req *req = cplhdr(m); 569 570 req->cmd = CPL_ABORT_NO_RST; 571 cxgb_ofld_send(cdev, m); 572} 573 574/* 575 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are 576 * permitted to return without sending the message in case we cannot allocate 577 * an sk_buff. Returns the number of credits sent. 
578 */ 579uint32_t 580t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) 581{ 582 struct mbuf *m; 583 struct cpl_rx_data_ack *req; 584 struct toepcb *toep = tp->t_toe; 585 struct toedev *tdev = toep->tp_toedev; 586 587 m = m_gethdr_nofail(sizeof(*req)); 588 589 DPRINTF("returning %u credits to HW\n", credits); 590 591 req = mtod(m, struct cpl_rx_data_ack *); 592 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 593 req->wr.wr_lo = 0; 594 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 595 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 596 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); 597 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 598 return (credits); 599} 600 601/* 602 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. 603 * This is only used in DDP mode, so we take the opportunity to also set the 604 * DACK mode and flush any Rx credits. 605 */ 606void 607t3_send_rx_modulate(struct toepcb *toep) 608{ 609 struct mbuf *m; 610 struct cpl_rx_data_ack *req; 611 612 m = m_gethdr_nofail(sizeof(*req)); 613 614 req = mtod(m, struct cpl_rx_data_ack *); 615 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 616 req->wr.wr_lo = 0; 617 m->m_pkthdr.len = m->m_len = sizeof(*req); 618 619 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 620 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 621 V_RX_DACK_MODE(1) | 622 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); 623 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 624 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 625 toep->tp_rcv_wup = toep->tp_copied_seq; 626} 627 628/* 629 * Handle receipt of an urgent pointer. 
630 */ 631static void 632handle_urg_ptr(struct socket *so, uint32_t urg_seq) 633{ 634#ifdef URGENT_DATA_SUPPORTED 635 struct tcpcb *tp = so_sototcpcb(so); 636 637 urg_seq--; /* initially points past the urgent data, per BSD */ 638 639 if (tp->urg_data && !after(urg_seq, tp->urg_seq)) 640 return; /* duplicate pointer */ 641 sk_send_sigurg(sk); 642 if (tp->urg_seq == tp->copied_seq && tp->urg_data && 643 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { 644 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 645 646 tp->copied_seq++; 647 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) 648 tom_eat_skb(sk, skb, 0); 649 } 650 tp->urg_data = TCP_URG_NOTYET; 651 tp->urg_seq = urg_seq; 652#endif 653} 654 655/* 656 * Returns true if a socket cannot accept new Rx data. 657 */ 658static inline int 659so_no_receive(const struct socket *so) 660{ 661 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); 662} 663 664/* 665 * Process an urgent data notification. 666 */ 667static void 668rx_urg_notify(struct toepcb *toep, struct mbuf *m) 669{ 670 struct cpl_rx_urg_notify *hdr = cplhdr(m); 671 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 672 673 VALIDATE_SOCK(so); 674 675 if (!so_no_receive(so)) 676 handle_urg_ptr(so, ntohl(hdr->seq)); 677 678 m_freem(m); 679} 680 681/* 682 * Handler for RX_URG_NOTIFY CPL messages. 683 */ 684static int 685do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) 686{ 687 struct toepcb *toep = (struct toepcb *)ctx; 688 689 rx_urg_notify(toep, m); 690 return (0); 691} 692 693static __inline int 694is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) 695{ 696 return (toep->tp_ulp_mode || 697 (toep->tp_ulp_mode == ULP_MODE_TCPDDP && 698 dev->tod_ttid >= TOE_ID_CHELSIO_T3)); 699} 700 701/* 702 * Set of states for which we should return RX credits. 
703 */ 704#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 705 706/* 707 * Called after some received data has been read. It returns RX credits 708 * to the HW for the amount of data processed. 709 */ 710void 711t3_cleanup_rbuf(struct tcpcb *tp, int copied) 712{ 713 struct toepcb *toep = tp->t_toe; 714 struct socket *so; 715 struct toedev *dev; 716 int dack_mode, must_send, read; 717 u32 thres, credits, dack = 0; 718 struct sockbuf *rcv; 719 720 so = inp_inpcbtosocket(tp->t_inpcb); 721 rcv = so_sockbuf_rcv(so); 722 723 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 724 (tp->t_state == TCPS_FIN_WAIT_2))) { 725 if (copied) { 726 sockbuf_lock(rcv); 727 toep->tp_copied_seq += copied; 728 sockbuf_unlock(rcv); 729 } 730 731 return; 732 } 733 734 inp_lock_assert(tp->t_inpcb); 735 736 sockbuf_lock(rcv); 737 if (copied) 738 toep->tp_copied_seq += copied; 739 else { 740 read = toep->tp_enqueued_bytes - rcv->sb_cc; 741 toep->tp_copied_seq += read; 742 } 743 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 744 toep->tp_enqueued_bytes = rcv->sb_cc; 745 sockbuf_unlock(rcv); 746 747 if (credits > rcv->sb_mbmax) { 748 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 749 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 750 credits = rcv->sb_mbmax; 751 } 752 753 754 /* 755 * XXX this won't accurately reflect credit return - we need 756 * to look at the difference between the amount that has been 757 * put in the recv sockbuf and what is there now 758 */ 759 760 if (__predict_false(!credits)) 761 return; 762 763 dev = toep->tp_toedev; 764 thres = TOM_TUNABLE(dev, rx_credit_thres); 765 766 if (__predict_false(thres == 0)) 767 return; 768 769 if (is_delack_mode_valid(dev, toep)) { 770 dack_mode = TOM_TUNABLE(dev, delack); 771 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 772 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 773 774 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 775 dack = F_RX_DACK_CHANGE 
| 776 V_RX_DACK_MODE(dack_mode); 777 } 778 } else 779 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 780 781 /* 782 * For coalescing to work effectively ensure the receive window has 783 * at least 16KB left. 784 */ 785 must_send = credits + 16384 >= tp->rcv_wnd; 786 787 if (must_send || credits >= thres) 788 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 789} 790 791static int 792cxgb_toe_disconnect(struct tcpcb *tp) 793{ 794 struct socket *so; 795 796 DPRINTF("cxgb_toe_disconnect\n"); 797 798 so = inp_inpcbtosocket(tp->t_inpcb); 799 close_conn(so); 800 return (0); 801} 802 803static int 804cxgb_toe_reset(struct tcpcb *tp) 805{ 806 struct toepcb *toep = tp->t_toe; 807 808 t3_send_reset(toep); 809 810 /* 811 * unhook from socket 812 */ 813 tp->t_flags &= ~TF_TOE; 814 toep->tp_tp = NULL; 815 tp->t_toe = NULL; 816 return (0); 817} 818 819static int 820cxgb_toe_send(struct tcpcb *tp) 821{ 822 struct socket *so; 823 824 DPRINTF("cxgb_toe_send\n"); 825 dump_toepcb(tp->t_toe); 826 827 so = inp_inpcbtosocket(tp->t_inpcb); 828 t3_push_frames(so, 1); 829 return (0); 830} 831 832static int 833cxgb_toe_rcvd(struct tcpcb *tp) 834{ 835 836 inp_lock_assert(tp->t_inpcb); 837 838 t3_cleanup_rbuf(tp, 0); 839 840 return (0); 841} 842 843static void 844cxgb_toe_detach(struct tcpcb *tp) 845{ 846 struct toepcb *toep; 847 848 /* 849 * XXX how do we handle teardown in the SYN_SENT state? 
850 * 851 */ 852 inp_lock_assert(tp->t_inpcb); 853 toep = tp->t_toe; 854 toep->tp_tp = NULL; 855 856 /* 857 * unhook from socket 858 */ 859 tp->t_flags &= ~TF_TOE; 860 tp->t_toe = NULL; 861} 862 863 864static struct toe_usrreqs cxgb_toe_usrreqs = { 865 .tu_disconnect = cxgb_toe_disconnect, 866 .tu_reset = cxgb_toe_reset, 867 .tu_send = cxgb_toe_send, 868 .tu_rcvd = cxgb_toe_rcvd, 869 .tu_detach = cxgb_toe_detach, 870 .tu_detach = cxgb_toe_detach, 871 .tu_syncache_event = handle_syncache_event, 872}; 873 874 875static void 876__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 877 uint64_t mask, uint64_t val, int no_reply) 878{ 879 struct cpl_set_tcb_field *req; 880 881 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 882 toep->tp_tid, word, mask, val); 883 884 req = mtod(m, struct cpl_set_tcb_field *); 885 m->m_pkthdr.len = m->m_len = sizeof(*req); 886 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 887 req->wr.wr_lo = 0; 888 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 889 req->reply = V_NO_REPLY(no_reply); 890 req->cpu_idx = 0; 891 req->word = htons(word); 892 req->mask = htobe64(mask); 893 req->val = htobe64(val); 894 895 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 896 send_or_defer(toep, m, 0); 897} 898 899static void 900t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 901{ 902 struct mbuf *m; 903 struct tcpcb *tp = toep->tp_tp; 904 905 if (toep == NULL) 906 return; 907 908 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 909 printf("not seting field\n"); 910 return; 911 } 912 913 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 914 915 __set_tcb_field(toep, m, word, mask, val, 1); 916} 917 918/* 919 * Set one of the t_flags bits in the TCB. 
920 */ 921static void 922set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) 923{ 924 925 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 926} 927 928/* 929 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 930 */ 931static void 932t3_set_nagle(struct toepcb *toep) 933{ 934 struct tcpcb *tp = toep->tp_tp; 935 936 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 937} 938 939/* 940 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 941 */ 942void 943t3_set_keepalive(struct toepcb *toep, int on_off) 944{ 945 946 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); 947} 948 949void 950t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) 951{ 952 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); 953} 954 955void 956t3_set_dack_mss(struct toepcb *toep, int on_off) 957{ 958 959 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); 960} 961 962/* 963 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 964 */ 965static void 966t3_set_tos(struct toepcb *toep) 967{ 968 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); 969 970 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 971 V_TCB_TOS(tos)); 972} 973 974 975/* 976 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 977 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 978 * set the PSH bit in the last segment, which would trigger delivery.] 979 * We work around the issue by setting a DDP buffer in a partial placed state, 980 * which guarantees that TP will schedule a timer. 
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Turn DDP on or off for a connection by flipping the DDP_OFF TCB flag.
 * When disabling, also apply the partial-placement timer workaround
 * described above so TP still schedules the push timer.
 */
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
		    V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_MASK,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_VAL);

}

/*
 * Program the tag/color of DDP buffer buf_idx (0 or 1) in the TCB.
 */
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    tag_color);
}

/*
 * Program the offset and length of DDP buffer buf_idx (0 or 1) in the TCB.
 * NOTE(review): in the buf_idx == 1 arm the mask shifts the field constant
 * inside V_TCB_RX_DDP_BUF1_LEN (M_... << 32) while the value shifts the
 * whole encoded field - asymmetric with the BUF0 arm; verify against the
 * TCB layout in cxgb_tcb.h.
 */
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Look up a congestion-control algorithm by name.  Currently a stub: the
 * lookup is compiled out and the call always succeeds.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Request a copy of the connection's TCB via a GET_TCB CPL.  The reply is
 * delivered asynchronously.  Returns ENOMEM if no mbuf is available.
 */
int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	/* No TID yet in SYN_SENT: defer until the connection completes. */
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

/*
 * Register the toepcb under its TID in the TID table, taking a reference
 * that is dropped when the TID is removed.
 */
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
1091 */ 1092static unsigned int 1093find_best_mtu(const struct t3c_data *d, unsigned short mtu) 1094{ 1095 int i = 0; 1096 1097 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 1098 ++i; 1099 return (i); 1100} 1101 1102static unsigned int 1103select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 1104{ 1105 unsigned int idx; 1106 1107#ifdef notyet 1108 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; 1109#endif 1110 if (tp) { 1111 tp->t_maxseg = pmtu - 40; 1112 if (tp->t_maxseg < td->mtus[0] - 40) 1113 tp->t_maxseg = td->mtus[0] - 40; 1114 idx = find_best_mtu(td, tp->t_maxseg + 40); 1115 1116 tp->t_maxseg = td->mtus[idx] - 40; 1117 } else 1118 idx = find_best_mtu(td, pmtu); 1119 1120 return (idx); 1121} 1122 1123static inline void 1124free_atid(struct t3cdev *cdev, unsigned int tid) 1125{ 1126 struct toepcb *toep = cxgb_free_atid(cdev, tid); 1127 1128 if (toep) 1129 toepcb_release(toep); 1130} 1131 1132/* 1133 * Release resources held by an offload connection (TID, L2T entry, etc.) 
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Discard any work requests still awaiting completion. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);

	}

	/*
	 * In SYN_SENT we still hold an atid; otherwise the connection owns a
	 * real tid plus the reference taken in so_insert_tid().
	 */
	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                              // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Hook the TOE-aware socket operations and usrreqs into an offloaded
 * socket.  The toepcb must already be attached to the tcpcb.
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	/*
	 * NOTE(review): `so' is not in scope in this function, so with
	 * options VIMAGE (where INIT_VNET_INET expands to a declaration)
	 * this cannot compile -- looks like a leftover from a copied
	 * function; confirm against the VIMAGE build.
	 */
	INIT_VNET_INET(so->so_vnet);
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	INIT_VNET_INET(so->so_vnet);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;



	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accomodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
	    (uint32_t)d->rx_page_size * 23 :
	    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link the tcpcb and the toepcb. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/* DDP only if enabled, not opted out, and the window is big enough. */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	/* Receive buffer size is reported to the chip in 1KB units. */
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

/*
 * Calculate the option 2 value: selects the congestion-control flavor if
 * one has been configured via the cong_alg tunable.
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/*
 * Debug helper: sum the outstanding work-request credits recorded in
 * csum_data across the pending WR queue.
 */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Build a CPL_ACT_OPEN_REQ in @m for an active open with address tid @atid
 * using L2 table entry @e.  The 4-tuple and option words are taken from the
 * socket's inpcb/tcpcb state.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Fail an active open: release all offload resources and report @errno to
 * the stack.  Expects the inpcb write lock held on entry; the lock is
 * dropped before returning (tcp_offload_drop is called unlocked).
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	/* On T3B+ a failed open may still hold a tid; queue it for release. */
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open. XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
1548 */ 1549int 1550t3_connect(struct toedev *tdev, struct socket *so, 1551 struct rtentry *rt, struct sockaddr *nam) 1552{ 1553 struct mbuf *m; 1554 struct l2t_entry *e; 1555 struct tom_data *d = TOM_DATA(tdev); 1556 struct inpcb *inp = so_sotoinpcb(so); 1557 struct tcpcb *tp = intotcpcb(inp); 1558 struct toepcb *toep; /* allocated by init_offload_socket */ 1559 1560 int atid; 1561 1562 toep = toepcb_alloc(); 1563 if (toep == NULL) 1564 goto out_err; 1565 1566 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1567 goto out_err; 1568 1569 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1570 if (!e) 1571 goto free_tid; 1572 1573 inp_lock_assert(inp); 1574 m = m_gethdr(MT_DATA, M_WAITOK); 1575 1576#if 0 1577 m->m_toe.mt_toepcb = tp->t_toe; 1578 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1579#endif 1580 so_lock(so); 1581 1582 init_offload_socket(so, tdev, atid, e, rt, toep); 1583 1584 install_offload_ops(so); 1585 1586 mk_act_open_req(so, m, atid, e); 1587 so_unlock(so); 1588 1589 soisconnecting(so); 1590 toep = tp->t_toe; 1591 m_set_toep(m, tp->t_toe); 1592 1593 toep->tp_state = TCPS_SYN_SENT; 1594 l2t_send(d->cdev, (struct mbuf *)m, e); 1595 1596 if (toep->tp_ulp_mode) 1597 t3_enable_ddp(toep, 0); 1598 return (0); 1599 1600free_tid: 1601 printf("failing connect - free atid\n"); 1602 1603 free_atid(d->cdev, atid); 1604out_err: 1605 printf("return ENOMEM\n"); 1606 return (ENOMEM); 1607} 1608 1609/* 1610 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1611 * not send multiple ABORT_REQs for the same connection and also that we do 1612 * not try to send a message after the connection has closed. Returns 1 if 1613 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sockbuf *snd;

	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
	}

	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/*
	 * NOTE(review): so may still be NULL here (guarded below before
	 * sbflush) -- presumably so_sockbuf_snd only computes an address
	 * without dereferencing; confirm.
	 */
	snd = so_sockbuf_snd(so);
	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	/* Defer until the tid is live if the connection is still in SYN_SENT. */
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

/*
 * Handle the IPPROTO_IP portion of setsockopt for an offloaded connection.
 * Only IP_TOS is supported; the new TOS value is pushed to the hardware
 * TCB via t3_set_tos().
 */
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

/*
 * Handle the IPPROTO_TCP portion of setsockopt for an offloaded connection.
 * Supports TCP_CONGESTION (select a hardware congestion-control flavor)
 * and TCP_NODELAY (pushed to the TCB via t3_set_nagle()).
 */
static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);


		/* Only touch the hardware if the Nagle setting changed. */
		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);

	}

	return (0);
}

/*
 * ctloutput entry point for offloaded sockets: try the TOE-specific
 * handlers first, then fall back to the stock tcp_ctloutput for anything
 * they do not support.
 */
int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;		/* NOTE(review): shadowed by the inner `state' below */

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is a possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	/*
	 * Extract the DDP offset of the current buffer from the raw TCB
	 * words returned by the chip.  The two buffers' offsets live in
	 * different halves of their 64-bit TCB words.
	 */
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	/* Bytes DMAed since the last completion = offset delta. */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Tag the mbuf as DDP data and advance the receive sequence space. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

/*
 * Account for data that the chip placed directly into a DDP buffer ahead
 * of this CPL_RX_DATA message.  The hdr->seq in the CPL is the sequence
 * number the chip has DMAed up to; anything between tp->rcv_nxt and that
 * point was delivered via DDP and is appended here as a zero-copy mbuf.
 */
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);
	struct sockbuf *rcv;

	/* Nothing was DDPed ahead of this message. */
	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_lock_assert(tp->t_inpcb);
	so  = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
		rcv_nxt, tp->rcv_nxt));
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of  DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	sockbuf_unlock(rcv);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct sockbuf *rcv;
	int state;
	int len = be16toh(hdr->len);

	inp_wlock(tp->t_inpcb);

	so  = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		TRACE_EXIT;
		return;
	}

	/* Fold in any data the chip already placed via DDP. */
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
		handle_ddp_data(toep, m);

	m->m_seq = ntohl(hdr->seq);
	m->m_ulp_mode = 0;                    /* for iSCSI */

#if VALIDATE_SEQ
	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
		log(LOG_ERR,
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
		       tp->rcv_nxt);
		m_freem(m);
		inp_wunlock(tp->t_inpcb);
		return;
	}
#endif
	/* Strip the CPL header; the payload follows it. */
	m_adj(m, sizeof(*hdr));

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
	CTR2(KTR_TOM,
	    "new_rx_data: seq 0x%x len %u",
	    m->m_seq, m->m_pkthdr.len);
	inp_wunlock(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
#if 0
	if (sb_notify(rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
#endif
	SBAPPEND(rcv, m);

#ifdef notyet
	/*
	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
	 *
	 */
	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, rcv->sb_cc, rcv->sb_mbmax));
#endif


	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
	    rcv->sb_cc, rcv->sb_mbcnt);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process an RX_DATA_DDP message: data was placed directly into one of the
 * two DDP buffers; build a zero-copy mbuf describing the placed region and
 * append it to the receive buffer.
 */
static void
new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	struct socket *so;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
	int nomoredata = 0;
	unsigned int delack_mode;
	struct sockbuf *rcv;

	tp = toep->tp_tp;
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {

		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		return;
	}

	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	CTR4(KTR_TOM,
	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
	    "hdr seq 0x%x len %u",
	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
	    ntohs(hdr->len));
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	tp->t_rcvtime = ticks;
	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
	inp_wunlock(tp->t_inpcb);
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
	    m->m_len, rcv_nxt, m->m_seq);
	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
	m->m_cur_offset = end_offset - m->m_pkthdr.len;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	bsp->cur_offset = end_offset;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;

	/*
	 * Length is only meaningful for kbuf
	 */
	if (!(bsp->flags & DDP_BF_NOCOPY))
		KASSERT(m->m_len <= bsp->gl->dgl_length,
		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
			m->m_len, bsp->gl->dgl_length));

	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
		panic("spurious ddp completion");
	} else {
		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_ddp_flags |= DDP_BF_PSH;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
#endif
	SBAPPEND(rcv, m);

	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
		   (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
		       || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/* Any of these DDP error conditions invalidates the placed data. */
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(toep, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE message: a DDP buffer has been filled (or
 * terminated); synthesize a zero-copy mbuf covering the data placed since
 * the last completion and append it to the receive buffer.
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when, delack_mode;
	int nomoredata = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		struct inpcb *inp = so_sotoinpcb(so);

		handle_excess_rx(toep, m);
		inp_wunlock(inp);
		return;
	}
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	m->m_pkthdr.csum_data = tp->rcv_nxt;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[buf_idx];
	when = bsp->cur_offset;
	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	tp->rcv_nxt += m->m_len;
	tp->t_rcvtime = ticks;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	inp_wunlock(tp->t_inpcb);

	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR5(KTR_TOM,
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), m->m_len);

	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP)) {
		q->cur_buf ^= 1;                     /* flip buffers */
		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata=1;
	}

	CTR4(KTR_TOM,
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

	SBAPPEND(rcv, m);
	if ((so_state_get(so) & SS_NOFDREF) == 0)
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.
We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	inp_wlock(tp->t_inpcb);
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	inp_wunlock(tp->t_inpcb);
	/* Hand the connection over to the stack's TIME_WAIT processing. */
	tcp_offload_twstart(tp);
}

/*
 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
 * function deals with the data that may be reported along with the FIN.
 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
 * skb can be freed.
 *
 * NOTE(review): the implementation below only ever returns 0 or 1; the -1
 * case described above does not occur in this version.
 */
static int
handle_peer_close_data(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_peer_close *req = cplhdr(m);
	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
	struct sockbuf *rcv;

	if (tp->rcv_nxt == rcv_nxt)			/* no data */
		return (0);

	CTR0(KTR_TOM, "handle_peer_close_data");
	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);

		/*
		 * Although we discard the data we want to process the FIN so
		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
		 * may be what will close the connection.  We return 1 because
		 * handle_excess_rx() already freed the packet.
		 */
		return (1);
	}

	inp_lock_assert(tp->t_inpcb);
	q = &toep->tp_ddp_state;
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[q->cur_buf];
	/* Amount of DDP data that arrived ahead of the FIN. */
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	/* Bit 0 marks the DDP buffer complete; other code relies on bit 0. */
	m->m_ddp_flags =
	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;	/* flip buffers */
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	tp->t_rcvtime = ticks;
	SBAPPEND(rcv, m);
	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);

	return (1);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct toepcb *toep, struct mbuf *m)
{
	struct socket *so;
	struct tcpcb *tp = toep->tp_tp;
	int keep, action;

	action = keep = 0;
	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
		/* May queue m on the socket; keep != 0 means don't free it. */
		keep = handle_peer_close_data(so, m);
		if (keep < 0) {
			inp_wunlock(tp->t_inpcb);
			return;
		}
	}
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		CTR1(KTR_TOM,
		    "waking up waiters for cantrcvmore on %p ", so);
		socantrcvmore(so);

		/*
		 * If connection is half-synchronized
		 * (ie NEEDSYN flag on) then delay ACK,
		 * so it may be piggybacked when SYN is sent.
2553 * Otherwise, since we received a FIN then no 2554 * more input can be expected, send ACK now. 2555 */ 2556 if (tp->t_flags & TF_NEEDSYN) 2557 tp->t_flags |= TF_DELACK; 2558 else 2559 tp->t_flags |= TF_ACKNOW; 2560 tp->rcv_nxt++; 2561 } 2562 2563 switch (tp->t_state) { 2564 case TCPS_SYN_RECEIVED: 2565 tp->t_starttime = ticks; 2566 /* FALLTHROUGH */ 2567 case TCPS_ESTABLISHED: 2568 tp->t_state = TCPS_CLOSE_WAIT; 2569 break; 2570 case TCPS_FIN_WAIT_1: 2571 tp->t_state = TCPS_CLOSING; 2572 break; 2573 case TCPS_FIN_WAIT_2: 2574 /* 2575 * If we've sent an abort_req we must have sent it too late, 2576 * HW will send us a reply telling us so, and this peer_close 2577 * is really the last message for this connection and needs to 2578 * be treated as an abort_rpl, i.e., transition the connection 2579 * to TCP_CLOSE (note that the host stack does this at the 2580 * time of generating the RST but we must wait for HW). 2581 * Otherwise we enter TIME_WAIT. 2582 */ 2583 t3_release_offload_resources(toep); 2584 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2585 action = TCP_CLOSE; 2586 } else { 2587 action = TCP_TIMEWAIT; 2588 } 2589 break; 2590 default: 2591 log(LOG_ERR, 2592 "%s: TID %u received PEER_CLOSE in bad state %d\n", 2593 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); 2594 } 2595 inp_wunlock(tp->t_inpcb); 2596 2597 if (action == TCP_TIMEWAIT) { 2598 enter_timewait(tp); 2599 } else if (action == TCP_DROP) { 2600 tcp_offload_drop(tp, 0); 2601 } else if (action == TCP_CLOSE) { 2602 tcp_offload_close(tp); 2603 } 2604 2605#ifdef notyet 2606 /* Do not send POLL_HUP for half duplex close. */ 2607 if ((sk->sk_shutdown & SEND_SHUTDOWN) || 2608 sk->sk_state == TCP_CLOSE) 2609 sk_wake_async(so, 1, POLL_HUP); 2610 else 2611 sk_wake_async(so, 1, POLL_IN); 2612#endif 2613 2614out: 2615 if (!keep) 2616 m_free(m); 2617} 2618 2619/* 2620 * Handler for PEER_CLOSE CPL messages. 
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	/* do_peer_fin() takes ownership of m. */
	do_peer_fin(toep, m);
	return (0);
}

/*
 * Process a CPL_CLOSE_CON_RPL message: update snd_una from the reply and
 * advance the close-side state machine.  The decided action (close, drop,
 * or TIME_WAIT entry) is carried out after the inpcb lock is released.
 */
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int action = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		/* An abort is pending; let the abort path finish the close. */
		inp_wunlock(tp->t_inpcb);
		goto out;
	}

	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));

	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;

		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		action = TCP_CLOSE;
		break;
	case TCPS_FIN_WAIT_1:
		/*
		 * If we can't receive any more
		 * data, then closing user can proceed.
		 * Starting the timer is contrary to the
		 * specification, but if we don't get a FIN
		 * we'll hang forever.
		 *
		 * XXXjl:
		 * we should release the tp also, and use a
		 * compressed state.
2685 */ 2686 if (so) 2687 rcv = so_sockbuf_rcv(so); 2688 else 2689 break; 2690 2691 if (rcv->sb_state & SBS_CANTRCVMORE) { 2692 int timeout; 2693 2694 if (so) 2695 soisdisconnected(so); 2696 timeout = (tcp_fast_finwait2_recycle) ? 2697 tcp_finwait2_timeout : tcp_maxidle; 2698 tcp_timer_activate(tp, TT_2MSL, timeout); 2699 } 2700 tp->t_state = TCPS_FIN_WAIT_2; 2701 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && 2702 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { 2703 action = TCP_DROP; 2704 } 2705 2706 break; 2707 default: 2708 log(LOG_ERR, 2709 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", 2710 toep->tp_toedev->tod_name, toep->tp_tid, 2711 tp->t_state); 2712 } 2713 inp_wunlock(tp->t_inpcb); 2714 2715 2716 if (action == TCP_TIMEWAIT) { 2717 enter_timewait(tp); 2718 } else if (action == TCP_DROP) { 2719 tcp_offload_drop(tp, 0); 2720 } else if (action == TCP_CLOSE) { 2721 tcp_offload_close(tp); 2722 } 2723out: 2724 m_freem(m); 2725} 2726 2727/* 2728 * Handler for CLOSE_CON_RPL CPL messages. 2729 */ 2730static int 2731do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, 2732 void *ctx) 2733{ 2734 struct toepcb *toep = (struct toepcb *)ctx; 2735 2736 process_close_con_rpl(toep, m); 2737 return (0); 2738} 2739 2740/* 2741 * Process abort replies. We only process these messages if we anticipate 2742 * them as the coordination between SW and HW in this area is somewhat lacking 2743 * and sometimes we get ABORT_RPLs after we are done with the connection that 2744 * originated the ABORT_REQ. 
 */
static void
process_abort_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int needclose = 0;	/* defer tcp_offload_close() past the unlock */

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(sk),
		  "process_abort_rpl: GTS rpl pending %d",
		  sock_flag(sk, ABORT_RPL_PENDING));
#endif

	inp_wlock(tp->t_inpcb);
	/* NOTE(review): so is assigned but not otherwise used below. */
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		/*
		 * XXX panic on tcpdrop
		 */
		/*
		 * NOTE(review): on non-T3A hardware the first reply only
		 * records TP_ABORT_RPL_RCVD; resources are released when a
		 * subsequent reply arrives.  Presumably two ABORT_RPLs are
		 * expected in that case -- confirm against firmware docs.
		 */
		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
			toep->tp_flags |= TP_ABORT_RPL_RCVD;
		else {
			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
			    !is_t3a(toep->tp_toedev)) {
				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
					panic("TP_ABORT_REQ_RCVD set");
				t3_release_offload_resources(toep);
				needclose = 1;
			}
		}
	}
	inp_wunlock(tp->t_inpcb);

	if (needclose)
		tcp_offload_close(tp);

	m_free(m);
}

/*
 * Handle an ABORT_RPL_RSS CPL message.
 */
static int
do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
	struct toepcb *toep;

	/*
	 * Ignore replies to post-close aborts indicating that the abort was
	 * requested too late.  These connections are terminated when we get
	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
	 * arrives the TID is either no longer used or it has been recycled.
2801 */ 2802 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2803discard: 2804 m_free(m); 2805 return (0); 2806 } 2807 2808 toep = (struct toepcb *)ctx; 2809 2810 /* 2811 * Sometimes we've already closed the socket, e.g., a post-close 2812 * abort races with ABORT_REQ_RSS, the latter frees the socket 2813 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2814 * but FW turns the ABORT_REQ into a regular one and so we get 2815 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2816 */ 2817 if (!toep) 2818 goto discard; 2819 2820 if (toep->tp_tp == NULL) { 2821 log(LOG_NOTICE, "removing tid for abort\n"); 2822 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2823 if (toep->tp_l2t) 2824 l2t_release(L2DATA(cdev), toep->tp_l2t); 2825 2826 toepcb_release(toep); 2827 goto discard; 2828 } 2829 2830 log(LOG_NOTICE, "toep=%p\n", toep); 2831 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); 2832 2833 toepcb_hold(toep); 2834 process_abort_rpl(toep, m); 2835 toepcb_release(toep); 2836 return (0); 2837} 2838 2839/* 2840 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also 2841 * indicate whether RST should be sent in response. 2842 */ 2843static int 2844abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2845{ 2846 struct tcpcb *tp = so_sototcpcb(so); 2847 2848 switch (abort_reason) { 2849 case CPL_ERR_BAD_SYN: 2850#if 0 2851 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2852#endif 2853 case CPL_ERR_CONN_RESET: 2854 // XXX need to handle SYN_RECV due to crossed SYNs 2855 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2856 case CPL_ERR_XMIT_TIMEDOUT: 2857 case CPL_ERR_PERSIST_TIMEDOUT: 2858 case CPL_ERR_FINWAIT2_TIMEDOUT: 2859 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2860#if 0 2861 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2862#endif 2863 return (ETIMEDOUT); 2864 default: 2865 return (EIO); 2866 } 2867} 2868 2869static inline void 2870set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2871{ 2872 struct cpl_abort_rpl *rpl = cplhdr(m); 2873 2874 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2875 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2876 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2877 2878 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2879 rpl->cmd = cmd; 2880} 2881 2882static void 2883send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2884{ 2885 struct mbuf *reply_mbuf; 2886 struct cpl_abort_req_rss *req = cplhdr(m); 2887 2888 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2889 m_set_priority(m, CPL_PRIORITY_DATA); 2890 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2891 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2892 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2893 m_free(m); 2894} 2895 2896/* 2897 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2898 */ 2899static inline int 2900is_neg_adv_abort(unsigned int status) 2901{ 2902 return status == CPL_ERR_RTX_NEG_ADVICE || 2903 status == CPL_ERR_PERSIST_NEG_ADVICE; 2904} 2905 2906static void 2907send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2908{ 2909 struct mbuf *reply_mbuf; 2910 struct cpl_abort_req_rss *req = cplhdr(m); 2911 2912 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2913 2914 if (!reply_mbuf) { 2915 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2916 req->status = rst_status; 2917 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2918 return; 2919 } 2920 2921 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2922 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2923 m_free(m); 2924 2925 /* 2926 * XXX need to sync with ARP as for SYN_RECV connections we can send 2927 * these messages while ARP is pending. For other connection states 2928 * it's not a problem. 2929 */ 2930 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2931} 2932 2933#ifdef notyet 2934static void 2935cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2936{ 2937 CXGB_UNIMPLEMENTED(); 2938#ifdef notyet 2939 struct request_sock *req = child->sk_user_data; 2940 2941 inet_csk_reqsk_queue_removed(parent, req); 2942 synq_remove(tcp_sk(child)); 2943 __reqsk_free(req); 2944 child->sk_user_data = NULL; 2945#endif 2946} 2947 2948 2949/* 2950 * Performs the actual work to abort a SYN_RECV connection. 2951 */ 2952static void 2953do_abort_syn_rcv(struct socket *child, struct socket *parent) 2954{ 2955 struct tcpcb *parenttp = so_sototcpcb(parent); 2956 struct tcpcb *childtp = so_sototcpcb(child); 2957 2958 /* 2959 * If the server is still open we clean up the child connection, 2960 * otherwise the server already did the clean up as it was purging 2961 * its SYN queue and the skb was just sitting in its backlog. 2962 */ 2963 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2964 cleanup_syn_rcv_conn(child, parent); 2965 inp_wlock(childtp->t_inpcb); 2966 t3_release_offload_resources(childtp->t_toe); 2967 inp_wunlock(childtp->t_inpcb); 2968 tcp_offload_close(childtp); 2969 } 2970} 2971#endif 2972 2973/* 2974 * Handle abort requests for a SYN_RECV connection. These need extra work 2975 * because the socket is on its parent's SYN queue. 
2976 */ 2977static int 2978abort_syn_rcv(struct socket *so, struct mbuf *m) 2979{ 2980 CXGB_UNIMPLEMENTED(); 2981#ifdef notyet 2982 struct socket *parent; 2983 struct toedev *tdev = toep->tp_toedev; 2984 struct t3cdev *cdev = TOM_DATA(tdev)->cdev; 2985 struct socket *oreq = so->so_incomp; 2986 struct t3c_tid_entry *t3c_stid; 2987 struct tid_info *t; 2988 2989 if (!oreq) 2990 return -1; /* somehow we are not on the SYN queue */ 2991 2992 t = &(T3C_DATA(cdev))->tid_maps; 2993 t3c_stid = lookup_stid(t, oreq->ts_recent); 2994 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 2995 2996 so_lock(parent); 2997 do_abort_syn_rcv(so, parent); 2998 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); 2999 so_unlock(parent); 3000#endif 3001 return (0); 3002} 3003 3004/* 3005 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this 3006 * request except that we need to reply to it. 3007 */ 3008static void 3009process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) 3010{ 3011 int rst_status = CPL_ABORT_NO_RST; 3012 const struct cpl_abort_req_rss *req = cplhdr(m); 3013 struct tcpcb *tp = toep->tp_tp; 3014 struct socket *so; 3015 int needclose = 0; 3016 3017 inp_wlock(tp->t_inpcb); 3018 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 3019 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { 3020 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); 3021 m_free(m); 3022 goto skip; 3023 } 3024 3025 toep->tp_flags &= ~TP_ABORT_REQ_RCVD; 3026 /* 3027 * Three cases to consider: 3028 * a) We haven't sent an abort_req; close the connection. 3029 * b) We have sent a post-close abort_req that will get to TP too late 3030 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will 3031 * be ignored and the connection should be closed now. 3032 * c) We have sent a regular abort_req that will get to TP too late. 3033 * That will generate an abort_rpl with status 0, wait for it. 
3034 */ 3035 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || 3036 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { 3037 int error; 3038 3039 error = abort_status_to_errno(so, req->status, 3040 &rst_status); 3041 so_error_set(so, error); 3042 3043 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 3044 so_sorwakeup(so); 3045 /* 3046 * SYN_RECV needs special processing. If abort_syn_rcv() 3047 * returns 0 is has taken care of the abort. 3048 */ 3049 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) 3050 goto skip; 3051 3052 t3_release_offload_resources(toep); 3053 needclose = 1; 3054 } 3055 inp_wunlock(tp->t_inpcb); 3056 3057 if (needclose) 3058 tcp_offload_close(tp); 3059 3060 send_abort_rpl(m, tdev, rst_status); 3061 return; 3062skip: 3063 inp_wunlock(tp->t_inpcb); 3064} 3065 3066/* 3067 * Handle an ABORT_REQ_RSS CPL message. 3068 */ 3069static int 3070do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3071{ 3072 const struct cpl_abort_req_rss *req = cplhdr(m); 3073 struct toepcb *toep = (struct toepcb *)ctx; 3074 3075 if (is_neg_adv_abort(req->status)) { 3076 m_free(m); 3077 return (0); 3078 } 3079 3080 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); 3081 3082 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { 3083 cxgb_remove_tid(cdev, toep, toep->tp_tid); 3084 toep->tp_flags |= TP_ABORT_REQ_RCVD; 3085 3086 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); 3087 if (toep->tp_l2t) 3088 l2t_release(L2DATA(cdev), toep->tp_l2t); 3089 3090 /* 3091 * Unhook 3092 */ 3093 toep->tp_tp->t_toe = NULL; 3094 toep->tp_tp->t_flags &= ~TF_TOE; 3095 toep->tp_tp = NULL; 3096 /* 3097 * XXX need to call syncache_chkrst - but we don't 3098 * have a way of doing that yet 3099 */ 3100 toepcb_release(toep); 3101 log(LOG_ERR, "abort for unestablished connection :-(\n"); 3102 return (0); 3103 } 3104 if (toep->tp_tp == NULL) { 3105 log(LOG_NOTICE, "disconnected toepcb\n"); 3106 /* should be freed 
momentarily */ 3107 return (0); 3108 } 3109 3110 3111 toepcb_hold(toep); 3112 process_abort_req(toep, m, toep->tp_toedev); 3113 toepcb_release(toep); 3114 return (0); 3115} 3116#ifdef notyet 3117static void 3118pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) 3119{ 3120 struct toedev *tdev = TOE_DEV(parent); 3121 3122 do_abort_syn_rcv(child, parent); 3123 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { 3124 struct cpl_pass_accept_rpl *rpl = cplhdr(m); 3125 3126 rpl->opt0h = htonl(F_TCAM_BYPASS); 3127 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3128 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 3129 } else 3130 m_free(m); 3131} 3132#endif 3133static void 3134handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) 3135{ 3136 CXGB_UNIMPLEMENTED(); 3137 3138#ifdef notyet 3139 struct t3cdev *cdev; 3140 struct socket *parent; 3141 struct socket *oreq; 3142 struct t3c_tid_entry *t3c_stid; 3143 struct tid_info *t; 3144 struct tcpcb *otp, *tp = so_sototcpcb(so); 3145 struct toepcb *toep = tp->t_toe; 3146 3147 /* 3148 * If the connection is being aborted due to the parent listening 3149 * socket going away there's nothing to do, the ABORT_REQ will close 3150 * the connection. 3151 */ 3152 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 3153 m_free(m); 3154 return; 3155 } 3156 3157 oreq = so->so_incomp; 3158 otp = so_sototcpcb(oreq); 3159 3160 cdev = T3C_DEV(so); 3161 t = &(T3C_DATA(cdev))->tid_maps; 3162 t3c_stid = lookup_stid(t, otp->ts_recent); 3163 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 3164 3165 so_lock(parent); 3166 pass_open_abort(so, parent, m); 3167 so_unlock(parent); 3168#endif 3169} 3170 3171/* 3172 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly 3173 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV 3174 * connection. 
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
	/* Tear down the embryonic connection attached to this mbuf's socket. */
	handle_pass_open_arp_failure(m_get_socket(m), m);
}

/*
 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
 */
static void
mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
{
	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
	unsigned int tid = GET_TID(req);

	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->peer_ip = req->peer_ip;	// req->peer_ip not overwritten yet
	rpl->opt0h = htonl(F_TCAM_BYPASS);
	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
	rpl->opt2 = 0;
	rpl->rsvd = rpl->opt2;			/* workaround for HW bug */
}

/*
 * Send a deferred reject to an accept request.
3209 */ 3210static void 3211reject_pass_request(struct toedev *tdev, struct mbuf *m) 3212{ 3213 struct mbuf *reply_mbuf; 3214 3215 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3216 mk_pass_accept_rpl(reply_mbuf, m); 3217 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3218 m_free(m); 3219} 3220 3221static void 3222handle_syncache_event(int event, void *arg) 3223{ 3224 struct toepcb *toep = arg; 3225 3226 switch (event) { 3227 case TOE_SC_ENTRY_PRESENT: 3228 /* 3229 * entry already exists - free toepcb 3230 * and l2t 3231 */ 3232 printf("syncache entry present\n"); 3233 toepcb_release(toep); 3234 break; 3235 case TOE_SC_DROP: 3236 /* 3237 * The syncache has given up on this entry 3238 * either it timed out, or it was evicted 3239 * we need to explicitly release the tid 3240 */ 3241 printf("syncache entry dropped\n"); 3242 toepcb_release(toep); 3243 break; 3244 default: 3245 log(LOG_ERR, "unknown syncache event %d\n", event); 3246 break; 3247 } 3248} 3249 3250static void 3251syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3252{ 3253 struct in_conninfo inc; 3254 struct tcpopt to; 3255 struct tcphdr th; 3256 struct inpcb *inp; 3257 int mss, wsf, sack, ts; 3258 uint32_t rcv_isn = ntohl(req->rcv_isn); 3259 3260 bzero(&to, sizeof(struct tcpopt)); 3261 inp = so_sotoinpcb(lso); 3262 3263 /* 3264 * Fill out information for entering us into the syncache 3265 */ 3266 bzero(&inc, sizeof(inc)); 3267 inc.inc_fport = th.th_sport = req->peer_port; 3268 inc.inc_lport = th.th_dport = req->local_port; 3269 th.th_seq = req->rcv_isn; 3270 th.th_flags = TH_SYN; 3271 3272 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3273 3274 3275 inc.inc_isipv6 = 0; 3276 inc.inc_len = 0; 3277 inc.inc_faddr.s_addr = req->peer_ip; 3278 inc.inc_laddr.s_addr = req->local_ip; 3279 3280 DPRINTF("syncache add of %d:%d %d:%d\n", 3281 ntohl(req->local_ip), ntohs(req->local_port), 3282 
ntohl(req->peer_ip), ntohs(req->peer_port)); 3283 3284 mss = req->tcp_options.mss; 3285 wsf = req->tcp_options.wsf; 3286 ts = req->tcp_options.tstamp; 3287 sack = req->tcp_options.sack; 3288 to.to_mss = mss; 3289 to.to_wscale = wsf; 3290 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3291 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 3292} 3293 3294 3295/* 3296 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 3297 * lock held. Note that the sock here is a listening socket that is not owned 3298 * by the TOE. 3299 */ 3300static void 3301process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 3302 struct listen_ctx *lctx) 3303{ 3304 int rt_flags; 3305 struct l2t_entry *e; 3306 struct iff_mac tim; 3307 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 3308 struct cpl_pass_accept_rpl *rpl; 3309 struct cpl_pass_accept_req *req = cplhdr(m); 3310 unsigned int tid = GET_TID(req); 3311 struct tom_data *d = TOM_DATA(tdev); 3312 struct t3cdev *cdev = d->cdev; 3313 struct tcpcb *tp = so_sototcpcb(so); 3314 struct toepcb *newtoep; 3315 struct rtentry *dst; 3316 struct sockaddr_in nam; 3317 struct t3c_data *td = T3C_DATA(cdev); 3318 3319 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3320 if (__predict_false(reply_mbuf == NULL)) { 3321 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3322 t3_defer_reply(m, tdev, reject_pass_request); 3323 else { 3324 cxgb_queue_tid_release(cdev, tid); 3325 m_free(m); 3326 } 3327 DPRINTF("failed to get reply_mbuf\n"); 3328 3329 goto out; 3330 } 3331 3332 if (tp->t_state != TCPS_LISTEN) { 3333 DPRINTF("socket not in listen state\n"); 3334 3335 goto reject; 3336 } 3337 3338 tim.mac_addr = req->dst_mac; 3339 tim.vlan_tag = ntohs(req->vlan_tag); 3340 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 3341 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 3342 goto reject; 3343 } 3344 3345#ifdef notyet 3346 /* 3347 
* XXX do route lookup to confirm that we're still listening on this 3348 * address 3349 */ 3350 if (ip_route_input(skb, req->local_ip, req->peer_ip, 3351 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 3352 goto reject; 3353 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 3354 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 3355 dst_release(skb->dst); // done with the input route, release it 3356 skb->dst = NULL; 3357 3358 if ((rt_flags & RTF_LOCAL) == 0) 3359 goto reject; 3360#endif 3361 /* 3362 * XXX 3363 */ 3364 rt_flags = RTF_LOCAL; 3365 if ((rt_flags & RTF_LOCAL) == 0) 3366 goto reject; 3367 3368 /* 3369 * Calculate values and add to syncache 3370 */ 3371 3372 newtoep = toepcb_alloc(); 3373 if (newtoep == NULL) 3374 goto reject; 3375 3376 bzero(&nam, sizeof(struct sockaddr_in)); 3377 3378 nam.sin_len = sizeof(struct sockaddr_in); 3379 nam.sin_family = AF_INET; 3380 nam.sin_addr.s_addr =req->peer_ip; 3381 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 3382 3383 if (dst == NULL) { 3384 printf("failed to find route\n"); 3385 goto reject; 3386 } 3387 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 3388 (struct sockaddr *)&nam); 3389 if (e == NULL) { 3390 DPRINTF("failed to get l2t\n"); 3391 } 3392 /* 3393 * Point to our listen socket until accept 3394 */ 3395 newtoep->tp_tp = tp; 3396 newtoep->tp_flags = TP_SYN_RCVD; 3397 newtoep->tp_tid = tid; 3398 newtoep->tp_toedev = tdev; 3399 tp->rcv_wnd = select_rcv_wnd(tdev, so); 3400 3401 cxgb_insert_tid(cdev, d->client, newtoep, tid); 3402 so_lock(so); 3403 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 3404 so_unlock(so); 3405 3406 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && 3407 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; 3408 3409 if (newtoep->tp_ulp_mode) { 3410 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3411 3412 if (ddp_mbuf == NULL) 3413 newtoep->tp_ulp_mode = 0; 3414 } 3415 3416 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", 3417 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); 3418 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 3419 /* 3420 * XXX workaround for lack of syncache drop 3421 */ 3422 toepcb_hold(newtoep); 3423 syncache_add_accept_req(req, so, newtoep); 3424 3425 rpl = cplhdr(reply_mbuf); 3426 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 3427 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3428 rpl->wr.wr_lo = 0; 3429 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3430 rpl->opt2 = htonl(calc_opt2(so, tdev)); 3431 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3432 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 3433 3434 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 3435 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 3436 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | 3437 CPL_PASS_OPEN_ACCEPT); 3438 3439 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 3440 3441 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); 3442 3443 l2t_send(cdev, reply_mbuf, e); 3444 m_free(m); 3445 if (newtoep->tp_ulp_mode) { 3446 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 3447 V_TF_DDP_OFF(1) | 3448 TP_DDP_TIMER_WORKAROUND_MASK, 3449 V_TF_DDP_OFF(1) | 3450 TP_DDP_TIMER_WORKAROUND_VAL, 1); 3451 } else 3452 printf("not offloading\n"); 3453 3454 3455 3456 return; 3457reject: 3458 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3459 mk_pass_accept_rpl(reply_mbuf, m); 3460 else 3461 mk_tid_release(reply_mbuf, newtoep, tid); 3462 cxgb_ofld_send(cdev, reply_mbuf); 3463 m_free(m); 3464out: 3465#if 0 3466 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3467#else 3468 return; 3469#endif 3470} 3471 
/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 *
 * Top-level CPL handler: recovers the listen context registered for the
 * server TID and hands the request to process_pass_accept_req().  Always
 * returns 0 (the mbuf is consumed downstream).
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

	/*
	 * NOTE(review): the block below is inherited Linux code (printk,
	 * unlikely, lsk) and cannot compile on FreeBSD as written; it is
	 * kept only as a reference for the intended TID validation.
	 */
#if VALIDATE_TID
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
3520 */ 3521static void 3522assign_rxopt(struct socket *so, unsigned int opt) 3523{ 3524 struct tcpcb *tp = so_sototcpcb(so); 3525 struct toepcb *toep = tp->t_toe; 3526 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); 3527 3528 inp_lock_assert(tp->t_inpcb); 3529 3530 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3531 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; 3532 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; 3533 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; 3534 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3535 (TF_RCVD_SCALE|TF_REQ_SCALE)) 3536 tp->rcv_scale = tp->request_r_scale; 3537} 3538 3539/* 3540 * Completes some final bits of initialization for just established connections 3541 * and changes their state to TCP_ESTABLISHED. 3542 * 3543 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 3544 */ 3545static void 3546make_established(struct socket *so, u32 snd_isn, unsigned int opt) 3547{ 3548 struct tcpcb *tp = so_sototcpcb(so); 3549 struct toepcb *toep = tp->t_toe; 3550 3551 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; 3552 assign_rxopt(so, opt); 3553 3554 /* 3555 *XXXXXXXXXXX 3556 * 3557 */ 3558#ifdef notyet 3559 so->so_proto->pr_ctloutput = t3_ctloutput; 3560#endif 3561 3562#if 0 3563 inet_sk(sk)->id = tp->write_seq ^ jiffies; 3564#endif 3565 /* 3566 * XXX not clear what rcv_wup maps to 3567 */ 3568 /* 3569 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't 3570 * pass through opt0. 
3571 */ 3572 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) 3573 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); 3574 3575 dump_toepcb(toep); 3576 3577#ifdef notyet 3578/* 3579 * no clean interface for marking ARP up to date 3580 */ 3581 dst_confirm(sk->sk_dst_cache); 3582#endif 3583 tp->t_starttime = ticks; 3584 tp->t_state = TCPS_ESTABLISHED; 3585 soisconnected(so); 3586} 3587 3588static int 3589syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) 3590{ 3591 3592 struct in_conninfo inc; 3593 struct tcpopt to; 3594 struct tcphdr th; 3595 int mss, wsf, sack, ts; 3596 struct mbuf *m = NULL; 3597 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); 3598 unsigned int opt; 3599 3600#ifdef MAC 3601#error "no MAC support" 3602#endif 3603 3604 opt = ntohs(req->tcp_opt); 3605 3606 bzero(&to, sizeof(struct tcpopt)); 3607 3608 /* 3609 * Fill out information for entering us into the syncache 3610 */ 3611 bzero(&inc, sizeof(inc)); 3612 inc.inc_fport = th.th_sport = req->peer_port; 3613 inc.inc_lport = th.th_dport = req->local_port; 3614 th.th_seq = req->rcv_isn; 3615 th.th_flags = TH_ACK; 3616 3617 inc.inc_isipv6 = 0; 3618 inc.inc_len = 0; 3619 inc.inc_faddr.s_addr = req->peer_ip; 3620 inc.inc_laddr.s_addr = req->local_ip; 3621 3622 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3623 wsf = G_TCPOPT_WSCALE_OK(opt); 3624 ts = G_TCPOPT_TSTAMP(opt); 3625 sack = G_TCPOPT_SACK(opt); 3626 3627 to.to_mss = mss; 3628 to.to_wscale = G_TCPOPT_SND_WSCALE(opt); 3629 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3630 3631 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", 3632 ntohl(req->local_ip), ntohs(req->local_port), 3633 ntohl(req->peer_ip), ntohs(req->peer_port), 3634 mss, wsf, ts, sack); 3635 return tcp_offload_syncache_expand(&inc, &to, &th, so, m); 3636} 3637 3638 3639/* 3640 * Process a CPL_PASS_ESTABLISH message. 
 * XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;	/* still the LISTEN socket's tcpcb here */
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;
	
	// Complete socket initialization now that we have the SND_ISN
	
	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Take ourselves off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);
	
	/* Expand the syncache entry into the new child socket (in 'so'). */
	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/*
	 * From here on tp is the CHILD connection's tcpcb; rebind the toepcb
	 * from the listener to the child.
	 */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* The TOE owns segmentation; the stack must not coalesce buffers. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);
	
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
	
	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);	/* drops the ref taken in process_pass_accept_req() */
	inp_wunlock(tp->t_inpcb);
	
	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 *
 * Messages queued before the connection had a TID (active open) are patched
 * with the now-known TID and flushed to the hardware.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");
	
	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	INIT_VNET_INET(so->so_vnet);
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);
	
	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
	
	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	V_tcpstat.tcps_connects++;
				
}

/*
 * Process a CPL_ACT_ESTABLISH message.
 *
 * Completes an active open: swaps the provisional atid for the real tid the
 * hardware assigned, then finishes socket setup via socket_act_establish().
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so; 
	struct toedev *tdev;
	struct tom_data *d;
	
	if (tp == NULL) {
		/* Connection already torn down; just reclaim the atid. */
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3892 */ 3893static void 3894wr_ack(struct toepcb *toep, struct mbuf *m) 3895{ 3896 struct tcpcb *tp = toep->tp_tp; 3897 struct cpl_wr_ack *hdr = cplhdr(m); 3898 struct socket *so; 3899 unsigned int credits = ntohs(hdr->credits); 3900 u32 snd_una = ntohl(hdr->snd_una); 3901 int bytes = 0; 3902 struct sockbuf *snd; 3903 3904 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3905 3906 inp_wlock(tp->t_inpcb); 3907 so = inp_inpcbtosocket(tp->t_inpcb); 3908 toep->tp_wr_avail += credits; 3909 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3910 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3911 3912 while (credits) { 3913 struct mbuf *p = peek_wr(toep); 3914 3915 if (__predict_false(!p)) { 3916 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3917 "nothing pending, state %u wr_avail=%u\n", 3918 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3919 break; 3920 } 3921 CTR2(KTR_TOM, 3922 "wr_ack: p->credits=%d p->bytes=%d", 3923 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3924 KASSERT(p->m_pkthdr.csum_data != 0, 3925 ("empty request still on list")); 3926 3927 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3928 3929#if DEBUG_WR > 1 3930 struct tx_data_wr *w = cplhdr(p); 3931 log(LOG_ERR, 3932 "TID %u got %u WR credits, need %u, len %u, " 3933 "main body %u, frags %u, seq # %u, ACK una %u," 3934 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3935 toep->tp_tid, credits, p->csum, p->len, 3936 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3937 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3938 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3939#endif 3940 p->m_pkthdr.csum_data -= credits; 3941 break; 3942 } else { 3943 dequeue_wr(toep); 3944 credits -= p->m_pkthdr.csum_data; 3945 bytes += p->m_pkthdr.len; 3946 CTR3(KTR_TOM, 3947 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3948 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3949 3950 m_free(p); 3951 } 3952 } 3953 3954#if DEBUG_WR 3955 
check_wr_invariants(tp); 3956#endif 3957 3958 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3959#if VALIDATE_SEQ 3960 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3961 3962 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3963 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3964 toep->tp_tid, tp->snd_una); 3965#endif 3966 goto out_free; 3967 } 3968 3969 if (tp->snd_una != snd_una) { 3970 tp->snd_una = snd_una; 3971 tp->ts_recent_age = ticks; 3972#ifdef notyet 3973 /* 3974 * Keep ARP entry "minty fresh" 3975 */ 3976 dst_confirm(sk->sk_dst_cache); 3977#endif 3978 if (tp->snd_una == tp->snd_nxt) 3979 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3980 } 3981 3982 snd = so_sockbuf_snd(so); 3983 if (bytes) { 3984 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3985 snd = so_sockbuf_snd(so); 3986 sockbuf_lock(snd); 3987 sbdrop_locked(snd, bytes); 3988 so_sowwakeup_locked(so); 3989 } 3990 3991 if (snd->sb_sndptroff < snd->sb_cc) 3992 t3_push_frames(so, 0); 3993 3994out_free: 3995 inp_wunlock(tp->t_inpcb); 3996 m_free(m); 3997} 3998 3999/* 4000 * Handler for TX_DATA_ACK CPL messages. 4001 */ 4002static int 4003do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 4004{ 4005 struct toepcb *toep = (struct toepcb *)ctx; 4006 4007 VALIDATE_SOCK(so); 4008 4009 wr_ack(toep, m); 4010 return 0; 4011} 4012 4013/* 4014 * Handler for TRACE_PKT CPL messages. Just sink these packets. 4015 */ 4016static int 4017do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 4018{ 4019 m_freem(m); 4020 return 0; 4021} 4022 4023/* 4024 * Reset a connection that is on a listener's SYN queue or accept queue, 4025 * i.e., one that has not had a struct socket associated with it. 4026 * Must be called from process context. 4027 * 4028 * Modeled after code in inet_csk_listen_stop(). 
4029 */ 4030static void 4031t3_reset_listen_child(struct socket *child) 4032{ 4033 struct tcpcb *tp = so_sototcpcb(child); 4034 4035 t3_send_reset(tp->t_toe); 4036} 4037 4038 4039static void 4040t3_child_disconnect(struct socket *so, void *arg) 4041{ 4042 struct tcpcb *tp = so_sototcpcb(so); 4043 4044 if (tp->t_flags & TF_TOE) { 4045 inp_wlock(tp->t_inpcb); 4046 t3_reset_listen_child(so); 4047 inp_wunlock(tp->t_inpcb); 4048 } 4049} 4050 4051/* 4052 * Disconnect offloaded established but not yet accepted connections sitting 4053 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4054 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4055 */ 4056void 4057t3_disconnect_acceptq(struct socket *listen_so) 4058{ 4059 4060 so_lock(listen_so); 4061 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4062 so_unlock(listen_so); 4063} 4064 4065/* 4066 * Reset offloaded connections sitting on a server's syn queue. As above 4067 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4068 */ 4069 4070void 4071t3_reset_synq(struct listen_ctx *lctx) 4072{ 4073 struct toepcb *toep; 4074 4075 so_lock(lctx->lso); 4076 while (!LIST_EMPTY(&lctx->synq_head)) { 4077 toep = LIST_FIRST(&lctx->synq_head); 4078 LIST_REMOVE(toep, synq_entry); 4079 toep->tp_tp = NULL; 4080 t3_send_reset(toep); 4081 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4082 toepcb_release(toep); 4083 } 4084 so_unlock(lctx->lso); 4085} 4086 4087 4088int 4089t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4090 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4091 unsigned int pg_off, unsigned int color) 4092{ 4093 unsigned int i, j, pidx; 4094 struct pagepod *p; 4095 struct mbuf *m; 4096 struct ulp_mem_io *req; 4097 unsigned int tid = toep->tp_tid; 4098 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4099 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4100 4101 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4102 gl, nppods, tag, maxoff, pg_off, color); 4103 4104 for (i = 0; i < nppods; ++i) { 4105 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4106 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4107 req = mtod(m, struct ulp_mem_io *); 4108 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4109 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4110 req->wr.wr_lo = 0; 4111 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4112 V_ULPTX_CMD(ULP_MEM_WRITE)); 4113 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4114 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4115 4116 p = (struct pagepod *)(req + 1); 4117 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4118 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4119 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4120 V_PPOD_COLOR(color)); 4121 p->pp_max_offset = htonl(maxoff); 4122 p->pp_page_offset = htonl(pg_off); 4123 p->pp_rsvd = 0; 4124 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4125 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4126 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4127 } else 4128 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4129 send_or_defer(toep, m, 0); 4130 ppod_addr += PPOD_SIZE; 4131 } 4132 return (0); 4133} 4134 4135/* 4136 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4137 */ 4138static inline void 4139mk_cpl_barrier_ulp(struct cpl_barrier *b) 4140{ 4141 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4142 4143 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4144 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4145 b->opcode = CPL_BARRIER; 4146} 4147 4148/* 4149 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4150 */ 4151static inline void 4152mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4153{ 4154 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4155 4156 txpkt = (struct ulp_txpkt *)req; 4157 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4158 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4159 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4160 req->cpuno = htons(cpuno); 4161} 4162 4163/* 4164 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4165 */ 4166static inline void 4167mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4168 unsigned int word, uint64_t mask, uint64_t val) 4169{ 4170 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4171 4172 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4173 tid, word, mask, val); 4174 4175 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4176 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4177 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4178 req->reply = V_NO_REPLY(1); 4179 req->cpu_idx = 0; 4180 req->word = htons(word); 4181 req->mask = htobe64(mask); 4182 req->val = htobe64(val); 4183} 4184 4185/* 4186 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
 */
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
		   unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
				 V_RX_CREDITS(credits));
}

/*
 * Cancel the HW DDP buffer 'bufidx' for a connection.
 *
 * Builds a single compound bypass WR laid out as:
 *   barrier | SET_TCB_FIELD (flip DDP flags) | GET_TCB | barrier
 * The barriers keep TP from interleaving other work; the GET_TCB reply
 * tells the ULP how much data landed before the cancel took effect.
 */
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);
	
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already. However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	if (bufidx == 0)
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;
	
#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @sk: the socket associated with the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience. Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
4279 */ 4280void 4281t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, 4282 unsigned int tag1, unsigned int len) 4283{ 4284 unsigned int wrlen; 4285 struct mbuf *m; 4286 struct work_request_hdr *wr; 4287 struct cpl_get_tcb *getreq; 4288 struct cpl_set_tcb_field *req; 4289 struct ddp_state *p = &toep->tp_ddp_state; 4290 4291 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", 4292 bufidx, tag0, tag1, len); 4293#if 0 4294 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4295#endif 4296 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); 4297 m = m_gethdr_nofail(wrlen); 4298 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4299 wr = mtod(m, struct work_request_hdr *); 4300 m->m_pkthdr.len = m->m_len = wrlen; 4301 bzero(wr, wrlen); 4302 4303 4304 /* Set the ATOMIC flag to make sure that TP processes the following 4305 * CPLs in an atomic manner and no wire segments can be interleaved. 4306 */ 4307 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); 4308 req = (struct cpl_set_tcb_field *)(wr + 1); 4309 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, 4310 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | 4311 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, 4312 V_TCB_RX_DDP_BUF0_TAG(tag0) | 4313 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); 4314 req++; 4315 if (bufidx == 0) { 4316 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, 4317 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4318 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 4319 req++; 4320 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4321 V_TF_DDP_PUSH_DISABLE_0(1) | 4322 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4323 V_TF_DDP_PUSH_DISABLE_0(0) | 4324 V_TF_DDP_BUF0_VALID(1)); 4325 } else { 4326 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, 4327 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), 4328 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); 4329 req++; 4330 mk_set_tcb_field_ulp(req, toep->tp_tid, 
W_TCB_RX_DDP_FLAGS, 4331 V_TF_DDP_PUSH_DISABLE_1(1) | 4332 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4333 V_TF_DDP_PUSH_DISABLE_1(0) | 4334 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); 4335 } 4336 4337 getreq = (struct cpl_get_tcb *)(req + 1); 4338 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4339 4340 /* Keep track of the number of oustanding CPL_GET_TCB requests 4341 */ 4342 p->get_tcb_count++; 4343 4344#ifdef T3_TRACE 4345 T3_TRACE4(TIDTB(sk), 4346 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " 4347 "len %d", 4348 bufidx, tag0, tag1, len); 4349#endif 4350 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4351} 4352 4353/* 4354 * Sends a compound WR containing all the CPL messages needed to program the 4355 * two HW DDP buffers, namely optionally setting up the length and offset of 4356 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. 4357 */ 4358void 4359t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, 4360 unsigned int len1, unsigned int offset1, 4361 uint64_t ddp_flags, uint64_t flag_mask, int modulate) 4362{ 4363 unsigned int wrlen; 4364 struct mbuf *m; 4365 struct work_request_hdr *wr; 4366 struct cpl_set_tcb_field *req; 4367 4368 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", 4369 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); 4370 4371#if 0 4372 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4373#endif 4374 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + 4375 (len1 ? sizeof(*req) : 0) + 4376 (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); 4377 m = m_gethdr_nofail(wrlen); 4378 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4379 wr = mtod(m, struct work_request_hdr *); 4380 bzero(wr, wrlen); 4381 4382 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4383 m->m_pkthdr.len = m->m_len = wrlen; 4384 4385 req = (struct cpl_set_tcb_field *)(wr + 1); 4386 if (len0) { /* program buffer 0 offset and length */ 4387 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, 4388 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 4389 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4390 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | 4391 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); 4392 req++; 4393 } 4394 if (len1) { /* program buffer 1 offset and length */ 4395 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, 4396 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 4397 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, 4398 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | 4399 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); 4400 req++; 4401 } 4402 4403 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, 4404 ddp_flags); 4405 4406 if (modulate) { 4407 mk_rx_data_ack_ulp(toep, 4408 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, 4409 toep->tp_copied_seq - toep->tp_rcv_wup); 4410 toep->tp_rcv_wup = toep->tp_copied_seq; 4411 } 4412 4413#ifdef T3_TRACE 4414 T3_TRACE5(TIDTB(sk), 4415 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " 4416 "modulate %d", 4417 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, 4418 modulate); 4419#endif 4420 4421 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4422} 4423 4424void 4425t3_init_wr_tab(unsigned int wr_len) 4426{ 4427 int i; 4428 4429 if (mbuf_wrs[1]) /* already initialized */ 4430 return; 4431 4432 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 4433 int sgl_len = (3 * i) / 2 + (i & 1); 4434 4435 sgl_len += 3; 4436 mbuf_wrs[i] = sgl_len <= wr_len ? 
4437 1 : 1 + (sgl_len - 2) / (wr_len - 1); 4438 } 4439 4440 wrlen = wr_len * 8; 4441} 4442 4443int 4444t3_init_cpl_io(void) 4445{ 4446#ifdef notyet 4447 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 4448 if (!tcphdr_skb) { 4449 log(LOG_ERR, 4450 "Chelsio TCP offload: can't allocate sk_buff\n"); 4451 return -1; 4452 } 4453 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 4454 tcphdr_skb->h.raw = tcphdr_skb->data; 4455 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 4456#endif 4457 4458 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 4459 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 4460 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 4461 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 4462 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 4463 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 4464 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 4465 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 4466 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 4467 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 4468 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 4469 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 4470 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 4471 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 4472 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 4473 return (0); 4474} 4475 4476