cxgb_cpl_io.c revision 195677
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 195677 2009-07-14 11:53:21Z lstewart $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/sockbuf.h> 46#include <sys/sysctl.h> 47#include <sys/syslog.h> 48#include <sys/protosw.h> 49#include <sys/priv.h> 50 51#if __FreeBSD_version >= 800044 52#include <sys/vimage.h> 53#else 54#define V_tcp_do_autosndbuf tcp_do_autosndbuf 55#define V_tcp_autosndbuf_max tcp_autosndbuf_max 56#define V_tcp_do_rfc1323 tcp_do_rfc1323 57#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf 58#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max 59#define V_tcpstat tcpstat 60#endif 61 62#include <net/if.h> 63#include <net/route.h> 64 65#include <netinet/in.h> 66#include <netinet/in_pcb.h> 67#include <netinet/in_systm.h> 68#include <netinet/in_var.h> 69 70 71#include <cxgb_osdep.h> 72#include <sys/mbufq.h> 73 74#include <netinet/ip.h> 75#include <netinet/tcp_var.h> 76#include <netinet/tcp_fsm.h> 77#include <netinet/tcp_offload.h> 78#include <netinet/tcp_seq.h> 79#include <netinet/tcp_syncache.h> 80#include <netinet/tcp_timer.h> 81#if __FreeBSD_version >= 800056 82#include <netinet/vinet.h> 83#endif 84#include <net/route.h> 85 86#include <t3cdev.h> 87#include <common/cxgb_firmware_exports.h> 88#include <common/cxgb_t3_cpl.h> 89#include <common/cxgb_tcb.h> 90#include <common/cxgb_ctl_defs.h> 91#include <cxgb_offload.h> 92#include <vm/vm.h> 93#include <vm/pmap.h> 94#include <machine/bus.h> 95#include <sys/mvec.h> 96#include <ulp/toecore/cxgb_toedev.h> 97#include <ulp/tom/cxgb_l2t.h> 98#include <ulp/tom/cxgb_defs.h> 99#include <ulp/tom/cxgb_tom.h> 100#include 
<ulp/tom/cxgb_t3_ddp.h> 101#include <ulp/tom/cxgb_toepcb.h> 102#include <ulp/tom/cxgb_tcp.h> 103#include <ulp/tom/cxgb_tcp_offload.h> 104 105/* 106 * For ULP connections HW may add headers, e.g., for digests, that aren't part 107 * of the messages sent by the host but that are part of the TCP payload and 108 * therefore consume TCP sequence space. Tx connection parameters that 109 * operate in TCP sequence space are affected by the HW additions and need to 110 * compensate for them to accurately track TCP sequence numbers. This array 111 * contains the compensating extra lengths for ULP packets. It is indexed by 112 * a packet's ULP submode. 113 */ 114const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 115 116#ifdef notyet 117/* 118 * This sk_buff holds a fake header-only TCP segment that we use whenever we 119 * need to exploit SW TCP functionality that expects TCP headers, such as 120 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 121 * CPUs without locking. 122 */ 123static struct mbuf *tcphdr_mbuf __read_mostly; 124#endif 125 126/* 127 * Size of WRs in bytes. Note that we assume all devices we are handling have 128 * the same WR size. 129 */ 130static unsigned int wrlen __read_mostly; 131 132/* 133 * The number of WRs needed for an skb depends on the number of page fragments 134 * in the skb and whether it has any payload in its main body. This maps the 135 * length of the gather list represented by an skb into the # of necessary WRs. 136 */ 137static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; 138 139/* 140 * Max receive window supported by HW in bytes. Only a small part of it can 141 * be set through option0, the rest needs to be set through RX_DATA_ACK. 142 */ 143#define MAX_RCV_WND ((1U << 27) - 1) 144 145/* 146 * Min receive window. We want it to be large enough to accommodate receive 147 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
 */
#define MIN_RCV_WND (24 * 1024U)

/* Extract the IP TOS bits from an inpcb in the layout the TCB expects. */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

#define TCP_TIMEWAIT 1
#define TCP_CLOSE 2
#define TCP_DROP 3

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

/*
 * Sanity-checking wrapper around sbappendstream_locked().  Both before and
 * after the append it walks the sockbuf's mbuf chain (and the chain being
 * appended) asserting that every mbuf is either a plain mbuf or an
 * EXT_EXTREF external mbuf, and that no m_next pointer carries the
 * 0xffffffff poison value.  The sockbuf must have SB_NOCOALESCE set so the
 * appended mbufs are not merged behind the TOE's back.
 */
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	/* Validate the existing receive chain. */
	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	/* Validate the chain being appended. */
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	m = sb->sb_mb;

	/* Re-walk the merged chain to catch corruption introduced by the append. */
	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

/* True if the TOE device is a Chelsio T3 rev A part. */
static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

/* Debug helper: dump the interesting fields of a toepcb via DPRINTF. */
static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
/*
 * rtalloc1() wrapper that returns the route unlocked, or NULL if no route
 * was found.
 */
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);	// send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);		// send directly
}

/*
 * Compute the send priority for a CPL message.  Currently the control-plane
 * priority is passed through unchanged; toep is unused.
 */
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must be already properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Fill in the header of a TX_DATA work request describing `len' bytes of
 * payload.  `tail' is the first unsent mbuf remaining after this WR; when
 * it is NULL the WR drains the send buffer and TX_SHOVE is set to push the
 * data out immediately (unless TF_MORETOCOME is pending).  The first WR of
 * a connection additionally requests initial credits and encodes the send
 * buffer size (in 32KB units).
 */
static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	INIT_VNET_INET(so->so_vnet);
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	    V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
		(tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		/* First WR on this connection: request init + ack pages. */
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB.
		 */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

/*
 * Push as much pending send-buffer data to the adapter as available WR
 * credits allow.  Each iteration builds one TX_DATA WR, either with the
 * payload in-line (<= IMM_LEN bytes) or as a gather list of up to
 * TX_MAX_SEGS-1 mbuf segments.  Credits consumed are stashed in
 * m_pkthdr.csum_data so they can be reclaimed when the WR completes.
 * Returns the total number of payload bytes handed to the hardware.
 * Called with the inpcb locked; takes/releases the send sockbuf lock.
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	/* Resume from the send pointer if set, else from the head. */
	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			/* Build a gather list bounded by credits and segments. */
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		/* Advance the send pointer; remember the last mbuf if drained. */
		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
	int i;

	i = 0;
	/* Trace the gather list three segments per record. */
	while (i < count && m_get_sgllen(m0)) {
		if ((count - i) >= 3) {
			CTR6(KTR_TOM,
			    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
			    " len=%d pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len,
			    segs[i + 1].ds_addr, segs[i + 1].ds_len,
			    segs[i + 2].ds_addr, segs[i + 2].ds_len);
			i += 3;
		} else if ((count - i) == 2) {
			CTR4(KTR_TOM,
			    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
			    " len=%d",
			    segs[i].ds_addr, segs[i].ds_len,
			    segs[i + 1].ds_addr, segs[i + 1].ds_len);
			i += 2;
		} else {
			CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len);
			i++;
		}

	}
}
#endif
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		/* Request a completion when half the credits are outstanding. */
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	/* Flush any remaining send data before the FIN. */
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	/* Only one FIN per connection. */
	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	/*
	 * NOTE(review): unlike t3_send_rx_modulate() below, this function
	 * never sets m->m_pkthdr.len / m->m_len — presumably
	 * m_gethdr_nofail() sizes the mbuf; verify against its definition.
	 */
	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(1) |
	    V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	/* Credits flushed; advance the returned-credit watermark. */
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
628 */ 629static void 630handle_urg_ptr(struct socket *so, uint32_t urg_seq) 631{ 632#ifdef URGENT_DATA_SUPPORTED 633 struct tcpcb *tp = so_sototcpcb(so); 634 635 urg_seq--; /* initially points past the urgent data, per BSD */ 636 637 if (tp->urg_data && !after(urg_seq, tp->urg_seq)) 638 return; /* duplicate pointer */ 639 sk_send_sigurg(sk); 640 if (tp->urg_seq == tp->copied_seq && tp->urg_data && 641 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { 642 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 643 644 tp->copied_seq++; 645 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) 646 tom_eat_skb(sk, skb, 0); 647 } 648 tp->urg_data = TCP_URG_NOTYET; 649 tp->urg_seq = urg_seq; 650#endif 651} 652 653/* 654 * Returns true if a socket cannot accept new Rx data. 655 */ 656static inline int 657so_no_receive(const struct socket *so) 658{ 659 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); 660} 661 662/* 663 * Process an urgent data notification. 664 */ 665static void 666rx_urg_notify(struct toepcb *toep, struct mbuf *m) 667{ 668 struct cpl_rx_urg_notify *hdr = cplhdr(m); 669 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 670 671 VALIDATE_SOCK(so); 672 673 if (!so_no_receive(so)) 674 handle_urg_ptr(so, ntohl(hdr->seq)); 675 676 m_freem(m); 677} 678 679/* 680 * Handler for RX_URG_NOTIFY CPL messages. 681 */ 682static int 683do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) 684{ 685 struct toepcb *toep = (struct toepcb *)ctx; 686 687 rx_urg_notify(toep, m); 688 return (0); 689} 690 691static __inline int 692is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) 693{ 694 return (toep->tp_ulp_mode || 695 (toep->tp_ulp_mode == ULP_MODE_TCPDDP && 696 dev->tod_ttid >= TOE_ID_CHELSIO_T3)); 697} 698 699/* 700 * Set of states for which we should return RX credits. 
701 */ 702#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 703 704/* 705 * Called after some received data has been read. It returns RX credits 706 * to the HW for the amount of data processed. 707 */ 708void 709t3_cleanup_rbuf(struct tcpcb *tp, int copied) 710{ 711 struct toepcb *toep = tp->t_toe; 712 struct socket *so; 713 struct toedev *dev; 714 int dack_mode, must_send, read; 715 u32 thres, credits, dack = 0; 716 struct sockbuf *rcv; 717 718 so = inp_inpcbtosocket(tp->t_inpcb); 719 rcv = so_sockbuf_rcv(so); 720 721 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 722 (tp->t_state == TCPS_FIN_WAIT_2))) { 723 if (copied) { 724 sockbuf_lock(rcv); 725 toep->tp_copied_seq += copied; 726 sockbuf_unlock(rcv); 727 } 728 729 return; 730 } 731 732 inp_lock_assert(tp->t_inpcb); 733 734 sockbuf_lock(rcv); 735 if (copied) 736 toep->tp_copied_seq += copied; 737 else { 738 read = toep->tp_enqueued_bytes - rcv->sb_cc; 739 toep->tp_copied_seq += read; 740 } 741 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 742 toep->tp_enqueued_bytes = rcv->sb_cc; 743 sockbuf_unlock(rcv); 744 745 if (credits > rcv->sb_mbmax) { 746 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 747 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 748 credits = rcv->sb_mbmax; 749 } 750 751 752 /* 753 * XXX this won't accurately reflect credit return - we need 754 * to look at the difference between the amount that has been 755 * put in the recv sockbuf and what is there now 756 */ 757 758 if (__predict_false(!credits)) 759 return; 760 761 dev = toep->tp_toedev; 762 thres = TOM_TUNABLE(dev, rx_credit_thres); 763 764 if (__predict_false(thres == 0)) 765 return; 766 767 if (is_delack_mode_valid(dev, toep)) { 768 dack_mode = TOM_TUNABLE(dev, delack); 769 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 770 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 771 772 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 773 dack = F_RX_DACK_CHANGE 
| 774 V_RX_DACK_MODE(dack_mode); 775 } 776 } else 777 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 778 779 /* 780 * For coalescing to work effectively ensure the receive window has 781 * at least 16KB left. 782 */ 783 must_send = credits + 16384 >= tp->rcv_wnd; 784 785 if (must_send || credits >= thres) 786 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 787} 788 789static int 790cxgb_toe_disconnect(struct tcpcb *tp) 791{ 792 struct socket *so; 793 794 DPRINTF("cxgb_toe_disconnect\n"); 795 796 so = inp_inpcbtosocket(tp->t_inpcb); 797 close_conn(so); 798 return (0); 799} 800 801static int 802cxgb_toe_reset(struct tcpcb *tp) 803{ 804 struct toepcb *toep = tp->t_toe; 805 806 t3_send_reset(toep); 807 808 /* 809 * unhook from socket 810 */ 811 tp->t_flags &= ~TF_TOE; 812 toep->tp_tp = NULL; 813 tp->t_toe = NULL; 814 return (0); 815} 816 817static int 818cxgb_toe_send(struct tcpcb *tp) 819{ 820 struct socket *so; 821 822 DPRINTF("cxgb_toe_send\n"); 823 dump_toepcb(tp->t_toe); 824 825 so = inp_inpcbtosocket(tp->t_inpcb); 826 t3_push_frames(so, 1); 827 return (0); 828} 829 830static int 831cxgb_toe_rcvd(struct tcpcb *tp) 832{ 833 834 inp_lock_assert(tp->t_inpcb); 835 836 t3_cleanup_rbuf(tp, 0); 837 838 return (0); 839} 840 841static void 842cxgb_toe_detach(struct tcpcb *tp) 843{ 844 struct toepcb *toep; 845 846 /* 847 * XXX how do we handle teardown in the SYN_SENT state? 
848 * 849 */ 850 inp_lock_assert(tp->t_inpcb); 851 toep = tp->t_toe; 852 toep->tp_tp = NULL; 853 854 /* 855 * unhook from socket 856 */ 857 tp->t_flags &= ~TF_TOE; 858 tp->t_toe = NULL; 859} 860 861 862static struct toe_usrreqs cxgb_toe_usrreqs = { 863 .tu_disconnect = cxgb_toe_disconnect, 864 .tu_reset = cxgb_toe_reset, 865 .tu_send = cxgb_toe_send, 866 .tu_rcvd = cxgb_toe_rcvd, 867 .tu_detach = cxgb_toe_detach, 868 .tu_detach = cxgb_toe_detach, 869 .tu_syncache_event = handle_syncache_event, 870}; 871 872 873static void 874__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 875 uint64_t mask, uint64_t val, int no_reply) 876{ 877 struct cpl_set_tcb_field *req; 878 879 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 880 toep->tp_tid, word, mask, val); 881 882 req = mtod(m, struct cpl_set_tcb_field *); 883 m->m_pkthdr.len = m->m_len = sizeof(*req); 884 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 885 req->wr.wr_lo = 0; 886 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 887 req->reply = V_NO_REPLY(no_reply); 888 req->cpu_idx = 0; 889 req->word = htons(word); 890 req->mask = htobe64(mask); 891 req->val = htobe64(val); 892 893 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 894 send_or_defer(toep, m, 0); 895} 896 897static void 898t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 899{ 900 struct mbuf *m; 901 struct tcpcb *tp = toep->tp_tp; 902 903 if (toep == NULL) 904 return; 905 906 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 907 printf("not seting field\n"); 908 return; 909 } 910 911 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 912 913 __set_tcb_field(toep, m, word, mask, val, 1); 914} 915 916/* 917 * Set one of the t_flags bits in the TCB. 
918 */ 919static void 920set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) 921{ 922 923 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 924} 925 926/* 927 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 928 */ 929static void 930t3_set_nagle(struct toepcb *toep) 931{ 932 struct tcpcb *tp = toep->tp_tp; 933 934 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 935} 936 937/* 938 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 939 */ 940void 941t3_set_keepalive(struct toepcb *toep, int on_off) 942{ 943 944 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); 945} 946 947void 948t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) 949{ 950 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); 951} 952 953void 954t3_set_dack_mss(struct toepcb *toep, int on_off) 955{ 956 957 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); 958} 959 960/* 961 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 962 */ 963static void 964t3_set_tos(struct toepcb *toep) 965{ 966 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); 967 968 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 969 V_TCB_TOS(tos)); 970} 971 972 973/* 974 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 975 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 976 * set the PSH bit in the last segment, which would trigger delivery.] 977 * We work around the issue by setting a DDP buffer in a partial placed state, 978 * which guarantees that TP will schedule a timer. 
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Turn DDP on or off for the connection.  When disabling, also apply the
 * TP timer workaround described above so data still reaches the host.
 */
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
		    V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_MASK,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_VAL);

}

/* Program the DDP tag/color for the given DDP buffer index. */
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    tag_color);
}

/*
 * Program the offset and length of DDP buffer 0 or 1 in the TCB.
 *
 * NOTE(review): the buf_idx != 0 branch builds its mask with
 * V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32) — the shift is applied
 * to the mask *argument* rather than the macro result, unlike the value
 * expression below it.  This mirrors the historical driver source; confirm
 * against the T3 TCB layout before changing.
 */
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Look up a congestion control algorithm by name.  The lookup itself is
 * compiled out unless CONGESTION_CONTROL_SUPPORTED is defined.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Request a copy of the connection's TCB via CPL_GET_TCB.  The request is
 * deferred while in SYN_SENT (no TID yet).  Returns ENOMEM if no mbuf could
 * be allocated, 0 otherwise.
 */
int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

/*
 * Register the toepcb in the TID table, taking a reference that is dropped
 * when the TID is removed.
 */
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
1089 */ 1090static unsigned int 1091find_best_mtu(const struct t3c_data *d, unsigned short mtu) 1092{ 1093 int i = 0; 1094 1095 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 1096 ++i; 1097 return (i); 1098} 1099 1100static unsigned int 1101select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 1102{ 1103 unsigned int idx; 1104 1105#ifdef notyet 1106 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; 1107#endif 1108 if (tp) { 1109 tp->t_maxseg = pmtu - 40; 1110 if (tp->t_maxseg < td->mtus[0] - 40) 1111 tp->t_maxseg = td->mtus[0] - 40; 1112 idx = find_best_mtu(td, tp->t_maxseg + 40); 1113 1114 tp->t_maxseg = td->mtus[idx] - 40; 1115 } else 1116 idx = find_best_mtu(td, pmtu); 1117 1118 return (idx); 1119} 1120 1121static inline void 1122free_atid(struct t3cdev *cdev, unsigned int tid) 1123{ 1124 struct toepcb *toep = cxgb_free_atid(cdev, tid); 1125 1126 if (toep) 1127 toepcb_release(toep); 1128} 1129 1130/* 1131 * Release resources held by an offload connection (TID, L2T entry, etc.) 
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	/* Nothing to release if the connection was never offloaded. */
	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Drop any work requests still awaiting hardware acknowledgement. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* Detach from the tcpcb before waking any sleeping reader. */
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		/* Wakeup drops the sockbuf lock. */
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		/* Connection never completed; only an ATID was allocated. */
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Hook the TOE socket operations and mark the tcpcb as offloaded.
 * The toepcb must already be attached (asserted below).
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
1217 * receive window. 1218 */ 1219static __inline int 1220select_rcv_wscale(int space, struct vnet *vnet) 1221{ 1222 INIT_VNET_INET(vnet); 1223 int wscale = 0; 1224 1225 if (space > MAX_RCV_WND) 1226 space = MAX_RCV_WND; 1227 1228 if (V_tcp_do_rfc1323) 1229 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; 1230 1231 return (wscale); 1232} 1233 1234/* 1235 * Determine the receive window size for a socket. 1236 */ 1237static unsigned long 1238select_rcv_wnd(struct toedev *dev, struct socket *so) 1239{ 1240 INIT_VNET_INET(so->so_vnet); 1241 struct tom_data *d = TOM_DATA(dev); 1242 unsigned int wnd; 1243 unsigned int max_rcv_wnd; 1244 struct sockbuf *rcv; 1245 1246 rcv = so_sockbuf_rcv(so); 1247 1248 if (V_tcp_do_autorcvbuf) 1249 wnd = V_tcp_autorcvbuf_max; 1250 else 1251 wnd = rcv->sb_hiwat; 1252 1253 1254 1255 /* XXX 1256 * For receive coalescing to work effectively we need a receive window 1257 * that can accomodate a coalesced segment. 1258 */ 1259 if (wnd < MIN_RCV_WND) 1260 wnd = MIN_RCV_WND; 1261 1262 /* PR 5138 */ 1263 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 1264 (uint32_t)d->rx_page_size * 23 : 1265 MAX_RCV_WND); 1266 1267 return min(wnd, max_rcv_wnd); 1268} 1269 1270/* 1271 * Assign offload parameters to some socket fields. This code is used by 1272 * both active and passive opens. 
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link tcpcb and toepcb and record the owning TOE device. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	/* Start with the full per-connection work-request credit budget. */
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/* Enable DDP only if tunable-enabled, not vetoed by the socket,
	 * and the receive window is large enough to be worthwhile. */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);

	/* High word of option 0: Nagle, keepalive, TCAM bypass, window
	 * scale and MSS index, encoded for the hardware. */
	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	/* Low word of option 0: TOS, ULP mode and receive buffer size
	 * expressed in 1KB units, capped at the field maximum. */
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

/*
 * Option 2: congestion-control flavor, taken from the per-device tunable
 * when one has been configured (-1 means "no flavor configured").
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/* Debug helper: total work-request credits currently on the WR queue. */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Fill in @m with a CPL_ACT_OPEN_REQ for an active open on the connection
 * identified by @atid, using L2 table entry @e.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down a failed active open and notify the TCP stack.
 * Called with the inpcb lock held; DROPS the inpcb lock before returning
 * (tcp_offload_drop is reached after inp_wunlock).
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
		    jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 *
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	/* On T3B and later a TID may have been allocated even though the
	 * open failed; schedule its release. */
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 *
		 * NOTE(review): this dead (#ifdef notyet) code passes the
		 * uninitialized `so` to fail_act_open, which now takes a
		 * toepcb; it should pass `toep` if ever enabled.
		 */
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
1546 */ 1547int 1548t3_connect(struct toedev *tdev, struct socket *so, 1549 struct rtentry *rt, struct sockaddr *nam) 1550{ 1551 struct mbuf *m; 1552 struct l2t_entry *e; 1553 struct tom_data *d = TOM_DATA(tdev); 1554 struct inpcb *inp = so_sotoinpcb(so); 1555 struct tcpcb *tp = intotcpcb(inp); 1556 struct toepcb *toep; /* allocated by init_offload_socket */ 1557 1558 int atid; 1559 1560 toep = toepcb_alloc(); 1561 if (toep == NULL) 1562 goto out_err; 1563 1564 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1565 goto out_err; 1566 1567 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1568 if (!e) 1569 goto free_tid; 1570 1571 inp_lock_assert(inp); 1572 m = m_gethdr(MT_DATA, M_WAITOK); 1573 1574#if 0 1575 m->m_toe.mt_toepcb = tp->t_toe; 1576 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1577#endif 1578 so_lock(so); 1579 1580 init_offload_socket(so, tdev, atid, e, rt, toep); 1581 1582 install_offload_ops(so); 1583 1584 mk_act_open_req(so, m, atid, e); 1585 so_unlock(so); 1586 1587 soisconnecting(so); 1588 toep = tp->t_toe; 1589 m_set_toep(m, tp->t_toe); 1590 1591 toep->tp_state = TCPS_SYN_SENT; 1592 l2t_send(d->cdev, (struct mbuf *)m, e); 1593 1594 if (toep->tp_ulp_mode) 1595 t3_enable_ddp(toep, 0); 1596 return (0); 1597 1598free_tid: 1599 printf("failing connect - free atid\n"); 1600 1601 free_atid(d->cdev, atid); 1602out_err: 1603 printf("return ENOMEM\n"); 1604 return (ENOMEM); 1605} 1606 1607/* 1608 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1609 * not send multiple ABORT_REQs for the same connection and also that we do 1610 * not try to send a message after the connection has closed. Returns 1 if 1611 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1612 */ 1613static void 1614t3_send_reset(struct toepcb *toep) 1615{ 1616 1617 struct cpl_abort_req *req; 1618 unsigned int tid = toep->tp_tid; 1619 int mode = CPL_ABORT_SEND_RST; 1620 struct tcpcb *tp = toep->tp_tp; 1621 struct toedev *tdev = toep->tp_toedev; 1622 struct socket *so = NULL; 1623 struct mbuf *m; 1624 struct sockbuf *snd; 1625 1626 if (tp) { 1627 inp_lock_assert(tp->t_inpcb); 1628 so = inp_inpcbtosocket(tp->t_inpcb); 1629 } 1630 1631 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1632 tdev == NULL)) 1633 return; 1634 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1635 1636 snd = so_sockbuf_snd(so); 1637 /* Purge the send queue so we don't send anything after an abort. */ 1638 if (so) 1639 sbflush(snd); 1640 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1641 mode |= CPL_ABORT_POST_CLOSE_REQ; 1642 1643 m = m_gethdr_nofail(sizeof(*req)); 1644 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1645 set_arp_failure_handler(m, abort_arp_failure); 1646 1647 req = mtod(m, struct cpl_abort_req *); 1648 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1649 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1650 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1651 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1652 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1653 req->cmd = mode; 1654 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1655 mbufq_tail(&toep->out_of_order_queue, m); // defer 1656 else 1657 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1658} 1659 1660static int 1661t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1662{ 1663 struct inpcb *inp; 1664 int error, optval; 1665 1666 if (sopt->sopt_name == IP_OPTIONS) 1667 return (ENOPROTOOPT); 1668 1669 if (sopt->sopt_name != IP_TOS) 1670 return (EOPNOTSUPP); 1671 1672 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1673 1674 if (error) 1675 return (error); 1676 1677 if (optval > IPTOS_PREC_CRITIC_ECP) 1678 return (EINVAL); 1679 1680 inp = so_sotoinpcb(so); 1681 inp_wlock(inp); 1682 inp_ip_tos_set(inp, optval); 1683#if 0 1684 inp->inp_ip_tos = optval; 1685#endif 1686 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1687 inp_wunlock(inp); 1688 1689 return (0); 1690} 1691 1692static int 1693t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1694{ 1695 int err = 0; 1696 size_t copied; 1697 1698 if (sopt->sopt_name != TCP_CONGESTION && 1699 sopt->sopt_name != TCP_NODELAY) 1700 return (EOPNOTSUPP); 1701 1702 if (sopt->sopt_name == TCP_CONGESTION) { 1703 char name[TCP_CA_NAME_MAX]; 1704 int optlen = sopt->sopt_valsize; 1705 struct tcpcb *tp; 1706 1707 if (sopt->sopt_dir == SOPT_GET) { 1708 KASSERT(0, ("unimplemented")); 1709 return (EOPNOTSUPP); 1710 } 1711 1712 if (optlen < 1) 1713 return (EINVAL); 1714 1715 err = copyinstr(sopt->sopt_val, name, 1716 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1717 if (err) 1718 return (err); 1719 if (copied < 1) 1720 return (EINVAL); 1721 1722 tp = so_sototcpcb(so); 1723 /* 1724 * XXX I need to revisit this 1725 */ 1726 if ((err = t3_set_cong_control(so, name)) == 0) { 1727#ifdef CONGESTION_CONTROL_SUPPORTED 1728 tp->t_cong_control = strdup(name, M_CXGB); 1729#endif 1730 } else 1731 return (err); 1732 } else { 1733 int optval, oldval; 1734 
struct inpcb *inp; 1735 struct tcpcb *tp; 1736 1737 if (sopt->sopt_dir == SOPT_GET) 1738 return (EOPNOTSUPP); 1739 1740 err = sooptcopyin(sopt, &optval, sizeof optval, 1741 sizeof optval); 1742 1743 if (err) 1744 return (err); 1745 1746 inp = so_sotoinpcb(so); 1747 inp_wlock(inp); 1748 tp = inp_inpcbtotcpcb(inp); 1749 1750 oldval = tp->t_flags; 1751 if (optval) 1752 tp->t_flags |= TF_NODELAY; 1753 else 1754 tp->t_flags &= ~TF_NODELAY; 1755 inp_wunlock(inp); 1756 1757 1758 if (oldval != tp->t_flags && (tp->t_toe != NULL)) 1759 t3_set_nagle(tp->t_toe); 1760 1761 } 1762 1763 return (0); 1764} 1765 1766int 1767t3_ctloutput(struct socket *so, struct sockopt *sopt) 1768{ 1769 int err; 1770 1771 if (sopt->sopt_level != IPPROTO_TCP) 1772 err = t3_ip_ctloutput(so, sopt); 1773 else 1774 err = t3_tcp_ctloutput(so, sopt); 1775 1776 if (err != EOPNOTSUPP) 1777 return (err); 1778 1779 return (tcp_ctloutput(so, sopt)); 1780} 1781 1782/* 1783 * Returns true if we need to explicitly request RST when we receive new data 1784 * on an RX-closed connection. 1785 */ 1786static inline int 1787need_rst_on_excess_rx(const struct toepcb *toep) 1788{ 1789 return (1); 1790} 1791 1792/* 1793 * Handles Rx data that arrives in a state where the socket isn't accepting 1794 * new data. 1795 */ 1796static void 1797handle_excess_rx(struct toepcb *toep, struct mbuf *m) 1798{ 1799 1800 if (need_rst_on_excess_rx(toep) && 1801 !(toep->tp_flags & TP_ABORT_SHUTDOWN)) 1802 t3_send_reset(toep); 1803 m_freem(m); 1804} 1805 1806/* 1807 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) 1808 * by getting the DDP offset from the TCB. 
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		/* NOTE: this `state` shadows the outer local of the same name. */
		int state = so_state_get(so);

		m_freem(m);
		/* so_sorwakeup_locked drops the sockbuf lock. */
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	/* Extract the DDP placement offset for the current buffer from the
	 * raw TCB words returned by the hardware. */
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	/* The mbuf describes the bytes DMAed since the last known offset. */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
	    "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
	    ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
	    "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
	    tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
	    "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
	    rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
	    "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
	    q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
		/* User buffer (zero-copy) case: mark completed and flip. */
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
		    "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Hand the DDP gather list to the mbuf and advance rcv_nxt. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
	    m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	/* so_sorwakeup_locked drops the sockbuf lock. */
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
1984 */ 1985static int 1986do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 1987{ 1988 struct toepcb *toep = (struct toepcb *)ctx; 1989 1990 /* OK if socket doesn't exist */ 1991 if (toep == NULL) { 1992 printf("null toep in do_get_tcb_rpl\n"); 1993 return (CPL_RET_BUF_DONE); 1994 } 1995 1996 inp_wlock(toep->tp_tp->t_inpcb); 1997 tcb_rpl_as_ddp_complete(toep, m); 1998 inp_wunlock(toep->tp_tp->t_inpcb); 1999 2000 return (0); 2001} 2002 2003static void 2004handle_ddp_data(struct toepcb *toep, struct mbuf *m) 2005{ 2006 struct tcpcb *tp = toep->tp_tp; 2007 struct socket *so; 2008 struct ddp_state *q; 2009 struct ddp_buf_state *bsp; 2010 struct cpl_rx_data *hdr = cplhdr(m); 2011 unsigned int rcv_nxt = ntohl(hdr->seq); 2012 struct sockbuf *rcv; 2013 2014 if (tp->rcv_nxt == rcv_nxt) 2015 return; 2016 2017 inp_lock_assert(tp->t_inpcb); 2018 so = inp_inpcbtosocket(tp->t_inpcb); 2019 rcv = so_sockbuf_rcv(so); 2020 sockbuf_lock(rcv); 2021 2022 q = &toep->tp_ddp_state; 2023 bsp = &q->buf_state[q->cur_buf]; 2024 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", 2025 rcv_nxt, tp->rcv_nxt)); 2026 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; 2027 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2028 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", 2029 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); 2030 2031#ifdef T3_TRACE 2032 if ((int)m->m_pkthdr.len < 0) { 2033 t3_ddp_error(so, "handle_ddp_data: neg len"); 2034 } 2035#endif 2036 m->m_ddp_gl = (unsigned char *)bsp->gl; 2037 m->m_flags |= M_DDP; 2038 m->m_cur_offset = bsp->cur_offset; 2039 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; 2040 if (bsp->flags & DDP_BF_NOCOPY) 2041 bsp->flags &= ~DDP_BF_NOCOPY; 2042 2043 m->m_seq = tp->rcv_nxt; 2044 tp->rcv_nxt = rcv_nxt; 2045 bsp->cur_offset += m->m_pkthdr.len; 2046 if (!(bsp->flags & DDP_BF_NOFLIP)) 2047 q->cur_buf ^= 1; 2048 /* 2049 * For now, don't re-enable DDP after a connection fell out of 
DDP 2050 * mode. 2051 */ 2052 q->ubuf_ddp_ready = 0; 2053 sockbuf_unlock(rcv); 2054} 2055 2056/* 2057 * Process new data received for a connection. 2058 */ 2059static void 2060new_rx_data(struct toepcb *toep, struct mbuf *m) 2061{ 2062 struct cpl_rx_data *hdr = cplhdr(m); 2063 struct tcpcb *tp = toep->tp_tp; 2064 struct socket *so; 2065 struct sockbuf *rcv; 2066 int state; 2067 int len = be16toh(hdr->len); 2068 2069 inp_wlock(tp->t_inpcb); 2070 2071 so = inp_inpcbtosocket(tp->t_inpcb); 2072 2073 if (__predict_false(so_no_receive(so))) { 2074 handle_excess_rx(toep, m); 2075 inp_wunlock(tp->t_inpcb); 2076 TRACE_EXIT; 2077 return; 2078 } 2079 2080 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) 2081 handle_ddp_data(toep, m); 2082 2083 m->m_seq = ntohl(hdr->seq); 2084 m->m_ulp_mode = 0; /* for iSCSI */ 2085 2086#if VALIDATE_SEQ 2087 if (__predict_false(m->m_seq != tp->rcv_nxt)) { 2088 log(LOG_ERR, 2089 "%s: TID %u: Bad sequence number %u, expected %u\n", 2090 toep->tp_toedev->name, toep->tp_tid, m->m_seq, 2091 tp->rcv_nxt); 2092 m_freem(m); 2093 inp_wunlock(tp->t_inpcb); 2094 return; 2095 } 2096#endif 2097 m_adj(m, sizeof(*hdr)); 2098 2099#ifdef URGENT_DATA_SUPPORTED 2100 /* 2101 * We don't handle urgent data yet 2102 */ 2103 if (__predict_false(hdr->urg)) 2104 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); 2105 if (__predict_false(tp->urg_data == TCP_URG_NOTYET && 2106 tp->urg_seq - tp->rcv_nxt < skb->len)) 2107 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - 2108 tp->rcv_nxt]; 2109#endif 2110 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { 2111 toep->tp_delack_mode = hdr->dack_mode; 2112 toep->tp_delack_seq = tp->rcv_nxt; 2113 } 2114 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", 2115 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); 2116 2117 if (len < m->m_pkthdr.len) 2118 m->m_pkthdr.len = m->m_len = len; 2119 2120 tp->rcv_nxt += m->m_pkthdr.len; 2121 tp->t_rcvtime = ticks; 
2122 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2123 CTR2(KTR_TOM, 2124 "new_rx_data: seq 0x%x len %u", 2125 m->m_seq, m->m_pkthdr.len); 2126 inp_wunlock(tp->t_inpcb); 2127 rcv = so_sockbuf_rcv(so); 2128 sockbuf_lock(rcv); 2129#if 0 2130 if (sb_notify(rcv)) 2131 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); 2132#endif 2133 SBAPPEND(rcv, m); 2134 2135#ifdef notyet 2136 /* 2137 * We're giving too many credits to the card - but disable this check so we can keep on moving :-| 2138 * 2139 */ 2140 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), 2141 2142 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", 2143 so, rcv->sb_cc, rcv->sb_mbmax)); 2144#endif 2145 2146 2147 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", 2148 rcv->sb_cc, rcv->sb_mbcnt); 2149 2150 state = so_state_get(so); 2151 if (__predict_true((state & SS_NOFDREF) == 0)) 2152 so_sorwakeup_locked(so); 2153 else 2154 sockbuf_unlock(rcv); 2155} 2156 2157/* 2158 * Handler for RX_DATA CPL messages. 2159 */ 2160static int 2161do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2162{ 2163 struct toepcb *toep = (struct toepcb *)ctx; 2164 2165 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); 2166 2167 new_rx_data(toep, m); 2168 2169 return (0); 2170} 2171 2172static void 2173new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) 2174{ 2175 struct tcpcb *tp; 2176 struct ddp_state *q; 2177 struct ddp_buf_state *bsp; 2178 struct cpl_rx_data_ddp *hdr; 2179 struct socket *so; 2180 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; 2181 int nomoredata = 0; 2182 unsigned int delack_mode; 2183 struct sockbuf *rcv; 2184 2185 tp = toep->tp_tp; 2186 inp_wlock(tp->t_inpcb); 2187 so = inp_inpcbtosocket(tp->t_inpcb); 2188 2189 if (__predict_false(so_no_receive(so))) { 2190 2191 handle_excess_rx(toep, m); 2192 inp_wunlock(tp->t_inpcb); 2193 return; 2194 } 2195 2196 q = &toep->tp_ddp_state; 2197 hdr = cplhdr(m); 2198 ddp_report = ntohl(hdr->u.ddp_report); 2199 buf_idx = (ddp_report >> 
S_DDP_BUF_IDX) & 1; 2200 bsp = &q->buf_state[buf_idx]; 2201 2202 CTR4(KTR_TOM, 2203 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " 2204 "hdr seq 0x%x len %u", 2205 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), 2206 ntohs(hdr->len)); 2207 CTR3(KTR_TOM, 2208 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", 2209 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); 2210 2211 ddp_len = ntohs(hdr->len); 2212 rcv_nxt = ntohl(hdr->seq) + ddp_len; 2213 2214 delack_mode = G_DDP_DACK_MODE(ddp_report); 2215 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { 2216 toep->tp_delack_mode = delack_mode; 2217 toep->tp_delack_seq = tp->rcv_nxt; 2218 } 2219 2220 m->m_seq = tp->rcv_nxt; 2221 tp->rcv_nxt = rcv_nxt; 2222 2223 tp->t_rcvtime = ticks; 2224 /* 2225 * Store the length in m->m_len. We are changing the meaning of 2226 * m->m_len here, we need to be very careful that nothing from now on 2227 * interprets ->len of this packet the usual way. 2228 */ 2229 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; 2230 inp_wunlock(tp->t_inpcb); 2231 CTR3(KTR_TOM, 2232 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", 2233 m->m_len, rcv_nxt, m->m_seq); 2234 /* 2235 * Figure out where the new data was placed in the buffer and store it 2236 * in when. Assumes the buffer offset starts at 0, consumer needs to 2237 * account for page pod's pg_offset. 
2238 */ 2239 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; 2240 m->m_cur_offset = end_offset - m->m_pkthdr.len; 2241 2242 rcv = so_sockbuf_rcv(so); 2243 sockbuf_lock(rcv); 2244 2245 m->m_ddp_gl = (unsigned char *)bsp->gl; 2246 m->m_flags |= M_DDP; 2247 bsp->cur_offset = end_offset; 2248 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2249 2250 /* 2251 * Length is only meaningful for kbuf 2252 */ 2253 if (!(bsp->flags & DDP_BF_NOCOPY)) 2254 KASSERT(m->m_len <= bsp->gl->dgl_length, 2255 ("length received exceeds ddp pages: len=%d dgl_length=%d", 2256 m->m_len, bsp->gl->dgl_length)); 2257 2258 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2259 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); 2260 /* 2261 * Bit 0 of flags stores whether the DDP buffer is completed. 2262 * Note that other parts of the code depend on this being in bit 0. 2263 */ 2264 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { 2265 panic("spurious ddp completion"); 2266 } else { 2267 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); 2268 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) 2269 q->cur_buf ^= 1; /* flip buffers */ 2270 } 2271 2272 if (bsp->flags & DDP_BF_NOCOPY) { 2273 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); 2274 bsp->flags &= ~DDP_BF_NOCOPY; 2275 } 2276 2277 if (ddp_report & F_DDP_PSH) 2278 m->m_ddp_flags |= DDP_BF_PSH; 2279 if (nomoredata) 2280 m->m_ddp_flags |= DDP_BF_NODATA; 2281 2282#ifdef notyet 2283 skb_reset_transport_header(skb); 2284 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ 2285#endif 2286 SBAPPEND(rcv, m); 2287 2288 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || 2289 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) 2290 || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) 2291 so_sorwakeup_locked(so); 2292 else 2293 sockbuf_unlock(rcv); 2294} 2295 2296#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ 2297 F_DDP_PPOD_PARITY_ERR | 
 F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.  Rejects messages whose
 * ddpvld_status carries any of the DDP_ERR error bits; otherwise hands the
 * mbuf to new_rx_data_ddp() for placement accounting.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	/* Drop (and let the caller recycle the buffer) on any DDP error. */
	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(toep, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE notification: account for the bytes the
 * hardware placed directly into the current DDP buffer, advance rcv_nxt,
 * tag the mbuf with the DDP gather list/offset, append it to the receive
 * sockbuf and wake any reader.
 *
 * Locking: takes the inpcb write lock, then the receive sockbuf lock; the
 * inpcb lock is dropped before the sockbuf append/wakeup.
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when, delack_mode;
	int nomoredata = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	/* Socket can no longer receive: discard via the excess-rx path. */
	if (__predict_false(so_no_receive(so))) {
		struct inpcb *inp = so_sotoinpcb(so);

		handle_excess_rx(toep, m);
		inp_wunlock(inp);
		return;
	}
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	m->m_pkthdr.csum_data = tp->rcv_nxt;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[buf_idx];
	when = bsp->cur_offset;
	/*
	 * m_len is repurposed here: it is the number of DDP-placed bytes
	 * (offset delta within the buffer), not data carried in this mbuf.
	 */
	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	tp->rcv_nxt += m->m_len;
	tp->t_rcvtime = ticks;

	/* Track hardware delayed-ACK mode changes reported by the chip. */
	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	inp_wunlock(tp->t_inpcb);

	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR5(KTR_TOM,
	    "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
	    "ddp_report 0x%x offset %u, len %u",
	    tp->rcv_nxt, bsp->cur_offset, ddp_report,
	    G_DDP_OFFSET(ddp_report), m->m_len);

	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset += m->m_len;

	/*
	 * Unless a buffer flip is suppressed, switch to the other DDP
	 * buffer; a completion short of the kernel buffer length means no
	 * further data is expected in this buffer.
	 */
	if (!(bsp->flags & DDP_BF_NOFLIP)) {
		q->cur_buf ^= 1;                     /* flip buffers */
		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata=1;
	}

	CTR4(KTR_TOM,
	    "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
	    "ddp_report %u offset %u",
	    tp->rcv_nxt, bsp->cur_offset, ddp_report,
	    G_DDP_OFFSET(ddp_report));

	/* Bit 0 of m_ddp_flags marks a completed DDP buffer. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

	SBAPPEND(rcv, m);
	/* so_sorwakeup_locked() drops the sockbuf lock for us. */
	if ((so_state_get(so) & SS_NOFDREF) == 0)
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.
	 * We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	inp_wlock(tp->t_inpcb);
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	inp_wunlock(tp->t_inpcb);
	tcp_offload_twstart(tp);
}

/*
 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
 * function deals with the data that may be reported along with the FIN.
 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
 * skb can be freed.
 *
 * Caller holds the inpcb write lock (asserted below).
 */
static int
handle_peer_close_data(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_peer_close *req = cplhdr(m);
	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
	struct sockbuf *rcv;

	if (tp->rcv_nxt == rcv_nxt)			/* no data */
		return (0);

	CTR0(KTR_TOM, "handle_peer_close_data");
	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);

		/*
		 * Although we discard the data we want to process the FIN so
		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
		 * may be what will close the connection.  We return 1 because
		 * handle_excess_rx() already freed the packet.
		 */
		return (1);
	}

	inp_lock_assert(tp->t_inpcb);
	q = &toep->tp_ddp_state;
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/*
	 * Account the DDP-placed bytes that preceded the FIN against the
	 * current DDP buffer; m_len is the placed byte count, not mbuf data.
	 */
	bsp = &q->buf_state[q->cur_buf];
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags =
	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	tp->t_rcvtime = ticks;
	SBAPPEND(rcv, m);
	/* so_sorwakeup_locked() drops the sockbuf lock for us. */
	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);

	return (1);
}

/*
 * Handle a peer FIN.  Runs the FIN-side TCP state transitions and defers
 * TIME_WAIT/close/drop actions until after the inpcb lock is dropped.
 * 'keep' is nonzero when handle_peer_close_data() consumed (or already
 * freed) the mbuf, in which case it must not be freed here.
 */
static void
do_peer_fin(struct toepcb *toep, struct mbuf *m)
{
	struct socket *so;
	struct tcpcb *tp = toep->tp_tp;
	int keep, action;

	action = keep = 0;
	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, m);
		if (keep < 0) {
			inp_wunlock(tp->t_inpcb);
			return;
		}
	}
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		CTR1(KTR_TOM,
		    "waking up waiters for cantrcvmore on %p ", so);
		socantrcvmore(so);

		/*
		 * If connection is half-synchronized
		 * (ie NEEDSYN flag on) then delay ACK,
		 * so it may be piggybacked when SYN is sent.
		 * Otherwise, since we received a FIN then no
		 * more input can be expected, send ACK now.
		 */
		if (tp->t_flags & TF_NEEDSYN)
			tp->t_flags |= TF_DELACK;
		else
			tp->t_flags |= TF_ACKNOW;
		tp->rcv_nxt++;
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
	    tp->t_starttime = ticks;
	/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;
		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
	}
	inp_wunlock(tp->t_inpcb);

	/* Deferred actions run unlocked; only one can be set above. */
	if (action == TCP_TIMEWAIT) {
		enter_timewait(tp);
	} else if (action == TCP_DROP) {
		tcp_offload_drop(tp, 0);
	} else if (action == TCP_CLOSE) {
		tcp_offload_close(tp);
	}

#ifdef notyet
	/* Do not send POLL_HUP for half duplex close. */
	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
	    sk->sk_state == TCP_CLOSE)
		sk_wake_async(so, 1, POLL_HUP);
	else
		sk_wake_async(so, 1, POLL_IN);
#endif

out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	do_peer_fin(toep, m);
	return (0);
}

/*
 * Process a CLOSE_CON_RPL: our FIN has been ACKed by the peer/hardware.
 * Records snd_una from the reply and runs the close-side TCP state
 * transitions; TIME_WAIT/close/drop actions are deferred until after the
 * inpcb lock is dropped.
 */
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int action = 0;
	struct sockbuf  *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		inp_wunlock(tp->t_inpcb);
		goto out;
	}

	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));

	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;

		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		action = TCP_CLOSE;
		break;
	case TCPS_FIN_WAIT_1:
		/*
		 * If we can't receive any more
		 * data, then closing user can proceed.
		 * Starting the timer is contrary to the
		 * specification, but if we don't get a FIN
		 * we'll hang forever.
		 *
		 * XXXjl:
		 * we should release the tp also, and use a
		 * compressed state.
		 */
		/*
		 * NOTE(review): 'so' was obtained from the inpcb above, so
		 * these NULL checks look redundant — confirm whether a NULL
		 * socket is actually reachable here.
		 */
		if (so)
			rcv = so_sockbuf_rcv(so);
		else
			break;

		if (rcv->sb_state & SBS_CANTRCVMORE) {
			int timeout;

			if (so)
				soisdisconnected(so);
			timeout = (tcp_fast_finwait2_recycle) ?
			    tcp_finwait2_timeout : tcp_maxidle;
			tcp_timer_activate(tp, TT_2MSL, timeout);
		}
		tp->t_state = TCPS_FIN_WAIT_2;
		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			action = TCP_DROP;
		}

		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    toep->tp_toedev->tod_name, toep->tp_tid,
		    tp->t_state);
	}
	inp_wunlock(tp->t_inpcb);


	/* Deferred actions run unlocked. */
	if (action == TCP_TIMEWAIT) {
		enter_timewait(tp);
	} else if (action == TCP_DROP) {
		tcp_offload_drop(tp, 0);
	} else if (action == TCP_CLOSE) {
		tcp_offload_close(tp);
	}
out:
	m_freem(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	process_close_con_rpl(toep, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
 */
static void
process_abort_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int needclose = 0;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(sk),
		  "process_abort_rpl: GTS rpl pending %d",
		  sock_flag(sk, ABORT_RPL_PENDING));
#endif

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		/*
		 * XXX panic on tcpdrop
		 */
		/*
		 * Non-T3A hardware sends two replies; remember the first
		 * one and only tear down on the second.
		 */
		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
			toep->tp_flags |= TP_ABORT_RPL_RCVD;
		else {
			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
			    !is_t3a(toep->tp_toedev)) {
				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
					panic("TP_ABORT_REQ_RCVD set");
				t3_release_offload_resources(toep);
				needclose = 1;
			}
		}
	}
	inp_wunlock(tp->t_inpcb);

	/* Close runs after the inpcb lock is dropped. */
	if (needclose)
		tcp_offload_close(tp);

	m_free(m);
}

/*
 * Handle an ABORT_RPL_RSS CPL message.
 */
static int
do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
	struct toepcb *toep;

	/*
	 * Ignore replies to post-close aborts indicating that the abort was
	 * requested too late.  These connections are terminated when we get
	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
	 * arrives the TID is either no longer used or it has been recycled.
	 */
	if (rpl->status == CPL_ERR_ABORT_FAILED) {
discard:
		m_free(m);
		return (0);
	}

	toep = (struct toepcb *)ctx;

        /*
	 * Sometimes we've already closed the socket, e.g., a post-close
	 * abort races with ABORT_REQ_RSS, the latter frees the socket
	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
	 * but FW turns the ABORT_REQ into a regular one and so we get
	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
	 */
	if (!toep)
		goto discard;

	/* Connection already gone: drop the TID/L2T entry and the toepcb. */
	if (toep->tp_tp == NULL) {
		log(LOG_NOTICE, "removing tid for abort\n");
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		toepcb_release(toep);
		goto discard;
	}

	log(LOG_NOTICE, "toep=%p\n", toep);
	log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);

	/* Hold the toepcb across processing so it cannot vanish under us. */
	toepcb_hold(toep);
	process_abort_rpl(toep, m);
	toepcb_release(toep);
	return (0);
}

/*
 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
 * indicate whether RST should be sent in response.
 *
 * NOTE(review): *need_rst is never written by this function — confirm
 * whether callers rely on it staying at its initialized value.
 */
static int
abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
{
	struct tcpcb *tp = so_sototcpcb(so);

	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
#if 0
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
#endif
	case CPL_ERR_CONN_RESET:
		// XXX need to handle SYN_RECV due to crossed SYNs
		return (tp->t_state == TCPS_CLOSE_WAIT ?
EPIPE : ECONNRESET); 2854 case CPL_ERR_XMIT_TIMEDOUT: 2855 case CPL_ERR_PERSIST_TIMEDOUT: 2856 case CPL_ERR_FINWAIT2_TIMEDOUT: 2857 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2858#if 0 2859 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2860#endif 2861 return (ETIMEDOUT); 2862 default: 2863 return (EIO); 2864 } 2865} 2866 2867static inline void 2868set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2869{ 2870 struct cpl_abort_rpl *rpl = cplhdr(m); 2871 2872 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2873 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2874 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2875 2876 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2877 rpl->cmd = cmd; 2878} 2879 2880static void 2881send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2882{ 2883 struct mbuf *reply_mbuf; 2884 struct cpl_abort_req_rss *req = cplhdr(m); 2885 2886 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2887 m_set_priority(m, CPL_PRIORITY_DATA); 2888 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2889 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2890 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2891 m_free(m); 2892} 2893 2894/* 2895 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2896 */ 2897static inline int 2898is_neg_adv_abort(unsigned int status) 2899{ 2900 return status == CPL_ERR_RTX_NEG_ADVICE || 2901 status == CPL_ERR_PERSIST_NEG_ADVICE; 2902} 2903 2904static void 2905send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2906{ 2907 struct mbuf *reply_mbuf; 2908 struct cpl_abort_req_rss *req = cplhdr(m); 2909 2910 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2911 2912 if (!reply_mbuf) { 2913 /* Defer the reply. Stick rst_status into req->cmd. 
 */
		req->status = rst_status;
		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
		return;
	}

	/* Build the reply, free the original request, then transmit. */
	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
	m_free(m);

	/*
	 * XXX need to sync with ARP as for SYN_RECV connections we can send
	 * these messages while ARP is pending.  For other connection states
	 * it's not a problem.
	 */
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
}

#ifdef notyet
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
	CXGB_UNIMPLEMENTED();
#ifdef notyet
	struct request_sock *req = child->sk_user_data;

	inet_csk_reqsk_queue_removed(parent, req);
	synq_remove(tcp_sk(child));
	__reqsk_free(req);
	child->sk_user_data = NULL;
#endif
}


/*
 * Performs the actual work to abort a SYN_RECV connection.
 */
static void
do_abort_syn_rcv(struct socket *child, struct socket *parent)
{
	struct tcpcb *parenttp = so_sototcpcb(parent);
	struct tcpcb *childtp = so_sototcpcb(child);

	/*
	 * If the server is still open we clean up the child connection,
	 * otherwise the server already did the clean up as it was purging
	 * its SYN queue and the skb was just sitting in its backlog.
	 */
	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
		cleanup_syn_rcv_conn(child, parent);
		inp_wlock(childtp->t_inpcb);
		t3_release_offload_resources(childtp->t_toe);
		inp_wunlock(childtp->t_inpcb);
		tcp_offload_close(childtp);
	}
}
#endif

/*
 * Handle abort requests for a SYN_RECV connection.  These need extra work
 * because the socket is on its parent's SYN queue.
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();
#ifdef notyet
	struct socket *parent;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;        /* somehow we are not on the SYN queue */

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	so_unlock(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 */
static void
process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int needclose = 0;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	/* First ABORT_REQ just records the event; no reply is sent yet. */
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		int error;

		error = abort_status_to_errno(so, req->status,
		    &rst_status);
		so_error_set(so, error);

		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
			so_sorwakeup(so);
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		needclose = 1;
	}
	inp_wunlock(tp->t_inpcb);

	if (needclose)
		tcp_offload_close(tp);

	send_abort_rpl(m, tdev, rst_status);
	return;
skip:
	inp_wunlock(tp->t_inpcb);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;

	/* Negative advice is informational only; drop it. */
	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);

	/*
	 * Abort for a connection still in SYN_RCVD: reply with no RST,
	 * release the TID/L2T entry and unhook the toepcb from the tcpcb.
	 */
	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;

		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 *  Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		log(LOG_ERR, "abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		log(LOG_NOTICE, "disconnected toepcb\n");
		/* should be freed momentarily */
		return (0);
	}


	/* Hold the toepcb across processing so it cannot vanish under us. */
	toepcb_hold(toep);
	process_abort_req(toep, m, toep->tp_toedev);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();

#ifdef notyet
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = so_sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	pass_open_abort(so, parent, m);
	so_unlock(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
	handle_pass_open_arp_failure(m_get_socket(m), m);
}

/*
 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
 * Builds the reply in reply_mbuf from the request carried in req_mbuf.
 */
static void
mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
{
	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
	unsigned int tid = GET_TID(req);

	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->peer_ip = req->peer_ip;	// req->peer_ip not overwritten yet
	rpl->opt0h = htonl(F_TCAM_BYPASS);
	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
	rpl->opt2 = 0;
	rpl->rsvd = rpl->opt2;                  /* workaround for HW bug */
}

/*
 * Send a deferred reject to an accept request.
 */
static void
reject_pass_request(struct toedev *tdev, struct mbuf *m)
{
	struct mbuf *reply_mbuf;

	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
	mk_pass_accept_rpl(reply_mbuf, m);
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
	m_free(m);
}

/*
 * Syncache callback: release the toepcb reference taken in
 * process_pass_accept_req() when the syncache no longer needs the entry.
 */
static void
handle_syncache_event(int event, void *arg)
{
	struct toepcb *toep = arg;

	switch (event) {
	case TOE_SC_ENTRY_PRESENT:
		/*
		 * entry already exists - free toepcb
		 * and l2t
		 */
		printf("syncache entry present\n");
		toepcb_release(toep);
		break;
	case TOE_SC_DROP:
		/*
		 * The syncache has given up on this entry
		 * either it timed out, or it was evicted
		 * we need to explicitly release the tid
		 */
		printf("syncache entry dropped\n");
		toepcb_release(toep);
		break;
	default:
		log(LOG_ERR, "unknown syncache event %d\n", event);
		break;
	}
}

/*
 * Enter an offloaded embryonic connection into the host TCP syncache,
 * translating the CPL_PASS_ACCEPT_REQ fields into the in_conninfo/toeopt/
 * tcphdr triple tcp_offload_syncache_add() expects.
 *
 * NOTE(review): 'th' is only partially initialized (sport/dport/seq/flags);
 * remaining tcphdr fields are left indeterminate — confirm the syncache
 * only reads the fields set here, or bzero 'th' first.
 */
static void
syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
{
	struct in_conninfo inc;
	struct toeopt toeo;
	struct tcphdr th;
	struct inpcb *inp;
	int mss, wsf, sack, ts;
	uint32_t rcv_isn = ntohl(req->rcv_isn);

	bzero(&toeo, sizeof(struct toeopt));
	inp = so_sotoinpcb(lso);

	/*
	 * Fill out information for entering us into the syncache
	 */
	bzero(&inc, sizeof(inc));
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_SYN;

	toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;

	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	DPRINTF("syncache add of %d:%d %d:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port));

	mss = req->tcp_options.mss;
	wsf = req->tcp_options.wsf;
	ts = req->tcp_options.tstamp;
	sack = req->tcp_options.sack;
	toeo.to_mss = mss;
	toeo.to_wscale = wsf;
	toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
	tcp_offload_syncache_add(&inc, &toeo, &th, inp, &lso, &cxgb_toe_usrreqs,
toep);
}


/*
 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
 * lock held.  Note that the sock here is a listening socket that is not owned
 * by the TOE.
 */
static void
process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
    struct listen_ctx *lctx)
{
	int rt_flags;
	struct l2t_entry *e;
	struct iff_mac tim;
	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
	struct cpl_pass_accept_rpl *rpl;
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tom_data *d = TOM_DATA(tdev);
	struct t3cdev *cdev = d->cdev;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *newtoep;
	struct rtentry *dst;
	struct sockaddr_in nam;
	struct t3c_data *td = T3C_DATA(cdev);

	/* No mbuf for the reply: defer (T3) or just release the TID. */
	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
	if (__predict_false(reply_mbuf == NULL)) {
		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
			t3_defer_reply(m, tdev, reject_pass_request);
		else {
			cxgb_queue_tid_release(cdev, tid);
			m_free(m);
		}
		DPRINTF("failed to get reply_mbuf\n");

		goto out;
	}

	/*
	 * NOTE(review): 'newtoep' is still uninitialized on the 'goto
	 * reject' paths below; the reject label passes it to
	 * mk_tid_release() on non-T3 hardware — verify.
	 */
	if (tp->t_state != TCPS_LISTEN) {
		DPRINTF("socket not in listen state\n");

		goto reject;
	}

	tim.mac_addr = req->dst_mac;
	tim.vlan_tag = ntohs(req->vlan_tag);
	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
		goto reject;
	}

#ifdef notyet
	/*
	 * XXX do route lookup to confirm that we're still listening on this
	 * address
	 */
	if (ip_route_input(skb, req->local_ip, req->peer_ip,
			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
		goto reject;
	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
	dst_release(skb->dst);	// done with the input route, release it
	skb->dst = NULL;

	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;
#endif
	/*
	 * XXX
	 */
	rt_flags = RTF_LOCAL;
	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;

	/*
	 * Calculate values and add to syncache
	 */

	newtoep = toepcb_alloc();
	if (newtoep == NULL)
		goto reject;

	bzero(&nam, sizeof(struct sockaddr_in));

	nam.sin_len = sizeof(struct sockaddr_in);
	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr =req->peer_ip;
	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);

	if (dst == NULL) {
		printf("failed to find route\n");
		goto reject;
	}
	/*
	 * NOTE(review): on e == NULL we only log, yet e->idx/e->smt_idx are
	 * dereferenced below when building opt0h — confirm this cannot
	 * happen or reject here.
	 */
	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
	}
	/*
	 * Point to our listen socket until accept
	 */
	newtoep->tp_tp = tp;
	newtoep->tp_flags = TP_SYN_RCVD;
	newtoep->tp_tid = tid;
	newtoep->tp_toedev = tdev;
	tp->rcv_wnd = select_rcv_wnd(tdev, so);

	cxgb_insert_tid(cdev, d->client, newtoep, tid);
	so_lock(so);
	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
	so_unlock(so);

	/* Enable DDP only when tuned on, allowed, and the window is big enough. */
	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;

	if (newtoep->tp_ulp_mode) {
		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);

		if (ddp_mbuf == NULL)
			newtoep->tp_ulp_mode = 0;
	}

	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_hold(newtoep);
	syncache_add_accept_req(req, so, newtoep);

	rpl = cplhdr(reply_mbuf);
	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wr_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->opt2 = htonl(calc_opt2(so, tdev));
	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten

	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
				  CPL_PASS_OPEN_ACCEPT);

	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);

	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));

	l2t_send(cdev, reply_mbuf, e);
	m_free(m);
	if (newtoep->tp_ulp_mode) {
		/* Disable DDP until buffers are posted (timer workaround). */
		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
				V_TF_DDP_OFF(1) |
				TP_DDP_TIMER_WORKAROUND_MASK,
				V_TF_DDP_OFF(1) |
				TP_DDP_TIMER_WORKAROUND_VAL, 1);
	} else
		DPRINTF("no DDP\n");

	return;
reject:
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
		mk_pass_accept_rpl(reply_mbuf, m);
	else
		mk_tid_release(reply_mbuf, newtoep, tid);
	cxgb_ofld_send(cdev, reply_mbuf);
	m_free(m);
out:
#if 0
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#else
	return;
#endif
}

/*
 * Handle
 a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

	/* Debug-only sanity checks of the STID/TID; Linux-derived, normally compiled out. */
#if VALIDATE_TID
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	/* All real work happens with the listen socket in hand. */
	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));

	inp_lock_assert(tp->t_inpcb);

	/* MSS clamp: MTU from the HW MTU table minus 40 (IP + TCP headers). */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ?
TF_RCVD_TSTMP : 0; 3527 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; 3528 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; 3529 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3530 (TF_RCVD_SCALE|TF_REQ_SCALE)) 3531 tp->rcv_scale = tp->request_r_scale; 3532} 3533 3534/* 3535 * Completes some final bits of initialization for just established connections 3536 * and changes their state to TCP_ESTABLISHED. 3537 * 3538 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 3539 */ 3540static void 3541make_established(struct socket *so, u32 snd_isn, unsigned int opt) 3542{ 3543 struct tcpcb *tp = so_sototcpcb(so); 3544 struct toepcb *toep = tp->t_toe; 3545 3546 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; 3547 assign_rxopt(so, opt); 3548 3549 /* 3550 *XXXXXXXXXXX 3551 * 3552 */ 3553#ifdef notyet 3554 so->so_proto->pr_ctloutput = t3_ctloutput; 3555#endif 3556 3557#if 0 3558 inet_sk(sk)->id = tp->write_seq ^ jiffies; 3559#endif 3560 /* 3561 * XXX not clear what rcv_wup maps to 3562 */ 3563 /* 3564 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't 3565 * pass through opt0. 
3566 */ 3567 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) 3568 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); 3569 3570 dump_toepcb(toep); 3571 3572#ifdef notyet 3573/* 3574 * no clean interface for marking ARP up to date 3575 */ 3576 dst_confirm(sk->sk_dst_cache); 3577#endif 3578 tp->t_starttime = ticks; 3579 tp->t_state = TCPS_ESTABLISHED; 3580 soisconnected(so); 3581} 3582 3583static int 3584syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) 3585{ 3586 3587 struct in_conninfo inc; 3588 struct toeopt toeo; 3589 struct tcphdr th; 3590 int mss, wsf, sack, ts; 3591 struct mbuf *m = NULL; 3592 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); 3593 unsigned int opt; 3594 3595#ifdef MAC 3596#error "no MAC support" 3597#endif 3598 3599 opt = ntohs(req->tcp_opt); 3600 3601 bzero(&toeo, sizeof(struct toeopt)); 3602 3603 /* 3604 * Fill out information for entering us into the syncache 3605 */ 3606 bzero(&inc, sizeof(inc)); 3607 inc.inc_fport = th.th_sport = req->peer_port; 3608 inc.inc_lport = th.th_dport = req->local_port; 3609 th.th_seq = req->rcv_isn; 3610 th.th_flags = TH_ACK; 3611 3612 inc.inc_len = 0; 3613 inc.inc_faddr.s_addr = req->peer_ip; 3614 inc.inc_laddr.s_addr = req->local_ip; 3615 3616 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3617 wsf = G_TCPOPT_WSCALE_OK(opt); 3618 ts = G_TCPOPT_TSTAMP(opt); 3619 sack = G_TCPOPT_SACK(opt); 3620 3621 toeo.to_mss = mss; 3622 toeo.to_wscale = G_TCPOPT_SND_WSCALE(opt); 3623 toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3624 3625 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", 3626 ntohl(req->local_ip), ntohs(req->local_port), 3627 ntohl(req->peer_ip), ntohs(req->peer_port), 3628 mss, wsf, ts, sack); 3629 return tcp_offload_syncache_expand(&inc, &toeo, &th, so, m); 3630} 3631 3632 3633/* 3634 * Process a CPL_PASS_ESTABLISH message. 
 XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	/* lso keeps the listen socket; so is rewritten by syncache expansion below. */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Take the embryonic connection off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* From here on tp/so refer to the newly created connection socket. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* The TOE hands us whole buffers; don't let sockbuf code coalesce them. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Re-point the toepcb at the new tcpcb and reset per-connection state. */
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	/* Response queue number is carried in csum_data by the RX path. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);
	inp_wunlock(tp->t_inpcb);

	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		/* Patch the now-known TID into the deferred CPL and ship it. */
		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	INIT_VNET_INET(so->so_vnet);
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	/* Seed all receive-side sequence tracking from the HW's RCV_ISN. */
	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	TCPSTAT_INC(tcps_connects);

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		/* Connection already gone; just return the active-open TID. */
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	/* Response queue number is carried in csum_data by the RX path. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct cpl_wr_ack *hdr = cplhdr(m);
	struct socket *so;
	unsigned int credits = ntohs(hdr->credits);
	u32 snd_una = ntohl(hdr->snd_una);
	int bytes = 0;		/* payload bytes covered by fully-acked WRs */
	struct sockbuf *snd;

	CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);
	toep->tp_wr_avail += credits;
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	/*
	 * Walk the pending-WR list, retiring WRs whose credits are fully
	 * covered.  Per-WR credit count lives in m_pkthdr.csum_data.
	 */
	while (credits) {
		struct mbuf *p = peek_wr(toep);

		if (__predict_false(!p)) {
			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u wr_avail=%u\n",
			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
			break;
		}
		CTR2(KTR_TOM,
			"wr_ack: p->credits=%d p->bytes=%d",
		    p->m_pkthdr.csum_data, p->m_pkthdr.len);
		KASSERT(p->m_pkthdr.csum_data != 0,
		    ("empty request still on list"));

		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
			/* Partial ack of the head WR: reduce its credit count and stop. */
#if DEBUG_WR > 1
			struct tx_data_wr *w = cplhdr(p);
			log(LOG_ERR,
			    "TID %u got %u WR credits, need %u, len %u, "
			    "main body %u, frags %u, seq # %u, ACK una %u,"
			    " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
			    toep->tp_tid, credits, p->csum, p->len,
			    p->len - p->data_len, skb_shinfo(p)->nr_frags,
			    ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
			    toep->tp_wr_avail, count_pending_wrs(tp) - credits);
#endif
			p->m_pkthdr.csum_data -= credits;
			break;
		} else {
			dequeue_wr(toep);
			credits -= p->m_pkthdr.csum_data;
			bytes += p->m_pkthdr.len;
			CTR3(KTR_TOM,
			    "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
			    p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);

			m_free(p);
		}
	}

#if DEBUG_WR
	check_wr_invariants(tp);
#endif

	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
		/* Stale/out-of-order snd_una; ignore it. */
#if VALIDATE_SEQ
		struct tom_data *d = TOM_DATA(TOE_DEV(so));

		log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
		    toep->tp_tid, tp->snd_una);
#endif
		goto out_free;
	}

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = ticks;
#ifdef notyet
		/*
		 * Keep ARP entry "minty fresh"
		 */
		dst_confirm(sk->sk_dst_cache);
#endif
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}

	snd = so_sockbuf_snd(so);
	if (bytes) {
		CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
		/* NOTE(review): redundant reassignment - snd was set just above. */
		snd = so_sockbuf_snd(so);
		sockbuf_lock(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);
	}

	/* More unsent data in the sockbuf? Push another batch of WRs. */
	if (snd->sb_sndptroff < snd->sb_cc)
		t3_push_frames(so, 0);

out_free:
	inp_wunlock(tp->t_inpcb);
	m_free(m);
}

/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	wr_ack(toep, m);
	return 0;
}

/*
 * Handler for TRACE_PKT CPL messages.  Just sink these packets.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	m_freem(m);
	return 0;
}

/*
 * Reset a connection that is on a listener's SYN queue or accept queue,
 * i.e., one that has not had a struct socket associated with it.
 * Must be called from process context.
 *
 * Modeled after code in inet_csk_listen_stop().
 */
static void
t3_reset_listen_child(struct socket *child)
{
	struct tcpcb *tp = so_sototcpcb(child);

	t3_send_reset(tp->t_toe);
}


/* Per-socket callback for so_listeners_apply_all(): reset offloaded children. */
static void
t3_child_disconnect(struct socket *so, void *arg)
{
	struct tcpcb *tp = so_sototcpcb(so);

	if (tp->t_flags & TF_TOE) {
		inp_wlock(tp->t_inpcb);
		t3_reset_listen_child(so);
		inp_wunlock(tp->t_inpcb);
	}
}

/*
 * Disconnect offloaded established but not yet accepted connections sitting
 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
 */
void
t3_disconnect_acceptq(struct socket *listen_so)
{

	so_lock(listen_so);
	so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
	so_unlock(listen_so);
}

/*
 * Reset offloaded connections sitting on a server's syn queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */

void
t3_reset_synq(struct listen_ctx *lctx)
{
	struct toepcb *toep;

	so_lock(lctx->lso);
	while (!LIST_EMPTY(&lctx->synq_head)) {
		toep = LIST_FIRST(&lctx->synq_head);
		LIST_REMOVE(toep, synq_entry);
		/* Detach from the (listen) tcpcb before resetting/releasing. */
		toep->tp_tp = NULL;
		t3_send_reset(toep);
		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
		toepcb_release(toep);
	}
	so_unlock(lctx->lso);
}


/*
 * Write nppods page pods for the DDP gather list into adapter memory at the
 * address derived from tag, via ULP_MEM_WRITE work requests (one per pod).
 * The trailing NUM_SENTINEL_PPODS pods are written as invalid sentinels.
 */
int
t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
		   unsigned int nppods, unsigned int tag, unsigned int maxoff,
		   unsigned int pg_off, unsigned int color)
{
	unsigned int i, j, pidx;
	struct pagepod *p;
	struct mbuf *m;
	struct ulp_mem_io *req;
	unsigned int tid = toep->tp_tid;
	const struct tom_data *td = TOM_DATA(toep->tp_toedev);
	unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;

	CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
	    gl, nppods, tag, maxoff, pg_off, color);

	for (i = 0; i < nppods; ++i) {
		m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
		m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
		req = mtod(m, struct ulp_mem_io *);
		m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
		req->wr.wr_lo = 0;
		/* Memory addresses are in 32-byte units, hence the >> 5. */
		req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
					   V_ULPTX_CMD(ULP_MEM_WRITE));
		req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
				 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));

		p = (struct pagepod *)(req + 1);
		if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
			p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
			p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
						     V_PPOD_COLOR(color));
			p->pp_max_offset = htonl(maxoff);
			p->pp_page_offset = htonl(pg_off);
			p->pp_rsvd = 0;
			/* 5 page addresses per pod; consecutive pods overlap by one page. */
			for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
				p->pp_addr[j] =
pidx < gl->dgl_nelem ? 4120 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4121 } else 4122 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4123 send_or_defer(toep, m, 0); 4124 ppod_addr += PPOD_SIZE; 4125 } 4126 return (0); 4127} 4128 4129/* 4130 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4131 */ 4132static inline void 4133mk_cpl_barrier_ulp(struct cpl_barrier *b) 4134{ 4135 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4136 4137 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4138 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4139 b->opcode = CPL_BARRIER; 4140} 4141 4142/* 4143 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4144 */ 4145static inline void 4146mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4147{ 4148 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4149 4150 txpkt = (struct ulp_txpkt *)req; 4151 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4152 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4153 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4154 req->cpuno = htons(cpuno); 4155} 4156 4157/* 4158 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4159 */ 4160static inline void 4161mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4162 unsigned int word, uint64_t mask, uint64_t val) 4163{ 4164 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4165 4166 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4167 tid, word, mask, val); 4168 4169 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4170 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4171 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4172 req->reply = V_NO_REPLY(1); 4173 req->cpu_idx = 0; 4174 req->word = htons(word); 4175 req->mask = htobe64(mask); 4176 req->val = htobe64(val); 4177} 4178 4179/* 4180 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
4181 */ 4182static void 4183mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, 4184 unsigned int tid, unsigned int credits) 4185{ 4186 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; 4187 4188 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4189 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); 4190 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); 4191 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 4192 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | 4193 V_RX_CREDITS(credits)); 4194} 4195 4196void 4197t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) 4198{ 4199 unsigned int wrlen; 4200 struct mbuf *m; 4201 struct work_request_hdr *wr; 4202 struct cpl_barrier *lock; 4203 struct cpl_set_tcb_field *req; 4204 struct cpl_get_tcb *getreq; 4205 struct ddp_state *p = &toep->tp_ddp_state; 4206 4207#if 0 4208 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4209#endif 4210 wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + 4211 sizeof(*getreq); 4212 m = m_gethdr_nofail(wrlen); 4213 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4214 wr = mtod(m, struct work_request_hdr *); 4215 bzero(wr, wrlen); 4216 4217 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4218 m->m_pkthdr.len = m->m_len = wrlen; 4219 4220 lock = (struct cpl_barrier *)(wr + 1); 4221 mk_cpl_barrier_ulp(lock); 4222 4223 req = (struct cpl_set_tcb_field *)(lock + 1); 4224 4225 CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); 4226 4227 /* Hmmm, not sure if this actually a good thing: reactivating 4228 * the other buffer might be an issue if it has been completed 4229 * already. However, that is unlikely, since the fact that the UBUF 4230 * is not completed indicates that there is no oustanding data. 
	 */
	if (bufidx == 0)
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @sk: the socket associated with the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience. Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
 */
void
t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
		  unsigned int tag1, unsigned int len)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_get_tcb *getreq;
	struct cpl_set_tcb_field *req;
	struct ddp_state *p = &toep->tp_ddp_state;

	/* NOTE(review): trace string says "t3_setup_ppods" - looks like a copy/paste slip. */
	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
	    bufidx, tag0, tag1, len);
#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	m->m_pkthdr.len = m->m_len = wrlen;
	bzero(wr, wrlen);


	/* Set the ATOMIC flag to make sure that TP processes the following
	 * CPLs in an atomic manner and no wire segments can be interleaved.
	 */
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
	req = (struct cpl_set_tcb_field *)(wr + 1);
	/* Both buffer tags live in one 64-bit TCB word: tag0 low, tag1 high. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
	req++;
	if (bufidx == 0) {
		 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
			    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
		 req++;
		 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			    V_TF_DDP_PUSH_DISABLE_0(1) |
			    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			    V_TF_DDP_PUSH_DISABLE_0(0) |
			    V_TF_DDP_BUF0_VALID(1));
	} else {
		 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
			    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
			    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
		 req++;
		 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			    V_TF_DDP_PUSH_DISABLE_1(1) |
			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			    V_TF_DDP_PUSH_DISABLE_1(0) |
			    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
	}

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
		  "len %d",
		  bufidx, tag0, tag1, len);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 */
void
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
		 unsigned int len1, unsigned int offset1,
		 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	/* WR size depends on which optional pieces (buf0, buf1, RX_DATA_ACK) are present. */
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
		(len1 ? sizeof(*req) : 0) +
		(modulate ?
		 sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {                  /* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
		req++;
	}
	if (len1) {                  /* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
		req++;
	}

	/* Finally, update the DDP flags under the caller-supplied mask. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
			     ddp_flags);

	if (modulate) {
		/* Return accumulated RX credits to the HW while we're at it. */
		mk_rx_data_ack_ulp(toep,
		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
		    toep->tp_copied_seq - toep->tp_rcv_wup);
		toep->tp_rcv_wup = toep->tp_copied_seq;
	}

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
		  "modulate %d",
		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
		  modulate);
#endif

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/*
 * Initialize the table mapping payload mbuf count to the number of WRs
 * needed to send it, given the per-WR length wr_len (in flits).
 * Idempotent: a non-zero mbuf_wrs[1] means the table is already built.
 */
void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])     /* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		/* SGL flits for i buffers, plus 3 flits of WR header/immediate. */
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;
		mbuf_wrs[i] = sgl_len <= wr_len ?
			1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	/* wrlen is the file-scope WR size in bytes (wr_len is in 8-byte flits). */
	wrlen = wr_len * 8;
}

/*
 * Register all CPL message handlers for the TOE with the t3 TOM core.
 * Always returns 0.
 */
int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		       "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif

	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
	return (0);
}