/* t4_listen.c, revision 346934 */
1/*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * Written by: Navdeep Parhar <np@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
26 */ 27 28#include <sys/cdefs.h> 29__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_listen.c 346934 2019-04-29 22:16:33Z np $"); 30 31#include "opt_inet.h" 32#include "opt_inet6.h" 33 34#ifdef TCP_OFFLOAD 35#include <sys/param.h> 36#include <sys/types.h> 37#include <sys/kernel.h> 38#include <sys/ktr.h> 39#include <sys/module.h> 40#include <sys/protosw.h> 41#include <sys/refcount.h> 42#include <sys/domain.h> 43#include <sys/fnv_hash.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <sys/sysctl.h> 47#include <net/ethernet.h> 48#include <net/if.h> 49#include <net/if_types.h> 50#include <net/if_vlan_var.h> 51#include <net/route.h> 52#include <netinet/in.h> 53#include <netinet/in_fib.h> 54#include <netinet/in_pcb.h> 55#include <netinet/ip.h> 56#include <netinet/ip6.h> 57#include <netinet6/in6_fib.h> 58#include <netinet6/scope6_var.h> 59#include <netinet/tcp_timer.h> 60#define TCPSTATES 61#include <netinet/tcp_fsm.h> 62#include <netinet/tcp_var.h> 63#include <netinet/toecore.h> 64#include <netinet/cc/cc.h> 65 66#include "common/common.h" 67#include "common/t4_msg.h" 68#include "common/t4_regs.h" 69#include "t4_clip.h" 70#include "tom/t4_tom_l2t.h" 71#include "tom/t4_tom.h" 72 73/* stid services */ 74static int alloc_stid(struct adapter *, struct listen_ctx *, int); 75static struct listen_ctx *lookup_stid(struct adapter *, int); 76static void free_stid(struct adapter *, struct listen_ctx *); 77 78/* lctx services */ 79static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 80 struct vi_info *); 81static int free_lctx(struct adapter *, struct listen_ctx *); 82static void hold_lctx(struct listen_ctx *); 83static void listen_hash_add(struct adapter *, struct listen_ctx *); 84static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 85static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 86static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 87 88static inline void 
save_qids_in_mbuf(struct mbuf *, struct vi_info *, 89 struct offload_settings *); 90static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); 91static void send_reset_synqe(struct toedev *, struct synq_entry *); 92 93static int 94alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) 95{ 96 struct tid_info *t = &sc->tids; 97 u_int stid, n, f, mask; 98 struct stid_region *sr = &lctx->stid_region; 99 100 /* 101 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in 102 * the TCAM. The start of the stid region is properly aligned (the chip 103 * requires each region to be 128-cell aligned). 104 */ 105 n = isipv6 ? 2 : 1; 106 mask = n - 1; 107 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, 108 ("%s: stid region (%u, %u) not properly aligned. n = %u", 109 __func__, t->stid_base, t->nstids, n)); 110 111 mtx_lock(&t->stid_lock); 112 if (n > t->nstids - t->stids_in_use) { 113 mtx_unlock(&t->stid_lock); 114 return (-1); 115 } 116 117 if (t->nstids_free_head >= n) { 118 /* 119 * This allocation will definitely succeed because the region 120 * starts at a good alignment and we just checked we have enough 121 * stids free. 122 */ 123 f = t->nstids_free_head & mask; 124 t->nstids_free_head -= n + f; 125 stid = t->nstids_free_head; 126 TAILQ_INSERT_HEAD(&t->stids, sr, link); 127 } else { 128 struct stid_region *s; 129 130 stid = t->nstids_free_head; 131 TAILQ_FOREACH(s, &t->stids, link) { 132 stid += s->used + s->free; 133 f = stid & mask; 134 if (s->free >= n + f) { 135 stid -= n + f; 136 s->free -= n + f; 137 TAILQ_INSERT_AFTER(&t->stids, s, sr, link); 138 goto allocated; 139 } 140 } 141 142 if (__predict_false(stid != t->nstids)) { 143 panic("%s: stids TAILQ (%p) corrupt." 
144 " At %d instead of %d at the end of the queue.", 145 __func__, &t->stids, stid, t->nstids); 146 } 147 148 mtx_unlock(&t->stid_lock); 149 return (-1); 150 } 151 152allocated: 153 sr->used = n; 154 sr->free = f; 155 t->stids_in_use += n; 156 t->stid_tab[stid] = lctx; 157 mtx_unlock(&t->stid_lock); 158 159 KASSERT(((stid + t->stid_base) & mask) == 0, 160 ("%s: EDOOFUS.", __func__)); 161 return (stid + t->stid_base); 162} 163 164static struct listen_ctx * 165lookup_stid(struct adapter *sc, int stid) 166{ 167 struct tid_info *t = &sc->tids; 168 169 return (t->stid_tab[stid - t->stid_base]); 170} 171 172static void 173free_stid(struct adapter *sc, struct listen_ctx *lctx) 174{ 175 struct tid_info *t = &sc->tids; 176 struct stid_region *sr = &lctx->stid_region; 177 struct stid_region *s; 178 179 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); 180 181 mtx_lock(&t->stid_lock); 182 s = TAILQ_PREV(sr, stid_head, link); 183 if (s != NULL) 184 s->free += sr->used + sr->free; 185 else 186 t->nstids_free_head += sr->used + sr->free; 187 KASSERT(t->stids_in_use >= sr->used, 188 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, 189 t->stids_in_use, sr->used)); 190 t->stids_in_use -= sr->used; 191 TAILQ_REMOVE(&t->stids, sr, link); 192 mtx_unlock(&t->stid_lock); 193} 194 195static struct listen_ctx * 196alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 197{ 198 struct listen_ctx *lctx; 199 200 INP_WLOCK_ASSERT(inp); 201 202 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 203 if (lctx == NULL) 204 return (NULL); 205 206 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); 207 if (lctx->stid < 0) { 208 free(lctx, M_CXGBE); 209 return (NULL); 210 } 211 212 if (inp->inp_vflag & INP_IPV6 && 213 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 214 lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL); 215 if (lctx->ce == NULL) { 216 free(lctx, M_CXGBE); 217 return (NULL); 218 } 219 } 220 221 
lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 222 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 223 refcount_init(&lctx->refcount, 1); 224 TAILQ_INIT(&lctx->synq); 225 226 lctx->inp = inp; 227 lctx->vnet = inp->inp_socket->so_vnet; 228 in_pcbref(inp); 229 230 return (lctx); 231} 232 233/* Don't call this directly, use release_lctx instead */ 234static int 235free_lctx(struct adapter *sc, struct listen_ctx *lctx) 236{ 237 struct inpcb *inp = lctx->inp; 238 239 INP_WLOCK_ASSERT(inp); 240 KASSERT(lctx->refcount == 0, 241 ("%s: refcount %d", __func__, lctx->refcount)); 242 KASSERT(TAILQ_EMPTY(&lctx->synq), 243 ("%s: synq not empty.", __func__)); 244 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 245 246 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 247 __func__, lctx->stid, lctx, lctx->inp); 248 249 if (lctx->ce) 250 t4_release_lip(sc, lctx->ce); 251 free_stid(sc, lctx); 252 free(lctx, M_CXGBE); 253 254 return (in_pcbrele_wlocked(inp)); 255} 256 257static void 258hold_lctx(struct listen_ctx *lctx) 259{ 260 261 refcount_acquire(&lctx->refcount); 262} 263 264static inline uint32_t 265listen_hashfn(void *key, u_long mask) 266{ 267 268 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 269} 270 271/* 272 * Add a listen_ctx entry to the listen hash table. 273 */ 274static void 275listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 276{ 277 struct tom_data *td = sc->tom_softc; 278 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 279 280 mtx_lock(&td->lctx_hash_lock); 281 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 282 td->lctx_count++; 283 mtx_unlock(&td->lctx_hash_lock); 284} 285 286/* 287 * Look for the listening socket's context entry in the hash and return it. 
288 */ 289static struct listen_ctx * 290listen_hash_find(struct adapter *sc, struct inpcb *inp) 291{ 292 struct tom_data *td = sc->tom_softc; 293 int bucket = listen_hashfn(inp, td->listen_mask); 294 struct listen_ctx *lctx; 295 296 mtx_lock(&td->lctx_hash_lock); 297 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 298 if (lctx->inp == inp) 299 break; 300 } 301 mtx_unlock(&td->lctx_hash_lock); 302 303 return (lctx); 304} 305 306/* 307 * Removes the listen_ctx structure for inp from the hash and returns it. 308 */ 309static struct listen_ctx * 310listen_hash_del(struct adapter *sc, struct inpcb *inp) 311{ 312 struct tom_data *td = sc->tom_softc; 313 int bucket = listen_hashfn(inp, td->listen_mask); 314 struct listen_ctx *lctx, *l; 315 316 mtx_lock(&td->lctx_hash_lock); 317 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 318 if (lctx->inp == inp) { 319 LIST_REMOVE(lctx, link); 320 td->lctx_count--; 321 break; 322 } 323 } 324 mtx_unlock(&td->lctx_hash_lock); 325 326 return (lctx); 327} 328 329/* 330 * Releases a hold on the lctx. Must be called with the listening socket's inp 331 * locked. The inp may be freed by this function and it returns NULL to 332 * indicate this. 333 */ 334static struct inpcb * 335release_lctx(struct adapter *sc, struct listen_ctx *lctx) 336{ 337 struct inpcb *inp = lctx->inp; 338 int inp_freed = 0; 339 340 INP_WLOCK_ASSERT(inp); 341 if (refcount_release(&lctx->refcount)) 342 inp_freed = free_lctx(sc, lctx); 343 344 return (inp_freed ? 
NULL : inp); 345} 346 347static void 348send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) 349{ 350 struct adapter *sc = tod->tod_softc; 351 struct mbuf *m = synqe->syn; 352 struct ifnet *ifp = m->m_pkthdr.rcvif; 353 struct vi_info *vi = ifp->if_softc; 354 struct port_info *pi = vi->pi; 355 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 356 struct wrqe *wr; 357 struct fw_flowc_wr *flowc; 358 struct cpl_abort_req *req; 359 int txqid, rxqid, flowclen; 360 struct sge_wrq *ofld_txq; 361 struct sge_ofld_rxq *ofld_rxq; 362 const int nparams = 6; 363 unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN; 364 365 INP_WLOCK_ASSERT(synqe->lctx->inp); 366 367 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 368 __func__, synqe, synqe->flags, synqe->tid, 369 synqe->flags & TPF_ABORT_SHUTDOWN ? 370 " (abort already in progress)" : ""); 371 if (synqe->flags & TPF_ABORT_SHUTDOWN) 372 return; /* abort already in progress */ 373 synqe->flags |= TPF_ABORT_SHUTDOWN; 374 375 get_qids_from_mbuf(m, &txqid, &rxqid); 376 ofld_txq = &sc->sge.ofld_txq[txqid]; 377 ofld_rxq = &sc->sge.ofld_rxq[rxqid]; 378 379 /* The wrqe will have two WRs - a flowc followed by an abort_req */ 380 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 381 382 wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq); 383 if (wr == NULL) { 384 /* XXX */ 385 panic("%s: allocation failure.", __func__); 386 } 387 flowc = wrtod(wr); 388 req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE)); 389 390 /* First the flowc ... 
*/ 391 memset(flowc, 0, wr->wr_len); 392 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 393 V_FW_FLOWC_WR_NPARAMS(nparams)); 394 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 395 V_FW_WR_FLOWID(synqe->tid)); 396 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 397 flowc->mnemval[0].val = htobe32(pfvf); 398 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 399 flowc->mnemval[1].val = htobe32(pi->tx_chan); 400 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 401 flowc->mnemval[2].val = htobe32(pi->tx_chan); 402 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 403 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 404 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 405 flowc->mnemval[4].val = htobe32(512); 406 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 407 flowc->mnemval[5].val = htobe32(512); 408 synqe->flags |= TPF_FLOWC_WR_SENT; 409 410 /* ... then ABORT request */ 411 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 412 req->rsvd0 = 0; /* don't have a snd_nxt */ 413 req->rsvd1 = 1; /* no data sent yet */ 414 req->cmd = CPL_ABORT_SEND_RST; 415 416 t4_l2t_send(sc, wr, e); 417} 418 419static int 420create_server(struct adapter *sc, struct listen_ctx *lctx) 421{ 422 struct wrqe *wr; 423 struct cpl_pass_open_req *req; 424 struct inpcb *inp = lctx->inp; 425 426 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 427 if (wr == NULL) { 428 log(LOG_ERR, "%s: allocation failure", __func__); 429 return (ENOMEM); 430 } 431 req = wrtod(wr); 432 433 INIT_TP_WR(req, 0); 434 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 435 req->local_port = inp->inp_lport; 436 req->peer_port = 0; 437 req->local_ip = inp->inp_laddr.s_addr; 438 req->peer_ip = 0; 439 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 440 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 441 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 442 443 t4_wrq_tx(sc, wr); 444 return (0); 445} 446 447static int 
448create_server6(struct adapter *sc, struct listen_ctx *lctx) 449{ 450 struct wrqe *wr; 451 struct cpl_pass_open_req6 *req; 452 struct inpcb *inp = lctx->inp; 453 454 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 455 if (wr == NULL) { 456 log(LOG_ERR, "%s: allocation failure", __func__); 457 return (ENOMEM); 458 } 459 req = wrtod(wr); 460 461 INIT_TP_WR(req, 0); 462 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 463 req->local_port = inp->inp_lport; 464 req->peer_port = 0; 465 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 466 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 467 req->peer_ip_hi = 0; 468 req->peer_ip_lo = 0; 469 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 470 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 471 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 472 473 t4_wrq_tx(sc, wr); 474 return (0); 475} 476 477static int 478destroy_server(struct adapter *sc, struct listen_ctx *lctx) 479{ 480 struct wrqe *wr; 481 struct cpl_close_listsvr_req *req; 482 483 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 484 if (wr == NULL) { 485 /* XXX */ 486 panic("%s: allocation failure.", __func__); 487 } 488 req = wrtod(wr); 489 490 INIT_TP_WR(req, 0); 491 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 492 lctx->stid)); 493 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 494 req->rsvd = htobe16(0); 495 496 t4_wrq_tx(sc, wr); 497 return (0); 498} 499 500/* 501 * Start a listening server by sending a passive open request to HW. 502 * 503 * Can't take adapter lock here and access to sc->flags, 504 * sc->offload_map, if_capenable are all race prone. 
505 */ 506int 507t4_listen_start(struct toedev *tod, struct tcpcb *tp) 508{ 509 struct adapter *sc = tod->tod_softc; 510 struct vi_info *vi; 511 struct port_info *pi; 512 struct inpcb *inp = tp->t_inpcb; 513 struct listen_ctx *lctx; 514 int i, rc, v; 515 struct offload_settings settings; 516 517 INP_WLOCK_ASSERT(inp); 518 519 rw_rlock(&sc->policy_lock); 520 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff, 521 inp); 522 rw_runlock(&sc->policy_lock); 523 if (!settings.offload) 524 return (0); 525 526 /* Don't start a hardware listener for any loopback address. */ 527 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 528 return (0); 529 if (!(inp->inp_vflag & INP_IPV6) && 530 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 531 return (0); 532#if 0 533 ADAPTER_LOCK(sc); 534 if (IS_BUSY(sc)) { 535 log(LOG_ERR, "%s: listen request ignored, %s is busy", 536 __func__, device_get_nameunit(sc->dev)); 537 goto done; 538 } 539 540 KASSERT(uld_active(sc, ULD_TOM), 541 ("%s: TOM not initialized", __func__)); 542#endif 543 544 /* 545 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 546 * such VI's queues to send the passive open and receive the reply to 547 * it. 548 * 549 * XXX: need a way to mark a port in use by offload. if_cxgbe should 550 * then reject any attempt to bring down such a port (and maybe reject 551 * attempts to disable IFCAP_TOE on that port too?). 
552 */ 553 for_each_port(sc, i) { 554 pi = sc->port[i]; 555 for_each_vi(pi, v, vi) { 556 if (vi->flags & VI_INIT_DONE && 557 vi->ifp->if_capenable & IFCAP_TOE) 558 goto found; 559 } 560 } 561 goto done; /* no port that's UP with IFCAP_TOE enabled */ 562found: 563 564 if (listen_hash_find(sc, inp) != NULL) 565 goto done; /* already setup */ 566 567 lctx = alloc_lctx(sc, inp, vi); 568 if (lctx == NULL) { 569 log(LOG_ERR, 570 "%s: listen request ignored, %s couldn't allocate lctx\n", 571 __func__, device_get_nameunit(sc->dev)); 572 goto done; 573 } 574 listen_hash_add(sc, lctx); 575 576 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 577 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 578 inp->inp_vflag); 579 580 if (inp->inp_vflag & INP_IPV6) 581 rc = create_server6(sc, lctx); 582 else 583 rc = create_server(sc, lctx); 584 if (rc != 0) { 585 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 586 __func__, device_get_nameunit(sc->dev), rc); 587 (void) listen_hash_del(sc, inp); 588 inp = release_lctx(sc, lctx); 589 /* can't be freed, host stack has a reference */ 590 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 591 goto done; 592 } 593 lctx->flags |= LCTX_RPL_PENDING; 594done: 595#if 0 596 ADAPTER_UNLOCK(sc); 597#endif 598 return (0); 599} 600 601int 602t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 603{ 604 struct listen_ctx *lctx; 605 struct adapter *sc = tod->tod_softc; 606 struct inpcb *inp = tp->t_inpcb; 607 struct synq_entry *synqe; 608 609 INP_WLOCK_ASSERT(inp); 610 611 lctx = listen_hash_del(sc, inp); 612 if (lctx == NULL) 613 return (ENOENT); /* no hardware listener for this inp */ 614 615 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 616 lctx, lctx->flags); 617 618 /* 619 * If the reply to the PASS_OPEN is still pending we'll wait for it to 620 * arrive and clean up when it does. 
621 */ 622 if (lctx->flags & LCTX_RPL_PENDING) { 623 KASSERT(TAILQ_EMPTY(&lctx->synq), 624 ("%s: synq not empty.", __func__)); 625 return (EINPROGRESS); 626 } 627 628 /* 629 * The host stack will abort all the connections on the listening 630 * socket's so_comp. It doesn't know about the connections on the synq 631 * so we need to take care of those. 632 */ 633 TAILQ_FOREACH(synqe, &lctx->synq, link) { 634 if (synqe->flags & TPF_SYNQE_HAS_L2TE) 635 send_reset_synqe(tod, synqe); 636 } 637 638 destroy_server(sc, lctx); 639 return (0); 640} 641 642static inline void 643hold_synqe(struct synq_entry *synqe) 644{ 645 646 refcount_acquire(&synqe->refcnt); 647} 648 649static inline void 650release_synqe(struct synq_entry *synqe) 651{ 652 653 if (refcount_release(&synqe->refcnt)) { 654 int needfree = synqe->flags & TPF_SYNQE_NEEDFREE; 655 656 m_freem(synqe->syn); 657 if (needfree) 658 free(synqe, M_CXGBE); 659 } 660} 661 662void 663t4_syncache_added(struct toedev *tod __unused, void *arg) 664{ 665 struct synq_entry *synqe = arg; 666 667 hold_synqe(synqe); 668} 669 670void 671t4_syncache_removed(struct toedev *tod __unused, void *arg) 672{ 673 struct synq_entry *synqe = arg; 674 675 release_synqe(synqe); 676} 677 678int 679t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 680{ 681 struct adapter *sc = tod->tod_softc; 682 struct synq_entry *synqe = arg; 683 struct wrqe *wr; 684 struct l2t_entry *e; 685 struct tcpopt to; 686 struct ip *ip = mtod(m, struct ip *); 687 struct tcphdr *th; 688 689 wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr); 690 if (wr == NULL) { 691 m_freem(m); 692 return (EALREADY); 693 } 694 695 if (ip->ip_v == IPVERSION) 696 th = (void *)(ip + 1); 697 else 698 th = (void *)((struct ip6_hdr *)ip + 1); 699 bzero(&to, sizeof(to)); 700 tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), 701 TO_SYN); 702 703 /* save these for later */ 704 synqe->iss = be32toh(th->th_seq); 705 synqe->ts = to.to_tsval; 706 707 if 
(chip_id(sc) >= CHELSIO_T5) { 708 struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr); 709 710 rpl5->iss = th->th_seq; 711 } 712 713 e = &sc->l2t->l2tab[synqe->l2e_idx]; 714 t4_l2t_send(sc, wr, e); 715 716 m_freem(m); /* don't need this any more */ 717 return (0); 718} 719 720static int 721do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 722 struct mbuf *m) 723{ 724 struct adapter *sc = iq->adapter; 725 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 726 int stid = GET_TID(cpl); 727 unsigned int status = cpl->status; 728 struct listen_ctx *lctx = lookup_stid(sc, stid); 729 struct inpcb *inp = lctx->inp; 730#ifdef INVARIANTS 731 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 732#endif 733 734 KASSERT(opcode == CPL_PASS_OPEN_RPL, 735 ("%s: unexpected opcode 0x%x", __func__, opcode)); 736 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 737 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 738 739 INP_WLOCK(inp); 740 741 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 742 __func__, stid, status, lctx->flags); 743 744 lctx->flags &= ~LCTX_RPL_PENDING; 745 746 if (status != CPL_ERR_NONE) 747 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 748 749#ifdef INVARIANTS 750 /* 751 * If the inp has been dropped (listening socket closed) then 752 * listen_stop must have run and taken the inp out of the hash. 753 */ 754 if (inp->inp_flags & INP_DROPPED) { 755 KASSERT(listen_hash_del(sc, inp) == NULL, 756 ("%s: inp %p still in listen hash", __func__, inp)); 757 } 758#endif 759 760 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 761 if (release_lctx(sc, lctx) != NULL) 762 INP_WUNLOCK(inp); 763 return (status); 764 } 765 766 /* 767 * Listening socket stopped listening earlier and now the chip tells us 768 * it has started the hardware listener. Stop it; the lctx will be 769 * released in do_close_server_rpl. 
770 */ 771 if (inp->inp_flags & INP_DROPPED) { 772 destroy_server(sc, lctx); 773 INP_WUNLOCK(inp); 774 return (status); 775 } 776 777 /* 778 * Failed to start hardware listener. Take inp out of the hash and 779 * release our reference on it. An error message has been logged 780 * already. 781 */ 782 if (status != CPL_ERR_NONE) { 783 listen_hash_del(sc, inp); 784 if (release_lctx(sc, lctx) != NULL) 785 INP_WUNLOCK(inp); 786 return (status); 787 } 788 789 /* hardware listener open for business */ 790 791 INP_WUNLOCK(inp); 792 return (status); 793} 794 795static int 796do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 797 struct mbuf *m) 798{ 799 struct adapter *sc = iq->adapter; 800 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 801 int stid = GET_TID(cpl); 802 unsigned int status = cpl->status; 803 struct listen_ctx *lctx = lookup_stid(sc, stid); 804 struct inpcb *inp = lctx->inp; 805#ifdef INVARIANTS 806 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 807#endif 808 809 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 810 ("%s: unexpected opcode 0x%x", __func__, opcode)); 811 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 812 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 813 814 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 815 816 if (status != CPL_ERR_NONE) { 817 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 818 __func__, status, stid); 819 return (status); 820 } 821 822 INP_WLOCK(inp); 823 inp = release_lctx(sc, lctx); 824 if (inp != NULL) 825 INP_WUNLOCK(inp); 826 827 return (status); 828} 829 830static void 831done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 832{ 833 struct listen_ctx *lctx = synqe->lctx; 834 struct inpcb *inp = lctx->inp; 835 struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc; 836 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 837 int ntids; 838 839 INP_WLOCK_ASSERT(inp); 840 ntids = 
inp->inp_vflag & INP_IPV6 ? 2 : 1; 841 842 TAILQ_REMOVE(&lctx->synq, synqe, link); 843 inp = release_lctx(sc, lctx); 844 if (inp) 845 INP_WUNLOCK(inp); 846 remove_tid(sc, synqe->tid, ntids); 847 release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]); 848 t4_l2t_release(e); 849 release_synqe(synqe); /* removed from synq list */ 850} 851 852int 853do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 854 struct mbuf *m) 855{ 856 struct adapter *sc = iq->adapter; 857 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 858 unsigned int tid = GET_TID(cpl); 859 struct synq_entry *synqe = lookup_tid(sc, tid); 860 struct listen_ctx *lctx = synqe->lctx; 861 struct inpcb *inp = lctx->inp; 862 int txqid; 863 struct sge_wrq *ofld_txq; 864#ifdef INVARIANTS 865 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 866#endif 867 868 KASSERT(opcode == CPL_ABORT_REQ_RSS, 869 ("%s: unexpected opcode 0x%x", __func__, opcode)); 870 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 871 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 872 873 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 874 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 875 876 if (negative_advice(cpl->status)) 877 return (0); /* Ignore negative advice */ 878 879 INP_WLOCK(inp); 880 881 get_qids_from_mbuf(synqe->syn, &txqid, NULL); 882 ofld_txq = &sc->sge.ofld_txq[txqid]; 883 884 /* 885 * If we'd initiated an abort earlier the reply to it is responsible for 886 * cleaning up resources. Otherwise we tear everything down right here 887 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
888 */ 889 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 890 INP_WUNLOCK(inp); 891 goto done; 892 } 893 894 done_with_synqe(sc, synqe); 895 /* inp lock released by done_with_synqe */ 896done: 897 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 898 return (0); 899} 900 901int 902do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 903 struct mbuf *m) 904{ 905 struct adapter *sc = iq->adapter; 906 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 907 unsigned int tid = GET_TID(cpl); 908 struct synq_entry *synqe = lookup_tid(sc, tid); 909 struct listen_ctx *lctx = synqe->lctx; 910 struct inpcb *inp = lctx->inp; 911#ifdef INVARIANTS 912 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 913#endif 914 915 KASSERT(opcode == CPL_ABORT_RPL_RSS, 916 ("%s: unexpected opcode 0x%x", __func__, opcode)); 917 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 918 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 919 920 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 921 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 922 923 INP_WLOCK(inp); 924 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 925 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 926 __func__, synqe, synqe->flags)); 927 928 done_with_synqe(sc, synqe); 929 /* inp lock released by done_with_synqe */ 930 931 return (0); 932} 933 934void 935t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 936{ 937 struct adapter *sc = tod->tod_softc; 938 struct synq_entry *synqe = arg; 939#ifdef INVARIANTS 940 struct inpcb *inp = sotoinpcb(so); 941#endif 942 struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); 943 struct toepcb *toep = *(struct toepcb **)(cpl + 1); 944 945 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ 946 INP_WLOCK_ASSERT(inp); 947 KASSERT(synqe->flags & TPF_SYNQE, 948 ("%s: %p not a synq_entry?", __func__, arg)); 949 950 offload_socket(so, toep); 951 
make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); 952 toep->flags |= TPF_CPL_PENDING; 953 update_tid(sc, synqe->tid, toep); 954 synqe->flags |= TPF_SYNQE_EXPANDED; 955} 956 957static inline void 958save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi, 959 struct offload_settings *s) 960{ 961 uint32_t txqid, rxqid; 962 963 if (s->txq >= 0 && s->txq < vi->nofldtxq) 964 txqid = s->txq; 965 else 966 txqid = arc4random() % vi->nofldtxq; 967 txqid += vi->first_ofld_txq; 968 969 if (s->rxq >= 0 && s->rxq < vi->nofldrxq) 970 rxqid = s->rxq; 971 else 972 rxqid = arc4random() % vi->nofldrxq; 973 rxqid += vi->first_ofld_rxq; 974 975 m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); 976} 977 978static inline void 979get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid) 980{ 981 982 if (txqid) 983 *txqid = m->m_pkthdr.flowid >> 16; 984 if (rxqid) 985 *rxqid = m->m_pkthdr.flowid & 0xffff; 986} 987 988/* 989 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to 990 * store some state temporarily. 
991 */ 992static struct synq_entry * 993mbuf_to_synqe(struct mbuf *m) 994{ 995 int len = roundup2(sizeof (struct synq_entry), 8); 996 int tspace = M_TRAILINGSPACE(m); 997 struct synq_entry *synqe = NULL; 998 999 if (tspace < len) { 1000 synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT); 1001 if (synqe == NULL) 1002 return (NULL); 1003 synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE; 1004 } else { 1005 synqe = (void *)(m->m_data + m->m_len + tspace - len); 1006 synqe->flags = TPF_SYNQE; 1007 } 1008 1009 return (synqe); 1010} 1011 1012static void 1013t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 1014{ 1015 bzero(to, sizeof(*to)); 1016 1017 if (t4opt->mss) { 1018 to->to_flags |= TOF_MSS; 1019 to->to_mss = be16toh(t4opt->mss); 1020 } 1021 1022 if (t4opt->wsf) { 1023 to->to_flags |= TOF_SCALE; 1024 to->to_wscale = t4opt->wsf; 1025 } 1026 1027 if (t4opt->tstamp) 1028 to->to_flags |= TOF_TS; 1029 1030 if (t4opt->sack) 1031 to->to_flags |= TOF_SACKPERM; 1032} 1033 1034/* 1035 * Options2 for passive open. 1036 */ 1037static uint32_t 1038calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, 1039 const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode, 1040 struct cc_algo *cc, const struct offload_settings *s) 1041{ 1042 struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; 1043 uint32_t opt2 = 0; 1044 1045 /* 1046 * rx flow control, rx coalesce, congestion control, and tx pace are all 1047 * explicitly set by the driver. On T5+ the ISS is also set by the 1048 * driver to the value picked by the kernel. 
1049 */ 1050 if (is_t4(sc)) { 1051 opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; 1052 opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; 1053 } else { 1054 opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ 1055 opt2 |= F_T5_ISS; /* ISS provided in CPL */ 1056 } 1057 1058 if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323))) 1059 opt2 |= F_SACK_EN; 1060 1061 if (tcpopt->tstamp && 1062 (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) 1063 opt2 |= F_TSTAMPS_EN; 1064 1065 if (tcpopt->wsf < 15 && V_tcp_do_rfc1323) 1066 opt2 |= F_WND_SCALE_EN; 1067 1068 if (th->th_flags & (TH_ECE | TH_CWR) && 1069 (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) 1070 opt2 |= F_CCTRL_ECN; 1071 1072 /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ 1073 1074 opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); 1075 1076 /* These defaults are subject to ULP specific fixups later. */ 1077 opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); 1078 1079 opt2 |= V_PACE(0); 1080 1081 if (s->cong_algo >= 0) 1082 opt2 |= V_CONG_CNTRL(s->cong_algo); 1083 else if (sc->tt.cong_algorithm >= 0) 1084 opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); 1085 else { 1086 if (strcasecmp(cc->name, "reno") == 0) 1087 opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); 1088 else if (strcasecmp(cc->name, "tahoe") == 0) 1089 opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); 1090 if (strcasecmp(cc->name, "newreno") == 0) 1091 opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); 1092 if (strcasecmp(cc->name, "highspeed") == 0) 1093 opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); 1094 else { 1095 /* 1096 * Use newreno in case the algorithm selected by the 1097 * host stack is not supported by the hardware. 1098 */ 1099 opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); 1100 } 1101 } 1102 1103 if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) 1104 opt2 |= V_RX_COALESCE(M_RX_COALESCE); 1105 1106 /* Note that ofld_rxq is already set according to s->rxq. 
*/ 1107 opt2 |= F_RSS_QUEUE_VALID; 1108 opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id); 1109 1110#ifdef USE_DDP_RX_FLOW_CONTROL 1111 if (ulp_mode == ULP_MODE_TCPDDP) 1112 opt2 |= F_RX_FC_DDP; 1113#endif 1114 1115 if (ulp_mode == ULP_MODE_TLS) { 1116 opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); 1117 opt2 |= F_RX_FC_DISABLE; 1118 } 1119 1120 return (htobe32(opt2)); 1121} 1122 1123static void 1124pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1125 struct in_conninfo *inc, struct tcphdr *th) 1126{ 1127 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1128 const struct ether_header *eh; 1129 unsigned int hlen = be32toh(cpl->hdr_len); 1130 uintptr_t l3hdr; 1131 const struct tcphdr *tcp; 1132 1133 eh = (const void *)(cpl + 1); 1134 if (chip_id(sc) >= CHELSIO_T6) { 1135 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1136 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1137 } else { 1138 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1139 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1140 } 1141 1142 if (inc) { 1143 bzero(inc, sizeof(*inc)); 1144 inc->inc_fport = tcp->th_sport; 1145 inc->inc_lport = tcp->th_dport; 1146 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1147 const struct ip *ip = (const void *)l3hdr; 1148 1149 inc->inc_faddr = ip->ip_src; 1150 inc->inc_laddr = ip->ip_dst; 1151 } else { 1152 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1153 1154 inc->inc_flags |= INC_ISIPV6; 1155 inc->inc6_faddr = ip6->ip6_src; 1156 inc->inc6_laddr = ip6->ip6_dst; 1157 } 1158 } 1159 1160 if (th) { 1161 bcopy(tcp, th, sizeof(*th)); 1162 tcp_fields_to_host(th); /* just like tcp_input */ 1163 } 1164} 1165 1166static struct l2t_entry * 1167get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, 1168 struct in_conninfo *inc) 1169{ 1170 struct l2t_entry *e; 1171 struct sockaddr_in6 sin6; 1172 struct sockaddr *dst = (void *)&sin6; 1173 1174 if (inc->inc_flags & INC_ISIPV6) { 1175 struct nhop6_basic nh6; 1176 1177 bzero(dst, sizeof(struct 
sockaddr_in6)); 1178 dst->sa_len = sizeof(struct sockaddr_in6); 1179 dst->sa_family = AF_INET6; 1180 1181 if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { 1182 /* no need for route lookup */ 1183 e = t4_l2t_get(pi, ifp, dst); 1184 return (e); 1185 } 1186 1187 if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr, 1188 0, 0, 0, &nh6) != 0) 1189 return (NULL); 1190 if (nh6.nh_ifp != ifp) 1191 return (NULL); 1192 ((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr; 1193 } else { 1194 struct nhop4_basic nh4; 1195 1196 dst->sa_len = sizeof(struct sockaddr_in); 1197 dst->sa_family = AF_INET; 1198 1199 if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0, 1200 &nh4) != 0) 1201 return (NULL); 1202 if (nh4.nh_ifp != ifp) 1203 return (NULL); 1204 ((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr; 1205 } 1206 1207 e = t4_l2t_get(pi, ifp, dst); 1208 return (e); 1209} 1210 1211#define REJECT_PASS_ACCEPT() do { \ 1212 reject_reason = __LINE__; \ 1213 goto reject; \ 1214} while (0) 1215 1216/* 1217 * The context associated with a tid entry via insert_tid could be a synq_entry 1218 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 1219 */ 1220CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); 1221 1222/* 1223 * Incoming SYN on a listening socket. 1224 * 1225 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, 1226 * etc. 
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct vi_info *vi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason, v, ntids;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
	struct offload_settings settings;

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Recover the 4-tuple/TCP header and options from the CPL payload. */
	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];

	CURVNET_SET(lctx->vnet);

	/*
	 * Use the MAC index to lookup the associated VI.  If this SYN
	 * didn't match a perfect MAC filter, punt.
	 */
	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
		m_freem(m);
		m = NULL;	/* NULL m means don't hand the SYN to the stack */
		REJECT_PASS_ACCEPT();
	}
	for_each_vi(pi, v, vi) {
		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
			goto found;
	}
	m_freem(m);
	m = NULL;
	REJECT_PASS_ACCEPT();

found:
	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();

		ntids = 2;	/* IPv6 connections consume two tids */
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!in_ifhasaddr(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();

		ntids = 1;
	}

	/*
	 * Don't offload if the ifnet that the SYN came in on is not in the same
	 * vnet as the listening socket.
	 */
	if (lctx->vnet != ifp->if_vnet)
		REJECT_PASS_ACCEPT();

	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* Pre-allocate the reply work request before committing to anything. */
	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	inp = lctx->inp;		/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;
	/* Consult the administratively configured offload policy. */
	rw_rlock(&sc->policy_lock);
	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp);
	rw_runlock(&sc->policy_lock);
	if (!settings.offload) {
		INP_WUNLOCK(inp);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}

	mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, vi, &settings);
	get_qids_from_mbuf(m, NULL, &rxqid);

	/* Build the CPL_PASS_ACCEPT_RPL (T5+ uses a larger CPL). */
	if (is_t4(sc))
		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	else {
		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
	}
	ulp_mode = select_ulp_mode(so, sc, &settings);
	switch (ulp_mode) {
	case ULP_MODE_TCPDDP:
		synqe->flags |= TPF_SYNQE_TCPDDP;
		break;
	case ULP_MODE_TLS:
		synqe->flags |= TPF_SYNQE_TLS;
		break;
	}
	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode,
	    &settings);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode,
	    CC_ALGO(intotcpcb(inp)), &settings);

	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;		/* synqe takes ownership of the SYN mbuf */
	m = NULL;
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	/* Publish the wr; t4_syncache_respond claims it via cmpset. */
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe, ntids);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Note that syncache_add releases the pcb lock.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
		/* cmpset succeeded: syncache never sent our reply; unwind. */

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		remove_tid(sc, synqe->tid, ntids);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR6(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK mode %d",
	    __func__, stid, tid, lctx, synqe, ulp_mode);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	CURVNET_RESTORE();
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	/*
	 * If the SYN mbuf survived, strip the CPL and hand the frame to the
	 * host stack so the connection can still be handled without offload.
	 */
	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}

/*
 * Synthesize the protocol headers for the ACK that completed the 3-way
 * handshake of this synqe's connection, for consumption by syncache_expand.
 */
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}

/*
 * Handler for CPL_PASS_ESTABLISH: the hardware completed the 3-way handshake
 * for an offloaded passive connection.  Expands the syncache entry into a
 * full socket and attaches a toepcb to it.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct vi_info *vi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	CURVNET_SET(lctx->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/* Listener already closed; the abort path will clean up. */

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	vi = ifp->if_softc;
	KASSERT(vi->pi->adapter == sc,
	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	/* ULP mode was decided at PASS_ACCEPT time; carry it over. */
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_ulp_mode(toep, ULP_MODE_TCPDDP);
	else if (synqe->flags & TPF_SYNQE_TLS)
		set_ulp_mode(toep, ULP_MODE_TLS);
	else
		set_ulp_mode(toep, ULP_MODE_NONE);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);
	MPASS(so->so_vnet == lctx->vnet);
	toep->vnet = lctx->vnet;
	if (inc.inc_flags & INC_ISIPV6)
		toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce);

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	release_synqe(synqe);

	return (0);
}

/* Register the CPL handlers for passive-open (listen) processing. */
void
t4_init_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}

/* Unregister the listen CPL handlers (registering NULL removes them). */
void
t4_uninit_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
}
#endif