/* t4_listen.c revision 309560 */
/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_listen.c 309560 2016-12-05 20:43:25Z jhb $");

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/domain.h>
#include <sys/fnv_hash.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/scope6_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

/* stid services */
static int alloc_stid(struct adapter *, struct listen_ctx *, int);
static struct listen_ctx *lookup_stid(struct adapter *, int);
static void free_stid(struct adapter *, struct listen_ctx *);

/* lctx services */
static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
    struct vi_info *);
static int free_lctx(struct adapter *, struct listen_ctx *);
static void hold_lctx(struct listen_ctx *);
static void listen_hash_add(struct adapter *, struct listen_ctx *);
static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);

static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *);
static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
static void
send_reset_synqe(struct toedev *, struct synq_entry *); 86 87static int 88alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) 89{ 90 struct tid_info *t = &sc->tids; 91 u_int stid, n, f, mask; 92 struct stid_region *sr = &lctx->stid_region; 93 94 /* 95 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in 96 * the TCAM. The start of the stid region is properly aligned (the chip 97 * requires each region to be 128-cell aligned). 98 */ 99 n = isipv6 ? 2 : 1; 100 mask = n - 1; 101 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, 102 ("%s: stid region (%u, %u) not properly aligned. n = %u", 103 __func__, t->stid_base, t->nstids, n)); 104 105 mtx_lock(&t->stid_lock); 106 if (n > t->nstids - t->stids_in_use) { 107 mtx_unlock(&t->stid_lock); 108 return (-1); 109 } 110 111 if (t->nstids_free_head >= n) { 112 /* 113 * This allocation will definitely succeed because the region 114 * starts at a good alignment and we just checked we have enough 115 * stids free. 116 */ 117 f = t->nstids_free_head & mask; 118 t->nstids_free_head -= n + f; 119 stid = t->nstids_free_head; 120 TAILQ_INSERT_HEAD(&t->stids, sr, link); 121 } else { 122 struct stid_region *s; 123 124 stid = t->nstids_free_head; 125 TAILQ_FOREACH(s, &t->stids, link) { 126 stid += s->used + s->free; 127 f = stid & mask; 128 if (s->free >= n + f) { 129 stid -= n + f; 130 s->free -= n + f; 131 TAILQ_INSERT_AFTER(&t->stids, s, sr, link); 132 goto allocated; 133 } 134 } 135 136 if (__predict_false(stid != t->nstids)) { 137 panic("%s: stids TAILQ (%p) corrupt." 
138 " At %d instead of %d at the end of the queue.", 139 __func__, &t->stids, stid, t->nstids); 140 } 141 142 mtx_unlock(&t->stid_lock); 143 return (-1); 144 } 145 146allocated: 147 sr->used = n; 148 sr->free = f; 149 t->stids_in_use += n; 150 t->stid_tab[stid] = lctx; 151 mtx_unlock(&t->stid_lock); 152 153 KASSERT(((stid + t->stid_base) & mask) == 0, 154 ("%s: EDOOFUS.", __func__)); 155 return (stid + t->stid_base); 156} 157 158static struct listen_ctx * 159lookup_stid(struct adapter *sc, int stid) 160{ 161 struct tid_info *t = &sc->tids; 162 163 return (t->stid_tab[stid - t->stid_base]); 164} 165 166static void 167free_stid(struct adapter *sc, struct listen_ctx *lctx) 168{ 169 struct tid_info *t = &sc->tids; 170 struct stid_region *sr = &lctx->stid_region; 171 struct stid_region *s; 172 173 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); 174 175 mtx_lock(&t->stid_lock); 176 s = TAILQ_PREV(sr, stid_head, link); 177 if (s != NULL) 178 s->free += sr->used + sr->free; 179 else 180 t->nstids_free_head += sr->used + sr->free; 181 KASSERT(t->stids_in_use >= sr->used, 182 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, 183 t->stids_in_use, sr->used)); 184 t->stids_in_use -= sr->used; 185 TAILQ_REMOVE(&t->stids, sr, link); 186 mtx_unlock(&t->stid_lock); 187} 188 189static struct listen_ctx * 190alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 191{ 192 struct listen_ctx *lctx; 193 194 INP_WLOCK_ASSERT(inp); 195 196 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 197 if (lctx == NULL) 198 return (NULL); 199 200 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); 201 if (lctx->stid < 0) { 202 free(lctx, M_CXGBE); 203 return (NULL); 204 } 205 206 if (inp->inp_vflag & INP_IPV6 && 207 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 208 struct tom_data *td = sc->tom_softc; 209 210 lctx->ce = hold_lip(td, &inp->in6p_laddr); 211 if (lctx->ce == NULL) { 212 free(lctx, M_CXGBE); 213 
return (NULL); 214 } 215 } 216 217 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 218 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 219 refcount_init(&lctx->refcount, 1); 220 TAILQ_INIT(&lctx->synq); 221 222 lctx->inp = inp; 223 in_pcbref(inp); 224 225 return (lctx); 226} 227 228/* Don't call this directly, use release_lctx instead */ 229static int 230free_lctx(struct adapter *sc, struct listen_ctx *lctx) 231{ 232 struct inpcb *inp = lctx->inp; 233 struct tom_data *td = sc->tom_softc; 234 235 INP_WLOCK_ASSERT(inp); 236 KASSERT(lctx->refcount == 0, 237 ("%s: refcount %d", __func__, lctx->refcount)); 238 KASSERT(TAILQ_EMPTY(&lctx->synq), 239 ("%s: synq not empty.", __func__)); 240 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 241 242 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 243 __func__, lctx->stid, lctx, lctx->inp); 244 245 if (lctx->ce) 246 release_lip(td, lctx->ce); 247 free_stid(sc, lctx); 248 free(lctx, M_CXGBE); 249 250 return (in_pcbrele_wlocked(inp)); 251} 252 253static void 254hold_lctx(struct listen_ctx *lctx) 255{ 256 257 refcount_acquire(&lctx->refcount); 258} 259 260static inline uint32_t 261listen_hashfn(void *key, u_long mask) 262{ 263 264 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 265} 266 267/* 268 * Add a listen_ctx entry to the listen hash table. 269 */ 270static void 271listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 272{ 273 struct tom_data *td = sc->tom_softc; 274 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 275 276 mtx_lock(&td->lctx_hash_lock); 277 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 278 td->lctx_count++; 279 mtx_unlock(&td->lctx_hash_lock); 280} 281 282/* 283 * Look for the listening socket's context entry in the hash and return it. 
284 */ 285static struct listen_ctx * 286listen_hash_find(struct adapter *sc, struct inpcb *inp) 287{ 288 struct tom_data *td = sc->tom_softc; 289 int bucket = listen_hashfn(inp, td->listen_mask); 290 struct listen_ctx *lctx; 291 292 mtx_lock(&td->lctx_hash_lock); 293 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 294 if (lctx->inp == inp) 295 break; 296 } 297 mtx_unlock(&td->lctx_hash_lock); 298 299 return (lctx); 300} 301 302/* 303 * Removes the listen_ctx structure for inp from the hash and returns it. 304 */ 305static struct listen_ctx * 306listen_hash_del(struct adapter *sc, struct inpcb *inp) 307{ 308 struct tom_data *td = sc->tom_softc; 309 int bucket = listen_hashfn(inp, td->listen_mask); 310 struct listen_ctx *lctx, *l; 311 312 mtx_lock(&td->lctx_hash_lock); 313 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 314 if (lctx->inp == inp) { 315 LIST_REMOVE(lctx, link); 316 td->lctx_count--; 317 break; 318 } 319 } 320 mtx_unlock(&td->lctx_hash_lock); 321 322 return (lctx); 323} 324 325/* 326 * Releases a hold on the lctx. Must be called with the listening socket's inp 327 * locked. The inp may be freed by this function and it returns NULL to 328 * indicate this. 329 */ 330static struct inpcb * 331release_lctx(struct adapter *sc, struct listen_ctx *lctx) 332{ 333 struct inpcb *inp = lctx->inp; 334 int inp_freed = 0; 335 336 INP_WLOCK_ASSERT(inp); 337 if (refcount_release(&lctx->refcount)) 338 inp_freed = free_lctx(sc, lctx); 339 340 return (inp_freed ? 
NULL : inp); 341} 342 343static void 344send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) 345{ 346 struct adapter *sc = tod->tod_softc; 347 struct mbuf *m = synqe->syn; 348 struct ifnet *ifp = m->m_pkthdr.rcvif; 349 struct vi_info *vi = ifp->if_softc; 350 struct port_info *pi = vi->pi; 351 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 352 struct wrqe *wr; 353 struct fw_flowc_wr *flowc; 354 struct cpl_abort_req *req; 355 int txqid, rxqid, flowclen; 356 struct sge_wrq *ofld_txq; 357 struct sge_ofld_rxq *ofld_rxq; 358 const int nparams = 6; 359 unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN; 360 361 INP_WLOCK_ASSERT(synqe->lctx->inp); 362 363 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 364 __func__, synqe, synqe->flags, synqe->tid, 365 synqe->flags & TPF_ABORT_SHUTDOWN ? 366 " (abort already in progress)" : ""); 367 if (synqe->flags & TPF_ABORT_SHUTDOWN) 368 return; /* abort already in progress */ 369 synqe->flags |= TPF_ABORT_SHUTDOWN; 370 371 get_qids_from_mbuf(m, &txqid, &rxqid); 372 ofld_txq = &sc->sge.ofld_txq[txqid]; 373 ofld_rxq = &sc->sge.ofld_rxq[rxqid]; 374 375 /* The wrqe will have two WRs - a flowc followed by an abort_req */ 376 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 377 378 wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq); 379 if (wr == NULL) { 380 /* XXX */ 381 panic("%s: allocation failure.", __func__); 382 } 383 flowc = wrtod(wr); 384 req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE)); 385 386 /* First the flowc ... 
*/ 387 memset(flowc, 0, wr->wr_len); 388 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 389 V_FW_FLOWC_WR_NPARAMS(nparams)); 390 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 391 V_FW_WR_FLOWID(synqe->tid)); 392 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 393 flowc->mnemval[0].val = htobe32(pfvf); 394 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 395 flowc->mnemval[1].val = htobe32(pi->tx_chan); 396 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 397 flowc->mnemval[2].val = htobe32(pi->tx_chan); 398 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 399 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 400 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 401 flowc->mnemval[4].val = htobe32(512); 402 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 403 flowc->mnemval[5].val = htobe32(512); 404 synqe->flags |= TPF_FLOWC_WR_SENT; 405 406 /* ... then ABORT request */ 407 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 408 req->rsvd0 = 0; /* don't have a snd_nxt */ 409 req->rsvd1 = 1; /* no data sent yet */ 410 req->cmd = CPL_ABORT_SEND_RST; 411 412 t4_l2t_send(sc, wr, e); 413} 414 415static int 416create_server(struct adapter *sc, struct listen_ctx *lctx) 417{ 418 struct wrqe *wr; 419 struct cpl_pass_open_req *req; 420 struct inpcb *inp = lctx->inp; 421 422 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 423 if (wr == NULL) { 424 log(LOG_ERR, "%s: allocation failure", __func__); 425 return (ENOMEM); 426 } 427 req = wrtod(wr); 428 429 INIT_TP_WR(req, 0); 430 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 431 req->local_port = inp->inp_lport; 432 req->peer_port = 0; 433 req->local_ip = inp->inp_laddr.s_addr; 434 req->peer_ip = 0; 435 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 436 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 437 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 438 439 t4_wrq_tx(sc, wr); 440 return (0); 441} 442 443static int 
444create_server6(struct adapter *sc, struct listen_ctx *lctx) 445{ 446 struct wrqe *wr; 447 struct cpl_pass_open_req6 *req; 448 struct inpcb *inp = lctx->inp; 449 450 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 451 if (wr == NULL) { 452 log(LOG_ERR, "%s: allocation failure", __func__); 453 return (ENOMEM); 454 } 455 req = wrtod(wr); 456 457 INIT_TP_WR(req, 0); 458 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 459 req->local_port = inp->inp_lport; 460 req->peer_port = 0; 461 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 462 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 463 req->peer_ip_hi = 0; 464 req->peer_ip_lo = 0; 465 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 466 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 467 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 468 469 t4_wrq_tx(sc, wr); 470 return (0); 471} 472 473static int 474destroy_server(struct adapter *sc, struct listen_ctx *lctx) 475{ 476 struct wrqe *wr; 477 struct cpl_close_listsvr_req *req; 478 479 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 480 if (wr == NULL) { 481 /* XXX */ 482 panic("%s: allocation failure.", __func__); 483 } 484 req = wrtod(wr); 485 486 INIT_TP_WR(req, 0); 487 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 488 lctx->stid)); 489 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 490 req->rsvd = htobe16(0); 491 492 t4_wrq_tx(sc, wr); 493 return (0); 494} 495 496/* 497 * Start a listening server by sending a passive open request to HW. 498 * 499 * Can't take adapter lock here and access to sc->flags, 500 * sc->offload_map, if_capenable are all race prone. 
501 */ 502int 503t4_listen_start(struct toedev *tod, struct tcpcb *tp) 504{ 505 struct adapter *sc = tod->tod_softc; 506 struct vi_info *vi; 507 struct port_info *pi; 508 struct inpcb *inp = tp->t_inpcb; 509 struct listen_ctx *lctx; 510 int i, rc, v; 511 512 INP_WLOCK_ASSERT(inp); 513 514 /* Don't start a hardware listener for any loopback address. */ 515 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 516 return (0); 517 if (!(inp->inp_vflag & INP_IPV6) && 518 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 519 return (0); 520#if 0 521 ADAPTER_LOCK(sc); 522 if (IS_BUSY(sc)) { 523 log(LOG_ERR, "%s: listen request ignored, %s is busy", 524 __func__, device_get_nameunit(sc->dev)); 525 goto done; 526 } 527 528 KASSERT(uld_active(sc, ULD_TOM), 529 ("%s: TOM not initialized", __func__)); 530#endif 531 532 /* 533 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 534 * such VI's queues to send the passive open and receive the reply to 535 * it. 536 * 537 * XXX: need a way to mark a port in use by offload. if_cxgbe should 538 * then reject any attempt to bring down such a port (and maybe reject 539 * attempts to disable IFCAP_TOE on that port too?). 
540 */ 541 for_each_port(sc, i) { 542 pi = sc->port[i]; 543 for_each_vi(pi, v, vi) { 544 if (vi->flags & VI_INIT_DONE && 545 vi->ifp->if_capenable & IFCAP_TOE) 546 goto found; 547 } 548 } 549 goto done; /* no port that's UP with IFCAP_TOE enabled */ 550found: 551 552 if (listen_hash_find(sc, inp) != NULL) 553 goto done; /* already setup */ 554 555 lctx = alloc_lctx(sc, inp, vi); 556 if (lctx == NULL) { 557 log(LOG_ERR, 558 "%s: listen request ignored, %s couldn't allocate lctx\n", 559 __func__, device_get_nameunit(sc->dev)); 560 goto done; 561 } 562 listen_hash_add(sc, lctx); 563 564 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 565 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 566 inp->inp_vflag); 567 568 if (inp->inp_vflag & INP_IPV6) 569 rc = create_server6(sc, lctx); 570 else 571 rc = create_server(sc, lctx); 572 if (rc != 0) { 573 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 574 __func__, device_get_nameunit(sc->dev), rc); 575 (void) listen_hash_del(sc, inp); 576 inp = release_lctx(sc, lctx); 577 /* can't be freed, host stack has a reference */ 578 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 579 goto done; 580 } 581 lctx->flags |= LCTX_RPL_PENDING; 582done: 583#if 0 584 ADAPTER_UNLOCK(sc); 585#endif 586 return (0); 587} 588 589int 590t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 591{ 592 struct listen_ctx *lctx; 593 struct adapter *sc = tod->tod_softc; 594 struct inpcb *inp = tp->t_inpcb; 595 struct synq_entry *synqe; 596 597 INP_WLOCK_ASSERT(inp); 598 599 lctx = listen_hash_del(sc, inp); 600 if (lctx == NULL) 601 return (ENOENT); /* no hardware listener for this inp */ 602 603 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 604 lctx, lctx->flags); 605 606 /* 607 * If the reply to the PASS_OPEN is still pending we'll wait for it to 608 * arrive and clean up when it does. 
609 */ 610 if (lctx->flags & LCTX_RPL_PENDING) { 611 KASSERT(TAILQ_EMPTY(&lctx->synq), 612 ("%s: synq not empty.", __func__)); 613 return (EINPROGRESS); 614 } 615 616 /* 617 * The host stack will abort all the connections on the listening 618 * socket's so_comp. It doesn't know about the connections on the synq 619 * so we need to take care of those. 620 */ 621 TAILQ_FOREACH(synqe, &lctx->synq, link) { 622 if (synqe->flags & TPF_SYNQE_HAS_L2TE) 623 send_reset_synqe(tod, synqe); 624 } 625 626 destroy_server(sc, lctx); 627 return (0); 628} 629 630static inline void 631hold_synqe(struct synq_entry *synqe) 632{ 633 634 refcount_acquire(&synqe->refcnt); 635} 636 637static inline void 638release_synqe(struct synq_entry *synqe) 639{ 640 641 if (refcount_release(&synqe->refcnt)) { 642 int needfree = synqe->flags & TPF_SYNQE_NEEDFREE; 643 644 m_freem(synqe->syn); 645 if (needfree) 646 free(synqe, M_CXGBE); 647 } 648} 649 650void 651t4_syncache_added(struct toedev *tod __unused, void *arg) 652{ 653 struct synq_entry *synqe = arg; 654 655 hold_synqe(synqe); 656} 657 658void 659t4_syncache_removed(struct toedev *tod __unused, void *arg) 660{ 661 struct synq_entry *synqe = arg; 662 663 release_synqe(synqe); 664} 665 666/* XXX */ 667extern void tcp_dooptions(struct tcpopt *, u_char *, int, int); 668 669int 670t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 671{ 672 struct adapter *sc = tod->tod_softc; 673 struct synq_entry *synqe = arg; 674 struct wrqe *wr; 675 struct l2t_entry *e; 676 struct tcpopt to; 677 struct ip *ip = mtod(m, struct ip *); 678 struct tcphdr *th; 679 680 wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr); 681 if (wr == NULL) { 682 m_freem(m); 683 return (EALREADY); 684 } 685 686 if (ip->ip_v == IPVERSION) 687 th = (void *)(ip + 1); 688 else 689 th = (void *)((struct ip6_hdr *)ip + 1); 690 bzero(&to, sizeof(to)); 691 tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), 692 TO_SYN); 693 694 /* save these for later */ 
695 synqe->iss = be32toh(th->th_seq); 696 synqe->ts = to.to_tsval; 697 698 if (chip_id(sc) >= CHELSIO_T5) { 699 struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr); 700 701 rpl5->iss = th->th_seq; 702 } 703 704 e = &sc->l2t->l2tab[synqe->l2e_idx]; 705 t4_l2t_send(sc, wr, e); 706 707 m_freem(m); /* don't need this any more */ 708 return (0); 709} 710 711static int 712do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 713 struct mbuf *m) 714{ 715 struct adapter *sc = iq->adapter; 716 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 717 int stid = GET_TID(cpl); 718 unsigned int status = cpl->status; 719 struct listen_ctx *lctx = lookup_stid(sc, stid); 720 struct inpcb *inp = lctx->inp; 721#ifdef INVARIANTS 722 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 723#endif 724 725 KASSERT(opcode == CPL_PASS_OPEN_RPL, 726 ("%s: unexpected opcode 0x%x", __func__, opcode)); 727 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 728 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 729 730 INP_WLOCK(inp); 731 732 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 733 __func__, stid, status, lctx->flags); 734 735 lctx->flags &= ~LCTX_RPL_PENDING; 736 737 if (status != CPL_ERR_NONE) 738 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 739 740#ifdef INVARIANTS 741 /* 742 * If the inp has been dropped (listening socket closed) then 743 * listen_stop must have run and taken the inp out of the hash. 744 */ 745 if (inp->inp_flags & INP_DROPPED) { 746 KASSERT(listen_hash_del(sc, inp) == NULL, 747 ("%s: inp %p still in listen hash", __func__, inp)); 748 } 749#endif 750 751 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 752 if (release_lctx(sc, lctx) != NULL) 753 INP_WUNLOCK(inp); 754 return (status); 755 } 756 757 /* 758 * Listening socket stopped listening earlier and now the chip tells us 759 * it has started the hardware listener. 
Stop it; the lctx will be 760 * released in do_close_server_rpl. 761 */ 762 if (inp->inp_flags & INP_DROPPED) { 763 destroy_server(sc, lctx); 764 INP_WUNLOCK(inp); 765 return (status); 766 } 767 768 /* 769 * Failed to start hardware listener. Take inp out of the hash and 770 * release our reference on it. An error message has been logged 771 * already. 772 */ 773 if (status != CPL_ERR_NONE) { 774 listen_hash_del(sc, inp); 775 if (release_lctx(sc, lctx) != NULL) 776 INP_WUNLOCK(inp); 777 return (status); 778 } 779 780 /* hardware listener open for business */ 781 782 INP_WUNLOCK(inp); 783 return (status); 784} 785 786static int 787do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 788 struct mbuf *m) 789{ 790 struct adapter *sc = iq->adapter; 791 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 792 int stid = GET_TID(cpl); 793 unsigned int status = cpl->status; 794 struct listen_ctx *lctx = lookup_stid(sc, stid); 795 struct inpcb *inp = lctx->inp; 796#ifdef INVARIANTS 797 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 798#endif 799 800 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 801 ("%s: unexpected opcode 0x%x", __func__, opcode)); 802 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 803 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 804 805 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 806 807 if (status != CPL_ERR_NONE) { 808 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 809 __func__, status, stid); 810 return (status); 811 } 812 813 INP_WLOCK(inp); 814 inp = release_lctx(sc, lctx); 815 if (inp != NULL) 816 INP_WUNLOCK(inp); 817 818 return (status); 819} 820 821static void 822done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 823{ 824 struct listen_ctx *lctx = synqe->lctx; 825 struct inpcb *inp = lctx->inp; 826 struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc; 827 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 828 
829 INP_WLOCK_ASSERT(inp); 830 831 TAILQ_REMOVE(&lctx->synq, synqe, link); 832 inp = release_lctx(sc, lctx); 833 if (inp) 834 INP_WUNLOCK(inp); 835 remove_tid(sc, synqe->tid); 836 release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]); 837 t4_l2t_release(e); 838 release_synqe(synqe); /* removed from synq list */ 839} 840 841int 842do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 843 struct mbuf *m) 844{ 845 struct adapter *sc = iq->adapter; 846 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 847 unsigned int tid = GET_TID(cpl); 848 struct synq_entry *synqe = lookup_tid(sc, tid); 849 struct listen_ctx *lctx = synqe->lctx; 850 struct inpcb *inp = lctx->inp; 851 int txqid; 852 struct sge_wrq *ofld_txq; 853#ifdef INVARIANTS 854 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 855#endif 856 857 KASSERT(opcode == CPL_ABORT_REQ_RSS, 858 ("%s: unexpected opcode 0x%x", __func__, opcode)); 859 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 860 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 861 862 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 863 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 864 865 if (negative_advice(cpl->status)) 866 return (0); /* Ignore negative advice */ 867 868 INP_WLOCK(inp); 869 870 get_qids_from_mbuf(synqe->syn, &txqid, NULL); 871 ofld_txq = &sc->sge.ofld_txq[txqid]; 872 873 /* 874 * If we'd initiated an abort earlier the reply to it is responsible for 875 * cleaning up resources. Otherwise we tear everything down right here 876 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
877 */ 878 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 879 INP_WUNLOCK(inp); 880 goto done; 881 } 882 883 done_with_synqe(sc, synqe); 884 /* inp lock released by done_with_synqe */ 885done: 886 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 887 return (0); 888} 889 890int 891do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 892 struct mbuf *m) 893{ 894 struct adapter *sc = iq->adapter; 895 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 896 unsigned int tid = GET_TID(cpl); 897 struct synq_entry *synqe = lookup_tid(sc, tid); 898 struct listen_ctx *lctx = synqe->lctx; 899 struct inpcb *inp = lctx->inp; 900#ifdef INVARIANTS 901 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 902#endif 903 904 KASSERT(opcode == CPL_ABORT_RPL_RSS, 905 ("%s: unexpected opcode 0x%x", __func__, opcode)); 906 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 907 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 908 909 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 910 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 911 912 INP_WLOCK(inp); 913 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 914 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 915 __func__, synqe, synqe->flags)); 916 917 done_with_synqe(sc, synqe); 918 /* inp lock released by done_with_synqe */ 919 920 return (0); 921} 922 923void 924t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 925{ 926 struct adapter *sc = tod->tod_softc; 927 struct synq_entry *synqe = arg; 928#ifdef INVARIANTS 929 struct inpcb *inp = sotoinpcb(so); 930#endif 931 struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); 932 struct toepcb *toep = *(struct toepcb **)(cpl + 1); 933 934 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ 935 INP_WLOCK_ASSERT(inp); 936 KASSERT(synqe->flags & TPF_SYNQE, 937 ("%s: %p not a synq_entry?", __func__, arg)); 938 939 offload_socket(so, toep); 940 
make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); 941 toep->flags |= TPF_CPL_PENDING; 942 update_tid(sc, synqe->tid, toep); 943 synqe->flags |= TPF_SYNQE_EXPANDED; 944} 945 946static inline void 947save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi) 948{ 949 uint32_t txqid, rxqid; 950 951 txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; 952 rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; 953 954 m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); 955} 956 957static inline void 958get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid) 959{ 960 961 if (txqid) 962 *txqid = m->m_pkthdr.flowid >> 16; 963 if (rxqid) 964 *rxqid = m->m_pkthdr.flowid & 0xffff; 965} 966 967/* 968 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to 969 * store some state temporarily. 970 */ 971static struct synq_entry * 972mbuf_to_synqe(struct mbuf *m) 973{ 974 int len = roundup2(sizeof (struct synq_entry), 8); 975 int tspace = M_TRAILINGSPACE(m); 976 struct synq_entry *synqe = NULL; 977 978 if (tspace < len) { 979 synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT); 980 if (synqe == NULL) 981 return (NULL); 982 synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE; 983 } else { 984 synqe = (void *)(m->m_data + m->m_len + tspace - len); 985 synqe->flags = TPF_SYNQE; 986 } 987 988 return (synqe); 989} 990 991static void 992t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 993{ 994 bzero(to, sizeof(*to)); 995 996 if (t4opt->mss) { 997 to->to_flags |= TOF_MSS; 998 to->to_mss = be16toh(t4opt->mss); 999 } 1000 1001 if (t4opt->wsf) { 1002 to->to_flags |= TOF_SCALE; 1003 to->to_wscale = t4opt->wsf; 1004 } 1005 1006 if (t4opt->tstamp) 1007 to->to_flags |= TOF_TS; 1008 1009 if (t4opt->sack) 1010 to->to_flags |= TOF_SACKPERM; 1011} 1012 1013/* 1014 * Options2 for passive open. 
1015 */ 1016static uint32_t 1017calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, 1018 const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode) 1019{ 1020 struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; 1021 uint32_t opt2; 1022 1023 opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) | 1024 F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); 1025 1026 if (V_tcp_do_rfc1323) { 1027 if (tcpopt->tstamp) 1028 opt2 |= F_TSTAMPS_EN; 1029 if (tcpopt->sack) 1030 opt2 |= F_SACK_EN; 1031 if (tcpopt->wsf <= 14) 1032 opt2 |= F_WND_SCALE_EN; 1033 } 1034 1035 if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR)) 1036 opt2 |= F_CCTRL_ECN; 1037 1038 /* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */ 1039 if (is_t4(sc)) 1040 opt2 |= F_RX_COALESCE_VALID; 1041 else { 1042 opt2 |= F_T5_OPT_2_VALID; 1043 opt2 |= F_CONG_CNTRL_VALID; /* OPT_2_ISS really, for T5 */ 1044 } 1045 if (sc->tt.rx_coalesce) 1046 opt2 |= V_RX_COALESCE(M_RX_COALESCE); 1047 1048#ifdef USE_DDP_RX_FLOW_CONTROL 1049 if (ulp_mode == ULP_MODE_TCPDDP) 1050 opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; 1051#endif 1052 1053 return htobe32(opt2); 1054} 1055 1056static void 1057pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1058 struct in_conninfo *inc, struct tcphdr *th) 1059{ 1060 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1061 const struct ether_header *eh; 1062 unsigned int hlen = be32toh(cpl->hdr_len); 1063 uintptr_t l3hdr; 1064 const struct tcphdr *tcp; 1065 1066 eh = (const void *)(cpl + 1); 1067 if (chip_id(sc) >= CHELSIO_T6) { 1068 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1069 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1070 } else { 1071 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1072 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1073 } 1074 1075 if (inc) { 1076 bzero(inc, sizeof(*inc)); 1077 inc->inc_fport = tcp->th_sport; 1078 inc->inc_lport = tcp->th_dport; 1079 if (((struct ip *)l3hdr)->ip_v == 
IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;

			inc->inc_faddr = ip->ip_src;
			inc->inc_laddr = ip->ip_dst;
		} else {
			const struct ip6_hdr *ip6 = (const void *)l3hdr;

			inc->inc_flags |= INC_ISIPV6;
			inc->inc6_faddr = ip6->ip6_src;
			inc->inc6_laddr = ip6->ip6_dst;
		}
	}

	if (th) {
		bcopy(tcp, th, sizeof(*th));
		tcp_fields_to_host(th);	/* just like tcp_input */
	}
}

/*
 * Returns 1 if the given IPv6 address is configured on the ifnet (exact
 * match against the interface's address list), 0 otherwise.  Works on a
 * local copy of the address so the caller's copy is never modified.
 */
static int
ifnet_has_ip6(struct ifnet *ifp, struct in6_addr *ip6)
{
	struct ifaddr *ifa;
	struct sockaddr_in6 *sin6;
	int found = 0;
	struct in6_addr in6 = *ip6;

	/*
	 * Just as in ip6_input.
	 *
	 * NOTE(review): in6_clearscope() is invoked twice on the same
	 * address here; ip6_input clears src and dst separately, so this
	 * looks like a copy-paste slip.  The duplicate call is harmless
	 * (the second call is a no-op on an already-cleared address).
	 * Also, the in6_setscope() return value is ignored — confirm this
	 * is intentional.
	 */
	if (in6_clearscope(&in6) || in6_clearscope(&in6))
		return (0);
	in6_setscope(&in6, ifp, NULL);

	if_addr_rlock(ifp);
	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
		sin6 = (void *)ifa->ifa_addr;
		if (sin6->sin6_family != AF_INET6)
			continue;

		if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &in6)) {
			found = 1;
			break;
		}
	}
	if_addr_runlock(ifp);

	return (found);
}

/*
 * Resolve the L2 entry for the next hop toward the peer in *inc.  Returns
 * NULL if the route for the peer does not go out via ifp (or no route /
 * no L2 entry is available).  Link-local IPv6 peers skip the route lookup
 * and resolve directly on ifp.
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
    struct in_conninfo *inc)
{
	struct rtentry *rt;
	struct l2t_entry *e;
	struct sockaddr_in6 sin6;
	struct sockaddr *dst = (void *)&sin6;

	if (inc->inc_flags & INC_ISIPV6) {
		dst->sa_len = sizeof(struct sockaddr_in6);
		dst->sa_family = AF_INET6;
		((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;

		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
			/* no need for route lookup */
			e = t4_l2t_get(pi, ifp, dst);
			return (e);
		}
	} else {
		dst->sa_len = sizeof(struct sockaddr_in);
		dst->sa_family = AF_INET;
		((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
	}

	rt = rtalloc1(dst, 0, 0);
	if (rt == NULL)
		return (NULL);
	else {
		struct
sockaddr *nexthop;

		/* rtalloc1 returns the rtentry locked; we only need a ref. */
		RT_UNLOCK(rt);
		if (rt->rt_ifp != ifp)
			e = NULL;	/* route doesn't use this ifnet */
		else {
			/* Use the gateway if there is one, else the peer. */
			if (rt->rt_flags & RTF_GATEWAY)
				nexthop = rt->rt_gateway;
			else
				nexthop = dst;
			e = t4_l2t_get(pi, ifp, nexthop);
		}
		RTFREE(rt);
	}

	return (e);
}

/*
 * Returns 1 if the given IPv4 address is configured on the ifnet (exact
 * match against the interface's address list), 0 otherwise.
 */
static int
ifnet_has_ip(struct ifnet *ifp, struct in_addr in)
{
	struct ifaddr *ifa;
	struct sockaddr_in *sin;
	int found = 0;

	if_addr_rlock(ifp);
	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
		sin = (void *)ifa->ifa_addr;
		if (sin->sin_family != AF_INET)
			continue;

		if (sin->sin_addr.s_addr == in.s_addr) {
			found = 1;
			break;
		}
	}
	if_addr_runlock(ifp);

	return (found);
}

/*
 * Record the current line number as the reject reason and jump to the
 * common reject path in do_pass_accept_req.
 */
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));

/*
 * Incoming SYN on a listening socket.
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct vi_info *vi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason, v;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Pull the 4-tuple, TCP header and options out of the SYN. */
	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];

	/*
	 * Use the MAC index to lookup the associated VI.  If this SYN
	 * didn't match a perfect MAC filter, punt.
	 */
	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
		m_freem(m);
		m = NULL;	/* NULL m means nothing to re-inject on reject */
		REJECT_PASS_ACCEPT();
	}
	for_each_vi(pi, v, vi) {
		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
			goto found;
	}
	m_freem(m);
	m = NULL;
	REJECT_PASS_ACCEPT();

found:
	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!ifnet_has_ip6(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!ifnet_has_ip(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();
	}

	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* T5+ uses a larger reply CPL. */
	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	inp = lctx->inp;	/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;
	CURVNET_SET(so->so_vnet);

	/* Work out the connection parameters to advertise in the reply. */
	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, vi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	if (is_t4(sc))
		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	else {
		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
	}
	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
		ulp_mode = ULP_MODE_TCPDDP;
		synqe->flags |= TPF_SYNQE_TCPDDP;
	} else
		ulp_mode = ULP_MODE_NONE;
	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);

	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;		/* synqe takes over the SYN mbuf */
	m = NULL;
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Note that syncache_add releases the pcb lock.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	CURVNET_RESTORE();

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		remove_tid(sc, synqe->tid);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	/*
	 * Common failure path: release anything acquired so far and, if the
	 * SYN mbuf is still ours, strip the CPL and hand the packet to the
	 * regular (non-offloaded) input path so the kernel stack can take it.
	 */
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		/* Hardware has already verified the checksums. */
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}

/*
 * Synthesize the protocol headers of the ACK that completes the 3-way
 * handshake, starting from the saved SYN and patching in the details from
 * the CPL_PASS_ESTABLISH.  Used to feed syncache_expand.
 */
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}

/*
 * CPL_PASS_ESTABLISH handler: the hardware completed the 3-way handshake
 * for an embryonic offloaded connection.  Converts the synq entry into a
 * full toepcb and expands the syncache entry into a real connection.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct vi_info *vi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int
tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	/* Listening socket already closed: nothing to establish. */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	vi = ifp->if_softc;
	KASSERT(vi->pi->adapter == sc,
	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

	/* Recover the queue ids chosen back in do_pass_accept_req. */
	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_tcpddp_ulp_mode(toep);
	else
		toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}

/*
 * Register the CPL handlers that implement the passive-open (listen) state
 * machine for offloaded connections.
 */
void
t4_init_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}
#endif