/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_timer.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_toepcb.h"

static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);
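
/*
 * Allocate a server TID (stid) for a listening context and associate the
 * caller's ctx with it.  stids are handed out from a free list embedded in
 * the stid table itself; -1 is returned if none are available.  free_stid
 * returns one to the list.
 */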
static int
alloc_stid(struct tid_info *t, void *ctx)
{
	int stid = -1;

	mtx_lock(&t->stid_lock);
	if (t->sfree) {
		union listen_entry *p = t->sfree;

		stid = (p - t->stid_tab) + t->stid_base;
		t->sfree = p->next;
		p->ctx = ctx;
		t->stids_in_use++;
	}
	mtx_unlock(&t->stid_lock);
	return (stid);
}

static void
free_stid(struct tid_info *t, int stid)
{
	union listen_entry *p = stid2entry(t, stid);

	mtx_lock(&t->stid_lock);
	p->next = t->sfree;
	t->sfree = p;
	t->stids_in_use--;
	mtx_unlock(&t->stid_lock);
}

static struct listen_ctx *
alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
{
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
	if (lctx == NULL)
		return (NULL);

	lctx->stid = alloc_stid(&td->tid_maps, lctx);
	if (lctx->stid < 0) {
		free(lctx, M_CXGB);
		return (NULL);
	}

	lctx->inp = inp;
	in_pcbref(inp);

	lctx->qset = qset;
	refcount_init(&lctx->refcnt, 1);
	TAILQ_INIT(&lctx->synq);

	return (lctx);
}

/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcnt == 0,
	    ("%s: refcnt %d", __func__, lctx->refcnt));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	free_stid(&td->tid_maps, lctx->stid);
	free(lctx, M_CXGB);

	return (in_pcbrele_wlocked(inp));
}

static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcnt);
}

static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
{
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's inp
 * locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcnt))
		inp_freed = free_lctx(td, lctx);

	return (inp_freed ? NULL : inp);
}
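
/*
 * Send a CPL_PASS_OPEN_REQ to the chip asking it to start listening on the
 * lctx's local address and port.  The chip's reply is handled by
 * do_pass_open_rpl.
 */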
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	req->peer_port = 0;
	req->peer_ip = 0;
	req->peer_netmask = 0;
	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
	req->opt0l = htonl(V_RCV_BUFSIZ(16));
	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));

	t3_offload_tx(sc, m);

	return (0);
}

static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_close_listserv_req *req;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	req->cpu_idx = 0;

	t3_offload_tx(sc, m);

	return (0);
}

/*
 * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
 * the STID.
 */
static int
do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
	unsigned int stid = GET_TID(rpl);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct inpcb *inp = lctx->inp;

	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u",
		    __func__, rpl->status, stid);
	} else {
		INP_WLOCK(inp);
		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
	}

	m_freem(m);
	return (0);
}

/*
 * Process a CPL_PASS_OPEN_RPL message.  Remove the lctx from the listen hash
 * table and free it if there was any error, otherwise nothing to do.
 */
static int
do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
	int stid = GET_TID(rpl);
	struct listen_ctx *lctx;
	struct inpcb *inp;

	/*
	 * We also get these replies when setting up HW filters.  Just throw
	 * those away.
	 */
	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
		goto done;

	lctx = lookup_stid(&td->tid_maps, stid);
	inp = lctx->inp;

	INP_WLOCK(inp);

	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
	    __func__, stid, rpl->status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
	}

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(td, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	if (inp->inp_flags & INP_DROPPED && rpl->status != CPL_ERR_NONE) {
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * The listening socket stopped listening earlier and now the chip
	 * tells us it has started the hardware listener.  Stop it; the lctx
	 * will be released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (rpl->status != CPL_ERR_NONE) {
		listen_hash_del(td, inp);
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
done:
	m_freem(m);
	return (0);
}
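
/*
 * Recover the in_conninfo, tcphdr, and tcpopt (in the form the kernel's
 * syncache expects them) from the fields of a CPL_PASS_ACCEPT_REQ.
 */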
static void
pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	const struct tcp_options *t3opt = &cpl->tcp_options;

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_seq = be32toh(cpl->rcv_isn);	/* as in tcp_fields_to_host */
	th->th_flags = TH_SYN;

	bzero(to, sizeof(*to));
	if (t3opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t3opt->mss);
	}
	if (t3opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t3opt->wsf;
	}
	if (t3opt->tstamp)
		to->to_flags |= TOF_TS;
	if (t3opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}

static inline void
release_synqe(struct synq_entry *synqe)
{

	if (refcount_release(&synqe->refcnt))
		m_freem(synqe->m);
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.  There will be enough room in the mbuf's
 * trailing space as the CPL is not that large.
 *
 * XXX: bad hack.
 */
static struct synq_entry *
mbuf_to_synq_entry(struct mbuf *m)
{
	int len = roundup(sizeof (struct synq_entry), 8);

	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
		    M_TRAILINGSPACE(m), len);
	}

	return ((void *)(M_START(m) + M_SIZE(m) - len));
}

#ifdef KTR
#define REJECT_PASS_ACCEPT() do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
#else
#define REJECT_PASS_ACCEPT() do { goto reject; } while (0)
#endif

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, tp_flags) ==
    offsetof(struct synq_entry, flags));

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_pass_accept_req *req = mtod(m, void *);
	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	unsigned int tid = GET_TID(req);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct l2t_entry *e = NULL;
	struct nhop4_basic nh4;
	struct sockaddr_in nam;
	struct inpcb *inp;
	struct socket *so;
	struct port_info *pi;
	struct ifnet *ifp;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct synq_entry *synqe = NULL;
	int i;
#ifdef KTR
	int reject_reason;
#endif

	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(req, &inc, &th, &to);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	pi = NULL;
	for_each_port(sc, i) {
		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
			continue;
		pi = &sc->port[i];
		break;
	}
	if (pi == NULL)
		REJECT_PASS_ACCEPT();
	ifp = pi->ifp;
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 */
	bzero(&nam, sizeof(nam));
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, nam.sin_addr, 0, 0, &nh4) != 0)
		REJECT_PASS_ACCEPT();
	else {
		nam.sin_addr = nh4.nh_addr;
		if (nh4.nh_ifp == ifp)
			e = t3_l2t_get(pi, ifp, (struct sockaddr *)&nam);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	INP_INFO_RLOCK(&V_tcbinfo);

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Reuse the mbuf that delivered the CPL to us */
	synqe = mbuf_to_synq_entry(m);
	synqe->flags = TP_IS_A_SYNQ_ENTRY;
	synqe->m = m;
	synqe->lctx = lctx;
	synqe->tid = tid;
	synqe->e = e;
	synqe->opt0h = calc_opt0h(so, 0, 0, e);
	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
	SOCKBUF_LOCK(&so->so_rcv);
	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);
	refcount_init(&synqe->refcnt, 1);
	atomic_store_rel_int(&synqe->reply, RPL_OK);

	insert_tid(td, synqe, tid);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);
	hold_lctx(lctx);

	/* syncache_add releases both pcbinfo and pcb locks */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (reply is RPL_DONE), good.
	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
	 * The mbuf will stick around as long as the entry is in the syncache.
	 * The kernel is free to retry syncache_respond but we'll ignore it due
	 * to RPL_DONT.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {
		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGB,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			release_synqe(synqe);
			return (__LINE__);
		}

		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* about to exit function */
		REJECT_PASS_ACCEPT();
	}

	KASSERT(synqe->reply == RPL_DONE,
	    ("%s: reply %d", __func__, synqe->reply));

	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
	release_synqe(synqe);
	return (0);

reject:
	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (synqe == NULL)
		m_freem(m);
	if (e)
		l2t_release(td->l2t, e);
	queue_tid_release(tod, tid);

	return (0);
}
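
/*
 * Recover the protocol header structures needed by syncache_expand from the
 * fields of a CPL_PASS_ESTABLISH.
 */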
static void
pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_flags = TH_ACK;
	th->th_seq = be32toh(cpl->rcv_isn);	/* as in tcp_fields_to_host */
	th->th_ack = be32toh(cpl->snd_isn);	/* ditto */

	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt))
		to->to_flags |= TOF_TS;
}

/*
 * Process a CPL_PASS_ESTABLISH message.  The T3 has already established a
 * connection and we need to do the software side setup.
 */
static int
do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_establish *cpl = mtod(m, void *);
	struct toedev *tod = &td->tod;
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct toepcb *toep;
	struct socket *so;
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct tcpopt to;
	struct tcphdr th;
	struct in_conninfo inc;
#ifdef KTR
	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
#endif

	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
	    __func__, stid, tid, lctx, inp->inp_flags);

	KASSERT(qs->idx == synqe->qset,
	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}

	pass_establish_to_protohdrs(cpl, &inc, &th, &to);

	/* Lie in order to pass the checks in syncache_expand */
	to.to_tsecr = synqe->ts;
	th.th_ack = synqe->iss + 1;

	toep = toepcb_alloc(tod);
	if (toep == NULL) {
reset:
		t3_send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_qset = qs->idx;
	toep->tp_l2t = synqe->e;
	toep->tp_tid = tid;
	toep->tp_rx_credits = synqe->rx_credits;

	synqe->toep = toep;
	synqe->cpl = cpl;

	so = inp->inp_socket;
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		toepcb_free(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t3_offload_socket(tod, synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Remove the synq entry and release its reference on the lctx */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(td, lctx);
	if (inp)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	m_freem(m);
	return (0);
}
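
/*
 * Register the CPL handlers for the passive open part of the connection
 * lifecycle.
 */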
void
t3_init_listen_cpl_handlers(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Can't take the adapter lock here, so access to sc->flags,
 * sc->open_device_map, sc->offload_map, and if_capenable is race prone.
 */
int
t3_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (0);

#ifdef notyet
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(sc->flags & TOM_INIT_DONE,
	    ("%s: TOM not initialized", __func__));
#endif

	if ((sc->open_device_map & sc->offload_map) == 0)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	/*
	 * Find a running port with IFCAP_TOE4.  We'll use the first such
	 * port's queues to send the passive open and receive the reply to it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		if (isset(&sc->open_device_map, i) &&
		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
			break;
	}
	KASSERT(i < sc->params.nports,
	    ("%s: no running port with TOE capability enabled.", __func__));
	pi = &sc->port[i];

	if (listen_hash_find(td, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(td, inp, pi->first_qset);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(td, lctx);

	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
	    lctx->stid, tcpstates[tp->t_state], lctx, inp);

	if (create_server(sc, lctx) != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
		    device_get_nameunit(sc->dev));
		(void) listen_hash_del(td, inp);
		inp = release_lctx(td, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	lctx->flags |= LCTX_RPL_PENDING;
done:
#ifdef notyet
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}

/*
 * Stop a listening server by sending a close_listsvr request to HW.
 * The server TID is freed when we get the reply.
 */
int
t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(td, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);
	}

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp.  It doesn't know about the connections on the synq
	 * so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
		t3_send_reset_synqe(tod, synqe);
	}

	destroy_server(sc, lctx);
	return (0);
}
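
/*
 * The TOE core calls these when a syncache entry backed by one of our synq
 * entries is added or removed.  The synqe's refcount tracks its lifetime in
 * the syncache.
 */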
void
t3_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}

void
t3_syncache_removed(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	release_synqe(synqe);
}
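
/*
 * Called by the TOE core in place of a software SYN|ACK transmission.  We
 * respond with a CPL_PASS_ACCEPT_RPL the first time (during syncache_add);
 * the kernel's later retries are ignored.
 */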
int
t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct l2t_entry *e = synqe->e;
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *th = (void *)(ip + 1);
	struct cpl_pass_accept_rpl *rpl;
	struct mbuf *r;
	struct listen_ctx *lctx = synqe->lctx;
	struct tcpopt to;
	int mtu_idx, cpu_idx;

	/*
	 * The first time we run it's during the call to syncache_add.  That's
	 * the only one we care about.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
		goto done;	/* reply to the CPL only if it's ok to do so */

	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
	if (r == NULL)
		goto done;

	/*
	 * Use only the provided mbuf (with ip and tcp headers) and what's in
	 * synqe.  Avoid looking at the listening socket (lctx->inp) here.
	 *
	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
	 * decides it doesn't want to use TCP timestamps we have no way of
	 * relaying this info to the chip on a per-tid basis (all we have is a
	 * global knob).
	 */
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
	    TO_SYN);

	/* stash them for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
	cpu_idx = sc->rrss_map[synqe->qset];

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wrh_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
	rpl->opt2 = calc_opt2(cpu_idx);
	rpl->rsvd = rpl->opt2;	/* workaround for HW bug */
	rpl->peer_ip = ip->ip_dst.s_addr;
	rpl->opt0h = synqe->opt0h |
	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
	    calc_opt0l(NULL, synqe->rx_credits);

	l2t_send(sc, r, e);
done:
	m_freem(m);
	return (0);
}
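
/*
 * Process an ABORT_REQ_RSS CPL for a connection that is still on the synq.
 * The handler expects the request to arrive twice: the first copy just marks
 * the synqe, the second sends the ABORT_RPL and tears the entry down (unless
 * a reset we sent ourselves is still outstanding).
 */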
int
do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
	    ("%s: !SYNQ_ENTRY", __func__));

	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);

	INP_WLOCK(inp);

	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
		synqe->flags |= TP_ABORT_REQ_RCVD;
		synqe->flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}
	synqe->flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this synqe, we'll ignore this and clean up
	 * in the T3's reply to our reset instead.
	 */
	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		synqe->flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);
		release_tid(tod, tid, qs->idx);
		l2t_release(td->l2t, synqe->e);
		release_synqe(synqe);
	}

	send_abort_rpl(tod, tid, qs->idx);
	m_freem(m);
	return (0);
}
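
/*
 * Process an ABORT_RPL_RSS CPL, the chip's reply to an abort we requested.
 * Like the request handler above, this expects two copies and finishes the
 * teardown only on the second one.
 */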
int
do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	CTR4(KTR_CXGB, "%s: tid %u, synqe %p, status %d", __func__, tid, synqe,
	    rpl->status);

	INP_WLOCK(inp);

	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
			synqe->flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			synqe->flags &= ~TP_ABORT_RPL_RCVD;
			synqe->flags &= ~TP_ABORT_RPL_PENDING;

			TAILQ_REMOVE(&lctx->synq, synqe, link);
			inp = release_lctx(td, lctx);
			if (inp)
				INP_WUNLOCK(inp);
			release_tid(tod, tid, qs->idx);
			l2t_release(td->l2t, synqe->e);
			release_synqe(synqe);
		}
	}

	m_freem(m);
	return (0);
}

static void
t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct cpl_abort_req *req;
	unsigned int tid = synqe->tid;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
#ifdef INVARIANTS
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
	    synqe->flags);

	if (synqe->flags & TP_ABORT_SHUTDOWN)
		return;

	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = 0;
	req->rsvd1 = !(synqe->flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	l2t_send(sc, m, synqe->e);
}

void
t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = sc->tom_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct cpl_pass_establish *cpl = synqe->cpl;
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);	/* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);

	offload_socket(so, toep);
	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	update_tid(td, toep, synqe->tid);
	synqe->flags |= TP_SYNQE_EXPANDED;
}
#endif