/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_listen.c 286227 2015-08-03 12:13:54Z jch $");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_toepcb.h"

static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);

static int
alloc_stid(struct tid_info *t, void *ctx)
{
	int stid = -1;

	mtx_lock(&t->stid_lock);
	if (t->sfree) {
		union listen_entry *p = t->sfree;

		stid = (p - t->stid_tab) + t->stid_base;
		t->sfree = p->next;
		p->ctx = ctx;
		t->stids_in_use++;
	}
	mtx_unlock(&t->stid_lock);
	return (stid);
}

static void
free_stid(struct tid_info *t, int stid)
{
	union listen_entry *p = stid2entry(t, stid);

	mtx_lock(&t->stid_lock);
	p->next = t->sfree;
	t->sfree = p;
	t->stids_in_use--;
	mtx_unlock(&t->stid_lock);
}

static struct listen_ctx *
alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
{
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
	if (lctx == NULL)
		return (NULL);

	lctx->stid = alloc_stid(&td->tid_maps, lctx);
	if (lctx->stid < 0) {
		free(lctx, M_CXGB);
		return (NULL);
	}

	lctx->inp = inp;
	in_pcbref(inp);

	lctx->qset = qset;
	refcount_init(&lctx->refcnt, 1);
	TAILQ_INIT(&lctx->synq);

	return (lctx);
}
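
/*
 * An lctx is born with a reference count of one and two external ties: a
 * reference on the listening socket's inpcb (taken via in_pcbref above) and
 * the stid whose context pointer leads back to the lctx. free_lctx below
 * undoes both once the last hold is dropped via release_lctx.
 */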

/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcnt == 0,
	    ("%s: refcnt %d", __func__, lctx->refcnt));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	free_stid(&td->tid_maps, lctx->stid);
	free(lctx, M_CXGB);

	return in_pcbrele_wlocked(inp);
}

static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcnt);
}

static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
{
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Remove the listen_ctx structure for inp from the hash and return it.
 */
static struct listen_ctx *
listen_hash_del(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx. Must be called with the listening socket's inp
 * locked. The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcnt))
		inp_freed = free_lctx(td, lctx);

	return (inp_freed ? NULL : inp);
}
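
/*
 * The hardware listener is controlled with two CPLs sent on the listen
 * context's queue set: create_server() issues a CPL_PASS_OPEN_REQ for the
 * stid and destroy_server() a CPL_CLOSE_LISTSRV_REQ. The chip's replies are
 * handled by do_pass_open_rpl() and do_close_server_rpl() below.
 */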

static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	req->peer_port = 0;
	req->peer_ip = 0;
	req->peer_netmask = 0;
	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
	req->opt0l = htonl(V_RCV_BUFSIZ(16));
	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));

	t3_offload_tx(sc, m);

	return (0);
}

static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_close_listserv_req *req;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	req->cpu_idx = 0;

	t3_offload_tx(sc, m);

	return (0);
}

/*
 * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release
 * the STID.
 */
static int
do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
	unsigned int stid = GET_TID(rpl);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct inpcb *inp = lctx->inp;

	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u",
		    __func__, rpl->status, stid);
	} else {
		INP_WLOCK(inp);
		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
	}

	m_freem(m);
	return (0);
}

/*
 * Process a CPL_PASS_OPEN_RPL message. Remove the lctx from the listen hash
 * table and free it if there was any error, otherwise nothing to do.
 */
static int
do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
	int stid = GET_TID(rpl);
	struct listen_ctx *lctx;
	struct inpcb *inp;

	/*
	 * We get these replies also when setting up HW filters. Just throw
	 * those away.
	 */
	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
		goto done;

	lctx = lookup_stid(&td->tid_maps, stid);
	inp = lctx->inp;

	INP_WLOCK(inp);

	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
	    __func__, stid, rpl->status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
	}

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(td, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	if (inp->inp_flags & INP_DROPPED && rpl->status != CPL_ERR_NONE) {
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener. Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Failed to start hardware listener. Take inp out of the hash and
	 * release our reference on it. An error message has been logged
	 * already.
	 */
	if (rpl->status != CPL_ERR_NONE) {
		listen_hash_del(td, inp);
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
done:
	m_freem(m);
	return (0);
}
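
/*
 * Convert the SYN information in a CPL_PASS_ACCEPT_REQ into the in_conninfo,
 * tcphdr, and tcpopt that the kernel syncache expects, so that the offloaded
 * SYN can be fed to the regular syncache_add path.
 */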
static void
pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	const struct tcp_options *t3opt = &cpl->tcp_options;

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_flags = TH_SYN;

	bzero(to, sizeof(*to));
	if (t3opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t3opt->mss);
	}
	if (t3opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t3opt->wsf;
	}
	if (t3opt->tstamp)
		to->to_flags |= TOF_TS;
	if (t3opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}

static inline void
release_synqe(struct synq_entry *synqe)
{

	if (refcount_release(&synqe->refcnt))
		m_freem(synqe->m);
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily. There will be enough room in the mbuf's
 * trailing space as the CPL is not that large.
 *
 * XXX: bad hack.
 */
static struct synq_entry *
mbuf_to_synq_entry(struct mbuf *m)
{
	int len = roundup(sizeof (struct synq_entry), 8);

	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
		    M_TRAILINGSPACE(m), len);
	}

	return ((void *)(M_START(m) + M_SIZE(m) - len));
}

#ifdef KTR
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
#else
#define REJECT_PASS_ACCEPT()	do { goto reject; } while (0)
#endif

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags));
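
/*
 * Thanks to the CTASSERT above, a CPL handler handed an opaque tid context
 * can read the flags word at that common offset and test TP_IS_A_SYNQ_ENTRY
 * to tell a synq_entry from a toepcb (do_abort_req_synqe below asserts
 * exactly this before touching its synqe).
 */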

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_pass_accept_req *req = mtod(m, void *);
	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	unsigned int tid = GET_TID(req);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct l2t_entry *e = NULL;
	struct sockaddr_in nam;
	struct rtentry *rt;
	struct inpcb *inp;
	struct socket *so;
	struct port_info *pi;
	struct ifnet *ifp;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct synq_entry *synqe = NULL;
	int i;
#ifdef KTR
	int reject_reason;
#endif

	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(req, &inc, &th, &to);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	pi = NULL;
	for_each_port(sc, i) {
		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
			continue;
		pi = &sc->port[i];
		break;
	}
	if (pi == NULL)
		REJECT_PASS_ACCEPT();
	ifp = pi->ifp;
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 */
	bzero(&nam, sizeof(nam));
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
	if (rt == NULL)
		REJECT_PASS_ACCEPT();
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
		    (struct sockaddr *)&nam;
		if (rt->rt_ifp == ifp)
			e = t3_l2t_get(pi, rt->rt_ifp, nexthop);
		RTFREE(rt);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	INP_INFO_RLOCK(&V_tcbinfo);

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed. The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Reuse the mbuf that delivered the CPL to us */
	synqe = mbuf_to_synq_entry(m);
	synqe->flags = TP_IS_A_SYNQ_ENTRY;
	synqe->m = m;
	synqe->lctx = lctx;
	synqe->tid = tid;
	synqe->e = e;
	synqe->opt0h = calc_opt0h(so, 0, 0, e);
	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
	SOCKBUF_LOCK(&so->so_rcv);
	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);
	refcount_init(&synqe->refcnt, 1);
	atomic_store_rel_int(&synqe->reply, RPL_OK);

	insert_tid(td, synqe, tid);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);
	hold_lctx(lctx);
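
	/*
	 * At this point the synqe holds two references: the one it was
	 * initialized with and the one taken just above for its place on the
	 * lctx's synq. toe_syncache_add below results in a third, taken by
	 * t3_syncache_added on behalf of the syncache.
	 */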

	/* syncache_add releases both pcbinfo and pcb locks */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (reply is RPL_DONE), good.
	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
	 * The mbuf will stick around as long as the entry is in the syncache.
	 * The kernel is free to retry syncache_respond but we'll ignore it due
	 * to RPL_DONT.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {

		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed. synqe must have been aborted. */
			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGB,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			release_synqe(synqe);
			return (__LINE__);
		}

		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* about to exit function */
		REJECT_PASS_ACCEPT();
	}

	KASSERT(synqe->reply == RPL_DONE,
	    ("%s: reply %d", __func__, synqe->reply));

	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
	release_synqe(synqe);
	return (0);

reject:
	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (synqe == NULL)
		m_freem(m);
	if (e)
		l2t_release(td->l2t, e);
	queue_tid_release(tod, tid);

	return (0);
}

static void
pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_flags = TH_ACK;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_ack = be32toh(cpl->snd_isn); /* ditto */

	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt))
		to->to_flags |= TOF_TS;
}
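
/*
 * As with pass_accept_req_to_protohdrs above, the CPL carries enough to fake
 * up the final ACK of the 3-way handshake; the iss and timestamp stashed in
 * the synqe by t3_syncache_respond supply what the CPL itself lacks.
 */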
/*
 * Process a CPL_PASS_ESTABLISH message. The T3 has already established a
 * connection and we need to do the software side setup.
 */
static int
do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_establish *cpl = mtod(m, void *);
	struct toedev *tod = &td->tod;
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct toepcb *toep;
	struct socket *so;
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct tcpopt to;
	struct tcphdr th;
	struct in_conninfo inc;
#ifdef KTR
	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
#endif

	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
	    __func__, stid, tid, lctx, inp->inp_flags);

	KASSERT(qs->idx == synqe->qset,
	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed. The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq. do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}

	pass_establish_to_protohdrs(cpl, &inc, &th, &to);

	/* Lie in order to pass the checks in syncache_expand */
	to.to_tsecr = synqe->ts;
	th.th_ack = synqe->iss + 1;

	toep = toepcb_alloc(tod);
	if (toep == NULL) {
reset:
		t3_send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_qset = qs->idx;
	toep->tp_l2t = synqe->e;
	toep->tp_tid = tid;
	toep->tp_rx_credits = synqe->rx_credits;

	synqe->toep = toep;
	synqe->cpl = cpl;

	so = inp->inp_socket;
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		toepcb_free(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t3_offload_socket(tod, synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Remove the synq entry and release its reference on the lctx */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(td, lctx);
	if (inp)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	m_freem(m);
	return (0);
}

void
t3_init_listen_cpl_handlers(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
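
/*
 * The four handlers registered above make up the passive open state machine:
 * PASS_OPEN_RPL and CLOSE_LISTSRV_RPL manage the hardware listener itself,
 * PASS_ACCEPT_REQ runs an incoming SYN through the kernel syncache, and
 * PASS_ESTABLISH promotes the syncache entry to a fully offloaded connection.
 */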

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Can't take the adapter lock here, so accesses to sc->flags,
 * sc->open_device_map, sc->offload_map, and if_capenable are all race prone.
 */
int
t3_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (0);

#ifdef notyet
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(sc->flags & TOM_INIT_DONE,
	    ("%s: TOM not initialized", __func__));
#endif

	if ((sc->open_device_map & sc->offload_map) == 0)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	/*
	 * Find a running port with IFCAP_TOE4. We'll use the first such port's
	 * queues to send the passive open and receive the reply to it.
	 *
	 * XXX: need a way to mark a port in use by offload. if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		if (isset(&sc->open_device_map, i) &&
		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
			break;
	}
	KASSERT(i < sc->params.nports,
	    ("%s: no running port with TOE capability enabled.", __func__));
	pi = &sc->port[i];

	if (listen_hash_find(td, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(td, inp, pi->first_qset);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(td, lctx);

	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
	    lctx->stid, tcpstates[tp->t_state], lctx, inp);

	if (create_server(sc, lctx) != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
		    device_get_nameunit(sc->dev));
		(void) listen_hash_del(td, inp);
		inp = release_lctx(td, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	lctx->flags |= LCTX_RPL_PENDING;
done:
#ifdef notyet
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}
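
/*
 * LCTX_RPL_PENDING, set at the tail of t3_listen_start, is what
 * t3_listen_stop keys off of: until do_pass_open_rpl has seen the chip's
 * reply and cleared the flag, tearing down the hardware listener is left to
 * that reply handler.
 */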
/*
 * Stop a listening server by sending a CLOSE_LISTSRV request to HW.
 * The server TID is freed when we get the reply.
 */
int
t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(td, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);
	}

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp. It doesn't know about the connections on the synq
	 * so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
		t3_send_reset_synqe(tod, synqe);
	}

	destroy_server(sc, lctx);
	return (0);
}

void
t3_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}

void
t3_syncache_removed(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	release_synqe(synqe);
}

/* XXX */
extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
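
/*
 * Called when the syncache wants to transmit the SYN|ACK for an embryonic
 * offloaded connection. Rather than sending an actual segment this answers
 * the chip's PASS_ACCEPT_REQ with a CPL_PASS_ACCEPT_RPL, and only on the
 * first call (from within syncache_add); synqe->reply serializes this:
 *
 *   RPL_OK --(t3_syncache_respond)--> RPL_DONE	reply sent to the chip
 *   RPL_OK --(do_pass_accept_req)---> RPL_DONT	too late to ever reply
 */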
int
t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct l2t_entry *e = synqe->e;
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *th = (void *)(ip + 1);
	struct cpl_pass_accept_rpl *rpl;
	struct mbuf *r;
	struct listen_ctx *lctx = synqe->lctx;
	struct tcpopt to;
	int mtu_idx, cpu_idx;

	/*
	 * The first time we run it's during the call to syncache_add. That's
	 * the only one we care about.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
		goto done;	/* reply to the CPL only if it's ok to do so */

	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
	if (r == NULL)
		goto done;

	/*
	 * Use only the provided mbuf (with ip and tcp headers) and what's in
	 * synqe. Avoid looking at the listening socket (lctx->inp) here.
	 *
	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
	 * decides it doesn't want to use TCP timestamps we have no way of
	 * relaying this info to the chip on a per-tid basis (all we have is a
	 * global knob).
	 */
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
	    TO_SYN);

	/* stash them for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
	cpu_idx = sc->rrss_map[synqe->qset];

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wrh_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
	rpl->opt2 = calc_opt2(cpu_idx);
	rpl->rsvd = rpl->opt2;	/* workaround for HW bug */
	rpl->peer_ip = ip->ip_dst.s_addr;
	rpl->opt0h = synqe->opt0h |
	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
	    calc_opt0l(NULL, synqe->rx_credits);

	l2t_send(sc, r, e);
done:
	m_freem(m);
	return (0);
}
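
/*
 * Abort handling for embryonic connections. do_abort_req_synqe expects
 * CPL_ABORT_REQ_RSS to arrive twice for a synq tid: the first copy is only
 * noted (TP_ABORT_REQ_RCVD) and the actual teardown is deferred until the
 * second copy arrives.
 */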
int
do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
	    ("%s: !SYNQ_ENTRY", __func__));

	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);

	INP_WLOCK(inp);

	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
		synqe->flags |= TP_ABORT_REQ_RCVD;
		synqe->flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}
	synqe->flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this synqe, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		synqe->flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);
		release_tid(tod, tid, qs->idx);
		l2t_release(td->l2t, synqe->e);
		release_synqe(synqe);
	}

	send_abort_rpl(tod, tid, qs->idx);
	m_freem(m);
	return (0);
}

int
do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p, status %d", __func__, tid,
	    synqe, rpl->status);

	INP_WLOCK(inp);

	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
			synqe->flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			synqe->flags &= ~TP_ABORT_RPL_RCVD;
			synqe->flags &= ~TP_ABORT_RPL_PENDING;

			TAILQ_REMOVE(&lctx->synq, synqe, link);
			inp = release_lctx(td, lctx);
			if (inp)
				INP_WUNLOCK(inp);
			release_tid(tod, tid, qs->idx);
			l2t_release(td->l2t, synqe->e);
			release_synqe(synqe);
		}
	}

	m_freem(m);
	return (0);
}

static void
t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct cpl_abort_req *req;
	unsigned int tid = synqe->tid;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
#ifdef INVARIANTS
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
	    synqe->flags);

	if (synqe->flags & TP_ABORT_SHUTDOWN)
		return;

	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = 0;
	req->rsvd1 = !(synqe->flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	l2t_send(sc, m, synqe->e);
}

void
t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = sc->tom_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct cpl_pass_establish *cpl = synqe->cpl;
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);	/* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);

	offload_socket(so, toep);
	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	update_tid(td, toep, synqe->tid);
	synqe->flags |= TP_SYNQE_EXPANDED;
}
#endif