/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/cxgb/ulp/tom/cxgb_listen.c 309108 2016-11-24 14:48:46Z jch $");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include "cxgb_include.h"
#include "ulp/tom/cxgb_tom.h"
#include "ulp/tom/cxgb_l2t.h"
#include "ulp/tom/cxgb_toepcb.h"

static void t3_send_reset_synqe(struct toedev *, struct synq_entry *);

static int
alloc_stid(struct tid_info *t, void *ctx)
{
	int stid = -1;

	mtx_lock(&t->stid_lock);
	if (t->sfree) {
		union listen_entry *p = t->sfree;

		stid = (p - t->stid_tab) + t->stid_base;
		t->sfree = p->next;
		p->ctx = ctx;
		t->stids_in_use++;
	}
	mtx_unlock(&t->stid_lock);
	return (stid);
}

static void
free_stid(struct tid_info *t, int stid)
{
	union listen_entry *p = stid2entry(t, stid);

	mtx_lock(&t->stid_lock);
	p->next = t->sfree;
	t->sfree = p;
	t->stids_in_use--;
	mtx_unlock(&t->stid_lock);
}

static struct listen_ctx *
alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset)
{
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO);
	if (lctx == NULL)
		return (NULL);

	lctx->stid = alloc_stid(&td->tid_maps, lctx);
	if (lctx->stid < 0) {
		free(lctx, M_CXGB);
		return (NULL);
	}

	lctx->inp = inp;
	in_pcbref(inp);

	lctx->qset = qset;
	refcount_init(&lctx->refcnt, 1);
	TAILQ_INIT(&lctx->synq);

	return (lctx);
}
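
/*
 * An lctx is created with a single reference, held by the listen hash.
 * Each synq entry linked to the lctx takes another (hold_lctx).  When the
 * last reference is released the lctx is freed and the hold on the
 * listening socket's inpcb (taken in alloc_lctx via in_pcbref) is dropped.
 */
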
/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcnt == 0,
	    ("%s: refcnt %d", __func__, lctx->refcnt));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	free_stid(&td->tid_maps, lctx->stid);
	free(lctx, M_CXGB);

	return (in_pcbrele_wlocked(inp));
}

static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcnt);
}

static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct tom_data *td, struct listen_ctx *lctx)
{
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	td->lctx_count++;
	mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
			break;
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct tom_data *td, struct inpcb *inp)
{
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
			td->lctx_count--;
			break;
		}
	}
	mtx_unlock(&td->lctx_hash_lock);

	return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's
 * inp locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct tom_data *td, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;
	int inp_freed = 0;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcnt))
		inp_freed = free_lctx(td, lctx);

	return (inp_freed ? NULL : inp);
}
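
/*
 * Program a hardware listener with a CPL_PASS_OPEN_REQ.  Only the local
 * address and port are filled in; the peer address, port, and netmask are
 * left as wildcards so the server TID matches any incoming SYN, and
 * CONN_POLICY_ASK has the chip hand each SYN up to the driver (see
 * do_pass_accept_req) instead of completing the handshake on its own.
 */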
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	req->peer_port = 0;
	req->peer_ip = 0;
	req->peer_netmask = 0;
	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
	req->opt0l = htonl(V_RCV_BUFSIZ(16));
	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));

	t3_offload_tx(sc, m);

	return (0);
}

static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct mbuf *m;
	struct cpl_close_listserv_req *req;

	m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (ENOMEM);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	req->cpu_idx = 0;

	t3_offload_tx(sc, m);

	return (0);
}

/*
 * Process a CPL_CLOSE_LISTSRV_RPL message.  If the status is good we release
 * the STID.
 */
static int
do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_close_listserv_rpl *rpl = mtod(m, void *);
	unsigned int stid = GET_TID(rpl);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct inpcb *inp = lctx->inp;

	CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, rpl->status, stid);
	} else {
		INP_WLOCK(inp);
		KASSERT(listen_hash_del(td, lctx->inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
	}

	m_freem(m);
	return (0);
}
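
/*
 * A PASS_OPEN_RPL may race with the listening socket closing.  The cases
 * handled below: socket closed and hardware open failed (just release the
 * lctx), socket closed but hardware open succeeded (stop the hardware
 * listener and let do_close_server_rpl clean up), socket still open but
 * hardware open failed (unhash and release the lctx), and the normal case
 * where the hardware listener is open for business.
 */
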
/*
 * Process a CPL_PASS_OPEN_RPL message.  If there was an error, remove the
 * lctx from the listen hash table and free it; otherwise there is nothing
 * to do.
 */
static int
do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_open_rpl *rpl = mtod(m, void *);
	int stid = GET_TID(rpl);
	struct listen_ctx *lctx;
	struct inpcb *inp;

	/*
	 * We also get these replies when setting up HW filters.  Just throw
	 * those away.
	 */
	if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids)
		goto done;

	lctx = lookup_stid(&td->tid_maps, stid);
	inp = lctx->inp;

	INP_WLOCK(inp);

	CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x",
	    __func__, stid, rpl->status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n",
		    __func__, device_get_nameunit(sc->dev), stid, rpl->status);
	}

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(td, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	if (inp->inp_flags & INP_DROPPED && rpl->status != CPL_ERR_NONE) {
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener.  Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		goto done;
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (rpl->status != CPL_ERR_NONE) {
		listen_hash_del(td, inp);
		if (release_lctx(td, lctx) != NULL)
			INP_WUNLOCK(inp);
		goto done;
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
done:
	m_freem(m);
	return (0);
}

static void
pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	const struct tcp_options *t3opt = &cpl->tcp_options;

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_flags = TH_SYN;

	bzero(to, sizeof(*to));
	if (t3opt->mss) {
		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t3opt->mss);
	}
	if (t3opt->wsf) {
		to->to_flags |= TOF_SCALE;
		to->to_wscale = t3opt->wsf;
	}
	if (t3opt->tstamp)
		to->to_flags |= TOF_TS;
	if (t3opt->sack)
		to->to_flags |= TOF_SACKPERM;
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}

static inline void
release_synqe(struct synq_entry *synqe)
{

	if (refcount_release(&synqe->refcnt))
		m_freem(synqe->m);
}

/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.  There will be enough room in the mbuf's
 * trailing space as the CPL is not that large.
 *
 * XXX: bad hack.
 */
static struct synq_entry *
mbuf_to_synq_entry(struct mbuf *m)
{
	int len = roundup(sizeof (struct synq_entry), 8);
	uint8_t *buf;
	int buflen;

	if (__predict_false(M_TRAILINGSPACE(m) < len)) {
		panic("%s: no room for synq_entry (%td, %d)\n", __func__,
		    M_TRAILINGSPACE(m), len);
	}

	if (m->m_flags & M_EXT) {
		buf = m->m_ext.ext_buf;
		buflen = m->m_ext.ext_size;
	} else if (m->m_flags & M_PKTHDR) {
		buf = &m->m_pktdat[0];
		buflen = MHLEN;
	} else {
		buf = &m->m_dat[0];
		buflen = MLEN;
	}

	return ((void *)(buf + buflen - len));
}

#ifdef KTR
#define REJECT_PASS_ACCEPT()	do { \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
#else
#define REJECT_PASS_ACCEPT()	do { goto reject; } while (0)
#endif
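
/*
 * A synq_entry starts with one reference for being on the lctx's synq; the
 * syncache takes another via t3_syncache_added and drops it in
 * t3_syncache_removed.  Because the entry lives in the trailing space of
 * the mbuf that carried the PASS_ACCEPT_REQ, that mbuf is freed only when
 * the last reference goes away (see release_synqe).
 */
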
/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell them apart is via a flag
 * bit, which is why the flags field must live at the same offset in both
 * structures.
 */
CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags));

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_pass_accept_req *req = mtod(m, void *);
	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	unsigned int tid = GET_TID(req);
	struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid);
	struct l2t_entry *e = NULL;
	struct sockaddr_in nam;
	struct rtentry *rt;
	struct inpcb *inp;
	struct socket *so;
	struct port_info *pi;
	struct ifnet *ifp;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct synq_entry *synqe = NULL;
	int i;
#ifdef KTR
	int reject_reason;
#endif

	CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(req, &inc, &th, &to);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	pi = NULL;
	for_each_port(sc, i) {
		if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN))
			continue;
		pi = &sc->port[i];
		break;
	}
	if (pi == NULL)
		REJECT_PASS_ACCEPT();
	ifp = pi->ifp;
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 */
	bzero(&nam, sizeof(nam));
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
	if (rt == NULL)
		REJECT_PASS_ACCEPT();
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
		    (struct sockaddr *)&nam;
		if (rt->rt_ifp == ifp)
			e = t3_l2t_get(pi, rt->rt_ifp, nexthop);
		RTFREE(rt);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}
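
	/*
	 * The rest of this handler runs with the tcbinfo read lock held: it
	 * covers the 4-tuple check and is expected by toe_syncache_add,
	 * which releases it (together with the inp lock) inside
	 * syncache_add.
	 */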
	INP_INFO_RLOCK(&V_tcbinfo);

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket (not owned by the TOE) */
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	/* Reuse the mbuf that delivered the CPL to us */
	synqe = mbuf_to_synq_entry(m);
	synqe->flags = TP_IS_A_SYNQ_ENTRY;
	synqe->m = m;
	synqe->lctx = lctx;
	synqe->tid = tid;
	synqe->e = e;
	synqe->opt0h = calc_opt0h(so, 0, 0, e);
	synqe->qset = pi->first_qset + (arc4random() % pi->nqsets);
	SOCKBUF_LOCK(&so->so_rcv);
	synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);
	refcount_init(&synqe->refcnt, 1);
	atomic_store_rel_int(&synqe->reply, RPL_OK);

	insert_tid(td, synqe, tid);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);
	hold_lctx(lctx);

	/* syncache_add releases both pcbinfo and pcb locks */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (reply is RPL_DONE), good.
	 * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply.
	 * The mbuf will stick around as long as the entry is in the syncache.
	 * The kernel is free to retry syncache_respond but we'll ignore it due
	 * to RPL_DONT.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) {
		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGB,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			release_synqe(synqe);
			return (__LINE__);
		}

		KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* about to exit function */
		REJECT_PASS_ACCEPT();
	}

	KASSERT(synqe->reply == RPL_DONE,
	    ("%s: reply %d", __func__, synqe->reply));

	CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid);
	release_synqe(synqe);
	return (0);

reject:
	CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (synqe == NULL)
		m_freem(m);
	if (e)
		l2t_release(td->l2t, e);
	queue_tid_release(tod, tid);

	return (0);
}
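
/*
 * Convert a CPL_PASS_ESTABLISH to the in_conninfo/tcphdr/tcpopt triple that
 * the stack's syncache code expects, as if the final ACK of the 3-way
 * handshake had arrived as a normal packet.
 */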
static void
pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl,
    struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	bzero(inc, sizeof(*inc));
	inc->inc_faddr.s_addr = cpl->peer_ip;
	inc->inc_laddr.s_addr = cpl->local_ip;
	inc->inc_fport = cpl->peer_port;
	inc->inc_lport = cpl->local_port;

	bzero(th, sizeof(*th));
	th->th_sport = cpl->peer_port;
	th->th_dport = cpl->local_port;
	th->th_flags = TH_ACK;
	th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */
	th->th_ack = be32toh(cpl->snd_isn); /* ditto */

	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt))
		to->to_flags |= TOF_TS;
}

/*
 * Process a CPL_PASS_ESTABLISH message.  The T3 has already established a
 * connection and we need to do the software side setup.
 */
static int
do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_pass_establish *cpl = mtod(m, void *);
	struct toedev *tod = &td->tod;
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct toepcb *toep;
	struct socket *so;
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct tcpopt to;
	struct tcphdr th;
	struct in_conninfo inc;
#ifdef KTR
	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
#endif

	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
	    __func__, stid, tid, lctx, inp->inp_flags);

	KASSERT(qs->idx == synqe->qset,
	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));

	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}

	pass_establish_to_protohdrs(cpl, &inc, &th, &to);

	/* Lie in order to pass the checks in syncache_expand */
	to.to_tsecr = synqe->ts;
	th.th_ack = synqe->iss + 1;

	toep = toepcb_alloc(tod);
	if (toep == NULL) {
reset:
		t3_send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_qset = qs->idx;
	toep->tp_l2t = synqe->e;
	toep->tp_tid = tid;
	toep->tp_rx_credits = synqe->rx_credits;

	synqe->toep = toep;
	synqe->cpl = cpl;

	so = inp->inp_socket;
	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		toepcb_free(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);

	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t3_offload_socket(tod, synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Remove the synq entry and release its reference on the lctx */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(td, lctx);
	if (inp)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	m_freem(m);
	return (0);
}

void
t3_init_listen_cpl_handlers(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
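
/*
 * t3_listen_start and t3_listen_stop below are the driver's
 * tod_listen_start and tod_listen_stop methods; the TCP stack invokes them
 * (via toecore) whenever a socket enters or leaves the LISTEN state.
 */
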
/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Can't take the adapter lock here, so accesses to sc->flags,
 * sc->open_device_map, sc->offload_map, and if_capenable are all race prone.
 */
int
t3_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (0);

#ifdef notyet
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(sc->flags & TOM_INIT_DONE,
	    ("%s: TOM not initialized", __func__));
#endif

	if ((sc->open_device_map & sc->offload_map) == 0)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	/*
	 * Find a running port with IFCAP_TOE4.  We'll use the first such
	 * port's queues to send the passive open and receive the reply to it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgb should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		if (isset(&sc->open_device_map, i) &&
		    sc->port[i].ifp->if_capenable & IFCAP_TOE4)
			break;
	}
	KASSERT(i < sc->params.nports,
	    ("%s: no running port with TOE capability enabled.", __func__));
	pi = &sc->port[i];

	if (listen_hash_find(td, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(td, inp, pi->first_qset);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(td, lctx);

	CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__,
	    lctx->stid, tcpstates[tp->t_state], lctx, inp);

	if (create_server(sc, lctx) != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
		    device_get_nameunit(sc->dev));
		(void) listen_hash_del(td, inp);
		inp = release_lctx(td, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	lctx->flags |= LCTX_RPL_PENDING;
done:
#ifdef notyet
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}
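
/*
 * LCTX_RPL_PENDING is set above once the PASS_OPEN_REQ is on the wire and
 * is cleared by do_pass_open_rpl; t3_listen_stop uses it to decide whether
 * teardown can proceed immediately or must be left to the reply handler.
 */
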
/*
 * Stop a listening server by sending a close_listsvr request to HW.
 * The server TID is freed when we get the reply.
 */
int
t3_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(td, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);
	}

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp.  It doesn't know about the connections on the
	 * synq, so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__));
		t3_send_reset_synqe(tod, synqe);
	}

	destroy_server(sc, lctx);
	return (0);
}

void
t3_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}

void
t3_syncache_removed(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	release_synqe(synqe);
}

/* XXX */
extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
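
/*
 * Reply to a synq entry's PASS_ACCEPT_REQ with a CPL_PASS_ACCEPT_RPL,
 * standing in for the software SYN|ACK.  Only the first call (the one made
 * from within syncache_add) generates a reply; the stack's retransmit
 * attempts find reply != RPL_OK and are ignored.
 */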
int
t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct l2t_entry *e = synqe->e;
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *th = (void *)(ip + 1);
	struct cpl_pass_accept_rpl *rpl;
	struct mbuf *r;
	struct listen_ctx *lctx = synqe->lctx;
	struct tcpopt to;
	int mtu_idx, cpu_idx;

	/*
	 * The first time we run it's during the call to syncache_add.  That's
	 * the only one we care about.
	 */
	if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0)
		goto done;	/* reply to the CPL only if it's ok to do so */

	r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl);
	if (r == NULL)
		goto done;

	/*
	 * Use only the provided mbuf (with ip and tcp headers) and what's in
	 * synqe.  Avoid looking at the listening socket (lctx->inp) here.
	 *
	 * XXX: if the incoming SYN had the TCP timestamp option but the kernel
	 * decides it doesn't want to use TCP timestamps we have no way of
	 * relaying this info to the chip on a per-tid basis (all we have is a
	 * global knob).
	 */
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
	    TO_SYN);

	/* stash them for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss);
	cpu_idx = sc->rrss_map[synqe->qset];

	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wrh_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid));
	rpl->opt2 = calc_opt2(cpu_idx);
	rpl->rsvd = rpl->opt2;	/* workaround for HW bug */
	rpl->peer_ip = ip->ip_dst.s_addr;
	rpl->opt0h = synqe->opt0h |
	    calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL);
	rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) |
	    calc_opt0l(NULL, synqe->rx_credits);

	l2t_send(sc, r, e);
done:
	m_freem(m);
	return (0);
}
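
/*
 * Handle an ABORT_REQ_RSS for a tid that is still a synq entry.  The
 * handler acts only on the second of the two copies of this message that
 * are delivered (TP_ABORT_REQ_RCVD tracks the first), and defers cleanup
 * to do_abort_rpl_synqe if a reset of our own is outstanding
 * (TP_ABORT_RPL_PENDING).
 */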
int
do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY,
	    ("%s: !SYNQ_ENTRY", __func__));

	CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, req->status);

	INP_WLOCK(inp);

	if (!(synqe->flags & TP_ABORT_REQ_RCVD)) {
		synqe->flags |= TP_ABORT_REQ_RCVD;
		synqe->flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}
	synqe->flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this synqe, we'll ignore this and clean up
	 * in the T3's reply to our reset instead.
	 */
	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		synqe->flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		inp = release_lctx(td, lctx);
		if (inp)
			INP_WUNLOCK(inp);
		release_tid(tod, tid, qs->idx);
		l2t_release(td->l2t, synqe->e);
		release_synqe(synqe);
	}

	send_abort_rpl(tod, tid, qs->idx);
	m_freem(m);
	return (0);
}

int
do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p, status %d", __func__, tid,
	    synqe, rpl->status);

	INP_WLOCK(inp);

	if (synqe->flags & TP_ABORT_RPL_PENDING) {
		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
			synqe->flags |= TP_ABORT_RPL_RCVD;
			INP_WUNLOCK(inp);
		} else {
			synqe->flags &= ~TP_ABORT_RPL_RCVD;
			synqe->flags &= ~TP_ABORT_RPL_PENDING;

			TAILQ_REMOVE(&lctx->synq, synqe, link);
			inp = release_lctx(td, lctx);
			if (inp)
				INP_WUNLOCK(inp);
			release_tid(tod, tid, qs->idx);
			l2t_release(td->l2t, synqe->e);
			release_synqe(synqe);
		}
	}

	m_freem(m);
	return (0);
}

static void
t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct cpl_abort_req *req;
	unsigned int tid = synqe->tid;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
#ifdef INVARIANTS
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
	    synqe->flags);

	if (synqe->flags & TP_ABORT_SHUTDOWN)
		return;

	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = 0;
	req->rsvd1 = !(synqe->flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	l2t_send(sc, m, synqe->e);
}

void
t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct tom_data *td = sc->tom_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct cpl_pass_establish *cpl = synqe->cpl;
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);	/* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);

	offload_socket(so, toep);
	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	update_tid(td, toep, synqe->tid);
	synqe->flags |= TP_SYNQE_EXPANDED;
}
#endif