1/*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 330303 2018-03-03 00:54:12Z jhb $"); 29 30#include "opt_inet.h" 31 32#ifdef TCP_OFFLOAD 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <sys/sockbuf.h> 47#include <sys/sysctl.h> 48#include <sys/syslog.h> 49#include <sys/protosw.h> 50#include <sys/priv.h> 51#include <sys/sglist.h> 52#include <sys/taskqueue.h> 53 54#include <net/if.h> 55#include <net/ethernet.h> 56#include <net/route.h> 57 58#include <netinet/in.h> 59#include <netinet/in_pcb.h> 60#include <netinet/in_systm.h> 61#include <netinet/in_var.h> 62 63#include <netinet/ip.h> 64#include <netinet/tcp_var.h> 65#define TCPSTATES 66#include <netinet/tcp_fsm.h> 67#include <netinet/toecore.h> 68#include <netinet/tcp_seq.h> 69#include <netinet/tcp_timer.h> 70#include <net/route.h> 71 72#include "cxgb_include.h" 73#include "ulp/tom/cxgb_l2t.h" 74#include "ulp/tom/cxgb_tom.h" 75#include "ulp/tom/cxgb_toepcb.h" 76 77VNET_DECLARE(int, tcp_do_autosndbuf); 78#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) 79VNET_DECLARE(int, tcp_autosndbuf_inc); 80#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) 81VNET_DECLARE(int, tcp_autosndbuf_max); 82#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) 83VNET_DECLARE(int, tcp_do_autorcvbuf); 84#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 85VNET_DECLARE(int, tcp_autorcvbuf_inc); 86#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 87VNET_DECLARE(int, tcp_autorcvbuf_max); 88#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 89 90/* 91 * For ULP connections HW may add headers, e.g., for digests, that aren't part 92 * of the messages sent by the host but that are part of the 
 * TCP payload and therefore consume TCP sequence space.  Tx connection
 * parameters that operate in TCP sequence space are affected by the HW
 * additions and need to compensate for them to accurately track TCP sequence
 * numbers.  This array contains the compensating extra lengths for ULP
 * packets.  It is indexed by a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/*
 * The inp's IP TOS byte with the two low-order (ECN) bits shifted out, masked
 * to the width of the hardware's TOS field.
 */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

static void t3_release_offload_resources(struct toepcb *);
static void send_reset(struct toepcb *toep);

/*
 * Called after the last CPL for the toepcb has been received.
 *
 * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
 * time this function exits.
 *
 * Returns the result of in_pcbrele_wlocked(): non-zero if that call freed the
 * inp, zero if the inp is still around (in which case it was unlocked here).
 */
static int
toepcb_release(struct toepcb *toep)
{
	struct inpcb *inp = toep->tp_inp;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	int rc;

	INP_WLOCK_ASSERT(inp);
	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
	    ("%s: double release?", __func__));

	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);

	/* No more CPLs are expected; sever the toepcb -> inp link. */
	toep->tp_flags |= TP_CPL_DONE;
	toep->tp_inp = NULL;

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	/*
	 * If the kernel has already detached its tcpcb (TP_ATTACHED cleared in
	 * toepcb_detach()) then the driver is the last user and the offload
	 * resources can be freed now.
	 */
	if (!(toep->tp_flags & TP_ATTACHED))
		t3_release_offload_resources(toep);

	/* Drop the hold installed by offload_socket(). */
	rc = in_pcbrele_wlocked(inp);
	if (!rc)
		INP_WUNLOCK(inp);
	return (rc);
}

/*
 * One sided detach.
The tcpcb is going away and we need to unhook the toepcb 155 * hanging off it. If the TOE driver is also done with the toepcb we'll release 156 * all offload resources. 157 */ 158static void 159toepcb_detach(struct inpcb *inp) 160{ 161 struct toepcb *toep; 162 struct tcpcb *tp; 163 164 KASSERT(inp, ("%s: inp is NULL", __func__)); 165 INP_WLOCK_ASSERT(inp); 166 167 tp = intotcpcb(inp); 168 toep = tp->t_toe; 169 170 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 171 KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__)); 172 173 CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__, 174 tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid, 175 toep, inp, tp); 176 177 tp->t_toe = NULL; 178 tp->t_flags &= ~TF_TOE; 179 toep->tp_flags &= ~TP_ATTACHED; 180 181 if (toep->tp_flags & TP_CPL_DONE) 182 t3_release_offload_resources(toep); 183} 184 185void 186t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) 187{ 188 189 toepcb_detach(tp->t_inpcb); 190} 191 192static int 193alloc_atid(struct tid_info *t, void *ctx) 194{ 195 int atid = -1; 196 197 mtx_lock(&t->atid_lock); 198 if (t->afree) { 199 union active_open_entry *p = t->afree; 200 201 atid = (p - t->atid_tab) + t->atid_base; 202 t->afree = p->next; 203 p->ctx = ctx; 204 t->atids_in_use++; 205 } 206 mtx_unlock(&t->atid_lock); 207 208 return (atid); 209} 210 211static void 212free_atid(struct tid_info *t, int atid) 213{ 214 union active_open_entry *p = atid2entry(t, atid); 215 216 mtx_lock(&t->atid_lock); 217 p->next = t->afree; 218 t->afree = p; 219 t->atids_in_use--; 220 mtx_unlock(&t->atid_lock); 221} 222 223void 224insert_tid(struct tom_data *td, void *ctx, unsigned int tid) 225{ 226 struct tid_info *t = &td->tid_maps; 227 228 t->tid_tab[tid] = ctx; 229 atomic_add_int(&t->tids_in_use, 1); 230} 231 232void 233update_tid(struct tom_data *td, void *ctx, unsigned int tid) 234{ 235 struct tid_info *t = &td->tid_maps; 236 237 t->tid_tab[tid] = ctx; 238} 239 240void 
241remove_tid(struct tom_data *td, unsigned int tid) 242{ 243 struct tid_info *t = &td->tid_maps; 244 245 t->tid_tab[tid] = NULL; 246 atomic_add_int(&t->tids_in_use, -1); 247} 248 249/* use ctx as a next pointer in the tid release list */ 250void 251queue_tid_release(struct toedev *tod, unsigned int tid) 252{ 253 struct tom_data *td = t3_tomdata(tod); 254 void **p = &td->tid_maps.tid_tab[tid]; 255 struct adapter *sc = tod->tod_softc; 256 257 mtx_lock(&td->tid_release_lock); 258 *p = td->tid_release_list; 259 td->tid_release_list = p; 260 if (!*p) 261 taskqueue_enqueue(sc->tq, &td->tid_release_task); 262 mtx_unlock(&td->tid_release_lock); 263} 264 265/* 266 * Populate a TID_RELEASE WR. 267 */ 268static inline void 269mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid) 270{ 271 272 cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 273 OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 274} 275 276void 277release_tid(struct toedev *tod, unsigned int tid, int qset) 278{ 279 struct tom_data *td = t3_tomdata(tod); 280 struct adapter *sc = tod->tod_softc; 281 struct mbuf *m; 282 struct cpl_tid_release *cpl; 283#ifdef INVARIANTS 284 struct tid_info *t = &td->tid_maps; 285#endif 286 287 KASSERT(tid >= 0 && tid < t->ntids, 288 ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids)); 289 290 m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); 291 if (m) { 292 mk_tid_release(cpl, tid); 293 t3_offload_tx(sc, m); 294 remove_tid(td, tid); 295 } else 296 queue_tid_release(tod, tid); 297 298} 299 300void 301t3_process_tid_release_list(void *data, int pending) 302{ 303 struct mbuf *m; 304 struct tom_data *td = data; 305 struct adapter *sc = td->tod.tod_softc; 306 307 mtx_lock(&td->tid_release_lock); 308 while (td->tid_release_list) { 309 void **p = td->tid_release_list; 310 unsigned int tid = p - td->tid_maps.tid_tab; 311 struct cpl_tid_release *cpl; 312 313 td->tid_release_list = (void **)*p; 314 m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */ 
315 if (m == NULL) 316 break; /* XXX: who reschedules the release task? */ 317 mtx_unlock(&td->tid_release_lock); 318 mk_tid_release(cpl, tid); 319 t3_offload_tx(sc, m); 320 remove_tid(td, tid); 321 mtx_lock(&td->tid_release_lock); 322 } 323 mtx_unlock(&td->tid_release_lock); 324} 325 326static void 327close_conn(struct adapter *sc, struct toepcb *toep) 328{ 329 struct mbuf *m; 330 struct cpl_close_con_req *req; 331 332 if (toep->tp_flags & TP_FIN_SENT) 333 return; 334 335 m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); 336 if (m == NULL) 337 CXGB_UNIMPLEMENTED(); 338 339 req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); 340 req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); 341 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid)); 342 req->rsvd = 0; 343 344 toep->tp_flags |= TP_FIN_SENT; 345 t3_offload_tx(sc, m); 346} 347 348static inline void 349make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len, 350 struct mbuf *tail) 351{ 352 struct tcpcb *tp = so_sototcpcb(so); 353 struct toepcb *toep = tp->t_toe; 354 struct sockbuf *snd; 355 356 inp_lock_assert(tp->t_inpcb); 357 snd = so_sockbuf_snd(so); 358 359 req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 360 req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); 361 /* len includes the length of any HW ULP additions */ 362 req->len = htonl(len); 363 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 364 /* V_TX_ULP_SUBMODE sets both the mode and submode */ 365 req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) | 366 V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1))); 367 req->sndseq = htonl(tp->snd_nxt); 368 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 369 struct adapter *sc = toep->tp_tod->tod_softc; 370 int cpu_idx = sc->rrss_map[toep->tp_qset]; 371 372 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 373 V_TX_CPU_IDX(cpu_idx)); 374 375 /* Sendbuffer is in units of 32KB. 
*/ 376 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 377 req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15)); 378 else 379 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); 380 381 toep->tp_flags |= TP_DATASENT; 382 } 383} 384 385/* 386 * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc. 387 * TOM_XXX_MOVE to some common header file. 388 */ 389/* 390 * IMM_LEN: # of bytes that can be tx'd as immediate data. There are 16 flits 391 * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more 392 * for the second gen bit flit. This leaves us with 12 flits. 393 * 394 * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs. 395 * The first desc has a tx_data_wr (which includes the WR header), the rest have 396 * the WR header only. All descs have the second gen bit flit. 397 * 398 * sgllen_to_descs: # of tx descs used up by an sgl of given length. The first 399 * desc has a tx_data_wr (which includes the WR header), the rest have the WR 400 * header only. All descs have the second gen bit flit. 401 * 402 * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits. 
403 * 404 */ 405#define IMM_LEN 96 406static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35}; 407static int sgllen_to_descs[TX_MAX_SEGS] = { 408 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, /* 0 - 9 */ 409 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, /* 10 - 19 */ 410 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, /* 20 - 29 */ 411 4, 4, 4, 4, 4, 4 /* 30 - 35 */ 412}; 413#if 0 414static int flits_to_sgllen[TX_DESC_FLITS + 1] = { 415 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10 416}; 417#endif 418#if SGE_NUM_GENBITS != 2 419#error "SGE_NUM_GENBITS really must be 2" 420#endif 421 422int 423t3_push_frames(struct socket *so, int req_completion) 424{ 425 struct tcpcb *tp = so_sototcpcb(so); 426 struct toepcb *toep = tp->t_toe; 427 struct mbuf *m0, *sndptr, *m; 428 struct toedev *tod = toep->tp_tod; 429 struct adapter *sc = tod->tod_softc; 430 int bytes, ndesc, total_bytes = 0, mlen; 431 struct sockbuf *snd; 432 struct sglist *sgl; 433 struct ofld_hdr *oh; 434 caddr_t dst; 435 struct tx_data_wr *wr; 436 437 inp_lock_assert(tp->t_inpcb); 438 439 snd = so_sockbuf_snd(so); 440 SOCKBUF_LOCK(snd); 441 442 /* 443 * Autosize the send buffer. 444 */ 445 if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) { 446 if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) && 447 snd->sb_cc < VNET(tcp_autosndbuf_max)) { 448 if (!sbreserve_locked(snd, min(snd->sb_hiwat + 449 VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)), 450 so, curthread)) 451 snd->sb_flags &= ~SB_AUTOSIZE; 452 } 453 } 454 455 if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr) 456 sndptr = toep->tp_m_last->m_next; 457 else 458 sndptr = snd->sb_sndptr ? 
snd->sb_sndptr : snd->sb_mb; 459 460 /* Nothing to send or no WRs available for sending data */ 461 if (toep->tp_wr_avail == 0 || sndptr == NULL) 462 goto out; 463 464 /* Something to send and at least 1 WR available */ 465 while (toep->tp_wr_avail && sndptr != NULL) { 466 467 m0 = m_gethdr(M_NOWAIT, MT_DATA); 468 if (m0 == NULL) 469 break; 470 oh = mtod(m0, struct ofld_hdr *); 471 wr = (void *)(oh + 1); 472 dst = (void *)(wr + 1); 473 474 m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr); 475 oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF | 476 V_HDR_QSET(toep->tp_qset); 477 478 /* 479 * Try to construct an immediate data WR if possible. Stuff as 480 * much data into it as possible, one whole mbuf at a time. 481 */ 482 mlen = sndptr->m_len; 483 ndesc = bytes = 0; 484 while (mlen <= IMM_LEN - bytes) { 485 bcopy(sndptr->m_data, dst, mlen); 486 bytes += mlen; 487 dst += mlen; 488 489 if (!(sndptr = sndptr->m_next)) 490 break; 491 mlen = sndptr->m_len; 492 } 493 494 if (bytes) { 495 496 /* Was able to fit 'bytes' bytes in an immediate WR */ 497 498 ndesc = 1; 499 make_tx_data_wr(so, wr, bytes, sndptr); 500 501 m0->m_len += bytes; 502 m0->m_pkthdr.len = m0->m_len; 503 504 } else { 505 int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC); 506 507 /* Need to make an SGL */ 508 509 sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT); 510 if (sgl == NULL) 511 break; 512 513 for (m = sndptr; m != NULL; m = m->m_next) { 514 if ((mlen = m->m_len) > 0) { 515 if (sglist_append(sgl, m->m_data, mlen)) 516 break; 517 } 518 bytes += mlen; 519 } 520 sndptr = m; 521 if (bytes == 0) { 522 sglist_free(sgl); 523 break; 524 } 525 ndesc = sgllen_to_descs[sgl->sg_nseg]; 526 oh->flags |= F_HDR_SGL; 527 oh->sgl = sgl; 528 make_tx_data_wr(so, wr, bytes, sndptr); 529 } 530 531 oh->flags |= V_HDR_NDESC(ndesc); 532 oh->plen = bytes; 533 534 snd->sb_sndptr = sndptr; 535 snd->sb_sndptroff += bytes; 536 if (sndptr == NULL) { 537 snd->sb_sndptr = snd->sb_mbtail; 538 snd->sb_sndptroff 
-= snd->sb_mbtail->m_len; 539 toep->tp_m_last = snd->sb_mbtail; 540 } else 541 toep->tp_m_last = NULL; 542 543 total_bytes += bytes; 544 545 toep->tp_wr_avail -= ndesc; 546 toep->tp_wr_unacked += ndesc; 547 548 if ((req_completion && toep->tp_wr_unacked == ndesc) || 549 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 550 wr->wr.wrh_hi |= htonl(F_WR_COMPL); 551 toep->tp_wr_unacked = 0; 552 } 553 554 enqueue_wr(toep, m0); 555 l2t_send(sc, m0, toep->tp_l2t); 556 } 557out: 558 SOCKBUF_UNLOCK(snd); 559 560 if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN)) 561 close_conn(sc, toep); 562 563 return (total_bytes); 564} 565 566static int 567send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 568{ 569 struct mbuf *m; 570 struct cpl_rx_data_ack *req; 571 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 572 573 m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req); 574 if (m == NULL) 575 return (0); 576 577 req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 578 req->wr.wrh_lo = 0; 579 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 580 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 581 t3_offload_tx(sc, m); 582 return (credits); 583} 584 585void 586t3_rcvd(struct toedev *tod, struct tcpcb *tp) 587{ 588 struct adapter *sc = tod->tod_softc; 589 struct inpcb *inp = tp->t_inpcb; 590 struct socket *so = inp->inp_socket; 591 struct sockbuf *so_rcv = &so->so_rcv; 592 struct toepcb *toep = tp->t_toe; 593 int must_send; 594 595 INP_WLOCK_ASSERT(inp); 596 597 SOCKBUF_LOCK(so_rcv); 598 KASSERT(toep->tp_enqueued >= so_rcv->sb_cc, 599 ("%s: so_rcv->sb_cc > enqueued", __func__)); 600 toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc; 601 toep->tp_enqueued = so_rcv->sb_cc; 602 SOCKBUF_UNLOCK(so_rcv); 603 604 must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd; 605 if (must_send || toep->tp_rx_credits >= 15 * 1024) { 606 int credits; 607 608 credits = send_rx_credits(sc, toep, toep->tp_rx_credits); 609 
toep->tp_rx_credits -= credits; 610 tp->rcv_wnd += credits; 611 tp->rcv_adv += credits; 612 } 613} 614 615static int 616do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 617{ 618 struct adapter *sc = qs->adap; 619 struct tom_data *td = sc->tom_softc; 620 struct cpl_rx_urg_notify *hdr = mtod(m, void *); 621 unsigned int tid = GET_TID(hdr); 622 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 623 624 log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp); 625 626 m_freem(m); 627 return (0); 628} 629 630int 631t3_send_fin(struct toedev *tod, struct tcpcb *tp) 632{ 633 struct toepcb *toep = tp->t_toe; 634 struct inpcb *inp = tp->t_inpcb; 635 struct socket *so = inp_inpcbtosocket(inp); 636#if defined(KTR) 637 unsigned int tid = toep->tp_tid; 638#endif 639 640 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 641 INP_WLOCK_ASSERT(inp); 642 643 CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep, 644 toep->tp_flags); 645 646 toep->tp_flags |= TP_SEND_FIN; 647 t3_push_frames(so, 1); 648 649 return (0); 650} 651 652int 653t3_tod_output(struct toedev *tod, struct tcpcb *tp) 654{ 655 struct inpcb *inp = tp->t_inpcb; 656 struct socket *so = inp->inp_socket; 657 658 t3_push_frames(so, 1); 659 return (0); 660} 661 662/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */ 663int 664find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) 665{ 666 unsigned short *mtus = &sc->params.mtus[0]; 667 int i = 0, mss; 668 669 KASSERT(inc != NULL || pmss > 0, 670 ("%s: at least one of inc/pmss must be specified", __func__)); 671 672 mss = inc ? 
tcp_mssopt(inc) : pmss; 673 if (pmss > 0 && mss > pmss) 674 mss = pmss; 675 676 while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40) 677 ++i; 678 679 return (i); 680} 681 682static inline void 683purge_wr_queue(struct toepcb *toep) 684{ 685 struct mbuf *m; 686 struct ofld_hdr *oh; 687 688 while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) { 689 oh = mtod(m, struct ofld_hdr *); 690 if (oh->flags & F_HDR_SGL) 691 sglist_free(oh->sgl); 692 m_freem(m); 693 } 694} 695 696/* 697 * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T 698 * entry, etc.) 699 */ 700static void 701t3_release_offload_resources(struct toepcb *toep) 702{ 703 struct toedev *tod = toep->tp_tod; 704 struct tom_data *td = t3_tomdata(tod); 705 706 /* 707 * The TOM explicitly detaches its toepcb from the system's inp before 708 * it releases the offload resources. 709 */ 710 if (toep->tp_inp) { 711 panic("%s: inp %p still attached to toepcb %p", 712 __func__, toep->tp_inp, toep); 713 } 714 715 if (toep->tp_wr_avail != toep->tp_wr_max) 716 purge_wr_queue(toep); 717 718 if (toep->tp_l2t) { 719 l2t_release(td->l2t, toep->tp_l2t); 720 toep->tp_l2t = NULL; 721 } 722 723 if (toep->tp_tid >= 0) 724 release_tid(tod, toep->tp_tid, toep->tp_qset); 725 726 toepcb_free(toep); 727} 728 729/* 730 * Determine the receive window size for a socket. 731 */ 732unsigned long 733select_rcv_wnd(struct socket *so) 734{ 735 unsigned long wnd; 736 737 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 738 739 wnd = sbspace(&so->so_rcv); 740 if (wnd < MIN_RCV_WND) 741 wnd = MIN_RCV_WND; 742 743 return min(wnd, MAX_RCV_WND); 744} 745 746int 747select_rcv_wscale(void) 748{ 749 int wscale = 0; 750 unsigned long space = sb_max; 751 752 if (space > MAX_RCV_WND) 753 space = MAX_RCV_WND; 754 755 while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) 756 wscale++; 757 758 return (wscale); 759} 760 761 762/* 763 * Set up the socket for TCP offload. 
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	SOCKBUF_LOCK(&so->so_snd);
	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Update TCP PCB */
	tp->tod = toep->tp_tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->tp_inp = inp;
	toep->tp_flags |= TP_ATTACHED;
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/*
 * This is _not_ the normal way to "unoffload" a socket.  It exactly reverses
 * offload_socket(): clears the sockbuf flags, unhooks the toepcb from the
 * tcpcb, drops the extra inp hold, and removes the toepcb from the active
 * list.  Used only on the t3_connect() failure path.
 */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	INP_WLOCK_ASSERT(inp);

	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->tp_inp = NULL;
	toep->tp_flags &= ~TP_ATTACHED;
	/* The caller still holds a reference, so this can't be the last one. */
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/*
 * Socket could be a listening socket, and we may not have a toepcb at all at
 * this time.
831 */ 832uint32_t 833calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e) 834{ 835 uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) | 836 V_MSS_IDX(mtu_idx); 837 838 if (so != NULL) { 839 struct inpcb *inp = sotoinpcb(so); 840 struct tcpcb *tp = intotcpcb(inp); 841 int keepalive = tcp_always_keepalive || 842 so_options_get(so) & SO_KEEPALIVE; 843 844 opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); 845 opt0h |= V_KEEP_ALIVE(keepalive != 0); 846 } 847 848 if (e != NULL) 849 opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx); 850 851 return (htobe32(opt0h)); 852} 853 854uint32_t 855calc_opt0l(struct socket *so, int rcv_bufsize) 856{ 857 uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize); 858 859 KASSERT(rcv_bufsize <= M_RCV_BUFSIZ, 860 ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize)); 861 862 if (so != NULL) /* optional because noone cares about IP TOS */ 863 opt0l |= V_TOS(INP_TOS(sotoinpcb(so))); 864 865 return (htobe32(opt0l)); 866} 867 868/* 869 * Convert an ACT_OPEN_RPL status to an errno. 870 */ 871static int 872act_open_rpl_status_to_errno(int status) 873{ 874 switch (status) { 875 case CPL_ERR_CONN_RESET: 876 return (ECONNREFUSED); 877 case CPL_ERR_ARP_MISS: 878 return (EHOSTUNREACH); 879 case CPL_ERR_CONN_TIMEDOUT: 880 return (ETIMEDOUT); 881 case CPL_ERR_TCAM_FULL: 882 return (EAGAIN); 883 case CPL_ERR_CONN_EXIST: 884 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); 885 return (EAGAIN); 886 default: 887 return (EIO); 888 } 889} 890 891/* 892 * Return whether a failed active open has allocated a TID 893 */ 894static inline int 895act_open_has_tid(int status) 896{ 897 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && 898 status != CPL_ERR_ARP_MISS; 899} 900 901/* 902 * Active open failed. 
 * Handler for CPL_ACT_OPEN_RPL: free the atid (and any tid the HW allocated),
 * report the failure to the stack, and release the toepcb.
 */
static int
do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	struct cpl_act_open_rpl *rpl = mtod(m, void *);
	unsigned int atid = G_TID(ntohl(rpl->atid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	int s = rpl->status, rc;

	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);

	free_atid(&td->tid_maps, atid);
	toep->tp_tid = -1;

	/* Some failure statuses come with a HW-allocated tid; return it. */
	if (act_open_has_tid(s))
		queue_tid_release(tod, GET_TID(rpl));

	rc = act_open_rpl_status_to_errno(s);
	/*
	 * NOTE(review): the tcbinfo lock is taken only for non-EAGAIN errors —
	 * presumably because toe_connect_failed() may drop the connection in
	 * that case; confirm against toe_connect_failed().
	 */
	if (rc != EAGAIN)
		INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	toe_connect_failed(tod, inp, rc);
	toepcb_release(toep);	/* unlocks inp */
	if (rc != EAGAIN)
		INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Send an active open request.
 *
 * State of affairs on entry:
 * soisconnecting (so_state |= SS_ISCONNECTING)
 * tcbinfo not locked (this has changed - used to be WLOCKed)
 * inp WLOCKed
 * tp->t_state = TCPS_SYN_SENT
 * rtalloc1, RT_UNLOCK on rt.
 *
 * Returns 0 if the CPL_ACT_OPEN_REQ was handed to the L2T layer, ENOMEM on
 * any failure (all partially-acquired resources are released first).
 */
int
t3_connect(struct toedev *tod, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m = NULL;
	struct l2t_entry *e = NULL;
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct cpl_act_open_req *cpl;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep;
	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
	struct sockaddr *gw;
	struct ifnet *ifp = rt->rt_ifp;
	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */

	INP_WLOCK_ASSERT(inp);

	toep = toepcb_alloc(tod);
	if (toep == NULL)
		goto failed;

	atid = alloc_atid(&td->tid_maps, toep);
	if (atid < 0)
		goto failed;

	/* Spread connections across the port's queue sets at random. */
	qset = pi->first_qset + (arc4random() % pi->nqsets);

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m == NULL)
		goto failed;

	/* Resolve the next hop: the gateway if routed, the peer otherwise. */
	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
	e = t3_l2t_get(pi, ifp, gw);
	if (e == NULL)
		goto failed;

	toep->tp_l2t = e;
	toep->tp_tid = atid;	/* used to double check response */
	toep->tp_qset = qset;

	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	offload_socket(so, toep);

	/*
	 * The kernel sets request_r_scale based on sb_max whereas we need to
	 * take hardware's MAX_RCV_WND into account too.  This is normally a
	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
	 */
	if (tp->t_flags & TF_REQ_SCALE)
		rscale = tp->request_r_scale = select_rcv_wscale();
	else
		rscale = 0;
	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
	cpu_idx = sc->rrss_map[qset];

	/* Build the CPL_ACT_OPEN_REQ. */
	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
	cpl->wr.wrh_lo = 0;
	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
	    &cpl->peer_port);
	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
	cpl->params = 0;
	cpl->opt2 = calc_opt2(cpu_idx);

	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);

	if (l2t_send(sc, m, e) == 0)
		return (0);

	/* l2t_send failed: unwind offload_socket() before the common path. */
	undo_offload_socket(so);

failed:
	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
	    __func__, atid, toep, e, m);

	/* Release whatever was acquired before the failure. */
	if (atid >= 0)
		free_atid(&td->tid_maps, atid);

	if (e)
		l2t_release(td->l2t, e);

	if (toep)
		toepcb_free(toep);

	m_freem(m);

	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
 * send multiple ABORT_REQs for the same connection and also that we do not try
 * to send a message after the connection has closed.
 */
static void
send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	struct inpcb *inp = toep->tp_inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
	    toep->tp_flags);

	/* Only one ABORT_REQ per connection. */
	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
		return;

	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	/* Purge the send queue */
	sbflush(so_sockbuf_snd(so));
	purge_wr_queue(toep);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = htonl(tp->snd_nxt);
	/* rsvd1 tells the HW whether any data has been sent yet. */
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * In SYN_SENT the connection has no tid yet, so the abort is queued
	 * instead of sent.  NOTE(review): presumably it is transmitted once
	 * the open completes; confirm against the out_of_order_queue users.
	 */
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
	else
		l2t_send(sc, m, toep->tp_l2t);
}

/* TOE driver method: reset (abort) the connection. */
int
t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
{

	send_reset(tp->t_toe);
	return (0);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_rx_data *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *so_rcv;

	/* Advance over CPL */
	m_adj(m, sizeof(*hdr));

	/* XXX: revisit.  This comes from the T4 TOM */
	if (__predict_false(inp == NULL)) {
		/*
		 * do_pass_establish failed and must be attempting to abort the
		 * connection.  Meanwhile, the T4 has sent us data for such a
		 * connection.
		 */
#ifdef notyet
		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
		    ("%s: inp NULL and tid isn't being aborted", __func__));
#endif
		m_freem(m);
		return (0);
	}

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		/* Connection already torn down; drop the payload. */
		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	/* Track the delayed-ACK mode the hardware reports it is using. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
		toep->tp_delack_mode = hdr->dack_mode;

	tp = intotcpcb(inp);

#ifdef INVARIANTS
	/* HW handles reassembly, so a seq# mismatch is only logged. */
	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
		log(LOG_ERR,
		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
	}
#endif
	tp->rcv_nxt += m->m_pkthdr.len;
	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
	    ("%s: negative window size", __func__));
	tp->rcv_wnd -= m->m_pkthdr.len;
	tp->t_rcvtime = ticks;

	so = inp->inp_socket;
	so_rcv = &so->so_rcv;
	SOCKBUF_LOCK(so_rcv);

	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
		/*
		 * Data arrived after the receive side was shut down: drop
		 * the whole connection, taking the pcbinfo lock first as
		 * tcp_drop() requires (so the inp lock must be cycled).
		 */
		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, m->m_pkthdr.len);
		SOCKBUF_UNLOCK(so_rcv);
		INP_WUNLOCK(inp);

		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);

		m_freem(m);
		return (0);
	}

	/* receive buffer autosize */
	if (so_rcv->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
		unsigned int hiwat = so_rcv->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
			so_rcv->sb_flags &= ~SB_AUTOSIZE;
		else
			/* grown buffer space becomes rx credits for the HW */
			toep->tp_rx_credits += newsize - hiwat;
	}

	toep->tp_enqueued += m->m_pkthdr.len;
	sbappendstream_locked(so_rcv, m);
	sorwakeup_locked(so);	/* also drops the sockbuf lock */
	SOCKBUF_UNLOCK_ASSERT(so_rcv);

	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Handler for PEER_CLOSE CPL messages.  The peer's FIN has been received
 * (and ACKed) by the hardware; update the local TCP state accordingly.
 */
static int
do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_peer_close *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	/* pcbinfo lock needed for the possible tcp_twstart() below. */
	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp);

	/* An abort is in progress; let its reply do the cleanup. */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
		goto done;

	so = inp_inpcbtosocket(inp);

	socantrcvmore(so);
	tp->rcv_nxt++;	/* account for the FIN */

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/* Both directions closed; enter TIME_WAIT. */
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the  inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		toepcb_release(toep);	/* no more CPLs expected */

		m_freem(m);
		return (0);
	default:
		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    __func__, toep->tp_tid, tp->t_state);
	}

done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.  peer ACK to our FIN received.
 */
static int
do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
	unsigned int tid = GET_TID(rpl);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	/* pcbinfo lock needed for tcp_twstart()/tcp_close() below. */
	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);

	/* An abort is in progress; let its reply do the cleanup. */
	if ((toep->tp_flags & TP_ABORT_RPL_PENDING))
		goto done;

	so = inp_inpcbtosocket(inp);
	tp->snd_una = ntohl(rpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see FIN_WAIT2 case in do_peer_close */
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		toepcb_release(toep);	/* no more CPLs expected */

		m_freem(m);
		return (0);
	case TCPS_LAST_ACK:
		/* tcp_close() returning non-NULL means it left inp locked. */
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tp->t_state = TCPS_FIN_WAIT_2;
		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    __func__, toep->tp_tid, tp->t_state);
	}

done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Handler for SMT_WRITE_RPL CPL messages.  Failures are only logged; there
 * is no recovery action.
 */
static int
do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct cpl_smt_write_rpl *rpl = mtod(m, void *);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR,
		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
		    rpl->status, GET_TID(rpl));
	}

	m_freem(m);
	return (0);
}

/*
 * Handler for SET_TCB_RPL CPL messages.  Failures are only logged; there
 * is no recovery action.
 */
static int
do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct cpl_set_tcb_rpl *rpl = mtod(m, void *);

	if (rpl->status != CPL_ERR_NONE) {
		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
		    rpl->status, GET_TID(rpl));
	}

	m_freem(m);
	return (0);
}

/*
 * Handle an ABORT_RPL_RSS CPL message.
1366 */ 1367static int 1368do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1369{ 1370 struct adapter *sc = qs->adap; 1371 struct tom_data *td = sc->tom_softc; 1372 const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); 1373 unsigned int tid = GET_TID(rpl); 1374 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 1375 struct inpcb *inp; 1376 1377 /* 1378 * Ignore replies to post-close aborts indicating that the abort was 1379 * requested too late. These connections are terminated when we get 1380 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 1381 * arrives the TID is either no longer used or it has been recycled. 1382 */ 1383 if (rpl->status == CPL_ERR_ABORT_FAILED) { 1384 m_freem(m); 1385 return (0); 1386 } 1387 1388 if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) 1389 return (do_abort_rpl_synqe(qs, r, m)); 1390 1391 CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, 1392 rpl->status); 1393 1394 inp = toep->tp_inp; 1395 INP_WLOCK(inp); 1396 1397 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 1398 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { 1399 toep->tp_flags |= TP_ABORT_RPL_RCVD; 1400 INP_WUNLOCK(inp); 1401 } else { 1402 toep->tp_flags &= ~TP_ABORT_RPL_RCVD; 1403 toep->tp_flags &= TP_ABORT_RPL_PENDING; 1404 toepcb_release(toep); /* no more CPLs expected */ 1405 } 1406 } 1407 1408 m_freem(m); 1409 return (0); 1410} 1411 1412/* 1413 * Convert the status code of an ABORT_REQ into a FreeBSD error code. 1414 */ 1415static int 1416abort_status_to_errno(struct tcpcb *tp, int abort_reason) 1417{ 1418 switch (abort_reason) { 1419 case CPL_ERR_BAD_SYN: 1420 case CPL_ERR_CONN_RESET: 1421 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 1422 case CPL_ERR_XMIT_TIMEDOUT: 1423 case CPL_ERR_PERSIST_TIMEDOUT: 1424 case CPL_ERR_FINWAIT2_TIMEDOUT: 1425 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1426 return (ETIMEDOUT); 1427 default: 1428 return (EIO); 1429 } 1430} 1431 1432/* 1433 * Returns whether an ABORT_REQ_RSS message is a negative advice. 1434 */ 1435static inline int 1436is_neg_adv_abort(unsigned int status) 1437{ 1438 return status == CPL_ERR_RTX_NEG_ADVICE || 1439 status == CPL_ERR_PERSIST_NEG_ADVICE; 1440} 1441 1442void 1443send_abort_rpl(struct toedev *tod, int tid, int qset) 1444{ 1445 struct mbuf *reply; 1446 struct cpl_abort_rpl *rpl; 1447 struct adapter *sc = tod->tod_softc; 1448 1449 reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); 1450 if (!reply) 1451 CXGB_UNIMPLEMENTED(); 1452 1453 rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 1454 rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); 1455 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 1456 rpl->cmd = CPL_ABORT_NO_RST; 1457 1458 t3_offload_tx(sc, reply); 1459} 1460 1461/* 1462 * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we 1463 * ignore this request except that we need to reply to it. 
 */
static int
do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	int qset = toep->tp_qset;

	/* Negative advice is informational only; no abort takes place. */
	if (is_neg_adv_abort(req->status)) {
		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
		    __func__, req->status, tid, toep->tp_flags);
		m_freem(m);
		return (0);
	}

	/* Embryonic connections are handled by the syncache code. */
	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
		return (do_abort_req_synqe(qs, r, m));

	inp = toep->tp_inp;
	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);
	so = inp->inp_socket;

	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
	    req->status);

	/*
	 * The card delivers the abort in two messages; defer any action
	 * until the second one arrives.
	 */
	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		toep->tp_flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		toep->tp_flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		so_error_set(so, abort_status_to_errno(tp, req->status));
		tp = tcp_close(tp);
		/* tcp_close() returning NULL means it dropped the inp lock. */
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
		toepcb_release(toep);	/* no more CPLs expected */
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	send_abort_rpl(tod, tid, qset);
	m_freem(m);
	return (0);
}

/*
 * Apply the TCP options echoed by the hardware (in the CPL's tcp_opt field)
 * to the tcpcb: MSS, timestamps, SACK, and window scaling.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = toep->tp_tod->tod_softc;

	/* MSS index selects an entry in the adapter's MTU table; 40 is the
	 * fixed IP + TCP header size subtracted from the MTU. */
	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;

	if (G_TCPOPT_TSTAMP(tcpopt)) {
		tp->t_flags |= TF_RCVD_TSTMP;
		tp->t_flags |= TF_REQ_TSTMP;	/* forcibly set */
		tp->ts_recent = 0;		/* XXX */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(tcpopt))
		tp->t_flags |= TF_SACK_PERMIT;
	else
		tp->t_flags &= ~TF_SACK_PERMIT;

	if (G_TCPOPT_WSCALE_OK(tcpopt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Window scaling is in effect only if both sides requested it. */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
	}

}

/*
 * The ISS and IRS are from after the exchange of SYNs and are off by 1.
 */
/*
 * Move an offloaded connection to ESTABLISHED: initialize send/receive
 * sequence spaces from the (off-by-one) ISS/IRS reported by the card,
 * apply the negotiated TCP options, and mark the socket connected.
 * Called with the inp write lock held.
 */
void
make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
    uint16_t cpl_tcpopt)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	long bufsize;
	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(cpl_tcpopt);

	INP_WLOCK_ASSERT(inp);

	tp->t_state = TCPS_ESTABLISHED;
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
	    toep->tp_tid, toep, inp);

	/* Receive side: rx credits were granted in 1KB units via opt0. */
	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = toep->tp_rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->tp_rx_credits = bufsize - tp->rcv_wnd;

	/* Send side: nothing beyond the SYN has been sent yet. */
	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	soisconnected(so);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	unsigned int tid = toep->tp_tid;

	inp_lock_assert(toep->tp_inp);

	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		struct ofld_hdr *oh = mtod(m, void *);
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = (void *)(oh + 1);

		/* Patch the now-known TID into the WR and CPL headers. */
		p->wr.wrh_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		t3_offload_tx(sc, m);
	}
}

/*
 * Process a CPL_ACT_ESTABLISH message.  An active-open connection has
 * completed its handshake; swap the temporary atid for the real tid and
 * move the connection to ESTABLISHED.
 */
static int
do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_act_establish *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);

	free_atid(&td->tid_maps, atid);

	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	KASSERT(toep->tp_qset == qs->idx,
	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
	KASSERT(toep->tp_tid == atid,
	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));

	/* From here on the toepcb is addressed by its real tid. */
	toep->tp_tid = tid;
	insert_tid(td, toep, tid);

	if (inp->inp_flags & INP_DROPPED) {
		/* socket closed by the kernel before hw told us it connected */
		send_reset(toep);
		goto done;
	}

	KASSERT(tp->t_state == TCPS_SYN_SENT,
	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));

	so = inp->inp_socket;
	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

done:
	INP_WUNLOCK(inp);
	m_freem(m);
	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	struct socket *so;
	unsigned int credits = ntohs(hdr->credits);
	u32 snd_una = ntohl(hdr->snd_una);
	int bytes = 0;	/* payload bytes freed from the send queue */
	struct sockbuf *snd;
	struct mbuf *p;
	struct ofld_hdr *oh;

	inp_wlock(inp);
	tp = intotcpcb(inp);
	so = inp->inp_socket;
	toep->tp_wr_avail += credits;
	/* Unacked WRs can never exceed the number of WRs handed to the HW. */
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	/* Retire completed WRs from the pending queue. */
	while (credits) {
		p = peek_wr(toep);

		if (__predict_false(!p)) {
			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
			    "tid %u, state %u, wr_avail %u", __func__, credits,
			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);

			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u wr_avail=%u\n",
			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
			break;
		}

		oh = mtod(p, struct ofld_hdr *);

		/* Credits are granted per descriptor, never fractionally. */
		KASSERT(credits >= G_HDR_NDESC(oh->flags),
		    ("%s: partial credits?  %d %d", __func__, credits,
		    G_HDR_NDESC(oh->flags)));

		dequeue_wr(toep);
		credits -= G_HDR_NDESC(oh->flags);
		bytes += oh->plen;

		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(p);
	}

	/* Stale ACK (snd_una went backwards); ignore it. */
	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
		goto out_free;

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = tcp_ts_getticks();
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}

	snd = so_sockbuf_snd(so);
	if (bytes) {
		SOCKBUF_LOCK(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);	/* drops the sockbuf lock */
	}

	/* More data waiting in the send buffer; push it to the card. */
	if (snd->sb_sndptroff < snd->sb_cc)
		t3_push_frames(so, 0);

out_free:
	inp_wunlock(tp->t_inpcb);
	m_freem(m);
}

/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);

	/* XXX bad race */
	if (toep)
		wr_ack(toep, m);

	return (0);
}

/*
 * Register this module's CPL handlers with the adapter.
 */
void
t3_init_cpl_io(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
}
#endif