1/*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 330303 2018-03-03 00:54:12Z jhb $"); 29 30#include "opt_inet.h" 31 32#ifdef TCP_OFFLOAD 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <sys/sockbuf.h> 47#include <sys/sysctl.h> 48#include <sys/syslog.h> 49#include <sys/protosw.h> 50#include <sys/priv.h> 51#include <sys/sglist.h> 52#include <sys/taskqueue.h> 53 54#include <net/if.h> 55#include <net/if_var.h> 56#include <net/ethernet.h> 57#include <net/route.h> 58 59#include <netinet/in.h> 60#include <netinet/in_pcb.h> 61#include <netinet/in_systm.h> 62#include <netinet/in_var.h> 63 64#include <netinet/ip.h> 65#define TCPSTATES 66#include <netinet/tcp_fsm.h> 67#include <netinet/tcp_var.h> 68#include <netinet/toecore.h> 69#include <netinet/tcp_seq.h> 70#include <netinet/tcp_timer.h> 71#include <net/route.h> 72 73#include "cxgb_include.h" 74#include "ulp/tom/cxgb_l2t.h" 75#include "ulp/tom/cxgb_tom.h" 76#include "ulp/tom/cxgb_toepcb.h" 77 78VNET_DECLARE(int, tcp_do_autosndbuf); 79#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) 80VNET_DECLARE(int, tcp_autosndbuf_inc); 81#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) 82VNET_DECLARE(int, tcp_autosndbuf_max); 83#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) 84VNET_DECLARE(int, tcp_do_autorcvbuf); 85#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 86VNET_DECLARE(int, tcp_autorcvbuf_inc); 87#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 88VNET_DECLARE(int, tcp_autorcvbuf_max); 89#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 90 91/* 92 * For ULP connections HW may add headers, e.g., for digests, that aren't part 93 * of the messages sent by the host 
 * but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/* DSCP portion of the inp's IP TOS byte (the two low ECN bits shifted off). */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

static void t3_release_offload_resources(struct toepcb *);
static void send_reset(struct toepcb *toep);

/*
 * Called after the last CPL for the toepcb has been received.
 *
 * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
 * time this function exits.
 */
static int
toepcb_release(struct toepcb *toep)
{
	struct inpcb *inp = toep->tp_inp;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	int rc;

	INP_WLOCK_ASSERT(inp);
	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
	    ("%s: double release?", __func__));

	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);

	/* Mark the TOE side done and break the toepcb -> inp linkage. */
	toep->tp_flags |= TP_CPL_DONE;
	toep->tp_inp = NULL;

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	/*
	 * If the kernel side has already detached (TP_ATTACHED cleared by
	 * toepcb_detach) then both sides are done and everything can go.
	 */
	if (!(toep->tp_flags & TP_ATTACHED))
		t3_release_offload_resources(toep);

	/* Drop the inp hold installed by offload_socket. */
	rc = in_pcbrele_wlocked(inp);
	if (!rc)
		INP_WUNLOCK(inp);
	return (rc);
}

/*
 * One sided detach.  The tcpcb is going away and we need to unhook the toepcb
 * hanging off it.  If the TOE driver is also done with the toepcb we'll release
 * all offload resources.
 */
static void
toepcb_detach(struct inpcb *inp)
{
	struct toepcb *toep;
	struct tcpcb *tp;

	KASSERT(inp, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);

	tp = intotcpcb(inp);
	toep = tp->t_toe;

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));

	/* An embryonic (SYN_SENT) connection still holds an atid, not a tid. */
	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
	    toep, inp, tp);

	/* The kernel side is done with the toepcb. */
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toep->tp_flags &= ~TP_ATTACHED;

	/* If the TOE side is done too (last CPL seen), free everything. */
	if (toep->tp_flags & TP_CPL_DONE)
		t3_release_offload_resources(toep);
}

/* toedev method: detach the toepcb from a dying tcpcb. */
void
t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{

	toepcb_detach(tp->t_inpcb);
}

/*
 * Allocate an active-open TID from the free list and bind ctx to it.
 * Returns -1 if no atid is available.
 */
static int
alloc_atid(struct tid_info *t, void *ctx)
{
	int atid = -1;

	mtx_lock(&t->atid_lock);
	if (t->afree) {
		union active_open_entry *p = t->afree;

		/* atid is the entry's index offset by the atid base. */
		atid = (p - t->atid_tab) + t->atid_base;
		t->afree = p->next;
		p->ctx = ctx;
		t->atids_in_use++;
	}
	mtx_unlock(&t->atid_lock);

	return (atid);
}

/* Return an active-open TID to the free list. */
static void
free_atid(struct tid_info *t, int atid)
{
	union active_open_entry *p = atid2entry(t, atid);

	mtx_lock(&t->atid_lock);
	p->next = t->afree;
	t->afree = p;
	t->atids_in_use--;
	mtx_unlock(&t->atid_lock);
}

/* Register ctx as the owner of tid in the tid map. */
void
insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, 1);
}

/* Replace the owner of an already-registered tid (count unchanged). */
void
update_tid(struct tom_data *td, void *ctx, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = ctx;
}

/* Unregister tid and decrement the in-use count. */
void
remove_tid(struct tom_data *td, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = NULL;
	atomic_add_int(&t->tids_in_use, -1);
}

/* use ctx as a next pointer in the tid release list */
void
queue_tid_release(struct toedev *tod, unsigned int tid)
{
	struct tom_data *td = t3_tomdata(tod);
	void **p = &td->tid_maps.tid_tab[tid];
	struct adapter *sc = tod->tod_softc;

	mtx_lock(&td->tid_release_lock);
	/* Link this tid's own tid_tab slot onto the release list. */
	*p = td->tid_release_list;
	td->tid_release_list = p;
	/* Old head was NULL => list was empty, so the task needs a kick. */
	if (!*p)
		taskqueue_enqueue(sc->tq, &td->tid_release_task);
	mtx_unlock(&td->tid_release_lock);
}

/*
 * Populate a TID_RELEASE WR.
 */
static inline void
mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
{

	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Return a tid to the hardware.  Sends the TID_RELEASE WR immediately if an
 * mbuf can be allocated, otherwise defers to the tid release task.
 */
void
release_tid(struct toedev *tod, unsigned int tid, int qset)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
	struct cpl_tid_release *cpl;
#ifdef INVARIANTS
	struct tid_info *t = &td->tid_maps;
#endif

	KASSERT(tid < t->ntids,
	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m) {
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
	} else
		queue_tid_release(tod, tid);

}

/*
 * Taskqueue handler that drains the deferred tid release list (built by
 * queue_tid_release, which chains tids through their own tid_tab slots).
 */
void
t3_process_tid_release_list(void *data, int pending)
{
	struct mbuf *m;
	struct tom_data *td = data;
	struct adapter *sc = td->tod.tod_softc;

	mtx_lock(&td->tid_release_lock);
	while (td->tid_release_list) {
		void **p = td->tid_release_list;
		/* Recover the tid from the slot's position in tid_tab. */
		unsigned int tid = p - td->tid_maps.tid_tab;
		struct cpl_tid_release *cpl;

		td->tid_release_list = (void **)*p;
		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
		if (m == NULL)
			break;	/* XXX: who reschedules the release task? */
		/* Drop the lock while transmitting; list head already popped. */
		mtx_unlock(&td->tid_release_lock);
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
		mtx_lock(&td->tid_release_lock);
	}
	mtx_unlock(&td->tid_release_lock);
}

/*
 * Send a CLOSE_CON_REQ (our FIN) for the connection, unless one has already
 * been sent.
 */
static void
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;

	if (toep->tp_flags & TP_FIN_SENT)
		return;

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
	req->rsvd = 0;

	toep->tp_flags |= TP_FIN_SENT;
	t3_offload_tx(sc, m);
}

/*
 * Fill in a TX_DATA work request header for 'len' bytes of payload.  'tail'
 * is the first unsent mbuf after this WR; NULL means the send buffer was
 * drained, which allows TX_SHOVE to be set.
 */
static inline void
make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
    struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
	req->sndseq = htonl(tp->snd_nxt);
	/* Extra one-time initialization on the tid's first TX_DATA WR. */
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		struct adapter *sc = toep->tp_tod->tod_softc;
		int cpu_idx = sc->rrss_map[toep->tp_qset];

		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(cpu_idx));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}

/*
 * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
 * TOM_XXX_MOVE to some common header file.
 */
/*
 * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
 * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
 * for the second gen bit flit.  This leaves us with 12 flits.
 *
 * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
 * The first desc has a tx_data_wr (which includes the WR header), the rest have
 * the WR header only.  All descs have the second gen bit flit.
 *
 * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
 * desc has a tx_data_wr (which includes the WR header), the rest have the WR
 * header only.  All descs have the second gen bit flit.
 *
 * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
404 * 405 */ 406#define IMM_LEN 96 407static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35}; 408static int sgllen_to_descs[TX_MAX_SEGS] = { 409 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, /* 0 - 9 */ 410 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, /* 10 - 19 */ 411 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, /* 20 - 29 */ 412 4, 4, 4, 4, 4, 4 /* 30 - 35 */ 413}; 414#if 0 415static int flits_to_sgllen[TX_DESC_FLITS + 1] = { 416 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10 417}; 418#endif 419#if SGE_NUM_GENBITS != 2 420#error "SGE_NUM_GENBITS really must be 2" 421#endif 422 423int 424t3_push_frames(struct socket *so, int req_completion) 425{ 426 struct tcpcb *tp = so_sototcpcb(so); 427 struct toepcb *toep = tp->t_toe; 428 struct mbuf *m0, *sndptr, *m; 429 struct toedev *tod = toep->tp_tod; 430 struct adapter *sc = tod->tod_softc; 431 int bytes, ndesc, total_bytes = 0, mlen; 432 struct sockbuf *snd; 433 struct sglist *sgl; 434 struct ofld_hdr *oh; 435 caddr_t dst; 436 struct tx_data_wr *wr; 437 438 inp_lock_assert(tp->t_inpcb); 439 440 snd = so_sockbuf_snd(so); 441 SOCKBUF_LOCK(snd); 442 443 /* 444 * Autosize the send buffer. 445 */ 446 if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) { 447 if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) && 448 sbused(snd) < VNET(tcp_autosndbuf_max)) { 449 if (!sbreserve_locked(snd, min(snd->sb_hiwat + 450 VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)), 451 so, curthread)) 452 snd->sb_flags &= ~SB_AUTOSIZE; 453 } 454 } 455 456 if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr) 457 sndptr = toep->tp_m_last->m_next; 458 else 459 sndptr = snd->sb_sndptr ? 
snd->sb_sndptr : snd->sb_mb; 460 461 /* Nothing to send or no WRs available for sending data */ 462 if (toep->tp_wr_avail == 0 || sndptr == NULL) 463 goto out; 464 465 /* Something to send and at least 1 WR available */ 466 while (toep->tp_wr_avail && sndptr != NULL) { 467 468 m0 = m_gethdr(M_NOWAIT, MT_DATA); 469 if (m0 == NULL) 470 break; 471 oh = mtod(m0, struct ofld_hdr *); 472 wr = (void *)(oh + 1); 473 dst = (void *)(wr + 1); 474 475 m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr); 476 oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF | 477 V_HDR_QSET(toep->tp_qset); 478 479 /* 480 * Try to construct an immediate data WR if possible. Stuff as 481 * much data into it as possible, one whole mbuf at a time. 482 */ 483 mlen = sndptr->m_len; 484 ndesc = bytes = 0; 485 while (mlen <= IMM_LEN - bytes) { 486 bcopy(sndptr->m_data, dst, mlen); 487 bytes += mlen; 488 dst += mlen; 489 490 if (!(sndptr = sndptr->m_next)) 491 break; 492 mlen = sndptr->m_len; 493 } 494 495 if (bytes) { 496 497 /* Was able to fit 'bytes' bytes in an immediate WR */ 498 499 ndesc = 1; 500 make_tx_data_wr(so, wr, bytes, sndptr); 501 502 m0->m_len += bytes; 503 m0->m_pkthdr.len = m0->m_len; 504 505 } else { 506 int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC); 507 508 /* Need to make an SGL */ 509 510 sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT); 511 if (sgl == NULL) 512 break; 513 514 for (m = sndptr; m != NULL; m = m->m_next) { 515 if ((mlen = m->m_len) > 0) { 516 if (sglist_append(sgl, m->m_data, mlen)) 517 break; 518 } 519 bytes += mlen; 520 } 521 sndptr = m; 522 if (bytes == 0) { 523 sglist_free(sgl); 524 break; 525 } 526 ndesc = sgllen_to_descs[sgl->sg_nseg]; 527 oh->flags |= F_HDR_SGL; 528 oh->sgl = sgl; 529 make_tx_data_wr(so, wr, bytes, sndptr); 530 } 531 532 oh->flags |= V_HDR_NDESC(ndesc); 533 oh->plen = bytes; 534 535 snd->sb_sndptr = sndptr; 536 snd->sb_sndptroff += bytes; 537 if (sndptr == NULL) { 538 snd->sb_sndptr = snd->sb_mbtail; 539 snd->sb_sndptroff 
-= snd->sb_mbtail->m_len; 540 toep->tp_m_last = snd->sb_mbtail; 541 } else 542 toep->tp_m_last = NULL; 543 544 total_bytes += bytes; 545 546 toep->tp_wr_avail -= ndesc; 547 toep->tp_wr_unacked += ndesc; 548 549 if ((req_completion && toep->tp_wr_unacked == ndesc) || 550 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 551 wr->wr.wrh_hi |= htonl(F_WR_COMPL); 552 toep->tp_wr_unacked = 0; 553 } 554 555 enqueue_wr(toep, m0); 556 l2t_send(sc, m0, toep->tp_l2t); 557 } 558out: 559 SOCKBUF_UNLOCK(snd); 560 561 if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN)) 562 close_conn(sc, toep); 563 564 return (total_bytes); 565} 566 567static int 568send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 569{ 570 struct mbuf *m; 571 struct cpl_rx_data_ack *req; 572 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 573 574 m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req); 575 if (m == NULL) 576 return (0); 577 578 req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 579 req->wr.wrh_lo = 0; 580 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 581 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 582 t3_offload_tx(sc, m); 583 return (credits); 584} 585 586void 587t3_rcvd(struct toedev *tod, struct tcpcb *tp) 588{ 589 struct adapter *sc = tod->tod_softc; 590 struct inpcb *inp = tp->t_inpcb; 591 struct socket *so = inp->inp_socket; 592 struct sockbuf *so_rcv = &so->so_rcv; 593 struct toepcb *toep = tp->t_toe; 594 int must_send; 595 596 INP_WLOCK_ASSERT(inp); 597 598 SOCKBUF_LOCK(so_rcv); 599 KASSERT(toep->tp_enqueued >= sbused(so_rcv), 600 ("%s: sbused(so_rcv) > enqueued", __func__)); 601 toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv); 602 toep->tp_enqueued = sbused(so_rcv); 603 SOCKBUF_UNLOCK(so_rcv); 604 605 must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd; 606 if (must_send || toep->tp_rx_credits >= 15 * 1024) { 607 int credits; 608 609 credits = send_rx_credits(sc, toep, toep->tp_rx_credits); 610 
toep->tp_rx_credits -= credits; 611 tp->rcv_wnd += credits; 612 tp->rcv_adv += credits; 613 } 614} 615 616static int 617do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 618{ 619 struct adapter *sc = qs->adap; 620 struct tom_data *td = sc->tom_softc; 621 struct cpl_rx_urg_notify *hdr = mtod(m, void *); 622 unsigned int tid = GET_TID(hdr); 623 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 624 625 log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp); 626 627 m_freem(m); 628 return (0); 629} 630 631int 632t3_send_fin(struct toedev *tod, struct tcpcb *tp) 633{ 634 struct toepcb *toep = tp->t_toe; 635 struct inpcb *inp = tp->t_inpcb; 636 struct socket *so = inp_inpcbtosocket(inp); 637#if defined(KTR) 638 unsigned int tid = toep->tp_tid; 639#endif 640 641 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 642 INP_WLOCK_ASSERT(inp); 643 644 CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep, 645 toep->tp_flags); 646 647 toep->tp_flags |= TP_SEND_FIN; 648 t3_push_frames(so, 1); 649 650 return (0); 651} 652 653int 654t3_tod_output(struct toedev *tod, struct tcpcb *tp) 655{ 656 struct inpcb *inp = tp->t_inpcb; 657 struct socket *so = inp->inp_socket; 658 659 t3_push_frames(so, 1); 660 return (0); 661} 662 663/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */ 664int 665find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) 666{ 667 unsigned short *mtus = &sc->params.mtus[0]; 668 int i = 0, mss; 669 670 KASSERT(inc != NULL || pmss > 0, 671 ("%s: at least one of inc/pmss must be specified", __func__)); 672 673 mss = inc ? 
tcp_mssopt(inc) : pmss; 674 if (pmss > 0 && mss > pmss) 675 mss = pmss; 676 677 while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40) 678 ++i; 679 680 return (i); 681} 682 683static inline void 684purge_wr_queue(struct toepcb *toep) 685{ 686 struct mbuf *m; 687 struct ofld_hdr *oh; 688 689 while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) { 690 oh = mtod(m, struct ofld_hdr *); 691 if (oh->flags & F_HDR_SGL) 692 sglist_free(oh->sgl); 693 m_freem(m); 694 } 695} 696 697/* 698 * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T 699 * entry, etc.) 700 */ 701static void 702t3_release_offload_resources(struct toepcb *toep) 703{ 704 struct toedev *tod = toep->tp_tod; 705 struct tom_data *td = t3_tomdata(tod); 706 707 /* 708 * The TOM explicitly detaches its toepcb from the system's inp before 709 * it releases the offload resources. 710 */ 711 if (toep->tp_inp) { 712 panic("%s: inp %p still attached to toepcb %p", 713 __func__, toep->tp_inp, toep); 714 } 715 716 if (toep->tp_wr_avail != toep->tp_wr_max) 717 purge_wr_queue(toep); 718 719 if (toep->tp_l2t) { 720 l2t_release(td->l2t, toep->tp_l2t); 721 toep->tp_l2t = NULL; 722 } 723 724 if (toep->tp_tid >= 0) 725 release_tid(tod, toep->tp_tid, toep->tp_qset); 726 727 toepcb_free(toep); 728} 729 730/* 731 * Determine the receive window size for a socket. 732 */ 733unsigned long 734select_rcv_wnd(struct socket *so) 735{ 736 unsigned long wnd; 737 738 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 739 740 wnd = sbspace(&so->so_rcv); 741 if (wnd < MIN_RCV_WND) 742 wnd = MIN_RCV_WND; 743 744 return min(wnd, MAX_RCV_WND); 745} 746 747int 748select_rcv_wscale(void) 749{ 750 int wscale = 0; 751 unsigned long space = sb_max; 752 753 if (space > MAX_RCV_WND) 754 space = MAX_RCV_WND; 755 756 while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) 757 wscale++; 758 759 return (wscale); 760} 761 762 763/* 764 * Set up the socket for TCP offload. 
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	SOCKBUF_LOCK(&so->so_snd);
	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Update TCP PCB */
	tp->tod = toep->tp_tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->tp_inp = inp;
	toep->tp_flags |= TP_ATTACHED;
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	INP_WLOCK_ASSERT(inp);

	/* Reverse everything offload_socket did, in the same order. */
	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->tp_inp = NULL;
	toep->tp_flags &= ~TP_ATTACHED;
	/* Drop the extra hold; it cannot be the last reference here. */
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/*
 * Socket could be a listening socket, and we may not have a toepcb at all at
 * this time.
 */
uint32_t
calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
{
	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
	    V_MSS_IDX(mtu_idx);

	if (so != NULL) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);
		int keepalive = tcp_always_keepalive ||
		    so_options_get(so) & SO_KEEPALIVE;

		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
		opt0h |= V_KEEP_ALIVE(keepalive != 0);
	}

	if (e != NULL)
		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);

	return (htobe32(opt0h));
}

/* Compute the low word of option0 for an offloaded connection. */
uint32_t
calc_opt0l(struct socket *so, int rcv_bufsize)
{
	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);

	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));

	if (so != NULL)		/* optional because no one cares about IP TOS */
		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));

	return (htobe32(opt0l));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (EAGAIN);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EAGAIN);
	default:
		return (EIO);
	}
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Active open failed.
 */
static int
do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	struct cpl_act_open_rpl *rpl = mtod(m, void *);
	unsigned int atid = G_TID(ntohl(rpl->atid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	int s = rpl->status, rc;

	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);

	free_atid(&td->tid_maps, atid);
	toep->tp_tid = -1;

	/* Some failure statuses still hold a hardware tid; give it back. */
	if (act_open_has_tid(s))
		queue_tid_release(tod, GET_TID(rpl));

	rc = act_open_rpl_status_to_errno(s);
	/*
	 * Non-EAGAIN failures drop the connection, which requires the
	 * tcbinfo lock; EAGAIN merely retries and doesn't need it.
	 */
	if (rc != EAGAIN)
		INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	toe_connect_failed(tod, inp, rc);
	toepcb_release(toep);	/* unlocks inp */
	if (rc != EAGAIN)
		INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Send an active open request.
 *
 * State of affairs on entry:
 * soisconnecting (so_state |= SS_ISCONNECTING)
 * tcbinfo not locked (this has changed - used to be WLOCKed)
 * inp WLOCKed
 * tp->t_state = TCPS_SYN_SENT
 * rtalloc1, RT_UNLOCK on rt.
 */
int
t3_connect(struct toedev *tod, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m = NULL;
	struct l2t_entry *e = NULL;
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct cpl_act_open_req *cpl;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep;
	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
	struct sockaddr *gw;
	struct ifnet *ifp = rt->rt_ifp;
	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */

	INP_WLOCK_ASSERT(inp);

	toep = toepcb_alloc(tod);
	if (toep == NULL)
		goto failed;

	atid = alloc_atid(&td->tid_maps, toep);
	if (atid < 0)
		goto failed;

	/* Spread connections across the port's qsets. */
	qset = pi->first_qset + (arc4random() % pi->nqsets);

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m == NULL)
		goto failed;

	/* Resolve L2 via the gateway for indirect routes, else the peer. */
	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
	e = t3_l2t_get(pi, ifp, gw);
	if (e == NULL)
		goto failed;

	toep->tp_l2t = e;
	toep->tp_tid = atid;	/* used to double check response */
	toep->tp_qset = qset;

	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	offload_socket(so, toep);

	/*
	 * The kernel sets request_r_scale based on sb_max whereas we need to
	 * take hardware's MAX_RCV_WND into account too.  This is normally a
	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
	 */
	if (tp->t_flags & TF_REQ_SCALE)
		rscale = tp->request_r_scale = select_rcv_wscale();
	else
		rscale = 0;
	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
	cpu_idx = sc->rrss_map[qset];

	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
	cpl->wr.wrh_lo = 0;
	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
	    &cpl->peer_port);
	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
	cpl->params = 0;
	cpl->opt2 = calc_opt2(cpu_idx);

	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);

	if (l2t_send(sc, m, e) == 0)
		return (0);

	/* l2t_send consumed nothing; unwind the offload conversion. */
	undo_offload_socket(so);

failed:
	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
	    __func__, atid, toep, e, m);

	if (atid >= 0)
		free_atid(&td->tid_maps, atid);

	if (e)
		l2t_release(td->l2t, e);

	if (toep)
		toepcb_free(toep);

	m_freem(m);

	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
 * send multiple ABORT_REQs for the same connection and also that we do not try
 * to send a message after the connection has closed.
 */
static void
send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	struct inpcb *inp = toep->tp_inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
	    toep->tp_flags);

	/* Only one ABORT_REQ per connection. */
	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
		return;

	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	/* Purge the send queue */
	sbflush(so_sockbuf_snd(so));
	purge_wr_queue(toep);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = htonl(tp->snd_nxt);
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/* No hardware tid yet while in SYN_SENT; hold the abort until then. */
	if (tp->t_state == TCPS_SYN_SENT)
		(void )mbufq_enqueue(&toep->out_of_order_queue, m); /* defer */
	else
		l2t_send(sc, m, toep->tp_l2t);
}

/* toedev method: reset the connection. */
int
t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
{

	send_reset(tp->t_toe);
	return (0);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_rx_data *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *so_rcv;

	/* Advance over CPL */
	m_adj(m, sizeof(*hdr));

	/* XXX: revisit.  This comes from the T4 TOM */
	if (__predict_false(inp == NULL)) {
		/*
		 * do_pass_establish failed and must be attempting to abort the
		 * connection.  Meanwhile, the T4 has sent us data for such a
		 * connection.
		 */
#ifdef notyet
		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
		    ("%s: inp NULL and tid isn't being aborted", __func__));
#endif
		m_freem(m);
		return (0);
	}

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	/* Track the delayed-ack mode the hardware reports. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
		toep->tp_delack_mode = hdr->dack_mode;

	tp = intotcpcb(inp);

#ifdef INVARIANTS
	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
		log(LOG_ERR,
		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
	}
#endif
	tp->rcv_nxt += m->m_pkthdr.len;
	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
	    ("%s: negative window size", __func__));
	tp->rcv_wnd -= m->m_pkthdr.len;
	tp->t_rcvtime = ticks;

	so = inp->inp_socket;
	so_rcv = &so->so_rcv;
	SOCKBUF_LOCK(so_rcv);

	/* Data after the receive side was shut down: drop the connection. */
	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, m->m_pkthdr.len);
		SOCKBUF_UNLOCK(so_rcv);
		INP_WUNLOCK(inp);

		/* Reacquire with the tcbinfo lock held, as tcp_drop needs it. */
		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);

		m_freem(m);
		return (0);
	}

	/* receive buffer autosize */
	if (so_rcv->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
		unsigned int hiwat = so_rcv->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
			so_rcv->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->tp_rx_credits += newsize - hiwat;
	}

	toep->tp_enqueued += m->m_pkthdr.len;
	sbappendstream_locked(so_rcv, m, 0);
	sorwakeup_locked(so);	/* also drops the sockbuf lock */
	SOCKBUF_UNLOCK_ASSERT(so_rcv);

	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_peer_close *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ?
tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp); 1230 1231 if (toep->tp_flags & TP_ABORT_RPL_PENDING) 1232 goto done; 1233 1234 so = inp_inpcbtosocket(inp); 1235 1236 socantrcvmore(so); 1237 tp->rcv_nxt++; 1238 1239 switch (tp->t_state) { 1240 case TCPS_SYN_RECEIVED: 1241 tp->t_starttime = ticks; 1242 /* FALLTHROUGH */ 1243 case TCPS_ESTABLISHED: 1244 tp->t_state = TCPS_CLOSE_WAIT; 1245 break; 1246 case TCPS_FIN_WAIT_1: 1247 tp->t_state = TCPS_CLOSING; 1248 break; 1249 case TCPS_FIN_WAIT_2: 1250 tcp_twstart(tp); 1251 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1252 INP_INFO_RUNLOCK(&V_tcbinfo); 1253 1254 INP_WLOCK(inp); 1255 toepcb_release(toep); /* no more CPLs expected */ 1256 1257 m_freem(m); 1258 return (0); 1259 default: 1260 log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", 1261 __func__, toep->tp_tid, tp->t_state); 1262 } 1263 1264done: 1265 INP_WUNLOCK(inp); 1266 INP_INFO_RUNLOCK(&V_tcbinfo); 1267 1268 m_freem(m); 1269 return (0); 1270} 1271 1272/* 1273 * Handler for CLOSE_CON_RPL CPL messages. peer ACK to our FIN received. 1274 */ 1275static int 1276do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1277{ 1278 struct adapter *sc = qs->adap; 1279 struct tom_data *td = sc->tom_softc; 1280 const struct cpl_close_con_rpl *rpl = mtod(m, void *); 1281 unsigned int tid = GET_TID(rpl); 1282 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 1283 struct inpcb *inp = toep->tp_inp; 1284 struct tcpcb *tp; 1285 struct socket *so; 1286 1287 INP_INFO_RLOCK(&V_tcbinfo); 1288 INP_WLOCK(inp); 1289 tp = intotcpcb(inp); 1290 1291 CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, 1292 tp ? 
tcpstates[tp->t_state] : "no tp", toep->tp_flags); 1293 1294 if ((toep->tp_flags & TP_ABORT_RPL_PENDING)) 1295 goto done; 1296 1297 so = inp_inpcbtosocket(inp); 1298 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ 1299 1300 switch (tp->t_state) { 1301 case TCPS_CLOSING: 1302 tcp_twstart(tp); 1303release: 1304 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1305 INP_INFO_RUNLOCK(&V_tcbinfo); 1306 1307 INP_WLOCK(inp); 1308 toepcb_release(toep); /* no more CPLs expected */ 1309 1310 m_freem(m); 1311 return (0); 1312 case TCPS_LAST_ACK: 1313 if (tcp_close(tp)) 1314 INP_WUNLOCK(inp); 1315 goto release; 1316 1317 case TCPS_FIN_WAIT_1: 1318 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1319 soisdisconnected(so); 1320 tp->t_state = TCPS_FIN_WAIT_2; 1321 break; 1322 default: 1323 log(LOG_ERR, 1324 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", 1325 __func__, toep->tp_tid, tp->t_state); 1326 } 1327 1328done: 1329 INP_WUNLOCK(inp); 1330 INP_INFO_RUNLOCK(&V_tcbinfo); 1331 1332 m_freem(m); 1333 return (0); 1334} 1335 1336static int 1337do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1338{ 1339 struct cpl_smt_write_rpl *rpl = mtod(m, void *); 1340 1341 if (rpl->status != CPL_ERR_NONE) { 1342 log(LOG_ERR, 1343 "Unexpected SMT_WRITE_RPL status %u for entry %u\n", 1344 rpl->status, GET_TID(rpl)); 1345 } 1346 1347 m_freem(m); 1348 return (0); 1349} 1350 1351static int 1352do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1353{ 1354 struct cpl_set_tcb_rpl *rpl = mtod(m, void *); 1355 1356 if (rpl->status != CPL_ERR_NONE) { 1357 log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n", 1358 rpl->status, GET_TID(rpl)); 1359 } 1360 1361 m_freem(m); 1362 return (0); 1363} 1364 1365/* 1366 * Handle an ABORT_RPL_RSS CPL message. 
1367 */ 1368static int 1369do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1370{ 1371 struct adapter *sc = qs->adap; 1372 struct tom_data *td = sc->tom_softc; 1373 const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); 1374 unsigned int tid = GET_TID(rpl); 1375 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 1376 struct inpcb *inp; 1377 1378 /* 1379 * Ignore replies to post-close aborts indicating that the abort was 1380 * requested too late. These connections are terminated when we get 1381 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 1382 * arrives the TID is either no longer used or it has been recycled. 1383 */ 1384 if (rpl->status == CPL_ERR_ABORT_FAILED) { 1385 m_freem(m); 1386 return (0); 1387 } 1388 1389 if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) 1390 return (do_abort_rpl_synqe(qs, r, m)); 1391 1392 CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, 1393 rpl->status); 1394 1395 inp = toep->tp_inp; 1396 INP_WLOCK(inp); 1397 1398 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 1399 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { 1400 toep->tp_flags |= TP_ABORT_RPL_RCVD; 1401 INP_WUNLOCK(inp); 1402 } else { 1403 toep->tp_flags &= ~TP_ABORT_RPL_RCVD; 1404 toep->tp_flags &= TP_ABORT_RPL_PENDING; 1405 toepcb_release(toep); /* no more CPLs expected */ 1406 } 1407 } 1408 1409 m_freem(m); 1410 return (0); 1411} 1412 1413/* 1414 * Convert the status code of an ABORT_REQ into a FreeBSD error code. 1415 */ 1416static int 1417abort_status_to_errno(struct tcpcb *tp, int abort_reason) 1418{ 1419 switch (abort_reason) { 1420 case CPL_ERR_BAD_SYN: 1421 case CPL_ERR_CONN_RESET: 1422 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 1423 case CPL_ERR_XMIT_TIMEDOUT: 1424 case CPL_ERR_PERSIST_TIMEDOUT: 1425 case CPL_ERR_FINWAIT2_TIMEDOUT: 1426 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1427 return (ETIMEDOUT); 1428 default: 1429 return (EIO); 1430 } 1431} 1432 1433/* 1434 * Returns whether an ABORT_REQ_RSS message is a negative advice. 1435 */ 1436static inline int 1437is_neg_adv_abort(unsigned int status) 1438{ 1439 return status == CPL_ERR_RTX_NEG_ADVICE || 1440 status == CPL_ERR_PERSIST_NEG_ADVICE; 1441} 1442 1443void 1444send_abort_rpl(struct toedev *tod, int tid, int qset) 1445{ 1446 struct mbuf *reply; 1447 struct cpl_abort_rpl *rpl; 1448 struct adapter *sc = tod->tod_softc; 1449 1450 reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); 1451 if (!reply) 1452 CXGB_UNIMPLEMENTED(); 1453 1454 rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 1455 rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); 1456 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 1457 rpl->cmd = CPL_ABORT_NO_RST; 1458 1459 t3_offload_tx(sc, reply); 1460} 1461 1462/* 1463 * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we 1464 * ignore this request except that we need to reply to it. 
 */
static int
do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	const struct cpl_abort_req_rss *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	int qset = toep->tp_qset;

	/* Negative advice is informational only; leave the connection up. */
	if (is_neg_adv_abort(req->status)) {
		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
		    __func__, req->status, tid, toep->tp_flags);
		m_freem(m);
		return (0);
	}

	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
		return (do_abort_req_synqe(qs, r, m));

	inp = toep->tp_inp;
	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);
	so = inp->inp_socket;

	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
	    req->status);

	/*
	 * First receipt: only record the request and mark the connection
	 * shut down; the abort is acted on when the message is seen again.
	 */
	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		toep->tp_flags |= TP_ABORT_SHUTDOWN;
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		m_freem(m);
		return (0);
	}
	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;

	/*
	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
	 * the T3's reply to our reset instead.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		toep->tp_flags |= TP_ABORT_RPL_SENT;
		INP_WUNLOCK(inp);
	} else {
		so_error_set(so, abort_status_to_errno(tp, req->status));
		/* tcp_close() returns NULL if it dropped the inp lock. */
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
		toepcb_release(toep);	/* no more CPLs expected */
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);

	/* The hardware always expects a reply to an abort request. */
	send_abort_rpl(tod, tid, qset);
	m_freem(m);
	return (0);
}

/*
 * Apply the TCP options the hardware negotiated (encoded in tcpopt) to the
 * tcpcb: MSS, timestamps, SACK, and window scaling.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = toep->tp_tod->tod_softc;

	/* MTU table entry minus 40 bytes of fixed IP + TCP headers. */
	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;

	if (G_TCPOPT_TSTAMP(tcpopt)) {
		tp->t_flags |= TF_RCVD_TSTMP;
		tp->t_flags |= TF_REQ_TSTMP;	/* forcibly set */
		tp->ts_recent = 0;		/* XXX */
		tp->ts_recent_age = tcp_ts_getticks();
	}

	if (G_TCPOPT_SACK(tcpopt))
		tp->t_flags |= TF_SACK_PERMIT;
	else
		tp->t_flags &= ~TF_SACK_PERMIT;

	if (G_TCPOPT_WSCALE_OK(tcpopt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Window scaling takes effect only if both sides requested it. */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
	}

}

/*
 * The ISS and IRS are from after the exchange of SYNs and are off by 1.
 */
void
make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
    uint16_t cpl_tcpopt)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	long bufsize;
	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(cpl_tcpopt);

	INP_WLOCK_ASSERT(inp);

	tp->t_state = TCPS_ESTABLISHED;
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
	    toep->tp_tid, toep, inp);

	/* Initialize receive sequence state from the hardware's IRS. */
	tp->irs = irs;
	tcp_rcvseqinit(tp);
	/* << 10: rx credits appear to be in 1KB units — confirm. */
	tp->rcv_wnd = toep->tp_rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->tp_rx_credits = bufsize - tp->rcv_wnd;

	/* Initialize send sequence state from the hardware's ISS. */
	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	soisconnected(so);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	unsigned int tid = toep->tp_tid;

	inp_lock_assert(toep->tp_inp);

	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		struct ofld_hdr *oh = mtod(m, void *);
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = (void *)(oh + 1);

		/* Patch the now-known tid into the deferred message. */
		p->wr.wrh_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		t3_offload_tx(sc, m);
	}
}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_act_establish *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);

	/* The connection now has a real tid; the atid is no longer needed. */
	free_atid(&td->tid_maps, atid);

	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	KASSERT(toep->tp_qset == qs->idx,
	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
	KASSERT(toep->tp_tid == atid,
	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));

	/* Switch the toepcb over from the atid to the hardware tid. */
	toep->tp_tid = tid;
	insert_tid(td, toep, tid);

	if (inp->inp_flags & INP_DROPPED) {
		/* socket closed by the kernel before hw told us it connected */
		send_reset(toep);
		goto done;
	}

	KASSERT(tp->t_state == TCPS_SYN_SENT,
	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));

	so = inp->inp_socket;
	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

done:
	INP_WUNLOCK(inp);
	m_freem(m);
	return (0);
}

/*
 * Process an acknowledgment of WR completion. Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	struct socket *so;
	unsigned int credits = ntohs(hdr->credits);
	u32 snd_una = ntohl(hdr->snd_una);
	int bytes = 0;
	struct sockbuf *snd;
	struct mbuf *p;
	struct ofld_hdr *oh;

	inp_wlock(inp);
	tp = intotcpcb(inp);
	so = inp->inp_socket;

	/* Return the acked credits; cap the unacked count accordingly. */
	toep->tp_wr_avail += credits;
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	/* Retire completed work requests from the pending queue. */
	while (credits) {
		p = peek_wr(toep);

		if (__predict_false(!p)) {
			/* More credits acked than WRs pending; log and stop. */
			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
			    "tid %u, state %u, wr_avail %u", __func__, credits,
			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);

			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u wr_avail=%u\n",
			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
			break;
		}

		oh = mtod(p, struct ofld_hdr *);

		KASSERT(credits >= G_HDR_NDESC(oh->flags),
		    ("%s: partial credits? %d %d", __func__, credits,
		    G_HDR_NDESC(oh->flags)));

		dequeue_wr(toep);
		credits -= G_HDR_NDESC(oh->flags);
		bytes += oh->plen;

		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(p);
	}

	/* Stale ACK: sequence number went backwards; ignore it. */
	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
		goto out_free;

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = tcp_ts_getticks();
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}

	/* Drop the acked bytes from the send buffer and wake writers. */
	snd = so_sockbuf_snd(so);
	if (bytes) {
		SOCKBUF_LOCK(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);	/* drops the sockbuf lock */
	}

	/* If unsent data remains in the send buffer, push it to the hw. */
	if (snd->sb_sndptroff < sbused(snd))
		t3_push_frames(so, 0);

out_free:
	inp_wunlock(tp->t_inpcb);
	m_freem(m);
}

/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);

	/*
	 * XXX bad race.  NOTE(review): m also appears to be leaked when the
	 * tid lookup fails (wr_ack() frees it on the success path only).
	 */
	if (toep)
		wr_ack(toep, m);

	return (0);
}

/*
 * Register this file's CPL message handlers with the adapter.
 */
void
t3_init_cpl_io(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
}
#endif