/*-
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c 355249 2019-11-30 20:22:03Z np $");

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

VNET_DECLARE(int, tcp_do_autosndbuf);
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
VNET_DECLARE(int, tcp_autosndbuf_inc);
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
VNET_DECLARE(int, tcp_autosndbuf_max);
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

static size_t
aiotx_mbuf_pgoff(struct mbuf *m)
{
	struct aiotx_buffer *ab;

	MPASS(IS_AIOTX_MBUF(m));
	ab = m->m_ext.ext_arg1;
	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
}

static vm_page_t *
aiotx_mbuf_pages(struct mbuf *m)
{
	struct aiotx_buffer *ab;
	int npages;

	MPASS(IS_AIOTX_MBUF(m));
	ab = m->m_ext.ext_arg1;
	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
	return (ab->ps.pages + npages);
}

void
send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	if (ftxp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (toep->ulp_mode == ULP_MODE_TLS)
		nparams++;
	if (toep->tls.fcplenmax != 0)
		nparams++;
	if (toep->tc_idx != -1) {
		MPASS(toep->tc_idx >= 0 &&
		    toep->tc_idx < sc->chip_params->nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	if (ftxp) {
		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);

		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
		FLOWC_PARAM(SNDBUF, sndbuf);
		FLOWC_PARAM(MSS, ftxp->mss);

		CTR6(KTR_CXGBE,
		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
		    ftxp->rcv_nxt);
	} else {
		FLOWC_PARAM(SNDBUF, 512);
		FLOWC_PARAM(MSS, 512);

		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
	}
	if (toep->ulp_mode == ULP_MODE_TLS)
		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
	if (toep->tls.fcplenmax != 0)
		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
	if (toep->tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}
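/*
 * Worked example of the flowc sizing above (a sketch; the exact struct
 * sizes come from the firmware interface headers, not this file): with
 * ftxp != NULL and no TLS or scheduling-class parameters, nparams = 8.
 * Assuming an 8-byte fw_flowc_wr header and 8-byte fw_flowc_mnemval
 * entries, flowclen = 8 + 8 * 8 = 72 bytes, so the WR consumes
 * howmany(72, 16) = 5 tx credits and the wrqe is allocated rounded up
 * to 80 bytes.
 */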
#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	const u_int kbps = (u_int) (((uint64_t)Bps * 8ULL) / 1000);
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
	}

	if (toep->tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	if (toep->tc_idx >= 0)
		t4_release_cl_rl(sc, port_id, toep->tc_idx);
	toep->tc_idx = tc_idx;

	return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(inp);

	toep->tcp_opt = opt;
	toep->mtu_idx = G_TCPOPT_MSS(opt);
	tp->t_maxseg = sc->params.mtus[toep->mtu_idx];
	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

	toep->emss = tp->t_maxseg;
	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		toep->emss -= TCPOLEN_TSTAMP_APPA;
	}

	CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
	    __func__, toep->tid, toep->mtu_idx,
	    sc->params.mtus[G_TCPOPT_MSS(opt)], tp->t_maxseg, toep->emss);

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}
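/*
 * Example of the emss arithmetic above (illustrative numbers only):
 * for an IPv4 connection whose negotiated MTU table entry is 1500,
 * t_maxseg = 1500 - sizeof(struct ip) - sizeof(struct tcphdr)
 *          = 1500 - 20 - 20 = 1460.  If the peer also agreed to
 * timestamps, the effective MSS seen by the chip shrinks by
 * TCPOLEN_TSTAMP_APPA (12) more, giving emss = 1448.
 */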
/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from the exchange of SYNs.
 */
void
make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	long bufsize;
	uint16_t tcpopt = be16toh(opt);
	struct flowc_tx_params ftxp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = (u_int)toep->opt0_rcv_bufsize << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
		bufsize = V_tcp_autosndbuf_max;
	else
		bufsize = sbspace(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);

	ftxp.snd_nxt = tp->snd_nxt;
	ftxp.rcv_nxt = tp->rcv_nxt;
	ftxp.snd_space = bufsize;
	ftxp.mss = toep->emss;
	send_flowc_wr(toep, &ftxp);

	soisconnected(so);
}

int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

void
send_rx_modulate(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return;
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	t4_wrq_tx(sc, wr);
}
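/*
 * A note on rx credits (descriptive only, no new logic): the chip stops
 * delivering payload once the advertised window is consumed.
 * t4_rcvd_locked() below computes how much the window may grow (socket
 * buffer space beyond the current rcv_wnd) and returns that to the chip
 * via CPL_RX_DATA_ACK, but batches the updates -- small windows are
 * replenished eagerly while larger ones wait until at least 16KB or
 * 64KB of credits accumulate, to keep the control traffic down.
 */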
void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int rx_credits;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (toep->ulp_mode == ULP_MODE_TLS) {
		if (toep->tls.rcv_over >= rx_credits) {
			toep->tls.rcv_over -= rx_credits;
			rx_credits = 0;
		} else {
			rx_credits -= toep->tls.rcv_over;
			toep->tls.rcv_over = 0;
		}
	}

	if (rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	} else if (toep->flags & TPF_FORCE_CREDITS)
		send_rx_modulate(sc, toep);
}

void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
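/*
 * A tx credit is 16 bytes of work request.  Worked numbers (assuming
 * the usual 512-byte SGE_MAX_WR_LEN and the 16-byte fw_ofld_tx_data_wr
 * noted in the DSGL size comment further down): MAX_OFLD_TX_CREDITS is
 * 512 / 16 = 32 and MIN_OFLD_TX_CREDITS is howmany(16 + 1, 16) = 2,
 * i.e. a WR header plus at least one byte of payload.
 */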
/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits)
{
	const int n = 1;	/* Use no more than one desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

static inline void
write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
    unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (txalign > 0) {
		struct tcpcb *tp = intotcpcb(toep->inp);

		if (plen < 2 * toep->emss)
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(tp->t_flags & TF_NODELAY ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (IS_AIOTX_MBUF(m))
			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
			    aiotx_mbuf_pgoff(m), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}
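/*
 * Sanity check of the SGL limits above (a worked example, not extra
 * logic): with the full 32 credits, sge_pair_credits = 32 - 2 = 30, so
 * max_dsgl_nsegs() returns 1 + 2 * (30 * 16 / 24) = 1 + 2 * 20 = 41,
 * which is exactly OFLD_SGL_LEN below.  Each ulptx_sge_pair packs two
 * len/addr entries into 24 bytes, hence the factor of 16/24.
 */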
/*
 * Max number of SGL entries an offload tx work request can have.  This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool aiotx_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
	    toep->ulp_mode == ULP_MODE_TCPDDP ||
	    toep->ulp_mode == ULP_MODE_TLS ||
	    toep->ulp_mode == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		aiotx_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if (IS_AIOTX_MBUF(m))
				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
				    aiotx_mbuf_pgoff(m), m->m_len);
			else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (IS_AIOTX_MBUF(m))
				aiotx_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL,
			    ("%s: nothing to send, but m != NULL", __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !aiotx_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
			    sc->tt.tx_align);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
			    sc->tt.tx_align);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}
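/*
 * Note on the iSCSI reclaim queue (descriptive comment, no new logic):
 * t4_push_pdus() below moves each PDU from ulp_pduq to ulp_pdu_reclaimq
 * once its work request has been handed to the hardware.  The mbufs
 * must stay around until the chip is done with them, so they are only
 * freed by rqdrop_locked() when do_fw4_ack reports the corresponding
 * bytes as sent.
 */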
void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop)
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		M_ASSERTPKTHDR(sndptr);

		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/*
			 * This mbuf would send us _over_ the nsegs limit.
			 * Suspend tx because the PDU can't be sent out.
			 */
			if (plen > max_imm && nsegs > max_nsegs) {
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		/*
		 * We have a PDU to send.  All of it goes out in one WR so 'm'
		 * is NULL.  A PDU's length is always a multiple of 4.
		 */
		MPASS(m == NULL);
		MPASS((plen & 3) == 0);
		MPASS(sndptr->m_pkthdr.len == plen);

		shove = !(tp->t_flags & TF_MORETOCOME);
		ulp_submode = mbuf_ulp_submode(sndptr);
		MPASS(ulp_submode < nitems(ulp_extra_len));

		/*
		 * plen doesn't include header and data digests, which are
		 * generated and inserted in the right places by the TOE, but
		 * they do occupy TCP sequence space and need to be accounted
		 * for.
		 */
		adjusted_plen = plen + ulp_extra_len[ulp_submode];
		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
			    shove, ulp_submode, sc->tt.tx_align);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */
			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
			    shove, ulp_submode, sc->tt.tx_align);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += adjusted_plen;
		tp->snd_max += adjusted_plen;

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	if (toep->ulp_mode == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, 0);
	else if (tls_tx_key(toep))
		t4_push_tls_records(sc, toep, 0);
	else
		t4_push_frames(sc, toep, 0);

	return (0);
}

int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED) {
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, 0);
		else if (tls_tx_key(toep))
			t4_push_tls_records(sc, toep, 0);
		else
			t4_push_frames(sc, toep, 0);
	}

	return (0);
}

int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
already", 1210 __func__, toep->tid, tcpstates[tp->t_state])); 1211 1212 send_reset(sc, toep, 0); 1213 return (0); 1214} 1215 1216/* 1217 * Peer has sent us a FIN. 1218 */ 1219static int 1220do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1221{ 1222 struct adapter *sc = iq->adapter; 1223 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1224 unsigned int tid = GET_TID(cpl); 1225 struct toepcb *toep = lookup_tid(sc, tid); 1226 struct inpcb *inp = toep->inp; 1227 struct tcpcb *tp = NULL; 1228 struct socket *so; 1229#ifdef INVARIANTS 1230 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1231#endif 1232 1233 KASSERT(opcode == CPL_PEER_CLOSE, 1234 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1235 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1236 1237 if (__predict_false(toep->flags & TPF_SYNQE)) { 1238 /* 1239 * do_pass_establish must have run before do_peer_close and if 1240 * this is still a synqe instead of a toepcb then the connection 1241 * must be getting aborted. 1242 */ 1243 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1244 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1245 toep, toep->flags); 1246 return (0); 1247 } 1248 1249 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1250 1251 CURVNET_SET(toep->vnet); 1252 INP_INFO_RLOCK(&V_tcbinfo); 1253 INP_WLOCK(inp); 1254 tp = intotcpcb(inp); 1255 1256 CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, 1257 tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp); 1258 1259 if (toep->flags & TPF_ABORT_SHUTDOWN) 1260 goto done; 1261 1262 tp->rcv_nxt++; /* FIN */ 1263 1264 so = inp->inp_socket; 1265 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1266 DDP_LOCK(toep); 1267 if (__predict_false(toep->ddp.flags & 1268 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1269 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1270 DDP_UNLOCK(toep); 1271 } 1272 socantrcvmore(so); 1273 1274 if (toep->ulp_mode != ULP_MODE_RDMA) { 1275 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1276 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1277 be32toh(cpl->rcv_nxt))); 1278 } 1279 1280 switch (tp->t_state) { 1281 case TCPS_SYN_RECEIVED: 1282 tp->t_starttime = ticks; 1283 /* FALLTHROUGH */ 1284 1285 case TCPS_ESTABLISHED: 1286 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1287 break; 1288 1289 case TCPS_FIN_WAIT_1: 1290 tcp_state_change(tp, TCPS_CLOSING); 1291 break; 1292 1293 case TCPS_FIN_WAIT_2: 1294 tcp_twstart(tp); 1295 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1296 INP_INFO_RUNLOCK(&V_tcbinfo); 1297 CURVNET_RESTORE(); 1298 1299 INP_WLOCK(inp); 1300 final_cpl_received(toep); 1301 return (0); 1302 1303 default: 1304 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1305 __func__, tid, tp->t_state); 1306 } 1307done: 1308 INP_WUNLOCK(inp); 1309 INP_INFO_RUNLOCK(&V_tcbinfo); 1310 CURVNET_RESTORE(); 1311 return (0); 1312} 1313 1314/* 1315 * Peer has ACK'd our FIN. 
/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return (0);
}

void
send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}
/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_wrq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	int len, rx_credits;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_rx_data and if this
		 * is still a synqe instead of a toepcb then the connection must
		 * be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
		    ("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		DDP_LOCK(toep);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		if (toep->ulp_mode == ULP_MODE_TCPDDP)
			DDP_UNLOCK(toep);
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
	}

	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;

		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
			    __func__, tid, len);

		if (changed) {
			if (toep->ddp.flags & DDP_SC_REQ)
				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
			else {
				KASSERT(cpl->ddp_off == 1,
				    ("%s: DDP switched on by itself.",
				    __func__));

				/* Fell out of DDP mode */
				toep->ddp.flags &= ~DDP_ON;
				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
				    __func__);

				insert_ddp_data(toep, ddp_placed);
			}
		}

		if (toep->ddp.flags & DDP_ON) {
			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.
			 * Start posting queued AIO requests via DDP.  The
			 * payload that arrived in this indicate is appended
			 * to the socket buffer as usual.
			 */
			handle_ddp_indicate(toep);
		}
	}

	sbappendstream_locked(sb, m, 0);
	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}

	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
	    sbavail(sb) != 0) {
		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
		    tid);
		ddp_queue_toep(toep);
	}
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);
	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		DDP_UNLOCK(toep);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}

static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		if (txsd->iv_buffer) {
			free(txsd->iv_buffer, M_CXGBE);
			txsd->iv_buffer = NULL;
		}
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, plen);
		else if (tls_tx_key(toep))
			t4_push_tls_records(sc, toep, plen);
		else
			t4_push_frames(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (toep->ulp_mode == ULP_MODE_ISCSI) {

			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP
				 * mode changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (tls_tx_key(toep)) {
				struct tls_ofld_info *tls_ofld = &toep->tls;

				MPASS(tls_ofld->sb_off >= plen);
				tls_ofld->sb_off -= plen;
			}
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend3' field in AIO jobs to store the amount of data
 * sent by the AIO job so far and the 'backend4' field to hold an
 * error that should be reported when the job is completed.
 */
#define aio_sent backend3
#define aio_error backend4

#define jobtotid(job)	\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

static void
free_aiotx_buffer(struct aiotx_buffer *ab)
{
	struct kaiocb *job;
	long status;
	int error;

	if (refcount_release(&ab->refcount) == 0)
		return;

	job = ab->job;
	error = job->aio_error;
	status = job->aio_sent;
	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
	free(ab, M_CXGBE);
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error == ECANCELED && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, status, 0);
}

static void
t4_aiotx_mbuf_free(struct mbuf *m, void *buffer, void *arg)
{
	struct aiotx_buffer *ab = buffer;

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(ab->job));
#endif
	free_aiotx_buffer(ab);
}
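/*
 * Reference-counting sketch for aiotx buffers (descriptive comment, no
 * new logic): hold_aio() creates the buffer with one reference owned by
 * the job.  t4_aiotx_process_job() takes an extra reference for every
 * mbuf it maps over the buffer, released via t4_aiotx_mbuf_free() as
 * each mbuf is freed.  The job's own reference is dropped once the job
 * will no longer be requeued, so the pages are unheld and the job
 * completed only after the last in-flight mbuf is gone.
 */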
/*
 * Hold the pages backing an AIO request and attach an AIO transmit
 * buffer describing them to the job.
 */
static int
hold_aio(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct vmspace *vm;
	vm_map_t map;
	vm_offset_t start, end, pgoff;
	int n;

	MPASS(job->backend1 == NULL);

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf;
	pgoff = start & PAGE_MASK;
	end = round_page(start + job->uaiocb.aio_nbytes);
	start = trunc_page(start);
	n = atop(end - start);

	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
	    M_ZERO);
	refcount_init(&ab->refcount, 1);
	ab->ps.pages = (vm_page_t *)(ab + 1);
	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
	    VM_PROT_WRITE, ab->ps.pages, n);
	if (ab->ps.npages < 0) {
		free(ab, M_CXGBE);
		return (EFAULT);
	}

	KASSERT(ab->ps.npages == n,
	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));

	ab->ps.offset = pgoff;
	ab->ps.len = job->uaiocb.aio_nbytes;
	ab->job = job;
	job->backend1 = ab;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
#endif
	return (0);
}
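/*
 * Worked example of the page arithmetic above, assuming a 4KB page size
 * and a hypothetical unaligned request with aio_buf == 0x2003 and
 * aio_nbytes == 0x3000:
 *
 *	pgoff = 0x2003 & PAGE_MASK          = 0x003
 *	end   = round_page(0x2003 + 0x3000) = 0x6000
 *	start = trunc_page(0x2003)          = 0x2000
 *	n     = atop(0x6000 - 0x2000)       = 4 pages
 *
 * An unaligned request thus wires a partial page at either end, and
 * ps.offset records where the data starts within the first wired page.
 */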
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct adapter *sc;
	struct sockbuf *sb;
	struct file *fp;
	struct aiotx_buffer *ab;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error;
	bool moretocome, sendmore;

	sc = td_adapter(toep->td);
	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	ab = job->backend1;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	if (ab == NULL) {
		error = hold_aio(job);
		if (error != 0)
			goto out;
		ab = job->backend1;
	}

	/* Inline sosend_generic(). */

	job->msgsnd = 1;

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	m = m_get(M_WAITOK, MT_DATA);

	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	m->m_len = sbspace(sb);
	if (m->m_len > ab->ps.len - job->aio_sent) {
		m->m_len = ab->ps.len - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (m->m_len > sc->tt.sndbuf) {
		m->m_len = sc->tt.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(m->m_len != 0);

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	refcount_acquire(&ab->refcount);
	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
	job->aio_sent += m->m_len;

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop a reference on
	 * the aiotx buffer.  Any mbufs in flight should still
	 * contain a reference, but this drops the reference that the
	 * job owns while it is waiting to queue mbufs to the socket.
	 */
	free_aiotx_buffer(ab);

out:
	if (error) {
		if (ab != NULL) {
			job->aio_error = error;
			free_aiotx_buffer(ab);
		} else {
			MPASS(job->aio_sent == 0);
			aio_complete(job, -1, error);
		}
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}
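/*
 * t4_aiotx_queue_toep() below hands one socket reference (soref()) and
 * one toepcb reference (hold_toepcb()) to the task it enqueues, and
 * t4_aiotx_task() releases both (free_toepcb()/sorele()) once the job
 * queue has drained or the socket is no longer writable.  toep->aiotx_so
 * doubles as the "task already queued" marker, so at most one aiotx
 * task is outstanding per toepcb.
 */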
"true" : "false"); 2237#endif 2238 if (toep->aiotx_so != NULL) 2239 return; 2240 soref(so); 2241 toep->aiotx_so = so; 2242 hold_toepcb(toep); 2243 soaio_enqueue(&toep->aiotx_task); 2244} 2245 2246static void 2247t4_aiotx_cancel(struct kaiocb *job) 2248{ 2249 struct aiotx_buffer *ab; 2250 struct socket *so; 2251 struct sockbuf *sb; 2252 struct tcpcb *tp; 2253 struct toepcb *toep; 2254 2255 so = job->fd_file->f_data; 2256 tp = so_sototcpcb(so); 2257 toep = tp->t_toe; 2258 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2259 sb = &so->so_snd; 2260 2261 SOCKBUF_LOCK(sb); 2262 if (!aio_cancel_cleared(job)) 2263 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2264 SOCKBUF_UNLOCK(sb); 2265 2266 ab = job->backend1; 2267 if (ab != NULL) 2268 free_aiotx_buffer(ab); 2269 else 2270 aio_cancel(job); 2271} 2272 2273int 2274t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2275{ 2276 struct tcpcb *tp = so_sototcpcb(so); 2277 struct toepcb *toep = tp->t_toe; 2278 struct adapter *sc = td_adapter(toep->td); 2279 2280 /* This only handles writes. */ 2281 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2282 return (EOPNOTSUPP); 2283 2284 if (!sc->tt.tx_zcopy) 2285 return (EOPNOTSUPP); 2286 2287 if (tls_tx_key(toep)) 2288 return (EOPNOTSUPP); 2289 2290 SOCKBUF_LOCK(&so->so_snd); 2291#ifdef VERBOSE_TRACES 2292 CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job); 2293#endif 2294 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2295 panic("new job was cancelled"); 2296 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2297 if (sowriteable(so)) 2298 t4_aiotx_queue_toep(so, toep); 2299 SOCKBUF_UNLOCK(&so->so_snd); 2300 return (0); 2301} 2302 2303void 2304aiotx_init_toep(struct toepcb *toep) 2305{ 2306 2307 TAILQ_INIT(&toep->aiotx_jobq); 2308 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2309} 2310#endif 2311