t4_cpl_io.c revision 345664
1/*- 2 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 3 * All rights reserved. 4 * Written by: Navdeep Parhar <np@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28#include <sys/cdefs.h> 29__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c 345664 2019-03-28 23:43:38Z jhb $"); 30 31#include "opt_inet.h" 32 33#ifdef TCP_OFFLOAD 34#include <sys/param.h> 35#include <sys/aio.h> 36#include <sys/file.h> 37#include <sys/kernel.h> 38#include <sys/ktr.h> 39#include <sys/module.h> 40#include <sys/proc.h> 41#include <sys/protosw.h> 42#include <sys/domain.h> 43#include <sys/socket.h> 44#include <sys/socketvar.h> 45#include <sys/sglist.h> 46#include <sys/taskqueue.h> 47#include <netinet/in.h> 48#include <netinet/in_pcb.h> 49#include <netinet/ip.h> 50#include <netinet/ip6.h> 51#define TCPSTATES 52#include <netinet/tcp_fsm.h> 53#include <netinet/tcp_seq.h> 54#include <netinet/tcp_var.h> 55#include <netinet/toecore.h> 56 57#include <security/mac/mac_framework.h> 58 59#include <vm/vm.h> 60#include <vm/vm_extern.h> 61#include <vm/pmap.h> 62#include <vm/vm_map.h> 63#include <vm/vm_page.h> 64 65#include "common/common.h" 66#include "common/t4_msg.h" 67#include "common/t4_regs.h" 68#include "common/t4_tcb.h" 69#include "tom/t4_tom_l2t.h" 70#include "tom/t4_tom.h" 71 72VNET_DECLARE(int, tcp_do_autosndbuf); 73#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) 74VNET_DECLARE(int, tcp_autosndbuf_inc); 75#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) 76VNET_DECLARE(int, tcp_autosndbuf_max); 77#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) 78VNET_DECLARE(int, tcp_do_autorcvbuf); 79#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 80VNET_DECLARE(int, tcp_autorcvbuf_inc); 81#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 82VNET_DECLARE(int, tcp_autorcvbuf_max); 83#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 84 85static void t4_aiotx_cancel(struct kaiocb *job); 86static void t4_aiotx_queue_toep(struct toepcb *toep); 87 88static size_t 89aiotx_mbuf_pgoff(struct mbuf *m) 90{ 91 struct aiotx_buffer *ab; 92 93 MPASS(IS_AIOTX_MBUF(m)); 94 ab = m->m_ext.ext_arg1; 95 return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE); 96} 97 98static vm_page_t * 99aiotx_mbuf_pages(struct mbuf *m) 100{ 101 struct aiotx_buffer *ab; 102 int npages; 103 104 MPASS(IS_AIOTX_MBUF(m)); 105 ab = m->m_ext.ext_arg1; 106 npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE; 107 return (ab->ps.pages + npages); 108} 109 110void 111send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) 112{ 113 struct wrqe *wr; 114 struct fw_flowc_wr *flowc; 115 unsigned int nparams, flowclen, paramidx; 116 struct vi_info *vi = toep->vi; 117 struct port_info *pi = vi->pi; 118 struct adapter *sc = pi->adapter; 119 unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN; 120 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 121 122 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 123 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 124 125 if (ftxp != NULL) 126 nparams = 8; 127 else 128 nparams = 6; 129 if (toep->ulp_mode == ULP_MODE_TLS) 130 nparams++; 131 if (toep->tls.fcplenmax != 0) 132 nparams++; 133 134 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 135 136 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 137 if (wr == NULL) { 138 /* XXX */ 139 panic("%s: allocation failure.", __func__); 140 } 141 flowc = wrtod(wr); 142 memset(flowc, 0, wr->wr_len); 143 144 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 145 V_FW_FLOWC_WR_NPARAMS(nparams)); 146 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 147 V_FW_WR_FLOWID(toep->tid)); 148 149#define FLOWC_PARAM(__m, __v) \ 150 do { \ 151 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 152 flowc->mnemval[paramidx].val = htobe32(__v); \ 153 paramidx++; \ 154 } while (0) 155 156 paramidx = 0; 157 158 FLOWC_PARAM(PFNVFN, pfvf); 159 FLOWC_PARAM(CH, pi->tx_chan); 160 FLOWC_PARAM(PORT, pi->tx_chan); 161 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 162 if (ftxp) { 163 uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf); 164 165 FLOWC_PARAM(SNDNXT, ftxp->snd_nxt); 166 FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt); 167 FLOWC_PARAM(SNDBUF, sndbuf); 168 FLOWC_PARAM(MSS, ftxp->mss); 169 170 CTR6(KTR_CXGBE, 171 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 172 __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt, 173 ftxp->rcv_nxt); 174 } else { 175 FLOWC_PARAM(SNDBUF, 512); 176 FLOWC_PARAM(MSS, 512); 177 178 CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); 179 } 180 if (toep->ulp_mode == ULP_MODE_TLS) 181 FLOWC_PARAM(ULP_MODE, toep->ulp_mode); 182 if (toep->tls.fcplenmax != 0) 183 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 184#undef FLOWC_PARAM 185 186 KASSERT(paramidx == nparams, ("nparams mismatch")); 187 188 txsd->tx_credits = howmany(flowclen, 16); 189 txsd->plen = 0; 190 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 191 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 192 toep->tx_credits -= txsd->tx_credits; 193 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 194 toep->txsd_pidx = 0; 195 toep->txsd_avail--; 196 197 toep->flags |= TPF_FLOWC_WR_SENT; 198 t4_wrq_tx(sc, wr); 199} 200 201void 202send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 203{ 204 struct wrqe *wr; 205 struct cpl_abort_req *req; 206 int tid = toep->tid; 207 struct inpcb *inp = toep->inp; 208 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 209 210 INP_WLOCK_ASSERT(inp); 211 212 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 213 __func__, toep->tid, 214 inp->inp_flags & INP_DROPPED ? "inp dropped" : 215 tcpstates[tp->t_state], 216 toep->flags, inp->inp_flags, 217 toep->flags & TPF_ABORT_SHUTDOWN ? 218 " (abort already in progress)" : ""); 219 220 if (toep->flags & TPF_ABORT_SHUTDOWN) 221 return; /* abort already in progress */ 222 223 toep->flags |= TPF_ABORT_SHUTDOWN; 224 225 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 226 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 227 228 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 229 if (wr == NULL) { 230 /* XXX */ 231 panic("%s: allocation failure.", __func__); 232 } 233 req = wrtod(wr); 234 235 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 236 if (inp->inp_flags & INP_DROPPED) 237 req->rsvd0 = htobe32(snd_nxt); 238 else 239 req->rsvd0 = htobe32(tp->snd_nxt); 240 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 241 req->cmd = CPL_ABORT_SEND_RST; 242 243 /* 244 * XXX: What's the correct way to tell that the inp hasn't been detached 245 * from its socket? Should I even be flushing the snd buffer here? 246 */ 247 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 248 struct socket *so = inp->inp_socket; 249 250 if (so != NULL) /* because I'm not sure. See comment above */ 251 sbflush(&so->so_snd); 252 } 253 254 t4_l2t_send(sc, wr, toep->l2te); 255} 256 257/* 258 * Called when a connection is established to translate the TCP options 259 * reported by HW to FreeBSD's native format. 260 */ 261static void 262assign_rxopt(struct tcpcb *tp, unsigned int opt) 263{ 264 struct toepcb *toep = tp->t_toe; 265 struct inpcb *inp = tp->t_inpcb; 266 struct adapter *sc = td_adapter(toep->td); 267 int n; 268 269 INP_LOCK_ASSERT(inp); 270 271 if (inp->inp_inc.inc_flags & INC_ISIPV6) 272 n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 273 else 274 n = sizeof(struct ip) + sizeof(struct tcphdr); 275 if (V_tcp_do_rfc1323) 276 n += TCPOLEN_TSTAMP_APPA; 277 tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; 278 279 CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid, 280 G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]); 281 282 if (G_TCPOPT_TSTAMP(opt)) { 283 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 284 tp->ts_recent = 0; /* hmmm */ 285 tp->ts_recent_age = tcp_ts_getticks(); 286 } 287 288 if (G_TCPOPT_SACK(opt)) 289 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 290 else 291 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 292 293 if (G_TCPOPT_WSCALE_OK(opt)) 294 tp->t_flags |= TF_RCVD_SCALE; 295 296 /* Doing window scaling? */ 297 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 298 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 299 tp->rcv_scale = tp->request_r_scale; 300 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 301 } 302} 303 304/* 305 * Completes some final bits of initialization for just established connections 306 * and changes their state to TCPS_ESTABLISHED. 307 * 308 * The ISNs are from after the exchange of SYNs. i.e., the true ISN + 1. 309 */ 310void 311make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, 312 uint16_t opt) 313{ 314 struct inpcb *inp = toep->inp; 315 struct socket *so = inp->inp_socket; 316 struct tcpcb *tp = intotcpcb(inp); 317 long bufsize; 318 uint32_t iss = be32toh(snd_isn) - 1; /* true ISS */ 319 uint32_t irs = be32toh(rcv_isn) - 1; /* true IRS */ 320 uint16_t tcpopt = be16toh(opt); 321 struct flowc_tx_params ftxp; 322 323 INP_WLOCK_ASSERT(inp); 324 KASSERT(tp->t_state == TCPS_SYN_SENT || 325 tp->t_state == TCPS_SYN_RECEIVED, 326 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 327 328 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 329 __func__, toep->tid, so, inp, tp, toep); 330 331 tcp_state_change(tp, TCPS_ESTABLISHED); 332 tp->t_starttime = ticks; 333 TCPSTAT_INC(tcps_connects); 334 335 tp->irs = irs; 336 tcp_rcvseqinit(tp); 337 tp->rcv_wnd = toep->rx_credits << 10; 338 tp->rcv_adv += tp->rcv_wnd; 339 tp->last_ack_sent = tp->rcv_nxt; 340 341 /* 342 * If we were unable to send all rx credits via opt0, save the remainder 343 * in rx_credits so that they can be handed over with the next credit 344 * update. 345 */ 346 SOCKBUF_LOCK(&so->so_rcv); 347 bufsize = select_rcv_wnd(so); 348 SOCKBUF_UNLOCK(&so->so_rcv); 349 toep->rx_credits = bufsize - tp->rcv_wnd; 350 351 tp->iss = iss; 352 tcp_sendseqinit(tp); 353 tp->snd_una = iss + 1; 354 tp->snd_nxt = iss + 1; 355 tp->snd_max = iss + 1; 356 357 assign_rxopt(tp, tcpopt); 358 359 SOCKBUF_LOCK(&so->so_snd); 360 if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) 361 bufsize = V_tcp_autosndbuf_max; 362 else 363 bufsize = sbspace(&so->so_snd); 364 SOCKBUF_UNLOCK(&so->so_snd); 365 366 ftxp.snd_nxt = tp->snd_nxt; 367 ftxp.rcv_nxt = tp->rcv_nxt; 368 ftxp.snd_space = bufsize; 369 ftxp.mss = tp->t_maxseg; 370 send_flowc_wr(toep, &ftxp); 371 372 soisconnected(so); 373} 374 375int 376send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 377{ 378 struct wrqe *wr; 379 struct cpl_rx_data_ack *req; 380 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 381 382 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 383 384 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 385 if (wr == NULL) 386 return (0); 387 req = wrtod(wr); 388 389 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 390 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 391 392 t4_wrq_tx(sc, wr); 393 return (credits); 394} 395 396void 397send_rx_modulate(struct adapter *sc, struct toepcb *toep) 398{ 399 struct wrqe *wr; 400 struct cpl_rx_data_ack *req; 401 402 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 403 if (wr == NULL) 404 return; 405 req = wrtod(wr); 406 407 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 408 req->credit_dack = htobe32(F_RX_MODULATE_RX); 409 410 t4_wrq_tx(sc, wr); 411} 412 413void 414t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 415{ 416 struct adapter *sc = tod->tod_softc; 417 struct inpcb *inp = tp->t_inpcb; 418 struct socket *so = inp->inp_socket; 419 struct sockbuf *sb = &so->so_rcv; 420 struct toepcb *toep = tp->t_toe; 421 int credits; 422 423 INP_WLOCK_ASSERT(inp); 424 425 SOCKBUF_LOCK_ASSERT(sb); 426 KASSERT(toep->sb_cc >= sbused(sb), 427 ("%s: sb %p has more data (%d) than last time (%d).", 428 __func__, sb, sbused(sb), toep->sb_cc)); 429 430 credits = toep->sb_cc - sbused(sb); 431 toep->sb_cc = sbused(sb); 432 if (toep->ulp_mode == ULP_MODE_TLS) { 433 if (toep->tls.rcv_over >= credits) { 434 toep->tls.rcv_over -= credits; 435 credits = 0; 436 } else { 437 credits -= toep->tls.rcv_over; 438 toep->tls.rcv_over = 0; 439 } 440 } 441 toep->rx_credits += credits; 442 443 if (toep->rx_credits > 0 && 444 (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 || 445 (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 446 toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) { 447 448 credits = send_rx_credits(sc, toep, toep->rx_credits); 449 toep->rx_credits -= credits; 450 tp->rcv_wnd += credits; 451 tp->rcv_adv += credits; 452 } else if (toep->flags & TPF_FORCE_CREDITS) 453 send_rx_modulate(sc, toep); 454} 455 456void 457t4_rcvd(struct toedev *tod, struct tcpcb *tp) 458{ 459 struct inpcb *inp = tp->t_inpcb; 460 struct socket *so = inp->inp_socket; 461 struct sockbuf *sb = &so->so_rcv; 462 463 SOCKBUF_LOCK(sb); 464 t4_rcvd_locked(tod, tp); 465 SOCKBUF_UNLOCK(sb); 466} 467 468/* 469 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 470 */ 471int 472t4_close_conn(struct adapter *sc, struct toepcb *toep) 473{ 474 struct wrqe *wr; 475 struct cpl_close_con_req *req; 476 unsigned int tid = toep->tid; 477 478 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 479 toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); 480 481 if (toep->flags & TPF_FIN_SENT) 482 return (0); 483 484 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 485 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 486 487 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 488 if (wr == NULL) { 489 /* XXX */ 490 panic("%s: allocation failure.", __func__); 491 } 492 req = wrtod(wr); 493 494 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 495 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 496 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 497 V_FW_WR_FLOWID(tid)); 498 req->wr.wr_lo = cpu_to_be64(0); 499 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 500 req->rsvd = 0; 501 502 toep->flags |= TPF_FIN_SENT; 503 toep->flags &= ~TPF_SEND_FIN; 504 t4_l2t_send(sc, wr, toep->l2te); 505 506 return (0); 507} 508 509#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 510#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 511 512/* Maximum amount of immediate data we could stuff in a WR */ 513static inline int 514max_imm_payload(int tx_credits) 515{ 516 const int n = 2; /* Use only up to 2 desc for imm. data WR */ 517 518 KASSERT(tx_credits >= 0 && 519 tx_credits <= MAX_OFLD_TX_CREDITS, 520 ("%s: %d credits", __func__, tx_credits)); 521 522 if (tx_credits < MIN_OFLD_TX_CREDITS) 523 return (0); 524 525 if (tx_credits >= (n * EQ_ESIZE) / 16) 526 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 527 else 528 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 529} 530 531/* Maximum number of SGL entries we could stuff in a WR */ 532static inline int 533max_dsgl_nsegs(int tx_credits) 534{ 535 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 536 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 537 538 KASSERT(tx_credits >= 0 && 539 tx_credits <= MAX_OFLD_TX_CREDITS, 540 ("%s: %d credits", __func__, tx_credits)); 541 542 if (tx_credits < MIN_OFLD_TX_CREDITS) 543 return (0); 544 545 nseg += 2 * (sge_pair_credits * 16 / 24); 546 if ((sge_pair_credits * 16) % 24 == 16) 547 nseg++; 548 549 return (nseg); 550} 551 552static inline void 553write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 554 unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign) 555{ 556 struct fw_ofld_tx_data_wr *txwr = dst; 557 558 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 559 V_FW_WR_IMMDLEN(immdlen)); 560 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 561 V_FW_WR_LEN16(credits)); 562 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) | 563 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 564 txwr->plen = htobe32(plen); 565 566 if (txalign > 0) { 567 struct tcpcb *tp = intotcpcb(toep->inp); 568 569 if (plen < 2 * tp->t_maxseg) 570 txwr->lsodisable_to_flags |= 571 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 572 else 573 txwr->lsodisable_to_flags |= 574 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 575 (tp->t_flags & TF_NODELAY ? 0 : 576 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 577 } 578} 579 580/* 581 * Generate a DSGL from a starting mbuf. The total number of segments and the 582 * maximum segments in any one mbuf are provided. 583 */ 584static void 585write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 586{ 587 struct mbuf *m; 588 struct ulptx_sgl *usgl = dst; 589 int i, j, rc; 590 struct sglist sg; 591 struct sglist_seg segs[n]; 592 593 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 594 595 sglist_init(&sg, n, segs); 596 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 597 V_ULPTX_NSGE(nsegs)); 598 599 i = -1; 600 for (m = start; m != stop; m = m->m_next) { 601 if (IS_AIOTX_MBUF(m)) 602 rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m), 603 aiotx_mbuf_pgoff(m), m->m_len); 604 else 605 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 606 if (__predict_false(rc != 0)) 607 panic("%s: sglist_append %d", __func__, rc); 608 609 for (j = 0; j < sg.sg_nseg; i++, j++) { 610 if (i < 0) { 611 usgl->len0 = htobe32(segs[j].ss_len); 612 usgl->addr0 = htobe64(segs[j].ss_paddr); 613 } else { 614 usgl->sge[i / 2].len[i & 1] = 615 htobe32(segs[j].ss_len); 616 usgl->sge[i / 2].addr[i & 1] = 617 htobe64(segs[j].ss_paddr); 618 } 619#ifdef INVARIANTS 620 nsegs--; 621#endif 622 } 623 sglist_reset(&sg); 624 } 625 if (i & 1) 626 usgl->sge[i / 2].len[1] = htobe32(0); 627 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 628 __func__, nsegs, start, stop)); 629} 630 631/* 632 * Max number of SGL entries an offload tx work request can have. This is 41 633 * (1 + 40) for a full 512B work request. 634 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 635 */ 636#define OFLD_SGL_LEN (41) 637 638/* 639 * Send data and/or a FIN to the peer. 640 * 641 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 642 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 643 * was transmitted. 644 * 645 * drop indicates the number of bytes that should be dropped from the head of 646 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 647 * contention on the send buffer lock (before this change it used to do 648 * sowwakeup and then t4_push_frames right after that when recovering from tx 649 * stalls). When drop is set this function MUST drop the bytes and wake up any 650 * writers. 651 */ 652void 653t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 654{ 655 struct mbuf *sndptr, *m, *sb_sndptr; 656 struct fw_ofld_tx_data_wr *txwr; 657 struct wrqe *wr; 658 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 659 struct inpcb *inp = toep->inp; 660 struct tcpcb *tp = intotcpcb(inp); 661 struct socket *so = inp->inp_socket; 662 struct sockbuf *sb = &so->so_snd; 663 int tx_credits, shove, compl, sowwakeup; 664 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 665 bool aiotx_mbuf_seen; 666 667 INP_WLOCK_ASSERT(inp); 668 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 669 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 670 671 KASSERT(toep->ulp_mode == ULP_MODE_NONE || 672 toep->ulp_mode == ULP_MODE_TCPDDP || 673 toep->ulp_mode == ULP_MODE_TLS || 674 toep->ulp_mode == ULP_MODE_RDMA, 675 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 676 677#ifdef VERBOSE_TRACES 678 CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 679 __func__, toep->tid, toep->flags, tp->t_flags); 680#endif 681 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 682 return; 683 684 /* 685 * This function doesn't resume by itself. Someone else must clear the 686 * flag and call this function. 687 */ 688 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 689 KASSERT(drop == 0, 690 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 691 return; 692 } 693 694 do { 695 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 696 max_imm = max_imm_payload(tx_credits); 697 max_nsegs = max_dsgl_nsegs(tx_credits); 698 699 SOCKBUF_LOCK(sb); 700 sowwakeup = drop; 701 if (drop) { 702 sbdrop_locked(sb, drop); 703 drop = 0; 704 } 705 sb_sndptr = sb->sb_sndptr; 706 sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb; 707 plen = 0; 708 nsegs = 0; 709 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 710 aiotx_mbuf_seen = false; 711 for (m = sndptr; m != NULL; m = m->m_next) { 712 int n; 713 714 if (IS_AIOTX_MBUF(m)) 715 n = sglist_count_vmpages(aiotx_mbuf_pages(m), 716 aiotx_mbuf_pgoff(m), m->m_len); 717 else 718 n = sglist_count(mtod(m, void *), m->m_len); 719 720 nsegs += n; 721 plen += m->m_len; 722 723 /* This mbuf sent us _over_ the nsegs limit, back out */ 724 if (plen > max_imm && nsegs > max_nsegs) { 725 nsegs -= n; 726 plen -= m->m_len; 727 if (plen == 0) { 728 /* Too few credits */ 729 toep->flags |= TPF_TX_SUSPENDED; 730 if (sowwakeup) { 731 if (!TAILQ_EMPTY( 732 &toep->aiotx_jobq)) 733 t4_aiotx_queue_toep( 734 toep); 735 sowwakeup_locked(so); 736 } else 737 SOCKBUF_UNLOCK(sb); 738 SOCKBUF_UNLOCK_ASSERT(sb); 739 return; 740 } 741 break; 742 } 743 744 if (IS_AIOTX_MBUF(m)) 745 aiotx_mbuf_seen = true; 746 if (max_nsegs_1mbuf < n) 747 max_nsegs_1mbuf = n; 748 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 749 750 /* This mbuf put us right at the max_nsegs limit */ 751 if (plen > max_imm && nsegs == max_nsegs) { 752 m = m->m_next; 753 break; 754 } 755 } 756 757 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 758 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 759 compl = 1; 760 else 761 compl = 0; 762 763 if (sb->sb_flags & SB_AUTOSIZE && 764 V_tcp_do_autosndbuf && 765 sb->sb_hiwat < V_tcp_autosndbuf_max && 766 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 767 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 768 V_tcp_autosndbuf_max); 769 770 if (!sbreserve_locked(sb, newsize, so, NULL)) 771 sb->sb_flags &= ~SB_AUTOSIZE; 772 else 773 sowwakeup = 1; /* room available */ 774 } 775 if (sowwakeup) { 776 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 777 t4_aiotx_queue_toep(toep); 778 sowwakeup_locked(so); 779 } else 780 SOCKBUF_UNLOCK(sb); 781 SOCKBUF_UNLOCK_ASSERT(sb); 782 783 /* nothing to send */ 784 if (plen == 0) { 785 KASSERT(m == NULL, 786 ("%s: nothing to send, but m != NULL", __func__)); 787 break; 788 } 789 790 if (__predict_false(toep->flags & TPF_FIN_SENT)) 791 panic("%s: excess tx.", __func__); 792 793 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 794 if (plen <= max_imm && !aiotx_mbuf_seen) { 795 796 /* Immediate data tx */ 797 798 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 799 toep->ofld_txq); 800 if (wr == NULL) { 801 /* XXX: how will we recover from this? */ 802 toep->flags |= TPF_TX_SUSPENDED; 803 return; 804 } 805 txwr = wrtod(wr); 806 credits = howmany(wr->wr_len, 16); 807 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, 808 sc->tt.tx_align); 809 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 810 nsegs = 0; 811 } else { 812 int wr_len; 813 814 /* DSGL tx */ 815 816 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 817 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 818 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 819 if (wr == NULL) { 820 /* XXX: how will we recover from this? */ 821 toep->flags |= TPF_TX_SUSPENDED; 822 return; 823 } 824 txwr = wrtod(wr); 825 credits = howmany(wr_len, 16); 826 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, 827 sc->tt.tx_align); 828 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 829 max_nsegs_1mbuf); 830 if (wr_len & 0xf) { 831 uint64_t *pad = (uint64_t *) 832 ((uintptr_t)txwr + wr_len); 833 *pad = 0; 834 } 835 } 836 837 KASSERT(toep->tx_credits >= credits, 838 ("%s: not enough credits", __func__)); 839 840 toep->tx_credits -= credits; 841 toep->tx_nocompl += credits; 842 toep->plen_nocompl += plen; 843 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 844 toep->tx_nocompl >= toep->tx_total / 4) 845 compl = 1; 846 847 if (compl || toep->ulp_mode == ULP_MODE_RDMA) { 848 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 849 toep->tx_nocompl = 0; 850 toep->plen_nocompl = 0; 851 } 852 853 tp->snd_nxt += plen; 854 tp->snd_max += plen; 855 856 SOCKBUF_LOCK(sb); 857 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 858 sb->sb_sndptr = sb_sndptr; 859 SOCKBUF_UNLOCK(sb); 860 861 toep->flags |= TPF_TX_DATA_SENT; 862 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 863 toep->flags |= TPF_TX_SUSPENDED; 864 865 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 866 txsd->plen = plen; 867 txsd->tx_credits = credits; 868 txsd++; 869 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 870 toep->txsd_pidx = 0; 871 txsd = &toep->txsd[0]; 872 } 873 toep->txsd_avail--; 874 875 t4_l2t_send(sc, wr, toep->l2te); 876 } while (m != NULL); 877 878 /* Send a FIN if requested, but only if there's no more data to send */ 879 if (m == NULL && toep->flags & TPF_SEND_FIN) 880 t4_close_conn(sc, toep); 881} 882 883static inline void 884rqdrop_locked(struct mbufq *q, int plen) 885{ 886 struct mbuf *m; 887 888 while (plen > 0) { 889 m = mbufq_dequeue(q); 890 891 /* Too many credits. */ 892 MPASS(m != NULL); 893 M_ASSERTPKTHDR(m); 894 895 /* Partial credits. */ 896 MPASS(plen >= m->m_pkthdr.len); 897 898 plen -= m->m_pkthdr.len; 899 m_freem(m); 900 } 901} 902 903void 904t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 905{ 906 struct mbuf *sndptr, *m; 907 struct fw_ofld_tx_data_wr *txwr; 908 struct wrqe *wr; 909 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 910 u_int adjusted_plen, ulp_submode; 911 struct inpcb *inp = toep->inp; 912 struct tcpcb *tp = intotcpcb(inp); 913 int tx_credits, shove; 914 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 915 struct mbufq *pduq = &toep->ulp_pduq; 916 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 917 918 INP_WLOCK_ASSERT(inp); 919 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 920 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 921 KASSERT(toep->ulp_mode == ULP_MODE_ISCSI, 922 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 923 924 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 925 return; 926 927 /* 928 * This function doesn't resume by itself. Someone else must clear the 929 * flag and call this function. 930 */ 931 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 932 KASSERT(drop == 0, 933 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 934 return; 935 } 936 937 if (drop) 938 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 939 940 while ((sndptr = mbufq_first(pduq)) != NULL) { 941 M_ASSERTPKTHDR(sndptr); 942 943 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 944 max_imm = max_imm_payload(tx_credits); 945 max_nsegs = max_dsgl_nsegs(tx_credits); 946 947 plen = 0; 948 nsegs = 0; 949 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 950 for (m = sndptr; m != NULL; m = m->m_next) { 951 int n = sglist_count(mtod(m, void *), m->m_len); 952 953 nsegs += n; 954 plen += m->m_len; 955 956 /* 957 * This mbuf would send us _over_ the nsegs limit. 958 * Suspend tx because the PDU can't be sent out. 959 */ 960 if (plen > max_imm && nsegs > max_nsegs) { 961 toep->flags |= TPF_TX_SUSPENDED; 962 return; 963 } 964 965 if (max_nsegs_1mbuf < n) 966 max_nsegs_1mbuf = n; 967 } 968 969 if (__predict_false(toep->flags & TPF_FIN_SENT)) 970 panic("%s: excess tx.", __func__); 971 972 /* 973 * We have a PDU to send. All of it goes out in one WR so 'm' 974 * is NULL. A PDU's length is always a multiple of 4. 975 */ 976 MPASS(m == NULL); 977 MPASS((plen & 3) == 0); 978 MPASS(sndptr->m_pkthdr.len == plen); 979 980 shove = !(tp->t_flags & TF_MORETOCOME); 981 ulp_submode = mbuf_ulp_submode(sndptr); 982 MPASS(ulp_submode < nitems(ulp_extra_len)); 983 984 /* 985 * plen doesn't include header and data digests, which are 986 * generated and inserted in the right places by the TOE, but 987 * they do occupy TCP sequence space and need to be accounted 988 * for. 989 */ 990 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 991 if (plen <= max_imm) { 992 993 /* Immediate data tx */ 994 995 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 996 toep->ofld_txq); 997 if (wr == NULL) { 998 /* XXX: how will we recover from this? */ 999 toep->flags |= TPF_TX_SUSPENDED; 1000 return; 1001 } 1002 txwr = wrtod(wr); 1003 credits = howmany(wr->wr_len, 16); 1004 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1005 shove, ulp_submode, sc->tt.tx_align); 1006 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1007 nsegs = 0; 1008 } else { 1009 int wr_len; 1010 1011 /* DSGL tx */ 1012 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1013 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1014 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1015 if (wr == NULL) { 1016 /* XXX: how will we recover from this? */ 1017 toep->flags |= TPF_TX_SUSPENDED; 1018 return; 1019 } 1020 txwr = wrtod(wr); 1021 credits = howmany(wr_len, 16); 1022 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1023 shove, ulp_submode, sc->tt.tx_align); 1024 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1025 max_nsegs_1mbuf); 1026 if (wr_len & 0xf) { 1027 uint64_t *pad = (uint64_t *) 1028 ((uintptr_t)txwr + wr_len); 1029 *pad = 0; 1030 } 1031 } 1032 1033 KASSERT(toep->tx_credits >= credits, 1034 ("%s: not enough credits", __func__)); 1035 1036 m = mbufq_dequeue(pduq); 1037 MPASS(m == sndptr); 1038 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1039 1040 toep->tx_credits -= credits; 1041 toep->tx_nocompl += credits; 1042 toep->plen_nocompl += plen; 1043 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1044 toep->tx_nocompl >= toep->tx_total / 4) { 1045 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1046 toep->tx_nocompl = 0; 1047 toep->plen_nocompl = 0; 1048 } 1049 1050 tp->snd_nxt += adjusted_plen; 1051 tp->snd_max += adjusted_plen; 1052 1053 toep->flags |= TPF_TX_DATA_SENT; 1054 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1055 toep->flags |= TPF_TX_SUSPENDED; 1056 1057 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1058 txsd->plen = plen; 1059 txsd->tx_credits = credits; 1060 txsd++; 1061 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1062 toep->txsd_pidx = 0; 1063 txsd = &toep->txsd[0]; 1064 } 1065 toep->txsd_avail--; 1066 1067 t4_l2t_send(sc, wr, toep->l2te); 1068 } 1069 1070 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1071 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1072 t4_close_conn(sc, toep); 1073} 1074 1075int 1076t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1077{ 1078 struct adapter *sc = tod->tod_softc; 1079#ifdef INVARIANTS 1080 struct inpcb *inp = tp->t_inpcb; 1081#endif 1082 struct toepcb *toep = tp->t_toe; 1083 1084 INP_WLOCK_ASSERT(inp); 1085 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1086 ("%s: inp %p dropped.", __func__, inp)); 1087 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1088 1089 if (toep->ulp_mode == ULP_MODE_ISCSI) 1090 t4_push_pdus(sc, toep, 0); 1091 else if (tls_tx_key(toep)) 1092 t4_push_tls_records(sc, toep, 0); 1093 else 1094 t4_push_frames(sc, toep, 0); 1095 1096 return (0); 1097} 1098 1099int 1100t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1101{ 1102 struct adapter *sc = tod->tod_softc; 1103#ifdef INVARIANTS 1104 struct inpcb *inp = tp->t_inpcb; 1105#endif 1106 struct toepcb *toep = tp->t_toe; 1107 1108 INP_WLOCK_ASSERT(inp); 1109 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1110 ("%s: inp %p dropped.", __func__, inp)); 1111 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1112 1113 toep->flags |= TPF_SEND_FIN; 1114 if (tp->t_state >= TCPS_ESTABLISHED) { 1115 if (toep->ulp_mode == ULP_MODE_ISCSI) 1116 t4_push_pdus(sc, toep, 0); 1117 else if (tls_tx_key(toep)) 1118 t4_push_tls_records(sc, toep, 0); 1119 else 1120 t4_push_frames(sc, toep, 0); 1121 } 1122 1123 return (0); 1124} 1125 1126int 1127t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1128{ 1129 struct adapter *sc = tod->tod_softc; 1130#if defined(INVARIANTS) 1131 struct inpcb *inp = tp->t_inpcb; 1132#endif 1133 struct toepcb *toep = tp->t_toe; 1134 1135 INP_WLOCK_ASSERT(inp); 1136 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1137 ("%s: inp %p dropped.", __func__, inp)); 1138 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1139 1140 /* hmmmm */ 1141 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1142 ("%s: flowc for tid %u [%s] not sent already", 1143 __func__, toep->tid, tcpstates[tp->t_state])); 1144 1145 send_reset(sc, toep, 0); 1146 return (0); 1147} 1148 1149/* 1150 * Peer has sent us a FIN. 1151 */ 1152static int 1153do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1154{ 1155 struct adapter *sc = iq->adapter; 1156 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1157 unsigned int tid = GET_TID(cpl); 1158 struct toepcb *toep = lookup_tid(sc, tid); 1159 struct inpcb *inp = toep->inp; 1160 struct tcpcb *tp = NULL; 1161 struct socket *so; 1162#ifdef INVARIANTS 1163 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1164#endif 1165 1166 KASSERT(opcode == CPL_PEER_CLOSE, 1167 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1168 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1169 1170 if (__predict_false(toep->flags & TPF_SYNQE)) { 1171#ifdef INVARIANTS 1172 struct synq_entry *synqe = (void *)toep; 1173 1174 INP_WLOCK(synqe->lctx->inp); 1175 if (synqe->flags & TPF_SYNQE_HAS_L2TE) { 1176 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1177 ("%s: listen socket closed but tid %u not aborted.", 1178 __func__, tid)); 1179 } else { 1180 /* 1181 * do_pass_accept_req is still running and will 1182 * eventually take care of this tid. 1183 */ 1184 } 1185 INP_WUNLOCK(synqe->lctx->inp); 1186#endif 1187 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1188 toep, toep->flags); 1189 return (0); 1190 } 1191 1192 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1193 1194 CURVNET_SET(toep->vnet); 1195 INP_INFO_RLOCK(&V_tcbinfo); 1196 INP_WLOCK(inp); 1197 tp = intotcpcb(inp); 1198 1199 CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, 1200 tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp); 1201 1202 if (toep->flags & TPF_ABORT_SHUTDOWN) 1203 goto done; 1204 1205 tp->rcv_nxt++; /* FIN */ 1206 1207 so = inp->inp_socket; 1208 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1209 DDP_LOCK(toep); 1210 if (__predict_false(toep->ddp.flags & 1211 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1212 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1213 DDP_UNLOCK(toep); 1214 } 1215 socantrcvmore(so); 1216 1217 if (toep->ulp_mode != ULP_MODE_RDMA) { 1218 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1219 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1220 be32toh(cpl->rcv_nxt))); 1221 } 1222 1223 switch (tp->t_state) { 1224 case TCPS_SYN_RECEIVED: 1225 tp->t_starttime = ticks; 1226 /* FALLTHROUGH */ 1227 1228 case TCPS_ESTABLISHED: 1229 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1230 break; 1231 1232 case TCPS_FIN_WAIT_1: 1233 tcp_state_change(tp, TCPS_CLOSING); 1234 break; 1235 1236 case TCPS_FIN_WAIT_2: 1237 tcp_twstart(tp); 1238 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1239 INP_INFO_RUNLOCK(&V_tcbinfo); 1240 CURVNET_RESTORE(); 1241 1242 INP_WLOCK(inp); 1243 final_cpl_received(toep); 1244 return (0); 1245 1246 default: 1247 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1248 __func__, tid, tp->t_state); 1249 } 1250done: 1251 INP_WUNLOCK(inp); 1252 INP_INFO_RUNLOCK(&V_tcbinfo); 1253 CURVNET_RESTORE(); 1254 return (0); 1255} 1256 1257/* 1258 * Peer has ACK'd our FIN. 1259 */ 1260static int 1261do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1262 struct mbuf *m) 1263{ 1264 struct adapter *sc = iq->adapter; 1265 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1266 unsigned int tid = GET_TID(cpl); 1267 struct toepcb *toep = lookup_tid(sc, tid); 1268 struct inpcb *inp = toep->inp; 1269 struct tcpcb *tp = NULL; 1270 struct socket *so = NULL; 1271#ifdef INVARIANTS 1272 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1273#endif 1274 1275 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1276 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1277 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1278 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1279 1280 CURVNET_SET(toep->vnet); 1281 INP_INFO_RLOCK(&V_tcbinfo); 1282 INP_WLOCK(inp); 1283 tp = intotcpcb(inp); 1284 1285 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1286 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1287 1288 if (toep->flags & TPF_ABORT_SHUTDOWN) 1289 goto done; 1290 1291 so = inp->inp_socket; 1292 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1293 1294 switch (tp->t_state) { 1295 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1296 tcp_twstart(tp); 1297release: 1298 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1299 INP_INFO_RUNLOCK(&V_tcbinfo); 1300 CURVNET_RESTORE(); 1301 1302 INP_WLOCK(inp); 1303 final_cpl_received(toep); /* no more CPLs expected */ 1304 1305 return (0); 1306 case TCPS_LAST_ACK: 1307 if (tcp_close(tp)) 1308 INP_WUNLOCK(inp); 1309 goto release; 1310 1311 case TCPS_FIN_WAIT_1: 1312 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1313 soisdisconnected(so); 1314 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1315 break; 1316 1317 default: 1318 log(LOG_ERR, 1319 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1320 __func__, tid, tcpstates[tp->t_state]); 1321 } 1322done: 1323 INP_WUNLOCK(inp); 1324 INP_INFO_RUNLOCK(&V_tcbinfo); 1325 CURVNET_RESTORE(); 1326 return (0); 1327} 1328 1329void 1330send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1331 int rst_status) 1332{ 1333 struct wrqe *wr; 1334 struct cpl_abort_rpl *cpl; 1335 1336 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1337 if (wr == NULL) { 1338 /* XXX */ 1339 panic("%s: allocation failure.", __func__); 1340 } 1341 cpl = wrtod(wr); 1342 1343 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1344 cpl->cmd = rst_status; 1345 1346 t4_wrq_tx(sc, wr); 1347} 1348 1349static int 1350abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1351{ 1352 switch (abort_reason) { 1353 case CPL_ERR_BAD_SYN: 1354 case CPL_ERR_CONN_RESET: 1355 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1356 case CPL_ERR_XMIT_TIMEDOUT: 1357 case CPL_ERR_PERSIST_TIMEDOUT: 1358 case CPL_ERR_FINWAIT2_TIMEDOUT: 1359 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1360 return (ETIMEDOUT); 1361 default: 1362 return (EIO); 1363 } 1364} 1365 1366/* 1367 * TCP RST from the peer, timeout, or some other such critical error. 1368 */ 1369static int 1370do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1371{ 1372 struct adapter *sc = iq->adapter; 1373 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1374 unsigned int tid = GET_TID(cpl); 1375 struct toepcb *toep = lookup_tid(sc, tid); 1376 struct sge_wrq *ofld_txq = toep->ofld_txq; 1377 struct inpcb *inp; 1378 struct tcpcb *tp; 1379#ifdef INVARIANTS 1380 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1381#endif 1382 1383 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1384 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1385 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1386 1387 if (toep->flags & TPF_SYNQE) 1388 return (do_abort_req_synqe(iq, rss, m)); 1389 1390 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1391 1392 if (negative_advice(cpl->status)) { 1393 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1394 __func__, cpl->status, tid, toep->flags); 1395 return (0); /* Ignore negative advice */ 1396 } 1397 1398 inp = toep->inp; 1399 CURVNET_SET(toep->vnet); 1400 INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ 1401 INP_WLOCK(inp); 1402 1403 tp = intotcpcb(inp); 1404 1405 CTR6(KTR_CXGBE, 1406 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1407 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1408 inp->inp_flags, cpl->status); 1409 1410 /* 1411 * If we'd initiated an abort earlier the reply to it is responsible for 1412 * cleaning up resources. Otherwise we tear everything down right here 1413 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1414 */ 1415 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1416 INP_WUNLOCK(inp); 1417 goto done; 1418 } 1419 toep->flags |= TPF_ABORT_SHUTDOWN; 1420 1421 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1422 struct socket *so = inp->inp_socket; 1423 1424 if (so != NULL) 1425 so_error_set(so, abort_status_to_errno(tp, 1426 cpl->status)); 1427 tp = tcp_close(tp); 1428 if (tp == NULL) 1429 INP_WLOCK(inp); /* re-acquire */ 1430 } 1431 1432 final_cpl_received(toep); 1433done: 1434 INP_INFO_RUNLOCK(&V_tcbinfo); 1435 CURVNET_RESTORE(); 1436 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1437 return (0); 1438} 1439 1440/* 1441 * Reply to the CPL_ABORT_REQ (send_reset) 1442 */ 1443static int 1444do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1445{ 1446 struct adapter *sc = iq->adapter; 1447 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1448 unsigned int tid = GET_TID(cpl); 1449 struct toepcb *toep = lookup_tid(sc, tid); 1450 struct inpcb *inp = toep->inp; 1451#ifdef INVARIANTS 1452 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1453#endif 1454 1455 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1456 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1457 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1458 1459 if (toep->flags & TPF_SYNQE) 1460 return (do_abort_rpl_synqe(iq, rss, m)); 1461 1462 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1463 1464 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1465 __func__, tid, toep, inp, cpl->status); 1466 1467 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1468 ("%s: wasn't expecting abort reply", __func__)); 1469 1470 INP_WLOCK(inp); 1471 final_cpl_received(toep); 1472 1473 return (0); 1474} 1475 1476static int 1477do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1478{ 1479 struct adapter *sc = iq->adapter; 1480 const struct cpl_rx_data *cpl = mtod(m, const void *); 1481 unsigned int tid = GET_TID(cpl); 1482 struct toepcb *toep = lookup_tid(sc, tid); 1483 struct inpcb *inp = toep->inp; 1484 struct tcpcb *tp; 1485 struct socket *so; 1486 struct sockbuf *sb; 1487 int len; 1488 uint32_t ddp_placed = 0; 1489 1490 if (__predict_false(toep->flags & TPF_SYNQE)) { 1491#ifdef INVARIANTS 1492 struct synq_entry *synqe = (void *)toep; 1493 1494 INP_WLOCK(synqe->lctx->inp); 1495 if (synqe->flags & TPF_SYNQE_HAS_L2TE) { 1496 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1497 ("%s: listen socket closed but tid %u not aborted.", 1498 __func__, tid)); 1499 } else { 1500 /* 1501 * do_pass_accept_req is still running and will 1502 * eventually take care of this tid. 1503 */ 1504 } 1505 INP_WUNLOCK(synqe->lctx->inp); 1506#endif 1507 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1508 toep, toep->flags); 1509 m_freem(m); 1510 return (0); 1511 } 1512 1513 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1514 1515 /* strip off CPL header */ 1516 m_adj(m, sizeof(*cpl)); 1517 len = m->m_pkthdr.len; 1518 1519 INP_WLOCK(inp); 1520 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1521 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1522 __func__, tid, len, inp->inp_flags); 1523 INP_WUNLOCK(inp); 1524 m_freem(m); 1525 return (0); 1526 } 1527 1528 tp = intotcpcb(inp); 1529 1530 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1531 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1532 1533 tp->rcv_nxt += len; 1534 if (tp->rcv_wnd < len) { 1535 KASSERT(toep->ulp_mode == ULP_MODE_RDMA, 1536 ("%s: negative window size", __func__)); 1537 } 1538 1539 tp->rcv_wnd -= len; 1540 tp->t_rcvtime = ticks; 1541 1542 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1543 DDP_LOCK(toep); 1544 so = inp_inpcbtosocket(inp); 1545 sb = &so->so_rcv; 1546 SOCKBUF_LOCK(sb); 1547 1548 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1549 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1550 __func__, tid, len); 1551 m_freem(m); 1552 SOCKBUF_UNLOCK(sb); 1553 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1554 DDP_UNLOCK(toep); 1555 INP_WUNLOCK(inp); 1556 1557 CURVNET_SET(toep->vnet); 1558 INP_INFO_RLOCK(&V_tcbinfo); 1559 INP_WLOCK(inp); 1560 tp = tcp_drop(tp, ECONNRESET); 1561 if (tp) 1562 INP_WUNLOCK(inp); 1563 INP_INFO_RUNLOCK(&V_tcbinfo); 1564 CURVNET_RESTORE(); 1565 1566 return (0); 1567 } 1568 1569 /* receive buffer autosize */ 1570 MPASS(toep->vnet == so->so_vnet); 1571 CURVNET_SET(toep->vnet); 1572 if (sb->sb_flags & SB_AUTOSIZE && 1573 V_tcp_do_autorcvbuf && 1574 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1575 len > (sbspace(sb) / 8 * 7)) { 1576 unsigned int hiwat = sb->sb_hiwat; 1577 unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, 1578 V_tcp_autorcvbuf_max); 1579 1580 if (!sbreserve_locked(sb, newsize, so, NULL)) 1581 sb->sb_flags &= ~SB_AUTOSIZE; 1582 else 1583 toep->rx_credits += newsize - hiwat; 1584 } 1585 1586 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1587 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1588 1589 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1590 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1591 __func__, tid, len); 1592 1593 if (changed) { 1594 if (toep->ddp.flags & DDP_SC_REQ) 1595 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1596 else { 1597 KASSERT(cpl->ddp_off == 1, 1598 ("%s: DDP switched on by itself.", 1599 __func__)); 1600 1601 /* Fell out of DDP mode */ 1602 toep->ddp.flags &= ~DDP_ON; 1603 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1604 __func__); 1605 1606 insert_ddp_data(toep, ddp_placed); 1607 } 1608 } 1609 1610 if (toep->ddp.flags & DDP_ON) { 1611 /* 1612 * CPL_RX_DATA with DDP on can only be an indicate. 1613 * Start posting queued AIO requests via DDP. The 1614 * payload that arrived in this indicate is appended 1615 * to the socket buffer as usual. 1616 */ 1617 handle_ddp_indicate(toep); 1618 } 1619 } 1620 1621 KASSERT(toep->sb_cc >= sbused(sb), 1622 ("%s: sb %p has more data (%d) than last time (%d).", 1623 __func__, sb, sbused(sb), toep->sb_cc)); 1624 toep->rx_credits += toep->sb_cc - sbused(sb); 1625 sbappendstream_locked(sb, m, 0); 1626 toep->sb_cc = sbused(sb); 1627 if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { 1628 int credits; 1629 1630 credits = send_rx_credits(sc, toep, toep->rx_credits); 1631 toep->rx_credits -= credits; 1632 tp->rcv_wnd += credits; 1633 tp->rcv_adv += credits; 1634 } 1635 1636 if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1637 sbavail(sb) != 0) { 1638 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1639 tid); 1640 ddp_queue_toep(toep); 1641 } 1642 sorwakeup_locked(so); 1643 SOCKBUF_UNLOCK_ASSERT(sb); 1644 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1645 DDP_UNLOCK(toep); 1646 1647 INP_WUNLOCK(inp); 1648 CURVNET_RESTORE(); 1649 return (0); 1650} 1651 1652#define S_CPL_FW4_ACK_OPCODE 24 1653#define M_CPL_FW4_ACK_OPCODE 0xff 1654#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE) 1655#define G_CPL_FW4_ACK_OPCODE(x) \ 1656 (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE) 1657 1658#define S_CPL_FW4_ACK_FLOWID 0 1659#define M_CPL_FW4_ACK_FLOWID 0xffffff 1660#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) 1661#define G_CPL_FW4_ACK_FLOWID(x) \ 1662 (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) 1663 1664#define S_CPL_FW4_ACK_CR 24 1665#define M_CPL_FW4_ACK_CR 0xff 1666#define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR) 1667#define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR) 1668 1669#define S_CPL_FW4_ACK_SEQVAL 0 1670#define M_CPL_FW4_ACK_SEQVAL 0x1 1671#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL) 1672#define G_CPL_FW4_ACK_SEQVAL(x) \ 1673 (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL) 1674#define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U) 1675 1676static int 1677do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1678{ 1679 struct adapter *sc = iq->adapter; 1680 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1681 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1682 struct toepcb *toep = lookup_tid(sc, tid); 1683 struct inpcb *inp; 1684 struct tcpcb *tp; 1685 struct socket *so; 1686 uint8_t credits = cpl->credits; 1687 struct ofld_tx_sdesc *txsd; 1688 int plen; 1689#ifdef INVARIANTS 1690 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1691#endif 1692 1693 /* 1694 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1695 * now this comes back carrying the credits for the flowc. 1696 */ 1697 if (__predict_false(toep->flags & TPF_SYNQE)) { 1698 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1699 ("%s: credits for a synq entry %p", __func__, toep)); 1700 return (0); 1701 } 1702 1703 inp = toep->inp; 1704 1705 KASSERT(opcode == CPL_FW4_ACK, 1706 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1707 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1708 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1709 1710 INP_WLOCK(inp); 1711 1712 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1713 INP_WUNLOCK(inp); 1714 return (0); 1715 } 1716 1717 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1718 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1719 1720 tp = intotcpcb(inp); 1721 1722 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1723 tcp_seq snd_una = be32toh(cpl->snd_una); 1724 1725#ifdef INVARIANTS 1726 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1727 log(LOG_ERR, 1728 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1729 __func__, snd_una, toep->tid, tp->snd_una); 1730 } 1731#endif 1732 1733 if (tp->snd_una != snd_una) { 1734 tp->snd_una = snd_una; 1735 tp->ts_recent_age = tcp_ts_getticks(); 1736 } 1737 } 1738 1739#ifdef VERBOSE_TRACES 1740 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1741#endif 1742 so = inp->inp_socket; 1743 txsd = &toep->txsd[toep->txsd_cidx]; 1744 plen = 0; 1745 while (credits) { 1746 KASSERT(credits >= txsd->tx_credits, 1747 ("%s: too many (or partial) credits", __func__)); 1748 credits -= txsd->tx_credits; 1749 toep->tx_credits += txsd->tx_credits; 1750 plen += txsd->plen; 1751 if (txsd->iv_buffer) { 1752 free(txsd->iv_buffer, M_CXGBE); 1753 txsd->iv_buffer = NULL; 1754 } 1755 txsd++; 1756 toep->txsd_avail++; 1757 KASSERT(toep->txsd_avail <= toep->txsd_total, 1758 ("%s: txsd avail > total", __func__)); 1759 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1760 txsd = &toep->txsd[0]; 1761 toep->txsd_cidx = 0; 1762 } 1763 } 1764 1765 if (toep->tx_credits == toep->tx_total) { 1766 toep->tx_nocompl = 0; 1767 toep->plen_nocompl = 0; 1768 } 1769 1770 if (toep->flags & TPF_TX_SUSPENDED && 1771 toep->tx_credits >= toep->tx_total / 4) { 1772#ifdef VERBOSE_TRACES 1773 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1774 tid); 1775#endif 1776 toep->flags &= ~TPF_TX_SUSPENDED; 1777 CURVNET_SET(toep->vnet); 1778 if (toep->ulp_mode == ULP_MODE_ISCSI) 1779 t4_push_pdus(sc, toep, plen); 1780 else if (tls_tx_key(toep)) 1781 t4_push_tls_records(sc, toep, plen); 1782 else 1783 t4_push_frames(sc, toep, plen); 1784 CURVNET_RESTORE(); 1785 } else if (plen > 0) { 1786 struct sockbuf *sb = &so->so_snd; 1787 int sbu; 1788 1789 SOCKBUF_LOCK(sb); 1790 sbu = sbused(sb); 1791 if (toep->ulp_mode == ULP_MODE_ISCSI) { 1792 1793 if (__predict_false(sbu > 0)) { 1794 /* 1795 * The data trasmitted before the tid's ULP mode 1796 * changed to ISCSI is still in so_snd. 1797 * Incoming credits should account for so_snd 1798 * first. 1799 */ 1800 sbdrop_locked(sb, min(sbu, plen)); 1801 plen -= min(sbu, plen); 1802 } 1803 sowwakeup_locked(so); /* unlocks so_snd */ 1804 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1805 } else { 1806#ifdef VERBOSE_TRACES 1807 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1808 tid, plen); 1809#endif 1810 sbdrop_locked(sb, plen); 1811 if (tls_tx_key(toep)) { 1812 struct tls_ofld_info *tls_ofld = &toep->tls; 1813 1814 MPASS(tls_ofld->sb_off >= plen); 1815 tls_ofld->sb_off -= plen; 1816 } 1817 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1818 t4_aiotx_queue_toep(toep); 1819 sowwakeup_locked(so); /* unlocks so_snd */ 1820 } 1821 SOCKBUF_UNLOCK_ASSERT(sb); 1822 } 1823 1824 INP_WUNLOCK(inp); 1825 1826 return (0); 1827} 1828 1829int 1830do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1831{ 1832 struct adapter *sc = iq->adapter; 1833 const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); 1834 unsigned int tid = GET_TID(cpl); 1835 struct toepcb *toep; 1836#ifdef INVARIANTS 1837 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1838#endif 1839 1840 KASSERT(opcode == CPL_SET_TCB_RPL, 1841 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1842 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1843 MPASS(iq != &sc->sge.fwq); 1844 1845 toep = lookup_tid(sc, tid); 1846 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1847 handle_ddp_tcb_rpl(toep, cpl); 1848 return (0); 1849 } 1850 1851 /* 1852 * TOM and/or other ULPs don't request replies for CPL_SET_TCB or 1853 * CPL_SET_TCB_FIELD requests. This can easily change and when it does 1854 * the dispatch code will go here. 1855 */ 1856#ifdef INVARIANTS 1857 panic("%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p", __func__, 1858 tid, iq); 1859#else 1860 log(LOG_ERR, "%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p\n", 1861 __func__, tid, iq); 1862#endif 1863 1864 return (0); 1865} 1866 1867void 1868t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1869 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1870{ 1871 struct wrqe *wr; 1872 struct cpl_set_tcb_field *req; 1873 struct ofld_tx_sdesc *txsd; 1874 1875 MPASS((cookie & ~M_COOKIE) == 0); 1876 1877 wr = alloc_wrqe(sizeof(*req), wrq); 1878 if (wr == NULL) { 1879 /* XXX */ 1880 panic("%s: allocation failure.", __func__); 1881 } 1882 req = wrtod(wr); 1883 1884 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1885 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1886 if (reply == 0) 1887 req->reply_ctrl |= htobe16(F_NO_REPLY); 1888 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1889 req->mask = htobe64(mask); 1890 req->val = htobe64(val); 1891 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1892 txsd = &toep->txsd[toep->txsd_pidx]; 1893 txsd->tx_credits = howmany(sizeof(*req), 16); 1894 txsd->plen = 0; 1895 KASSERT(toep->tx_credits >= txsd->tx_credits && 1896 toep->txsd_avail > 0, 1897 ("%s: not enough credits (%d)", __func__, 1898 toep->tx_credits)); 1899 toep->tx_credits -= txsd->tx_credits; 1900 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1901 toep->txsd_pidx = 0; 1902 toep->txsd_avail--; 1903 } 1904 1905 t4_wrq_tx(sc, wr); 1906} 1907 1908void 1909t4_init_cpl_io_handlers(void) 1910{ 1911 1912 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1913 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1914 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1915 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 1916 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1917 t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack); 1918} 1919 1920void 1921t4_uninit_cpl_io_handlers(void) 1922{ 1923 1924 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1925 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1926 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1927 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, NULL); 1928 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1929 t4_register_cpl_handler(CPL_FW4_ACK, NULL); 1930} 1931 1932/* 1933 * Use the 'backend3' field in AIO jobs to store the amount of data 1934 * sent by the AIO job so far and the 'backend4' field to hold an 1935 * error that should be reported when the job is completed. 1936 */ 1937#define aio_sent backend3 1938#define aio_error backend4 1939 1940#define jobtotid(job) \ 1941 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1942 1943static void 1944free_aiotx_buffer(struct aiotx_buffer *ab) 1945{ 1946 struct kaiocb *job; 1947 long status; 1948 int error; 1949 1950 if (refcount_release(&ab->refcount) == 0) 1951 return; 1952 1953 job = ab->job; 1954 error = job->aio_error; 1955 status = job->aio_sent; 1956 vm_page_unhold_pages(ab->ps.pages, ab->ps.npages); 1957 free(ab, M_CXGBE); 1958#ifdef VERBOSE_TRACES 1959 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1960 jobtotid(job), job, status, error); 1961#endif 1962 if (error == ECANCELED && status != 0) 1963 error = 0; 1964 if (error == ECANCELED) 1965 aio_cancel(job); 1966 else if (error) 1967 aio_complete(job, -1, error); 1968 else 1969 aio_complete(job, status, 0); 1970} 1971 1972static void 1973t4_aiotx_mbuf_free(struct mbuf *m, void *buffer, void *arg) 1974{ 1975 struct aiotx_buffer *ab = buffer; 1976 1977#ifdef VERBOSE_TRACES 1978 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1979 m->m_len, jobtotid(ab->job)); 1980#endif 1981 free_aiotx_buffer(ab); 1982} 1983 1984/* 1985 * Hold the buffer backing an AIO request and return an AIO transmit 1986 * buffer. 1987 */ 1988static int 1989hold_aio(struct kaiocb *job) 1990{ 1991 struct aiotx_buffer *ab; 1992 struct vmspace *vm; 1993 vm_map_t map; 1994 vm_offset_t start, end, pgoff; 1995 int n; 1996 1997 MPASS(job->backend1 == NULL); 1998 1999 /* 2000 * The AIO subsystem will cancel and drain all requests before 2001 * permitting a process to exit or exec, so p_vmspace should 2002 * be stable here. 2003 */ 2004 vm = job->userproc->p_vmspace; 2005 map = &vm->vm_map; 2006 start = (uintptr_t)job->uaiocb.aio_buf; 2007 pgoff = start & PAGE_MASK; 2008 end = round_page(start + job->uaiocb.aio_nbytes); 2009 start = trunc_page(start); 2010 n = atop(end - start); 2011 2012 ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK | 2013 M_ZERO); 2014 refcount_init(&ab->refcount, 1); 2015 ab->ps.pages = (vm_page_t *)(ab + 1); 2016 ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start, 2017 VM_PROT_WRITE, ab->ps.pages, n); 2018 if (ab->ps.npages < 0) { 2019 free(ab, M_CXGBE); 2020 return (EFAULT); 2021 } 2022 2023 KASSERT(ab->ps.npages == n, 2024 ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n)); 2025 2026 ab->ps.offset = pgoff; 2027 ab->ps.len = job->uaiocb.aio_nbytes; 2028 ab->job = job; 2029 job->backend1 = ab; 2030#ifdef VERBOSE_TRACES 2031 CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d", 2032 __func__, jobtotid(job), &ab->ps, job, ab->ps.npages); 2033#endif 2034 return (0); 2035} 2036 2037static void 2038t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) 2039{ 2040 struct adapter *sc; 2041 struct sockbuf *sb; 2042 struct file *fp; 2043 struct aiotx_buffer *ab; 2044 struct inpcb *inp; 2045 struct tcpcb *tp; 2046 struct mbuf *m; 2047 int error; 2048 bool moretocome, sendmore; 2049 2050 sc = td_adapter(toep->td); 2051 sb = &so->so_snd; 2052 SOCKBUF_UNLOCK(sb); 2053 fp = job->fd_file; 2054 ab = job->backend1; 2055 m = NULL; 2056 2057#ifdef MAC 2058 error = mac_socket_check_send(fp->f_cred, so); 2059 if (error != 0) 2060 goto out; 2061#endif 2062 2063 if (ab == NULL) { 2064 error = hold_aio(job); 2065 if (error != 0) 2066 goto out; 2067 ab = job->backend1; 2068 } 2069 2070 /* Inline sosend_generic(). */ 2071 2072 job->msgsnd = 1; 2073 2074 error = sblock(sb, SBL_WAIT); 2075 MPASS(error == 0); 2076 2077sendanother: 2078 m = m_get(M_WAITOK, MT_DATA); 2079 2080 SOCKBUF_LOCK(sb); 2081 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2082 SOCKBUF_UNLOCK(sb); 2083 sbunlock(sb); 2084 if ((so->so_options & SO_NOSIGPIPE) == 0) { 2085 PROC_LOCK(job->userproc); 2086 kern_psignal(job->userproc, SIGPIPE); 2087 PROC_UNLOCK(job->userproc); 2088 } 2089 error = EPIPE; 2090 goto out; 2091 } 2092 if (so->so_error) { 2093 error = so->so_error; 2094 so->so_error = 0; 2095 SOCKBUF_UNLOCK(sb); 2096 sbunlock(sb); 2097 goto out; 2098 } 2099 if ((so->so_state & SS_ISCONNECTED) == 0) { 2100 SOCKBUF_UNLOCK(sb); 2101 sbunlock(sb); 2102 error = ENOTCONN; 2103 goto out; 2104 } 2105 if (sbspace(sb) < sb->sb_lowat) { 2106 MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); 2107 2108 /* 2109 * Don't block if there is too little room in the socket 2110 * buffer. Instead, requeue the request. 2111 */ 2112 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2113 SOCKBUF_UNLOCK(sb); 2114 sbunlock(sb); 2115 error = ECANCELED; 2116 goto out; 2117 } 2118 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2119 SOCKBUF_UNLOCK(sb); 2120 sbunlock(sb); 2121 goto out; 2122 } 2123 2124 /* 2125 * Write as much data as the socket permits, but no more than a 2126 * a single sndbuf at a time. 2127 */ 2128 m->m_len = sbspace(sb); 2129 if (m->m_len > ab->ps.len - job->aio_sent) { 2130 m->m_len = ab->ps.len - job->aio_sent; 2131 moretocome = false; 2132 } else 2133 moretocome = true; 2134 if (m->m_len > sc->tt.sndbuf) { 2135 m->m_len = sc->tt.sndbuf; 2136 sendmore = true; 2137 } else 2138 sendmore = false; 2139 2140 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 2141 moretocome = true; 2142 SOCKBUF_UNLOCK(sb); 2143 MPASS(m->m_len != 0); 2144 2145 /* Inlined tcp_usr_send(). */ 2146 2147 inp = toep->inp; 2148 INP_WLOCK(inp); 2149 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 2150 INP_WUNLOCK(inp); 2151 sbunlock(sb); 2152 error = ECONNRESET; 2153 goto out; 2154 } 2155 2156 refcount_acquire(&ab->refcount); 2157 m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab, 2158 (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV); 2159 m->m_ext.ext_flags |= EXT_FLAG_AIOTX; 2160 job->aio_sent += m->m_len; 2161 2162 sbappendstream(sb, m, 0); 2163 m = NULL; 2164 2165 if (!(inp->inp_flags & INP_DROPPED)) { 2166 tp = intotcpcb(inp); 2167 if (moretocome) 2168 tp->t_flags |= TF_MORETOCOME; 2169 error = tp->t_fb->tfb_tcp_output(tp); 2170 if (moretocome) 2171 tp->t_flags &= ~TF_MORETOCOME; 2172 } 2173 2174 INP_WUNLOCK(inp); 2175 if (sendmore) 2176 goto sendanother; 2177 sbunlock(sb); 2178 2179 if (error) 2180 goto out; 2181 2182 /* 2183 * If this is a non-blocking socket and the request has not 2184 * been fully completed, requeue it until the socket is ready 2185 * again. 2186 */ 2187 if (job->aio_sent < job->uaiocb.aio_nbytes && 2188 !(so->so_state & SS_NBIO)) { 2189 SOCKBUF_LOCK(sb); 2190 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2191 SOCKBUF_UNLOCK(sb); 2192 error = ECANCELED; 2193 goto out; 2194 } 2195 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2196 return; 2197 } 2198 2199 /* 2200 * If the request will not be requeued, drop a reference on 2201 * the the aiotx buffer. Any mbufs in flight should still 2202 * contain a reference, but this drops the reference that the 2203 * job owns while it is waiting to queue mbufs to the socket. 2204 */ 2205 free_aiotx_buffer(ab); 2206 2207out: 2208 if (error) { 2209 if (ab != NULL) { 2210 job->aio_error = error; 2211 free_aiotx_buffer(ab); 2212 } else { 2213 MPASS(job->aio_sent == 0); 2214 aio_complete(job, -1, error); 2215 } 2216 } 2217 if (m != NULL) 2218 m_free(m); 2219 SOCKBUF_LOCK(sb); 2220} 2221 2222static void 2223t4_aiotx_task(void *context, int pending) 2224{ 2225 struct toepcb *toep = context; 2226 struct inpcb *inp = toep->inp; 2227 struct socket *so = inp->inp_socket; 2228 struct kaiocb *job; 2229 2230 CURVNET_SET(toep->vnet); 2231 SOCKBUF_LOCK(&so->so_snd); 2232 while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { 2233 job = TAILQ_FIRST(&toep->aiotx_jobq); 2234 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2235 if (!aio_clear_cancel_function(job)) 2236 continue; 2237 2238 t4_aiotx_process_job(toep, so, job); 2239 } 2240 toep->aiotx_task_active = false; 2241 SOCKBUF_UNLOCK(&so->so_snd); 2242 CURVNET_RESTORE(); 2243 2244 free_toepcb(toep); 2245} 2246 2247static void 2248t4_aiotx_queue_toep(struct toepcb *toep) 2249{ 2250 2251 SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); 2252#ifdef VERBOSE_TRACES 2253 CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", 2254 __func__, toep->tid, toep->aiotx_task_active ? "true" : "false"); 2255#endif 2256 if (toep->aiotx_task_active) 2257 return; 2258 toep->aiotx_task_active = true; 2259 hold_toepcb(toep); 2260 soaio_enqueue(&toep->aiotx_task); 2261} 2262 2263static void 2264t4_aiotx_cancel(struct kaiocb *job) 2265{ 2266 struct aiotx_buffer *ab; 2267 struct socket *so; 2268 struct sockbuf *sb; 2269 struct tcpcb *tp; 2270 struct toepcb *toep; 2271 2272 so = job->fd_file->f_data; 2273 tp = so_sototcpcb(so); 2274 toep = tp->t_toe; 2275 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2276 sb = &so->so_snd; 2277 2278 SOCKBUF_LOCK(sb); 2279 if (!aio_cancel_cleared(job)) 2280 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2281 SOCKBUF_UNLOCK(sb); 2282 2283 ab = job->backend1; 2284 if (ab != NULL) 2285 free_aiotx_buffer(ab); 2286 else 2287 aio_cancel(job); 2288} 2289 2290int 2291t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2292{ 2293 struct tcpcb *tp = so_sototcpcb(so); 2294 struct toepcb *toep = tp->t_toe; 2295 struct adapter *sc = td_adapter(toep->td); 2296 2297 /* This only handles writes. */ 2298 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2299 return (EOPNOTSUPP); 2300 2301 if (!sc->tt.tx_zcopy) 2302 return (EOPNOTSUPP); 2303 2304 if (tls_tx_key(toep)) 2305 return (EOPNOTSUPP); 2306 2307 SOCKBUF_LOCK(&so->so_snd); 2308#ifdef VERBOSE_TRACES 2309 CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job); 2310#endif 2311 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2312 panic("new job was cancelled"); 2313 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2314 if (sowriteable(so)) 2315 t4_aiotx_queue_toep(toep); 2316 SOCKBUF_UNLOCK(&so->so_snd); 2317 return (0); 2318} 2319 2320void 2321aiotx_init_toep(struct toepcb *toep) 2322{ 2323 2324 TAILQ_INIT(&toep->aiotx_jobq); 2325 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2326} 2327#endif 2328