/* cxgb_cpl_io.c revision 315805 */
1/*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 315805 2017-03-23 06:28:34Z mav $"); 29 30#include "opt_inet.h" 31 32#ifdef TCP_OFFLOAD 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <sys/sockbuf.h> 47#include <sys/sysctl.h> 48#include <sys/syslog.h> 49#include <sys/protosw.h> 50#include <sys/priv.h> 51#include <sys/sglist.h> 52#include <sys/taskqueue.h> 53 54#include <net/if.h> 55#include <net/if_var.h> 56#include <net/ethernet.h> 57#include <net/route.h> 58 59#include <netinet/in.h> 60#include <netinet/in_pcb.h> 61#include <netinet/in_systm.h> 62#include <netinet/in_var.h> 63 64#include <netinet/ip.h> 65#define TCPSTATES 66#include <netinet/tcp_fsm.h> 67#include <netinet/tcp_var.h> 68#include <netinet/toecore.h> 69#include <netinet/tcp_seq.h> 70#include <netinet/tcp_timer.h> 71#include <net/route.h> 72 73#include "cxgb_include.h" 74#include "ulp/tom/cxgb_l2t.h" 75#include "ulp/tom/cxgb_tom.h" 76#include "ulp/tom/cxgb_toepcb.h" 77 78VNET_DECLARE(int, tcp_do_autosndbuf); 79#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) 80VNET_DECLARE(int, tcp_autosndbuf_inc); 81#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) 82VNET_DECLARE(int, tcp_autosndbuf_max); 83#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) 84VNET_DECLARE(int, tcp_do_autorcvbuf); 85#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 86VNET_DECLARE(int, tcp_autorcvbuf_inc); 87#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 88VNET_DECLARE(int, tcp_autorcvbuf_max); 89#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 90extern int always_keepalive; 91 92/* 93 * For ULP connections HW may add headers, e.g., for digests, that aren't part 94 * 
of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/* Extract the DSCP/TOS bits used by the hardware from an inp's IP TOS. */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

/* Forward declarations (defined later in this file). */
static void t3_release_offload_resources(struct toepcb *);
static void send_reset(struct toepcb *toep);

/*
 * Called after the last CPL for the toepcb has been received.
 *
 * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
 * time this function exits.
 */
static int
toepcb_release(struct toepcb *toep)
{
	struct inpcb *inp = toep->tp_inp;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	int rc;

	INP_WLOCK_ASSERT(inp);
	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
	    ("%s: double release?", __func__));

	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);

	/* Mark CPL traffic done and sever the toepcb -> inp link. */
	toep->tp_flags |= TP_CPL_DONE;
	toep->tp_inp = NULL;

	/* Take the toepcb off the active list. */
	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	/*
	 * If the stack side has already detached (TP_ATTACHED cleared), we are
	 * the last user and can free the offload resources now.
	 */
	if (!(toep->tp_flags & TP_ATTACHED))
		t3_release_offload_resources(toep);

	/* Drop the inp reference taken in offload_socket(). */
	rc = in_pcbrele_wlocked(inp);
	if (!rc)
		INP_WUNLOCK(inp);
	return (rc);
}

/*
 * One sided detach.  The tcpcb is going away and we need to unhook the toepcb
 * hanging off it.  If the TOE driver is also done with the toepcb we'll release
 * all offload resources.
 */
static void
toepcb_detach(struct inpcb *inp)
{
	struct toepcb *toep;
	struct tcpcb *tp;

	KASSERT(inp, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);

	tp = intotcpcb(inp);
	toep = tp->t_toe;

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));

	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
	    toep, inp, tp);

	/* Unhook the toepcb from the tcpcb. */
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toep->tp_flags &= ~TP_ATTACHED;

	/* If the last CPL has already arrived, free the offload state too. */
	if (toep->tp_flags & TP_CPL_DONE)
		t3_release_offload_resources(toep);
}

/* toedev detach hook: forward to toepcb_detach on the connection's inp. */
void
t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{

	toepcb_detach(tp->t_inpcb);
}

/*
 * Pop an entry off the active-open TID free list and bind it to ctx.
 * Returns -1 when the free list is empty.
 */
static int
alloc_atid(struct tid_info *t, void *ctx)
{
	int atid = -1;

	mtx_lock(&t->atid_lock);
	if (t->afree) {
		union active_open_entry *p = t->afree;

		atid = (p - t->atid_tab) + t->atid_base;
		t->afree = p->next;
		p->ctx = ctx;
		t->atids_in_use++;
	}
	mtx_unlock(&t->atid_lock);

	return (atid);
}

/* Return an atid to the head of the free list. */
static void
free_atid(struct tid_info *t, int atid)
{
	union active_open_entry *p = atid2entry(t, atid);

	mtx_lock(&t->atid_lock);
	p->next = t->afree;
	t->afree = p;
	t->atids_in_use--;
	mtx_unlock(&t->atid_lock);
}

/* Install ctx in the TID table and bump the in-use count. */
void
insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, 1);
}

/* Replace the context for an already-inserted TID (count unchanged). */
void
update_tid(struct tom_data *td, void *ctx, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = ctx;
}

/* Clear a TID table slot and drop the in-use count. */
void
remove_tid(struct tom_data *td, unsigned int tid)
{
	struct tid_info *t = &td->tid_maps;

	t->tid_tab[tid] = NULL;
	atomic_add_int(&t->tids_in_use, -1);
}

/* use ctx as a next pointer in the tid release list */
void
queue_tid_release(struct toedev *tod, unsigned int tid)
{
	struct tom_data *td = t3_tomdata(tod);
	void **p = &td->tid_maps.tid_tab[tid];
	struct adapter *sc = tod->tod_softc;

	mtx_lock(&td->tid_release_lock);
	/* Chain this slot onto the deferred-release list. */
	*p = td->tid_release_list;
	td->tid_release_list = p;
	/* List was empty before this entry: kick the release task. */
	if (!*p)
		taskqueue_enqueue(sc->tq, &td->tid_release_task);
	mtx_unlock(&td->tid_release_lock);
}

/*
 * Populate a TID_RELEASE WR.
 */
static inline void
mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
{

	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Return a TID to the hardware.  If no mbuf is available the release is
 * deferred to the tid_release task via queue_tid_release().
 */
void
release_tid(struct toedev *tod, unsigned int tid, int qset)
{
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;
	struct cpl_tid_release *cpl;
#ifdef INVARIANTS
	struct tid_info *t = &td->tid_maps;
#endif

	KASSERT(tid < t->ntids,
	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m) {
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
	} else
		queue_tid_release(tod, tid);

}

/*
 * Taskqueue handler: drain the deferred TID release list, sending one
 * CPL_TID_RELEASE per entry.
 */
void
t3_process_tid_release_list(void *data, int pending)
{
	struct mbuf *m;
	struct tom_data *td = data;
	struct adapter *sc = td->tod.tod_softc;

	mtx_lock(&td->tid_release_lock);
	while (td->tid_release_list) {
		void **p = td->tid_release_list;
		/* The slot's position in tid_tab encodes the TID itself. */
		unsigned int tid = p - td->tid_maps.tid_tab;
		struct cpl_tid_release *cpl;

		td->tid_release_list = (void **)*p;
		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
		if (m == NULL)
			break;	/* XXX: who reschedules the release task? */
		/* Drop the lock while transmitting. */
		mtx_unlock(&td->tid_release_lock);
		mk_tid_release(cpl, tid);
		t3_offload_tx(sc, m);
		remove_tid(td, tid);
		mtx_lock(&td->tid_release_lock);
	}
	mtx_unlock(&td->tid_release_lock);
}

/*
 * Send a CPL_CLOSE_CON_REQ (FIN) for this connection, at most once
 * (guarded by TP_FIN_SENT).
 */
static void
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;

	if (toep->tp_flags & TP_FIN_SENT)
		return;

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
	req->rsvd = 0;

	toep->tp_flags |= TP_FIN_SENT;
	t3_offload_tx(sc, m);
}

/*
 * Fill out a TX_DATA work request header for 'len' bytes of payload.
 * 'tail' is the first unsent mbuf after this WR (NULL means we are flushing
 * everything, which enables TX_SHOVE).
 */
static inline void
make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
    struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
	req->sndseq = htonl(tp->snd_nxt);
	/* First WR on this connection carries extra init parameters. */
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		struct adapter *sc = toep->tp_tod->tod_softc;
		int cpu_idx = sc->rrss_map[toep->tp_qset];

		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(cpu_idx));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));

		toep->tp_flags |= TP_DATASENT;
	}
}

/*
 * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
 * TOM_XXX_MOVE to some common header file.
 */
/*
 * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
 * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
 * for the second gen bit flit.  This leaves us with 12 flits.
 *
 * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
 * The first desc has a tx_data_wr (which includes the WR header), the rest have
 * the WR header only.  All descs have the second gen bit flit.
 *
 * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
 * desc has a tx_data_wr (which includes the WR header), the rest have the WR
 * header only.  All descs have the second gen bit flit.
 *
 * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
 *
 */
#define IMM_LEN 96
static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
static int sgllen_to_descs[TX_MAX_SEGS] = {
	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/* 0 - 9 */
	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
	4, 4, 4, 4, 4, 4		/* 30 - 35 */
};
#if 0
static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
};
#endif
#if SGE_NUM_GENBITS != 2
#error "SGE_NUM_GENBITS really must be 2"
#endif

/*
 * Transmit as much pending send-buffer data as WR credits allow, one work
 * request at a time.  Small runs of data are copied inline into an immediate
 * WR; larger runs are described with an SGL.  Returns the total number of
 * payload bytes handed to the hardware.
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m0, *sndptr, *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	int bytes, ndesc, total_bytes = 0, mlen;
	struct sockbuf *snd;
	struct sglist *sgl;
	struct ofld_hdr *oh;
	caddr_t dst;
	struct tx_data_wr *wr;

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	SOCKBUF_LOCK(snd);

	/*
	 * Autosize the send buffer.
	 */
	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
		if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) &&
		    sbused(snd) < VNET(tcp_autosndbuf_max)) {
			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
			    so, curthread))
				snd->sb_flags &= ~SB_AUTOSIZE;
		}
	}

	/* Resume where the previous call left off, if possible. */
	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
		sndptr = toep->tp_m_last->m_next;
	else
		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	/* Nothing to send or no WRs available for sending data */
	if (toep->tp_wr_avail == 0 || sndptr == NULL)
		goto out;

	/* Something to send and at least 1 WR available */
	while (toep->tp_wr_avail && sndptr != NULL) {

		m0 = m_gethdr(M_NOWAIT, MT_DATA);
		if (m0 == NULL)
			break;
		oh = mtod(m0, struct ofld_hdr *);
		wr = (void *)(oh + 1);
		dst = (void *)(wr + 1);

		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
		    V_HDR_QSET(toep->tp_qset);

		/*
		 * Try to construct an immediate data WR if possible.  Stuff as
		 * much data into it as possible, one whole mbuf at a time.
		 */
		mlen = sndptr->m_len;
		ndesc = bytes = 0;
		while (mlen <= IMM_LEN - bytes) {
			bcopy(sndptr->m_data, dst, mlen);
			bytes += mlen;
			dst += mlen;

			if (!(sndptr = sndptr->m_next))
				break;
			mlen = sndptr->m_len;
		}

		if (bytes) {

			/* Was able to fit 'bytes' bytes in an immediate WR */

			ndesc = 1;
			make_tx_data_wr(so, wr, bytes, sndptr);

			m0->m_len += bytes;
			m0->m_pkthdr.len = m0->m_len;

		} else {
			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);

			/* Need to make an SGL */

			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
			if (sgl == NULL)
				break;

			for (m = sndptr; m != NULL; m = m->m_next) {
				if ((mlen = m->m_len) > 0) {
					if (sglist_append(sgl, m->m_data, mlen))
						break;
				}
				bytes += mlen;
			}
			sndptr = m;
			if (bytes == 0) {
				sglist_free(sgl);
				break;
			}
			ndesc = sgllen_to_descs[sgl->sg_nseg];
			oh->flags |= F_HDR_SGL;
			oh->sgl = sgl;
			make_tx_data_wr(so, wr, bytes, sndptr);
		}

		oh->flags |= V_HDR_NDESC(ndesc);
		oh->plen = bytes;

		/* Advance the send pointer past what was just queued. */
		snd->sb_sndptr = sndptr;
		snd->sb_sndptroff += bytes;
		if (sndptr == NULL) {
			snd->sb_sndptr = snd->sb_mbtail;
			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
			toep->tp_m_last = snd->sb_mbtail;
		} else
			toep->tp_m_last = NULL;

		total_bytes += bytes;

		toep->tp_wr_avail -= ndesc;
		toep->tp_wr_unacked += ndesc;

		/* Request a completion when enough WR credits are outstanding. */
		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		enqueue_wr(toep, m0);
		l2t_send(sc, m0, toep->tp_l2t);
	}
out:
	SOCKBUF_UNLOCK(snd);

	/* Everything sent and a FIN is pending: send the FIN now. */
	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
		close_conn(sc, toep);

	return (total_bytes);
}

/*
 * Return 'credits' receive-window credits to the hardware via a
 * CPL_RX_DATA_ACK.  Returns the credits actually sent (0 on mbuf shortage).
 */
static int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
	if (m == NULL)
		return (0);

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wrh_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	t3_offload_tx(sc, m);
	return (credits);
}

/*
 * Called when the application has consumed receive data; accumulates freed
 * buffer space as rx credits and returns them to the hardware when enough
 * have built up.
 */
void
t3_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *so_rcv = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int must_send;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(so_rcv);
	KASSERT(toep->tp_enqueued >= sbused(so_rcv),
	    ("%s: sbused(so_rcv) > enqueued", __func__));
	/* Data consumed since last call becomes returnable credits. */
	toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv);
	toep->tp_enqueued = sbused(so_rcv);
	SOCKBUF_UNLOCK(so_rcv);

	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
	if (must_send || toep->tp_rx_credits >= 15 * 1024) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
		toep->tp_rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
}

/* CPL handler for RX_URG_NOTIFY: just log and drop the message. */
static int
do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);

	log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp);

	m_freem(m);
	return (0);
}

/*
 * Mark the connection for FIN transmission and push any remaining data;
 * the FIN itself goes out via close_conn() once the send buffer drains.
 */
int
t3_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp_inpcbtosocket(inp);
#if defined(KTR)
	unsigned int tid = toep->tp_tid;
#endif

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
	    toep->tp_flags);

	toep->tp_flags |= TP_SEND_FIN;
	t3_push_frames(so, 1);

	return (0);
}

/* toedev output hook: push pending send-buffer data to the hardware. */
int
t3_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;

	t3_push_frames(so, 1);
	return (0);
}

/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
{
	unsigned short *mtus = &sc->params.mtus[0];
	int i = 0, mss;

	KASSERT(inc != NULL || pmss > 0,
	    ("%s: at least one of inc/pmss must be specified", __func__));

	mss = inc ?
	    tcp_mssopt(inc) : pmss;
	if (pmss > 0 && mss > pmss)
		mss = pmss;

	/* Pick the largest MTU that still fits mss + 40 bytes of headers. */
	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
		++i;

	return (i);
}

/*
 * Free all work-request mbufs still queued on the toepcb, including any
 * SGLs attached to them.
 */
static inline void
purge_wr_queue(struct toepcb *toep)
{
	struct mbuf *m;
	struct ofld_hdr *oh;

	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) {
		oh = mtod(m, struct ofld_hdr *);
		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(m);
	}
}

/*
 * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T
 * entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	/*
	 * The TOM explicitly detaches its toepcb from the system's inp before
	 * it releases the offload resources.
	 */
	if (toep->tp_inp) {
		panic("%s: inp %p still attached to toepcb %p",
		    __func__, toep->tp_inp, toep);
	}

	/* Outstanding WR credits imply queued WRs that must be purged. */
	if (toep->tp_wr_avail != toep->tp_wr_max)
		purge_wr_queue(toep);

	if (toep->tp_l2t) {
		l2t_release(td->l2t, toep->tp_l2t);
		toep->tp_l2t = NULL;
	}

	if (toep->tp_tid >= 0)
		release_tid(tod, toep->tp_tid, toep->tp_qset);

	toepcb_free(toep);
}

/*
 * Determine the receive window size for a socket.
 */
unsigned long
select_rcv_wnd(struct socket *so)
{
	unsigned long wnd;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	wnd = sbspace(&so->so_rcv);
	/* Clamp into [MIN_RCV_WND, MAX_RCV_WND]. */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	return min(wnd, MAX_RCV_WND);
}

/*
 * Smallest window scale that lets the hardware's maximum receive window
 * (capped by sb_max) be represented.
 */
int
select_rcv_wscale(void)
{
	int wscale = 0;
	unsigned long space = sb_max;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;

	return (wscale);
}


/*
 * Set up the socket for TCP offload.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	SOCKBUF_LOCK(&so->so_snd);
	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Update TCP PCB */
	tp->tod = toep->tp_tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->tp_inp = inp;
	toep->tp_flags |= TP_ATTACHED;
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tod = toep->tp_tod;
	struct tom_data *td = t3_tomdata(tod);

	INP_WLOCK_ASSERT(inp);

	/* Undo everything offload_socket() did, in reverse. */
	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->tp_inp = NULL;
	toep->tp_flags &= ~TP_ATTACHED;
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/*
 * Socket could be a listening socket, and we may not have a toepcb at all at
 * this time.
 */
uint32_t
calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
{
	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
	    V_MSS_IDX(mtu_idx);

	if (so != NULL) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);
		int keepalive = always_keepalive ||
		    so_options_get(so) & SO_KEEPALIVE;

		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
		opt0h |= V_KEEP_ALIVE(keepalive != 0);
	}

	if (e != NULL)
		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);

	return (htobe32(opt0h));
}

/*
 * Compute the low word of option0 for a connection; rcv_bufsize is in
 * units checked against M_RCV_BUFSIZ.
 */
uint32_t
calc_opt0l(struct socket *so, int rcv_bufsize)
{
	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);

	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));

	if (so != NULL)		/* optional because no one cares about IP TOS */
		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));

	return (htobe32(opt0l));
}

/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (EAGAIN);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EAGAIN);
	default:
		return (EIO);
	}
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Active open failed.
 */
static int
do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	struct cpl_act_open_rpl *rpl = mtod(m, void *);
	unsigned int atid = G_TID(ntohl(rpl->atid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	int s = rpl->status, rc;

	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);

	free_atid(&td->tid_maps, atid);
	toep->tp_tid = -1;

	/* Some failure modes still hold a real TID that must be returned. */
	if (act_open_has_tid(s))
		queue_tid_release(tod, GET_TID(rpl));

	rc = act_open_rpl_status_to_errno(s);
	/* EAGAIN retries do not tear the connection down; no tcbinfo lock. */
	if (rc != EAGAIN)
		INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	toe_connect_failed(tod, inp, rc);
	toepcb_release(toep);	/* unlocks inp */
	if (rc != EAGAIN)
		INP_INFO_RUNLOCK(&V_tcbinfo);

	m_freem(m);
	return (0);
}

/*
 * Send an active open request.
 *
 * State of affairs on entry:
 *  soisconnecting (so_state |= SS_ISCONNECTING)
 *  tcbinfo not locked (this has changed - used to be WLOCKed)
 *  inp WLOCKed
 *  tp->t_state = TCPS_SYN_SENT
 *  rtalloc1, RT_UNLOCK on rt.
 */
int
t3_connect(struct toedev *tod, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m = NULL;
	struct l2t_entry *e = NULL;
	struct tom_data *td = t3_tomdata(tod);
	struct adapter *sc = tod->tod_softc;
	struct cpl_act_open_req *cpl;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep;
	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
	struct sockaddr *gw;
	struct ifnet *ifp = rt->rt_ifp;
	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */

	INP_WLOCK_ASSERT(inp);

	toep = toepcb_alloc(tod);
	if (toep == NULL)
		goto failed;

	atid = alloc_atid(&td->tid_maps, toep);
	if (atid < 0)
		goto failed;

	/* Spread connections across this port's queue sets. */
	qset = pi->first_qset + (arc4random() % pi->nqsets);

	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
	if (m == NULL)
		goto failed;

	/* L2 next hop is the gateway if routed, else the peer itself. */
	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
	e = t3_l2t_get(pi, ifp, gw);
	if (e == NULL)
		goto failed;

	toep->tp_l2t = e;
	toep->tp_tid = atid;	/* used to double check response */
	toep->tp_qset = qset;

	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	offload_socket(so, toep);

	/*
	 * The kernel sets request_r_scale based on sb_max whereas we need to
	 * take hardware's MAX_RCV_WND into account too.  This is normally a
	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
	 */
	if (tp->t_flags & TF_REQ_SCALE)
		rscale = tp->request_r_scale = select_rcv_wscale();
	else
		rscale = 0;
	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
	cpu_idx = sc->rrss_map[qset];

	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
	cpl->wr.wrh_lo = 0;
	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
	    &cpl->peer_port);
	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
	cpl->params = 0;
	cpl->opt2 = calc_opt2(cpu_idx);

	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);

	if (l2t_send(sc, m, e) == 0)
		return (0);

	/* l2t_send failed: back out the offload conversion. */
	undo_offload_socket(so);

failed:
	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
	    __func__, atid, toep, e, m);

	/* Unwind whichever resources were acquired before the failure. */
	if (atid >= 0)
		free_atid(&td->tid_maps, atid);

	if (e)
		l2t_release(td->l2t, e);

	if (toep)
		toepcb_free(toep);

	m_freem(m);

	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
 * send multiple ABORT_REQs for the same connection and also that we do not try
 * to send a message after the connection has closed.
 */
static void
send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	struct inpcb *inp = toep->tp_inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(inp);

	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
	    toep->tp_flags);

	/* Only one ABORT_REQ per connection. */
	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
		return;

	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);

	/* Purge the send queue */
	sbflush(so_sockbuf_snd(so));
	purge_wr_queue(toep);

	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
	if (m == NULL)
		CXGB_UNIMPLEMENTED();

	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wrh_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = htonl(tp->snd_nxt);
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/* Connection not yet established: defer the abort until it is. */
	if (tp->t_state == TCPS_SYN_SENT)
		(void )mbufq_enqueue(&toep->out_of_order_queue, m);	/* defer */
	else
		l2t_send(sc, m, toep->tp_l2t);
}

/* toedev reset hook: abort the offloaded connection with an RST. */
int
t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
{

	send_reset(tp->t_toe);
	return (0);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_rx_data *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *so_rcv;

	/* Advance over CPL */
	m_adj(m, sizeof(*hdr));

	/* XXX: revisit.  This comes from the T4 TOM */
	if (__predict_false(inp == NULL)) {
		/*
		 * do_pass_establish failed and must be attempting to abort the
		 * connection.  Meanwhile, the T4 has sent us data for such a
		 * connection.
		 */
#ifdef notyet
		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
		    ("%s: inp NULL and tid isn't being aborted", __func__));
#endif
		m_freem(m);
		return (0);
	}

	INP_WLOCK(inp);
	/* Connection already dropped or in timewait: discard the data. */
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	/* Track the hardware's current delayed-ack mode. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
		toep->tp_delack_mode = hdr->dack_mode;

	tp = intotcpcb(inp);

#ifdef INVARIANTS
	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
		log(LOG_ERR,
		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
	}
#endif
	/* Account the payload against the receive window. */
	tp->rcv_nxt += m->m_pkthdr.len;
	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
	    ("%s: negative window size", __func__));
	tp->rcv_wnd -= m->m_pkthdr.len;
	tp->t_rcvtime = ticks;

	so = inp->inp_socket;
	so_rcv = &so->so_rcv;
	SOCKBUF_LOCK(so_rcv);

	/* Receive side already shut down: drop the connection. */
	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, m->m_pkthdr.len);
		SOCKBUF_UNLOCK(so_rcv);
		INP_WUNLOCK(inp);

		/* Reacquire with the tcbinfo lock held, as tcp_drop needs. */
		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);

		m_freem(m);
		return (0);
	}

	/* receive buffer autosize */
	if (so_rcv->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
		unsigned int hiwat = so_rcv->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
			so_rcv->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->tp_rx_credits += newsize - hiwat;
	}

	/* Hand the data to the socket and wake up readers. */
	toep->tp_enqueued += m->m_pkthdr.len;
	sbappendstream_locked(so_rcv, m, 0);
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(so_rcv);

	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	const struct cpl_peer_close *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ?
tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp); 1231 1232 if (toep->tp_flags & TP_ABORT_RPL_PENDING) 1233 goto done; 1234 1235 so = inp_inpcbtosocket(inp); 1236 1237 socantrcvmore(so); 1238 tp->rcv_nxt++; 1239 1240 switch (tp->t_state) { 1241 case TCPS_SYN_RECEIVED: 1242 tp->t_starttime = ticks; 1243 /* FALLTHROUGH */ 1244 case TCPS_ESTABLISHED: 1245 tp->t_state = TCPS_CLOSE_WAIT; 1246 break; 1247 case TCPS_FIN_WAIT_1: 1248 tp->t_state = TCPS_CLOSING; 1249 break; 1250 case TCPS_FIN_WAIT_2: 1251 tcp_twstart(tp); 1252 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1253 INP_INFO_RUNLOCK(&V_tcbinfo); 1254 1255 INP_WLOCK(inp); 1256 toepcb_release(toep); /* no more CPLs expected */ 1257 1258 m_freem(m); 1259 return (0); 1260 default: 1261 log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", 1262 __func__, toep->tp_tid, tp->t_state); 1263 } 1264 1265done: 1266 INP_WUNLOCK(inp); 1267 INP_INFO_RUNLOCK(&V_tcbinfo); 1268 1269 m_freem(m); 1270 return (0); 1271} 1272 1273/* 1274 * Handler for CLOSE_CON_RPL CPL messages. peer ACK to our FIN received. 1275 */ 1276static int 1277do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1278{ 1279 struct adapter *sc = qs->adap; 1280 struct tom_data *td = sc->tom_softc; 1281 const struct cpl_close_con_rpl *rpl = mtod(m, void *); 1282 unsigned int tid = GET_TID(rpl); 1283 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 1284 struct inpcb *inp = toep->tp_inp; 1285 struct tcpcb *tp; 1286 struct socket *so; 1287 1288 INP_INFO_RLOCK(&V_tcbinfo); 1289 INP_WLOCK(inp); 1290 tp = intotcpcb(inp); 1291 1292 CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, 1293 tp ? 
tcpstates[tp->t_state] : "no tp", toep->tp_flags); 1294 1295 if ((toep->tp_flags & TP_ABORT_RPL_PENDING)) 1296 goto done; 1297 1298 so = inp_inpcbtosocket(inp); 1299 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ 1300 1301 switch (tp->t_state) { 1302 case TCPS_CLOSING: 1303 tcp_twstart(tp); 1304release: 1305 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1306 INP_INFO_RUNLOCK(&V_tcbinfo); 1307 1308 INP_WLOCK(inp); 1309 toepcb_release(toep); /* no more CPLs expected */ 1310 1311 m_freem(m); 1312 return (0); 1313 case TCPS_LAST_ACK: 1314 if (tcp_close(tp)) 1315 INP_WUNLOCK(inp); 1316 goto release; 1317 1318 case TCPS_FIN_WAIT_1: 1319 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1320 soisdisconnected(so); 1321 tp->t_state = TCPS_FIN_WAIT_2; 1322 break; 1323 default: 1324 log(LOG_ERR, 1325 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", 1326 __func__, toep->tp_tid, tp->t_state); 1327 } 1328 1329done: 1330 INP_WUNLOCK(inp); 1331 INP_INFO_RUNLOCK(&V_tcbinfo); 1332 1333 m_freem(m); 1334 return (0); 1335} 1336 1337static int 1338do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1339{ 1340 struct cpl_smt_write_rpl *rpl = mtod(m, void *); 1341 1342 if (rpl->status != CPL_ERR_NONE) { 1343 log(LOG_ERR, 1344 "Unexpected SMT_WRITE_RPL status %u for entry %u\n", 1345 rpl->status, GET_TID(rpl)); 1346 } 1347 1348 m_freem(m); 1349 return (0); 1350} 1351 1352static int 1353do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1354{ 1355 struct cpl_set_tcb_rpl *rpl = mtod(m, void *); 1356 1357 if (rpl->status != CPL_ERR_NONE) { 1358 log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n", 1359 rpl->status, GET_TID(rpl)); 1360 } 1361 1362 m_freem(m); 1363 return (0); 1364} 1365 1366/* 1367 * Handle an ABORT_RPL_RSS CPL message. 
1368 */ 1369static int 1370do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1371{ 1372 struct adapter *sc = qs->adap; 1373 struct tom_data *td = sc->tom_softc; 1374 const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); 1375 unsigned int tid = GET_TID(rpl); 1376 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 1377 struct inpcb *inp; 1378 1379 /* 1380 * Ignore replies to post-close aborts indicating that the abort was 1381 * requested too late. These connections are terminated when we get 1382 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 1383 * arrives the TID is either no longer used or it has been recycled. 1384 */ 1385 if (rpl->status == CPL_ERR_ABORT_FAILED) { 1386 m_freem(m); 1387 return (0); 1388 } 1389 1390 if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) 1391 return (do_abort_rpl_synqe(qs, r, m)); 1392 1393 CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, 1394 rpl->status); 1395 1396 inp = toep->tp_inp; 1397 INP_WLOCK(inp); 1398 1399 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 1400 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { 1401 toep->tp_flags |= TP_ABORT_RPL_RCVD; 1402 INP_WUNLOCK(inp); 1403 } else { 1404 toep->tp_flags &= ~TP_ABORT_RPL_RCVD; 1405 toep->tp_flags &= TP_ABORT_RPL_PENDING; 1406 toepcb_release(toep); /* no more CPLs expected */ 1407 } 1408 } 1409 1410 m_freem(m); 1411 return (0); 1412} 1413 1414/* 1415 * Convert the status code of an ABORT_REQ into a FreeBSD error code. 1416 */ 1417static int 1418abort_status_to_errno(struct tcpcb *tp, int abort_reason) 1419{ 1420 switch (abort_reason) { 1421 case CPL_ERR_BAD_SYN: 1422 case CPL_ERR_CONN_RESET: 1423 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 1424 case CPL_ERR_XMIT_TIMEDOUT: 1425 case CPL_ERR_PERSIST_TIMEDOUT: 1426 case CPL_ERR_FINWAIT2_TIMEDOUT: 1427 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1428 return (ETIMEDOUT); 1429 default: 1430 return (EIO); 1431 } 1432} 1433 1434/* 1435 * Returns whether an ABORT_REQ_RSS message is a negative advice. 1436 */ 1437static inline int 1438is_neg_adv_abort(unsigned int status) 1439{ 1440 return status == CPL_ERR_RTX_NEG_ADVICE || 1441 status == CPL_ERR_PERSIST_NEG_ADVICE; 1442} 1443 1444void 1445send_abort_rpl(struct toedev *tod, int tid, int qset) 1446{ 1447 struct mbuf *reply; 1448 struct cpl_abort_rpl *rpl; 1449 struct adapter *sc = tod->tod_softc; 1450 1451 reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); 1452 if (!reply) 1453 CXGB_UNIMPLEMENTED(); 1454 1455 rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 1456 rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); 1457 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 1458 rpl->cmd = CPL_ABORT_NO_RST; 1459 1460 t3_offload_tx(sc, reply); 1461} 1462 1463/* 1464 * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we 1465 * ignore this request except that we need to reply to it. 
1466 */ 1467static int 1468do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) 1469{ 1470 struct adapter *sc = qs->adap; 1471 struct tom_data *td = sc->tom_softc; 1472 struct toedev *tod = &td->tod; 1473 const struct cpl_abort_req_rss *req = mtod(m, void *); 1474 unsigned int tid = GET_TID(req); 1475 struct toepcb *toep = lookup_tid(&td->tid_maps, tid); 1476 struct inpcb *inp; 1477 struct tcpcb *tp; 1478 struct socket *so; 1479 int qset = toep->tp_qset; 1480 1481 if (is_neg_adv_abort(req->status)) { 1482 CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)", 1483 __func__, req->status, tid, toep->tp_flags); 1484 m_freem(m); 1485 return (0); 1486 } 1487 1488 if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) 1489 return (do_abort_req_synqe(qs, r, m)); 1490 1491 inp = toep->tp_inp; 1492 INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ 1493 INP_WLOCK(inp); 1494 1495 tp = intotcpcb(inp); 1496 so = inp->inp_socket; 1497 1498 CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d", 1499 __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags, 1500 req->status); 1501 1502 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) { 1503 toep->tp_flags |= TP_ABORT_REQ_RCVD; 1504 toep->tp_flags |= TP_ABORT_SHUTDOWN; 1505 INP_WUNLOCK(inp); 1506 INP_INFO_RUNLOCK(&V_tcbinfo); 1507 m_freem(m); 1508 return (0); 1509 } 1510 toep->tp_flags &= ~TP_ABORT_REQ_RCVD; 1511 1512 /* 1513 * If we'd sent a reset on this toep, we'll ignore this and clean up in 1514 * the T3's reply to our reset instead. 
1515 */ 1516 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 1517 toep->tp_flags |= TP_ABORT_RPL_SENT; 1518 INP_WUNLOCK(inp); 1519 } else { 1520 so_error_set(so, abort_status_to_errno(tp, req->status)); 1521 tp = tcp_close(tp); 1522 if (tp == NULL) 1523 INP_WLOCK(inp); /* re-acquire */ 1524 toepcb_release(toep); /* no more CPLs expected */ 1525 } 1526 INP_INFO_RUNLOCK(&V_tcbinfo); 1527 1528 send_abort_rpl(tod, tid, qset); 1529 m_freem(m); 1530 return (0); 1531} 1532 1533static void 1534assign_rxopt(struct tcpcb *tp, uint16_t tcpopt) 1535{ 1536 struct toepcb *toep = tp->t_toe; 1537 struct adapter *sc = toep->tp_tod->tod_softc; 1538 1539 tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; 1540 1541 if (G_TCPOPT_TSTAMP(tcpopt)) { 1542 tp->t_flags |= TF_RCVD_TSTMP; 1543 tp->t_flags |= TF_REQ_TSTMP; /* forcibly set */ 1544 tp->ts_recent = 0; /* XXX */ 1545 tp->ts_recent_age = tcp_ts_getticks(); 1546 } 1547 1548 if (G_TCPOPT_SACK(tcpopt)) 1549 tp->t_flags |= TF_SACK_PERMIT; 1550 else 1551 tp->t_flags &= ~TF_SACK_PERMIT; 1552 1553 if (G_TCPOPT_WSCALE_OK(tcpopt)) 1554 tp->t_flags |= TF_RCVD_SCALE; 1555 1556 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 1557 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 1558 tp->rcv_scale = tp->request_r_scale; 1559 tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt); 1560 } 1561 1562} 1563 1564/* 1565 * The ISS and IRS are from after the exchange of SYNs and are off by 1. 
1566 */ 1567void 1568make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs, 1569 uint16_t cpl_tcpopt) 1570{ 1571 struct inpcb *inp = sotoinpcb(so); 1572 struct tcpcb *tp = intotcpcb(inp); 1573 struct toepcb *toep = tp->t_toe; 1574 long bufsize; 1575 uint32_t iss = be32toh(cpl_iss) - 1; /* true ISS */ 1576 uint32_t irs = be32toh(cpl_irs) - 1; /* true IRS */ 1577 uint16_t tcpopt = be16toh(cpl_tcpopt); 1578 1579 INP_WLOCK_ASSERT(inp); 1580 1581 tp->t_state = TCPS_ESTABLISHED; 1582 tp->t_starttime = ticks; 1583 TCPSTAT_INC(tcps_connects); 1584 1585 CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state], 1586 toep->tp_tid, toep, inp); 1587 1588 tp->irs = irs; 1589 tcp_rcvseqinit(tp); 1590 tp->rcv_wnd = toep->tp_rx_credits << 10; 1591 tp->rcv_adv += tp->rcv_wnd; 1592 tp->last_ack_sent = tp->rcv_nxt; 1593 1594 /* 1595 * If we were unable to send all rx credits via opt0, save the remainder 1596 * in rx_credits so that they can be handed over with the next credit 1597 * update. 1598 */ 1599 SOCKBUF_LOCK(&so->so_rcv); 1600 bufsize = select_rcv_wnd(so); 1601 SOCKBUF_UNLOCK(&so->so_rcv); 1602 toep->tp_rx_credits = bufsize - tp->rcv_wnd; 1603 1604 tp->iss = iss; 1605 tcp_sendseqinit(tp); 1606 tp->snd_una = iss + 1; 1607 tp->snd_nxt = iss + 1; 1608 tp->snd_max = iss + 1; 1609 1610 assign_rxopt(tp, tcpopt); 1611 soisconnected(so); 1612} 1613 1614/* 1615 * Fill in the right TID for CPL messages waiting in the out-of-order queue 1616 * and send them to the TOE. 
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tod = toep->tp_tod;
	struct adapter *sc = tod->tod_softc;
	unsigned int tid = toep->tp_tid;

	inp_lock_assert(toep->tp_inp);

	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		struct ofld_hdr *oh = mtod(m, void *);
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = (void *)(oh + 1);

		/* Patch in the real TID (the message was queued pre-TID). */
		p->wr.wrh_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		t3_offload_tx(sc, m);
	}
}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_act_establish *req = mtod(m, void *);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct socket *so;

	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);

	/* The connection graduates from an atid to a real tid. */
	free_atid(&td->tid_maps, atid);

	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	KASSERT(toep->tp_qset == qs->idx,
	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
	KASSERT(toep->tp_tid == atid,
	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));

	toep->tp_tid = tid;
	insert_tid(td, toep, tid);

	if (inp->inp_flags & INP_DROPPED) {
		/* socket closed by the kernel before hw told us it connected */
		send_reset(toep);
		goto done;
	}

	KASSERT(tp->t_state == TCPS_SYN_SENT,
	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));

	so = inp->inp_socket;
	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

done:
	INP_WUNLOCK(inp);
	m_freem(m);
	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct inpcb *inp = toep->tp_inp;
	struct tcpcb *tp;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	struct socket *so;
	unsigned int credits = ntohs(hdr->credits);
	u32 snd_una = ntohl(hdr->snd_una);
	int bytes = 0;
	struct sockbuf *snd;
	struct mbuf *p;
	struct ofld_hdr *oh;

	inp_wlock(inp);
	tp = intotcpcb(inp);
	so = inp->inp_socket;
	toep->tp_wr_avail += credits;
	/* Clamp: unacked WRs can never exceed those actually outstanding. */
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	/* Retire completed work requests and tally the payload bytes. */
	while (credits) {
		p = peek_wr(toep);

		if (__predict_false(!p)) {
			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
			    "tid %u, state %u, wr_avail %u", __func__, credits,
			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);

			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u wr_avail=%u\n",
			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
			break;
		}

		oh = mtod(p, struct ofld_hdr *);

		KASSERT(credits >= G_HDR_NDESC(oh->flags),
		    ("%s: partial credits? %d %d", __func__, credits,
		    G_HDR_NDESC(oh->flags)));

		dequeue_wr(toep);
		credits -= G_HDR_NDESC(oh->flags);
		bytes += oh->plen;

		if (oh->flags & F_HDR_SGL)
			sglist_free(oh->sgl);
		m_freem(p);
	}

	/* Stale ACK (snd_una went backwards); ignore it. */
	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
		goto out_free;

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = tcp_ts_getticks();
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}

	snd = so_sockbuf_snd(so);
	if (bytes) {
		SOCKBUF_LOCK(snd);
		sbdrop_locked(snd, bytes);
		so_sowwakeup_locked(so);	/* drops the sockbuf lock */
	}

	/* More unsent data in the socket buffer: keep the pipe full. */
	if (snd->sb_sndptroff < sbused(snd))
		t3_push_frames(so, 0);

out_free:
	inp_wunlock(tp->t_inpcb);
	m_freem(m);
}

/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
{
	struct adapter *sc = qs->adap;
	struct tom_data *td = sc->tom_softc;
	struct cpl_wr_ack *hdr = mtod(m, void *);
	unsigned int tid = GET_TID(hdr);
	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);

	/* XXX bad race */
	if (toep)
		wr_ack(toep, m);

	return (0);
}

/*
 * Register the CPL handlers implemented in this file with the adapter.
 */
void
t3_init_cpl_io(struct adapter *sc)
{
	t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
	t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
	t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack);
	t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
	t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
	t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl);
	t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
}
#endif