cxgb_cpl_io.c revision 183289
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183289 2008-09-23 02:22:24Z kmacy $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/sockbuf.h> 46#include <sys/sysctl.h> 47#include <sys/syslog.h> 48#include <sys/protosw.h> 49#include <sys/priv.h> 50 51#if __FreeBSD_version >= 800044 52#include <sys/vimage.h> 53#else 54#define V_tcp_do_autosndbuf tcp_do_autosndbuf 55#define V_tcp_autosndbuf_max tcp_autosndbuf_max 56#define V_tcp_do_rfc1323 tcp_do_rfc1323 57#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf 58#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max 59#define V_tcpstat tcpstat 60#endif 61 62#include <net/if.h> 63#include <net/route.h> 64 65#include <netinet/in.h> 66#include <netinet/in_pcb.h> 67#include <netinet/in_systm.h> 68#include <netinet/in_var.h> 69 70 71#include <dev/cxgb/cxgb_osdep.h> 72#include <dev/cxgb/sys/mbufq.h> 73 74#include <netinet/ip.h> 75#include <netinet/tcp_var.h> 76#include <netinet/tcp_fsm.h> 77#include <netinet/tcp_offload.h> 78#include <netinet/tcp_seq.h> 79#include <netinet/tcp_syncache.h> 80#include <netinet/tcp_timer.h> 81#include <net/route.h> 82 83#include <dev/cxgb/t3cdev.h> 84#include <dev/cxgb/common/cxgb_firmware_exports.h> 85#include <dev/cxgb/common/cxgb_t3_cpl.h> 86#include <dev/cxgb/common/cxgb_tcb.h> 87#include <dev/cxgb/common/cxgb_ctl_defs.h> 88#include <dev/cxgb/cxgb_offload.h> 89#include <vm/vm.h> 90#include <vm/pmap.h> 91#include <machine/bus.h> 92#include <dev/cxgb/sys/mvec.h> 93#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 94#include <dev/cxgb/ulp/tom/cxgb_defs.h> 95#include <dev/cxgb/ulp/tom/cxgb_tom.h> 96#include 
<dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>

#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 * Filled in at attach time; indexed by gather-list length (see t3_push_frames).
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
/* Extract the IP TOS field (sans ECN bits) from an inpcb. */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

/* Pseudo TCP state requests used by internal close/abort paths. */
#define TCP_TIMEWAIT	1
#define TCP_CLOSE	2
#define TCP_DROP	3

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

/*
 * Debug wrapper around sbappendstream_locked().  Before and after the append
 * it walks the socket buffer's mbuf chain asserting that every mbuf is either
 * a plain mbuf or an EXT_EXTREF external mbuf, and that no m_next pointer
 * carries the 0xffffffff poison value.  The sockbuf must be locked and must
 * have SB_NOCOALESCE set (offload data must not be coalesced).
 */
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	/* Sanity-check the chain already in the sockbuf. */
	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
		    m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	/* Sanity-check the chain being appended. */
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
		    m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
	sbappendstream_locked(sb, n);
	/* Re-check the resulting chain for pointer poison. */
	m = sb->sb_mb;

	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
		    m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

/*
 * Returns true if the TOE device is a rev A Chelsio T3 part.
 */
static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

/*
 * Dump the interesting fields of an offload PCB (debug only).
 */
static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
/*
 * rtalloc1() wrapper that returns the route unlocked.
 */
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);  // send directly
}

/*
 * Map a CPL priority to an mbuf priority.  Currently the identity function;
 * kept as a hook for per-connection prioritization.
 */
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must be already properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Fill in a TX_DATA work request header at the front of mbuf m for a send of
 * len payload bytes.  tail is the remainder of the send queue after this WR;
 * when it is NULL the WR gets the SHOVE bit so HW pushes the data out
 * immediately.  The first WR of a connection additionally carries the
 * initialization flags (ack-pages, cpu index, send-buffer size).
 */
static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	    V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
		(tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		/* First WR on this connection: carry the init parameters. */
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

/*
 * Push pending send-socket-buffer data to the HW as TX_DATA work requests,
 * consuming WR credits as it goes.  Data small enough to fit in-line
 * (<= IMM_LEN) is copied into the WR itself; larger chains are described by a
 * gather list of virtual addresses.  Advances snd->sb_sndptr / sb_sndptroff
 * and toep->tp_write_seq to track what has been handed to HW.  If
 * req_completion is set, the first WR of a batch requests a completion so
 * credits are returned promptly.  Returns the number of payload bytes queued.
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	/* Resume from the send pointer if set, else start at the head. */
	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	/* tp_m_last was already sent in full; skip past it. */
	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			/* Build a gather list limited by available WR credits. */
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		/* Remember where to resume; tp_m_last marks a fully-sent mbuf. */
		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
		    " tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count],
		    tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data,
			    tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
			    " tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
	int i;

	/* Trace the gather list, three segments per record. */
	i = 0;
	while (i < count && m_get_sgllen(m0)) {
		if ((count - i) >= 3) {
			CTR6(KTR_TOM,
			    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
			    " len=%d pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len,
			    segs[i + 1].ds_addr, segs[i + 1].ds_len,
			    segs[i + 2].ds_addr, segs[i + 2].ds_len);
			i += 3;
		} else if ((count - i) == 2) {
			CTR4(KTR_TOM,
			    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
			    " len=%d",
			    segs[i].ds_addr, segs[i].ds_len,
			    segs[i + 1].ds_addr, segs[i + 1].ds_len);
			i += 2;
		} else {
			CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len);
			i++;
		}

	}
}
#endif
		/*
		 * remember credits used (csum_data is repurposed to carry the
		 * WR credit count for this mbuf)
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		/* Request a completion when asked or at half the WR budget. */
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	/* Flush any pending tx data ahead of the FIN. */
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	/* Only ever send one FIN. */
	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an mbuf.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(1) |
	    V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	/* NOTE(review): this is untranslated Linux-style code (sk_buff, sk)
	 * compiled out pending urgent-data support — do not enable as-is. */
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;   /* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;	/* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

/*
 * Returns true if the delayed-ACK mode setting is usable for this connection.
 *
 * NOTE(review): the second clause is dead code — if tp_ulp_mode equals
 * ULP_MODE_TCPDDP (non-zero) the first operand of || is already true, so the
 * tod_ttid comparison never runs.  Possibly the first test was meant to be
 * (toep->tp_ulp_mode == 0); confirm against the T3 delack semantics before
 * changing.
 */
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
	    (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
702 */ 703#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 704 705/* 706 * Called after some received data has been read. It returns RX credits 707 * to the HW for the amount of data processed. 708 */ 709void 710t3_cleanup_rbuf(struct tcpcb *tp, int copied) 711{ 712 struct toepcb *toep = tp->t_toe; 713 struct socket *so; 714 struct toedev *dev; 715 int dack_mode, must_send, read; 716 u32 thres, credits, dack = 0; 717 struct sockbuf *rcv; 718 719 so = inp_inpcbtosocket(tp->t_inpcb); 720 rcv = so_sockbuf_rcv(so); 721 722 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 723 (tp->t_state == TCPS_FIN_WAIT_2))) { 724 if (copied) { 725 sockbuf_lock(rcv); 726 toep->tp_copied_seq += copied; 727 sockbuf_unlock(rcv); 728 } 729 730 return; 731 } 732 733 inp_lock_assert(tp->t_inpcb); 734 735 sockbuf_lock(rcv); 736 if (copied) 737 toep->tp_copied_seq += copied; 738 else { 739 read = toep->tp_enqueued_bytes - rcv->sb_cc; 740 toep->tp_copied_seq += read; 741 } 742 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 743 toep->tp_enqueued_bytes = rcv->sb_cc; 744 sockbuf_unlock(rcv); 745 746 if (credits > rcv->sb_mbmax) { 747 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 748 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 749 credits = rcv->sb_mbmax; 750 } 751 752 753 /* 754 * XXX this won't accurately reflect credit return - we need 755 * to look at the difference between the amount that has been 756 * put in the recv sockbuf and what is there now 757 */ 758 759 if (__predict_false(!credits)) 760 return; 761 762 dev = toep->tp_toedev; 763 thres = TOM_TUNABLE(dev, rx_credit_thres); 764 765 if (__predict_false(thres == 0)) 766 return; 767 768 if (is_delack_mode_valid(dev, toep)) { 769 dack_mode = TOM_TUNABLE(dev, delack); 770 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 771 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 772 773 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 774 dack = F_RX_DACK_CHANGE 
| 775 V_RX_DACK_MODE(dack_mode); 776 } 777 } else 778 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 779 780 /* 781 * For coalescing to work effectively ensure the receive window has 782 * at least 16KB left. 783 */ 784 must_send = credits + 16384 >= tp->rcv_wnd; 785 786 if (must_send || credits >= thres) 787 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 788} 789 790static int 791cxgb_toe_disconnect(struct tcpcb *tp) 792{ 793 struct socket *so; 794 795 DPRINTF("cxgb_toe_disconnect\n"); 796 797 so = inp_inpcbtosocket(tp->t_inpcb); 798 close_conn(so); 799 return (0); 800} 801 802static int 803cxgb_toe_reset(struct tcpcb *tp) 804{ 805 struct toepcb *toep = tp->t_toe; 806 807 t3_send_reset(toep); 808 809 /* 810 * unhook from socket 811 */ 812 tp->t_flags &= ~TF_TOE; 813 toep->tp_tp = NULL; 814 tp->t_toe = NULL; 815 return (0); 816} 817 818static int 819cxgb_toe_send(struct tcpcb *tp) 820{ 821 struct socket *so; 822 823 DPRINTF("cxgb_toe_send\n"); 824 dump_toepcb(tp->t_toe); 825 826 so = inp_inpcbtosocket(tp->t_inpcb); 827 t3_push_frames(so, 1); 828 return (0); 829} 830 831static int 832cxgb_toe_rcvd(struct tcpcb *tp) 833{ 834 835 inp_lock_assert(tp->t_inpcb); 836 837 t3_cleanup_rbuf(tp, 0); 838 839 return (0); 840} 841 842static void 843cxgb_toe_detach(struct tcpcb *tp) 844{ 845 struct toepcb *toep; 846 847 /* 848 * XXX how do we handle teardown in the SYN_SENT state? 
849 * 850 */ 851 inp_lock_assert(tp->t_inpcb); 852 toep = tp->t_toe; 853 toep->tp_tp = NULL; 854 855 /* 856 * unhook from socket 857 */ 858 tp->t_flags &= ~TF_TOE; 859 tp->t_toe = NULL; 860} 861 862 863static struct toe_usrreqs cxgb_toe_usrreqs = { 864 .tu_disconnect = cxgb_toe_disconnect, 865 .tu_reset = cxgb_toe_reset, 866 .tu_send = cxgb_toe_send, 867 .tu_rcvd = cxgb_toe_rcvd, 868 .tu_detach = cxgb_toe_detach, 869 .tu_detach = cxgb_toe_detach, 870 .tu_syncache_event = handle_syncache_event, 871}; 872 873 874static void 875__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 876 uint64_t mask, uint64_t val, int no_reply) 877{ 878 struct cpl_set_tcb_field *req; 879 880 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 881 toep->tp_tid, word, mask, val); 882 883 req = mtod(m, struct cpl_set_tcb_field *); 884 m->m_pkthdr.len = m->m_len = sizeof(*req); 885 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 886 req->wr.wr_lo = 0; 887 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 888 req->reply = V_NO_REPLY(no_reply); 889 req->cpu_idx = 0; 890 req->word = htons(word); 891 req->mask = htobe64(mask); 892 req->val = htobe64(val); 893 894 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 895 send_or_defer(toep, m, 0); 896} 897 898static void 899t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 900{ 901 struct mbuf *m; 902 struct tcpcb *tp = toep->tp_tp; 903 904 if (toep == NULL) 905 return; 906 907 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 908 printf("not seting field\n"); 909 return; 910 } 911 912 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 913 914 __set_tcb_field(toep, m, word, mask, val, 1); 915} 916 917/* 918 * Set one of the t_flags bits in the TCB. 
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

/* Enable/disable HW receive coalescing for this connection. */
void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/* Enable/disable MSS-based delayed-ACK behavior for this connection. */
void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
	    V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist).  [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Turn DDP on or off for a connection; when turning it off also apply the
 * TP timer workaround described above.
 */
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
		    V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_MASK,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_VAL);

}

/* Program the DDP tag/color for RX DDP buffer buf_idx (0 or 1). */
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    tag_color);
}

/*
 * Program offset and length of RX DDP buffer buf_idx (0 or 1).
 *
 * NOTE(review): in the buf_idx==1 case the mask uses
 * V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32) — the shift is applied
 * *inside* the field macro, unlike the value below which shifts the result.
 * Looks asymmetric with the BUF0 case; verify against the TCB layout before
 * relying on it.
 */
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Select a congestion-control algorithm by name.  Compiled out: always
 * reports success unless CONGESTION_CONTROL_SUPPORTED is defined.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Request a copy of the connection's TCB via CPL_GET_TCB.  The request is
 * deferred while still in SYN_SENT (TID unknown).  Returns ENOMEM if no mbuf
 * could be allocated, 0 otherwise.
 */
int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

/*
 * Register the toepcb in the TID table; takes an extra reference on the
 * toepcb that is dropped when the TID is removed.
 */
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	/* Walk the sorted MTU table while the next entry still fits. */
	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

/*
 * Pick the MTU-table index to advertise for a connection with path MTU
 * @pmtu, and (when @tp is non-NULL) set tp->t_maxseg to the corresponding
 * MSS.  The constant 40 is the fixed IPv4 + TCP header overhead subtracted
 * from an MTU to get an MSS.
 */
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		/* Never go below the smallest MTU the hardware supports. */
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		/* Snap the MSS to the table entry actually selected. */
		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

/*
 * Free an active-open TID and drop the toepcb reference that the ATID
 * table held (cxgb_free_atid returns the context pointer stored there).
 */
static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	/* Nothing was ever attached to a device; nothing to release. */
	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Drop any work requests still awaiting completion. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* Detach the toepcb from the tcpcb before touching socket state. */
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		/* Wake readers; this also drops the sockbuf lock. */
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		/* Connection never got past active open: only an ATID exists. */
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {		// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Switch a socket over to the offload protocol operations vector and mark
 * the tcpcb as TOE-managed.  Requires that a toepcb is already attached.
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
1218 * receive window. 1219 */ 1220static __inline int 1221select_rcv_wscale(int space) 1222{ 1223 int wscale = 0; 1224 1225 if (space > MAX_RCV_WND) 1226 space = MAX_RCV_WND; 1227 1228 if (V_tcp_do_rfc1323) 1229 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; 1230 1231 return (wscale); 1232} 1233 1234/* 1235 * Determine the receive window size for a socket. 1236 */ 1237static unsigned long 1238select_rcv_wnd(struct toedev *dev, struct socket *so) 1239{ 1240 struct tom_data *d = TOM_DATA(dev); 1241 unsigned int wnd; 1242 unsigned int max_rcv_wnd; 1243 struct sockbuf *rcv; 1244 1245 rcv = so_sockbuf_rcv(so); 1246 1247 if (V_tcp_do_autorcvbuf) 1248 wnd = V_tcp_autorcvbuf_max; 1249 else 1250 wnd = rcv->sb_hiwat; 1251 1252 1253 1254 /* XXX 1255 * For receive coalescing to work effectively we need a receive window 1256 * that can accomodate a coalesced segment. 1257 */ 1258 if (wnd < MIN_RCV_WND) 1259 wnd = MIN_RCV_WND; 1260 1261 /* PR 5138 */ 1262 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 1263 (uint32_t)d->rx_page_size * 23 : 1264 MAX_RCV_WND); 1265 1266 return min(wnd, max_rcv_wnd); 1267} 1268 1269/* 1270 * Assign offload parameters to some socket fields. This code is used by 1271 * both active and passive opens. 
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link the tcpcb and the toepcb. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	/* Work-request credits start out fully available. */
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/*
	 * Use DDP only if enabled by the tunable, not opted out via
	 * SO_NO_DDP, and the receive window is large enough to be useful.
	 */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	/* High word of option 0: Nagle, keepalive, wscale and MSS index. */
	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	/* Low word of option 0: TOS, ULP mode and receive buffer size in KB. */
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

/*
 * Option 2: advertise a congestion-control flavor only when the tunable
 * selects one explicitly (-1 means "use the default").
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/*
 * Debug helper: total work-request credits held by mbufs still on the
 * unacknowledged WR queue (each stores its credit count in csum_data).
 */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Build a CPL_ACT_OPEN_REQ (active open) message in @m for the connection
 * identified by @atid, using L2 table entry @e for the destination.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	/* Fill in local/peer IP and port directly from the inpcb. */
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down a failed active open and report @errno to the TCP stack.
 * Caller must hold the inpcb write lock; this function DROPS it (via
 * tcp_offload_drop) when a tcpcb is still attached.
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	/* The tcpcb may already have been detached; just free the mbuf. */
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 *
Process an ACT_OPEN_RPL CPL message. 1499 */ 1500static int 1501do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 1502{ 1503 struct toepcb *toep = (struct toepcb *)ctx; 1504 struct cpl_act_open_rpl *rpl = cplhdr(m); 1505 1506 if (cdev->type != T3A && act_open_has_tid(rpl->status)) 1507 cxgb_queue_tid_release(cdev, GET_TID(rpl)); 1508 1509 active_open_failed(toep, m); 1510 return (0); 1511} 1512 1513/* 1514 * Handle an ARP failure for an active open. XXX purge ofo queue 1515 * 1516 * XXX badly broken for crossed SYNs as the ATID is no longer valid. 1517 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should 1518 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't 1519 * free the atid. Hmm. 1520 */ 1521#ifdef notyet 1522static void 1523act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m) 1524{ 1525 struct toepcb *toep = m_get_toep(m); 1526 struct tcpcb *tp = toep->tp_tp; 1527 struct inpcb *inp = tp->t_inpcb; 1528 struct socket *so; 1529 1530 inp_wlock(inp); 1531 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) { 1532 /* 1533 * drops the inpcb lock 1534 */ 1535 fail_act_open(so, EHOSTUNREACH); 1536 printf("freeing %p\n", m); 1537 1538 m_free(m); 1539 } else 1540 inp_wunlock(inp); 1541} 1542#endif 1543/* 1544 * Send an active open request. 
1545 */ 1546int 1547t3_connect(struct toedev *tdev, struct socket *so, 1548 struct rtentry *rt, struct sockaddr *nam) 1549{ 1550 struct mbuf *m; 1551 struct l2t_entry *e; 1552 struct tom_data *d = TOM_DATA(tdev); 1553 struct inpcb *inp = so_sotoinpcb(so); 1554 struct tcpcb *tp = intotcpcb(inp); 1555 struct toepcb *toep; /* allocated by init_offload_socket */ 1556 1557 int atid; 1558 1559 toep = toepcb_alloc(); 1560 if (toep == NULL) 1561 goto out_err; 1562 1563 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1564 goto out_err; 1565 1566 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1567 if (!e) 1568 goto free_tid; 1569 1570 inp_lock_assert(inp); 1571 m = m_gethdr(MT_DATA, M_WAITOK); 1572 1573#if 0 1574 m->m_toe.mt_toepcb = tp->t_toe; 1575 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1576#endif 1577 so_lock(so); 1578 1579 init_offload_socket(so, tdev, atid, e, rt, toep); 1580 1581 install_offload_ops(so); 1582 1583 mk_act_open_req(so, m, atid, e); 1584 so_unlock(so); 1585 1586 soisconnecting(so); 1587 toep = tp->t_toe; 1588 m_set_toep(m, tp->t_toe); 1589 1590 toep->tp_state = TCPS_SYN_SENT; 1591 l2t_send(d->cdev, (struct mbuf *)m, e); 1592 1593 if (toep->tp_ulp_mode) 1594 t3_enable_ddp(toep, 0); 1595 return (0); 1596 1597free_tid: 1598 printf("failing connect - free atid\n"); 1599 1600 free_atid(d->cdev, atid); 1601out_err: 1602 printf("return ENOMEM\n"); 1603 return (ENOMEM); 1604} 1605 1606/* 1607 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1608 * not send multiple ABORT_REQs for the same connection and also that we do 1609 * not try to send a message after the connection has closed. Returns 1 if 1610 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1611 */ 1612static void 1613t3_send_reset(struct toepcb *toep) 1614{ 1615 1616 struct cpl_abort_req *req; 1617 unsigned int tid = toep->tp_tid; 1618 int mode = CPL_ABORT_SEND_RST; 1619 struct tcpcb *tp = toep->tp_tp; 1620 struct toedev *tdev = toep->tp_toedev; 1621 struct socket *so = NULL; 1622 struct mbuf *m; 1623 struct sockbuf *snd; 1624 1625 if (tp) { 1626 inp_lock_assert(tp->t_inpcb); 1627 so = inp_inpcbtosocket(tp->t_inpcb); 1628 } 1629 1630 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1631 tdev == NULL)) 1632 return; 1633 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1634 1635 snd = so_sockbuf_snd(so); 1636 /* Purge the send queue so we don't send anything after an abort. */ 1637 if (so) 1638 sbflush(snd); 1639 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1640 mode |= CPL_ABORT_POST_CLOSE_REQ; 1641 1642 m = m_gethdr_nofail(sizeof(*req)); 1643 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1644 set_arp_failure_handler(m, abort_arp_failure); 1645 1646 req = mtod(m, struct cpl_abort_req *); 1647 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1648 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1649 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1650 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1651 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1652 req->cmd = mode; 1653 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1654 mbufq_tail(&toep->out_of_order_queue, m); // defer 1655 else 1656 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1657} 1658 1659static int 1660t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1661{ 1662 struct inpcb *inp; 1663 int error, optval; 1664 1665 if (sopt->sopt_name == IP_OPTIONS) 1666 return (ENOPROTOOPT); 1667 1668 if (sopt->sopt_name != IP_TOS) 1669 return (EOPNOTSUPP); 1670 1671 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1672 1673 if (error) 1674 return (error); 1675 1676 if (optval > IPTOS_PREC_CRITIC_ECP) 1677 return (EINVAL); 1678 1679 inp = so_sotoinpcb(so); 1680 inp_wlock(inp); 1681 inp_ip_tos_set(inp, optval); 1682#if 0 1683 inp->inp_ip_tos = optval; 1684#endif 1685 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1686 inp_wunlock(inp); 1687 1688 return (0); 1689} 1690 1691static int 1692t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1693{ 1694 int err = 0; 1695 size_t copied; 1696 1697 if (sopt->sopt_name != TCP_CONGESTION && 1698 sopt->sopt_name != TCP_NODELAY) 1699 return (EOPNOTSUPP); 1700 1701 if (sopt->sopt_name == TCP_CONGESTION) { 1702 char name[TCP_CA_NAME_MAX]; 1703 int optlen = sopt->sopt_valsize; 1704 struct tcpcb *tp; 1705 1706 if (sopt->sopt_dir == SOPT_GET) { 1707 KASSERT(0, ("unimplemented")); 1708 return (EOPNOTSUPP); 1709 } 1710 1711 if (optlen < 1) 1712 return (EINVAL); 1713 1714 err = copyinstr(sopt->sopt_val, name, 1715 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1716 if (err) 1717 return (err); 1718 if (copied < 1) 1719 return (EINVAL); 1720 1721 tp = so_sototcpcb(so); 1722 /* 1723 * XXX I need to revisit this 1724 */ 1725 if ((err = t3_set_cong_control(so, name)) == 0) { 1726#ifdef CONGESTION_CONTROL_SUPPORTED 1727 tp->t_cong_control = strdup(name, M_CXGB); 1728#endif 1729 } else 1730 return (err); 1731 } else { 1732 int optval, oldval; 1733 
struct inpcb *inp; 1734 struct tcpcb *tp; 1735 1736 if (sopt->sopt_dir == SOPT_GET) 1737 return (EOPNOTSUPP); 1738 1739 err = sooptcopyin(sopt, &optval, sizeof optval, 1740 sizeof optval); 1741 1742 if (err) 1743 return (err); 1744 1745 inp = so_sotoinpcb(so); 1746 inp_wlock(inp); 1747 tp = inp_inpcbtotcpcb(inp); 1748 1749 oldval = tp->t_flags; 1750 if (optval) 1751 tp->t_flags |= TF_NODELAY; 1752 else 1753 tp->t_flags &= ~TF_NODELAY; 1754 inp_wunlock(inp); 1755 1756 1757 if (oldval != tp->t_flags && (tp->t_toe != NULL)) 1758 t3_set_nagle(tp->t_toe); 1759 1760 } 1761 1762 return (0); 1763} 1764 1765int 1766t3_ctloutput(struct socket *so, struct sockopt *sopt) 1767{ 1768 int err; 1769 1770 if (sopt->sopt_level != IPPROTO_TCP) 1771 err = t3_ip_ctloutput(so, sopt); 1772 else 1773 err = t3_tcp_ctloutput(so, sopt); 1774 1775 if (err != EOPNOTSUPP) 1776 return (err); 1777 1778 return (tcp_ctloutput(so, sopt)); 1779} 1780 1781/* 1782 * Returns true if we need to explicitly request RST when we receive new data 1783 * on an RX-closed connection. 1784 */ 1785static inline int 1786need_rst_on_excess_rx(const struct toepcb *toep) 1787{ 1788 return (1); 1789} 1790 1791/* 1792 * Handles Rx data that arrives in a state where the socket isn't accepting 1793 * new data. 1794 */ 1795static void 1796handle_excess_rx(struct toepcb *toep, struct mbuf *m) 1797{ 1798 1799 if (need_rst_on_excess_rx(toep) && 1800 !(toep->tp_flags & TP_ABORT_SHUTDOWN)) 1801 t3_send_reset(toep); 1802 m_freem(m); 1803} 1804 1805/* 1806 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) 1807 * by getting the DDP offset from the TCB. 
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is a possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		/* so_sorwakeup_locked() drops the sockbuf lock. */
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	/* The raw TCB image follows the CPL header. */
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	/* Amount newly DMA'd = offset in TCB minus where we left off. */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	/*
	 * NOTE(review): this early return appears to leave the rcv sockbuf
	 * locked (locked above, and handle_excess_rx() does not unlock it) —
	 * confirm whether an sockbuf_unlock(rcv) is missing here.
	 */
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		/* User buffer was active: mark complete and flip buffers. */
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Attach the DDP gather list and queue the data on the socket. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

/*
 * Account for data that the hardware placed directly via DDP before this
 * RX_DATA message: advance rcv_nxt, attach the gather list to @m and
 * update the active DDP buffer state.  No-op if rcv_nxt already matches.
 */
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);
	struct sockbuf *rcv;

	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_lock_assert(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
		rcv_nxt, tp->rcv_nxt));
	/* Bytes DDP'd = gap between stack rcv_nxt and this message's seq. */
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of  DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	sockbuf_unlock(rcv);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct sockbuf *rcv;
	int state;
	int len = be16toh(hdr->len);

	inp_wlock(tp->t_inpcb);

	so = inp_inpcbtosocket(tp->t_inpcb);

	/* Socket no longer accepting data: reset and drop. */
	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		TRACE_EXIT;
		return;
	}

	/* Fold in any preceding DDP placement before the payload itself. */
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
		handle_ddp_data(toep, m);

	m->m_seq = ntohl(hdr->seq);
	m->m_ulp_mode = 0;                    /* for iSCSI */

#if VALIDATE_SEQ
	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
		log(LOG_ERR,
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
		       tp->rcv_nxt);
		m_freem(m);
		inp_wunlock(tp->t_inpcb);
		return;
	}
#endif
	/* Strip the CPL header; only payload remains. */
	m_adj(m, sizeof(*hdr));

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	/* Track delayed-ACK mode changes reported by the hardware. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);

	/* Trim to the length the CPL header claims. */
	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
	CTR2(KTR_TOM,
	    "new_rx_data: seq 0x%x len %u",
	    m->m_seq, m->m_pkthdr.len);
	inp_wunlock(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
#if 0
	if (sb_notify(rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
#endif
	SBAPPEND(rcv, m);

#ifdef notyet
	/*
	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
	 *
	 */
	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, rcv->sb_cc, rcv->sb_mbmax));
#endif


	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
	    rcv->sb_cc, rcv->sb_mbcnt);

	state = so_state_get(so);
	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process data that the hardware placed directly into a posted DDP buffer
 * (RX_DATA_DDP): update sequence state from the ddp_report and hand a
 * zero-copy mbuf describing the placed region to the socket.
 */
static void
new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	struct socket *so;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
	int nomoredata = 0;
	unsigned int delack_mode;
	struct sockbuf *rcv;

	tp = toep->tp_tp;
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {

		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		return;
	}

	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	/* Which of the two posted DDP buffers received the data. */
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	CTR4(KTR_TOM,
	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
	    "hdr seq 0x%x len %u",
	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
	    ntohs(hdr->len));
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	tp->t_rcvtime = ticks;
	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
	inp_wunlock(tp->t_inpcb);
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
	    m->m_len, rcv_nxt, m->m_seq);
	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
2237 */ 2238 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; 2239 m->m_cur_offset = end_offset - m->m_pkthdr.len; 2240 2241 rcv = so_sockbuf_rcv(so); 2242 sockbuf_lock(rcv); 2243 2244 m->m_ddp_gl = (unsigned char *)bsp->gl; 2245 m->m_flags |= M_DDP; 2246 bsp->cur_offset = end_offset; 2247 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2248 2249 /* 2250 * Length is only meaningful for kbuf 2251 */ 2252 if (!(bsp->flags & DDP_BF_NOCOPY)) 2253 KASSERT(m->m_len <= bsp->gl->dgl_length, 2254 ("length received exceeds ddp pages: len=%d dgl_length=%d", 2255 m->m_len, bsp->gl->dgl_length)); 2256 2257 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2258 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); 2259 /* 2260 * Bit 0 of flags stores whether the DDP buffer is completed. 2261 * Note that other parts of the code depend on this being in bit 0. 2262 */ 2263 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { 2264 panic("spurious ddp completion"); 2265 } else { 2266 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); 2267 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) 2268 q->cur_buf ^= 1; /* flip buffers */ 2269 } 2270 2271 if (bsp->flags & DDP_BF_NOCOPY) { 2272 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); 2273 bsp->flags &= ~DDP_BF_NOCOPY; 2274 } 2275 2276 if (ddp_report & F_DDP_PSH) 2277 m->m_ddp_flags |= DDP_BF_PSH; 2278 if (nomoredata) 2279 m->m_ddp_flags |= DDP_BF_NODATA; 2280 2281#ifdef notyet 2282 skb_reset_transport_header(skb); 2283 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ 2284#endif 2285 SBAPPEND(rcv, m); 2286 2287 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || 2288 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) 2289 || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) 2290 so_sorwakeup_locked(so); 2291 else 2292 sockbuf_unlock(rcv); 2293} 2294 2295#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ 2296 F_DDP_PPOD_PARITY_ERR | 
	 F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	/* Any DDP placement error from HW: log and let the caller free m. */
	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(toep, m);
	return (0);
}

/*
 * Process a CPL_RX_DDP_COMPLETE: the HW has finished filling (part of) a DDP
 * buffer.  Converts the completion into a zero-length-data mbuf carrying the
 * DDP metadata (m_ddp_gl/m_ddp_flags) and appends it to the receive sockbuf.
 *
 * Locking: takes the inpcb write lock, then the receive sockbuf lock; the
 * inpcb lock is dropped before the mbuf is appended.
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when, delack_mode;
	int nomoredata = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	/* Socket can no longer receive: discard the completion. */
	if (__predict_false(so_no_receive(so))) {
		struct inpcb *inp = so_sotoinpcb(so);

		handle_excess_rx(toep, m);
		inp_wunlock(inp);
		return;
	}
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	m->m_pkthdr.csum_data = tp->rcv_nxt;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[buf_idx];
	when = bsp->cur_offset;
	/* Amount newly placed = HW offset minus our last recorded offset. */
	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	tp->rcv_nxt += m->m_len;
	tp->t_rcvtime = ticks;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	inp_wunlock(tp->t_inpcb);

	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR5(KTR_TOM,
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), m->m_len);

	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP)) {
		q->cur_buf ^= 1;                     /* flip buffers */
		/* Short completion: buffer not full, no more data expected. */
		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata=1;
	}

	CTR4(KTR_TOM,
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));

	/* Hand the gather list to the mbuf; bit 0 marks "buffer complete". */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

	SBAPPEND(rcv, m);
	if ((so_state_get(so) & SS_NOFDREF) == 0)
		so_sorwakeup_locked(so);	/* drops the sockbuf lock */
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	inp_wlock(tp->t_inpcb);
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	inp_wunlock(tp->t_inpcb);
	tcp_offload_twstart(tp);
}

/*
 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
 * function deals with the data that may be reported along with the FIN.
 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
 * skb can be freed.
 */
static int
handle_peer_close_data(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_peer_close *req = cplhdr(m);
	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
	struct sockbuf *rcv;

	if (tp->rcv_nxt == rcv_nxt)			/* no data */
		return (0);

	CTR0(KTR_TOM, "handle_peer_close_data");
	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);

		/*
		 * Although we discard the data we want to process the FIN so
		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
		 * may be what will close the connection.  We return 1 because
		 * handle_excess_rx() already freed the packet.
		 */
		return (1);
	}

	inp_lock_assert(tp->t_inpcb);
	q = &toep->tp_ddp_state;
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Attribute the trailing data to the currently active DDP buffer. */
	bsp = &q->buf_state[q->cur_buf];
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags =
	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	tp->t_rcvtime = ticks;
	SBAPPEND(rcv, m);
	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);

	return (1);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct toepcb *toep, struct mbuf *m)
{
	struct socket *so;
	struct tcpcb *tp = toep->tp_tp;
	int keep, action;

	action = keep = 0;
	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, m);
		if (keep < 0) {
			/* NOTE(review): no current return path of
			 * handle_peer_close_data() is negative; kept for the
			 * documented -1 contract. */
			inp_wunlock(tp->t_inpcb);
			return;
		}
	}
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		CTR1(KTR_TOM,
		    "waking up waiters for cantrcvmore on %p ", so);
		socantrcvmore(so);

		/*
		 * If connection is half-synchronized
		 * (ie NEEDSYN flag on) then delay ACK,
		 * so it may be piggybacked when SYN is sent.
		 * Otherwise, since we received a FIN then no
		 * more input can be expected, send ACK now.
		 */
		if (tp->t_flags & TF_NEEDSYN)
			tp->t_flags |= TF_DELACK;
		else
			tp->t_flags |= TF_ACKNOW;
		tp->rcv_nxt++;
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
	    tp->t_starttime = ticks;
	/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;
		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
	}
	inp_wunlock(tp->t_inpcb);

	/* State transitions deferred until after the inp lock is dropped. */
	if (action == TCP_TIMEWAIT) {
		enter_timewait(tp);
	} else if (action == TCP_DROP) {
		tcp_offload_drop(tp, 0);
	} else if (action == TCP_CLOSE) {
		tcp_offload_close(tp);
	}

#ifdef notyet
	/* Do not send POLL_HUP for half duplex close. */
	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
	    sk->sk_state == TCP_CLOSE)
		sk_wake_async(so, 1, POLL_HUP);
	else
		sk_wake_async(so, 1, POLL_IN);
#endif

out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	do_peer_fin(toep, m);
	return (0);
}

/*
 * Process a CPL_CLOSE_CON_RPL: HW has ACKed our FIN.  Advances snd_una past
 * the FIN and drives the local-close half of the state machine.  Deferred
 * actions (timewait/drop/close) run after the inp lock is released.
 */
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int action = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	/* An abort is pending (non-T3A); the abort path owns teardown. */
	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		inp_wunlock(tp->t_inpcb);
		goto out;
	}

	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));

	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;

		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		action = TCP_CLOSE;
		break;
	case TCPS_FIN_WAIT_1:
		/*
		 * If we can't receive any more
		 * data, then closing user can proceed.
		 * Starting the timer is contrary to the
		 * specification, but if we don't get a FIN
		 * we'll hang forever.
		 *
		 * XXXjl:
		 * we should release the tp also, and use a
		 * compressed state.
		 */
		if (so)
			rcv = so_sockbuf_rcv(so);
		else
			break;

		if (rcv->sb_state & SBS_CANTRCVMORE) {
			int timeout;

			if (so)
				soisdisconnected(so);
			timeout = (tcp_fast_finwait2_recycle) ?
			    tcp_finwait2_timeout : tcp_maxidle;
			tcp_timer_activate(tp, TT_2MSL, timeout);
		}
		tp->t_state = TCPS_FIN_WAIT_2;
		/* SO_LINGER with zero timeout: drop instead of lingering. */
		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			action = TCP_DROP;
		}

		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    toep->tp_toedev->tod_name, toep->tp_tid,
		    tp->t_state);
	}
	inp_wunlock(tp->t_inpcb);


	if (action == TCP_TIMEWAIT) {
		enter_timewait(tp);
	} else if (action == TCP_DROP) {
		tcp_offload_drop(tp, 0);
	} else if (action == TCP_CLOSE) {
		tcp_offload_close(tp);
	}
out:
	m_freem(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	process_close_con_rpl(toep, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
2742 */ 2743static void 2744process_abort_rpl(struct toepcb *toep, struct mbuf *m) 2745{ 2746 struct tcpcb *tp = toep->tp_tp; 2747 struct socket *so; 2748 int needclose = 0; 2749 2750#ifdef T3_TRACE 2751 T3_TRACE1(TIDTB(sk), 2752 "process_abort_rpl: GTS rpl pending %d", 2753 sock_flag(sk, ABORT_RPL_PENDING)); 2754#endif 2755 2756 inp_wlock(tp->t_inpcb); 2757 so = inp_inpcbtosocket(tp->t_inpcb); 2758 2759 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2760 /* 2761 * XXX panic on tcpdrop 2762 */ 2763 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) 2764 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2765 else { 2766 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2767 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2768 !is_t3a(toep->tp_toedev)) { 2769 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2770 panic("TP_ABORT_REQ_RCVD set"); 2771 t3_release_offload_resources(toep); 2772 needclose = 1; 2773 } 2774 } 2775 } 2776 inp_wunlock(tp->t_inpcb); 2777 2778 if (needclose) 2779 tcp_offload_close(tp); 2780 2781 m_free(m); 2782} 2783 2784/* 2785 * Handle an ABORT_RPL_RSS CPL message. 2786 */ 2787static int 2788do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2789{ 2790 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2791 struct toepcb *toep; 2792 2793 /* 2794 * Ignore replies to post-close aborts indicating that the abort was 2795 * requested too late. These connections are terminated when we get 2796 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2797 * arrives the TID is either no longer used or it has been recycled. 
2798 */ 2799 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2800discard: 2801 m_free(m); 2802 return (0); 2803 } 2804 2805 toep = (struct toepcb *)ctx; 2806 2807 /* 2808 * Sometimes we've already closed the socket, e.g., a post-close 2809 * abort races with ABORT_REQ_RSS, the latter frees the socket 2810 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2811 * but FW turns the ABORT_REQ into a regular one and so we get 2812 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2813 */ 2814 if (!toep) 2815 goto discard; 2816 2817 if (toep->tp_tp == NULL) { 2818 log(LOG_NOTICE, "removing tid for abort\n"); 2819 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2820 if (toep->tp_l2t) 2821 l2t_release(L2DATA(cdev), toep->tp_l2t); 2822 2823 toepcb_release(toep); 2824 goto discard; 2825 } 2826 2827 log(LOG_NOTICE, "toep=%p\n", toep); 2828 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); 2829 2830 toepcb_hold(toep); 2831 process_abort_rpl(toep, m); 2832 toepcb_release(toep); 2833 return (0); 2834} 2835 2836/* 2837 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also 2838 * indicate whether RST should be sent in response. 2839 */ 2840static int 2841abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2842{ 2843 struct tcpcb *tp = so_sototcpcb(so); 2844 2845 switch (abort_reason) { 2846 case CPL_ERR_BAD_SYN: 2847#if 0 2848 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2849#endif 2850 case CPL_ERR_CONN_RESET: 2851 // XXX need to handle SYN_RECV due to crossed SYNs 2852 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2853 case CPL_ERR_XMIT_TIMEDOUT: 2854 case CPL_ERR_PERSIST_TIMEDOUT: 2855 case CPL_ERR_FINWAIT2_TIMEDOUT: 2856 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2857#if 0 2858 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2859#endif 2860 return (ETIMEDOUT); 2861 default: 2862 return (EIO); 2863 } 2864} 2865 2866static inline void 2867set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2868{ 2869 struct cpl_abort_rpl *rpl = cplhdr(m); 2870 2871 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2872 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2873 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2874 2875 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2876 rpl->cmd = cmd; 2877} 2878 2879static void 2880send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2881{ 2882 struct mbuf *reply_mbuf; 2883 struct cpl_abort_req_rss *req = cplhdr(m); 2884 2885 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2886 m_set_priority(m, CPL_PRIORITY_DATA); 2887 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2888 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2889 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2890 m_free(m); 2891} 2892 2893/* 2894 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2895 */ 2896static inline int 2897is_neg_adv_abort(unsigned int status) 2898{ 2899 return status == CPL_ERR_RTX_NEG_ADVICE || 2900 status == CPL_ERR_PERSIST_NEG_ADVICE; 2901} 2902 2903static void 2904send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2905{ 2906 struct mbuf *reply_mbuf; 2907 struct cpl_abort_req_rss *req = cplhdr(m); 2908 2909 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2910 2911 if (!reply_mbuf) { 2912 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2913 req->status = rst_status; 2914 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2915 return; 2916 } 2917 2918 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2919 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2920 m_free(m); 2921 2922 /* 2923 * XXX need to sync with ARP as for SYN_RECV connections we can send 2924 * these messages while ARP is pending. For other connection states 2925 * it's not a problem. 2926 */ 2927 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2928} 2929 2930#ifdef notyet 2931static void 2932cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2933{ 2934 CXGB_UNIMPLEMENTED(); 2935#ifdef notyet 2936 struct request_sock *req = child->sk_user_data; 2937 2938 inet_csk_reqsk_queue_removed(parent, req); 2939 synq_remove(tcp_sk(child)); 2940 __reqsk_free(req); 2941 child->sk_user_data = NULL; 2942#endif 2943} 2944 2945 2946/* 2947 * Performs the actual work to abort a SYN_RECV connection. 2948 */ 2949static void 2950do_abort_syn_rcv(struct socket *child, struct socket *parent) 2951{ 2952 struct tcpcb *parenttp = so_sototcpcb(parent); 2953 struct tcpcb *childtp = so_sototcpcb(child); 2954 2955 /* 2956 * If the server is still open we clean up the child connection, 2957 * otherwise the server already did the clean up as it was purging 2958 * its SYN queue and the skb was just sitting in its backlog. 2959 */ 2960 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2961 cleanup_syn_rcv_conn(child, parent); 2962 inp_wlock(childtp->t_inpcb); 2963 t3_release_offload_resources(childtp->t_toe); 2964 inp_wunlock(childtp->t_inpcb); 2965 tcp_offload_close(childtp); 2966 } 2967} 2968#endif 2969 2970/* 2971 * Handle abort requests for a SYN_RECV connection. These need extra work 2972 * because the socket is on its parent's SYN queue. 
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();
#ifdef notyet
	struct socket *parent;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;        /* somehow we are not on the SYN queue */

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	so_unlock(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 */
static void
process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int needclose = 0;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	/* First ABORT_REQ for this connection: just record it and return. */
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		int error;

		error = abort_status_to_errno(so, req->status,
		    &rst_status);
		so_error_set(so, error);

		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
			so_sorwakeup(so);
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 is has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		needclose = 1;
	}
	inp_wunlock(tp->t_inpcb);

	if (needclose)
		tcp_offload_close(tp);

	send_abort_rpl(m, tdev, rst_status);
	return;
skip:
	inp_wunlock(tp->t_inpcb);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;

	/* Negative advice (retransmit/persist) is not a real abort. */
	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);

	/* Connection still embryonic (on the SYN queue): tear it down here. */
	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;

		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 *  Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		log(LOG_ERR, "abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		log(LOG_NOTICE, "disconnected toepcb\n");
		/* should be freed momentarily */
		return (0);
	}


	toepcb_hold(toep);
	process_abort_req(toep, m, toep->tp_toedev);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();

#ifdef notyet
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = so_sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	pass_open_abort(so, parent, m);
	so_unlock(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
	handle_pass_open_arp_failure(m_get_socket(m), m);
}

/*
 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
 */
static void
mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
{
	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
	unsigned int tid = GET_TID(req);

	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->peer_ip = req->peer_ip;	// req->peer_ip not overwritten yet
	rpl->opt0h = htonl(F_TCAM_BYPASS);
	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
	rpl->opt2 = 0;
	rpl->rsvd = rpl->opt2;	/* workaround for HW bug */
}

/*
 * Send a deferred reject to an accept request.
3206 */ 3207static void 3208reject_pass_request(struct toedev *tdev, struct mbuf *m) 3209{ 3210 struct mbuf *reply_mbuf; 3211 3212 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3213 mk_pass_accept_rpl(reply_mbuf, m); 3214 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3215 m_free(m); 3216} 3217 3218static void 3219handle_syncache_event(int event, void *arg) 3220{ 3221 struct toepcb *toep = arg; 3222 3223 switch (event) { 3224 case TOE_SC_ENTRY_PRESENT: 3225 /* 3226 * entry already exists - free toepcb 3227 * and l2t 3228 */ 3229 printf("syncache entry present\n"); 3230 toepcb_release(toep); 3231 break; 3232 case TOE_SC_DROP: 3233 /* 3234 * The syncache has given up on this entry 3235 * either it timed out, or it was evicted 3236 * we need to explicitly release the tid 3237 */ 3238 printf("syncache entry dropped\n"); 3239 toepcb_release(toep); 3240 break; 3241 default: 3242 log(LOG_ERR, "unknown syncache event %d\n", event); 3243 break; 3244 } 3245} 3246 3247static void 3248syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3249{ 3250 struct in_conninfo inc; 3251 struct tcpopt to; 3252 struct tcphdr th; 3253 struct inpcb *inp; 3254 int mss, wsf, sack, ts; 3255 uint32_t rcv_isn = ntohl(req->rcv_isn); 3256 3257 bzero(&to, sizeof(struct tcpopt)); 3258 inp = so_sotoinpcb(lso); 3259 3260 /* 3261 * Fill out information for entering us into the syncache 3262 */ 3263 bzero(&inc, sizeof(inc)); 3264 inc.inc_fport = th.th_sport = req->peer_port; 3265 inc.inc_lport = th.th_dport = req->local_port; 3266 th.th_seq = req->rcv_isn; 3267 th.th_flags = TH_SYN; 3268 3269 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3270 3271 3272 inc.inc_isipv6 = 0; 3273 inc.inc_len = 0; 3274 inc.inc_faddr.s_addr = req->peer_ip; 3275 inc.inc_laddr.s_addr = req->local_ip; 3276 3277 DPRINTF("syncache add of %d:%d %d:%d\n", 3278 ntohl(req->local_ip), ntohs(req->local_port), 3279 
	    ntohl(req->peer_ip), ntohs(req->peer_port));

	mss = req->tcp_options.mss;
	wsf = req->tcp_options.wsf;
	ts = req->tcp_options.tstamp;
	sack = req->tcp_options.sack;
	to.to_mss = mss;
	to.to_wscale = wsf;
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
	tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
}


/*
 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
 * lock held.  Note that the sock here is a listening socket that is not owned
 * by the TOE.
 *
 * On success a CPL_PASS_ACCEPT_RPL accepting the connection is sent via the
 * L2T entry and a new toepcb is placed on the listen context's SYN queue.
 * On failure the request is either deferred (T3) or answered with a TID
 * release / reject reply.
 */
static void
process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
    struct listen_ctx *lctx)
{
	int rt_flags;
	struct l2t_entry *e;
	struct iff_mac tim;
	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
	struct cpl_pass_accept_rpl *rpl;
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tom_data *d = TOM_DATA(tdev);
	struct t3cdev *cdev = d->cdev;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *newtoep;
	struct rtentry *dst;
	struct sockaddr_in nam;
	struct t3c_data *td = T3C_DATA(cdev);

	/* Allocate the reply up front so the reject path always has one. */
	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
	if (__predict_false(reply_mbuf == NULL)) {
		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
			t3_defer_reply(m, tdev, reject_pass_request);
		else {
			cxgb_queue_tid_release(cdev, tid);
			m_free(m);
		}
		DPRINTF("failed to get reply_mbuf\n");

		goto out;
	}

	if (tp->t_state != TCPS_LISTEN) {
		DPRINTF("socket not in listen state\n");

		goto reject;
	}

	/* Map the destination MAC/VLAN back to the receiving interface. */
	tim.mac_addr = req->dst_mac;
	tim.vlan_tag = ntohs(req->vlan_tag);
	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
		goto reject;
	}

#ifdef notyet
	/*
	 * XXX do route lookup to confirm that we're still listening on this
	 * address
	 */
	if (ip_route_input(skb, req->local_ip, req->peer_ip,
			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
		goto reject;
	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
	dst_release(skb->dst);	// done with the input route, release it
	skb->dst = NULL;

	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;
#endif
	/*
	 * XXX route validation above is not implemented; the local check
	 * below is currently a no-op placeholder.
	 */
	rt_flags = RTF_LOCAL;
	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;

	/*
	 * Calculate values and add to syncache
	 */

	newtoep = toepcb_alloc();
	if (newtoep == NULL)
		goto reject;

	bzero(&nam, sizeof(struct sockaddr_in));

	nam.sin_len = sizeof(struct sockaddr_in);
	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr =req->peer_ip;
	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);

	if (dst == NULL) {
		printf("failed to find route\n");
		goto reject;
	}
	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
	    (struct sockaddr *)&nam);
	if (e == NULL) {
		/*
		 * NOTE(review): failure is only logged here, yet e->idx and
		 * e->smt_idx are dereferenced below when building opt0h —
		 * looks like a possible NULL dereference; confirm whether
		 * t3_l2t_get() can actually fail on this path.
		 */
		DPRINTF("failed to get l2t\n");
	}
	/*
	 * Point to our listen socket until accept
	 */
	newtoep->tp_tp = tp;
	newtoep->tp_flags = TP_SYN_RCVD;
	newtoep->tp_tid = tid;
	newtoep->tp_toedev = tdev;
	tp->rcv_wnd = select_rcv_wnd(tdev, so);

	cxgb_insert_tid(cdev, d->client, newtoep, tid);
	so_lock(so);
	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
	so_unlock(so);

	/* Enable DDP only when tuned on, not disabled per-socket, and the
	 * receive window is large enough to be worthwhile. */
	newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;

	if (newtoep->tp_ulp_mode) {
		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);

		/* No mbuf for the TCB update — fall back to non-DDP mode. */
		if (ddp_mbuf == NULL)
			newtoep->tp_ulp_mode = 0;
	}

	CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
	    TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_hold(newtoep);
	syncache_add_accept_req(req, so, newtoep);

	rpl = cplhdr(reply_mbuf);
	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wr_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->opt2 = htonl(calc_opt2(so, tdev));
	rpl->rsvd = rpl->opt2;		/* workaround for HW bug */
	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten

	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
	rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
				  CPL_PASS_OPEN_ACCEPT);

	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);

	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));

	l2t_send(cdev, reply_mbuf, e);
	m_free(m);
	if (newtoep->tp_ulp_mode) {
		/* Park DDP off initially; the timer-workaround bits are a
		 * documented HW erratum mitigation. */
		__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
				V_TF_DDP_OFF(1) |
				TP_DDP_TIMER_WORKAROUND_MASK,
				V_TF_DDP_OFF(1) |
				TP_DDP_TIMER_WORKAROUND_VAL, 1);
	} else
		printf("not offloading\n");



	return;
reject:
	/*
	 * NOTE(review): 'reject' is reachable before newtoep is assigned
	 * (non-LISTEN state, GET_IFF_FROM_MAC failure, toepcb_alloc failure,
	 * no route).  mk_tid_release() is then handed an uninitialized
	 * pointer on non-T3 devices — verify mk_tid_release's use of it.
	 */
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
		mk_pass_accept_rpl(reply_mbuf, m);
	else
		mk_tid_release(reply_mbuf, newtoep, tid);
	cxgb_ofld_send(cdev, reply_mbuf);
	m_free(m);
out:
#if 0
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#else
	return;
#endif
}
/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 *
 * Thin dispatch wrapper: resolves the listen context passed as ctx and hands
 * the request to process_pass_accept_req().  The VALIDATE_TID block below is
 * disabled legacy (Linux-style) validation code and is not compiled in.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	/* NOTE(review): this disabled block still references Linux-isms
	 * (lsk, printk, unlikely) and would not compile if enabled. */
	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));

	inp_lock_assert(tp->t_inpcb);

	/* 40 = fixed IP + TCP header overhead subtracted from the negotiated
	 * MTU — presumably sizeof(struct ip) + sizeof(struct tcphdr);
	 * confirm. */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	/* Only honor window scaling when both sides agreed to it. */
	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE))
		tp->rcv_scale = tp->request_r_scale;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/* Seed every send-side sequence pointer from the HW-provided ISN. */
	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);

	/*
	 *XXXXXXXXXXX
	 *
	 */
#ifdef notyet
	so->so_proto->pr_ctloutput = t3_ctloutput;
#endif

#if 0
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif
	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_starttime = ticks;
	tp->t_state = TCPS_ESTABLISHED;
	soisconnected(so);
}

/*
 * Build the in_conninfo/tcpopt/tcphdr triple describing a passively
 * established connection and hand it to the host syncache for expansion
 * into a full socket.  Returns the value of tcp_offload_syncache_expand().
 */
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;

#ifdef MAC
#error	"no MAC support"
#endif

	opt = ntohs(req->tcp_opt);

	bzero(&to, sizeof(struct tcpopt));

	/*
	 * Fill out information for entering us into the syncache
	 */
	bzero(&inc, sizeof(inc));
	/*
	 * NOTE(review): 'th' is only partially initialized (sport, dport,
	 * seq, flags); the remaining tcphdr fields are stack garbage.
	 * Confirm tcp_offload_syncache_expand() reads only the fields set
	 * here, or bzero the header first.
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;

	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	/* Translate the HW-encoded options; 40 = IP + TCP header overhead. */
	mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf = G_TCPOPT_WSCALE_OK(opt);
	ts = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);

	to.to_mss = mss;
	to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message.
   XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Take the embryonic connection off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* 'so' now refers to the newly created connection socket. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* The TOE manages its own buffering; don't let sockbuf coalesce. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);
	inp_wunlock(tp->t_inpcb);

	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	/* NOTE(review): unconditional LOG_NOTICE on every call looks like
	 * leftover debugging output. */
	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	V_tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	/* Connection already torn down — just return the ATID. */
	if (tp == NULL) {
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3888 */ 3889static void 3890wr_ack(struct toepcb *toep, struct mbuf *m) 3891{ 3892 struct tcpcb *tp = toep->tp_tp; 3893 struct cpl_wr_ack *hdr = cplhdr(m); 3894 struct socket *so; 3895 unsigned int credits = ntohs(hdr->credits); 3896 u32 snd_una = ntohl(hdr->snd_una); 3897 int bytes = 0; 3898 struct sockbuf *snd; 3899 3900 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3901 3902 inp_wlock(tp->t_inpcb); 3903 so = inp_inpcbtosocket(tp->t_inpcb); 3904 toep->tp_wr_avail += credits; 3905 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3906 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3907 3908 while (credits) { 3909 struct mbuf *p = peek_wr(toep); 3910 3911 if (__predict_false(!p)) { 3912 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3913 "nothing pending, state %u wr_avail=%u\n", 3914 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3915 break; 3916 } 3917 CTR2(KTR_TOM, 3918 "wr_ack: p->credits=%d p->bytes=%d", 3919 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3920 KASSERT(p->m_pkthdr.csum_data != 0, 3921 ("empty request still on list")); 3922 3923 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3924 3925#if DEBUG_WR > 1 3926 struct tx_data_wr *w = cplhdr(p); 3927 log(LOG_ERR, 3928 "TID %u got %u WR credits, need %u, len %u, " 3929 "main body %u, frags %u, seq # %u, ACK una %u," 3930 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3931 toep->tp_tid, credits, p->csum, p->len, 3932 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3933 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3934 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3935#endif 3936 p->m_pkthdr.csum_data -= credits; 3937 break; 3938 } else { 3939 dequeue_wr(toep); 3940 credits -= p->m_pkthdr.csum_data; 3941 bytes += p->m_pkthdr.len; 3942 CTR3(KTR_TOM, 3943 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3944 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3945 3946 m_free(p); 3947 } 3948 } 3949 3950#if DEBUG_WR 3951 
check_wr_invariants(tp); 3952#endif 3953 3954 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3955#if VALIDATE_SEQ 3956 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3957 3958 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3959 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3960 toep->tp_tid, tp->snd_una); 3961#endif 3962 goto out_free; 3963 } 3964 3965 if (tp->snd_una != snd_una) { 3966 tp->snd_una = snd_una; 3967 tp->ts_recent_age = ticks; 3968#ifdef notyet 3969 /* 3970 * Keep ARP entry "minty fresh" 3971 */ 3972 dst_confirm(sk->sk_dst_cache); 3973#endif 3974 if (tp->snd_una == tp->snd_nxt) 3975 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3976 } 3977 3978 snd = so_sockbuf_snd(so); 3979 if (bytes) { 3980 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3981 snd = so_sockbuf_snd(so); 3982 sockbuf_lock(snd); 3983 sbdrop_locked(snd, bytes); 3984 so_sowwakeup_locked(so); 3985 } 3986 3987 if (snd->sb_sndptroff < snd->sb_cc) 3988 t3_push_frames(so, 0); 3989 3990out_free: 3991 inp_wunlock(tp->t_inpcb); 3992 m_free(m); 3993} 3994 3995/* 3996 * Handler for TX_DATA_ACK CPL messages. 3997 */ 3998static int 3999do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 4000{ 4001 struct toepcb *toep = (struct toepcb *)ctx; 4002 4003 VALIDATE_SOCK(so); 4004 4005 wr_ack(toep, m); 4006 return 0; 4007} 4008 4009/* 4010 * Handler for TRACE_PKT CPL messages. Just sink these packets. 4011 */ 4012static int 4013do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 4014{ 4015 m_freem(m); 4016 return 0; 4017} 4018 4019/* 4020 * Reset a connection that is on a listener's SYN queue or accept queue, 4021 * i.e., one that has not had a struct socket associated with it. 4022 * Must be called from process context. 4023 * 4024 * Modeled after code in inet_csk_listen_stop(). 
4025 */ 4026static void 4027t3_reset_listen_child(struct socket *child) 4028{ 4029 struct tcpcb *tp = so_sototcpcb(child); 4030 4031 t3_send_reset(tp->t_toe); 4032} 4033 4034 4035static void 4036t3_child_disconnect(struct socket *so, void *arg) 4037{ 4038 struct tcpcb *tp = so_sototcpcb(so); 4039 4040 if (tp->t_flags & TF_TOE) { 4041 inp_wlock(tp->t_inpcb); 4042 t3_reset_listen_child(so); 4043 inp_wunlock(tp->t_inpcb); 4044 } 4045} 4046 4047/* 4048 * Disconnect offloaded established but not yet accepted connections sitting 4049 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4050 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4051 */ 4052void 4053t3_disconnect_acceptq(struct socket *listen_so) 4054{ 4055 4056 so_lock(listen_so); 4057 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4058 so_unlock(listen_so); 4059} 4060 4061/* 4062 * Reset offloaded connections sitting on a server's syn queue. As above 4063 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4064 */ 4065 4066void 4067t3_reset_synq(struct listen_ctx *lctx) 4068{ 4069 struct toepcb *toep; 4070 4071 so_lock(lctx->lso); 4072 while (!LIST_EMPTY(&lctx->synq_head)) { 4073 toep = LIST_FIRST(&lctx->synq_head); 4074 LIST_REMOVE(toep, synq_entry); 4075 toep->tp_tp = NULL; 4076 t3_send_reset(toep); 4077 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4078 toepcb_release(toep); 4079 } 4080 so_unlock(lctx->lso); 4081} 4082 4083 4084int 4085t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4086 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4087 unsigned int pg_off, unsigned int color) 4088{ 4089 unsigned int i, j, pidx; 4090 struct pagepod *p; 4091 struct mbuf *m; 4092 struct ulp_mem_io *req; 4093 unsigned int tid = toep->tp_tid; 4094 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4095 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4096 4097 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4098 gl, nppods, tag, maxoff, pg_off, color); 4099 4100 for (i = 0; i < nppods; ++i) { 4101 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4102 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4103 req = mtod(m, struct ulp_mem_io *); 4104 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4105 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4106 req->wr.wr_lo = 0; 4107 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4108 V_ULPTX_CMD(ULP_MEM_WRITE)); 4109 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4110 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4111 4112 p = (struct pagepod *)(req + 1); 4113 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4114 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4115 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4116 V_PPOD_COLOR(color)); 4117 p->pp_max_offset = htonl(maxoff); 4118 p->pp_page_offset = htonl(pg_off); 4119 p->pp_rsvd = 0; 4120 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4121 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4122 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4123 } else 4124 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4125 send_or_defer(toep, m, 0); 4126 ppod_addr += PPOD_SIZE; 4127 } 4128 return (0); 4129} 4130 4131/* 4132 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4133 */ 4134static inline void 4135mk_cpl_barrier_ulp(struct cpl_barrier *b) 4136{ 4137 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4138 4139 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4140 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4141 b->opcode = CPL_BARRIER; 4142} 4143 4144/* 4145 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4146 */ 4147static inline void 4148mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4149{ 4150 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4151 4152 txpkt = (struct ulp_txpkt *)req; 4153 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4154 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4155 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4156 req->cpuno = htons(cpuno); 4157} 4158 4159/* 4160 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4161 */ 4162static inline void 4163mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4164 unsigned int word, uint64_t mask, uint64_t val) 4165{ 4166 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4167 4168 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4169 tid, word, mask, val); 4170 4171 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4172 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4173 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4174 req->reply = V_NO_REPLY(1); 4175 req->cpu_idx = 0; 4176 req->word = htons(word); 4177 req->mask = htobe64(mask); 4178 req->val = htobe64(val); 4179} 4180 4181/* 4182 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
 */
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
    unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	/* Return 'credits' of RX window and refresh the delayed-ACK mode
	 * from the per-device tunable. */
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
				 V_RX_CREDITS(credits));
}

/*
 * Cancel HW DDP buffer 'bufidx' for the connection via a compound work
 * request: barrier, TCB flag update, TCB read-back, barrier.  The GET_TCB
 * reply tells us how much data landed in the buffer before cancellation.
 */
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already. However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	if (bufidx == 0)
		/* Invalidate buf 0 and switch the active buffer to buf 1. */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		/* Invalidate buf 1 and switch the active buffer to buf 0. */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the toepcb of the connection owning the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
4275 */ 4276void 4277t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, 4278 unsigned int tag1, unsigned int len) 4279{ 4280 unsigned int wrlen; 4281 struct mbuf *m; 4282 struct work_request_hdr *wr; 4283 struct cpl_get_tcb *getreq; 4284 struct cpl_set_tcb_field *req; 4285 struct ddp_state *p = &toep->tp_ddp_state; 4286 4287 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", 4288 bufidx, tag0, tag1, len); 4289#if 0 4290 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4291#endif 4292 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); 4293 m = m_gethdr_nofail(wrlen); 4294 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4295 wr = mtod(m, struct work_request_hdr *); 4296 m->m_pkthdr.len = m->m_len = wrlen; 4297 bzero(wr, wrlen); 4298 4299 4300 /* Set the ATOMIC flag to make sure that TP processes the following 4301 * CPLs in an atomic manner and no wire segments can be interleaved. 4302 */ 4303 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); 4304 req = (struct cpl_set_tcb_field *)(wr + 1); 4305 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, 4306 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | 4307 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, 4308 V_TCB_RX_DDP_BUF0_TAG(tag0) | 4309 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); 4310 req++; 4311 if (bufidx == 0) { 4312 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, 4313 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4314 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 4315 req++; 4316 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4317 V_TF_DDP_PUSH_DISABLE_0(1) | 4318 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4319 V_TF_DDP_PUSH_DISABLE_0(0) | 4320 V_TF_DDP_BUF0_VALID(1)); 4321 } else { 4322 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, 4323 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), 4324 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); 4325 req++; 4326 mk_set_tcb_field_ulp(req, toep->tp_tid, 
W_TCB_RX_DDP_FLAGS, 4327 V_TF_DDP_PUSH_DISABLE_1(1) | 4328 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4329 V_TF_DDP_PUSH_DISABLE_1(0) | 4330 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); 4331 } 4332 4333 getreq = (struct cpl_get_tcb *)(req + 1); 4334 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4335 4336 /* Keep track of the number of oustanding CPL_GET_TCB requests 4337 */ 4338 p->get_tcb_count++; 4339 4340#ifdef T3_TRACE 4341 T3_TRACE4(TIDTB(sk), 4342 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " 4343 "len %d", 4344 bufidx, tag0, tag1, len); 4345#endif 4346 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4347} 4348 4349/* 4350 * Sends a compound WR containing all the CPL messages needed to program the 4351 * two HW DDP buffers, namely optionally setting up the length and offset of 4352 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. 4353 */ 4354void 4355t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, 4356 unsigned int len1, unsigned int offset1, 4357 uint64_t ddp_flags, uint64_t flag_mask, int modulate) 4358{ 4359 unsigned int wrlen; 4360 struct mbuf *m; 4361 struct work_request_hdr *wr; 4362 struct cpl_set_tcb_field *req; 4363 4364 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", 4365 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); 4366 4367#if 0 4368 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4369#endif 4370 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + 4371 (len1 ? sizeof(*req) : 0) + 4372 (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); 4373 m = m_gethdr_nofail(wrlen); 4374 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4375 wr = mtod(m, struct work_request_hdr *); 4376 bzero(wr, wrlen); 4377 4378 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4379 m->m_pkthdr.len = m->m_len = wrlen; 4380 4381 req = (struct cpl_set_tcb_field *)(wr + 1); 4382 if (len0) { /* program buffer 0 offset and length */ 4383 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, 4384 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 4385 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4386 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | 4387 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); 4388 req++; 4389 } 4390 if (len1) { /* program buffer 1 offset and length */ 4391 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, 4392 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 4393 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, 4394 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | 4395 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); 4396 req++; 4397 } 4398 4399 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, 4400 ddp_flags); 4401 4402 if (modulate) { 4403 mk_rx_data_ack_ulp(toep, 4404 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, 4405 toep->tp_copied_seq - toep->tp_rcv_wup); 4406 toep->tp_rcv_wup = toep->tp_copied_seq; 4407 } 4408 4409#ifdef T3_TRACE 4410 T3_TRACE5(TIDTB(sk), 4411 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " 4412 "modulate %d", 4413 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, 4414 modulate); 4415#endif 4416 4417 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4418} 4419 4420void 4421t3_init_wr_tab(unsigned int wr_len) 4422{ 4423 int i; 4424 4425 if (mbuf_wrs[1]) /* already initialized */ 4426 return; 4427 4428 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 4429 int sgl_len = (3 * i) / 2 + (i & 1); 4430 4431 sgl_len += 3; 4432 mbuf_wrs[i] = sgl_len <= wr_len ? 
4433 1 : 1 + (sgl_len - 2) / (wr_len - 1); 4434 } 4435 4436 wrlen = wr_len * 8; 4437} 4438 4439int 4440t3_init_cpl_io(void) 4441{ 4442#ifdef notyet 4443 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 4444 if (!tcphdr_skb) { 4445 log(LOG_ERR, 4446 "Chelsio TCP offload: can't allocate sk_buff\n"); 4447 return -1; 4448 } 4449 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 4450 tcphdr_skb->h.raw = tcphdr_skb->data; 4451 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 4452#endif 4453 4454 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 4455 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 4456 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 4457 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 4458 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 4459 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 4460 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 4461 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 4462 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 4463 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 4464 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 4465 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 4466 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 4467 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 4468 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 4469 return (0); 4470} 4471 4472