cxgb_cpl_io.c revision 185088
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 185088 2008-11-19 09:39:34Z zec $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/sockbuf.h> 46#include <sys/sysctl.h> 47#include <sys/syslog.h> 48#include <sys/protosw.h> 49#include <sys/priv.h> 50 51#if __FreeBSD_version >= 800044 52#include <sys/vimage.h> 53#else 54#define V_tcp_do_autosndbuf tcp_do_autosndbuf 55#define V_tcp_autosndbuf_max tcp_autosndbuf_max 56#define V_tcp_do_rfc1323 tcp_do_rfc1323 57#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf 58#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max 59#define V_tcpstat tcpstat 60#endif 61 62#include <net/if.h> 63#include <net/route.h> 64 65#include <netinet/in.h> 66#include <netinet/in_pcb.h> 67#include <netinet/in_systm.h> 68#include <netinet/in_var.h> 69 70 71#include <cxgb_osdep.h> 72#include <sys/mbufq.h> 73 74#include <netinet/ip.h> 75#include <netinet/tcp_var.h> 76#include <netinet/tcp_fsm.h> 77#include <netinet/tcp_offload.h> 78#include <netinet/tcp_seq.h> 79#include <netinet/tcp_syncache.h> 80#include <netinet/tcp_timer.h> 81#include <net/route.h> 82 83#include <t3cdev.h> 84#include <common/cxgb_firmware_exports.h> 85#include <common/cxgb_t3_cpl.h> 86#include <common/cxgb_tcb.h> 87#include <common/cxgb_ctl_defs.h> 88#include <cxgb_offload.h> 89#include <vm/vm.h> 90#include <vm/pmap.h> 91#include <machine/bus.h> 92#include <sys/mvec.h> 93#include <ulp/toecore/cxgb_toedev.h> 94#include <ulp/tom/cxgb_l2t.h> 95#include <ulp/tom/cxgb_defs.h> 96#include <ulp/tom/cxgb_tom.h> 97#include <ulp/tom/cxgb_t3_ddp.h> 98#include <ulp/tom/cxgb_toepcb.h> 99#include 
<ulp/tom/cxgb_tcp.h> 100#include <ulp/tom/cxgb_tcp_offload.h> 101 102/* 103 * For ULP connections HW may add headers, e.g., for digests, that aren't part 104 * of the messages sent by the host but that are part of the TCP payload and 105 * therefore consume TCP sequence space. Tx connection parameters that 106 * operate in TCP sequence space are affected by the HW additions and need to 107 * compensate for them to accurately track TCP sequence numbers. This array 108 * contains the compensating extra lengths for ULP packets. It is indexed by 109 * a packet's ULP submode. 110 */ 111const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 112 113#ifdef notyet 114/* 115 * This sk_buff holds a fake header-only TCP segment that we use whenever we 116 * need to exploit SW TCP functionality that expects TCP headers, such as 117 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 118 * CPUs without locking. 119 */ 120static struct mbuf *tcphdr_mbuf __read_mostly; 121#endif 122 123/* 124 * Size of WRs in bytes. Note that we assume all devices we are handling have 125 * the same WR size. 126 */ 127static unsigned int wrlen __read_mostly; 128 129/* 130 * The number of WRs needed for an skb depends on the number of page fragments 131 * in the skb and whether it has any payload in its main body. This maps the 132 * length of the gather list represented by an skb into the # of necessary WRs. 133 */ 134static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; 135 136/* 137 * Max receive window supported by HW in bytes. Only a small part of it can 138 * be set through option0, the rest needs to be set through RX_DATA_ACK. 139 */ 140#define MAX_RCV_WND ((1U << 27) - 1) 141 142/* 143 * Min receive window. We want it to be large enough to accommodate receive 144 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
145 */ 146#define MIN_RCV_WND (24 * 1024U) 147#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) 148 149#define VALIDATE_SEQ 0 150#define VALIDATE_SOCK(so) 151#define DEBUG_WR 0 152 153#define TCP_TIMEWAIT 1 154#define TCP_CLOSE 2 155#define TCP_DROP 3 156 157static void t3_send_reset(struct toepcb *toep); 158static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); 159static inline void free_atid(struct t3cdev *cdev, unsigned int tid); 160static void handle_syncache_event(int event, void *arg); 161 162static inline void 163SBAPPEND(struct sockbuf *sb, struct mbuf *n) 164{ 165 struct mbuf *m; 166 167 m = sb->sb_mb; 168 while (m) { 169 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 170 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 171 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 172 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 173 m->m_next, m->m_nextpkt, m->m_flags)); 174 m = m->m_next; 175 } 176 m = n; 177 while (m) { 178 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 179 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 180 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 181 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 182 m->m_next, m->m_nextpkt, m->m_flags)); 183 m = m->m_next; 184 } 185 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); 186 sbappendstream_locked(sb, n); 187 m = sb->sb_mb; 188 189 while (m) { 190 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 191 m->m_next, m->m_nextpkt, m->m_flags)); 192 m = m->m_next; 193 } 194} 195 196static inline int 197is_t3a(const struct toedev *dev) 198{ 199 return (dev->tod_ttid == TOE_ID_CHELSIO_T3); 200} 201 202static void 203dump_toepcb(struct toepcb *toep) 204{ 205 DPRINTF("qset_idx=%d 
qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", 206 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, 207 toep->tp_mtu_idx, toep->tp_tid); 208 209 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", 210 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 211 toep->tp_mss_clamp, toep->tp_flags); 212} 213 214#ifndef RTALLOC2_DEFINED 215static struct rtentry * 216rtalloc2(struct sockaddr *dst, int report, u_long ignflags) 217{ 218 struct rtentry *rt = NULL; 219 220 if ((rt = rtalloc1(dst, report, ignflags)) != NULL) 221 RT_UNLOCK(rt); 222 223 return (rt); 224} 225#endif 226 227/* 228 * Determine whether to send a CPL message now or defer it. A message is 229 * deferred if the connection is in SYN_SENT since we don't know the TID yet. 230 * For connections in other states the message is sent immediately. 231 * If through_l2t is set the message is subject to ARP processing, otherwise 232 * it is sent directly. 233 */ 234static inline void 235send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) 236{ 237 struct tcpcb *tp = toep->tp_tp; 238 239 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { 240 inp_wlock(tp->t_inpcb); 241 mbufq_tail(&toep->out_of_order_queue, m); // defer 242 inp_wunlock(tp->t_inpcb); 243 } else if (through_l2t) 244 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T 245 else 246 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly 247} 248 249static inline unsigned int 250mkprio(unsigned int cntrl, const struct toepcb *toep) 251{ 252 return (cntrl); 253} 254 255/* 256 * Populate a TID_RELEASE WR. The skb must be already propely sized. 
257 */ 258static inline void 259mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) 260{ 261 struct cpl_tid_release *req; 262 263 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); 264 m->m_pkthdr.len = m->m_len = sizeof(*req); 265 req = mtod(m, struct cpl_tid_release *); 266 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 267 req->wr.wr_lo = 0; 268 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 269} 270 271static inline void 272make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) 273{ 274 INIT_VNET_INET(so->so_vnet); 275 struct tcpcb *tp = so_sototcpcb(so); 276 struct toepcb *toep = tp->t_toe; 277 struct tx_data_wr *req; 278 struct sockbuf *snd; 279 280 inp_lock_assert(tp->t_inpcb); 281 snd = so_sockbuf_snd(so); 282 283 req = mtod(m, struct tx_data_wr *); 284 m->m_len = sizeof(*req); 285 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 286 req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); 287 /* len includes the length of any HW ULP additions */ 288 req->len = htonl(len); 289 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 290 /* V_TX_ULP_SUBMODE sets both the mode and submode */ 291 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | 292 V_TX_URG(/* skb_urgent(skb) */ 0 ) | 293 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && 294 (tail ? 0 : 1)))); 295 req->sndseq = htonl(tp->snd_nxt); 296 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 297 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 298 V_TX_CPU_IDX(toep->tp_qset)); 299 300 /* Sendbuffer is in units of 32KB. 
301 */ 302 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 303 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15)); 304 else { 305 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); 306 } 307 308 toep->tp_flags |= TP_DATASENT; 309 } 310} 311 312#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ 313 314int 315t3_push_frames(struct socket *so, int req_completion) 316{ 317 struct tcpcb *tp = so_sototcpcb(so); 318 struct toepcb *toep = tp->t_toe; 319 320 struct mbuf *tail, *m0, *last; 321 struct t3cdev *cdev; 322 struct tom_data *d; 323 int state, bytes, count, total_bytes; 324 bus_dma_segment_t segs[TX_MAX_SEGS], *segp; 325 struct sockbuf *snd; 326 327 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { 328 DPRINTF("tcp state=%d\n", tp->t_state); 329 return (0); 330 } 331 332 state = so_state_get(so); 333 334 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { 335 DPRINTF("disconnecting\n"); 336 337 return (0); 338 } 339 340 inp_lock_assert(tp->t_inpcb); 341 342 snd = so_sockbuf_snd(so); 343 sockbuf_lock(snd); 344 345 d = TOM_DATA(toep->tp_toedev); 346 cdev = d->cdev; 347 348 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; 349 350 total_bytes = 0; 351 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", 352 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); 353 354 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { 355 KASSERT(tail, ("sbdrop error")); 356 last = tail = tail->m_next; 357 } 358 359 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { 360 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); 361 sockbuf_unlock(snd); 362 363 return (0); 364 } 365 366 toep->tp_m_last = NULL; 367 while (toep->tp_wr_avail && (tail != NULL)) { 368 count = bytes = 0; 369 segp = segs; 370 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { 371 sockbuf_unlock(snd); 372 return (0); 373 } 374 /* 375 * If the data in tail fits as in-line, then 376 * make an immediate data wr. 
377 */ 378 if (tail->m_len <= IMM_LEN) { 379 count = 1; 380 bytes = tail->m_len; 381 last = tail; 382 tail = tail->m_next; 383 m_set_sgl(m0, NULL); 384 m_set_sgllen(m0, 0); 385 make_tx_data_wr(so, m0, bytes, tail); 386 m_append(m0, bytes, mtod(last, caddr_t)); 387 KASSERT(!m0->m_next, ("bad append")); 388 } else { 389 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) 390 && (tail != NULL) && (count < TX_MAX_SEGS-1)) { 391 bytes += tail->m_len; 392 last = tail; 393 count++; 394 /* 395 * technically an abuse to be using this for a VA 396 * but less gross than defining my own structure 397 * or calling pmap_kextract from here :-| 398 */ 399 segp->ds_addr = (bus_addr_t)tail->m_data; 400 segp->ds_len = tail->m_len; 401 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", 402 count, mbuf_wrs[count], tail->m_data, tail->m_len); 403 segp++; 404 tail = tail->m_next; 405 } 406 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", 407 toep->tp_wr_avail, count, mbuf_wrs[count], tail); 408 409 m_set_sgl(m0, segs); 410 m_set_sgllen(m0, count); 411 make_tx_data_wr(so, m0, bytes, tail); 412 } 413 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); 414 415 if (tail) { 416 snd->sb_sndptr = tail; 417 toep->tp_m_last = NULL; 418 } else 419 toep->tp_m_last = snd->sb_sndptr = last; 420 421 422 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); 423 424 snd->sb_sndptroff += bytes; 425 total_bytes += bytes; 426 toep->tp_write_seq += bytes; 427 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" 428 " tail=%p sndptr=%p sndptroff=%d", 429 toep->tp_wr_avail, count, mbuf_wrs[count], 430 tail, snd->sb_sndptr, snd->sb_sndptroff); 431 if (tail) 432 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" 433 " tp_m_last=%p tailbuf=%p snd_una=0x%08x", 434 total_bytes, toep->tp_m_last, tail->m_data, 435 tp->snd_una); 436 else 437 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" 438 " tp_m_last=%p snd_una=0x%08x", 439 total_bytes, toep->tp_m_last, tp->snd_una); 440 441 442#ifdef KTR 443{ 444 int i; 445 
446 i = 0; 447 while (i < count && m_get_sgllen(m0)) { 448 if ((count - i) >= 3) { 449 CTR6(KTR_TOM, 450 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 451 " len=%d pa=0x%zx len=%d", 452 segs[i].ds_addr, segs[i].ds_len, 453 segs[i + 1].ds_addr, segs[i + 1].ds_len, 454 segs[i + 2].ds_addr, segs[i + 2].ds_len); 455 i += 3; 456 } else if ((count - i) == 2) { 457 CTR4(KTR_TOM, 458 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 459 " len=%d", 460 segs[i].ds_addr, segs[i].ds_len, 461 segs[i + 1].ds_addr, segs[i + 1].ds_len); 462 i += 2; 463 } else { 464 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", 465 segs[i].ds_addr, segs[i].ds_len); 466 i++; 467 } 468 469 } 470} 471#endif 472 /* 473 * remember credits used 474 */ 475 m0->m_pkthdr.csum_data = mbuf_wrs[count]; 476 m0->m_pkthdr.len = bytes; 477 toep->tp_wr_avail -= mbuf_wrs[count]; 478 toep->tp_wr_unacked += mbuf_wrs[count]; 479 480 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || 481 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 482 struct work_request_hdr *wr = cplhdr(m0); 483 484 wr->wr_hi |= htonl(F_WR_COMPL); 485 toep->tp_wr_unacked = 0; 486 } 487 KASSERT((m0->m_pkthdr.csum_data > 0) && 488 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", 489 m0->m_pkthdr.csum_data)); 490 m0->m_type = MT_DONTFREE; 491 enqueue_wr(toep, m0); 492 DPRINTF("sending offload tx with %d bytes in %d segments\n", 493 bytes, count); 494 l2t_send(cdev, m0, toep->tp_l2t); 495 } 496 sockbuf_unlock(snd); 497 return (total_bytes); 498} 499 500/* 501 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail 502 * under any circumstances. We take the easy way out and always queue the 503 * message to the write_queue. We can optimize the case where the queue is 504 * already empty though the optimization is probably not worth it. 
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	/* Flush any pending tx data first so the FIN lands after it. */
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	/* A FIN was already sent for this connection; nothing to do. */
	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	/* m_gethdr_nofail() cannot return NULL, so no error path here. */
	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	/* Mark the FIN sent before dropping the inp lock. */
	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	/* Rewrite the command in place and retransmit the same mbuf. */
	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	/*
	 * NOTE(review): nofail is never consulted — m_gethdr_nofail() cannot
	 * fail, so the "may skip sending" contract described above is not
	 * actually implemented; confirm the intent.
	 */
	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	/*
	 * NOTE(review): unlike t3_send_rx_modulate() below, m_len and
	 * m_pkthdr.len are not set here — presumably m_gethdr_nofail()
	 * already initializes them to the requested size; verify.
	 */
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	/* Return all credits accumulated since the last RX_DATA_ACK. */
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(1) |
				 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	/* Record that everything copied so far has been credited back. */
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
/* Entire body is compiled out; urgent-data support is not implemented yet. */
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;	/* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;	/* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	/* Ignore the notification if the socket can no longer receive. */
	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

/*
 * Returns nonzero when a non-default delayed-ACK mode may be used.
 *
 * NOTE(review): the first operand of the || makes the ULP_MODE_TCPDDP
 * clause unreachable — any nonzero tp_ulp_mode already satisfies the
 * expression.  Possibly "toep->tp_ulp_mode == 0 ||" or a plain && was
 * intended; confirm against the Chelsio T3 documentation before changing.
 */
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
	    (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
698 */ 699#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 700 701/* 702 * Called after some received data has been read. It returns RX credits 703 * to the HW for the amount of data processed. 704 */ 705void 706t3_cleanup_rbuf(struct tcpcb *tp, int copied) 707{ 708 struct toepcb *toep = tp->t_toe; 709 struct socket *so; 710 struct toedev *dev; 711 int dack_mode, must_send, read; 712 u32 thres, credits, dack = 0; 713 struct sockbuf *rcv; 714 715 so = inp_inpcbtosocket(tp->t_inpcb); 716 rcv = so_sockbuf_rcv(so); 717 718 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 719 (tp->t_state == TCPS_FIN_WAIT_2))) { 720 if (copied) { 721 sockbuf_lock(rcv); 722 toep->tp_copied_seq += copied; 723 sockbuf_unlock(rcv); 724 } 725 726 return; 727 } 728 729 inp_lock_assert(tp->t_inpcb); 730 731 sockbuf_lock(rcv); 732 if (copied) 733 toep->tp_copied_seq += copied; 734 else { 735 read = toep->tp_enqueued_bytes - rcv->sb_cc; 736 toep->tp_copied_seq += read; 737 } 738 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 739 toep->tp_enqueued_bytes = rcv->sb_cc; 740 sockbuf_unlock(rcv); 741 742 if (credits > rcv->sb_mbmax) { 743 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 744 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 745 credits = rcv->sb_mbmax; 746 } 747 748 749 /* 750 * XXX this won't accurately reflect credit return - we need 751 * to look at the difference between the amount that has been 752 * put in the recv sockbuf and what is there now 753 */ 754 755 if (__predict_false(!credits)) 756 return; 757 758 dev = toep->tp_toedev; 759 thres = TOM_TUNABLE(dev, rx_credit_thres); 760 761 if (__predict_false(thres == 0)) 762 return; 763 764 if (is_delack_mode_valid(dev, toep)) { 765 dack_mode = TOM_TUNABLE(dev, delack); 766 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 767 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 768 769 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 770 dack = F_RX_DACK_CHANGE 
| 771 V_RX_DACK_MODE(dack_mode); 772 } 773 } else 774 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 775 776 /* 777 * For coalescing to work effectively ensure the receive window has 778 * at least 16KB left. 779 */ 780 must_send = credits + 16384 >= tp->rcv_wnd; 781 782 if (must_send || credits >= thres) 783 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 784} 785 786static int 787cxgb_toe_disconnect(struct tcpcb *tp) 788{ 789 struct socket *so; 790 791 DPRINTF("cxgb_toe_disconnect\n"); 792 793 so = inp_inpcbtosocket(tp->t_inpcb); 794 close_conn(so); 795 return (0); 796} 797 798static int 799cxgb_toe_reset(struct tcpcb *tp) 800{ 801 struct toepcb *toep = tp->t_toe; 802 803 t3_send_reset(toep); 804 805 /* 806 * unhook from socket 807 */ 808 tp->t_flags &= ~TF_TOE; 809 toep->tp_tp = NULL; 810 tp->t_toe = NULL; 811 return (0); 812} 813 814static int 815cxgb_toe_send(struct tcpcb *tp) 816{ 817 struct socket *so; 818 819 DPRINTF("cxgb_toe_send\n"); 820 dump_toepcb(tp->t_toe); 821 822 so = inp_inpcbtosocket(tp->t_inpcb); 823 t3_push_frames(so, 1); 824 return (0); 825} 826 827static int 828cxgb_toe_rcvd(struct tcpcb *tp) 829{ 830 831 inp_lock_assert(tp->t_inpcb); 832 833 t3_cleanup_rbuf(tp, 0); 834 835 return (0); 836} 837 838static void 839cxgb_toe_detach(struct tcpcb *tp) 840{ 841 struct toepcb *toep; 842 843 /* 844 * XXX how do we handle teardown in the SYN_SENT state? 
845 * 846 */ 847 inp_lock_assert(tp->t_inpcb); 848 toep = tp->t_toe; 849 toep->tp_tp = NULL; 850 851 /* 852 * unhook from socket 853 */ 854 tp->t_flags &= ~TF_TOE; 855 tp->t_toe = NULL; 856} 857 858 859static struct toe_usrreqs cxgb_toe_usrreqs = { 860 .tu_disconnect = cxgb_toe_disconnect, 861 .tu_reset = cxgb_toe_reset, 862 .tu_send = cxgb_toe_send, 863 .tu_rcvd = cxgb_toe_rcvd, 864 .tu_detach = cxgb_toe_detach, 865 .tu_detach = cxgb_toe_detach, 866 .tu_syncache_event = handle_syncache_event, 867}; 868 869 870static void 871__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 872 uint64_t mask, uint64_t val, int no_reply) 873{ 874 struct cpl_set_tcb_field *req; 875 876 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 877 toep->tp_tid, word, mask, val); 878 879 req = mtod(m, struct cpl_set_tcb_field *); 880 m->m_pkthdr.len = m->m_len = sizeof(*req); 881 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 882 req->wr.wr_lo = 0; 883 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 884 req->reply = V_NO_REPLY(no_reply); 885 req->cpu_idx = 0; 886 req->word = htons(word); 887 req->mask = htobe64(mask); 888 req->val = htobe64(val); 889 890 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 891 send_or_defer(toep, m, 0); 892} 893 894static void 895t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 896{ 897 struct mbuf *m; 898 struct tcpcb *tp = toep->tp_tp; 899 900 if (toep == NULL) 901 return; 902 903 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 904 printf("not seting field\n"); 905 return; 906 } 907 908 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 909 910 __set_tcb_field(toep, m, word, mask, val, 1); 911} 912 913/* 914 * Set one of the t_flags bits in the TCB. 
915 */ 916static void 917set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) 918{ 919 920 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 921} 922 923/* 924 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 925 */ 926static void 927t3_set_nagle(struct toepcb *toep) 928{ 929 struct tcpcb *tp = toep->tp_tp; 930 931 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 932} 933 934/* 935 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 936 */ 937void 938t3_set_keepalive(struct toepcb *toep, int on_off) 939{ 940 941 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); 942} 943 944void 945t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) 946{ 947 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); 948} 949 950void 951t3_set_dack_mss(struct toepcb *toep, int on_off) 952{ 953 954 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); 955} 956 957/* 958 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 959 */ 960static void 961t3_set_tos(struct toepcb *toep) 962{ 963 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); 964 965 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 966 V_TCB_TOS(tos)); 967} 968 969 970/* 971 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 972 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 973 * set the PSH bit in the last segment, which would trigger delivery.] 974 * We work around the issue by setting a DDP buffer in a partial placed state, 975 * which guarantees that TP will schedule a timer. 
976 */ 977#define TP_DDP_TIMER_WORKAROUND_MASK\ 978 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ 979 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ 980 V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) 981#define TP_DDP_TIMER_WORKAROUND_VAL\ 982 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ 983 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ 984 32)) 985 986static void 987t3_enable_ddp(struct toepcb *toep, int on) 988{ 989 if (on) { 990 991 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), 992 V_TF_DDP_OFF(0)); 993 } else 994 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, 995 V_TF_DDP_OFF(1) | 996 TP_DDP_TIMER_WORKAROUND_MASK, 997 V_TF_DDP_OFF(1) | 998 TP_DDP_TIMER_WORKAROUND_VAL); 999 1000} 1001 1002void 1003t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color) 1004{ 1005 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx, 1006 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), 1007 tag_color); 1008} 1009 1010void 1011t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, 1012 unsigned int len) 1013{ 1014 if (buf_idx == 0) 1015 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET, 1016 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 1017 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 1018 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | 1019 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 1020 else 1021 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET, 1022 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 1023 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), 1024 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | 1025 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); 1026} 1027 1028static int 1029t3_set_cong_control(struct socket *so, const char *name) 1030{ 1031#ifdef CONGESTION_CONTROL_SUPPORTED 1032 int cong_algo; 1033 1034 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) 1035 if (!strcmp(name, t3_cong_ops[cong_algo].name)) 1036 break; 1037 1038 if (cong_algo >= 
ARRAY_SIZE(t3_cong_ops)) 1039 return -EINVAL; 1040#endif 1041 return 0; 1042} 1043 1044int 1045t3_get_tcb(struct toepcb *toep) 1046{ 1047 struct cpl_get_tcb *req; 1048 struct tcpcb *tp = toep->tp_tp; 1049 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); 1050 1051 if (!m) 1052 return (ENOMEM); 1053 1054 inp_lock_assert(tp->t_inpcb); 1055 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 1056 req = mtod(m, struct cpl_get_tcb *); 1057 m->m_pkthdr.len = m->m_len = sizeof(*req); 1058 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 1059 req->wr.wr_lo = 0; 1060 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); 1061 req->cpuno = htons(toep->tp_qset); 1062 req->rsvd = 0; 1063 if (tp->t_state == TCPS_SYN_SENT) 1064 mbufq_tail(&toep->out_of_order_queue, m); // defer 1065 else 1066 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 1067 return 0; 1068} 1069 1070static inline void 1071so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid) 1072{ 1073 1074 toepcb_hold(toep); 1075 1076 cxgb_insert_tid(d->cdev, d->client, toep, tid); 1077} 1078 1079/** 1080 * find_best_mtu - find the entry in the MTU table closest to an MTU 1081 * @d: TOM state 1082 * @mtu: the target MTU 1083 * 1084 * Returns the index of the value in the MTU table that is closest to but 1085 * does not exceed the target MTU. 
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	/* The MTU table is sorted ascending; walk to the last entry <= mtu. */
	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

/*
 * Pick the MTU-table index to use for a connection given the path MTU, and
 * (when tp is non-NULL) clamp tp->t_maxseg to the chosen table entry.
 * The constant 40 is the combined IPv4 + TCP header size subtracted from
 * an MTU to get the MSS.
 */
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		/* Clamp MSS to [smallest table MTU - 40, pmtu - 40]. */
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		/* Re-derive the MSS from the table entry actually chosen. */
		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

/*
 * Free an active-open TID and drop the toepcb reference it held.
 */
static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	/* Nothing to release if the connection was never offloaded. */
	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Drop any work requests still awaiting completion. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* Detach the toepcb from the tcpcb before tearing down TOE state. */
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		/* so_sorwakeup_locked() drops the sockbuf lock. */
		so_sorwakeup_locked(so);

	}

	/*
	 * In SYN_SENT we only hold an active-open TID (atid); otherwise a
	 * full TID was assigned and carries a toepcb reference to drop.
	 */
	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Hook the TOE socket operations and user-request vector into the socket
 * so subsequent socket calls are routed through the offload path.
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	/*
	 * NOTE(review): `so' is not declared in this function, so this can
	 * only compile when INIT_VNET_INET expands to nothing (i.e. without
	 * options VIMAGE) -- confirm and plumb a socket/vnet argument through
	 * if VIMAGE support is required here.
	 */
	INIT_VNET_INET(so->so_vnet);
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	/* RFC 1323 window scaling: shift until the window fits in 16 bits. */
	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	INIT_VNET_INET(so->so_vnet);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	/* With autosizing enabled use the autosizing ceiling, not sb_hiwat. */
	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;



	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accomodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
	    (uint32_t)d->rx_page_size * 23 :
	    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link the toepcb and the tcpcb and record the TOE device. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	/* Work-request credits start out full. */
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/*
	 * DDP is only enabled when tuned on, not disabled per-socket via
	 * SO_NO_DDP, and the receive window is large enough to be useful.
	 */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	/* Receive buffer size is expressed to the hardware in KB units. */
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

/*
 * Calculate the option 2 value: congestion-control flavor selection, valid
 * only when the cong_alg tunable has been set (!= -1).
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/* Debug helper: sum outstanding work-request credits on the WR queue. */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Build a CPL_ACT_OPEN_REQ in the supplied mbuf for an active open with
 * the given active-open TID and L2T entry.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down a failed active open: release offload resources and drop the
 * connection with the given errno.  Called with the inpcb write lock held;
 * tcp_offload_drop() is reached after the lock is released.
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	/* Queue the TID for deferred release when the hardware assigned one. */
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
1543 */ 1544int 1545t3_connect(struct toedev *tdev, struct socket *so, 1546 struct rtentry *rt, struct sockaddr *nam) 1547{ 1548 struct mbuf *m; 1549 struct l2t_entry *e; 1550 struct tom_data *d = TOM_DATA(tdev); 1551 struct inpcb *inp = so_sotoinpcb(so); 1552 struct tcpcb *tp = intotcpcb(inp); 1553 struct toepcb *toep; /* allocated by init_offload_socket */ 1554 1555 int atid; 1556 1557 toep = toepcb_alloc(); 1558 if (toep == NULL) 1559 goto out_err; 1560 1561 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1562 goto out_err; 1563 1564 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1565 if (!e) 1566 goto free_tid; 1567 1568 inp_lock_assert(inp); 1569 m = m_gethdr(MT_DATA, M_WAITOK); 1570 1571#if 0 1572 m->m_toe.mt_toepcb = tp->t_toe; 1573 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1574#endif 1575 so_lock(so); 1576 1577 init_offload_socket(so, tdev, atid, e, rt, toep); 1578 1579 install_offload_ops(so); 1580 1581 mk_act_open_req(so, m, atid, e); 1582 so_unlock(so); 1583 1584 soisconnecting(so); 1585 toep = tp->t_toe; 1586 m_set_toep(m, tp->t_toe); 1587 1588 toep->tp_state = TCPS_SYN_SENT; 1589 l2t_send(d->cdev, (struct mbuf *)m, e); 1590 1591 if (toep->tp_ulp_mode) 1592 t3_enable_ddp(toep, 0); 1593 return (0); 1594 1595free_tid: 1596 printf("failing connect - free atid\n"); 1597 1598 free_atid(d->cdev, atid); 1599out_err: 1600 printf("return ENOMEM\n"); 1601 return (ENOMEM); 1602} 1603 1604/* 1605 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1606 * not send multiple ABORT_REQs for the same connection and also that we do 1607 * not try to send a message after the connection has closed. Returns 1 if 1608 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1609 */ 1610static void 1611t3_send_reset(struct toepcb *toep) 1612{ 1613 1614 struct cpl_abort_req *req; 1615 unsigned int tid = toep->tp_tid; 1616 int mode = CPL_ABORT_SEND_RST; 1617 struct tcpcb *tp = toep->tp_tp; 1618 struct toedev *tdev = toep->tp_toedev; 1619 struct socket *so = NULL; 1620 struct mbuf *m; 1621 struct sockbuf *snd; 1622 1623 if (tp) { 1624 inp_lock_assert(tp->t_inpcb); 1625 so = inp_inpcbtosocket(tp->t_inpcb); 1626 } 1627 1628 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1629 tdev == NULL)) 1630 return; 1631 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1632 1633 snd = so_sockbuf_snd(so); 1634 /* Purge the send queue so we don't send anything after an abort. */ 1635 if (so) 1636 sbflush(snd); 1637 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1638 mode |= CPL_ABORT_POST_CLOSE_REQ; 1639 1640 m = m_gethdr_nofail(sizeof(*req)); 1641 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1642 set_arp_failure_handler(m, abort_arp_failure); 1643 1644 req = mtod(m, struct cpl_abort_req *); 1645 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1646 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1647 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1648 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1649 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1650 req->cmd = mode; 1651 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1652 mbufq_tail(&toep->out_of_order_queue, m); // defer 1653 else 1654 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1655} 1656 1657static int 1658t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1659{ 1660 struct inpcb *inp; 1661 int error, optval; 1662 1663 if (sopt->sopt_name == IP_OPTIONS) 1664 return (ENOPROTOOPT); 1665 1666 if (sopt->sopt_name != IP_TOS) 1667 return (EOPNOTSUPP); 1668 1669 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1670 1671 if (error) 1672 return (error); 1673 1674 if (optval > IPTOS_PREC_CRITIC_ECP) 1675 return (EINVAL); 1676 1677 inp = so_sotoinpcb(so); 1678 inp_wlock(inp); 1679 inp_ip_tos_set(inp, optval); 1680#if 0 1681 inp->inp_ip_tos = optval; 1682#endif 1683 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1684 inp_wunlock(inp); 1685 1686 return (0); 1687} 1688 1689static int 1690t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1691{ 1692 int err = 0; 1693 size_t copied; 1694 1695 if (sopt->sopt_name != TCP_CONGESTION && 1696 sopt->sopt_name != TCP_NODELAY) 1697 return (EOPNOTSUPP); 1698 1699 if (sopt->sopt_name == TCP_CONGESTION) { 1700 char name[TCP_CA_NAME_MAX]; 1701 int optlen = sopt->sopt_valsize; 1702 struct tcpcb *tp; 1703 1704 if (sopt->sopt_dir == SOPT_GET) { 1705 KASSERT(0, ("unimplemented")); 1706 return (EOPNOTSUPP); 1707 } 1708 1709 if (optlen < 1) 1710 return (EINVAL); 1711 1712 err = copyinstr(sopt->sopt_val, name, 1713 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1714 if (err) 1715 return (err); 1716 if (copied < 1) 1717 return (EINVAL); 1718 1719 tp = so_sototcpcb(so); 1720 /* 1721 * XXX I need to revisit this 1722 */ 1723 if ((err = t3_set_cong_control(so, name)) == 0) { 1724#ifdef CONGESTION_CONTROL_SUPPORTED 1725 tp->t_cong_control = strdup(name, M_CXGB); 1726#endif 1727 } else 1728 return (err); 1729 } else { 1730 int optval, oldval; 1731 
struct inpcb *inp; 1732 struct tcpcb *tp; 1733 1734 if (sopt->sopt_dir == SOPT_GET) 1735 return (EOPNOTSUPP); 1736 1737 err = sooptcopyin(sopt, &optval, sizeof optval, 1738 sizeof optval); 1739 1740 if (err) 1741 return (err); 1742 1743 inp = so_sotoinpcb(so); 1744 inp_wlock(inp); 1745 tp = inp_inpcbtotcpcb(inp); 1746 1747 oldval = tp->t_flags; 1748 if (optval) 1749 tp->t_flags |= TF_NODELAY; 1750 else 1751 tp->t_flags &= ~TF_NODELAY; 1752 inp_wunlock(inp); 1753 1754 1755 if (oldval != tp->t_flags && (tp->t_toe != NULL)) 1756 t3_set_nagle(tp->t_toe); 1757 1758 } 1759 1760 return (0); 1761} 1762 1763int 1764t3_ctloutput(struct socket *so, struct sockopt *sopt) 1765{ 1766 int err; 1767 1768 if (sopt->sopt_level != IPPROTO_TCP) 1769 err = t3_ip_ctloutput(so, sopt); 1770 else 1771 err = t3_tcp_ctloutput(so, sopt); 1772 1773 if (err != EOPNOTSUPP) 1774 return (err); 1775 1776 return (tcp_ctloutput(so, sopt)); 1777} 1778 1779/* 1780 * Returns true if we need to explicitly request RST when we receive new data 1781 * on an RX-closed connection. 1782 */ 1783static inline int 1784need_rst_on_excess_rx(const struct toepcb *toep) 1785{ 1786 return (1); 1787} 1788 1789/* 1790 * Handles Rx data that arrives in a state where the socket isn't accepting 1791 * new data. 1792 */ 1793static void 1794handle_excess_rx(struct toepcb *toep, struct mbuf *m) 1795{ 1796 1797 if (need_rst_on_excess_rx(toep) && 1798 !(toep->tp_flags & TP_ABORT_SHUTDOWN)) 1799 t3_send_reset(toep); 1800 m_freem(m); 1801} 1802 1803/* 1804 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) 1805 * by getting the DDP offset from the TCB. 
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is a possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		/* so_sorwakeup_locked() drops the sockbuf lock. */
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	/*
	 * Extract the DDP offset of the current hardware buffer from the
	 * returned TCB snapshot; the two buffers live at different word/bit
	 * positions within the TCB.
	 */
	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	/* The delta since the last known offset is the newly placed data. */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		 q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	/*
	 * NOTE(review): this path enters handle_excess_rx() while the
	 * sockbuf lock is still held -- verify the lock is not leaked here.
	 */
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Tag the mbuf as DDP-placed data and advance the receive sequence. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
	    m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

/*
 * Account for data that arrived via a CPL_RX_DATA on a connection in DDP
 * mode: the payload was already placed directly in the current DDP buffer,
 * so only the book-keeping (offsets, flags, rcv_nxt) is updated here.
 * Caller holds the inpcb lock.
 */
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);
	struct sockbuf *rcv;

	/* No new data beyond what we have already accounted for. */
	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_lock_assert(tp->t_inpcb);
	so  = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
		rcv_nxt, tp->rcv_nxt));
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of  DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	sockbuf_unlock(rcv);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct sockbuf *rcv;
	int state;
	int len = be16toh(hdr->len);

	inp_wlock(tp->t_inpcb);

	so  = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		TRACE_EXIT;
		return;
	}

	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
		handle_ddp_data(toep, m);

	m->m_seq = ntohl(hdr->seq);
	m->m_ulp_mode = 0;                    /* for iSCSI */

#if VALIDATE_SEQ
	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
		log(LOG_ERR,
		    "%s: TID %u: Bad sequence number %u, expected %u\n",
		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
		    tp->rcv_nxt);
		m_freem(m);
		inp_wunlock(tp->t_inpcb);
		return;
	}
#endif
	/* Strip the CPL header; only payload goes into the socket buffer. */
	m_adj(m, sizeof(*hdr));

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		    tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
		    tp->rcv_nxt];
#endif
	/* Track delayed-ACK mode changes reported by the hardware. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
	CTR2(KTR_TOM,
	    "new_rx_data: seq 0x%x len %u",
	    m->m_seq, m->m_pkthdr.len);
	inp_wunlock(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
#if 0
	if (sb_notify(rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
#endif
	SBAPPEND(rcv, m);

#ifdef notyet
	/*
	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
	 *
	 */
	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, rcv->sb_cc, rcv->sb_mbmax));
#endif


	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
	    rcv->sb_cc, rcv->sb_mbcnt);

	state = so_state_get(so);
	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process data that was placed directly into a DDP buffer, reported via a
 * CPL_RX_DATA_DDP.  The mbuf itself carries no payload; its m_len /
 * m_cur_offset fields are repurposed to describe the placed region.
 */
static void
new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	struct socket *so;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
	int nomoredata = 0;
	unsigned int delack_mode;
	struct sockbuf *rcv;

	tp = toep->tp_tp;
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {

		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		return;
	}

	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	CTR4(KTR_TOM,
	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
	    "hdr seq 0x%x len %u",
	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
	    ntohs(hdr->len));
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	tp->t_rcvtime = ticks;
	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
	inp_wunlock(tp->t_inpcb);
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
	    m->m_len, rcv_nxt, m->m_seq);

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
	m->m_cur_offset = end_offset - m->m_pkthdr.len;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	bsp->cur_offset = end_offset;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;

	/*
	 * Length is only meaningful for kbuf
	 */
	if (!(bsp->flags & DDP_BF_NOCOPY))
		KASSERT(m->m_len <= bsp->gl->dgl_length,
		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
			m->m_len, bsp->gl->dgl_length));

	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
		panic("spurious ddp completion");
	} else {
		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_ddp_flags |= DDP_BF_PSH;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
#endif
	SBAPPEND(rcv, m);

	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/* Union of all DDP error flags reported in the ddpvld_status field. */
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(toep, m);
	return (0);
}

/*
 * Process a CPL_RX_DDP_COMPLETE: a DDP buffer completed.  Compute how much
 * data the completion covers from the reported offset, update the buffer
 * state, and append a zero-copy mbuf describing the region.
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when, delack_mode;
	int nomoredata = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		struct inpcb *inp = so_sotoinpcb(so);

		handle_excess_rx(toep, m);
		inp_wunlock(inp);
		return;
	}
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	m->m_pkthdr.csum_data = tp->rcv_nxt;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[buf_idx];
	when = bsp->cur_offset;
	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	tp->rcv_nxt += m->m_len;
	tp->t_rcvtime = ticks;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	inp_wunlock(tp->t_inpcb);

	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR5(KTR_TOM,
	    "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
	    "ddp_report 0x%x offset %u, len %u",
	    tp->rcv_nxt, bsp->cur_offset, ddp_report,
	    G_DDP_OFFSET(ddp_report), m->m_len);

	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP)) {
		q->cur_buf ^= 1;                     /* flip buffers */
		/* A short completion means the buffer was invalidated early. */
		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata=1;
	}

	CTR4(KTR_TOM,
	    "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
	    "ddp_report %u offset %u",
	    tp->rcv_nxt, bsp->cur_offset, ddp_report,
	    G_DDP_OFFSET(ddp_report));

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

	SBAPPEND(rcv, m);
	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if ((so_state_get(so) & SS_NOFDREF) == 0)
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.
We don't do this at the time we 2431 * process peer_close because we don't want to carry the peer FIN in 2432 * the socket's receive queue and if we increment rcv_nxt without 2433 * having the FIN in the receive queue we'll confuse facilities such 2434 * as SIOCINQ. 2435 */ 2436 inp_wlock(tp->t_inpcb); 2437 tp->rcv_nxt++; 2438 2439 tp->ts_recent_age = 0; /* defeat recycling */ 2440 tp->t_srtt = 0; /* defeat tcp_update_metrics */ 2441 inp_wunlock(tp->t_inpcb); 2442 tcp_offload_twstart(tp); 2443} 2444 2445/* 2446 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This 2447 * function deals with the data that may be reported along with the FIN. 2448 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to 2449 * perform normal FIN-related processing. In the latter case 1 indicates that 2450 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the 2451 * skb can be freed. 2452 */ 2453static int 2454handle_peer_close_data(struct socket *so, struct mbuf *m) 2455{ 2456 struct tcpcb *tp = so_sototcpcb(so); 2457 struct toepcb *toep = tp->t_toe; 2458 struct ddp_state *q; 2459 struct ddp_buf_state *bsp; 2460 struct cpl_peer_close *req = cplhdr(m); 2461 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ 2462 struct sockbuf *rcv; 2463 2464 if (tp->rcv_nxt == rcv_nxt) /* no data */ 2465 return (0); 2466 2467 CTR0(KTR_TOM, "handle_peer_close_data"); 2468 if (__predict_false(so_no_receive(so))) { 2469 handle_excess_rx(toep, m); 2470 2471 /* 2472 * Although we discard the data we want to process the FIN so 2473 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + 2474 * PEER_CLOSE without data. In particular this PEER_CLOSE 2475 * may be what will close the connection. We return 1 because 2476 * handle_excess_rx() already freed the packet. 
2477 */ 2478 return (1); 2479 } 2480 2481 inp_lock_assert(tp->t_inpcb); 2482 q = &toep->tp_ddp_state; 2483 rcv = so_sockbuf_rcv(so); 2484 sockbuf_lock(rcv); 2485 2486 bsp = &q->buf_state[q->cur_buf]; 2487 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; 2488 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2489 m->m_ddp_gl = (unsigned char *)bsp->gl; 2490 m->m_flags |= M_DDP; 2491 m->m_cur_offset = bsp->cur_offset; 2492 m->m_ddp_flags = 2493 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; 2494 m->m_seq = tp->rcv_nxt; 2495 tp->rcv_nxt = rcv_nxt; 2496 bsp->cur_offset += m->m_pkthdr.len; 2497 if (!(bsp->flags & DDP_BF_NOFLIP)) 2498 q->cur_buf ^= 1; 2499#ifdef notyet 2500 skb_reset_transport_header(skb); 2501 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ 2502#endif 2503 tp->t_rcvtime = ticks; 2504 SBAPPEND(rcv, m); 2505 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 2506 so_sorwakeup_locked(so); 2507 else 2508 sockbuf_unlock(rcv); 2509 2510 return (1); 2511} 2512 2513/* 2514 * Handle a peer FIN. 2515 */ 2516static void 2517do_peer_fin(struct toepcb *toep, struct mbuf *m) 2518{ 2519 struct socket *so; 2520 struct tcpcb *tp = toep->tp_tp; 2521 int keep, action; 2522 2523 action = keep = 0; 2524 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); 2525 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { 2526 printf("abort_pending set\n"); 2527 2528 goto out; 2529 } 2530 inp_wlock(tp->t_inpcb); 2531 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 2532 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { 2533 keep = handle_peer_close_data(so, m); 2534 if (keep < 0) { 2535 inp_wunlock(tp->t_inpcb); 2536 return; 2537 } 2538 } 2539 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2540 CTR1(KTR_TOM, 2541 "waking up waiters for cantrcvmore on %p ", so); 2542 socantrcvmore(so); 2543 2544 /* 2545 * If connection is half-synchronized 2546 * (ie NEEDSYN flag on) then delay ACK, 2547 * so it may be piggybacked when SYN is sent. 
2548 * Otherwise, since we received a FIN then no 2549 * more input can be expected, send ACK now. 2550 */ 2551 if (tp->t_flags & TF_NEEDSYN) 2552 tp->t_flags |= TF_DELACK; 2553 else 2554 tp->t_flags |= TF_ACKNOW; 2555 tp->rcv_nxt++; 2556 } 2557 2558 switch (tp->t_state) { 2559 case TCPS_SYN_RECEIVED: 2560 tp->t_starttime = ticks; 2561 /* FALLTHROUGH */ 2562 case TCPS_ESTABLISHED: 2563 tp->t_state = TCPS_CLOSE_WAIT; 2564 break; 2565 case TCPS_FIN_WAIT_1: 2566 tp->t_state = TCPS_CLOSING; 2567 break; 2568 case TCPS_FIN_WAIT_2: 2569 /* 2570 * If we've sent an abort_req we must have sent it too late, 2571 * HW will send us a reply telling us so, and this peer_close 2572 * is really the last message for this connection and needs to 2573 * be treated as an abort_rpl, i.e., transition the connection 2574 * to TCP_CLOSE (note that the host stack does this at the 2575 * time of generating the RST but we must wait for HW). 2576 * Otherwise we enter TIME_WAIT. 2577 */ 2578 t3_release_offload_resources(toep); 2579 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2580 action = TCP_CLOSE; 2581 } else { 2582 action = TCP_TIMEWAIT; 2583 } 2584 break; 2585 default: 2586 log(LOG_ERR, 2587 "%s: TID %u received PEER_CLOSE in bad state %d\n", 2588 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); 2589 } 2590 inp_wunlock(tp->t_inpcb); 2591 2592 if (action == TCP_TIMEWAIT) { 2593 enter_timewait(tp); 2594 } else if (action == TCP_DROP) { 2595 tcp_offload_drop(tp, 0); 2596 } else if (action == TCP_CLOSE) { 2597 tcp_offload_close(tp); 2598 } 2599 2600#ifdef notyet 2601 /* Do not send POLL_HUP for half duplex close. */ 2602 if ((sk->sk_shutdown & SEND_SHUTDOWN) || 2603 sk->sk_state == TCP_CLOSE) 2604 sk_wake_async(so, 1, POLL_HUP); 2605 else 2606 sk_wake_async(so, 1, POLL_IN); 2607#endif 2608 2609out: 2610 if (!keep) 2611 m_free(m); 2612} 2613 2614/* 2615 * Handler for PEER_CLOSE CPL messages. 
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	/* do_peer_fin() consumes m (or keeps it when DDP data rides along). */
	do_peer_fin(toep, m);
	return (0);
}

/*
 * Process a CLOSE_CON_RPL, the hardware's reply to the close request we
 * sent.  The reply's snd_nxt covers our FIN, so snd_una is set to exclude
 * it.  Performs the state transition the host stack would do on seeing
 * the ACK of its FIN.  Always consumes m.
 */
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int action = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;	/* exclude FIN */

	/* An abort is already in flight; it will finish the teardown. */
	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		inp_wunlock(tp->t_inpcb);
		goto out;
	}

	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));

	switch (tp->t_state) {
	case TCPS_CLOSING:		/* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;

		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		action = TCP_CLOSE;
		break;
	case TCPS_FIN_WAIT_1:
		/*
		 * If we can't receive any more
		 * data, then closing user can proceed.
		 * Starting the timer is contrary to the
		 * specification, but if we don't get a FIN
		 * we'll hang forever.
		 *
		 * XXXjl:
		 * we should release the tp also, and use a
		 * compressed state.
2680 */ 2681 if (so) 2682 rcv = so_sockbuf_rcv(so); 2683 else 2684 break; 2685 2686 if (rcv->sb_state & SBS_CANTRCVMORE) { 2687 int timeout; 2688 2689 if (so) 2690 soisdisconnected(so); 2691 timeout = (tcp_fast_finwait2_recycle) ? 2692 tcp_finwait2_timeout : tcp_maxidle; 2693 tcp_timer_activate(tp, TT_2MSL, timeout); 2694 } 2695 tp->t_state = TCPS_FIN_WAIT_2; 2696 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && 2697 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { 2698 action = TCP_DROP; 2699 } 2700 2701 break; 2702 default: 2703 log(LOG_ERR, 2704 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", 2705 toep->tp_toedev->tod_name, toep->tp_tid, 2706 tp->t_state); 2707 } 2708 inp_wunlock(tp->t_inpcb); 2709 2710 2711 if (action == TCP_TIMEWAIT) { 2712 enter_timewait(tp); 2713 } else if (action == TCP_DROP) { 2714 tcp_offload_drop(tp, 0); 2715 } else if (action == TCP_CLOSE) { 2716 tcp_offload_close(tp); 2717 } 2718out: 2719 m_freem(m); 2720} 2721 2722/* 2723 * Handler for CLOSE_CON_RPL CPL messages. 2724 */ 2725static int 2726do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, 2727 void *ctx) 2728{ 2729 struct toepcb *toep = (struct toepcb *)ctx; 2730 2731 process_close_con_rpl(toep, m); 2732 return (0); 2733} 2734 2735/* 2736 * Process abort replies. We only process these messages if we anticipate 2737 * them as the coordination between SW and HW in this area is somewhat lacking 2738 * and sometimes we get ABORT_RPLs after we are done with the connection that 2739 * originated the ABORT_REQ. 
 */
/*
 * Process an ABORT_RPL for a connection on which we have an outstanding
 * ABORT_REQ (TP_ABORT_RPL_PENDING).  The reply may arrive in two parts
 * (see the is_t3a() checks below); only the final arrival tears the
 * connection down.  Always consumes m.
 */
static void
process_abort_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int needclose = 0;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(sk),
		  "process_abort_rpl: GTS rpl pending %d",
		  sock_flag(sk, ABORT_RPL_PENDING));
#endif

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		/*
		 * XXX panic on tcpdrop
		 */
		/*
		 * On non-T3A hardware the first reply only records
		 * TP_ABORT_RPL_RCVD; the teardown happens when the second
		 * one arrives.
		 */
		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
			toep->tp_flags |= TP_ABORT_RPL_RCVD;
		else {
			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
			    !is_t3a(toep->tp_toedev)) {
				/* sanity check on the condition just tested */
				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
					panic("TP_ABORT_REQ_RCVD set");
				/*
				 * Release HW state now; the close itself is
				 * deferred until the inp lock is dropped.
				 */
				t3_release_offload_resources(toep);
				needclose = 1;
			}
		}
	}
	inp_wunlock(tp->t_inpcb);

	if (needclose)
		tcp_offload_close(tp);

	m_free(m);
}

/*
 * Handle an ABORT_RPL_RSS CPL message.
 */
static int
do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
	struct toepcb *toep;

	/*
	 * Ignore replies to post-close aborts indicating that the abort was
	 * requested too late.  These connections are terminated when we get
	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
	 * arrives the TID is either no longer used or it has been recycled.
2796 */ 2797 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2798discard: 2799 m_free(m); 2800 return (0); 2801 } 2802 2803 toep = (struct toepcb *)ctx; 2804 2805 /* 2806 * Sometimes we've already closed the socket, e.g., a post-close 2807 * abort races with ABORT_REQ_RSS, the latter frees the socket 2808 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2809 * but FW turns the ABORT_REQ into a regular one and so we get 2810 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2811 */ 2812 if (!toep) 2813 goto discard; 2814 2815 if (toep->tp_tp == NULL) { 2816 log(LOG_NOTICE, "removing tid for abort\n"); 2817 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2818 if (toep->tp_l2t) 2819 l2t_release(L2DATA(cdev), toep->tp_l2t); 2820 2821 toepcb_release(toep); 2822 goto discard; 2823 } 2824 2825 log(LOG_NOTICE, "toep=%p\n", toep); 2826 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); 2827 2828 toepcb_hold(toep); 2829 process_abort_rpl(toep, m); 2830 toepcb_release(toep); 2831 return (0); 2832} 2833 2834/* 2835 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also 2836 * indicate whether RST should be sent in response. 2837 */ 2838static int 2839abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2840{ 2841 struct tcpcb *tp = so_sototcpcb(so); 2842 2843 switch (abort_reason) { 2844 case CPL_ERR_BAD_SYN: 2845#if 0 2846 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2847#endif 2848 case CPL_ERR_CONN_RESET: 2849 // XXX need to handle SYN_RECV due to crossed SYNs 2850 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2851 case CPL_ERR_XMIT_TIMEDOUT: 2852 case CPL_ERR_PERSIST_TIMEDOUT: 2853 case CPL_ERR_FINWAIT2_TIMEDOUT: 2854 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2855#if 0 2856 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2857#endif 2858 return (ETIMEDOUT); 2859 default: 2860 return (EIO); 2861 } 2862} 2863 2864static inline void 2865set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2866{ 2867 struct cpl_abort_rpl *rpl = cplhdr(m); 2868 2869 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2870 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2871 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2872 2873 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2874 rpl->cmd = cmd; 2875} 2876 2877static void 2878send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2879{ 2880 struct mbuf *reply_mbuf; 2881 struct cpl_abort_req_rss *req = cplhdr(m); 2882 2883 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2884 m_set_priority(m, CPL_PRIORITY_DATA); 2885 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2886 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2887 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2888 m_free(m); 2889} 2890 2891/* 2892 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2893 */ 2894static inline int 2895is_neg_adv_abort(unsigned int status) 2896{ 2897 return status == CPL_ERR_RTX_NEG_ADVICE || 2898 status == CPL_ERR_PERSIST_NEG_ADVICE; 2899} 2900 2901static void 2902send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2903{ 2904 struct mbuf *reply_mbuf; 2905 struct cpl_abort_req_rss *req = cplhdr(m); 2906 2907 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2908 2909 if (!reply_mbuf) { 2910 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2911 req->status = rst_status; 2912 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2913 return; 2914 } 2915 2916 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2917 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2918 m_free(m); 2919 2920 /* 2921 * XXX need to sync with ARP as for SYN_RECV connections we can send 2922 * these messages while ARP is pending. For other connection states 2923 * it's not a problem. 2924 */ 2925 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2926} 2927 2928#ifdef notyet 2929static void 2930cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2931{ 2932 CXGB_UNIMPLEMENTED(); 2933#ifdef notyet 2934 struct request_sock *req = child->sk_user_data; 2935 2936 inet_csk_reqsk_queue_removed(parent, req); 2937 synq_remove(tcp_sk(child)); 2938 __reqsk_free(req); 2939 child->sk_user_data = NULL; 2940#endif 2941} 2942 2943 2944/* 2945 * Performs the actual work to abort a SYN_RECV connection. 2946 */ 2947static void 2948do_abort_syn_rcv(struct socket *child, struct socket *parent) 2949{ 2950 struct tcpcb *parenttp = so_sototcpcb(parent); 2951 struct tcpcb *childtp = so_sototcpcb(child); 2952 2953 /* 2954 * If the server is still open we clean up the child connection, 2955 * otherwise the server already did the clean up as it was purging 2956 * its SYN queue and the skb was just sitting in its backlog. 2957 */ 2958 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2959 cleanup_syn_rcv_conn(child, parent); 2960 inp_wlock(childtp->t_inpcb); 2961 t3_release_offload_resources(childtp->t_toe); 2962 inp_wunlock(childtp->t_inpcb); 2963 tcp_offload_close(childtp); 2964 } 2965} 2966#endif 2967 2968/* 2969 * Handle abort requests for a SYN_RECV connection. These need extra work 2970 * because the socket is on its parent's SYN queue. 
2971 */ 2972static int 2973abort_syn_rcv(struct socket *so, struct mbuf *m) 2974{ 2975 CXGB_UNIMPLEMENTED(); 2976#ifdef notyet 2977 struct socket *parent; 2978 struct toedev *tdev = toep->tp_toedev; 2979 struct t3cdev *cdev = TOM_DATA(tdev)->cdev; 2980 struct socket *oreq = so->so_incomp; 2981 struct t3c_tid_entry *t3c_stid; 2982 struct tid_info *t; 2983 2984 if (!oreq) 2985 return -1; /* somehow we are not on the SYN queue */ 2986 2987 t = &(T3C_DATA(cdev))->tid_maps; 2988 t3c_stid = lookup_stid(t, oreq->ts_recent); 2989 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 2990 2991 so_lock(parent); 2992 do_abort_syn_rcv(so, parent); 2993 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); 2994 so_unlock(parent); 2995#endif 2996 return (0); 2997} 2998 2999/* 3000 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this 3001 * request except that we need to reply to it. 3002 */ 3003static void 3004process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) 3005{ 3006 int rst_status = CPL_ABORT_NO_RST; 3007 const struct cpl_abort_req_rss *req = cplhdr(m); 3008 struct tcpcb *tp = toep->tp_tp; 3009 struct socket *so; 3010 int needclose = 0; 3011 3012 inp_wlock(tp->t_inpcb); 3013 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 3014 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { 3015 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); 3016 m_free(m); 3017 goto skip; 3018 } 3019 3020 toep->tp_flags &= ~TP_ABORT_REQ_RCVD; 3021 /* 3022 * Three cases to consider: 3023 * a) We haven't sent an abort_req; close the connection. 3024 * b) We have sent a post-close abort_req that will get to TP too late 3025 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will 3026 * be ignored and the connection should be closed now. 3027 * c) We have sent a regular abort_req that will get to TP too late. 3028 * That will generate an abort_rpl with status 0, wait for it. 
3029 */ 3030 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || 3031 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { 3032 int error; 3033 3034 error = abort_status_to_errno(so, req->status, 3035 &rst_status); 3036 so_error_set(so, error); 3037 3038 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 3039 so_sorwakeup(so); 3040 /* 3041 * SYN_RECV needs special processing. If abort_syn_rcv() 3042 * returns 0 is has taken care of the abort. 3043 */ 3044 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) 3045 goto skip; 3046 3047 t3_release_offload_resources(toep); 3048 needclose = 1; 3049 } 3050 inp_wunlock(tp->t_inpcb); 3051 3052 if (needclose) 3053 tcp_offload_close(tp); 3054 3055 send_abort_rpl(m, tdev, rst_status); 3056 return; 3057skip: 3058 inp_wunlock(tp->t_inpcb); 3059} 3060 3061/* 3062 * Handle an ABORT_REQ_RSS CPL message. 3063 */ 3064static int 3065do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3066{ 3067 const struct cpl_abort_req_rss *req = cplhdr(m); 3068 struct toepcb *toep = (struct toepcb *)ctx; 3069 3070 if (is_neg_adv_abort(req->status)) { 3071 m_free(m); 3072 return (0); 3073 } 3074 3075 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); 3076 3077 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { 3078 cxgb_remove_tid(cdev, toep, toep->tp_tid); 3079 toep->tp_flags |= TP_ABORT_REQ_RCVD; 3080 3081 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); 3082 if (toep->tp_l2t) 3083 l2t_release(L2DATA(cdev), toep->tp_l2t); 3084 3085 /* 3086 * Unhook 3087 */ 3088 toep->tp_tp->t_toe = NULL; 3089 toep->tp_tp->t_flags &= ~TF_TOE; 3090 toep->tp_tp = NULL; 3091 /* 3092 * XXX need to call syncache_chkrst - but we don't 3093 * have a way of doing that yet 3094 */ 3095 toepcb_release(toep); 3096 log(LOG_ERR, "abort for unestablished connection :-(\n"); 3097 return (0); 3098 } 3099 if (toep->tp_tp == NULL) { 3100 log(LOG_NOTICE, "disconnected toepcb\n"); 3101 /* should be freed 
momentarily */ 3102 return (0); 3103 } 3104 3105 3106 toepcb_hold(toep); 3107 process_abort_req(toep, m, toep->tp_toedev); 3108 toepcb_release(toep); 3109 return (0); 3110} 3111#ifdef notyet 3112static void 3113pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) 3114{ 3115 struct toedev *tdev = TOE_DEV(parent); 3116 3117 do_abort_syn_rcv(child, parent); 3118 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { 3119 struct cpl_pass_accept_rpl *rpl = cplhdr(m); 3120 3121 rpl->opt0h = htonl(F_TCAM_BYPASS); 3122 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3123 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 3124 } else 3125 m_free(m); 3126} 3127#endif 3128static void 3129handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) 3130{ 3131 CXGB_UNIMPLEMENTED(); 3132 3133#ifdef notyet 3134 struct t3cdev *cdev; 3135 struct socket *parent; 3136 struct socket *oreq; 3137 struct t3c_tid_entry *t3c_stid; 3138 struct tid_info *t; 3139 struct tcpcb *otp, *tp = so_sototcpcb(so); 3140 struct toepcb *toep = tp->t_toe; 3141 3142 /* 3143 * If the connection is being aborted due to the parent listening 3144 * socket going away there's nothing to do, the ABORT_REQ will close 3145 * the connection. 3146 */ 3147 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 3148 m_free(m); 3149 return; 3150 } 3151 3152 oreq = so->so_incomp; 3153 otp = so_sototcpcb(oreq); 3154 3155 cdev = T3C_DEV(so); 3156 t = &(T3C_DATA(cdev))->tid_maps; 3157 t3c_stid = lookup_stid(t, otp->ts_recent); 3158 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 3159 3160 so_lock(parent); 3161 pass_open_abort(so, parent, m); 3162 so_unlock(parent); 3163#endif 3164} 3165 3166/* 3167 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly 3168 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV 3169 * connection. 
3170 */ 3171static void 3172pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 3173{ 3174 3175#ifdef notyet 3176 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3177 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 3178#endif 3179 handle_pass_open_arp_failure(m_get_socket(m), m); 3180} 3181 3182/* 3183 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 3184 */ 3185static void 3186mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 3187{ 3188 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 3189 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 3190 unsigned int tid = GET_TID(req); 3191 3192 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 3193 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3194 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3195 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 3196 rpl->opt0h = htonl(F_TCAM_BYPASS); 3197 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3198 rpl->opt2 = 0; 3199 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3200} 3201 3202/* 3203 * Send a deferred reject to an accept request. 
3204 */ 3205static void 3206reject_pass_request(struct toedev *tdev, struct mbuf *m) 3207{ 3208 struct mbuf *reply_mbuf; 3209 3210 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3211 mk_pass_accept_rpl(reply_mbuf, m); 3212 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3213 m_free(m); 3214} 3215 3216static void 3217handle_syncache_event(int event, void *arg) 3218{ 3219 struct toepcb *toep = arg; 3220 3221 switch (event) { 3222 case TOE_SC_ENTRY_PRESENT: 3223 /* 3224 * entry already exists - free toepcb 3225 * and l2t 3226 */ 3227 printf("syncache entry present\n"); 3228 toepcb_release(toep); 3229 break; 3230 case TOE_SC_DROP: 3231 /* 3232 * The syncache has given up on this entry 3233 * either it timed out, or it was evicted 3234 * we need to explicitly release the tid 3235 */ 3236 printf("syncache entry dropped\n"); 3237 toepcb_release(toep); 3238 break; 3239 default: 3240 log(LOG_ERR, "unknown syncache event %d\n", event); 3241 break; 3242 } 3243} 3244 3245static void 3246syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3247{ 3248 struct in_conninfo inc; 3249 struct tcpopt to; 3250 struct tcphdr th; 3251 struct inpcb *inp; 3252 int mss, wsf, sack, ts; 3253 uint32_t rcv_isn = ntohl(req->rcv_isn); 3254 3255 bzero(&to, sizeof(struct tcpopt)); 3256 inp = so_sotoinpcb(lso); 3257 3258 /* 3259 * Fill out information for entering us into the syncache 3260 */ 3261 bzero(&inc, sizeof(inc)); 3262 inc.inc_fport = th.th_sport = req->peer_port; 3263 inc.inc_lport = th.th_dport = req->local_port; 3264 th.th_seq = req->rcv_isn; 3265 th.th_flags = TH_SYN; 3266 3267 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3268 3269 3270 inc.inc_isipv6 = 0; 3271 inc.inc_len = 0; 3272 inc.inc_faddr.s_addr = req->peer_ip; 3273 inc.inc_laddr.s_addr = req->local_ip; 3274 3275 DPRINTF("syncache add of %d:%d %d:%d\n", 3276 ntohl(req->local_ip), ntohs(req->local_port), 3277 
ntohl(req->peer_ip), ntohs(req->peer_port)); 3278 3279 mss = req->tcp_options.mss; 3280 wsf = req->tcp_options.wsf; 3281 ts = req->tcp_options.tstamp; 3282 sack = req->tcp_options.sack; 3283 to.to_mss = mss; 3284 to.to_wscale = wsf; 3285 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3286 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 3287} 3288 3289 3290/* 3291 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 3292 * lock held. Note that the sock here is a listening socket that is not owned 3293 * by the TOE. 3294 */ 3295static void 3296process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 3297 struct listen_ctx *lctx) 3298{ 3299 int rt_flags; 3300 struct l2t_entry *e; 3301 struct iff_mac tim; 3302 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 3303 struct cpl_pass_accept_rpl *rpl; 3304 struct cpl_pass_accept_req *req = cplhdr(m); 3305 unsigned int tid = GET_TID(req); 3306 struct tom_data *d = TOM_DATA(tdev); 3307 struct t3cdev *cdev = d->cdev; 3308 struct tcpcb *tp = so_sototcpcb(so); 3309 struct toepcb *newtoep; 3310 struct rtentry *dst; 3311 struct sockaddr_in nam; 3312 struct t3c_data *td = T3C_DATA(cdev); 3313 3314 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3315 if (__predict_false(reply_mbuf == NULL)) { 3316 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3317 t3_defer_reply(m, tdev, reject_pass_request); 3318 else { 3319 cxgb_queue_tid_release(cdev, tid); 3320 m_free(m); 3321 } 3322 DPRINTF("failed to get reply_mbuf\n"); 3323 3324 goto out; 3325 } 3326 3327 if (tp->t_state != TCPS_LISTEN) { 3328 DPRINTF("socket not in listen state\n"); 3329 3330 goto reject; 3331 } 3332 3333 tim.mac_addr = req->dst_mac; 3334 tim.vlan_tag = ntohs(req->vlan_tag); 3335 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 3336 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 3337 goto reject; 3338 } 3339 3340#ifdef notyet 3341 /* 3342 
* XXX do route lookup to confirm that we're still listening on this 3343 * address 3344 */ 3345 if (ip_route_input(skb, req->local_ip, req->peer_ip, 3346 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 3347 goto reject; 3348 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 3349 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 3350 dst_release(skb->dst); // done with the input route, release it 3351 skb->dst = NULL; 3352 3353 if ((rt_flags & RTF_LOCAL) == 0) 3354 goto reject; 3355#endif 3356 /* 3357 * XXX 3358 */ 3359 rt_flags = RTF_LOCAL; 3360 if ((rt_flags & RTF_LOCAL) == 0) 3361 goto reject; 3362 3363 /* 3364 * Calculate values and add to syncache 3365 */ 3366 3367 newtoep = toepcb_alloc(); 3368 if (newtoep == NULL) 3369 goto reject; 3370 3371 bzero(&nam, sizeof(struct sockaddr_in)); 3372 3373 nam.sin_len = sizeof(struct sockaddr_in); 3374 nam.sin_family = AF_INET; 3375 nam.sin_addr.s_addr =req->peer_ip; 3376 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 3377 3378 if (dst == NULL) { 3379 printf("failed to find route\n"); 3380 goto reject; 3381 } 3382 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 3383 (struct sockaddr *)&nam); 3384 if (e == NULL) { 3385 DPRINTF("failed to get l2t\n"); 3386 } 3387 /* 3388 * Point to our listen socket until accept 3389 */ 3390 newtoep->tp_tp = tp; 3391 newtoep->tp_flags = TP_SYN_RCVD; 3392 newtoep->tp_tid = tid; 3393 newtoep->tp_toedev = tdev; 3394 tp->rcv_wnd = select_rcv_wnd(tdev, so); 3395 3396 cxgb_insert_tid(cdev, d->client, newtoep, tid); 3397 so_lock(so); 3398 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 3399 so_unlock(so); 3400 3401 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && 3402 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; 3403 3404 if (newtoep->tp_ulp_mode) { 3405 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3406 3407 if (ddp_mbuf == NULL) 3408 newtoep->tp_ulp_mode = 0; 3409 } 3410 3411 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", 3412 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); 3413 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 3414 /* 3415 * XXX workaround for lack of syncache drop 3416 */ 3417 toepcb_hold(newtoep); 3418 syncache_add_accept_req(req, so, newtoep); 3419 3420 rpl = cplhdr(reply_mbuf); 3421 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 3422 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3423 rpl->wr.wr_lo = 0; 3424 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3425 rpl->opt2 = htonl(calc_opt2(so, tdev)); 3426 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3427 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 3428 3429 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 3430 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 3431 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | 3432 CPL_PASS_OPEN_ACCEPT); 3433 3434 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 3435 3436 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); 3437 3438 l2t_send(cdev, reply_mbuf, e); 3439 m_free(m); 3440 if (newtoep->tp_ulp_mode) { 3441 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 3442 V_TF_DDP_OFF(1) | 3443 TP_DDP_TIMER_WORKAROUND_MASK, 3444 V_TF_DDP_OFF(1) | 3445 TP_DDP_TIMER_WORKAROUND_VAL, 1); 3446 } else 3447 DPRINTF("no DDP\n"); 3448 3449 return; 3450reject: 3451 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3452 mk_pass_accept_rpl(reply_mbuf, m); 3453 else 3454 mk_tid_release(reply_mbuf, newtoep, tid); 3455 cxgb_ofld_send(cdev, reply_mbuf); 3456 m_free(m); 3457out: 3458#if 0 3459 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3460#else 3461 return; 3462#endif 3463} 3464 3465/* 3466 * Handle 
a CPL_PASS_ACCEPT_REQ message. 3467 */ 3468static int 3469do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3470{ 3471 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; 3472 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */ 3473 struct tom_data *d = listen_ctx->tom_data; 3474 3475#if VALIDATE_TID 3476 struct cpl_pass_accept_req *req = cplhdr(m); 3477 unsigned int tid = GET_TID(req); 3478 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; 3479 3480 if (unlikely(!lsk)) { 3481 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", 3482 cdev->name, 3483 (unsigned long)((union listen_entry *)ctx - 3484 t->stid_tab)); 3485 return CPL_RET_BUF_DONE; 3486 } 3487 if (unlikely(tid >= t->ntids)) { 3488 printk(KERN_ERR "%s: passive open TID %u too large\n", 3489 cdev->name, tid); 3490 return CPL_RET_BUF_DONE; 3491 } 3492 /* 3493 * For T3A the current user of the TID may have closed but its last 3494 * message(s) may have been backlogged so the TID appears to be still 3495 * in use. Just take the TID away, the connection can close at its 3496 * own leisure. For T3B this situation is a bug. 3497 */ 3498 if (!valid_new_tid(t, tid) && 3499 cdev->type != T3A) { 3500 printk(KERN_ERR "%s: passive open uses existing TID %u\n", 3501 cdev->name, tid); 3502 return CPL_RET_BUF_DONE; 3503 } 3504#endif 3505 3506 process_pass_accept_req(lso, m, &d->tdev, listen_ctx); 3507 return (0); 3508} 3509 3510/* 3511 * Called when a connection is established to translate the TCP options 3512 * reported by HW to FreeBSD's native format. 3513 */ 3514static void 3515assign_rxopt(struct socket *so, unsigned int opt) 3516{ 3517 struct tcpcb *tp = so_sototcpcb(so); 3518 struct toepcb *toep = tp->t_toe; 3519 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); 3520 3521 inp_lock_assert(tp->t_inpcb); 3522 3523 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3524 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? 
TF_RCVD_TSTMP : 0; 3525 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; 3526 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; 3527 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3528 (TF_RCVD_SCALE|TF_REQ_SCALE)) 3529 tp->rcv_scale = tp->request_r_scale; 3530} 3531 3532/* 3533 * Completes some final bits of initialization for just established connections 3534 * and changes their state to TCP_ESTABLISHED. 3535 * 3536 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 3537 */ 3538static void 3539make_established(struct socket *so, u32 snd_isn, unsigned int opt) 3540{ 3541 struct tcpcb *tp = so_sototcpcb(so); 3542 struct toepcb *toep = tp->t_toe; 3543 3544 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; 3545 assign_rxopt(so, opt); 3546 3547 /* 3548 *XXXXXXXXXXX 3549 * 3550 */ 3551#ifdef notyet 3552 so->so_proto->pr_ctloutput = t3_ctloutput; 3553#endif 3554 3555#if 0 3556 inet_sk(sk)->id = tp->write_seq ^ jiffies; 3557#endif 3558 /* 3559 * XXX not clear what rcv_wup maps to 3560 */ 3561 /* 3562 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't 3563 * pass through opt0. 
3564 */ 3565 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) 3566 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); 3567 3568 dump_toepcb(toep); 3569 3570#ifdef notyet 3571/* 3572 * no clean interface for marking ARP up to date 3573 */ 3574 dst_confirm(sk->sk_dst_cache); 3575#endif 3576 tp->t_starttime = ticks; 3577 tp->t_state = TCPS_ESTABLISHED; 3578 soisconnected(so); 3579} 3580 3581static int 3582syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) 3583{ 3584 3585 struct in_conninfo inc; 3586 struct tcpopt to; 3587 struct tcphdr th; 3588 int mss, wsf, sack, ts; 3589 struct mbuf *m = NULL; 3590 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); 3591 unsigned int opt; 3592 3593#ifdef MAC 3594#error "no MAC support" 3595#endif 3596 3597 opt = ntohs(req->tcp_opt); 3598 3599 bzero(&to, sizeof(struct tcpopt)); 3600 3601 /* 3602 * Fill out information for entering us into the syncache 3603 */ 3604 bzero(&inc, sizeof(inc)); 3605 inc.inc_fport = th.th_sport = req->peer_port; 3606 inc.inc_lport = th.th_dport = req->local_port; 3607 th.th_seq = req->rcv_isn; 3608 th.th_flags = TH_ACK; 3609 3610 inc.inc_isipv6 = 0; 3611 inc.inc_len = 0; 3612 inc.inc_faddr.s_addr = req->peer_ip; 3613 inc.inc_laddr.s_addr = req->local_ip; 3614 3615 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3616 wsf = G_TCPOPT_WSCALE_OK(opt); 3617 ts = G_TCPOPT_TSTAMP(opt); 3618 sack = G_TCPOPT_SACK(opt); 3619 3620 to.to_mss = mss; 3621 to.to_wscale = G_TCPOPT_SND_WSCALE(opt); 3622 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3623 3624 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", 3625 ntohl(req->local_ip), ntohs(req->local_port), 3626 ntohl(req->peer_ip), ntohs(req->peer_port), 3627 mss, wsf, ts, sack); 3628 return tcp_offload_syncache_expand(&inc, &to, &th, so, m); 3629} 3630 3631 3632/* 3633 * Process a CPL_PASS_ESTABLISH message. 
 * XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Take the embryonic connection off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* so now refers to the newly created (accepted) socket. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* The HW owns segmentation; don't let the sockbufs coalesce mbufs. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Re-point the toepcb at the new connection's tcpcb and vice versa. */
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	/* Queue set number is carried in the mbuf's csum_data field. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 * (drops the extra reference taken via toepcb_hold() when the
	 * request was added to the SYN queue; tp->t_toe still holds one)
	 */
	toepcb_release(toep);
	inp_wunlock(tp->t_inpcb);

	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	INIT_VNET_INET(so->so_vnet);
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	V_tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		/* Connection already gone; just return the active-open TID. */
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3885 */ 3886static void 3887wr_ack(struct toepcb *toep, struct mbuf *m) 3888{ 3889 struct tcpcb *tp = toep->tp_tp; 3890 struct cpl_wr_ack *hdr = cplhdr(m); 3891 struct socket *so; 3892 unsigned int credits = ntohs(hdr->credits); 3893 u32 snd_una = ntohl(hdr->snd_una); 3894 int bytes = 0; 3895 struct sockbuf *snd; 3896 3897 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3898 3899 inp_wlock(tp->t_inpcb); 3900 so = inp_inpcbtosocket(tp->t_inpcb); 3901 toep->tp_wr_avail += credits; 3902 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3903 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3904 3905 while (credits) { 3906 struct mbuf *p = peek_wr(toep); 3907 3908 if (__predict_false(!p)) { 3909 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3910 "nothing pending, state %u wr_avail=%u\n", 3911 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3912 break; 3913 } 3914 CTR2(KTR_TOM, 3915 "wr_ack: p->credits=%d p->bytes=%d", 3916 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3917 KASSERT(p->m_pkthdr.csum_data != 0, 3918 ("empty request still on list")); 3919 3920 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3921 3922#if DEBUG_WR > 1 3923 struct tx_data_wr *w = cplhdr(p); 3924 log(LOG_ERR, 3925 "TID %u got %u WR credits, need %u, len %u, " 3926 "main body %u, frags %u, seq # %u, ACK una %u," 3927 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3928 toep->tp_tid, credits, p->csum, p->len, 3929 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3930 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3931 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3932#endif 3933 p->m_pkthdr.csum_data -= credits; 3934 break; 3935 } else { 3936 dequeue_wr(toep); 3937 credits -= p->m_pkthdr.csum_data; 3938 bytes += p->m_pkthdr.len; 3939 CTR3(KTR_TOM, 3940 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3941 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3942 3943 m_free(p); 3944 } 3945 } 3946 3947#if DEBUG_WR 3948 
check_wr_invariants(tp); 3949#endif 3950 3951 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3952#if VALIDATE_SEQ 3953 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3954 3955 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3956 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3957 toep->tp_tid, tp->snd_una); 3958#endif 3959 goto out_free; 3960 } 3961 3962 if (tp->snd_una != snd_una) { 3963 tp->snd_una = snd_una; 3964 tp->ts_recent_age = ticks; 3965#ifdef notyet 3966 /* 3967 * Keep ARP entry "minty fresh" 3968 */ 3969 dst_confirm(sk->sk_dst_cache); 3970#endif 3971 if (tp->snd_una == tp->snd_nxt) 3972 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3973 } 3974 3975 snd = so_sockbuf_snd(so); 3976 if (bytes) { 3977 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3978 snd = so_sockbuf_snd(so); 3979 sockbuf_lock(snd); 3980 sbdrop_locked(snd, bytes); 3981 so_sowwakeup_locked(so); 3982 } 3983 3984 if (snd->sb_sndptroff < snd->sb_cc) 3985 t3_push_frames(so, 0); 3986 3987out_free: 3988 inp_wunlock(tp->t_inpcb); 3989 m_free(m); 3990} 3991 3992/* 3993 * Handler for TX_DATA_ACK CPL messages. 3994 */ 3995static int 3996do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3997{ 3998 struct toepcb *toep = (struct toepcb *)ctx; 3999 4000 VALIDATE_SOCK(so); 4001 4002 wr_ack(toep, m); 4003 return 0; 4004} 4005 4006/* 4007 * Handler for TRACE_PKT CPL messages. Just sink these packets. 4008 */ 4009static int 4010do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 4011{ 4012 m_freem(m); 4013 return 0; 4014} 4015 4016/* 4017 * Reset a connection that is on a listener's SYN queue or accept queue, 4018 * i.e., one that has not had a struct socket associated with it. 4019 * Must be called from process context. 4020 * 4021 * Modeled after code in inet_csk_listen_stop(). 
4022 */ 4023static void 4024t3_reset_listen_child(struct socket *child) 4025{ 4026 struct tcpcb *tp = so_sototcpcb(child); 4027 4028 t3_send_reset(tp->t_toe); 4029} 4030 4031 4032static void 4033t3_child_disconnect(struct socket *so, void *arg) 4034{ 4035 struct tcpcb *tp = so_sototcpcb(so); 4036 4037 if (tp->t_flags & TF_TOE) { 4038 inp_wlock(tp->t_inpcb); 4039 t3_reset_listen_child(so); 4040 inp_wunlock(tp->t_inpcb); 4041 } 4042} 4043 4044/* 4045 * Disconnect offloaded established but not yet accepted connections sitting 4046 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4047 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4048 */ 4049void 4050t3_disconnect_acceptq(struct socket *listen_so) 4051{ 4052 4053 so_lock(listen_so); 4054 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4055 so_unlock(listen_so); 4056} 4057 4058/* 4059 * Reset offloaded connections sitting on a server's syn queue. As above 4060 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4061 */ 4062 4063void 4064t3_reset_synq(struct listen_ctx *lctx) 4065{ 4066 struct toepcb *toep; 4067 4068 so_lock(lctx->lso); 4069 while (!LIST_EMPTY(&lctx->synq_head)) { 4070 toep = LIST_FIRST(&lctx->synq_head); 4071 LIST_REMOVE(toep, synq_entry); 4072 toep->tp_tp = NULL; 4073 t3_send_reset(toep); 4074 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4075 toepcb_release(toep); 4076 } 4077 so_unlock(lctx->lso); 4078} 4079 4080 4081int 4082t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4083 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4084 unsigned int pg_off, unsigned int color) 4085{ 4086 unsigned int i, j, pidx; 4087 struct pagepod *p; 4088 struct mbuf *m; 4089 struct ulp_mem_io *req; 4090 unsigned int tid = toep->tp_tid; 4091 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4092 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4093 4094 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4095 gl, nppods, tag, maxoff, pg_off, color); 4096 4097 for (i = 0; i < nppods; ++i) { 4098 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4099 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4100 req = mtod(m, struct ulp_mem_io *); 4101 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4102 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4103 req->wr.wr_lo = 0; 4104 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4105 V_ULPTX_CMD(ULP_MEM_WRITE)); 4106 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4107 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4108 4109 p = (struct pagepod *)(req + 1); 4110 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4111 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4112 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4113 V_PPOD_COLOR(color)); 4114 p->pp_max_offset = htonl(maxoff); 4115 p->pp_page_offset = htonl(pg_off); 4116 p->pp_rsvd = 0; 4117 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4118 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4119 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4120 } else 4121 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4122 send_or_defer(toep, m, 0); 4123 ppod_addr += PPOD_SIZE; 4124 } 4125 return (0); 4126} 4127 4128/* 4129 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4130 */ 4131static inline void 4132mk_cpl_barrier_ulp(struct cpl_barrier *b) 4133{ 4134 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4135 4136 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4137 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4138 b->opcode = CPL_BARRIER; 4139} 4140 4141/* 4142 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4143 */ 4144static inline void 4145mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4146{ 4147 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4148 4149 txpkt = (struct ulp_txpkt *)req; 4150 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4151 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4152 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4153 req->cpuno = htons(cpuno); 4154} 4155 4156/* 4157 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4158 */ 4159static inline void 4160mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4161 unsigned int word, uint64_t mask, uint64_t val) 4162{ 4163 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4164 4165 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4166 tid, word, mask, val); 4167 4168 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4169 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4170 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4171 req->reply = V_NO_REPLY(1); 4172 req->cpu_idx = 0; 4173 req->word = htons(word); 4174 req->mask = htobe64(mask); 4175 req->val = htobe64(val); 4176} 4177 4178/* 4179 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
4180 */ 4181static void 4182mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, 4183 unsigned int tid, unsigned int credits) 4184{ 4185 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; 4186 4187 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4188 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); 4189 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); 4190 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 4191 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | 4192 V_RX_CREDITS(credits)); 4193} 4194 4195void 4196t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) 4197{ 4198 unsigned int wrlen; 4199 struct mbuf *m; 4200 struct work_request_hdr *wr; 4201 struct cpl_barrier *lock; 4202 struct cpl_set_tcb_field *req; 4203 struct cpl_get_tcb *getreq; 4204 struct ddp_state *p = &toep->tp_ddp_state; 4205 4206#if 0 4207 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4208#endif 4209 wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + 4210 sizeof(*getreq); 4211 m = m_gethdr_nofail(wrlen); 4212 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4213 wr = mtod(m, struct work_request_hdr *); 4214 bzero(wr, wrlen); 4215 4216 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4217 m->m_pkthdr.len = m->m_len = wrlen; 4218 4219 lock = (struct cpl_barrier *)(wr + 1); 4220 mk_cpl_barrier_ulp(lock); 4221 4222 req = (struct cpl_set_tcb_field *)(lock + 1); 4223 4224 CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); 4225 4226 /* Hmmm, not sure if this actually a good thing: reactivating 4227 * the other buffer might be an issue if it has been completed 4228 * already. However, that is unlikely, since the fact that the UBUF 4229 * is not completed indicates that there is no oustanding data. 
4230 */ 4231 if (bufidx == 0) 4232 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4233 V_TF_DDP_ACTIVE_BUF(1) | 4234 V_TF_DDP_BUF0_VALID(1), 4235 V_TF_DDP_ACTIVE_BUF(1)); 4236 else 4237 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4238 V_TF_DDP_ACTIVE_BUF(1) | 4239 V_TF_DDP_BUF1_VALID(1), 0); 4240 4241 getreq = (struct cpl_get_tcb *)(req + 1); 4242 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4243 4244 mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); 4245 4246 /* Keep track of the number of oustanding CPL_GET_TCB requests 4247 */ 4248 p->get_tcb_count++; 4249 4250#ifdef T3_TRACE 4251 T3_TRACE1(TIDTB(so), 4252 "t3_cancel_ddpbuf: bufidx %u", bufidx); 4253#endif 4254 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4255} 4256 4257/** 4258 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one 4259 * @sk: the socket associated with the buffers 4260 * @bufidx: index of HW DDP buffer (0 or 1) 4261 * @tag0: new tag for HW buffer 0 4262 * @tag1: new tag for HW buffer 1 4263 * @len: new length for HW buf @bufidx 4264 * 4265 * Sends a compound WR to overlay a new DDP buffer on top of an existing 4266 * buffer by changing the buffer tag and length and setting the valid and 4267 * active flag accordingly. The caller must ensure the new buffer is at 4268 * least as big as the existing one. Since we typically reprogram both HW 4269 * buffers this function sets both tags for convenience. Read the TCB to 4270 * determine how made data was written into the buffer before the overlay 4271 * took place. 
4272 */ 4273void 4274t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, 4275 unsigned int tag1, unsigned int len) 4276{ 4277 unsigned int wrlen; 4278 struct mbuf *m; 4279 struct work_request_hdr *wr; 4280 struct cpl_get_tcb *getreq; 4281 struct cpl_set_tcb_field *req; 4282 struct ddp_state *p = &toep->tp_ddp_state; 4283 4284 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", 4285 bufidx, tag0, tag1, len); 4286#if 0 4287 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4288#endif 4289 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); 4290 m = m_gethdr_nofail(wrlen); 4291 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4292 wr = mtod(m, struct work_request_hdr *); 4293 m->m_pkthdr.len = m->m_len = wrlen; 4294 bzero(wr, wrlen); 4295 4296 4297 /* Set the ATOMIC flag to make sure that TP processes the following 4298 * CPLs in an atomic manner and no wire segments can be interleaved. 4299 */ 4300 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); 4301 req = (struct cpl_set_tcb_field *)(wr + 1); 4302 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, 4303 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | 4304 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, 4305 V_TCB_RX_DDP_BUF0_TAG(tag0) | 4306 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); 4307 req++; 4308 if (bufidx == 0) { 4309 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, 4310 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4311 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 4312 req++; 4313 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4314 V_TF_DDP_PUSH_DISABLE_0(1) | 4315 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4316 V_TF_DDP_PUSH_DISABLE_0(0) | 4317 V_TF_DDP_BUF0_VALID(1)); 4318 } else { 4319 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, 4320 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), 4321 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); 4322 req++; 4323 mk_set_tcb_field_ulp(req, toep->tp_tid, 
W_TCB_RX_DDP_FLAGS, 4324 V_TF_DDP_PUSH_DISABLE_1(1) | 4325 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4326 V_TF_DDP_PUSH_DISABLE_1(0) | 4327 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); 4328 } 4329 4330 getreq = (struct cpl_get_tcb *)(req + 1); 4331 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4332 4333 /* Keep track of the number of oustanding CPL_GET_TCB requests 4334 */ 4335 p->get_tcb_count++; 4336 4337#ifdef T3_TRACE 4338 T3_TRACE4(TIDTB(sk), 4339 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " 4340 "len %d", 4341 bufidx, tag0, tag1, len); 4342#endif 4343 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4344} 4345 4346/* 4347 * Sends a compound WR containing all the CPL messages needed to program the 4348 * two HW DDP buffers, namely optionally setting up the length and offset of 4349 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. 4350 */ 4351void 4352t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, 4353 unsigned int len1, unsigned int offset1, 4354 uint64_t ddp_flags, uint64_t flag_mask, int modulate) 4355{ 4356 unsigned int wrlen; 4357 struct mbuf *m; 4358 struct work_request_hdr *wr; 4359 struct cpl_set_tcb_field *req; 4360 4361 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", 4362 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); 4363 4364#if 0 4365 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4366#endif 4367 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + 4368 (len1 ? sizeof(*req) : 0) + 4369 (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); 4370 m = m_gethdr_nofail(wrlen); 4371 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4372 wr = mtod(m, struct work_request_hdr *); 4373 bzero(wr, wrlen); 4374 4375 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4376 m->m_pkthdr.len = m->m_len = wrlen; 4377 4378 req = (struct cpl_set_tcb_field *)(wr + 1); 4379 if (len0) { /* program buffer 0 offset and length */ 4380 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, 4381 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 4382 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4383 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | 4384 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); 4385 req++; 4386 } 4387 if (len1) { /* program buffer 1 offset and length */ 4388 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, 4389 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 4390 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, 4391 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | 4392 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); 4393 req++; 4394 } 4395 4396 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, 4397 ddp_flags); 4398 4399 if (modulate) { 4400 mk_rx_data_ack_ulp(toep, 4401 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, 4402 toep->tp_copied_seq - toep->tp_rcv_wup); 4403 toep->tp_rcv_wup = toep->tp_copied_seq; 4404 } 4405 4406#ifdef T3_TRACE 4407 T3_TRACE5(TIDTB(sk), 4408 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " 4409 "modulate %d", 4410 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, 4411 modulate); 4412#endif 4413 4414 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4415} 4416 4417void 4418t3_init_wr_tab(unsigned int wr_len) 4419{ 4420 int i; 4421 4422 if (mbuf_wrs[1]) /* already initialized */ 4423 return; 4424 4425 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 4426 int sgl_len = (3 * i) / 2 + (i & 1); 4427 4428 sgl_len += 3; 4429 mbuf_wrs[i] = sgl_len <= wr_len ? 
4430 1 : 1 + (sgl_len - 2) / (wr_len - 1); 4431 } 4432 4433 wrlen = wr_len * 8; 4434} 4435 4436int 4437t3_init_cpl_io(void) 4438{ 4439#ifdef notyet 4440 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 4441 if (!tcphdr_skb) { 4442 log(LOG_ERR, 4443 "Chelsio TCP offload: can't allocate sk_buff\n"); 4444 return -1; 4445 } 4446 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 4447 tcphdr_skb->h.raw = tcphdr_skb->data; 4448 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 4449#endif 4450 4451 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 4452 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 4453 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 4454 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 4455 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 4456 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 4457 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 4458 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 4459 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 4460 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 4461 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 4462 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 4463 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 4464 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 4465 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 4466 return (0); 4467} 4468 4469