cxgb_cpl_io.c revision 180644
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 180644 2008-07-21 01:23:19Z kmacy $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/socket.h> 43#include <sys/sysctl.h> 44#include <sys/syslog.h> 45#include <sys/protosw.h> 46#include <sys/priv.h> 47 48#include <net/if.h> 49#include <net/route.h> 50 51#include <netinet/in.h> 52#include <netinet/in_pcb.h> 53#include <netinet/in_systm.h> 54#include <netinet/in_var.h> 55 56 57#include <dev/cxgb/cxgb_osdep.h> 58#include <dev/cxgb/sys/mbufq.h> 59 60#include <netinet/ip.h> 61#include <netinet/tcp_var.h> 62#include <netinet/tcp_fsm.h> 63#include <netinet/tcp_offload.h> 64#include <netinet/tcp_seq.h> 65#include <netinet/tcp_syncache.h> 66#include <netinet/tcp_timer.h> 67#include <net/route.h> 68 69#include <dev/cxgb/t3cdev.h> 70#include <dev/cxgb/common/cxgb_firmware_exports.h> 71#include <dev/cxgb/common/cxgb_t3_cpl.h> 72#include <dev/cxgb/common/cxgb_tcb.h> 73#include <dev/cxgb/common/cxgb_ctl_defs.h> 74#include <dev/cxgb/cxgb_offload.h> 75#include <vm/vm.h> 76#include <vm/pmap.h> 77#include <machine/bus.h> 78#include <dev/cxgb/sys/mvec.h> 79#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 80#include <dev/cxgb/ulp/tom/cxgb_defs.h> 81#include <dev/cxgb/ulp/tom/cxgb_tom.h> 82#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> 83#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> 84#include <dev/cxgb/ulp/tom/cxgb_tcp.h> 85 86#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> 87 88/* 89 * For ULP connections HW may add headers, e.g., for digests, that aren't part 90 * of the messages sent by the host but that are part of the TCP payload and 91 * therefore consume TCP sequence space. 
Tx connection parameters that 92 * operate in TCP sequence space are affected by the HW additions and need to 93 * compensate for them to accurately track TCP sequence numbers. This array 94 * contains the compensating extra lengths for ULP packets. It is indexed by 95 * a packet's ULP submode. 96 */ 97const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 98 99#ifdef notyet 100/* 101 * This sk_buff holds a fake header-only TCP segment that we use whenever we 102 * need to exploit SW TCP functionality that expects TCP headers, such as 103 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 104 * CPUs without locking. 105 */ 106static struct mbuf *tcphdr_mbuf __read_mostly; 107#endif 108 109/* 110 * Size of WRs in bytes. Note that we assume all devices we are handling have 111 * the same WR size. 112 */ 113static unsigned int wrlen __read_mostly; 114 115/* 116 * The number of WRs needed for an skb depends on the number of page fragments 117 * in the skb and whether it has any payload in its main body. This maps the 118 * length of the gather list represented by an skb into the # of necessary WRs. 119 */ 120static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; 121 122/* 123 * Max receive window supported by HW in bytes. Only a small part of it can 124 * be set through option0, the rest needs to be set through RX_DATA_ACK. 125 */ 126#define MAX_RCV_WND ((1U << 27) - 1) 127 128/* 129 * Min receive window. We want it to be large enough to accommodate receive 130 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
131 */ 132#define MIN_RCV_WND (24 * 1024U) 133#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) 134 135#define VALIDATE_SEQ 0 136#define VALIDATE_SOCK(so) 137#define DEBUG_WR 0 138 139#define TCP_TIMEWAIT 1 140#define TCP_CLOSE 2 141#define TCP_DROP 3 142 143extern int tcp_do_autorcvbuf; 144extern int tcp_do_autosndbuf; 145extern int tcp_autorcvbuf_max; 146extern int tcp_autosndbuf_max; 147 148static void t3_send_reset(struct toepcb *toep); 149static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); 150static inline void free_atid(struct t3cdev *cdev, unsigned int tid); 151static void handle_syncache_event(int event, void *arg); 152 153static inline void 154SBAPPEND(struct sockbuf *sb, struct mbuf *n) 155{ 156 struct mbuf *m; 157 158 m = sb->sb_mb; 159 while (m) { 160 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 161 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 162 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 163 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 164 m->m_next, m->m_nextpkt, m->m_flags)); 165 m = m->m_next; 166 } 167 m = n; 168 while (m) { 169 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 170 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 171 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 172 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 173 m->m_next, m->m_nextpkt, m->m_flags)); 174 m = m->m_next; 175 } 176 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); 177 sbappendstream_locked(sb, n); 178 m = sb->sb_mb; 179 180 while (m) { 181 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 182 m->m_next, m->m_nextpkt, m->m_flags)); 183 m = m->m_next; 184 } 185} 186 187static inline int 188is_t3a(const struct toedev *dev) 189{ 190 
	/*
	 * NOTE(review): despite the name, this compares against
	 * TOE_ID_CHELSIO_T3 (not a distinct T3A id) — presumably T3A shares
	 * that ttid; confirm against toedev id assignments.
	 */
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

/*
 * Debug helper: log the salient scheduling/credit state of an offloaded
 * connection (qset, ULP mode, MTU index, TID, WR credit counters, flags).
 * Compiles to nothing unless DPRINTF is enabled.
 */
static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
/*
 * Wrapper around rtalloc1() that returns the route entry unlocked (but
 * still referenced), or NULL if no route was found.
 */
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;
	
	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);
	
	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		/* No TID yet: park on the out-of-order queue under the inp lock. */
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);	// send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);		// send directly
}

/*
 * Map a CPL priority to an mbuf priority.  Currently a pass-through; the
 * toep argument is unused but kept for interface symmetry with other TOMs.
 */
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must be already properly sized.
248 */ 249static inline void 250mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) 251{ 252 struct cpl_tid_release *req; 253 254 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); 255 m->m_pkthdr.len = m->m_len = sizeof(*req); 256 req = mtod(m, struct cpl_tid_release *); 257 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 258 req->wr.wr_lo = 0; 259 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 260} 261 262static inline void 263make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) 264{ 265 struct tcpcb *tp = so_sototcpcb(so); 266 struct toepcb *toep = tp->t_toe; 267 struct tx_data_wr *req; 268 struct sockbuf *snd; 269 270 inp_lock_assert(tp->t_inpcb); 271 snd = so_sockbuf_snd(so); 272 273 req = mtod(m, struct tx_data_wr *); 274 m->m_len = sizeof(*req); 275 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 276 req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); 277 /* len includes the length of any HW ULP additions */ 278 req->len = htonl(len); 279 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 280 /* V_TX_ULP_SUBMODE sets both the mode and submode */ 281 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | 282 V_TX_URG(/* skb_urgent(skb) */ 0 ) | 283 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && 284 (tail ? 0 : 1)))); 285 req->sndseq = htonl(tp->snd_nxt); 286 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 287 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 288 V_TX_CPU_IDX(toep->tp_qset)); 289 290 /* Sendbuffer is in units of 32KB. 
291 */ 292 if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 293 req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15)); 294 else { 295 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); 296 } 297 298 toep->tp_flags |= TP_DATASENT; 299 } 300} 301 302#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ 303 304int 305t3_push_frames(struct socket *so, int req_completion) 306{ 307 struct tcpcb *tp = so_sototcpcb(so); 308 struct toepcb *toep = tp->t_toe; 309 310 struct mbuf *tail, *m0, *last; 311 struct t3cdev *cdev; 312 struct tom_data *d; 313 int state, bytes, count, total_bytes; 314 bus_dma_segment_t segs[TX_MAX_SEGS], *segp; 315 struct sockbuf *snd; 316 317 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { 318 DPRINTF("tcp state=%d\n", tp->t_state); 319 return (0); 320 } 321 322 state = so_state_get(so); 323 324 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { 325 DPRINTF("disconnecting\n"); 326 327 return (0); 328 } 329 330 inp_lock_assert(tp->t_inpcb); 331 332 snd = so_sockbuf_snd(so); 333 sockbuf_lock(snd); 334 335 d = TOM_DATA(toep->tp_toedev); 336 cdev = d->cdev; 337 338 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; 339 340 total_bytes = 0; 341 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", 342 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); 343 344 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { 345 KASSERT(tail, ("sbdrop error")); 346 last = tail = tail->m_next; 347 } 348 349 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { 350 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); 351 sockbuf_unlock(snd); 352 353 return (0); 354 } 355 356 toep->tp_m_last = NULL; 357 while (toep->tp_wr_avail && (tail != NULL)) { 358 count = bytes = 0; 359 segp = segs; 360 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { 361 sockbuf_unlock(snd); 362 return (0); 363 } 364 /* 365 * If the data in tail fits as in-line, then 366 * make an immediate data wr. 
367 */ 368 if (tail->m_len <= IMM_LEN) { 369 count = 1; 370 bytes = tail->m_len; 371 last = tail; 372 tail = tail->m_next; 373 m_set_sgl(m0, NULL); 374 m_set_sgllen(m0, 0); 375 make_tx_data_wr(so, m0, bytes, tail); 376 m_append(m0, bytes, mtod(last, caddr_t)); 377 KASSERT(!m0->m_next, ("bad append")); 378 } else { 379 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) 380 && (tail != NULL) && (count < TX_MAX_SEGS-1)) { 381 bytes += tail->m_len; 382 last = tail; 383 count++; 384 /* 385 * technically an abuse to be using this for a VA 386 * but less gross than defining my own structure 387 * or calling pmap_kextract from here :-| 388 */ 389 segp->ds_addr = (bus_addr_t)tail->m_data; 390 segp->ds_len = tail->m_len; 391 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", 392 count, mbuf_wrs[count], tail->m_data, tail->m_len); 393 segp++; 394 tail = tail->m_next; 395 } 396 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", 397 toep->tp_wr_avail, count, mbuf_wrs[count], tail); 398 399 m_set_sgl(m0, segs); 400 m_set_sgllen(m0, count); 401 make_tx_data_wr(so, m0, bytes, tail); 402 } 403 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); 404 405 if (tail) { 406 snd->sb_sndptr = tail; 407 toep->tp_m_last = NULL; 408 } else 409 toep->tp_m_last = snd->sb_sndptr = last; 410 411 412 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); 413 414 snd->sb_sndptroff += bytes; 415 total_bytes += bytes; 416 toep->tp_write_seq += bytes; 417 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" 418 " tail=%p sndptr=%p sndptroff=%d", 419 toep->tp_wr_avail, count, mbuf_wrs[count], 420 tail, snd->sb_sndptr, snd->sb_sndptroff); 421 if (tail) 422 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" 423 " tp_m_last=%p tailbuf=%p snd_una=0x%08x", 424 total_bytes, toep->tp_m_last, tail->m_data, 425 tp->snd_una); 426 else 427 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" 428 " tp_m_last=%p snd_una=0x%08x", 429 total_bytes, toep->tp_m_last, tp->snd_una); 430 431 432#ifdef KTR 433{ 434 int i; 435 
436 i = 0; 437 while (i < count && m_get_sgllen(m0)) { 438 if ((count - i) >= 3) { 439 CTR6(KTR_TOM, 440 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 441 " len=%d pa=0x%zx len=%d", 442 segs[i].ds_addr, segs[i].ds_len, 443 segs[i + 1].ds_addr, segs[i + 1].ds_len, 444 segs[i + 2].ds_addr, segs[i + 2].ds_len); 445 i += 3; 446 } else if ((count - i) == 2) { 447 CTR4(KTR_TOM, 448 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 449 " len=%d", 450 segs[i].ds_addr, segs[i].ds_len, 451 segs[i + 1].ds_addr, segs[i + 1].ds_len); 452 i += 2; 453 } else { 454 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", 455 segs[i].ds_addr, segs[i].ds_len); 456 i++; 457 } 458 459 } 460} 461#endif 462 /* 463 * remember credits used 464 */ 465 m0->m_pkthdr.csum_data = mbuf_wrs[count]; 466 m0->m_pkthdr.len = bytes; 467 toep->tp_wr_avail -= mbuf_wrs[count]; 468 toep->tp_wr_unacked += mbuf_wrs[count]; 469 470 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || 471 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 472 struct work_request_hdr *wr = cplhdr(m0); 473 474 wr->wr_hi |= htonl(F_WR_COMPL); 475 toep->tp_wr_unacked = 0; 476 } 477 KASSERT((m0->m_pkthdr.csum_data > 0) && 478 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", 479 m0->m_pkthdr.csum_data)); 480 m0->m_type = MT_DONTFREE; 481 enqueue_wr(toep, m0); 482 DPRINTF("sending offload tx with %d bytes in %d segments\n", 483 bytes, count); 484 l2t_send(cdev, m0, toep->tp_l2t); 485 } 486 sockbuf_unlock(snd); 487 return (total_bytes); 488} 489 490/* 491 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail 492 * under any circumstances. We take the easy way out and always queue the 493 * message to the write_queue. We can optimize the case where the queue is 494 * already empty though the optimization is probably not worth it. 
495 */ 496static void 497close_conn(struct socket *so) 498{ 499 struct mbuf *m; 500 struct cpl_close_con_req *req; 501 struct tom_data *d; 502 struct inpcb *inp = so_sotoinpcb(so); 503 struct tcpcb *tp; 504 struct toepcb *toep; 505 unsigned int tid; 506 507 508 inp_wlock(inp); 509 tp = so_sototcpcb(so); 510 toep = tp->t_toe; 511 512 if (tp->t_state != TCPS_SYN_SENT) 513 t3_push_frames(so, 1); 514 515 if (toep->tp_flags & TP_FIN_SENT) { 516 inp_wunlock(inp); 517 return; 518 } 519 520 tid = toep->tp_tid; 521 522 d = TOM_DATA(toep->tp_toedev); 523 524 m = m_gethdr_nofail(sizeof(*req)); 525 m_set_priority(m, CPL_PRIORITY_DATA); 526 m_set_sgl(m, NULL); 527 m_set_sgllen(m, 0); 528 529 toep->tp_flags |= TP_FIN_SENT; 530 req = mtod(m, struct cpl_close_con_req *); 531 532 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); 533 req->wr.wr_lo = htonl(V_WR_TID(tid)); 534 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 535 req->rsvd = 0; 536 inp_wunlock(inp); 537 /* 538 * XXX - need to defer shutdown while there is still data in the queue 539 * 540 */ 541 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); 542 cxgb_ofld_send(d->cdev, m); 543 544} 545 546/* 547 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant 548 * and send it along. 549 */ 550static void 551abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) 552{ 553 struct cpl_abort_req *req = cplhdr(m); 554 555 req->cmd = CPL_ABORT_NO_RST; 556 cxgb_ofld_send(cdev, m); 557} 558 559/* 560 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are 561 * permitted to return without sending the message in case we cannot allocate 562 * an sk_buff. Returns the number of credits sent. 
563 */ 564uint32_t 565t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) 566{ 567 struct mbuf *m; 568 struct cpl_rx_data_ack *req; 569 struct toepcb *toep = tp->t_toe; 570 struct toedev *tdev = toep->tp_toedev; 571 572 m = m_gethdr_nofail(sizeof(*req)); 573 574 DPRINTF("returning %u credits to HW\n", credits); 575 576 req = mtod(m, struct cpl_rx_data_ack *); 577 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 578 req->wr.wr_lo = 0; 579 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 580 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 581 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); 582 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 583 return (credits); 584} 585 586/* 587 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. 588 * This is only used in DDP mode, so we take the opportunity to also set the 589 * DACK mode and flush any Rx credits. 590 */ 591void 592t3_send_rx_modulate(struct toepcb *toep) 593{ 594 struct mbuf *m; 595 struct cpl_rx_data_ack *req; 596 597 m = m_gethdr_nofail(sizeof(*req)); 598 599 req = mtod(m, struct cpl_rx_data_ack *); 600 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 601 req->wr.wr_lo = 0; 602 m->m_pkthdr.len = m->m_len = sizeof(*req); 603 604 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 605 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 606 V_RX_DACK_MODE(1) | 607 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); 608 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 609 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 610 toep->tp_rcv_wup = toep->tp_copied_seq; 611} 612 613/* 614 * Handle receipt of an urgent pointer. 
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
	/*
	 * NOTE(review): this body is vestigial Linux code (sk_buff, sock_flag,
	 * tom_eat_skb) retained under an ifdef that is never defined; it does
	 * not compile as-is and the function is a no-op.
	 */
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;	/* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;	/* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.  Consumes the mbuf in all cases.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.  ctx is the connection's toepcb,
 * installed when the TID was registered.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

/*
 * Whether the connection's delayed-ACK mode may be changed.
 *
 * NOTE(review): the second clause is dead code — if tp_ulp_mode equals
 * ULP_MODE_TCPDDP (non-zero) the first operand of || is already true.
 * The intent was presumably to *restrict* DDP mode to >= T3 hardware;
 * confirm against the vendor reference before changing the logic.
 */
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
	    (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
	     dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
688 */ 689#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 690 691/* 692 * Called after some received data has been read. It returns RX credits 693 * to the HW for the amount of data processed. 694 */ 695void 696t3_cleanup_rbuf(struct tcpcb *tp, int copied) 697{ 698 struct toepcb *toep = tp->t_toe; 699 struct socket *so; 700 struct toedev *dev; 701 int dack_mode, must_send, read; 702 u32 thres, credits, dack = 0; 703 struct sockbuf *rcv; 704 705 so = inp_inpcbtosocket(tp->t_inpcb); 706 rcv = so_sockbuf_rcv(so); 707 708 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 709 (tp->t_state == TCPS_FIN_WAIT_2))) { 710 if (copied) { 711 sockbuf_lock(rcv); 712 toep->tp_copied_seq += copied; 713 sockbuf_unlock(rcv); 714 } 715 716 return; 717 } 718 719 inp_lock_assert(tp->t_inpcb); 720 721 sockbuf_lock(rcv); 722 if (copied) 723 toep->tp_copied_seq += copied; 724 else { 725 read = toep->tp_enqueued_bytes - rcv->sb_cc; 726 toep->tp_copied_seq += read; 727 } 728 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 729 toep->tp_enqueued_bytes = rcv->sb_cc; 730 sockbuf_unlock(rcv); 731 732 if (credits > rcv->sb_mbmax) { 733 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 734 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 735 credits = rcv->sb_mbmax; 736 } 737 738 739 /* 740 * XXX this won't accurately reflect credit return - we need 741 * to look at the difference between the amount that has been 742 * put in the recv sockbuf and what is there now 743 */ 744 745 if (__predict_false(!credits)) 746 return; 747 748 dev = toep->tp_toedev; 749 thres = TOM_TUNABLE(dev, rx_credit_thres); 750 751 if (__predict_false(thres == 0)) 752 return; 753 754 if (is_delack_mode_valid(dev, toep)) { 755 dack_mode = TOM_TUNABLE(dev, delack); 756 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 757 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 758 759 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 760 dack = F_RX_DACK_CHANGE 
| 761 V_RX_DACK_MODE(dack_mode); 762 } 763 } else 764 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 765 766 /* 767 * For coalescing to work effectively ensure the receive window has 768 * at least 16KB left. 769 */ 770 must_send = credits + 16384 >= tp->rcv_wnd; 771 772 if (must_send || credits >= thres) 773 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 774} 775 776static int 777cxgb_toe_disconnect(struct tcpcb *tp) 778{ 779 struct socket *so; 780 781 DPRINTF("cxgb_toe_disconnect\n"); 782 783 so = inp_inpcbtosocket(tp->t_inpcb); 784 close_conn(so); 785 return (0); 786} 787 788static int 789cxgb_toe_reset(struct tcpcb *tp) 790{ 791 struct toepcb *toep = tp->t_toe; 792 793 t3_send_reset(toep); 794 795 /* 796 * unhook from socket 797 */ 798 tp->t_flags &= ~TF_TOE; 799 toep->tp_tp = NULL; 800 tp->t_toe = NULL; 801 return (0); 802} 803 804static int 805cxgb_toe_send(struct tcpcb *tp) 806{ 807 struct socket *so; 808 809 DPRINTF("cxgb_toe_send\n"); 810 dump_toepcb(tp->t_toe); 811 812 so = inp_inpcbtosocket(tp->t_inpcb); 813 t3_push_frames(so, 1); 814 return (0); 815} 816 817static int 818cxgb_toe_rcvd(struct tcpcb *tp) 819{ 820 821 inp_lock_assert(tp->t_inpcb); 822 823 t3_cleanup_rbuf(tp, 0); 824 825 return (0); 826} 827 828static void 829cxgb_toe_detach(struct tcpcb *tp) 830{ 831 struct toepcb *toep; 832 833 /* 834 * XXX how do we handle teardown in the SYN_SENT state? 
835 * 836 */ 837 inp_lock_assert(tp->t_inpcb); 838 toep = tp->t_toe; 839 toep->tp_tp = NULL; 840 841 /* 842 * unhook from socket 843 */ 844 tp->t_flags &= ~TF_TOE; 845 tp->t_toe = NULL; 846} 847 848 849static struct toe_usrreqs cxgb_toe_usrreqs = { 850 .tu_disconnect = cxgb_toe_disconnect, 851 .tu_reset = cxgb_toe_reset, 852 .tu_send = cxgb_toe_send, 853 .tu_rcvd = cxgb_toe_rcvd, 854 .tu_detach = cxgb_toe_detach, 855 .tu_detach = cxgb_toe_detach, 856 .tu_syncache_event = handle_syncache_event, 857}; 858 859 860static void 861__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 862 uint64_t mask, uint64_t val, int no_reply) 863{ 864 struct cpl_set_tcb_field *req; 865 866 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 867 toep->tp_tid, word, mask, val); 868 869 req = mtod(m, struct cpl_set_tcb_field *); 870 m->m_pkthdr.len = m->m_len = sizeof(*req); 871 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 872 req->wr.wr_lo = 0; 873 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 874 req->reply = V_NO_REPLY(no_reply); 875 req->cpu_idx = 0; 876 req->word = htons(word); 877 req->mask = htobe64(mask); 878 req->val = htobe64(val); 879 880 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 881 send_or_defer(toep, m, 0); 882} 883 884static void 885t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 886{ 887 struct mbuf *m; 888 struct tcpcb *tp = toep->tp_tp; 889 890 if (toep == NULL) 891 return; 892 893 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 894 printf("not seting field\n"); 895 return; 896 } 897 898 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 899 900 __set_tcb_field(toep, m, word, mask, val, 1); 901} 902 903/* 904 * Set one of the t_flags bits in the TCB. 
905 */ 906static void 907set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) 908{ 909 910 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 911} 912 913/* 914 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 915 */ 916static void 917t3_set_nagle(struct toepcb *toep) 918{ 919 struct tcpcb *tp = toep->tp_tp; 920 921 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 922} 923 924/* 925 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 926 */ 927void 928t3_set_keepalive(struct toepcb *toep, int on_off) 929{ 930 931 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); 932} 933 934void 935t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) 936{ 937 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); 938} 939 940void 941t3_set_dack_mss(struct toepcb *toep, int on_off) 942{ 943 944 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); 945} 946 947/* 948 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 949 */ 950static void 951t3_set_tos(struct toepcb *toep) 952{ 953 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); 954 955 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 956 V_TCB_TOS(tos)); 957} 958 959 960/* 961 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 962 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 963 * set the PSH bit in the last segment, which would trigger delivery.] 964 * We work around the issue by setting a DDP buffer in a partial placed state, 965 * which guarantees that TP will schedule a timer. 
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Enable or disable DDP for a connection via its TCB RX_DDP_FLAGS word.
 * When disabling, also apply the partial-placed-buffer workaround described
 * above so TP still schedules the push timer.
 */
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {
		
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);
	
}

/*
 * Program the tag/color for one of the two DDP buffers.  buf_idx selects
 * the buffer (0 or 1) by offsetting the TCB word; presumably BUF1_TAG
 * immediately follows BUF0_TAG in the TCB layout — confirm in cxgb_tcb.h.
 */
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

/*
 * Program the offset and length of DDP buffer 0 or 1 in the TCB.
 *
 * NOTE(review): the BUF1 branch shifts inside the mask macro
 * (M_TCB_RX_DDP_BUF1_LEN << 32) while BUF0 does not, and the BUF1 value
 * shifts len by 32 outside the macro — this asymmetry looks deliberate
 * (BUF1_LEN straddling a 32-bit word boundary) but should be verified
 * against the TCB layout before touching it.
 */
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
	       unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
				 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
				 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
				 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
				 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
				 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
				 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
				 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
				 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Select a congestion-control algorithm by name.  The lookup is compiled
 * out (CONGESTION_CONTROL_SUPPORTED undefined), so this currently accepts
 * any name and reports success.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
1029 return -EINVAL; 1030#endif 1031 return 0; 1032} 1033 1034int 1035t3_get_tcb(struct toepcb *toep) 1036{ 1037 struct cpl_get_tcb *req; 1038 struct tcpcb *tp = toep->tp_tp; 1039 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); 1040 1041 if (!m) 1042 return (ENOMEM); 1043 1044 inp_lock_assert(tp->t_inpcb); 1045 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 1046 req = mtod(m, struct cpl_get_tcb *); 1047 m->m_pkthdr.len = m->m_len = sizeof(*req); 1048 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 1049 req->wr.wr_lo = 0; 1050 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); 1051 req->cpuno = htons(toep->tp_qset); 1052 req->rsvd = 0; 1053 if (tp->t_state == TCPS_SYN_SENT) 1054 mbufq_tail(&toep->out_of_order_queue, m); // defer 1055 else 1056 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 1057 return 0; 1058} 1059 1060static inline void 1061so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid) 1062{ 1063 1064 toepcb_hold(toep); 1065 1066 cxgb_insert_tid(d->cdev, d->client, toep, tid); 1067} 1068 1069/** 1070 * find_best_mtu - find the entry in the MTU table closest to an MTU 1071 * @d: TOM state 1072 * @mtu: the target MTU 1073 * 1074 * Returns the index of the value in the MTU table that is closest to but 1075 * does not exceed the target MTU. 
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	/* d->mtus[] is sorted ascending; stop before the first entry > mtu. */
	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

/*
 * Select an MSS table index given the path MTU.  When a tcpcb is supplied,
 * t_maxseg is clamped to the chosen table entry; the constant 40 accounts
 * for the IP + TCP headers.
 */
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;	/* 40 = IP + TCP header bytes */
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

/*
 * Free an active-open TID and drop the toepcb reference the ATID table
 * entry was holding (cxgb_free_atid returns the associated toepcb, if any).
 */
static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Abandon any work requests that were never acknowledged. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* Detach the toepcb from the tcpcb before touching socket state. */
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		/* Wake readers; drops the sockbuf lock. */
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		/* Connection never completed: only an ATID was allocated. */
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		/* Drop the reference held by the TID table (so_insert_tid). */
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Hook the TOE socket operations into the socket and mark the tcpcb as
 * offloaded.  The toepcb must already be attached (asserted).
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	/* Per RFC 1323 the shift is capped at 14. */
	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;



	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link tcpcb and toepcb. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/* DDP only when tunable enabled, not opted out, and window is large enough. */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
/* High 32 bits of option 0: Nagle, keepalive, TCAM bypass, wscale, MSS index. */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

/* Low 32 bits of option 0: TOS, ULP mode, receive buffer size in KB units. */
static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

/* Option 2: congestion-control flavor, taken from the cong_alg tunable. */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/* Debug helper: sum the WR credits recorded on queued work requests. */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Build a CPL_ACT_OPEN_REQ in @m for an active open of the connection
 * behind @so, using ATID @atid and L2 table entry @e.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down offload state for a failed active open and drop the connection
 * with @errno.  Called with the inpcb lock held; tcp_offload_drop() path
 * releases it (see comment at the call site in active_open_failed()).
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	/* Queue the hardware TID for release if one was actually allocated. */
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.  XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
/*
 * NOTE(review): dead code — `so` is used uninitialized and fail_act_open()
 * takes a toepcb, not a socket; fix before enabling.
 */
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
1531 */ 1532int 1533t3_connect(struct toedev *tdev, struct socket *so, 1534 struct rtentry *rt, struct sockaddr *nam) 1535{ 1536 struct mbuf *m; 1537 struct l2t_entry *e; 1538 struct tom_data *d = TOM_DATA(tdev); 1539 struct inpcb *inp = so_sotoinpcb(so); 1540 struct tcpcb *tp = intotcpcb(inp); 1541 struct toepcb *toep; /* allocated by init_offload_socket */ 1542 1543 int atid; 1544 1545 toep = toepcb_alloc(); 1546 if (toep == NULL) 1547 goto out_err; 1548 1549 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1550 goto out_err; 1551 1552 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1553 if (!e) 1554 goto free_tid; 1555 1556 inp_lock_assert(inp); 1557 m = m_gethdr(MT_DATA, M_WAITOK); 1558 1559#if 0 1560 m->m_toe.mt_toepcb = tp->t_toe; 1561 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1562#endif 1563 so_lock(so); 1564 1565 init_offload_socket(so, tdev, atid, e, rt, toep); 1566 1567 install_offload_ops(so); 1568 1569 mk_act_open_req(so, m, atid, e); 1570 so_unlock(so); 1571 1572 soisconnecting(so); 1573 toep = tp->t_toe; 1574 m_set_toep(m, tp->t_toe); 1575 1576 toep->tp_state = TCPS_SYN_SENT; 1577 l2t_send(d->cdev, (struct mbuf *)m, e); 1578 1579 if (toep->tp_ulp_mode) 1580 t3_enable_ddp(toep, 0); 1581 return (0); 1582 1583free_tid: 1584 printf("failing connect - free atid\n"); 1585 1586 free_atid(d->cdev, atid); 1587out_err: 1588 printf("return ENOMEM\n"); 1589 return (ENOMEM); 1590} 1591 1592/* 1593 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1594 * not send multiple ABORT_REQs for the same connection and also that we do 1595 * not try to send a message after the connection has closed. Returns 1 if 1596 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1597 */ 1598static void 1599t3_send_reset(struct toepcb *toep) 1600{ 1601 1602 struct cpl_abort_req *req; 1603 unsigned int tid = toep->tp_tid; 1604 int mode = CPL_ABORT_SEND_RST; 1605 struct tcpcb *tp = toep->tp_tp; 1606 struct toedev *tdev = toep->tp_toedev; 1607 struct socket *so = NULL; 1608 struct mbuf *m; 1609 struct sockbuf *snd; 1610 1611 if (tp) { 1612 inp_lock_assert(tp->t_inpcb); 1613 so = inp_inpcbtosocket(tp->t_inpcb); 1614 } 1615 1616 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1617 tdev == NULL)) 1618 return; 1619 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1620 1621 snd = so_sockbuf_snd(so); 1622 /* Purge the send queue so we don't send anything after an abort. */ 1623 if (so) 1624 sbflush(snd); 1625 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1626 mode |= CPL_ABORT_POST_CLOSE_REQ; 1627 1628 m = m_gethdr_nofail(sizeof(*req)); 1629 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1630 set_arp_failure_handler(m, abort_arp_failure); 1631 1632 req = mtod(m, struct cpl_abort_req *); 1633 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1634 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1635 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1636 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1637 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1638 req->cmd = mode; 1639 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1640 mbufq_tail(&toep->out_of_order_queue, m); // defer 1641 else 1642 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1643} 1644 1645static int 1646t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1647{ 1648 struct inpcb *inp; 1649 int error, optval; 1650 1651 if (sopt->sopt_name == IP_OPTIONS) 1652 return (ENOPROTOOPT); 1653 1654 if (sopt->sopt_name != IP_TOS) 1655 return (EOPNOTSUPP); 1656 1657 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1658 1659 if (error) 1660 return (error); 1661 1662 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) 1663 return (EPERM); 1664 1665 inp = so_sotoinpcb(so); 1666 inp_wlock(inp); 1667 inp_ip_tos_set(inp, optval); 1668#if 0 1669 inp->inp_ip_tos = optval; 1670#endif 1671 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1672 inp_wunlock(inp); 1673 1674 return (0); 1675} 1676 1677static int 1678t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1679{ 1680 int err = 0; 1681 size_t copied; 1682 1683 if (sopt->sopt_name != TCP_CONGESTION && 1684 sopt->sopt_name != TCP_NODELAY) 1685 return (EOPNOTSUPP); 1686 1687 if (sopt->sopt_name == TCP_CONGESTION) { 1688 char name[TCP_CA_NAME_MAX]; 1689 int optlen = sopt->sopt_valsize; 1690 struct tcpcb *tp; 1691 1692 if (sopt->sopt_dir == SOPT_GET) { 1693 KASSERT(0, ("unimplemented")); 1694 return (EOPNOTSUPP); 1695 } 1696 1697 if (optlen < 1) 1698 return (EINVAL); 1699 1700 err = copyinstr(sopt->sopt_val, name, 1701 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1702 if (err) 1703 return (err); 1704 if (copied < 1) 1705 return (EINVAL); 1706 1707 tp = so_sototcpcb(so); 1708 /* 1709 * XXX I need to revisit this 1710 */ 1711 if ((err = t3_set_cong_control(so, name)) == 0) { 1712#ifdef CONGESTION_CONTROL_SUPPORTED 1713 tp->t_cong_control = strdup(name, M_CXGB); 1714#endif 1715 } else 1716 return (err); 1717 } else { 1718 int 
optval, oldval; 1719 struct inpcb *inp; 1720 struct tcpcb *tp; 1721 1722 if (sopt->sopt_dir == SOPT_GET) 1723 return (EOPNOTSUPP); 1724 1725 err = sooptcopyin(sopt, &optval, sizeof optval, 1726 sizeof optval); 1727 1728 if (err) 1729 return (err); 1730 1731 inp = so_sotoinpcb(so); 1732 tp = inp_inpcbtotcpcb(inp); 1733 1734 inp_wlock(inp); 1735 1736 oldval = tp->t_flags; 1737 if (optval) 1738 tp->t_flags |= TF_NODELAY; 1739 else 1740 tp->t_flags &= ~TF_NODELAY; 1741 inp_wunlock(inp); 1742 1743 1744 if (oldval != tp->t_flags && (tp->t_toe != NULL)) 1745 t3_set_nagle(tp->t_toe); 1746 1747 } 1748 1749 return (0); 1750} 1751 1752int 1753t3_ctloutput(struct socket *so, struct sockopt *sopt) 1754{ 1755 int err; 1756 1757 if (sopt->sopt_level != IPPROTO_TCP) 1758 err = t3_ip_ctloutput(so, sopt); 1759 else 1760 err = t3_tcp_ctloutput(so, sopt); 1761 1762 if (err != EOPNOTSUPP) 1763 return (err); 1764 1765 return (tcp_ctloutput(so, sopt)); 1766} 1767 1768/* 1769 * Returns true if we need to explicitly request RST when we receive new data 1770 * on an RX-closed connection. 1771 */ 1772static inline int 1773need_rst_on_excess_rx(const struct toepcb *toep) 1774{ 1775 return (1); 1776} 1777 1778/* 1779 * Handles Rx data that arrives in a state where the socket isn't accepting 1780 * new data. 1781 */ 1782static void 1783handle_excess_rx(struct toepcb *toep, struct mbuf *m) 1784{ 1785 1786 if (need_rst_on_excess_rx(toep) && 1787 !(toep->tp_flags & TP_ABORT_SHUTDOWN)) 1788 t3_send_reset(toep); 1789 m_freem(m); 1790} 1791 1792/* 1793 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) 1794 * by getting the DDP offset from the TCB. 
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;	/* NOTE(review): shadowed by the inner `state` below */

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is a possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		/* so_sorwakeup_locked() drops the sockbuf lock. */
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	/* The TCB image follows the CPL header; extract the DDP offset for
	 * whichever hardware buffer is current.  TCB words are big-endian. */
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	/* Bytes DMA'd since the last snapshot = new offset - old offset. */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		/* User-buffer DDP cancelled: mark completion and flip buffers. */
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Hand the DDP'd bytes to the socket as an M_DDP mbuf. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	/* so_sorwakeup_locked() drops the sockbuf lock. */
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

/*
 * Account for data the hardware placed directly (DDP) ahead of this
 * CPL_RX_DATA message: hdr->seq tells us how far the hardware has already
 * advanced the stream.  Updates tp->rcv_nxt and the DDP buffer state and
 * appends an M_DDP mbuf describing the placed bytes.  No-op when the
 * sequence numbers already match (no DDP'd data outstanding).
 */
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);
	struct sockbuf *rcv;

	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_lock_assert(tp->t_inpcb);
	so  = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
		rcv_nxt, tp->rcv_nxt));
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	/* Bit 0 set = completed buffer; see new_rx_data_ddp() for the convention. */
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	sockbuf_unlock(rcv);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct sockbuf *rcv;
	int state;
	int len = be16toh(hdr->len);

	inp_wlock(tp->t_inpcb);

	so  = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		TRACE_EXIT;
		return;
	}

	/* Fold in any bytes the hardware already placed via DDP. */
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
		handle_ddp_data(toep, m);

	m->m_seq = ntohl(hdr->seq);
	m->m_ulp_mode = 0;                    /* for iSCSI */

#if VALIDATE_SEQ
	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
		log(LOG_ERR,
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
		       tp->rcv_nxt);
		m_freem(m);
		inp_wunlock(tp->t_inpcb);
		return;
	}
#endif
	/* Strip the CPL header; what remains is TCP payload. */
	m_adj(m, sizeof(*hdr));

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	/* Track delayed-ACK mode changes announced by the adapter. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);

	/* Trust the CPL's length field over the mbuf's. */
	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
2108 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2109 CTR2(KTR_TOM, 2110 "new_rx_data: seq 0x%x len %u", 2111 m->m_seq, m->m_pkthdr.len); 2112 inp_wunlock(tp->t_inpcb); 2113 rcv = so_sockbuf_rcv(so); 2114 sockbuf_lock(rcv); 2115#if 0 2116 if (sb_notify(rcv)) 2117 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); 2118#endif 2119 SBAPPEND(rcv, m); 2120 2121#ifdef notyet 2122 /* 2123 * We're giving too many credits to the card - but disable this check so we can keep on moving :-| 2124 * 2125 */ 2126 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), 2127 2128 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", 2129 so, rcv->sb_cc, rcv->sb_mbmax)); 2130#endif 2131 2132 2133 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", 2134 rcv->sb_cc, rcv->sb_mbcnt); 2135 2136 state = so_state_get(so); 2137 if (__predict_true((state & SS_NOFDREF) == 0)) 2138 so_sorwakeup_locked(so); 2139 else 2140 sockbuf_unlock(rcv); 2141} 2142 2143/* 2144 * Handler for RX_DATA CPL messages. 2145 */ 2146static int 2147do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2148{ 2149 struct toepcb *toep = (struct toepcb *)ctx; 2150 2151 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); 2152 2153 new_rx_data(toep, m); 2154 2155 return (0); 2156} 2157 2158static void 2159new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) 2160{ 2161 struct tcpcb *tp; 2162 struct ddp_state *q; 2163 struct ddp_buf_state *bsp; 2164 struct cpl_rx_data_ddp *hdr; 2165 struct socket *so; 2166 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; 2167 int nomoredata = 0; 2168 unsigned int delack_mode; 2169 struct sockbuf *rcv; 2170 2171 tp = toep->tp_tp; 2172 inp_wlock(tp->t_inpcb); 2173 so = inp_inpcbtosocket(tp->t_inpcb); 2174 2175 if (__predict_false(so_no_receive(so))) { 2176 2177 handle_excess_rx(toep, m); 2178 inp_wunlock(tp->t_inpcb); 2179 return; 2180 } 2181 2182 q = &toep->tp_ddp_state; 2183 hdr = cplhdr(m); 2184 ddp_report = ntohl(hdr->u.ddp_report); 2185 buf_idx = (ddp_report >> 
S_DDP_BUF_IDX) & 1; 2186 bsp = &q->buf_state[buf_idx]; 2187 2188 CTR4(KTR_TOM, 2189 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " 2190 "hdr seq 0x%x len %u", 2191 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), 2192 ntohs(hdr->len)); 2193 CTR3(KTR_TOM, 2194 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", 2195 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); 2196 2197 ddp_len = ntohs(hdr->len); 2198 rcv_nxt = ntohl(hdr->seq) + ddp_len; 2199 2200 delack_mode = G_DDP_DACK_MODE(ddp_report); 2201 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { 2202 toep->tp_delack_mode = delack_mode; 2203 toep->tp_delack_seq = tp->rcv_nxt; 2204 } 2205 2206 m->m_seq = tp->rcv_nxt; 2207 tp->rcv_nxt = rcv_nxt; 2208 2209 tp->t_rcvtime = ticks; 2210 /* 2211 * Store the length in m->m_len. We are changing the meaning of 2212 * m->m_len here, we need to be very careful that nothing from now on 2213 * interprets ->len of this packet the usual way. 2214 */ 2215 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; 2216 inp_wunlock(tp->t_inpcb); 2217 CTR3(KTR_TOM, 2218 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", 2219 m->m_len, rcv_nxt, m->m_seq); 2220 /* 2221 * Figure out where the new data was placed in the buffer and store it 2222 * in when. Assumes the buffer offset starts at 0, consumer needs to 2223 * account for page pod's pg_offset. 
2224 */ 2225 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; 2226 m->m_cur_offset = end_offset - m->m_pkthdr.len; 2227 2228 rcv = so_sockbuf_rcv(so); 2229 sockbuf_lock(rcv); 2230 2231 m->m_ddp_gl = (unsigned char *)bsp->gl; 2232 m->m_flags |= M_DDP; 2233 bsp->cur_offset = end_offset; 2234 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2235 2236 /* 2237 * Length is only meaningful for kbuf 2238 */ 2239 if (!(bsp->flags & DDP_BF_NOCOPY)) 2240 KASSERT(m->m_len <= bsp->gl->dgl_length, 2241 ("length received exceeds ddp pages: len=%d dgl_length=%d", 2242 m->m_len, bsp->gl->dgl_length)); 2243 2244 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2245 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); 2246 /* 2247 * Bit 0 of flags stores whether the DDP buffer is completed. 2248 * Note that other parts of the code depend on this being in bit 0. 2249 */ 2250 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { 2251 panic("spurious ddp completion"); 2252 } else { 2253 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); 2254 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) 2255 q->cur_buf ^= 1; /* flip buffers */ 2256 } 2257 2258 if (bsp->flags & DDP_BF_NOCOPY) { 2259 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); 2260 bsp->flags &= ~DDP_BF_NOCOPY; 2261 } 2262 2263 if (ddp_report & F_DDP_PSH) 2264 m->m_ddp_flags |= DDP_BF_PSH; 2265 if (nomoredata) 2266 m->m_ddp_flags |= DDP_BF_NODATA; 2267 2268#ifdef notyet 2269 skb_reset_transport_header(skb); 2270 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ 2271#endif 2272 SBAPPEND(rcv, m); 2273 2274 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || 2275 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) 2276 || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) 2277 so_sorwakeup_locked(so); 2278 else 2279 sockbuf_unlock(rcv); 2280} 2281 2282#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ 2283 F_DDP_PPOD_PARITY_ERR | 
 F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
    F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
    F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
    struct toepcb *toep = ctx;
    const struct cpl_rx_data_ddp *hdr = cplhdr(m);

    VALIDATE_SOCK(so);

    /*
     * Any of the DDP_ERR bits set in the validity/status word means the
     * placement failed; log it and tell the dispatcher to free the buffer.
     */
    if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
        log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
            GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
        return (CPL_RET_BUF_DONE);
    }
#if 0
    skb->h.th = tcphdr_skb->h.th;
#endif
    new_rx_data_ddp(toep, m);
    return (0);
}

/*
 * Process an RX_DDP_COMPLETE CPL: account for data HW placed directly into
 * the current DDP buffer, advance rcv_nxt by the placed length, and append
 * the DDP-tagged mbuf to the socket's receive buffer.
 *
 * Locking: takes the inpcb write lock, then the receive sockbuf lock; the
 * sockbuf lock is released by so_sorwakeup_locked()/sockbuf_unlock() at the
 * end of the function (on the following source lines).
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
    struct tcpcb *tp = toep->tp_tp;
    struct socket *so;
    struct ddp_state *q;
    struct ddp_buf_state *bsp;
    struct cpl_rx_ddp_complete *hdr;
    unsigned int ddp_report, buf_idx, when, delack_mode;
    int nomoredata = 0;
    struct sockbuf *rcv;

    inp_wlock(tp->t_inpcb);
    so = inp_inpcbtosocket(tp->t_inpcb);

    /* Socket can no longer receive: discard via the excess-rx path. */
    if (__predict_false(so_no_receive(so))) {
        struct inpcb *inp = so_sotoinpcb(so);

        handle_excess_rx(toep, m);
        inp_wunlock(inp);
        return;
    }
    q = &toep->tp_ddp_state;
    hdr = cplhdr(m);
    ddp_report = ntohl(hdr->ddp_report);
    buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
    m->m_pkthdr.csum_data = tp->rcv_nxt;

    rcv = so_sockbuf_rcv(so);
    sockbuf_lock(rcv);

    bsp = &q->buf_state[buf_idx];
    when = bsp->cur_offset;
    /* Placed length = offset HW reached minus where this buffer left off. */
    m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
    tp->rcv_nxt += m->m_len;
    tp->t_rcvtime = ticks;

    delack_mode = G_DDP_DACK_MODE(ddp_report);
    if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
        toep->tp_delack_mode = delack_mode;
        toep->tp_delack_seq =
tp->rcv_nxt;
    }
#ifdef notyet
    skb_reset_transport_header(skb);
    tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
    inp_wunlock(tp->t_inpcb);

    KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
    CTR5(KTR_TOM,
        "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
        "ddp_report 0x%x offset %u, len %u",
        tp->rcv_nxt, bsp->cur_offset, ddp_report,
        G_DDP_OFFSET(ddp_report), m->m_len);

    m->m_cur_offset = bsp->cur_offset;
    bsp->cur_offset += m->m_len;

    if (!(bsp->flags & DDP_BF_NOFLIP)) {
        q->cur_buf ^= 1;                     /* flip buffers */
        /* Buffer completed short of its full length: no more data coming. */
        if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
            nomoredata=1;
    }

    CTR4(KTR_TOM,
        "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
        "ddp_report %u offset %u",
        tp->rcv_nxt, bsp->cur_offset, ddp_report,
        G_DDP_OFFSET(ddp_report));

    /* Tag the mbuf as DDP so the receive path knows data is in the gl. */
    m->m_ddp_gl = (unsigned char *)bsp->gl;
    m->m_flags |= M_DDP;
    /* Bit 0 = "DDP buffer completed"; other code depends on it being bit 0. */
    m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
    if (bsp->flags & DDP_BF_NOCOPY)
        bsp->flags &= ~DDP_BF_NOCOPY;
    if (nomoredata)
        m->m_ddp_flags |= DDP_BF_NODATA;

    SBAPPEND(rcv, m);
    if ((so_state_get(so) & SS_NOFDREF) == 0)
        so_sorwakeup_locked(so);             /* drops the sockbuf lock */
    else
        sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
    struct toepcb *toep = ctx;

    VALIDATE_SOCK(so);
#if 0
    skb->h.th = tcphdr_skb->h.th;
#endif
    process_ddp_complete(toep, m);
    return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
    /*
     * Bump rcv_nxt for the peer FIN.
 We don't do this at the time we
     * process peer_close because we don't want to carry the peer FIN in
     * the socket's receive queue and if we increment rcv_nxt without
     * having the FIN in the receive queue we'll confuse facilities such
     * as SIOCINQ.
     */
    inp_wlock(tp->t_inpcb);
    tp->rcv_nxt++;

    tp->ts_recent_age = 0;          /* defeat recycling */
    tp->t_srtt = 0;                 /* defeat tcp_update_metrics */
    inp_wunlock(tp->t_inpcb);
    tcp_offload_twstart(tp);
}

/*
 * Same as enter_timewait() but finishes with tcp_offload_twstart_disconnect()
 * so the local side is also disconnected (used from the CLOSE_CON_RPL path).
 */
static void
enter_timewait_disconnect(struct tcpcb *tp)
{
    /*
     * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
     * process peer_close because we don't want to carry the peer FIN in
     * the socket's receive queue and if we increment rcv_nxt without
     * having the FIN in the receive queue we'll confuse facilities such
     * as SIOCINQ.
     */
    inp_wlock(tp->t_inpcb);
    tp->rcv_nxt++;

    tp->ts_recent_age = 0;          /* defeat recycling */
    tp->t_srtt = 0;                 /* defeat tcp_update_metrics */
    inp_wunlock(tp->t_inpcb);
    tcp_offload_twstart_disconnect(tp);
}

/*
 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
 * function deals with the data that may be reported along with the FIN.
 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
 * skb can be freed.
2460 */ 2461static int 2462handle_peer_close_data(struct socket *so, struct mbuf *m) 2463{ 2464 struct tcpcb *tp = so_sototcpcb(so); 2465 struct toepcb *toep = tp->t_toe; 2466 struct ddp_state *q; 2467 struct ddp_buf_state *bsp; 2468 struct cpl_peer_close *req = cplhdr(m); 2469 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ 2470 struct sockbuf *rcv; 2471 2472 if (tp->rcv_nxt == rcv_nxt) /* no data */ 2473 return (0); 2474 2475 CTR0(KTR_TOM, "handle_peer_close_data"); 2476 if (__predict_false(so_no_receive(so))) { 2477 handle_excess_rx(toep, m); 2478 2479 /* 2480 * Although we discard the data we want to process the FIN so 2481 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + 2482 * PEER_CLOSE without data. In particular this PEER_CLOSE 2483 * may be what will close the connection. We return 1 because 2484 * handle_excess_rx() already freed the packet. 2485 */ 2486 return (1); 2487 } 2488 2489 inp_lock_assert(tp->t_inpcb); 2490 q = &toep->tp_ddp_state; 2491 rcv = so_sockbuf_rcv(so); 2492 sockbuf_lock(rcv); 2493 2494 bsp = &q->buf_state[q->cur_buf]; 2495 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; 2496 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2497 m->m_ddp_gl = (unsigned char *)bsp->gl; 2498 m->m_flags |= M_DDP; 2499 m->m_cur_offset = bsp->cur_offset; 2500 m->m_ddp_flags = 2501 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; 2502 m->m_seq = tp->rcv_nxt; 2503 tp->rcv_nxt = rcv_nxt; 2504 bsp->cur_offset += m->m_pkthdr.len; 2505 if (!(bsp->flags & DDP_BF_NOFLIP)) 2506 q->cur_buf ^= 1; 2507#ifdef notyet 2508 skb_reset_transport_header(skb); 2509 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ 2510#endif 2511 tp->t_rcvtime = ticks; 2512 SBAPPEND(rcv, m); 2513 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 2514 so_sorwakeup_locked(so); 2515 else 2516 sockbuf_unlock(rcv); 2517 2518 return (1); 2519} 2520 2521/* 2522 * Handle a peer FIN. 
 */
static void
do_peer_fin(struct toepcb *toep, struct mbuf *m)
{
    struct socket *so;
    struct tcpcb *tp = toep->tp_tp;
    int keep, action;

    action = keep = 0;
    CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
    if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
        printf("abort_pending set\n");

        goto out;
    }
    inp_wlock(tp->t_inpcb);
    so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
    if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
        /* DDP: the FIN may carry an implicit RX_DDP_COMPLETE with data. */
        keep = handle_peer_close_data(so, m);
        if (keep < 0) {
            inp_wunlock(tp->t_inpcb);
            return;
        }
    }
    if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
        CTR1(KTR_TOM,
            "waking up waiters for cantrcvmore on %p ", so);
        socantrcvmore(so);

        /*
         * If connection is half-synchronized
         * (ie NEEDSYN flag on) then delay ACK,
         * so it may be piggybacked when SYN is sent.
         * Otherwise, since we received a FIN then no
         * more input can be expected, send ACK now.
         */
        if (tp->t_flags & TF_NEEDSYN)
            tp->t_flags |= TF_DELACK;
        else
            tp->t_flags |= TF_ACKNOW;
        tp->rcv_nxt++;
    }

    switch (tp->t_state) {
    case TCPS_SYN_RECEIVED:
        tp->t_starttime = ticks;
        /* FALLTHROUGH */
    case TCPS_ESTABLISHED:
        tp->t_state = TCPS_CLOSE_WAIT;
        break;
    case TCPS_FIN_WAIT_1:
        tp->t_state = TCPS_CLOSING;
        break;
    case TCPS_FIN_WAIT_2:
        /*
         * If we've sent an abort_req we must have sent it too late,
         * HW will send us a reply telling us so, and this peer_close
         * is really the last message for this connection and needs to
         * be treated as an abort_rpl, i.e., transition the connection
         * to TCP_CLOSE (note that the host stack does this at the
         * time of generating the RST but we must wait for HW).
         * Otherwise we enter TIME_WAIT.
         */
        t3_release_offload_resources(toep);
        if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
            action = TCP_CLOSE;
        } else {
            action = TCP_TIMEWAIT;
        }
        break;
    default:
        log(LOG_ERR,
            "%s: TID %u received PEER_CLOSE in bad state %d\n",
            toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
    }
    inp_wunlock(tp->t_inpcb);

    /* State transitions are performed after dropping the inpcb lock. */
    if (action == TCP_TIMEWAIT) {
        enter_timewait(tp);
    } else if (action == TCP_DROP) {
        tcp_offload_drop(tp, 0);
    } else if (action == TCP_CLOSE) {
        tcp_offload_close(tp);
    }

#ifdef notyet
    /* Do not send POLL_HUP for half duplex close. */
    if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
        sk->sk_state == TCP_CLOSE)
        sk_wake_async(so, 1, POLL_HUP);
    else
        sk_wake_async(so, 1, POLL_IN);
#endif

out:
    /* keep != 0 means the mbuf was queued (or already freed) elsewhere. */
    if (!keep)
        m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
    struct toepcb *toep = (struct toepcb *)ctx;

    VALIDATE_SOCK(so);

    do_peer_fin(toep, m);
    return (0);
}

/*
 * Process a CLOSE_CON_RPL CPL: HW has acknowledged our FIN.  Record the new
 * snd_una (excluding the FIN) and advance the TCP state machine.
 */
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
    struct cpl_close_con_rpl *rpl = cplhdr(m);
    struct tcpcb *tp = toep->tp_tp;
    struct socket *so;
    int action = 0;
    struct sockbuf *rcv;

    inp_wlock(tp->t_inpcb);
    so = inp_inpcbtosocket(tp->t_inpcb);

    tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

    if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
        inp_wunlock(tp->t_inpcb);
        goto out;
    }

    CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
        tp->t_state, !!(so_state_get(so) & SS_NOFDREF));

    switch (tp->t_state) {
    case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
        t3_release_offload_resources(toep);
        if (toep->tp_flags &
 TP_ABORT_RPL_PENDING) {
            action = TCP_CLOSE;

        } else {
            action = TCP_TIMEWAIT;
        }
        break;
    case TCPS_LAST_ACK:
        /*
         * In this state we don't care about pending abort_rpl.
         * If we've sent abort_req it was post-close and was sent too
         * late, this close_con_rpl is the actual last message.
         */
        t3_release_offload_resources(toep);
        action = TCP_CLOSE;
        break;
    case TCPS_FIN_WAIT_1:
        /*
         * If we can't receive any more
         * data, then closing user can proceed.
         * Starting the timer is contrary to the
         * specification, but if we don't get a FIN
         * we'll hang forever.
         *
         * XXXjl:
         * we should release the tp also, and use a
         * compressed state.
         */
        if (so)
            rcv = so_sockbuf_rcv(so);
        else
            break;

        if (rcv->sb_state & SBS_CANTRCVMORE) {
            int timeout;

            if (so)
                soisdisconnected(so);
            timeout = (tcp_fast_finwait2_recycle) ?
                tcp_finwait2_timeout : tcp_maxidle;
            tcp_timer_activate(tp, TT_2MSL, timeout);
        }
        tp->t_state = TCPS_FIN_WAIT_2;
        /* SO_LINGER with zero timeout: drop instead of lingering. */
        if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
            (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
            action = TCP_DROP;
        }

        break;
    default:
        log(LOG_ERR,
            "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
            toep->tp_toedev->tod_name, toep->tp_tid,
            tp->t_state);
    }
    inp_wunlock(tp->t_inpcb);


    /* State transitions are performed after dropping the inpcb lock. */
    if (action == TCP_TIMEWAIT) {
        enter_timewait_disconnect(tp);
    } else if (action == TCP_DROP) {
        tcp_offload_drop(tp, 0);
    } else if (action == TCP_CLOSE) {
        tcp_offload_close(tp);
    }
out:
    m_freem(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
2732 */ 2733static int 2734do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, 2735 void *ctx) 2736{ 2737 struct toepcb *toep = (struct toepcb *)ctx; 2738 2739 process_close_con_rpl(toep, m); 2740 return (0); 2741} 2742 2743/* 2744 * Process abort replies. We only process these messages if we anticipate 2745 * them as the coordination between SW and HW in this area is somewhat lacking 2746 * and sometimes we get ABORT_RPLs after we are done with the connection that 2747 * originated the ABORT_REQ. 2748 */ 2749static void 2750process_abort_rpl(struct toepcb *toep, struct mbuf *m) 2751{ 2752 struct tcpcb *tp = toep->tp_tp; 2753 struct socket *so; 2754 int needclose = 0; 2755 2756#ifdef T3_TRACE 2757 T3_TRACE1(TIDTB(sk), 2758 "process_abort_rpl: GTS rpl pending %d", 2759 sock_flag(sk, ABORT_RPL_PENDING)); 2760#endif 2761 2762 inp_wlock(tp->t_inpcb); 2763 so = inp_inpcbtosocket(tp->t_inpcb); 2764 2765 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2766 /* 2767 * XXX panic on tcpdrop 2768 */ 2769 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) 2770 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2771 else { 2772 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2773 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2774 !is_t3a(toep->tp_toedev)) { 2775 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2776 panic("TP_ABORT_REQ_RCVD set"); 2777 t3_release_offload_resources(toep); 2778 needclose = 1; 2779 } 2780 } 2781 } 2782 inp_wunlock(tp->t_inpcb); 2783 2784 if (needclose) 2785 tcp_offload_close(tp); 2786 2787 m_free(m); 2788} 2789 2790/* 2791 * Handle an ABORT_RPL_RSS CPL message. 2792 */ 2793static int 2794do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2795{ 2796 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2797 struct toepcb *toep; 2798 2799 /* 2800 * Ignore replies to post-close aborts indicating that the abort was 2801 * requested too late. 
 These connections are terminated when we get
     * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
     * arrives the TID is either no longer used or it has been recycled.
     */
    if (rpl->status == CPL_ERR_ABORT_FAILED) {
discard:
        m_free(m);
        return (0);
    }

    toep = (struct toepcb *)ctx;

    /*
     * Sometimes we've already closed the socket, e.g., a post-close
     * abort races with ABORT_REQ_RSS, the latter frees the socket
     * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
     * but FW turns the ABORT_REQ into a regular one and so we get
     * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
     */
    if (!toep)
        goto discard;

    if (toep->tp_tp == NULL) {
        /* toepcb already detached from its tcpcb: just drop the TID. */
        log(LOG_NOTICE, "removing tid for abort\n");
        cxgb_remove_tid(cdev, toep, toep->tp_tid);
        if (toep->tp_l2t)
            l2t_release(L2DATA(cdev), toep->tp_l2t);

        toepcb_release(toep);
        goto discard;
    }

    log(LOG_NOTICE, "toep=%p\n", toep);
    log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);

    /* Hold the toepcb across processing so it cannot go away under us. */
    toepcb_hold(toep);
    process_abort_rpl(toep, m);
    toepcb_release(toep);
    return (0);
}

/*
 * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
 * indicate whether RST should be sent in response.
 *
 * NOTE(review): need_rst is never written by this function as visible here;
 * callers pass &rst_status but it retains its initial value — confirm intent.
 */
static int
abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
{
    struct tcpcb *tp = so_sototcpcb(so);

    switch (abort_reason) {
    case CPL_ERR_BAD_SYN:
#if 0
        NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);  // fall through
#endif
    case CPL_ERR_CONN_RESET:
        // XXX need to handle SYN_RECV due to crossed SYNs
        return (tp->t_state == TCPS_CLOSE_WAIT ?
EPIPE : ECONNRESET); 2859 case CPL_ERR_XMIT_TIMEDOUT: 2860 case CPL_ERR_PERSIST_TIMEDOUT: 2861 case CPL_ERR_FINWAIT2_TIMEDOUT: 2862 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2863#if 0 2864 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2865#endif 2866 return (ETIMEDOUT); 2867 default: 2868 return (EIO); 2869 } 2870} 2871 2872static inline void 2873set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2874{ 2875 struct cpl_abort_rpl *rpl = cplhdr(m); 2876 2877 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2878 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2879 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2880 2881 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2882 rpl->cmd = cmd; 2883} 2884 2885static void 2886send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2887{ 2888 struct mbuf *reply_mbuf; 2889 struct cpl_abort_req_rss *req = cplhdr(m); 2890 2891 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2892 m_set_priority(m, CPL_PRIORITY_DATA); 2893 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2894 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2895 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2896 m_free(m); 2897} 2898 2899/* 2900 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2901 */ 2902static inline int 2903is_neg_adv_abort(unsigned int status) 2904{ 2905 return status == CPL_ERR_RTX_NEG_ADVICE || 2906 status == CPL_ERR_PERSIST_NEG_ADVICE; 2907} 2908 2909static void 2910send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2911{ 2912 struct mbuf *reply_mbuf; 2913 struct cpl_abort_req_rss *req = cplhdr(m); 2914 2915 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2916 2917 if (!reply_mbuf) { 2918 /* Defer the reply. Stick rst_status into req->cmd. 
 */
        req->status = rst_status;
        t3_defer_reply(m, tdev, send_deferred_abort_rpl);
        return;
    }

    m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
    set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
    m_free(m);

    /*
     * XXX need to sync with ARP as for SYN_RECV connections we can send
     * these messages while ARP is pending.  For other connection states
     * it's not a problem.
     */
    cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
}

#ifdef notyet
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
    CXGB_UNIMPLEMENTED();
#ifdef notyet
    struct request_sock *req = child->sk_user_data;

    inet_csk_reqsk_queue_removed(parent, req);
    synq_remove(tcp_sk(child));
    __reqsk_free(req);
    child->sk_user_data = NULL;
#endif
}


/*
 * Performs the actual work to abort a SYN_RECV connection.
 */
static void
do_abort_syn_rcv(struct socket *child, struct socket *parent)
{
    struct tcpcb *parenttp = so_sototcpcb(parent);
    struct tcpcb *childtp = so_sototcpcb(child);

    /*
     * If the server is still open we clean up the child connection,
     * otherwise the server already did the clean up as it was purging
     * its SYN queue and the skb was just sitting in its backlog.
     */
    if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
        cleanup_syn_rcv_conn(child, parent);
        inp_wlock(childtp->t_inpcb);
        t3_release_offload_resources(childtp->t_toe);
        inp_wunlock(childtp->t_inpcb);
        tcp_offload_close(childtp);
    }
}
#endif

/*
 * Handle abort requests for a SYN_RECV connection.  These need extra work
 * because the socket is on its parent's SYN queue.
2979 */ 2980static int 2981abort_syn_rcv(struct socket *so, struct mbuf *m) 2982{ 2983 CXGB_UNIMPLEMENTED(); 2984#ifdef notyet 2985 struct socket *parent; 2986 struct toedev *tdev = toep->tp_toedev; 2987 struct t3cdev *cdev = TOM_DATA(tdev)->cdev; 2988 struct socket *oreq = so->so_incomp; 2989 struct t3c_tid_entry *t3c_stid; 2990 struct tid_info *t; 2991 2992 if (!oreq) 2993 return -1; /* somehow we are not on the SYN queue */ 2994 2995 t = &(T3C_DATA(cdev))->tid_maps; 2996 t3c_stid = lookup_stid(t, oreq->ts_recent); 2997 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 2998 2999 so_lock(parent); 3000 do_abort_syn_rcv(so, parent); 3001 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); 3002 so_unlock(parent); 3003#endif 3004 return (0); 3005} 3006 3007/* 3008 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this 3009 * request except that we need to reply to it. 3010 */ 3011static void 3012process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) 3013{ 3014 int rst_status = CPL_ABORT_NO_RST; 3015 const struct cpl_abort_req_rss *req = cplhdr(m); 3016 struct tcpcb *tp = toep->tp_tp; 3017 struct socket *so; 3018 int needclose = 0; 3019 3020 inp_wlock(tp->t_inpcb); 3021 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 3022 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { 3023 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); 3024 m_free(m); 3025 goto skip; 3026 } 3027 3028 toep->tp_flags &= ~TP_ABORT_REQ_RCVD; 3029 /* 3030 * Three cases to consider: 3031 * a) We haven't sent an abort_req; close the connection. 3032 * b) We have sent a post-close abort_req that will get to TP too late 3033 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will 3034 * be ignored and the connection should be closed now. 3035 * c) We have sent a regular abort_req that will get to TP too late. 3036 * That will generate an abort_rpl with status 0, wait for it. 
3037 */ 3038 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || 3039 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { 3040 int error; 3041 3042 error = abort_status_to_errno(so, req->status, 3043 &rst_status); 3044 so_error_set(so, error); 3045 3046 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 3047 so_sorwakeup(so); 3048 /* 3049 * SYN_RECV needs special processing. If abort_syn_rcv() 3050 * returns 0 is has taken care of the abort. 3051 */ 3052 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) 3053 goto skip; 3054 3055 t3_release_offload_resources(toep); 3056 needclose = 1; 3057 } 3058 inp_wunlock(tp->t_inpcb); 3059 3060 if (needclose) 3061 tcp_offload_close(tp); 3062 3063 send_abort_rpl(m, tdev, rst_status); 3064 return; 3065skip: 3066 inp_wunlock(tp->t_inpcb); 3067} 3068 3069/* 3070 * Handle an ABORT_REQ_RSS CPL message. 3071 */ 3072static int 3073do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3074{ 3075 const struct cpl_abort_req_rss *req = cplhdr(m); 3076 struct toepcb *toep = (struct toepcb *)ctx; 3077 3078 if (is_neg_adv_abort(req->status)) { 3079 m_free(m); 3080 return (0); 3081 } 3082 3083 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); 3084 3085 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { 3086 cxgb_remove_tid(cdev, toep, toep->tp_tid); 3087 toep->tp_flags |= TP_ABORT_REQ_RCVD; 3088 3089 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); 3090 if (toep->tp_l2t) 3091 l2t_release(L2DATA(cdev), toep->tp_l2t); 3092 3093 /* 3094 * Unhook 3095 */ 3096 toep->tp_tp->t_toe = NULL; 3097 toep->tp_tp->t_flags &= ~TF_TOE; 3098 toep->tp_tp = NULL; 3099 /* 3100 * XXX need to call syncache_chkrst - but we don't 3101 * have a way of doing that yet 3102 */ 3103 toepcb_release(toep); 3104 log(LOG_ERR, "abort for unestablished connection :-(\n"); 3105 return (0); 3106 } 3107 if (toep->tp_tp == NULL) { 3108 log(LOG_NOTICE, "disconnected toepcb\n"); 3109 /* should be freed 
 momentarily */
        return (0);
    }


    /* Hold the toepcb across processing so it cannot go away under us. */
    toepcb_hold(toep);
    process_abort_req(toep, m, toep->tp_toedev);
    toepcb_release(toep);
    return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
    struct toedev *tdev = TOE_DEV(parent);

    do_abort_syn_rcv(child, parent);
    if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
        struct cpl_pass_accept_rpl *rpl = cplhdr(m);

        rpl->opt0h = htonl(F_TCAM_BYPASS);
        rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
        cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
    } else
        m_free(m);
}
#endif
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
    /* Not implemented yet on FreeBSD; the notyet block is the Linux logic. */
    CXGB_UNIMPLEMENTED();

#ifdef notyet
    struct t3cdev *cdev;
    struct socket *parent;
    struct socket *oreq;
    struct t3c_tid_entry *t3c_stid;
    struct tid_info *t;
    struct tcpcb *otp, *tp = so_sototcpcb(so);
    struct toepcb *toep = tp->t_toe;

    /*
     * If the connection is being aborted due to the parent listening
     * socket going away there's nothing to do, the ABORT_REQ will close
     * the connection.
     */
    if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
        m_free(m);
        return;
    }

    oreq = so->so_incomp;
    otp = so_sototcpcb(oreq);

    cdev = T3C_DEV(so);
    t = &(T3C_DATA(cdev))->tid_maps;
    t3c_stid = lookup_stid(t, otp->ts_recent);
    parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

    so_lock(parent);
    pass_open_abort(so, parent, m);
    so_unlock(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
3178 */ 3179static void 3180pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 3181{ 3182 3183#ifdef notyet 3184 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3185 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 3186#endif 3187 handle_pass_open_arp_failure(m_get_socket(m), m); 3188} 3189 3190/* 3191 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 3192 */ 3193static void 3194mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 3195{ 3196 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 3197 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 3198 unsigned int tid = GET_TID(req); 3199 3200 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 3201 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3202 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3203 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 3204 rpl->opt0h = htonl(F_TCAM_BYPASS); 3205 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3206 rpl->opt2 = 0; 3207 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3208} 3209 3210/* 3211 * Send a deferred reject to an accept request. 
3212 */ 3213static void 3214reject_pass_request(struct toedev *tdev, struct mbuf *m) 3215{ 3216 struct mbuf *reply_mbuf; 3217 3218 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3219 mk_pass_accept_rpl(reply_mbuf, m); 3220 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3221 m_free(m); 3222} 3223 3224static void 3225handle_syncache_event(int event, void *arg) 3226{ 3227 struct toepcb *toep = arg; 3228 3229 switch (event) { 3230 case TOE_SC_ENTRY_PRESENT: 3231 /* 3232 * entry already exists - free toepcb 3233 * and l2t 3234 */ 3235 printf("syncache entry present\n"); 3236 toepcb_release(toep); 3237 break; 3238 case TOE_SC_DROP: 3239 /* 3240 * The syncache has given up on this entry 3241 * either it timed out, or it was evicted 3242 * we need to explicitly release the tid 3243 */ 3244 printf("syncache entry dropped\n"); 3245 toepcb_release(toep); 3246 break; 3247 default: 3248 log(LOG_ERR, "unknown syncache event %d\n", event); 3249 break; 3250 } 3251} 3252 3253static void 3254syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3255{ 3256 struct in_conninfo inc; 3257 struct tcpopt to; 3258 struct tcphdr th; 3259 struct inpcb *inp; 3260 int mss, wsf, sack, ts; 3261 uint32_t rcv_isn = ntohl(req->rcv_isn); 3262 3263 bzero(&to, sizeof(struct tcpopt)); 3264 inp = so_sotoinpcb(lso); 3265 3266 /* 3267 * Fill out information for entering us into the syncache 3268 */ 3269 inc.inc_fport = th.th_sport = req->peer_port; 3270 inc.inc_lport = th.th_dport = req->local_port; 3271 th.th_seq = req->rcv_isn; 3272 th.th_flags = TH_SYN; 3273 3274 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3275 3276 3277 inc.inc_isipv6 = 0; 3278 inc.inc_len = 0; 3279 inc.inc_faddr.s_addr = req->peer_ip; 3280 inc.inc_laddr.s_addr = req->local_ip; 3281 3282 DPRINTF("syncache add of %d:%d %d:%d\n", 3283 ntohl(req->local_ip), ntohs(req->local_port), 3284 ntohl(req->peer_ip), 
 ntohs(req->peer_port));

    mss = req->tcp_options.mss;
    wsf = req->tcp_options.wsf;
    ts = req->tcp_options.tstamp;
    sack = req->tcp_options.sack;
    to.to_mss = mss;
    to.to_wscale = wsf;
    to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
    syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
}


/*
 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
 * lock held.  Note that the sock here is a listening socket that is not owned
 * by the TOE.
 */
static void
process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
    struct listen_ctx *lctx)
{
    int rt_flags;
    struct l2t_entry *e;
    struct iff_mac tim;
    struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
    struct cpl_pass_accept_rpl *rpl;
    struct cpl_pass_accept_req *req = cplhdr(m);
    unsigned int tid = GET_TID(req);
    struct tom_data *d = TOM_DATA(tdev);
    struct t3cdev *cdev = d->cdev;
    struct tcpcb *tp = so_sototcpcb(so);
    struct toepcb *newtoep;
    struct rtentry *dst;
    struct sockaddr_in nam;
    struct t3c_data *td = T3C_DATA(cdev);

    reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
    if (__predict_false(reply_mbuf == NULL)) {
        /* No mbuf for a reply: defer the reject (T3) or drop the TID. */
        if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
            t3_defer_reply(m, tdev, reject_pass_request);
        else {
            cxgb_queue_tid_release(cdev, tid);
            m_free(m);
        }
        DPRINTF("failed to get reply_mbuf\n");

        goto out;
    }

    if (tp->t_state != TCPS_LISTEN) {
        DPRINTF("socket not in listen state\n");

        goto reject;
    }

    tim.mac_addr = req->dst_mac;
    tim.vlan_tag = ntohs(req->vlan_tag);
    if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
        DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
        goto reject;
    }

#ifdef notyet
    /*
     * XXX do route lookup to confirm that we're still listening on this
     * address
     */
    if (ip_route_input(skb, req->local_ip, req->peer_ip,
            G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
        goto reject;
    rt_flags = ((struct rtable *)skb->dst)->rt_flags &
        (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
    dst_release(skb->dst);  // done with the input route, release it
    skb->dst = NULL;

    if ((rt_flags & RTF_LOCAL) == 0)
        goto reject;
#endif
    /*
     * XXX
     */
    rt_flags = RTF_LOCAL;
    if ((rt_flags & RTF_LOCAL) == 0)
        goto reject;

    /*
     * Calculate values and add to syncache
     */

    /*
     * NOTE(review): the reject path below calls mk_tid_release() with
     * 'newtoep' on non-T3 parts; rejects taken before this allocation use
     * it uninitialized — confirm against later revisions.
     */
    newtoep = toepcb_alloc();
    if (newtoep == NULL)
        goto reject;

    bzero(&nam, sizeof(struct sockaddr_in));

    nam.sin_len = sizeof(struct sockaddr_in);
    nam.sin_family = AF_INET;
    nam.sin_addr.s_addr =req->peer_ip;
    dst = rtalloc2((struct sockaddr *)&nam, 1, 0);

    if (dst == NULL) {
        printf("failed to find route\n");
        goto reject;
    }
    e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
        (struct sockaddr *)&nam);
    if (e == NULL) {
        /* NOTE(review): only logged; e is dereferenced below — confirm. */
        DPRINTF("failed to get l2t\n");
    }
    /*
     * Point to our listen socket until accept
     */
    newtoep->tp_tp = tp;
    newtoep->tp_flags = TP_SYN_RCVD;
    newtoep->tp_tid = tid;
    newtoep->tp_toedev = tdev;
    tp->rcv_wnd = select_rcv_wnd(tdev, so);

    cxgb_insert_tid(cdev, d->client, newtoep, tid);
    so_lock(so);
    LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
    so_unlock(so);

    /* DDP only if enabled, not disabled per-socket, and window is large. */
    newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;

    if (newtoep->tp_ulp_mode) {
        ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);

        if (ddp_mbuf == NULL)
            newtoep->tp_ulp_mode = 0;
    }

    CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
        TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
    set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
    /*
     * XXX workaround for lack of syncache drop
     */
    toepcb_hold(newtoep);
    syncache_add_accept_req(req, so, newtoep);

    rpl = cplhdr(reply_mbuf);
    reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
    rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
    rpl->wr.wr_lo = 0;
    OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
    rpl->opt2 = htonl(calc_opt2(so, tdev));
    rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
    rpl->peer_ip = req->peer_ip;    // req->peer_ip is not overwritten

    rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
        V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
    rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
        CPL_PASS_OPEN_ACCEPT);

    DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);

    m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));

    l2t_send(cdev, reply_mbuf, e);
    m_free(m);
    if (newtoep->tp_ulp_mode) {
        /* Start with DDP off; the timer-workaround bits are set alongside. */
        __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
            V_TF_DDP_OFF(1) |
            TP_DDP_TIMER_WORKAROUND_MASK,
            V_TF_DDP_OFF(1) |
            TP_DDP_TIMER_WORKAROUND_VAL, 1);
    } else
        printf("not offloading\n");



    return;
reject:
    if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
        mk_pass_accept_rpl(reply_mbuf, m);
    else
        mk_tid_release(reply_mbuf, newtoep, tid);
    cxgb_ofld_send(cdev, reply_mbuf);
    m_free(m);
out:
#if 0
    TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#else
    return;
#endif
}

/*
 * Handle a CPL_PASS_ACCEPT_REQ message: an incoming SYN matched one of our
 * offloaded listeners.  Thin dispatcher that forwards to
 * process_pass_accept_req() with the listening socket from the context.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	/*
	 * NOTE(review): this validation block is unported Linux code — it
	 * references lsk/printk/unlikely which do not exist here, so it will
	 * not compile if VALIDATE_TID is ever enabled.
	 */
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
		       t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));

	inp_lock_assert(tp->t_inpcb);

	/* MTU index from HW -> MSS; 40 = IP + TCP fixed header bytes. */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	/* Window scaling is in effect only if both sides negotiated it. */
	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE))
		tp->rcv_scale = tp->request_r_scale;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/* Seed all send-side sequence state from the HW-reported ISN. */
	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);

	/*
	 *XXXXXXXXXXX
	 *
	 */
#ifdef notyet
	so->so_proto->pr_ctloutput = t3_ctloutput;
#endif

#if 0
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif
	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_starttime = ticks;
	tp->t_state = TCPS_ESTABLISHED;
	soisconnected(so);
}

/*
 * Expand the matching syncache entry for an offloaded passive connection,
 * building the in_conninfo/tcpopt/tcphdr the syncache expects from the
 * CPL_PASS_ESTABLISH message.  Returns the syncache_offload_expand() result
 * and (via *so) the newly created socket.
 */
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;

#ifdef MAC
#error	"no MAC support"
#endif

	opt = ntohs(req->tcp_opt);

	bzero(&to, sizeof(struct tcpopt));

	/*
	 * Fill out information for entering us into the syncache.
	 * NOTE(review): inc and th are only partially initialized (no bzero);
	 * presumably syncache_offload_expand() reads only the fields set
	 * here — verify.
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;

	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	/* Decode HW-reported TCP options (40 = IP + TCP header bytes). */
	mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf = G_TCPOPT_WSCALE_OK(opt);
	ts = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);

	to.to_mss = mss;
	to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return syncache_offload_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message.
 * XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Take the embryonic connection off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* From here on, so/tp refer to the NEW (accepted) socket. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* HW owns segmentation; don't let the sockbufs coalesce mbufs. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Re-point the toepcb at the new connection's tcpcb. */
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	/* Initialize work-request credit accounting for this connection. */
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);
	inp_wunlock(tp->t_inpcb);

	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	/*
	 * NOTE(review): assigning rcv_isn to tp->rcv_wnd looks wrong — it
	 * clobbers the receive window with a sequence number.  The Linux
	 * original sets rcv_wup here, not rcv_wnd.  Left as-is; verify.
	 */
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message: our active open completed.  Moves the
 * connection from its atid to the real HW tid and finishes establishment.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		/* Connection already gone; just return the atid. */
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev;	/* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3892 */ 3893static void 3894wr_ack(struct toepcb *toep, struct mbuf *m) 3895{ 3896 struct tcpcb *tp = toep->tp_tp; 3897 struct cpl_wr_ack *hdr = cplhdr(m); 3898 struct socket *so; 3899 unsigned int credits = ntohs(hdr->credits); 3900 u32 snd_una = ntohl(hdr->snd_una); 3901 int bytes = 0; 3902 struct sockbuf *snd; 3903 3904 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3905 3906 inp_wlock(tp->t_inpcb); 3907 so = inp_inpcbtosocket(tp->t_inpcb); 3908 toep->tp_wr_avail += credits; 3909 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3910 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3911 3912 while (credits) { 3913 struct mbuf *p = peek_wr(toep); 3914 3915 if (__predict_false(!p)) { 3916 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3917 "nothing pending, state %u wr_avail=%u\n", 3918 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3919 break; 3920 } 3921 CTR2(KTR_TOM, 3922 "wr_ack: p->credits=%d p->bytes=%d", 3923 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3924 KASSERT(p->m_pkthdr.csum_data != 0, 3925 ("empty request still on list")); 3926 3927 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3928 3929#if DEBUG_WR > 1 3930 struct tx_data_wr *w = cplhdr(p); 3931 log(LOG_ERR, 3932 "TID %u got %u WR credits, need %u, len %u, " 3933 "main body %u, frags %u, seq # %u, ACK una %u," 3934 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3935 toep->tp_tid, credits, p->csum, p->len, 3936 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3937 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3938 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3939#endif 3940 p->m_pkthdr.csum_data -= credits; 3941 break; 3942 } else { 3943 dequeue_wr(toep); 3944 credits -= p->m_pkthdr.csum_data; 3945 bytes += p->m_pkthdr.len; 3946 CTR3(KTR_TOM, 3947 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3948 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3949 3950 m_free(p); 3951 } 3952 } 3953 3954#if DEBUG_WR 3955 
check_wr_invariants(tp); 3956#endif 3957 3958 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3959#if VALIDATE_SEQ 3960 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3961 3962 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3963 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3964 toep->tp_tid, tp->snd_una); 3965#endif 3966 goto out_free; 3967 } 3968 3969 if (tp->snd_una != snd_una) { 3970 tp->snd_una = snd_una; 3971 tp->ts_recent_age = ticks; 3972#ifdef notyet 3973 /* 3974 * Keep ARP entry "minty fresh" 3975 */ 3976 dst_confirm(sk->sk_dst_cache); 3977#endif 3978 if (tp->snd_una == tp->snd_nxt) 3979 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3980 } 3981 3982 snd = so_sockbuf_snd(so); 3983 if (bytes) { 3984 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3985 snd = so_sockbuf_snd(so); 3986 sockbuf_lock(snd); 3987 sbdrop_locked(snd, bytes); 3988 so_sowwakeup_locked(so); 3989 } 3990 3991 if (snd->sb_sndptroff < snd->sb_cc) 3992 t3_push_frames(so, 0); 3993 3994out_free: 3995 inp_wunlock(tp->t_inpcb); 3996 m_free(m); 3997} 3998 3999/* 4000 * Handler for TX_DATA_ACK CPL messages. 4001 */ 4002static int 4003do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 4004{ 4005 struct toepcb *toep = (struct toepcb *)ctx; 4006 4007 VALIDATE_SOCK(so); 4008 4009 wr_ack(toep, m); 4010 return 0; 4011} 4012 4013/* 4014 * Handler for TRACE_PKT CPL messages. Just sink these packets. 4015 */ 4016static int 4017do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 4018{ 4019 m_freem(m); 4020 return 0; 4021} 4022 4023/* 4024 * Reset a connection that is on a listener's SYN queue or accept queue, 4025 * i.e., one that has not had a struct socket associated with it. 4026 * Must be called from process context. 4027 * 4028 * Modeled after code in inet_csk_listen_stop(). 
4029 */ 4030static void 4031t3_reset_listen_child(struct socket *child) 4032{ 4033 struct tcpcb *tp = so_sototcpcb(child); 4034 4035 t3_send_reset(tp->t_toe); 4036} 4037 4038 4039static void 4040t3_child_disconnect(struct socket *so, void *arg) 4041{ 4042 struct tcpcb *tp = so_sototcpcb(so); 4043 4044 if (tp->t_flags & TF_TOE) { 4045 inp_wlock(tp->t_inpcb); 4046 t3_reset_listen_child(so); 4047 inp_wunlock(tp->t_inpcb); 4048 } 4049} 4050 4051/* 4052 * Disconnect offloaded established but not yet accepted connections sitting 4053 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4054 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4055 */ 4056void 4057t3_disconnect_acceptq(struct socket *listen_so) 4058{ 4059 4060 so_lock(listen_so); 4061 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4062 so_unlock(listen_so); 4063} 4064 4065/* 4066 * Reset offloaded connections sitting on a server's syn queue. As above 4067 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4068 */ 4069 4070void 4071t3_reset_synq(struct listen_ctx *lctx) 4072{ 4073 struct toepcb *toep; 4074 4075 so_lock(lctx->lso); 4076 while (!LIST_EMPTY(&lctx->synq_head)) { 4077 toep = LIST_FIRST(&lctx->synq_head); 4078 LIST_REMOVE(toep, synq_entry); 4079 toep->tp_tp = NULL; 4080 t3_send_reset(toep); 4081 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4082 toepcb_release(toep); 4083 } 4084 so_unlock(lctx->lso); 4085} 4086 4087 4088int 4089t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4090 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4091 unsigned int pg_off, unsigned int color) 4092{ 4093 unsigned int i, j, pidx; 4094 struct pagepod *p; 4095 struct mbuf *m; 4096 struct ulp_mem_io *req; 4097 unsigned int tid = toep->tp_tid; 4098 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4099 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4100 4101 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4102 gl, nppods, tag, maxoff, pg_off, color); 4103 4104 for (i = 0; i < nppods; ++i) { 4105 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4106 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4107 req = mtod(m, struct ulp_mem_io *); 4108 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4109 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4110 req->wr.wr_lo = 0; 4111 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4112 V_ULPTX_CMD(ULP_MEM_WRITE)); 4113 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4114 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4115 4116 p = (struct pagepod *)(req + 1); 4117 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4118 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4119 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4120 V_PPOD_COLOR(color)); 4121 p->pp_max_offset = htonl(maxoff); 4122 p->pp_page_offset = htonl(pg_off); 4123 p->pp_rsvd = 0; 4124 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4125 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4126 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4127 } else 4128 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4129 send_or_defer(toep, m, 0); 4130 ppod_addr += PPOD_SIZE; 4131 } 4132 return (0); 4133} 4134 4135/* 4136 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4137 */ 4138static inline void 4139mk_cpl_barrier_ulp(struct cpl_barrier *b) 4140{ 4141 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4142 4143 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4144 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4145 b->opcode = CPL_BARRIER; 4146} 4147 4148/* 4149 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4150 */ 4151static inline void 4152mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4153{ 4154 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4155 4156 txpkt = (struct ulp_txpkt *)req; 4157 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4158 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4159 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4160 req->cpuno = htons(cpuno); 4161} 4162 4163/* 4164 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4165 */ 4166static inline void 4167mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4168 unsigned int word, uint64_t mask, uint64_t val) 4169{ 4170 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4171 4172 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4173 tid, word, mask, val); 4174 4175 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4176 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4177 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4178 req->reply = V_NO_REPLY(1); 4179 req->cpu_idx = 0; 4180 req->word = htons(word); 4181 req->mask = htobe64(mask); 4182 req->val = htobe64(val); 4183} 4184 4185/* 4186 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
 */
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
		   unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
				 V_RX_CREDITS(credits));
}

/*
 * Cancel (invalidate) HW DDP buffer @bufidx for the connection.  Sends a
 * compound BYPASS work request: barrier, SET_TCB_FIELD to clear the buffer's
 * valid/active flags, GET_TCB (so we learn how much data landed in the
 * buffer), and a trailing barrier.
 */
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	/* One WR carrying: barrier, SET_TCB_FIELD, GET_TCB, barrier. */
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already.  However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	if (bufidx == 0)
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @sk: the socket associated with the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
4279 */ 4280void 4281t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, 4282 unsigned int tag1, unsigned int len) 4283{ 4284 unsigned int wrlen; 4285 struct mbuf *m; 4286 struct work_request_hdr *wr; 4287 struct cpl_get_tcb *getreq; 4288 struct cpl_set_tcb_field *req; 4289 struct ddp_state *p = &toep->tp_ddp_state; 4290 4291 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", 4292 bufidx, tag0, tag1, len); 4293#if 0 4294 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4295#endif 4296 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); 4297 m = m_gethdr_nofail(wrlen); 4298 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4299 wr = mtod(m, struct work_request_hdr *); 4300 m->m_pkthdr.len = m->m_len = wrlen; 4301 bzero(wr, wrlen); 4302 4303 4304 /* Set the ATOMIC flag to make sure that TP processes the following 4305 * CPLs in an atomic manner and no wire segments can be interleaved. 4306 */ 4307 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); 4308 req = (struct cpl_set_tcb_field *)(wr + 1); 4309 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, 4310 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | 4311 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, 4312 V_TCB_RX_DDP_BUF0_TAG(tag0) | 4313 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); 4314 req++; 4315 if (bufidx == 0) { 4316 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, 4317 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4318 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 4319 req++; 4320 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4321 V_TF_DDP_PUSH_DISABLE_0(1) | 4322 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4323 V_TF_DDP_PUSH_DISABLE_0(0) | 4324 V_TF_DDP_BUF0_VALID(1)); 4325 } else { 4326 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, 4327 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), 4328 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); 4329 req++; 4330 mk_set_tcb_field_ulp(req, toep->tp_tid, 
W_TCB_RX_DDP_FLAGS, 4331 V_TF_DDP_PUSH_DISABLE_1(1) | 4332 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4333 V_TF_DDP_PUSH_DISABLE_1(0) | 4334 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); 4335 } 4336 4337 getreq = (struct cpl_get_tcb *)(req + 1); 4338 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4339 4340 /* Keep track of the number of oustanding CPL_GET_TCB requests 4341 */ 4342 p->get_tcb_count++; 4343 4344#ifdef T3_TRACE 4345 T3_TRACE4(TIDTB(sk), 4346 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " 4347 "len %d", 4348 bufidx, tag0, tag1, len); 4349#endif 4350 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4351} 4352 4353/* 4354 * Sends a compound WR containing all the CPL messages needed to program the 4355 * two HW DDP buffers, namely optionally setting up the length and offset of 4356 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. 4357 */ 4358void 4359t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, 4360 unsigned int len1, unsigned int offset1, 4361 uint64_t ddp_flags, uint64_t flag_mask, int modulate) 4362{ 4363 unsigned int wrlen; 4364 struct mbuf *m; 4365 struct work_request_hdr *wr; 4366 struct cpl_set_tcb_field *req; 4367 4368 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", 4369 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); 4370 4371#if 0 4372 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4373#endif 4374 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + 4375 (len1 ? sizeof(*req) : 0) + 4376 (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); 4377 m = m_gethdr_nofail(wrlen); 4378 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4379 wr = mtod(m, struct work_request_hdr *); 4380 bzero(wr, wrlen); 4381 4382 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4383 m->m_pkthdr.len = m->m_len = wrlen; 4384 4385 req = (struct cpl_set_tcb_field *)(wr + 1); 4386 if (len0) { /* program buffer 0 offset and length */ 4387 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, 4388 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 4389 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4390 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | 4391 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); 4392 req++; 4393 } 4394 if (len1) { /* program buffer 1 offset and length */ 4395 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, 4396 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 4397 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, 4398 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | 4399 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); 4400 req++; 4401 } 4402 4403 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, 4404 ddp_flags); 4405 4406 if (modulate) { 4407 mk_rx_data_ack_ulp(toep, 4408 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, 4409 toep->tp_copied_seq - toep->tp_rcv_wup); 4410 toep->tp_rcv_wup = toep->tp_copied_seq; 4411 } 4412 4413#ifdef T3_TRACE 4414 T3_TRACE5(TIDTB(sk), 4415 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " 4416 "modulate %d", 4417 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, 4418 modulate); 4419#endif 4420 4421 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4422} 4423 4424void 4425t3_init_wr_tab(unsigned int wr_len) 4426{ 4427 int i; 4428 4429 if (mbuf_wrs[1]) /* already initialized */ 4430 return; 4431 4432 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 4433 int sgl_len = (3 * i) / 2 + (i & 1); 4434 4435 sgl_len += 3; 4436 mbuf_wrs[i] = sgl_len <= wr_len ? 
4437 1 : 1 + (sgl_len - 2) / (wr_len - 1); 4438 } 4439 4440 wrlen = wr_len * 8; 4441} 4442 4443int 4444t3_init_cpl_io(void) 4445{ 4446#ifdef notyet 4447 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 4448 if (!tcphdr_skb) { 4449 log(LOG_ERR, 4450 "Chelsio TCP offload: can't allocate sk_buff\n"); 4451 return -1; 4452 } 4453 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 4454 tcphdr_skb->h.raw = tcphdr_skb->data; 4455 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 4456#endif 4457 4458 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 4459 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 4460 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 4461 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 4462 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 4463 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 4464 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 4465 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 4466 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 4467 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 4468 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 4469 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 4470 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 4471 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 4472 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 4473 return (0); 4474} 4475 4476