cxgb_cpl_io.c revision 181803
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 181803 2008-08-17 23:27:27Z bz $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/sockstate.h> 43#include <sys/sockopt.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <sys/sockbuf.h> 47#include <sys/sysctl.h> 48#include <sys/syslog.h> 49#include <sys/protosw.h> 50#include <sys/priv.h> 51#include <sys/vimage.h> 52 53#include <net/if.h> 54#include <net/route.h> 55 56#include <netinet/in.h> 57#include <netinet/in_pcb.h> 58#include <netinet/in_systm.h> 59#include <netinet/in_var.h> 60 61 62#include <dev/cxgb/cxgb_osdep.h> 63#include <dev/cxgb/sys/mbufq.h> 64 65#include <netinet/ip.h> 66#include <netinet/tcp_var.h> 67#include <netinet/tcp_fsm.h> 68#include <netinet/tcp_offload.h> 69#include <netinet/tcp_seq.h> 70#include <netinet/tcp_syncache.h> 71#include <netinet/tcp_timer.h> 72#include <net/route.h> 73 74#include <dev/cxgb/t3cdev.h> 75#include <dev/cxgb/common/cxgb_firmware_exports.h> 76#include <dev/cxgb/common/cxgb_t3_cpl.h> 77#include <dev/cxgb/common/cxgb_tcb.h> 78#include <dev/cxgb/common/cxgb_ctl_defs.h> 79#include <dev/cxgb/cxgb_offload.h> 80#include <vm/vm.h> 81#include <vm/pmap.h> 82#include <machine/bus.h> 83#include <dev/cxgb/sys/mvec.h> 84#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 85#include <dev/cxgb/ulp/tom/cxgb_defs.h> 86#include <dev/cxgb/ulp/tom/cxgb_tom.h> 87#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> 88#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> 89#include <dev/cxgb/ulp/tom/cxgb_tcp.h> 90 91#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> 92 93/* 94 * For ULP connections HW may add headers, e.g., for digests, that aren't part 95 * of the messages sent by the host but 
that are part of the TCP payload and 96 * therefore consume TCP sequence space. Tx connection parameters that 97 * operate in TCP sequence space are affected by the HW additions and need to 98 * compensate for them to accurately track TCP sequence numbers. This array 99 * contains the compensating extra lengths for ULP packets. It is indexed by 100 * a packet's ULP submode. 101 */ 102const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 103 104#ifdef notyet 105/* 106 * This sk_buff holds a fake header-only TCP segment that we use whenever we 107 * need to exploit SW TCP functionality that expects TCP headers, such as 108 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 109 * CPUs without locking. 110 */ 111static struct mbuf *tcphdr_mbuf __read_mostly; 112#endif 113 114/* 115 * Size of WRs in bytes. Note that we assume all devices we are handling have 116 * the same WR size. 117 */ 118static unsigned int wrlen __read_mostly; 119 120/* 121 * The number of WRs needed for an skb depends on the number of page fragments 122 * in the skb and whether it has any payload in its main body. This maps the 123 * length of the gather list represented by an skb into the # of necessary WRs. 124 */ 125static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; 126 127/* 128 * Max receive window supported by HW in bytes. Only a small part of it can 129 * be set through option0, the rest needs to be set through RX_DATA_ACK. 130 */ 131#define MAX_RCV_WND ((1U << 27) - 1) 132 133/* 134 * Min receive window. We want it to be large enough to accommodate receive 135 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
136 */ 137#define MIN_RCV_WND (24 * 1024U) 138#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) 139 140#define VALIDATE_SEQ 0 141#define VALIDATE_SOCK(so) 142#define DEBUG_WR 0 143 144#define TCP_TIMEWAIT 1 145#define TCP_CLOSE 2 146#define TCP_DROP 3 147 148extern int tcp_do_autorcvbuf; 149extern int tcp_do_autosndbuf; 150extern int tcp_autorcvbuf_max; 151extern int tcp_autosndbuf_max; 152 153static void t3_send_reset(struct toepcb *toep); 154static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); 155static inline void free_atid(struct t3cdev *cdev, unsigned int tid); 156static void handle_syncache_event(int event, void *arg); 157 158static inline void 159SBAPPEND(struct sockbuf *sb, struct mbuf *n) 160{ 161 struct mbuf *m; 162 163 m = sb->sb_mb; 164 while (m) { 165 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 166 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 167 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 168 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 169 m->m_next, m->m_nextpkt, m->m_flags)); 170 m = m->m_next; 171 } 172 m = n; 173 while (m) { 174 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 175 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 176 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 177 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 178 m->m_next, m->m_nextpkt, m->m_flags)); 179 m = m->m_next; 180 } 181 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); 182 sbappendstream_locked(sb, n); 183 m = sb->sb_mb; 184 185 while (m) { 186 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 187 m->m_next, m->m_nextpkt, m->m_flags)); 188 m = m->m_next; 189 } 190} 191 192static inline int 193is_t3a(const struct toedev *dev) 194{ 195 
return (dev->tod_ttid == TOE_ID_CHELSIO_T3); 196} 197 198static void 199dump_toepcb(struct toepcb *toep) 200{ 201 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", 202 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, 203 toep->tp_mtu_idx, toep->tp_tid); 204 205 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", 206 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 207 toep->tp_mss_clamp, toep->tp_flags); 208} 209 210#ifndef RTALLOC2_DEFINED 211static struct rtentry * 212rtalloc2(struct sockaddr *dst, int report, u_long ignflags) 213{ 214 struct rtentry *rt = NULL; 215 216 if ((rt = rtalloc1(dst, report, ignflags)) != NULL) 217 RT_UNLOCK(rt); 218 219 return (rt); 220} 221#endif 222 223/* 224 * Determine whether to send a CPL message now or defer it. A message is 225 * deferred if the connection is in SYN_SENT since we don't know the TID yet. 226 * For connections in other states the message is sent immediately. 227 * If through_l2t is set the message is subject to ARP processing, otherwise 228 * it is sent directly. 229 */ 230static inline void 231send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) 232{ 233 struct tcpcb *tp = toep->tp_tp; 234 235 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { 236 inp_wlock(tp->t_inpcb); 237 mbufq_tail(&toep->out_of_order_queue, m); // defer 238 inp_wunlock(tp->t_inpcb); 239 } else if (through_l2t) 240 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T 241 else 242 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly 243} 244 245static inline unsigned int 246mkprio(unsigned int cntrl, const struct toepcb *toep) 247{ 248 return (cntrl); 249} 250 251/* 252 * Populate a TID_RELEASE WR. The skb must be already propely sized. 
253 */ 254static inline void 255mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) 256{ 257 struct cpl_tid_release *req; 258 259 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); 260 m->m_pkthdr.len = m->m_len = sizeof(*req); 261 req = mtod(m, struct cpl_tid_release *); 262 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 263 req->wr.wr_lo = 0; 264 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 265} 266 267static inline void 268make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) 269{ 270 struct tcpcb *tp = so_sototcpcb(so); 271 struct toepcb *toep = tp->t_toe; 272 struct tx_data_wr *req; 273 struct sockbuf *snd; 274 275 inp_lock_assert(tp->t_inpcb); 276 snd = so_sockbuf_snd(so); 277 278 req = mtod(m, struct tx_data_wr *); 279 m->m_len = sizeof(*req); 280 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 281 req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); 282 /* len includes the length of any HW ULP additions */ 283 req->len = htonl(len); 284 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 285 /* V_TX_ULP_SUBMODE sets both the mode and submode */ 286 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | 287 V_TX_URG(/* skb_urgent(skb) */ 0 ) | 288 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && 289 (tail ? 0 : 1)))); 290 req->sndseq = htonl(tp->snd_nxt); 291 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 292 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 293 V_TX_CPU_IDX(toep->tp_qset)); 294 295 /* Sendbuffer is in units of 32KB. 
296 */ 297 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 298 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15)); 299 else { 300 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); 301 } 302 303 toep->tp_flags |= TP_DATASENT; 304 } 305} 306 307#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ 308 309int 310t3_push_frames(struct socket *so, int req_completion) 311{ 312 struct tcpcb *tp = so_sototcpcb(so); 313 struct toepcb *toep = tp->t_toe; 314 315 struct mbuf *tail, *m0, *last; 316 struct t3cdev *cdev; 317 struct tom_data *d; 318 int state, bytes, count, total_bytes; 319 bus_dma_segment_t segs[TX_MAX_SEGS], *segp; 320 struct sockbuf *snd; 321 322 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { 323 DPRINTF("tcp state=%d\n", tp->t_state); 324 return (0); 325 } 326 327 state = so_state_get(so); 328 329 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { 330 DPRINTF("disconnecting\n"); 331 332 return (0); 333 } 334 335 inp_lock_assert(tp->t_inpcb); 336 337 snd = so_sockbuf_snd(so); 338 sockbuf_lock(snd); 339 340 d = TOM_DATA(toep->tp_toedev); 341 cdev = d->cdev; 342 343 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; 344 345 total_bytes = 0; 346 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", 347 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); 348 349 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { 350 KASSERT(tail, ("sbdrop error")); 351 last = tail = tail->m_next; 352 } 353 354 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { 355 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); 356 sockbuf_unlock(snd); 357 358 return (0); 359 } 360 361 toep->tp_m_last = NULL; 362 while (toep->tp_wr_avail && (tail != NULL)) { 363 count = bytes = 0; 364 segp = segs; 365 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { 366 sockbuf_unlock(snd); 367 return (0); 368 } 369 /* 370 * If the data in tail fits as in-line, then 371 * make an immediate data wr. 
372 */ 373 if (tail->m_len <= IMM_LEN) { 374 count = 1; 375 bytes = tail->m_len; 376 last = tail; 377 tail = tail->m_next; 378 m_set_sgl(m0, NULL); 379 m_set_sgllen(m0, 0); 380 make_tx_data_wr(so, m0, bytes, tail); 381 m_append(m0, bytes, mtod(last, caddr_t)); 382 KASSERT(!m0->m_next, ("bad append")); 383 } else { 384 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) 385 && (tail != NULL) && (count < TX_MAX_SEGS-1)) { 386 bytes += tail->m_len; 387 last = tail; 388 count++; 389 /* 390 * technically an abuse to be using this for a VA 391 * but less gross than defining my own structure 392 * or calling pmap_kextract from here :-| 393 */ 394 segp->ds_addr = (bus_addr_t)tail->m_data; 395 segp->ds_len = tail->m_len; 396 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", 397 count, mbuf_wrs[count], tail->m_data, tail->m_len); 398 segp++; 399 tail = tail->m_next; 400 } 401 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", 402 toep->tp_wr_avail, count, mbuf_wrs[count], tail); 403 404 m_set_sgl(m0, segs); 405 m_set_sgllen(m0, count); 406 make_tx_data_wr(so, m0, bytes, tail); 407 } 408 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); 409 410 if (tail) { 411 snd->sb_sndptr = tail; 412 toep->tp_m_last = NULL; 413 } else 414 toep->tp_m_last = snd->sb_sndptr = last; 415 416 417 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); 418 419 snd->sb_sndptroff += bytes; 420 total_bytes += bytes; 421 toep->tp_write_seq += bytes; 422 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" 423 " tail=%p sndptr=%p sndptroff=%d", 424 toep->tp_wr_avail, count, mbuf_wrs[count], 425 tail, snd->sb_sndptr, snd->sb_sndptroff); 426 if (tail) 427 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" 428 " tp_m_last=%p tailbuf=%p snd_una=0x%08x", 429 total_bytes, toep->tp_m_last, tail->m_data, 430 tp->snd_una); 431 else 432 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" 433 " tp_m_last=%p snd_una=0x%08x", 434 total_bytes, toep->tp_m_last, tp->snd_una); 435 436 437#ifdef KTR 438{ 439 int i; 440 
441 i = 0; 442 while (i < count && m_get_sgllen(m0)) { 443 if ((count - i) >= 3) { 444 CTR6(KTR_TOM, 445 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 446 " len=%d pa=0x%zx len=%d", 447 segs[i].ds_addr, segs[i].ds_len, 448 segs[i + 1].ds_addr, segs[i + 1].ds_len, 449 segs[i + 2].ds_addr, segs[i + 2].ds_len); 450 i += 3; 451 } else if ((count - i) == 2) { 452 CTR4(KTR_TOM, 453 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 454 " len=%d", 455 segs[i].ds_addr, segs[i].ds_len, 456 segs[i + 1].ds_addr, segs[i + 1].ds_len); 457 i += 2; 458 } else { 459 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", 460 segs[i].ds_addr, segs[i].ds_len); 461 i++; 462 } 463 464 } 465} 466#endif 467 /* 468 * remember credits used 469 */ 470 m0->m_pkthdr.csum_data = mbuf_wrs[count]; 471 m0->m_pkthdr.len = bytes; 472 toep->tp_wr_avail -= mbuf_wrs[count]; 473 toep->tp_wr_unacked += mbuf_wrs[count]; 474 475 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || 476 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 477 struct work_request_hdr *wr = cplhdr(m0); 478 479 wr->wr_hi |= htonl(F_WR_COMPL); 480 toep->tp_wr_unacked = 0; 481 } 482 KASSERT((m0->m_pkthdr.csum_data > 0) && 483 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", 484 m0->m_pkthdr.csum_data)); 485 m0->m_type = MT_DONTFREE; 486 enqueue_wr(toep, m0); 487 DPRINTF("sending offload tx with %d bytes in %d segments\n", 488 bytes, count); 489 l2t_send(cdev, m0, toep->tp_l2t); 490 } 491 sockbuf_unlock(snd); 492 return (total_bytes); 493} 494 495/* 496 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail 497 * under any circumstances. We take the easy way out and always queue the 498 * message to the write_queue. We can optimize the case where the queue is 499 * already empty though the optimization is probably not worth it. 
500 */ 501static void 502close_conn(struct socket *so) 503{ 504 struct mbuf *m; 505 struct cpl_close_con_req *req; 506 struct tom_data *d; 507 struct inpcb *inp = so_sotoinpcb(so); 508 struct tcpcb *tp; 509 struct toepcb *toep; 510 unsigned int tid; 511 512 513 inp_wlock(inp); 514 tp = so_sototcpcb(so); 515 toep = tp->t_toe; 516 517 if (tp->t_state != TCPS_SYN_SENT) 518 t3_push_frames(so, 1); 519 520 if (toep->tp_flags & TP_FIN_SENT) { 521 inp_wunlock(inp); 522 return; 523 } 524 525 tid = toep->tp_tid; 526 527 d = TOM_DATA(toep->tp_toedev); 528 529 m = m_gethdr_nofail(sizeof(*req)); 530 m_set_priority(m, CPL_PRIORITY_DATA); 531 m_set_sgl(m, NULL); 532 m_set_sgllen(m, 0); 533 534 toep->tp_flags |= TP_FIN_SENT; 535 req = mtod(m, struct cpl_close_con_req *); 536 537 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); 538 req->wr.wr_lo = htonl(V_WR_TID(tid)); 539 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 540 req->rsvd = 0; 541 inp_wunlock(inp); 542 /* 543 * XXX - need to defer shutdown while there is still data in the queue 544 * 545 */ 546 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); 547 cxgb_ofld_send(d->cdev, m); 548 549} 550 551/* 552 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant 553 * and send it along. 554 */ 555static void 556abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) 557{ 558 struct cpl_abort_req *req = cplhdr(m); 559 560 req->cmd = CPL_ABORT_NO_RST; 561 cxgb_ofld_send(cdev, m); 562} 563 564/* 565 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are 566 * permitted to return without sending the message in case we cannot allocate 567 * an sk_buff. Returns the number of credits sent. 
568 */ 569uint32_t 570t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) 571{ 572 struct mbuf *m; 573 struct cpl_rx_data_ack *req; 574 struct toepcb *toep = tp->t_toe; 575 struct toedev *tdev = toep->tp_toedev; 576 577 m = m_gethdr_nofail(sizeof(*req)); 578 579 DPRINTF("returning %u credits to HW\n", credits); 580 581 req = mtod(m, struct cpl_rx_data_ack *); 582 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 583 req->wr.wr_lo = 0; 584 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 585 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 586 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); 587 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 588 return (credits); 589} 590 591/* 592 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. 593 * This is only used in DDP mode, so we take the opportunity to also set the 594 * DACK mode and flush any Rx credits. 595 */ 596void 597t3_send_rx_modulate(struct toepcb *toep) 598{ 599 struct mbuf *m; 600 struct cpl_rx_data_ack *req; 601 602 m = m_gethdr_nofail(sizeof(*req)); 603 604 req = mtod(m, struct cpl_rx_data_ack *); 605 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 606 req->wr.wr_lo = 0; 607 m->m_pkthdr.len = m->m_len = sizeof(*req); 608 609 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 610 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 611 V_RX_DACK_MODE(1) | 612 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); 613 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 614 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 615 toep->tp_rcv_wup = toep->tp_copied_seq; 616} 617 618/* 619 * Handle receipt of an urgent pointer. 
620 */ 621static void 622handle_urg_ptr(struct socket *so, uint32_t urg_seq) 623{ 624#ifdef URGENT_DATA_SUPPORTED 625 struct tcpcb *tp = so_sototcpcb(so); 626 627 urg_seq--; /* initially points past the urgent data, per BSD */ 628 629 if (tp->urg_data && !after(urg_seq, tp->urg_seq)) 630 return; /* duplicate pointer */ 631 sk_send_sigurg(sk); 632 if (tp->urg_seq == tp->copied_seq && tp->urg_data && 633 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { 634 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 635 636 tp->copied_seq++; 637 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) 638 tom_eat_skb(sk, skb, 0); 639 } 640 tp->urg_data = TCP_URG_NOTYET; 641 tp->urg_seq = urg_seq; 642#endif 643} 644 645/* 646 * Returns true if a socket cannot accept new Rx data. 647 */ 648static inline int 649so_no_receive(const struct socket *so) 650{ 651 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); 652} 653 654/* 655 * Process an urgent data notification. 656 */ 657static void 658rx_urg_notify(struct toepcb *toep, struct mbuf *m) 659{ 660 struct cpl_rx_urg_notify *hdr = cplhdr(m); 661 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 662 663 VALIDATE_SOCK(so); 664 665 if (!so_no_receive(so)) 666 handle_urg_ptr(so, ntohl(hdr->seq)); 667 668 m_freem(m); 669} 670 671/* 672 * Handler for RX_URG_NOTIFY CPL messages. 673 */ 674static int 675do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) 676{ 677 struct toepcb *toep = (struct toepcb *)ctx; 678 679 rx_urg_notify(toep, m); 680 return (0); 681} 682 683static __inline int 684is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) 685{ 686 return (toep->tp_ulp_mode || 687 (toep->tp_ulp_mode == ULP_MODE_TCPDDP && 688 dev->tod_ttid >= TOE_ID_CHELSIO_T3)); 689} 690 691/* 692 * Set of states for which we should return RX credits. 
693 */ 694#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 695 696/* 697 * Called after some received data has been read. It returns RX credits 698 * to the HW for the amount of data processed. 699 */ 700void 701t3_cleanup_rbuf(struct tcpcb *tp, int copied) 702{ 703 struct toepcb *toep = tp->t_toe; 704 struct socket *so; 705 struct toedev *dev; 706 int dack_mode, must_send, read; 707 u32 thres, credits, dack = 0; 708 struct sockbuf *rcv; 709 710 so = inp_inpcbtosocket(tp->t_inpcb); 711 rcv = so_sockbuf_rcv(so); 712 713 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 714 (tp->t_state == TCPS_FIN_WAIT_2))) { 715 if (copied) { 716 sockbuf_lock(rcv); 717 toep->tp_copied_seq += copied; 718 sockbuf_unlock(rcv); 719 } 720 721 return; 722 } 723 724 inp_lock_assert(tp->t_inpcb); 725 726 sockbuf_lock(rcv); 727 if (copied) 728 toep->tp_copied_seq += copied; 729 else { 730 read = toep->tp_enqueued_bytes - rcv->sb_cc; 731 toep->tp_copied_seq += read; 732 } 733 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 734 toep->tp_enqueued_bytes = rcv->sb_cc; 735 sockbuf_unlock(rcv); 736 737 if (credits > rcv->sb_mbmax) { 738 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 739 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 740 credits = rcv->sb_mbmax; 741 } 742 743 744 /* 745 * XXX this won't accurately reflect credit return - we need 746 * to look at the difference between the amount that has been 747 * put in the recv sockbuf and what is there now 748 */ 749 750 if (__predict_false(!credits)) 751 return; 752 753 dev = toep->tp_toedev; 754 thres = TOM_TUNABLE(dev, rx_credit_thres); 755 756 if (__predict_false(thres == 0)) 757 return; 758 759 if (is_delack_mode_valid(dev, toep)) { 760 dack_mode = TOM_TUNABLE(dev, delack); 761 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 762 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 763 764 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 765 dack = F_RX_DACK_CHANGE 
| 766 V_RX_DACK_MODE(dack_mode); 767 } 768 } else 769 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 770 771 /* 772 * For coalescing to work effectively ensure the receive window has 773 * at least 16KB left. 774 */ 775 must_send = credits + 16384 >= tp->rcv_wnd; 776 777 if (must_send || credits >= thres) 778 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 779} 780 781static int 782cxgb_toe_disconnect(struct tcpcb *tp) 783{ 784 struct socket *so; 785 786 DPRINTF("cxgb_toe_disconnect\n"); 787 788 so = inp_inpcbtosocket(tp->t_inpcb); 789 close_conn(so); 790 return (0); 791} 792 793static int 794cxgb_toe_reset(struct tcpcb *tp) 795{ 796 struct toepcb *toep = tp->t_toe; 797 798 t3_send_reset(toep); 799 800 /* 801 * unhook from socket 802 */ 803 tp->t_flags &= ~TF_TOE; 804 toep->tp_tp = NULL; 805 tp->t_toe = NULL; 806 return (0); 807} 808 809static int 810cxgb_toe_send(struct tcpcb *tp) 811{ 812 struct socket *so; 813 814 DPRINTF("cxgb_toe_send\n"); 815 dump_toepcb(tp->t_toe); 816 817 so = inp_inpcbtosocket(tp->t_inpcb); 818 t3_push_frames(so, 1); 819 return (0); 820} 821 822static int 823cxgb_toe_rcvd(struct tcpcb *tp) 824{ 825 826 inp_lock_assert(tp->t_inpcb); 827 828 t3_cleanup_rbuf(tp, 0); 829 830 return (0); 831} 832 833static void 834cxgb_toe_detach(struct tcpcb *tp) 835{ 836 struct toepcb *toep; 837 838 /* 839 * XXX how do we handle teardown in the SYN_SENT state? 
840 * 841 */ 842 inp_lock_assert(tp->t_inpcb); 843 toep = tp->t_toe; 844 toep->tp_tp = NULL; 845 846 /* 847 * unhook from socket 848 */ 849 tp->t_flags &= ~TF_TOE; 850 tp->t_toe = NULL; 851} 852 853 854static struct toe_usrreqs cxgb_toe_usrreqs = { 855 .tu_disconnect = cxgb_toe_disconnect, 856 .tu_reset = cxgb_toe_reset, 857 .tu_send = cxgb_toe_send, 858 .tu_rcvd = cxgb_toe_rcvd, 859 .tu_detach = cxgb_toe_detach, 860 .tu_detach = cxgb_toe_detach, 861 .tu_syncache_event = handle_syncache_event, 862}; 863 864 865static void 866__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 867 uint64_t mask, uint64_t val, int no_reply) 868{ 869 struct cpl_set_tcb_field *req; 870 871 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 872 toep->tp_tid, word, mask, val); 873 874 req = mtod(m, struct cpl_set_tcb_field *); 875 m->m_pkthdr.len = m->m_len = sizeof(*req); 876 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 877 req->wr.wr_lo = 0; 878 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 879 req->reply = V_NO_REPLY(no_reply); 880 req->cpu_idx = 0; 881 req->word = htons(word); 882 req->mask = htobe64(mask); 883 req->val = htobe64(val); 884 885 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 886 send_or_defer(toep, m, 0); 887} 888 889static void 890t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 891{ 892 struct mbuf *m; 893 struct tcpcb *tp = toep->tp_tp; 894 895 if (toep == NULL) 896 return; 897 898 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 899 printf("not seting field\n"); 900 return; 901 } 902 903 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 904 905 __set_tcb_field(toep, m, word, mask, val, 1); 906} 907 908/* 909 * Set one of the t_flags bits in the TCB. 
910 */ 911static void 912set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) 913{ 914 915 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 916} 917 918/* 919 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 920 */ 921static void 922t3_set_nagle(struct toepcb *toep) 923{ 924 struct tcpcb *tp = toep->tp_tp; 925 926 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 927} 928 929/* 930 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 931 */ 932void 933t3_set_keepalive(struct toepcb *toep, int on_off) 934{ 935 936 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); 937} 938 939void 940t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) 941{ 942 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); 943} 944 945void 946t3_set_dack_mss(struct toepcb *toep, int on_off) 947{ 948 949 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); 950} 951 952/* 953 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 954 */ 955static void 956t3_set_tos(struct toepcb *toep) 957{ 958 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); 959 960 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 961 V_TCB_TOS(tos)); 962} 963 964 965/* 966 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 967 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 968 * set the PSH bit in the last segment, which would trigger delivery.] 969 * We work around the issue by setting a DDP buffer in a partial placed state, 970 * which guarantees that TP will schedule a timer. 
971 */ 972#define TP_DDP_TIMER_WORKAROUND_MASK\ 973 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ 974 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ 975 V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) 976#define TP_DDP_TIMER_WORKAROUND_VAL\ 977 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ 978 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ 979 32)) 980 981static void 982t3_enable_ddp(struct toepcb *toep, int on) 983{ 984 if (on) { 985 986 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), 987 V_TF_DDP_OFF(0)); 988 } else 989 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, 990 V_TF_DDP_OFF(1) | 991 TP_DDP_TIMER_WORKAROUND_MASK, 992 V_TF_DDP_OFF(1) | 993 TP_DDP_TIMER_WORKAROUND_VAL); 994 995} 996 997void 998t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color) 999{ 1000 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx, 1001 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), 1002 tag_color); 1003} 1004 1005void 1006t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, 1007 unsigned int len) 1008{ 1009 if (buf_idx == 0) 1010 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET, 1011 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 1012 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 1013 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | 1014 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 1015 else 1016 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET, 1017 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 1018 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), 1019 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | 1020 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); 1021} 1022 1023static int 1024t3_set_cong_control(struct socket *so, const char *name) 1025{ 1026#ifdef CONGESTION_CONTROL_SUPPORTED 1027 int cong_algo; 1028 1029 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) 1030 if (!strcmp(name, t3_cong_ops[cong_algo].name)) 1031 break; 1032 1033 if (cong_algo >= 
ARRAY_SIZE(t3_cong_ops)) 1034 return -EINVAL; 1035#endif 1036 return 0; 1037} 1038 1039int 1040t3_get_tcb(struct toepcb *toep) 1041{ 1042 struct cpl_get_tcb *req; 1043 struct tcpcb *tp = toep->tp_tp; 1044 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); 1045 1046 if (!m) 1047 return (ENOMEM); 1048 1049 inp_lock_assert(tp->t_inpcb); 1050 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 1051 req = mtod(m, struct cpl_get_tcb *); 1052 m->m_pkthdr.len = m->m_len = sizeof(*req); 1053 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 1054 req->wr.wr_lo = 0; 1055 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); 1056 req->cpuno = htons(toep->tp_qset); 1057 req->rsvd = 0; 1058 if (tp->t_state == TCPS_SYN_SENT) 1059 mbufq_tail(&toep->out_of_order_queue, m); // defer 1060 else 1061 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 1062 return 0; 1063} 1064 1065static inline void 1066so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid) 1067{ 1068 1069 toepcb_hold(toep); 1070 1071 cxgb_insert_tid(d->cdev, d->client, toep, tid); 1072} 1073 1074/** 1075 * find_best_mtu - find the entry in the MTU table closest to an MTU 1076 * @d: TOM state 1077 * @mtu: the target MTU 1078 * 1079 * Returns the index of the value in the MTU table that is closest to but 1080 * does not exceed the target MTU. 
1081 */ 1082static unsigned int 1083find_best_mtu(const struct t3c_data *d, unsigned short mtu) 1084{ 1085 int i = 0; 1086 1087 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 1088 ++i; 1089 return (i); 1090} 1091 1092static unsigned int 1093select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 1094{ 1095 unsigned int idx; 1096 1097#ifdef notyet 1098 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; 1099#endif 1100 if (tp) { 1101 tp->t_maxseg = pmtu - 40; 1102 if (tp->t_maxseg < td->mtus[0] - 40) 1103 tp->t_maxseg = td->mtus[0] - 40; 1104 idx = find_best_mtu(td, tp->t_maxseg + 40); 1105 1106 tp->t_maxseg = td->mtus[idx] - 40; 1107 } else 1108 idx = find_best_mtu(td, pmtu); 1109 1110 return (idx); 1111} 1112 1113static inline void 1114free_atid(struct t3cdev *cdev, unsigned int tid) 1115{ 1116 struct toepcb *toep = cxgb_free_atid(cdev, tid); 1117 1118 if (toep) 1119 toepcb_release(toep); 1120} 1121 1122/* 1123 * Release resources held by an offload connection (TID, L2T entry, etc.) 
1124 */ 1125static void 1126t3_release_offload_resources(struct toepcb *toep) 1127{ 1128 struct tcpcb *tp = toep->tp_tp; 1129 struct toedev *tdev = toep->tp_toedev; 1130 struct t3cdev *cdev; 1131 struct socket *so; 1132 unsigned int tid = toep->tp_tid; 1133 struct sockbuf *rcv; 1134 1135 CTR0(KTR_TOM, "t3_release_offload_resources"); 1136 1137 if (!tdev) 1138 return; 1139 1140 cdev = TOEP_T3C_DEV(toep); 1141 if (!cdev) 1142 return; 1143 1144 toep->tp_qset = 0; 1145 t3_release_ddp_resources(toep); 1146 1147#ifdef CTRL_SKB_CACHE 1148 kfree_skb(CTRL_SKB_CACHE(tp)); 1149 CTRL_SKB_CACHE(tp) = NULL; 1150#endif 1151 1152 if (toep->tp_wr_avail != toep->tp_wr_max) { 1153 purge_wr_queue(toep); 1154 reset_wr_list(toep); 1155 } 1156 1157 if (toep->tp_l2t) { 1158 l2t_release(L2DATA(cdev), toep->tp_l2t); 1159 toep->tp_l2t = NULL; 1160 } 1161 toep->tp_tp = NULL; 1162 if (tp) { 1163 inp_lock_assert(tp->t_inpcb); 1164 so = inp_inpcbtosocket(tp->t_inpcb); 1165 rcv = so_sockbuf_rcv(so); 1166 /* 1167 * cancel any offloaded reads 1168 * 1169 */ 1170 sockbuf_lock(rcv); 1171 tp->t_toe = NULL; 1172 tp->t_flags &= ~TF_TOE; 1173 if (toep->tp_ddp_state.user_ddp_pending) { 1174 t3_cancel_ubuf(toep, rcv); 1175 toep->tp_ddp_state.user_ddp_pending = 0; 1176 } 1177 so_sorwakeup_locked(so); 1178 1179 } 1180 1181 if (toep->tp_state == TCPS_SYN_SENT) { 1182 free_atid(cdev, tid); 1183#ifdef notyet 1184 __skb_queue_purge(&tp->out_of_order_queue); 1185#endif 1186 } else { // we have TID 1187 cxgb_remove_tid(cdev, toep, tid); 1188 toepcb_release(toep); 1189 } 1190#if 0 1191 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state); 1192#endif 1193} 1194 1195static void 1196install_offload_ops(struct socket *so) 1197{ 1198 struct tcpcb *tp = so_sototcpcb(so); 1199 1200 KASSERT(tp->t_toe != NULL, ("toepcb not set")); 1201 1202 t3_install_socket_ops(so); 1203 tp->t_flags |= TF_TOE; 1204 tp->t_tu = &cxgb_toe_usrreqs; 1205} 1206 1207/* 1208 * Determine the receive window scaling factor given a target max 
1209 * receive window. 1210 */ 1211static __inline int 1212select_rcv_wscale(int space) 1213{ 1214 int wscale = 0; 1215 1216 if (space > MAX_RCV_WND) 1217 space = MAX_RCV_WND; 1218 1219 if (V_tcp_do_rfc1323) 1220 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; 1221 1222 return (wscale); 1223} 1224 1225/* 1226 * Determine the receive window size for a socket. 1227 */ 1228static unsigned long 1229select_rcv_wnd(struct toedev *dev, struct socket *so) 1230{ 1231 struct tom_data *d = TOM_DATA(dev); 1232 unsigned int wnd; 1233 unsigned int max_rcv_wnd; 1234 struct sockbuf *rcv; 1235 1236 rcv = so_sockbuf_rcv(so); 1237 1238 if (V_tcp_do_autorcvbuf) 1239 wnd = V_tcp_autorcvbuf_max; 1240 else 1241 wnd = rcv->sb_hiwat; 1242 1243 1244 1245 /* XXX 1246 * For receive coalescing to work effectively we need a receive window 1247 * that can accomodate a coalesced segment. 1248 */ 1249 if (wnd < MIN_RCV_WND) 1250 wnd = MIN_RCV_WND; 1251 1252 /* PR 5138 */ 1253 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 1254 (uint32_t)d->rx_page_size * 23 : 1255 MAX_RCV_WND); 1256 1257 return min(wnd, max_rcv_wnd); 1258} 1259 1260/* 1261 * Assign offload parameters to some socket fields. This code is used by 1262 * both active and passive opens. 
1263 */ 1264static inline void 1265init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, 1266 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep) 1267{ 1268 struct tcpcb *tp = so_sototcpcb(so); 1269 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev); 1270 struct sockbuf *snd, *rcv; 1271 1272#ifdef notyet 1273 SOCK_LOCK_ASSERT(so); 1274#endif 1275 1276 snd = so_sockbuf_snd(so); 1277 rcv = so_sockbuf_rcv(so); 1278 1279 log(LOG_INFO, "initializing offload socket\n"); 1280 /* 1281 * We either need to fix push frames to work with sbcompress 1282 * or we need to add this 1283 */ 1284 snd->sb_flags |= SB_NOCOALESCE; 1285 rcv->sb_flags |= SB_NOCOALESCE; 1286 1287 tp->t_toe = toep; 1288 toep->tp_tp = tp; 1289 toep->tp_toedev = dev; 1290 1291 toep->tp_tid = tid; 1292 toep->tp_l2t = e; 1293 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs); 1294 toep->tp_wr_unacked = 0; 1295 toep->tp_delack_mode = 0; 1296 1297 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu); 1298 /* 1299 * XXX broken 1300 * 1301 */ 1302 tp->rcv_wnd = select_rcv_wnd(dev, so); 1303 1304 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) && 1305 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; 1306 toep->tp_qset_idx = 0; 1307 1308 reset_wr_list(toep); 1309 DPRINTF("initialization done\n"); 1310} 1311 1312/* 1313 * The next two functions calculate the option 0 value for a socket. 
1314 */ 1315static inline unsigned int 1316calc_opt0h(struct socket *so, int mtu_idx) 1317{ 1318 struct tcpcb *tp = so_sototcpcb(so); 1319 int wscale = select_rcv_wscale(tp->rcv_wnd); 1320 1321 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | 1322 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | 1323 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx); 1324} 1325 1326static inline unsigned int 1327calc_opt0l(struct socket *so, int ulp_mode) 1328{ 1329 struct tcpcb *tp = so_sototcpcb(so); 1330 unsigned int val; 1331 1332 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) | 1333 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); 1334 1335 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val); 1336 return (val); 1337} 1338 1339static inline unsigned int 1340calc_opt2(const struct socket *so, struct toedev *dev) 1341{ 1342 int flv_valid; 1343 1344 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); 1345 1346 return (V_FLAVORS_VALID(flv_valid) | 1347 V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0)); 1348} 1349 1350#if DEBUG_WR > 1 1351static int 1352count_pending_wrs(const struct toepcb *toep) 1353{ 1354 const struct mbuf *m; 1355 int n = 0; 1356 1357 wr_queue_walk(toep, m) 1358 n += m->m_pkthdr.csum_data; 1359 return (n); 1360} 1361#endif 1362 1363#if 0 1364(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) 1365#endif 1366 1367static void 1368mk_act_open_req(struct socket *so, struct mbuf *m, 1369 unsigned int atid, const struct l2t_entry *e) 1370{ 1371 struct cpl_act_open_req *req; 1372 struct inpcb *inp = so_sotoinpcb(so); 1373 struct tcpcb *tp = inp_inpcbtotcpcb(inp); 1374 struct toepcb *toep = tp->t_toe; 1375 struct toedev *tdev = toep->tp_toedev; 1376 1377 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep)); 1378 1379 req = mtod(m, struct cpl_act_open_req *); 1380 m->m_pkthdr.len = m->m_len = sizeof(*req); 1381 1382 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 1383 req->wr.wr_lo = 0; 1384 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); 1385 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port); 1386#if 0 1387 req->local_port = inp->inp_lport; 1388 req->peer_port = inp->inp_fport; 1389 memcpy(&req->local_ip, &inp->inp_laddr, 4); 1390 memcpy(&req->peer_ip, &inp->inp_faddr, 4); 1391#endif 1392 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | 1393 V_TX_CHANNEL(e->smt_idx)); 1394 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); 1395 req->params = 0; 1396 req->opt2 = htonl(calc_opt2(so, tdev)); 1397} 1398 1399 1400/* 1401 * Convert an ACT_OPEN_RPL status to an errno. 
1402 */ 1403static int 1404act_open_rpl_status_to_errno(int status) 1405{ 1406 switch (status) { 1407 case CPL_ERR_CONN_RESET: 1408 return (ECONNREFUSED); 1409 case CPL_ERR_ARP_MISS: 1410 return (EHOSTUNREACH); 1411 case CPL_ERR_CONN_TIMEDOUT: 1412 return (ETIMEDOUT); 1413 case CPL_ERR_TCAM_FULL: 1414 return (ENOMEM); 1415 case CPL_ERR_CONN_EXIST: 1416 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); 1417 return (EADDRINUSE); 1418 default: 1419 return (EIO); 1420 } 1421} 1422 1423static void 1424fail_act_open(struct toepcb *toep, int errno) 1425{ 1426 struct tcpcb *tp = toep->tp_tp; 1427 1428 t3_release_offload_resources(toep); 1429 if (tp) { 1430 inp_wunlock(tp->t_inpcb); 1431 tcp_offload_drop(tp, errno); 1432 } 1433 1434#ifdef notyet 1435 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 1436#endif 1437} 1438 1439/* 1440 * Handle active open failures. 1441 */ 1442static void 1443active_open_failed(struct toepcb *toep, struct mbuf *m) 1444{ 1445 struct cpl_act_open_rpl *rpl = cplhdr(m); 1446 struct inpcb *inp; 1447 1448 if (toep->tp_tp == NULL) 1449 goto done; 1450 1451 inp = toep->tp_tp->t_inpcb; 1452 1453/* 1454 * Don't handle connection retry for now 1455 */ 1456#ifdef notyet 1457 struct inet_connection_sock *icsk = inet_csk(sk); 1458 1459 if (rpl->status == CPL_ERR_CONN_EXIST && 1460 icsk->icsk_retransmit_timer.function != act_open_retry_timer) { 1461 icsk->icsk_retransmit_timer.function = act_open_retry_timer; 1462 sk_reset_timer(so, &icsk->icsk_retransmit_timer, 1463 jiffies + HZ / 2); 1464 } else 1465#endif 1466 { 1467 inp_wlock(inp); 1468 /* 1469 * drops the inpcb lock 1470 */ 1471 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status)); 1472 } 1473 1474 done: 1475 m_free(m); 1476} 1477 1478/* 1479 * Return whether a failed active open has allocated a TID 1480 */ 1481static inline int 1482act_open_has_tid(int status) 1483{ 1484 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && 1485 status != CPL_ERR_ARP_MISS; 1486} 1487 1488/* 1489 * 
Process an ACT_OPEN_RPL CPL message. 1490 */ 1491static int 1492do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 1493{ 1494 struct toepcb *toep = (struct toepcb *)ctx; 1495 struct cpl_act_open_rpl *rpl = cplhdr(m); 1496 1497 if (cdev->type != T3A && act_open_has_tid(rpl->status)) 1498 cxgb_queue_tid_release(cdev, GET_TID(rpl)); 1499 1500 active_open_failed(toep, m); 1501 return (0); 1502} 1503 1504/* 1505 * Handle an ARP failure for an active open. XXX purge ofo queue 1506 * 1507 * XXX badly broken for crossed SYNs as the ATID is no longer valid. 1508 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should 1509 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't 1510 * free the atid. Hmm. 1511 */ 1512#ifdef notyet 1513static void 1514act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m) 1515{ 1516 struct toepcb *toep = m_get_toep(m); 1517 struct tcpcb *tp = toep->tp_tp; 1518 struct inpcb *inp = tp->t_inpcb; 1519 struct socket *so; 1520 1521 inp_wlock(inp); 1522 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) { 1523 /* 1524 * drops the inpcb lock 1525 */ 1526 fail_act_open(so, EHOSTUNREACH); 1527 printf("freeing %p\n", m); 1528 1529 m_free(m); 1530 } else 1531 inp_wunlock(inp); 1532} 1533#endif 1534/* 1535 * Send an active open request. 
1536 */ 1537int 1538t3_connect(struct toedev *tdev, struct socket *so, 1539 struct rtentry *rt, struct sockaddr *nam) 1540{ 1541 struct mbuf *m; 1542 struct l2t_entry *e; 1543 struct tom_data *d = TOM_DATA(tdev); 1544 struct inpcb *inp = so_sotoinpcb(so); 1545 struct tcpcb *tp = intotcpcb(inp); 1546 struct toepcb *toep; /* allocated by init_offload_socket */ 1547 1548 int atid; 1549 1550 toep = toepcb_alloc(); 1551 if (toep == NULL) 1552 goto out_err; 1553 1554 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1555 goto out_err; 1556 1557 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1558 if (!e) 1559 goto free_tid; 1560 1561 inp_lock_assert(inp); 1562 m = m_gethdr(MT_DATA, M_WAITOK); 1563 1564#if 0 1565 m->m_toe.mt_toepcb = tp->t_toe; 1566 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1567#endif 1568 so_lock(so); 1569 1570 init_offload_socket(so, tdev, atid, e, rt, toep); 1571 1572 install_offload_ops(so); 1573 1574 mk_act_open_req(so, m, atid, e); 1575 so_unlock(so); 1576 1577 soisconnecting(so); 1578 toep = tp->t_toe; 1579 m_set_toep(m, tp->t_toe); 1580 1581 toep->tp_state = TCPS_SYN_SENT; 1582 l2t_send(d->cdev, (struct mbuf *)m, e); 1583 1584 if (toep->tp_ulp_mode) 1585 t3_enable_ddp(toep, 0); 1586 return (0); 1587 1588free_tid: 1589 printf("failing connect - free atid\n"); 1590 1591 free_atid(d->cdev, atid); 1592out_err: 1593 printf("return ENOMEM\n"); 1594 return (ENOMEM); 1595} 1596 1597/* 1598 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1599 * not send multiple ABORT_REQs for the same connection and also that we do 1600 * not try to send a message after the connection has closed. Returns 1 if 1601 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1602 */ 1603static void 1604t3_send_reset(struct toepcb *toep) 1605{ 1606 1607 struct cpl_abort_req *req; 1608 unsigned int tid = toep->tp_tid; 1609 int mode = CPL_ABORT_SEND_RST; 1610 struct tcpcb *tp = toep->tp_tp; 1611 struct toedev *tdev = toep->tp_toedev; 1612 struct socket *so = NULL; 1613 struct mbuf *m; 1614 struct sockbuf *snd; 1615 1616 if (tp) { 1617 inp_lock_assert(tp->t_inpcb); 1618 so = inp_inpcbtosocket(tp->t_inpcb); 1619 } 1620 1621 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1622 tdev == NULL)) 1623 return; 1624 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1625 1626 snd = so_sockbuf_snd(so); 1627 /* Purge the send queue so we don't send anything after an abort. */ 1628 if (so) 1629 sbflush(snd); 1630 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1631 mode |= CPL_ABORT_POST_CLOSE_REQ; 1632 1633 m = m_gethdr_nofail(sizeof(*req)); 1634 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1635 set_arp_failure_handler(m, abort_arp_failure); 1636 1637 req = mtod(m, struct cpl_abort_req *); 1638 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1639 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1640 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1641 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1642 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1643 req->cmd = mode; 1644 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1645 mbufq_tail(&toep->out_of_order_queue, m); // defer 1646 else 1647 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1648} 1649 1650static int 1651t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1652{ 1653 struct inpcb *inp; 1654 int error, optval; 1655 1656 if (sopt->sopt_name == IP_OPTIONS) 1657 return (ENOPROTOOPT); 1658 1659 if (sopt->sopt_name != IP_TOS) 1660 return (EOPNOTSUPP); 1661 1662 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1663 1664 if (error) 1665 return (error); 1666 1667 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) 1668 return (EPERM); 1669 1670 inp = so_sotoinpcb(so); 1671 inp_wlock(inp); 1672 inp_ip_tos_set(inp, optval); 1673#if 0 1674 inp->inp_ip_tos = optval; 1675#endif 1676 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1677 inp_wunlock(inp); 1678 1679 return (0); 1680} 1681 1682static int 1683t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1684{ 1685 int err = 0; 1686 size_t copied; 1687 1688 if (sopt->sopt_name != TCP_CONGESTION && 1689 sopt->sopt_name != TCP_NODELAY) 1690 return (EOPNOTSUPP); 1691 1692 if (sopt->sopt_name == TCP_CONGESTION) { 1693 char name[TCP_CA_NAME_MAX]; 1694 int optlen = sopt->sopt_valsize; 1695 struct tcpcb *tp; 1696 1697 if (sopt->sopt_dir == SOPT_GET) { 1698 KASSERT(0, ("unimplemented")); 1699 return (EOPNOTSUPP); 1700 } 1701 1702 if (optlen < 1) 1703 return (EINVAL); 1704 1705 err = copyinstr(sopt->sopt_val, name, 1706 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1707 if (err) 1708 return (err); 1709 if (copied < 1) 1710 return (EINVAL); 1711 1712 tp = so_sototcpcb(so); 1713 /* 1714 * XXX I need to revisit this 1715 */ 1716 if ((err = t3_set_cong_control(so, name)) == 0) { 1717#ifdef CONGESTION_CONTROL_SUPPORTED 1718 tp->t_cong_control = strdup(name, M_CXGB); 1719#endif 1720 } else 1721 return (err); 1722 } else { 1723 int 
optval, oldval; 1724 struct inpcb *inp; 1725 struct tcpcb *tp; 1726 1727 if (sopt->sopt_dir == SOPT_GET) 1728 return (EOPNOTSUPP); 1729 1730 err = sooptcopyin(sopt, &optval, sizeof optval, 1731 sizeof optval); 1732 1733 if (err) 1734 return (err); 1735 1736 inp = so_sotoinpcb(so); 1737 tp = inp_inpcbtotcpcb(inp); 1738 1739 inp_wlock(inp); 1740 1741 oldval = tp->t_flags; 1742 if (optval) 1743 tp->t_flags |= TF_NODELAY; 1744 else 1745 tp->t_flags &= ~TF_NODELAY; 1746 inp_wunlock(inp); 1747 1748 1749 if (oldval != tp->t_flags && (tp->t_toe != NULL)) 1750 t3_set_nagle(tp->t_toe); 1751 1752 } 1753 1754 return (0); 1755} 1756 1757int 1758t3_ctloutput(struct socket *so, struct sockopt *sopt) 1759{ 1760 int err; 1761 1762 if (sopt->sopt_level != IPPROTO_TCP) 1763 err = t3_ip_ctloutput(so, sopt); 1764 else 1765 err = t3_tcp_ctloutput(so, sopt); 1766 1767 if (err != EOPNOTSUPP) 1768 return (err); 1769 1770 return (tcp_ctloutput(so, sopt)); 1771} 1772 1773/* 1774 * Returns true if we need to explicitly request RST when we receive new data 1775 * on an RX-closed connection. 1776 */ 1777static inline int 1778need_rst_on_excess_rx(const struct toepcb *toep) 1779{ 1780 return (1); 1781} 1782 1783/* 1784 * Handles Rx data that arrives in a state where the socket isn't accepting 1785 * new data. 1786 */ 1787static void 1788handle_excess_rx(struct toepcb *toep, struct mbuf *m) 1789{ 1790 1791 if (need_rst_on_excess_rx(toep) && 1792 !(toep->tp_flags & TP_ABORT_SHUTDOWN)) 1793 t3_send_reset(toep); 1794 m_freem(m); 1795} 1796 1797/* 1798 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) 1799 * by getting the DDP offset from the TCB. 
1800 */ 1801static void 1802tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m) 1803{ 1804 struct ddp_state *q = &toep->tp_ddp_state; 1805 struct ddp_buf_state *bsp; 1806 struct cpl_get_tcb_rpl *hdr; 1807 unsigned int ddp_offset; 1808 struct socket *so; 1809 struct tcpcb *tp; 1810 struct sockbuf *rcv; 1811 int state; 1812 1813 uint64_t t; 1814 __be64 *tcb; 1815 1816 tp = toep->tp_tp; 1817 so = inp_inpcbtosocket(tp->t_inpcb); 1818 1819 inp_lock_assert(tp->t_inpcb); 1820 rcv = so_sockbuf_rcv(so); 1821 sockbuf_lock(rcv); 1822 1823 /* Note that we only accout for CPL_GET_TCB issued by the DDP code. 1824 * We really need a cookie in order to dispatch the RPLs. 1825 */ 1826 q->get_tcb_count--; 1827 1828 /* It is a possible that a previous CPL already invalidated UBUF DDP 1829 * and moved the cur_buf idx and hence no further processing of this 1830 * skb is required. However, the app might be sleeping on 1831 * !q->get_tcb_count and we need to wake it up. 1832 */ 1833 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) { 1834 int state = so_state_get(so); 1835 1836 m_freem(m); 1837 if (__predict_true((state & SS_NOFDREF) == 0)) 1838 so_sorwakeup_locked(so); 1839 else 1840 sockbuf_unlock(rcv); 1841 1842 return; 1843 } 1844 1845 bsp = &q->buf_state[q->cur_buf]; 1846 hdr = cplhdr(m); 1847 tcb = (__be64 *)(hdr + 1); 1848 if (q->cur_buf == 0) { 1849 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); 1850 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); 1851 } else { 1852 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); 1853 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; 1854 } 1855 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; 1856 m->m_cur_offset = bsp->cur_offset; 1857 bsp->cur_offset = ddp_offset; 1858 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset; 1859 1860 CTR5(KTR_TOM, 1861 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u", 1862 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset); 1863 KASSERT(ddp_offset 
>= m->m_cur_offset, 1864 ("ddp_offset=%u less than cur_offset=%u", 1865 ddp_offset, m->m_cur_offset)); 1866 1867#if 0 1868{ 1869 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; 1870 1871 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); 1872 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; 1873 1874 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]); 1875 rcv_nxt = t >> S_TCB_RCV_NXT; 1876 rcv_nxt &= M_TCB_RCV_NXT; 1877 1878 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); 1879 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); 1880 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; 1881 1882 T3_TRACE2(TIDTB(sk), 1883 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", 1884 ddp_flags, rcv_nxt - rx_hdr_offset); 1885 T3_TRACE4(TB(q), 1886 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", 1887 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); 1888 T3_TRACE3(TB(q), 1889 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u", 1890 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); 1891 T3_TRACE2(TB(q), 1892 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", 1893 q->buf_state[0].flags, q->buf_state[1].flags); 1894 1895} 1896#endif 1897 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) { 1898 handle_excess_rx(toep, m); 1899 return; 1900 } 1901 1902#ifdef T3_TRACE 1903 if ((int)m->m_pkthdr.len < 0) { 1904 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len"); 1905 } 1906#endif 1907 if (bsp->flags & DDP_BF_NOCOPY) { 1908#ifdef T3_TRACE 1909 T3_TRACE0(TB(q), 1910 "tcb_rpl_as_ddp_complete: CANCEL UBUF"); 1911 1912 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { 1913 printk("!cancel_ubuf"); 1914 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); 1915 } 1916#endif 1917 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; 1918 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); 1919 q->cur_buf ^= 1; 1920 } else if (bsp->flags & DDP_BF_NOFLIP) { 1921 1922 m->m_ddp_flags = 1; /* always a kernel buffer */ 1923 1924 /* 
now HW buffer carries a user buffer */ 1925 bsp->flags &= ~DDP_BF_NOFLIP; 1926 bsp->flags |= DDP_BF_NOCOPY; 1927 1928 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate 1929 * any new data in which case we're done. If in addition the 1930 * offset is 0, then there wasn't a completion for the kbuf 1931 * and we need to decrement the posted count. 1932 */ 1933 if (m->m_pkthdr.len == 0) { 1934 if (ddp_offset == 0) { 1935 q->kbuf_posted--; 1936 bsp->flags |= DDP_BF_NODATA; 1937 } 1938 sockbuf_unlock(rcv); 1939 m_free(m); 1940 return; 1941 } 1942 } else { 1943 sockbuf_unlock(rcv); 1944 1945 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, 1946 * but it got here way late and nobody cares anymore. 1947 */ 1948 m_free(m); 1949 return; 1950 } 1951 1952 m->m_ddp_gl = (unsigned char *)bsp->gl; 1953 m->m_flags |= M_DDP; 1954 m->m_seq = tp->rcv_nxt; 1955 tp->rcv_nxt += m->m_pkthdr.len; 1956 tp->t_rcvtime = ticks; 1957 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", 1958 m->m_seq, q->cur_buf, m->m_pkthdr.len); 1959 if (m->m_pkthdr.len == 0) { 1960 q->user_ddp_pending = 0; 1961 m_free(m); 1962 } else 1963 SBAPPEND(rcv, m); 1964 1965 state = so_state_get(so); 1966 if (__predict_true((state & SS_NOFDREF) == 0)) 1967 so_sorwakeup_locked(so); 1968 else 1969 sockbuf_unlock(rcv); 1970} 1971 1972/* 1973 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, 1974 * in that case they are similar to DDP completions. 
1975 */ 1976static int 1977do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 1978{ 1979 struct toepcb *toep = (struct toepcb *)ctx; 1980 1981 /* OK if socket doesn't exist */ 1982 if (toep == NULL) { 1983 printf("null toep in do_get_tcb_rpl\n"); 1984 return (CPL_RET_BUF_DONE); 1985 } 1986 1987 inp_wlock(toep->tp_tp->t_inpcb); 1988 tcb_rpl_as_ddp_complete(toep, m); 1989 inp_wunlock(toep->tp_tp->t_inpcb); 1990 1991 return (0); 1992} 1993 1994static void 1995handle_ddp_data(struct toepcb *toep, struct mbuf *m) 1996{ 1997 struct tcpcb *tp = toep->tp_tp; 1998 struct socket *so; 1999 struct ddp_state *q; 2000 struct ddp_buf_state *bsp; 2001 struct cpl_rx_data *hdr = cplhdr(m); 2002 unsigned int rcv_nxt = ntohl(hdr->seq); 2003 struct sockbuf *rcv; 2004 2005 if (tp->rcv_nxt == rcv_nxt) 2006 return; 2007 2008 inp_lock_assert(tp->t_inpcb); 2009 so = inp_inpcbtosocket(tp->t_inpcb); 2010 rcv = so_sockbuf_rcv(so); 2011 sockbuf_lock(rcv); 2012 2013 q = &toep->tp_ddp_state; 2014 bsp = &q->buf_state[q->cur_buf]; 2015 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", 2016 rcv_nxt, tp->rcv_nxt)); 2017 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; 2018 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2019 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", 2020 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); 2021 2022#ifdef T3_TRACE 2023 if ((int)m->m_pkthdr.len < 0) { 2024 t3_ddp_error(so, "handle_ddp_data: neg len"); 2025 } 2026#endif 2027 m->m_ddp_gl = (unsigned char *)bsp->gl; 2028 m->m_flags |= M_DDP; 2029 m->m_cur_offset = bsp->cur_offset; 2030 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; 2031 if (bsp->flags & DDP_BF_NOCOPY) 2032 bsp->flags &= ~DDP_BF_NOCOPY; 2033 2034 m->m_seq = tp->rcv_nxt; 2035 tp->rcv_nxt = rcv_nxt; 2036 bsp->cur_offset += m->m_pkthdr.len; 2037 if (!(bsp->flags & DDP_BF_NOFLIP)) 2038 q->cur_buf ^= 1; 2039 /* 2040 * For now, don't re-enable DDP after a connection fell out of 
DDP 2041 * mode. 2042 */ 2043 q->ubuf_ddp_ready = 0; 2044 sockbuf_unlock(rcv); 2045} 2046 2047/* 2048 * Process new data received for a connection. 2049 */ 2050static void 2051new_rx_data(struct toepcb *toep, struct mbuf *m) 2052{ 2053 struct cpl_rx_data *hdr = cplhdr(m); 2054 struct tcpcb *tp = toep->tp_tp; 2055 struct socket *so; 2056 struct sockbuf *rcv; 2057 int state; 2058 int len = be16toh(hdr->len); 2059 2060 inp_wlock(tp->t_inpcb); 2061 2062 so = inp_inpcbtosocket(tp->t_inpcb); 2063 2064 if (__predict_false(so_no_receive(so))) { 2065 handle_excess_rx(toep, m); 2066 inp_wunlock(tp->t_inpcb); 2067 TRACE_EXIT; 2068 return; 2069 } 2070 2071 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) 2072 handle_ddp_data(toep, m); 2073 2074 m->m_seq = ntohl(hdr->seq); 2075 m->m_ulp_mode = 0; /* for iSCSI */ 2076 2077#if VALIDATE_SEQ 2078 if (__predict_false(m->m_seq != tp->rcv_nxt)) { 2079 log(LOG_ERR, 2080 "%s: TID %u: Bad sequence number %u, expected %u\n", 2081 toep->tp_toedev->name, toep->tp_tid, m->m_seq, 2082 tp->rcv_nxt); 2083 m_freem(m); 2084 inp_wunlock(tp->t_inpcb); 2085 return; 2086 } 2087#endif 2088 m_adj(m, sizeof(*hdr)); 2089 2090#ifdef URGENT_DATA_SUPPORTED 2091 /* 2092 * We don't handle urgent data yet 2093 */ 2094 if (__predict_false(hdr->urg)) 2095 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); 2096 if (__predict_false(tp->urg_data == TCP_URG_NOTYET && 2097 tp->urg_seq - tp->rcv_nxt < skb->len)) 2098 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - 2099 tp->rcv_nxt]; 2100#endif 2101 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { 2102 toep->tp_delack_mode = hdr->dack_mode; 2103 toep->tp_delack_seq = tp->rcv_nxt; 2104 } 2105 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", 2106 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); 2107 2108 if (len < m->m_pkthdr.len) 2109 m->m_pkthdr.len = m->m_len = len; 2110 2111 tp->rcv_nxt += m->m_pkthdr.len; 2112 tp->t_rcvtime = ticks; 
2113 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2114 CTR2(KTR_TOM, 2115 "new_rx_data: seq 0x%x len %u", 2116 m->m_seq, m->m_pkthdr.len); 2117 inp_wunlock(tp->t_inpcb); 2118 rcv = so_sockbuf_rcv(so); 2119 sockbuf_lock(rcv); 2120#if 0 2121 if (sb_notify(rcv)) 2122 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); 2123#endif 2124 SBAPPEND(rcv, m); 2125 2126#ifdef notyet 2127 /* 2128 * We're giving too many credits to the card - but disable this check so we can keep on moving :-| 2129 * 2130 */ 2131 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), 2132 2133 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", 2134 so, rcv->sb_cc, rcv->sb_mbmax)); 2135#endif 2136 2137 2138 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", 2139 rcv->sb_cc, rcv->sb_mbcnt); 2140 2141 state = so_state_get(so); 2142 if (__predict_true((state & SS_NOFDREF) == 0)) 2143 so_sorwakeup_locked(so); 2144 else 2145 sockbuf_unlock(rcv); 2146} 2147 2148/* 2149 * Handler for RX_DATA CPL messages. 2150 */ 2151static int 2152do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2153{ 2154 struct toepcb *toep = (struct toepcb *)ctx; 2155 2156 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); 2157 2158 new_rx_data(toep, m); 2159 2160 return (0); 2161} 2162 2163static void 2164new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) 2165{ 2166 struct tcpcb *tp; 2167 struct ddp_state *q; 2168 struct ddp_buf_state *bsp; 2169 struct cpl_rx_data_ddp *hdr; 2170 struct socket *so; 2171 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; 2172 int nomoredata = 0; 2173 unsigned int delack_mode; 2174 struct sockbuf *rcv; 2175 2176 tp = toep->tp_tp; 2177 inp_wlock(tp->t_inpcb); 2178 so = inp_inpcbtosocket(tp->t_inpcb); 2179 2180 if (__predict_false(so_no_receive(so))) { 2181 2182 handle_excess_rx(toep, m); 2183 inp_wunlock(tp->t_inpcb); 2184 return; 2185 } 2186 2187 q = &toep->tp_ddp_state; 2188 hdr = cplhdr(m); 2189 ddp_report = ntohl(hdr->u.ddp_report); 2190 buf_idx = (ddp_report >> 
S_DDP_BUF_IDX) & 1; 2191 bsp = &q->buf_state[buf_idx]; 2192 2193 CTR4(KTR_TOM, 2194 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " 2195 "hdr seq 0x%x len %u", 2196 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), 2197 ntohs(hdr->len)); 2198 CTR3(KTR_TOM, 2199 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", 2200 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); 2201 2202 ddp_len = ntohs(hdr->len); 2203 rcv_nxt = ntohl(hdr->seq) + ddp_len; 2204 2205 delack_mode = G_DDP_DACK_MODE(ddp_report); 2206 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { 2207 toep->tp_delack_mode = delack_mode; 2208 toep->tp_delack_seq = tp->rcv_nxt; 2209 } 2210 2211 m->m_seq = tp->rcv_nxt; 2212 tp->rcv_nxt = rcv_nxt; 2213 2214 tp->t_rcvtime = ticks; 2215 /* 2216 * Store the length in m->m_len. We are changing the meaning of 2217 * m->m_len here, we need to be very careful that nothing from now on 2218 * interprets ->len of this packet the usual way. 2219 */ 2220 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; 2221 inp_wunlock(tp->t_inpcb); 2222 CTR3(KTR_TOM, 2223 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", 2224 m->m_len, rcv_nxt, m->m_seq); 2225 /* 2226 * Figure out where the new data was placed in the buffer and store it 2227 * in when. Assumes the buffer offset starts at 0, consumer needs to 2228 * account for page pod's pg_offset. 
2229 */ 2230 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; 2231 m->m_cur_offset = end_offset - m->m_pkthdr.len; 2232 2233 rcv = so_sockbuf_rcv(so); 2234 sockbuf_lock(rcv); 2235 2236 m->m_ddp_gl = (unsigned char *)bsp->gl; 2237 m->m_flags |= M_DDP; 2238 bsp->cur_offset = end_offset; 2239 toep->tp_enqueued_bytes += m->m_pkthdr.len; 2240 2241 /* 2242 * Length is only meaningful for kbuf 2243 */ 2244 if (!(bsp->flags & DDP_BF_NOCOPY)) 2245 KASSERT(m->m_len <= bsp->gl->dgl_length, 2246 ("length received exceeds ddp pages: len=%d dgl_length=%d", 2247 m->m_len, bsp->gl->dgl_length)); 2248 2249 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2250 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); 2251 /* 2252 * Bit 0 of flags stores whether the DDP buffer is completed. 2253 * Note that other parts of the code depend on this being in bit 0. 2254 */ 2255 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { 2256 panic("spurious ddp completion"); 2257 } else { 2258 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); 2259 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) 2260 q->cur_buf ^= 1; /* flip buffers */ 2261 } 2262 2263 if (bsp->flags & DDP_BF_NOCOPY) { 2264 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); 2265 bsp->flags &= ~DDP_BF_NOCOPY; 2266 } 2267 2268 if (ddp_report & F_DDP_PSH) 2269 m->m_ddp_flags |= DDP_BF_PSH; 2270 if (nomoredata) 2271 m->m_ddp_flags |= DDP_BF_NODATA; 2272 2273#ifdef notyet 2274 skb_reset_transport_header(skb); 2275 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ 2276#endif 2277 SBAPPEND(rcv, m); 2278 2279 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || 2280 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) 2281 || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) 2282 so_sorwakeup_locked(so); 2283 else 2284 sockbuf_unlock(rcv); 2285} 2286 2287#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ 2288 F_DDP_PPOD_PARITY_ERR | 
 F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	    F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	    F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 *
 * Validates the DDP status bits and hands the mbuf to new_rx_data_ddp().
 * Returns CPL_RET_BUF_DONE on a reported DDP error so the caller frees the
 * buffer; returns 0 when new_rx_data_ddp() has consumed it.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	/* Any of the DDP_ERR bits set means the placement failed in HW. */
	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(toep, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE CPL: HW has finished placing data into the
 * current DDP buffer.  Computes how many bytes were placed since the last
 * report (bsp->cur_offset), advances rcv_nxt by that amount, tags the mbuf
 * as a zero-copy DDP completion and appends it to the receive sockbuf.
 *
 * Locking: takes the inpcb write lock, drops it before the sockbuf is
 * appended; so_sorwakeup_locked()/sockbuf_unlock() release the sockbuf lock.
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when, delack_mode;
	int nomoredata = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		struct inpcb *inp = so_sotoinpcb(so);

		/* Receive side already shut down; count and drop the data. */
		handle_excess_rx(toep, m);
		inp_wunlock(inp);
		return;
	}
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	m->m_pkthdr.csum_data = tp->rcv_nxt;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[buf_idx];
	when = bsp->cur_offset;
	/* Bytes newly placed = current HW offset - last offset we consumed. */
	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	tp->rcv_nxt += m->m_len;
	tp->t_rcvtime = ticks;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	inp_wunlock(tp->t_inpcb);

	KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR5(KTR_TOM,
	    "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
	    "ddp_report 0x%x offset %u, len %u",
	    tp->rcv_nxt, bsp->cur_offset, ddp_report,
	    G_DDP_OFFSET(ddp_report), m->m_len);

	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP)) {
		q->cur_buf ^= 1;                     /* flip buffers */
		/*
		 * Completion before the kernel buffer filled up implies the
		 * peer has no more data for this buffer right now.
		 */
		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata=1;
	}

	CTR4(KTR_TOM,
	    "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
	    "ddp_report %u offset %u",
	    tp->rcv_nxt, bsp->cur_offset, ddp_report,
	    G_DDP_OFFSET(ddp_report));

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	/* Bit 0 = "buffer complete"; other code relies on it being bit 0. */
	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

	SBAPPEND(rcv, m);
	if ((so_state_get(so) & SS_NOFDREF) == 0)
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	inp_wlock(tp->t_inpcb);
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	inp_wunlock(tp->t_inpcb);
	tcp_offload_twstart(tp);
}

/*
 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
 * function deals with the data that may be reported along with the FIN.
 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
 * skb can be freed.
 *
 * NOTE(review): caller must hold the inpcb lock (asserted below).
 */
static int
handle_peer_close_data(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_peer_close *req = cplhdr(m);
	unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
	struct sockbuf *rcv;

	if (tp->rcv_nxt == rcv_nxt)			/* no data */
		return (0);

	CTR0(KTR_TOM, "handle_peer_close_data");
	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);

		/*
		 * Although we discard the data we want to process the FIN so
		 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
		 * PEER_CLOSE without data.  In particular this PEER_CLOSE
		 * may be what will close the connection.  We return 1 because
		 * handle_excess_rx() already freed the packet.
		 */
		return (1);
	}

	inp_lock_assert(tp->t_inpcb);
	q = &toep->tp_ddp_state;
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[q->cur_buf];
	/* Data carried with the FIN: everything up to (but excluding) it. */
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags =
	    DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	tp->t_rcvtime = ticks;
	SBAPPEND(rcv, m);
	if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);

	return (1);
}

/*
 * Handle a peer FIN.
 *
 * Runs the FIN-side of the TCP state machine for an offloaded connection:
 * wakes readers via socantrcvmore(), advances the state, and performs any
 * deferred action (TIME_WAIT entry / close / drop) after the inpcb lock is
 * dropped.  The mbuf is freed unless handle_peer_close_data() consumed it
 * (keep != 0).
 */
static void
do_peer_fin(struct toepcb *toep, struct mbuf *m)
{
	struct socket *so;
	struct tcpcb *tp = toep->tp_tp;
	int keep, action;

	action = keep = 0;
	CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
		/* PEER_CLOSE may carry an implicit DDP completion. */
		keep = handle_peer_close_data(so, m);
		if (keep < 0) {
			inp_wunlock(tp->t_inpcb);
			return;
		}
	}
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		CTR1(KTR_TOM,
		    "waking up waiters for cantrcvmore on %p ", so);
		socantrcvmore(so);

		/*
		 * If connection is half-synchronized
		 * (ie NEEDSYN flag on) then delay ACK,
		 * so it may be piggybacked when SYN is sent.
		 * Otherwise, since we received a FIN then no
		 * more input can be expected, send ACK now.
		 */
		if (tp->t_flags & TF_NEEDSYN)
			tp->t_flags |= TF_DELACK;
		else
			tp->t_flags |= TF_ACKNOW;
		tp->rcv_nxt++;
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
	    tp->t_starttime = ticks;
	/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;
		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
	}
	inp_wunlock(tp->t_inpcb);

	/* Deferred actions run without the inpcb lock held. */
	if (action == TCP_TIMEWAIT) {
		enter_timewait(tp);
	} else if (action == TCP_DROP) {
		tcp_offload_drop(tp, 0);
	} else if (action == TCP_CLOSE) {
		tcp_offload_close(tp);
	}

#ifdef notyet
	/* Do not send POLL_HUP for half duplex close. */
	if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
	    sk->sk_state == TCP_CLOSE)
		sk_wake_async(so, 1, POLL_HUP);
	else
		sk_wake_async(so, 1, POLL_IN);
#endif

out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	VALIDATE_SOCK(so);

	do_peer_fin(toep, m);
	return (0);
}

/*
 * Process a CLOSE_CON_RPL CPL: HW has acknowledged our FIN.  Records the
 * final snd_una (excluding the FIN), advances the close-side state machine,
 * and performs the deferred TIME_WAIT/close/drop action after dropping the
 * inpcb lock.  Always frees the mbuf.
 */
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int action = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		/* An abort is in flight; let the abort path finish things. */
		inp_wunlock(tp->t_inpcb);
		goto out;
	}

	CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
	    tp->t_state, !!(so_state_get(so) & SS_NOFDREF));

	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			action = TCP_CLOSE;

		} else {
			action = TCP_TIMEWAIT;
		}
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		action = TCP_CLOSE;
		break;
	case TCPS_FIN_WAIT_1:
		/*
		 * If we can't receive any more
		 * data, then closing user can proceed.
		 * Starting the timer is contrary to the
		 * specification, but if we don't get a FIN
		 * we'll hang forever.
		 *
		 * XXXjl:
		 * we should release the tp also, and use a
		 * compressed state.
		 */
		if (so)
			rcv = so_sockbuf_rcv(so);
		else
			break;

		if (rcv->sb_state & SBS_CANTRCVMORE) {
			int timeout;

			if (so)
				soisdisconnected(so);
			timeout = (tcp_fast_finwait2_recycle) ?
			    tcp_finwait2_timeout : tcp_maxidle;
			tcp_timer_activate(tp, TT_2MSL, timeout);
		}
		tp->t_state = TCPS_FIN_WAIT_2;
		/* SO_LINGER with zero timeout: drop instead of lingering. */
		if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			action = TCP_DROP;
		}

		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    toep->tp_toedev->tod_name, toep->tp_tid,
		    tp->t_state);
	}
	inp_wunlock(tp->t_inpcb);


	/* Deferred actions run without the inpcb lock held. */
	if (action == TCP_TIMEWAIT) {
		enter_timewait(tp);
	} else if (action == TCP_DROP) {
		tcp_offload_drop(tp, 0);
	} else if (action == TCP_CLOSE) {
		tcp_offload_close(tp);
	}
out:
	m_freem(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	process_close_con_rpl(toep, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
2734 */ 2735static void 2736process_abort_rpl(struct toepcb *toep, struct mbuf *m) 2737{ 2738 struct tcpcb *tp = toep->tp_tp; 2739 struct socket *so; 2740 int needclose = 0; 2741 2742#ifdef T3_TRACE 2743 T3_TRACE1(TIDTB(sk), 2744 "process_abort_rpl: GTS rpl pending %d", 2745 sock_flag(sk, ABORT_RPL_PENDING)); 2746#endif 2747 2748 inp_wlock(tp->t_inpcb); 2749 so = inp_inpcbtosocket(tp->t_inpcb); 2750 2751 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2752 /* 2753 * XXX panic on tcpdrop 2754 */ 2755 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) 2756 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2757 else { 2758 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2759 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2760 !is_t3a(toep->tp_toedev)) { 2761 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2762 panic("TP_ABORT_REQ_RCVD set"); 2763 t3_release_offload_resources(toep); 2764 needclose = 1; 2765 } 2766 } 2767 } 2768 inp_wunlock(tp->t_inpcb); 2769 2770 if (needclose) 2771 tcp_offload_close(tp); 2772 2773 m_free(m); 2774} 2775 2776/* 2777 * Handle an ABORT_RPL_RSS CPL message. 2778 */ 2779static int 2780do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2781{ 2782 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2783 struct toepcb *toep; 2784 2785 /* 2786 * Ignore replies to post-close aborts indicating that the abort was 2787 * requested too late. These connections are terminated when we get 2788 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2789 * arrives the TID is either no longer used or it has been recycled. 
2790 */ 2791 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2792discard: 2793 m_free(m); 2794 return (0); 2795 } 2796 2797 toep = (struct toepcb *)ctx; 2798 2799 /* 2800 * Sometimes we've already closed the socket, e.g., a post-close 2801 * abort races with ABORT_REQ_RSS, the latter frees the socket 2802 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2803 * but FW turns the ABORT_REQ into a regular one and so we get 2804 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2805 */ 2806 if (!toep) 2807 goto discard; 2808 2809 if (toep->tp_tp == NULL) { 2810 log(LOG_NOTICE, "removing tid for abort\n"); 2811 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2812 if (toep->tp_l2t) 2813 l2t_release(L2DATA(cdev), toep->tp_l2t); 2814 2815 toepcb_release(toep); 2816 goto discard; 2817 } 2818 2819 log(LOG_NOTICE, "toep=%p\n", toep); 2820 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); 2821 2822 toepcb_hold(toep); 2823 process_abort_rpl(toep, m); 2824 toepcb_release(toep); 2825 return (0); 2826} 2827 2828/* 2829 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also 2830 * indicate whether RST should be sent in response. 2831 */ 2832static int 2833abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2834{ 2835 struct tcpcb *tp = so_sototcpcb(so); 2836 2837 switch (abort_reason) { 2838 case CPL_ERR_BAD_SYN: 2839#if 0 2840 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2841#endif 2842 case CPL_ERR_CONN_RESET: 2843 // XXX need to handle SYN_RECV due to crossed SYNs 2844 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2845 case CPL_ERR_XMIT_TIMEDOUT: 2846 case CPL_ERR_PERSIST_TIMEDOUT: 2847 case CPL_ERR_FINWAIT2_TIMEDOUT: 2848 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2849#if 0 2850 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2851#endif 2852 return (ETIMEDOUT); 2853 default: 2854 return (EIO); 2855 } 2856} 2857 2858static inline void 2859set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2860{ 2861 struct cpl_abort_rpl *rpl = cplhdr(m); 2862 2863 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2864 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2865 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2866 2867 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2868 rpl->cmd = cmd; 2869} 2870 2871static void 2872send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2873{ 2874 struct mbuf *reply_mbuf; 2875 struct cpl_abort_req_rss *req = cplhdr(m); 2876 2877 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2878 m_set_priority(m, CPL_PRIORITY_DATA); 2879 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2880 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2881 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2882 m_free(m); 2883} 2884 2885/* 2886 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2887 */ 2888static inline int 2889is_neg_adv_abort(unsigned int status) 2890{ 2891 return status == CPL_ERR_RTX_NEG_ADVICE || 2892 status == CPL_ERR_PERSIST_NEG_ADVICE; 2893} 2894 2895static void 2896send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2897{ 2898 struct mbuf *reply_mbuf; 2899 struct cpl_abort_req_rss *req = cplhdr(m); 2900 2901 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2902 2903 if (!reply_mbuf) { 2904 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2905 req->status = rst_status; 2906 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2907 return; 2908 } 2909 2910 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2911 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2912 m_free(m); 2913 2914 /* 2915 * XXX need to sync with ARP as for SYN_RECV connections we can send 2916 * these messages while ARP is pending. For other connection states 2917 * it's not a problem. 2918 */ 2919 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2920} 2921 2922#ifdef notyet 2923static void 2924cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2925{ 2926 CXGB_UNIMPLEMENTED(); 2927#ifdef notyet 2928 struct request_sock *req = child->sk_user_data; 2929 2930 inet_csk_reqsk_queue_removed(parent, req); 2931 synq_remove(tcp_sk(child)); 2932 __reqsk_free(req); 2933 child->sk_user_data = NULL; 2934#endif 2935} 2936 2937 2938/* 2939 * Performs the actual work to abort a SYN_RECV connection. 2940 */ 2941static void 2942do_abort_syn_rcv(struct socket *child, struct socket *parent) 2943{ 2944 struct tcpcb *parenttp = so_sototcpcb(parent); 2945 struct tcpcb *childtp = so_sototcpcb(child); 2946 2947 /* 2948 * If the server is still open we clean up the child connection, 2949 * otherwise the server already did the clean up as it was purging 2950 * its SYN queue and the skb was just sitting in its backlog. 2951 */ 2952 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2953 cleanup_syn_rcv_conn(child, parent); 2954 inp_wlock(childtp->t_inpcb); 2955 t3_release_offload_resources(childtp->t_toe); 2956 inp_wunlock(childtp->t_inpcb); 2957 tcp_offload_close(childtp); 2958 } 2959} 2960#endif 2961 2962/* 2963 * Handle abort requests for a SYN_RECV connection. These need extra work 2964 * because the socket is on its parent's SYN queue. 
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	/* Not implemented on FreeBSD yet; the Linux-derived code is kept
	 * below for reference.  Returns 0 ("abort handled") regardless. */
	CXGB_UNIMPLEMENTED();
#ifdef notyet
	struct socket *parent;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;        /* somehow we are not on the SYN queue */

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	so_unlock(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 */
static void
process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	int needclose = 0;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		/* First ABORT_REQ seen: just record it and keep the lock-
		 * protected state; no reply is sent from this path. */
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		int error;

		error = abort_status_to_errno(so, req->status,
		    &rst_status);
		so_error_set(so, error);

		if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
			so_sorwakeup(so);
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 is has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		needclose = 1;
	}
	inp_wunlock(tp->t_inpcb);

	/* Close after the inpcb lock is dropped. */
	if (needclose)
		tcp_offload_close(tp);

	send_abort_rpl(m, tdev, rst_status);
	return;
skip:
	inp_wunlock(tp->t_inpcb);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;

	/* Negative advice (retransmit/persist) is not a real abort. */
	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);

	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		/* Embryonic connection: reap the TID and unhook the tcpcb. */
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;

		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 *  Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		log(LOG_ERR, "abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		log(LOG_NOTICE, "disconnected toepcb\n");
		/* should be freed momentarily */
		/* NOTE(review): m is not freed on this path and the return
		 * value is 0, not CPL_RET_BUF_DONE — possible mbuf leak;
		 * confirm against the deferred-free comment above. */
		return (0);
	}


	/* Hold the toepcb across processing so it cannot vanish under us. */
	toepcb_hold(toep);
	process_abort_req(toep, m, toep->tp_toedev);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
/*
 * ARP failure cleanup for a rejected/failed passive open.  Currently a stub
 * (CXGB_UNIMPLEMENTED); the Linux-derived logic is kept under notyet.
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();

#ifdef notyet
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = so_sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	pass_open_abort(so, parent, m);
	so_unlock(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
3164 */ 3165static void 3166pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 3167{ 3168 3169#ifdef notyet 3170 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3171 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 3172#endif 3173 handle_pass_open_arp_failure(m_get_socket(m), m); 3174} 3175 3176/* 3177 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 3178 */ 3179static void 3180mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 3181{ 3182 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 3183 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 3184 unsigned int tid = GET_TID(req); 3185 3186 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 3187 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3188 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3189 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 3190 rpl->opt0h = htonl(F_TCAM_BYPASS); 3191 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3192 rpl->opt2 = 0; 3193 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3194} 3195 3196/* 3197 * Send a deferred reject to an accept request. 
3198 */ 3199static void 3200reject_pass_request(struct toedev *tdev, struct mbuf *m) 3201{ 3202 struct mbuf *reply_mbuf; 3203 3204 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3205 mk_pass_accept_rpl(reply_mbuf, m); 3206 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3207 m_free(m); 3208} 3209 3210static void 3211handle_syncache_event(int event, void *arg) 3212{ 3213 struct toepcb *toep = arg; 3214 3215 switch (event) { 3216 case TOE_SC_ENTRY_PRESENT: 3217 /* 3218 * entry already exists - free toepcb 3219 * and l2t 3220 */ 3221 printf("syncache entry present\n"); 3222 toepcb_release(toep); 3223 break; 3224 case TOE_SC_DROP: 3225 /* 3226 * The syncache has given up on this entry 3227 * either it timed out, or it was evicted 3228 * we need to explicitly release the tid 3229 */ 3230 printf("syncache entry dropped\n"); 3231 toepcb_release(toep); 3232 break; 3233 default: 3234 log(LOG_ERR, "unknown syncache event %d\n", event); 3235 break; 3236 } 3237} 3238 3239static void 3240syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3241{ 3242 struct in_conninfo inc; 3243 struct tcpopt to; 3244 struct tcphdr th; 3245 struct inpcb *inp; 3246 int mss, wsf, sack, ts; 3247 uint32_t rcv_isn = ntohl(req->rcv_isn); 3248 3249 bzero(&to, sizeof(struct tcpopt)); 3250 inp = so_sotoinpcb(lso); 3251 3252 /* 3253 * Fill out information for entering us into the syncache 3254 */ 3255 inc.inc_fport = th.th_sport = req->peer_port; 3256 inc.inc_lport = th.th_dport = req->local_port; 3257 th.th_seq = req->rcv_isn; 3258 th.th_flags = TH_SYN; 3259 3260 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3261 3262 3263 inc.inc_isipv6 = 0; 3264 inc.inc_len = 0; 3265 inc.inc_faddr.s_addr = req->peer_ip; 3266 inc.inc_laddr.s_addr = req->local_ip; 3267 3268 DPRINTF("syncache add of %d:%d %d:%d\n", 3269 ntohl(req->local_ip), ntohs(req->local_port), 3270 ntohl(req->peer_ip), 
ntohs(req->peer_port)); 3271 3272 mss = req->tcp_options.mss; 3273 wsf = req->tcp_options.wsf; 3274 ts = req->tcp_options.tstamp; 3275 sack = req->tcp_options.sack; 3276 to.to_mss = mss; 3277 to.to_wscale = wsf; 3278 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3279 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 3280} 3281 3282 3283/* 3284 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 3285 * lock held. Note that the sock here is a listening socket that is not owned 3286 * by the TOE. 3287 */ 3288static void 3289process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 3290 struct listen_ctx *lctx) 3291{ 3292 int rt_flags; 3293 struct l2t_entry *e; 3294 struct iff_mac tim; 3295 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 3296 struct cpl_pass_accept_rpl *rpl; 3297 struct cpl_pass_accept_req *req = cplhdr(m); 3298 unsigned int tid = GET_TID(req); 3299 struct tom_data *d = TOM_DATA(tdev); 3300 struct t3cdev *cdev = d->cdev; 3301 struct tcpcb *tp = so_sototcpcb(so); 3302 struct toepcb *newtoep; 3303 struct rtentry *dst; 3304 struct sockaddr_in nam; 3305 struct t3c_data *td = T3C_DATA(cdev); 3306 3307 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3308 if (__predict_false(reply_mbuf == NULL)) { 3309 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3310 t3_defer_reply(m, tdev, reject_pass_request); 3311 else { 3312 cxgb_queue_tid_release(cdev, tid); 3313 m_free(m); 3314 } 3315 DPRINTF("failed to get reply_mbuf\n"); 3316 3317 goto out; 3318 } 3319 3320 if (tp->t_state != TCPS_LISTEN) { 3321 DPRINTF("socket not in listen state\n"); 3322 3323 goto reject; 3324 } 3325 3326 tim.mac_addr = req->dst_mac; 3327 tim.vlan_tag = ntohs(req->vlan_tag); 3328 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 3329 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 3330 goto reject; 3331 } 3332 3333#ifdef notyet 3334 /* 3335 * XXX do route 
lookup to confirm that we're still listening on this 3336 * address 3337 */ 3338 if (ip_route_input(skb, req->local_ip, req->peer_ip, 3339 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 3340 goto reject; 3341 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 3342 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 3343 dst_release(skb->dst); // done with the input route, release it 3344 skb->dst = NULL; 3345 3346 if ((rt_flags & RTF_LOCAL) == 0) 3347 goto reject; 3348#endif 3349 /* 3350 * XXX 3351 */ 3352 rt_flags = RTF_LOCAL; 3353 if ((rt_flags & RTF_LOCAL) == 0) 3354 goto reject; 3355 3356 /* 3357 * Calculate values and add to syncache 3358 */ 3359 3360 newtoep = toepcb_alloc(); 3361 if (newtoep == NULL) 3362 goto reject; 3363 3364 bzero(&nam, sizeof(struct sockaddr_in)); 3365 3366 nam.sin_len = sizeof(struct sockaddr_in); 3367 nam.sin_family = AF_INET; 3368 nam.sin_addr.s_addr =req->peer_ip; 3369 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 3370 3371 if (dst == NULL) { 3372 printf("failed to find route\n"); 3373 goto reject; 3374 } 3375 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 3376 (struct sockaddr *)&nam); 3377 if (e == NULL) { 3378 DPRINTF("failed to get l2t\n"); 3379 } 3380 /* 3381 * Point to our listen socket until accept 3382 */ 3383 newtoep->tp_tp = tp; 3384 newtoep->tp_flags = TP_SYN_RCVD; 3385 newtoep->tp_tid = tid; 3386 newtoep->tp_toedev = tdev; 3387 tp->rcv_wnd = select_rcv_wnd(tdev, so); 3388 3389 cxgb_insert_tid(cdev, d->client, newtoep, tid); 3390 so_lock(so); 3391 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 3392 so_unlock(so); 3393 3394 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && 3395 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; 3396 3397 if (newtoep->tp_ulp_mode) { 3398 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3399 3400 if (ddp_mbuf == NULL) 3401 newtoep->tp_ulp_mode = 0; 3402 } 3403 3404 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", 3405 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); 3406 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 3407 /* 3408 * XXX workaround for lack of syncache drop 3409 */ 3410 toepcb_hold(newtoep); 3411 syncache_add_accept_req(req, so, newtoep); 3412 3413 rpl = cplhdr(reply_mbuf); 3414 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 3415 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3416 rpl->wr.wr_lo = 0; 3417 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3418 rpl->opt2 = htonl(calc_opt2(so, tdev)); 3419 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3420 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 3421 3422 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 3423 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 3424 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | 3425 CPL_PASS_OPEN_ACCEPT); 3426 3427 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 3428 3429 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); 3430 3431 l2t_send(cdev, reply_mbuf, e); 3432 m_free(m); 3433 if (newtoep->tp_ulp_mode) { 3434 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 3435 V_TF_DDP_OFF(1) | 3436 TP_DDP_TIMER_WORKAROUND_MASK, 3437 V_TF_DDP_OFF(1) | 3438 TP_DDP_TIMER_WORKAROUND_VAL, 1); 3439 } else 3440 printf("not offloading\n"); 3441 3442 3443 3444 return; 3445reject: 3446 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3447 mk_pass_accept_rpl(reply_mbuf, m); 3448 else 3449 mk_tid_release(reply_mbuf, newtoep, tid); 3450 cxgb_ofld_send(cdev, reply_mbuf); 3451 m_free(m); 3452out: 3453#if 0 3454 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3455#else 3456 return; 3457#endif 3458} 3459 
/*
 * Handle a CPL_PASS_ACCEPT_REQ message.  Thin dispatcher: recovers the
 * listen context registered for the STID and hands the work off to
 * process_pass_accept_req().
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	/* Sanity checks on the incoming TID (dead code, Linux remnants). */
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));

	inp_lock_assert(tp->t_inpcb);

	/* MSS clamp: negotiated MTU minus 40 (IPv4 + TCP fixed headers). */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	/* Window scaling is active only if both sides agreed to it. */
	if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE))
		tp->rcv_scale = tp->request_r_scale;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/* All send-side sequence state starts at the post-SYN ISN. */
	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);

	/*
	 *XXXXXXXXXXX
	 *
	 */
#ifdef notyet
	so->so_proto->pr_ctloutput = t3_ctloutput;
#endif

#if 0
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif
	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_starttime = ticks;
	tp->t_state = TCPS_ESTABLISHED;
	soisconnected(so);
}

/*
 * Build a tcpopt/tcphdr/in_conninfo triple from a CPL_PASS_ESTABLISH
 * message and expand the matching syncache entry into a full socket
 * (returned through *so).  Returns the tcp_offload_syncache_expand()
 * result (non-zero on success).
 */
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;

#ifdef MAC
#error	"no MAC support"
#endif

	opt = ntohs(req->tcp_opt);

	bzero(&to, sizeof(struct tcpopt));

	/*
	 * Fill out information for entering us into the syncache.
	 * NOTE(review): inc and th are only partially initialized here
	 * (no bzero); verify the unset fields are ignored downstream.
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;

	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	/* Decode the HW-reported TCP options (see assign_rxopt). */
	mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf = G_TCPOPT_WSCALE_OK(opt);
	ts = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);

	to.to_mss = mss;
	to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message.
 * XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Drop the embryonic connection from the listener's syn queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* From here on, tp/so refer to the newly created connection. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* The TOE owns segmentation; don't let sockbuf code coalesce. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);
	inp_wunlock(tp->t_inpcb);

	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	/* Receive-side sequence state starts at the HW-reported ISN. */
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 *     incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	V_tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		/* Connection already gone; just return the active TID. */
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	/* Swap the provisional atid for the real tid. */
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3878 */ 3879static void 3880wr_ack(struct toepcb *toep, struct mbuf *m) 3881{ 3882 struct tcpcb *tp = toep->tp_tp; 3883 struct cpl_wr_ack *hdr = cplhdr(m); 3884 struct socket *so; 3885 unsigned int credits = ntohs(hdr->credits); 3886 u32 snd_una = ntohl(hdr->snd_una); 3887 int bytes = 0; 3888 struct sockbuf *snd; 3889 3890 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3891 3892 inp_wlock(tp->t_inpcb); 3893 so = inp_inpcbtosocket(tp->t_inpcb); 3894 toep->tp_wr_avail += credits; 3895 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3896 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3897 3898 while (credits) { 3899 struct mbuf *p = peek_wr(toep); 3900 3901 if (__predict_false(!p)) { 3902 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3903 "nothing pending, state %u wr_avail=%u\n", 3904 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3905 break; 3906 } 3907 CTR2(KTR_TOM, 3908 "wr_ack: p->credits=%d p->bytes=%d", 3909 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3910 KASSERT(p->m_pkthdr.csum_data != 0, 3911 ("empty request still on list")); 3912 3913 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3914 3915#if DEBUG_WR > 1 3916 struct tx_data_wr *w = cplhdr(p); 3917 log(LOG_ERR, 3918 "TID %u got %u WR credits, need %u, len %u, " 3919 "main body %u, frags %u, seq # %u, ACK una %u," 3920 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3921 toep->tp_tid, credits, p->csum, p->len, 3922 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3923 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3924 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3925#endif 3926 p->m_pkthdr.csum_data -= credits; 3927 break; 3928 } else { 3929 dequeue_wr(toep); 3930 credits -= p->m_pkthdr.csum_data; 3931 bytes += p->m_pkthdr.len; 3932 CTR3(KTR_TOM, 3933 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3934 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3935 3936 m_free(p); 3937 } 3938 } 3939 3940#if DEBUG_WR 3941 
check_wr_invariants(tp); 3942#endif 3943 3944 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3945#if VALIDATE_SEQ 3946 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3947 3948 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3949 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3950 toep->tp_tid, tp->snd_una); 3951#endif 3952 goto out_free; 3953 } 3954 3955 if (tp->snd_una != snd_una) { 3956 tp->snd_una = snd_una; 3957 tp->ts_recent_age = ticks; 3958#ifdef notyet 3959 /* 3960 * Keep ARP entry "minty fresh" 3961 */ 3962 dst_confirm(sk->sk_dst_cache); 3963#endif 3964 if (tp->snd_una == tp->snd_nxt) 3965 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3966 } 3967 3968 snd = so_sockbuf_snd(so); 3969 if (bytes) { 3970 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3971 snd = so_sockbuf_snd(so); 3972 sockbuf_lock(snd); 3973 sbdrop_locked(snd, bytes); 3974 so_sowwakeup_locked(so); 3975 } 3976 3977 if (snd->sb_sndptroff < snd->sb_cc) 3978 t3_push_frames(so, 0); 3979 3980out_free: 3981 inp_wunlock(tp->t_inpcb); 3982 m_free(m); 3983} 3984 3985/* 3986 * Handler for TX_DATA_ACK CPL messages. 3987 */ 3988static int 3989do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3990{ 3991 struct toepcb *toep = (struct toepcb *)ctx; 3992 3993 VALIDATE_SOCK(so); 3994 3995 wr_ack(toep, m); 3996 return 0; 3997} 3998 3999/* 4000 * Handler for TRACE_PKT CPL messages. Just sink these packets. 4001 */ 4002static int 4003do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 4004{ 4005 m_freem(m); 4006 return 0; 4007} 4008 4009/* 4010 * Reset a connection that is on a listener's SYN queue or accept queue, 4011 * i.e., one that has not had a struct socket associated with it. 4012 * Must be called from process context. 4013 * 4014 * Modeled after code in inet_csk_listen_stop(). 
4015 */ 4016static void 4017t3_reset_listen_child(struct socket *child) 4018{ 4019 struct tcpcb *tp = so_sototcpcb(child); 4020 4021 t3_send_reset(tp->t_toe); 4022} 4023 4024 4025static void 4026t3_child_disconnect(struct socket *so, void *arg) 4027{ 4028 struct tcpcb *tp = so_sototcpcb(so); 4029 4030 if (tp->t_flags & TF_TOE) { 4031 inp_wlock(tp->t_inpcb); 4032 t3_reset_listen_child(so); 4033 inp_wunlock(tp->t_inpcb); 4034 } 4035} 4036 4037/* 4038 * Disconnect offloaded established but not yet accepted connections sitting 4039 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4040 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4041 */ 4042void 4043t3_disconnect_acceptq(struct socket *listen_so) 4044{ 4045 4046 so_lock(listen_so); 4047 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4048 so_unlock(listen_so); 4049} 4050 4051/* 4052 * Reset offloaded connections sitting on a server's syn queue. As above 4053 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4054 */ 4055 4056void 4057t3_reset_synq(struct listen_ctx *lctx) 4058{ 4059 struct toepcb *toep; 4060 4061 so_lock(lctx->lso); 4062 while (!LIST_EMPTY(&lctx->synq_head)) { 4063 toep = LIST_FIRST(&lctx->synq_head); 4064 LIST_REMOVE(toep, synq_entry); 4065 toep->tp_tp = NULL; 4066 t3_send_reset(toep); 4067 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4068 toepcb_release(toep); 4069 } 4070 so_unlock(lctx->lso); 4071} 4072 4073 4074int 4075t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4076 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4077 unsigned int pg_off, unsigned int color) 4078{ 4079 unsigned int i, j, pidx; 4080 struct pagepod *p; 4081 struct mbuf *m; 4082 struct ulp_mem_io *req; 4083 unsigned int tid = toep->tp_tid; 4084 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4085 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4086 4087 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4088 gl, nppods, tag, maxoff, pg_off, color); 4089 4090 for (i = 0; i < nppods; ++i) { 4091 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4092 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4093 req = mtod(m, struct ulp_mem_io *); 4094 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4095 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4096 req->wr.wr_lo = 0; 4097 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4098 V_ULPTX_CMD(ULP_MEM_WRITE)); 4099 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4100 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4101 4102 p = (struct pagepod *)(req + 1); 4103 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4104 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4105 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4106 V_PPOD_COLOR(color)); 4107 p->pp_max_offset = htonl(maxoff); 4108 p->pp_page_offset = htonl(pg_off); 4109 p->pp_rsvd = 0; 4110 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4111 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4112 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4113 } else 4114 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4115 send_or_defer(toep, m, 0); 4116 ppod_addr += PPOD_SIZE; 4117 } 4118 return (0); 4119} 4120 4121/* 4122 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4123 */ 4124static inline void 4125mk_cpl_barrier_ulp(struct cpl_barrier *b) 4126{ 4127 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4128 4129 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4130 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4131 b->opcode = CPL_BARRIER; 4132} 4133 4134/* 4135 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4136 */ 4137static inline void 4138mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4139{ 4140 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4141 4142 txpkt = (struct ulp_txpkt *)req; 4143 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4144 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4145 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4146 req->cpuno = htons(cpuno); 4147} 4148 4149/* 4150 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4151 */ 4152static inline void 4153mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4154 unsigned int word, uint64_t mask, uint64_t val) 4155{ 4156 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4157 4158 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4159 tid, word, mask, val); 4160 4161 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4162 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4163 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4164 req->reply = V_NO_REPLY(1); 4165 req->cpu_idx = 0; 4166 req->word = htons(word); 4167 req->mask = htobe64(mask); 4168 req->val = htobe64(val); 4169} 4170 4171/* 4172 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
4173 */ 4174static void 4175mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, 4176 unsigned int tid, unsigned int credits) 4177{ 4178 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; 4179 4180 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4181 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); 4182 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); 4183 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 4184 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | 4185 V_RX_CREDITS(credits)); 4186} 4187 4188void 4189t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) 4190{ 4191 unsigned int wrlen; 4192 struct mbuf *m; 4193 struct work_request_hdr *wr; 4194 struct cpl_barrier *lock; 4195 struct cpl_set_tcb_field *req; 4196 struct cpl_get_tcb *getreq; 4197 struct ddp_state *p = &toep->tp_ddp_state; 4198 4199#if 0 4200 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4201#endif 4202 wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + 4203 sizeof(*getreq); 4204 m = m_gethdr_nofail(wrlen); 4205 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4206 wr = mtod(m, struct work_request_hdr *); 4207 bzero(wr, wrlen); 4208 4209 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4210 m->m_pkthdr.len = m->m_len = wrlen; 4211 4212 lock = (struct cpl_barrier *)(wr + 1); 4213 mk_cpl_barrier_ulp(lock); 4214 4215 req = (struct cpl_set_tcb_field *)(lock + 1); 4216 4217 CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); 4218 4219 /* Hmmm, not sure if this actually a good thing: reactivating 4220 * the other buffer might be an issue if it has been completed 4221 * already. However, that is unlikely, since the fact that the UBUF 4222 * is not completed indicates that there is no oustanding data. 
4223 */ 4224 if (bufidx == 0) 4225 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4226 V_TF_DDP_ACTIVE_BUF(1) | 4227 V_TF_DDP_BUF0_VALID(1), 4228 V_TF_DDP_ACTIVE_BUF(1)); 4229 else 4230 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4231 V_TF_DDP_ACTIVE_BUF(1) | 4232 V_TF_DDP_BUF1_VALID(1), 0); 4233 4234 getreq = (struct cpl_get_tcb *)(req + 1); 4235 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4236 4237 mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); 4238 4239 /* Keep track of the number of oustanding CPL_GET_TCB requests 4240 */ 4241 p->get_tcb_count++; 4242 4243#ifdef T3_TRACE 4244 T3_TRACE1(TIDTB(so), 4245 "t3_cancel_ddpbuf: bufidx %u", bufidx); 4246#endif 4247 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4248} 4249 4250/** 4251 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one 4252 * @sk: the socket associated with the buffers 4253 * @bufidx: index of HW DDP buffer (0 or 1) 4254 * @tag0: new tag for HW buffer 0 4255 * @tag1: new tag for HW buffer 1 4256 * @len: new length for HW buf @bufidx 4257 * 4258 * Sends a compound WR to overlay a new DDP buffer on top of an existing 4259 * buffer by changing the buffer tag and length and setting the valid and 4260 * active flag accordingly. The caller must ensure the new buffer is at 4261 * least as big as the existing one. Since we typically reprogram both HW 4262 * buffers this function sets both tags for convenience. Read the TCB to 4263 * determine how made data was written into the buffer before the overlay 4264 * took place. 
 */
void
t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
		  unsigned int tag1, unsigned int len)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_get_tcb *getreq;
	struct cpl_set_tcb_field *req;
	struct ddp_state *p = &toep->tp_ddp_state;

	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
	    bufidx, tag0, tag1, len);
#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	m->m_pkthdr.len = m->m_len = wrlen;
	bzero(wr, wrlen);


	/* Set the ATOMIC flag to make sure that TP processes the following
	 * CPLs in an atomic manner and no wire segments can be interleaved.
	 */
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
	/* First SET_TCB_FIELD: write both buffer tags in one 64-bit word. */
	req = (struct cpl_set_tcb_field *)(wr + 1);
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
			     V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
			     V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
			     V_TCB_RX_DDP_BUF0_TAG(tag0) |
			     V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
	req++;
	/* Second/third SET_TCB_FIELD: program the chosen buffer's length and
	 * mark it valid (buffer 0 additionally becomes the active buffer). */
	if (bufidx == 0) {
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
		req++;
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			 V_TF_DDP_PUSH_DISABLE_0(1) |
			 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			 V_TF_DDP_PUSH_DISABLE_0(0) |
			 V_TF_DDP_BUF0_VALID(1));
	} else {
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
			 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
		req++;
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			 V_TF_DDP_PUSH_DISABLE_1(1) |
			 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
			 V_TF_DDP_PUSH_DISABLE_1(0) |
			 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
	}

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	/* Keep track of the number of oustanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
		  "len %d",
		  bufidx, tag0, tag1, len);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 */
void
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
		 unsigned int len1, unsigned int offset1,
		 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	/* Size the WR for exactly the optional pieces we will emit. */
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
		(len1 ? sizeof(*req) : 0) +
		(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {                  /* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
			V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
			V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
		req++;
	}
	if (len1) {                  /* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
			V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
			V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
			V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
		req++;
	}

	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
			     ddp_flags);

	if (modulate) {
		/* Return accumulated Rx credits as part of the same WR. */
		mk_rx_data_ack_ulp(toep,
		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
		    toep->tp_copied_seq - toep->tp_rcv_wup);
		toep->tp_rcv_wup = toep->tp_copied_seq;
	}

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
		  "modulate %d",
		  len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
		  modulate);
#endif

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/*
 * Initialize the mbuf-to-WR mapping table for the adapter's WR size
 * (wr_len is in 16-byte flits).  Idempotent: a second call is a no-op.
 */
void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])	/* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;
		/* Number of WRs needed for an i-entry scatter/gather list. */
		mbuf_wrs[i] = sgl_len <= wr_len ?
			1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	wrlen = wr_len * 8;
}

/*
 * Register all CPL message handlers with the TOM dispatcher.
 * Always returns 0.
 */
int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		       "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif

	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
	return (0);
}