cxgb_cpl_io.c revision 174708
1/************************************************************************** 2 3Copyright (c) 2007, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 174708 2007-12-17 08:17:51Z kmacy $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/lock.h> 39#include <sys/mbuf.h> 40#include <sys/mutex.h> 41#include <sys/socket.h> 42#include <sys/sysctl.h> 43#include <sys/syslog.h> 44#include <sys/socketvar.h> 45#include <sys/protosw.h> 46#include <sys/priv.h> 47 48#include <net/if.h> 49#include <net/route.h> 50 51#include <netinet/in.h> 52#include <netinet/in_pcb.h> 53#include <netinet/in_systm.h> 54#include <netinet/in_var.h> 55 56 57#include <dev/cxgb/cxgb_osdep.h> 58#include <dev/cxgb/sys/mbufq.h> 59 60#include <netinet/ip.h> 61#include <netinet/tcp_var.h> 62#include <netinet/tcp_fsm.h> 63#include <netinet/tcp_offload.h> 64#include <netinet/tcp_seq.h> 65#include <netinet/tcp_syncache.h> 66#include <net/route.h> 67 68 69#include <dev/cxgb/t3cdev.h> 70#include <dev/cxgb/common/cxgb_firmware_exports.h> 71#include <dev/cxgb/common/cxgb_t3_cpl.h> 72#include <dev/cxgb/common/cxgb_tcb.h> 73#include <dev/cxgb/common/cxgb_ctl_defs.h> 74#include <dev/cxgb/cxgb_l2t.h> 75#include <dev/cxgb/cxgb_offload.h> 76#include <vm/vm.h> 77#include <vm/pmap.h> 78#include <machine/bus.h> 79#include <dev/cxgb/sys/mvec.h> 80#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 81#include <dev/cxgb/ulp/tom/cxgb_defs.h> 82#include <dev/cxgb/ulp/tom/cxgb_tom.h> 83#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> 84#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> 85#include <dev/cxgb/ulp/tom/cxgb_tcp.h> 86 87 88 89/* 90 * For ULP connections HW may add headers, e.g., for digests, that aren't part 91 * of the messages sent by the host but that are part of the TCP payload and 92 * therefore consume TCP sequence space. 
/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)
/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/* Connection's IP TOS, read from the inpcb (shifted past the ECN bits). */
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

/* Net-stack tunables consumed here for send/receive buffer autosizing. */
extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

/*
 * Return non-zero if the TOE device is a rev-A T3 part.  Some workarounds
 * elsewhere depend on the chip revision.
 */
static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

/*
 * Debug helper: dump the interesting fields of an offload PCB.
 * Output only appears when DPRINTF is enabled.
 */
static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

/*
 * Wrapper around rtalloc1() that returns the route unlocked (rtalloc1()
 * returns it RT_LOCKed on success).
 */
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
{
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		/* No TID yet: queue until the active open completes. */
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m); // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T
	else
		cxgb_ofld_send(T3C_DEV(so), m); // send directly
}

/*
 * Map a CPL priority to an mbuf priority.  Currently a pass-through; the
 * socket argument is kept for interface compatibility with the Linux TOM.
 */
static inline unsigned int
mkprio(unsigned int cntrl, const struct socket *so)
{
	return cntrl;
}

/*
 * Populate a TID_RELEASE WR.  The skb must be already propely sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Fill in a TX_DATA work request header at the front of mbuf m covering
 * `len' bytes of payload.  `tail' is the first unsent mbuf remaining in the
 * send buffer (NULL if everything was consumed); it controls the SHOVE bit.
 * The first WR on a connection additionally carries the init flags and the
 * send-buffer size (in 32KB units).
 */
static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	    V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
	    (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		/* First WR on this connection: carry init flags. */
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB.
		 */
		if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

/*
 * Push as much unsent send-buffer data as WR credits allow to the HW as
 * TX_DATA work requests.  Returns the total number of payload bytes queued.
 *
 * Locking is delicate: the caller holds the inpcb lock; the sockbuf lock is
 * taken to walk the send buffer, dropped before handing each WR to the L2T
 * layer, and re-taken only if another loop iteration will run.  On return
 * the sockbuf lock is NOT held (asserted at the end).
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	segp = segs;

	/* Nothing can be sent before the TID exists or after close. */
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);

	SOCKBUF_LOCK(&so->so_snd);

	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	/* Resume from the send pointer, or the head if nothing sent yet. */
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	/* Skip an mbuf whose contents were already fully handed to HW. */
	if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		/*
		 * Gather mbufs into a pseudo-SGL until we run out of WR
		 * credits, data, or segment slots.
		 */
		while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
		    && (tail != NULL) && (count < TX_MAX_SEGS)) {
			bytes += tail->m_len;
			count++;
			last = tail;
			/*
			 * technically an abuse to be using this for a VA
			 * but less gross than defining my own structure
			 * or calling pmap_kextract from here :-|
			 */
			segp->ds_addr = (bus_addr_t)tail->m_data;
			segp->ds_len = tail->m_len;
			DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
			    count, mbuf_wrs[count], tail->m_data, tail->m_len);

			segp++;
			tail = tail->m_next;
		}
		DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;

		/* Drop the sockbuf lock before handing the WR to the HW. */
		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * XXX can drop socket buffer lock here
		 */

		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		make_tx_data_wr(so, m0, bytes, tail);
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
		m_set_sgl(m0, segs);
		m_set_sgllen(m0, count);
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		/*
		 * Request a completion either when explicitly asked for the
		 * first WR of this burst, or when half the credits are used.
		 */
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}

		/* The mbuf lives on the WR queue until the HW acks it. */
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);

		l2t_send(cdev, m0, toep->tp_l2t);
		if (toep->tp_wr_avail && (tail != NULL))
			SOCKBUF_LOCK(&so->so_snd);
	}

	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
	return (total_bytes);
}
/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;

	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	/* Flush pending data first so the FIN sequences after it. */
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	/* Only ever send one FIN. */
	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	/* m_gethdr_nofail() cannot fail, keeping the no-fail contract. */
	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	/* rsvd carries the FIN's sequence number. */
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}
455 */ 456uint32_t 457t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) 458{ 459 struct mbuf *m; 460 struct cpl_rx_data_ack *req; 461 struct toepcb *toep = tp->t_toe; 462 struct toedev *tdev = toep->tp_toedev; 463 464 m = m_gethdr_nofail(sizeof(*req)); 465 466 DPRINTF("returning %u credits to HW\n", credits); 467 468 req = mtod(m, struct cpl_rx_data_ack *); 469 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 470 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 471 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 472 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); 473 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 474 return (credits); 475} 476 477 478/* 479 * Set of states for which we should return RX credits. 480 */ 481#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 482 483/* 484 * Called after some received data has been read. It returns RX credits 485 * to the HW for the amount of data processed. 
486 */ 487void 488t3_cleanup_rbuf(struct tcpcb *tp) 489{ 490 struct toepcb *toep = tp->t_toe; 491 struct socket *so; 492 struct toedev *dev; 493 int dack_mode, must_send, read; 494 u32 thres, credits, dack = 0; 495 496 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 497 (tp->t_state == TCPS_FIN_WAIT_2))) 498 return; 499 INP_LOCK_ASSERT(tp->t_inpcb); 500 501 so = tp->t_inpcb->inp_socket; 502 SOCKBUF_LOCK(&so->so_rcv); 503 read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; 504 toep->tp_copied_seq += read; 505 toep->tp_enqueued_bytes -= read; 506 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 507 SOCKBUF_UNLOCK(&so->so_rcv); 508 509 if (credits > so->so_rcv.sb_mbmax) 510 printf("copied_seq=%u rcv_wup=%u credits=%u\n", 511 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 512 /* 513 * XXX this won't accurately reflect credit return - we need 514 * to look at the difference between the amount that has been 515 * put in the recv sockbuf and what is there now 516 */ 517 518 if (__predict_false(!credits)) 519 return; 520 521 dev = toep->tp_toedev; 522 thres = TOM_TUNABLE(dev, rx_credit_thres); 523 524 if (__predict_false(thres == 0)) 525 return; 526 527 if (toep->tp_ulp_mode) 528 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 529 else { 530 dack_mode = TOM_TUNABLE(dev, delack); 531 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 532 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 533 534 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 535 dack = F_RX_DACK_CHANGE | 536 V_RX_DACK_MODE(dack_mode); 537 } 538 } 539 540 /* 541 * For coalescing to work effectively ensure the receive window has 542 * at least 16KB left. 
543 */ 544 must_send = credits + 16384 >= tp->rcv_wnd; 545 546 if (must_send || credits >= thres) 547 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 548} 549 550static int 551cxgb_toe_disconnect(struct tcpcb *tp) 552{ 553 struct socket *so; 554 555 DPRINTF("cxgb_toe_disconnect\n"); 556 557 so = tp->t_inpcb->inp_socket; 558 close_conn(so); 559 return (0); 560} 561 562static int 563cxgb_toe_reset(struct tcpcb *tp) 564{ 565 struct toepcb *toep = tp->t_toe; 566 567 568 t3_send_reset(toep); 569 570 /* 571 * unhook from socket 572 */ 573 tp->t_flags &= ~TF_TOE; 574 toep->tp_tp = NULL; 575 tp->t_toe = NULL; 576 return (0); 577} 578 579static int 580cxgb_toe_send(struct tcpcb *tp) 581{ 582 struct socket *so; 583 584 DPRINTF("cxgb_toe_send\n"); 585 dump_toepcb(tp->t_toe); 586 587 so = tp->t_inpcb->inp_socket; 588 t3_push_frames(so, 1); 589 return (0); 590} 591 592static int 593cxgb_toe_rcvd(struct tcpcb *tp) 594{ 595 INP_LOCK_ASSERT(tp->t_inpcb); 596 t3_cleanup_rbuf(tp); 597 598 return (0); 599} 600 601static void 602cxgb_toe_detach(struct tcpcb *tp) 603{ 604 struct toepcb *toep; 605 /* 606 * XXX how do we handle teardown in the SYN_SENT state? 
607 * 608 */ 609 INP_INFO_WLOCK(&tcbinfo); 610 toep = tp->t_toe; 611 toep->tp_tp = NULL; 612 613 /* 614 * unhook from socket 615 */ 616 tp->t_flags &= ~TF_TOE; 617 tp->t_toe = NULL; 618 INP_INFO_WUNLOCK(&tcbinfo); 619} 620 621 622static struct toe_usrreqs cxgb_toe_usrreqs = { 623 .tu_disconnect = cxgb_toe_disconnect, 624 .tu_reset = cxgb_toe_reset, 625 .tu_send = cxgb_toe_send, 626 .tu_rcvd = cxgb_toe_rcvd, 627 .tu_detach = cxgb_toe_detach, 628 .tu_detach = cxgb_toe_detach, 629 .tu_syncache_event = handle_syncache_event, 630}; 631 632 633static void 634__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, 635 uint64_t mask, uint64_t val, int no_reply) 636{ 637 struct cpl_set_tcb_field *req; 638 struct tcpcb *tp = sototcpcb(so); 639 struct toepcb *toep = tp->t_toe; 640 641 req = mtod(m, struct cpl_set_tcb_field *); 642 m->m_pkthdr.len = m->m_len = sizeof(*req); 643 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 644 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 645 req->reply = V_NO_REPLY(no_reply); 646 req->cpu_idx = 0; 647 req->word = htons(word); 648 req->mask = htobe64(mask); 649 req->val = htobe64(val); 650 651 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); 652 send_or_defer(so, tp, m, 0); 653} 654 655static void 656t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val) 657{ 658 struct mbuf *m; 659 struct tcpcb *tp = sototcpcb(so); 660 struct toepcb *toep = tp->t_toe; 661 662 if (toep == NULL) 663 return; 664 665 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) 666 return; 667 668 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 669 670 __set_tcb_field(so, m, word, mask, val, 1); 671} 672 673/* 674 * Set one of the t_flags bits in the TCB. 
675 */ 676static void 677set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val) 678{ 679 t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 680} 681 682/* 683 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 684 */ 685static void 686t3_set_nagle(struct socket *so) 687{ 688 struct tcpcb *tp = sototcpcb(so); 689 690 set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 691} 692 693/* 694 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 695 */ 696void 697t3_set_keepalive(struct socket *so, int on_off) 698{ 699 set_tcb_tflag(so, S_TF_KEEPALIVE, on_off); 700} 701 702void 703t3_set_rcv_coalesce_enable(struct socket *so, int on_off) 704{ 705 set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off); 706} 707 708/* 709 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 710 */ 711static void 712t3_set_tos(struct socket *so) 713{ 714 t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 715 V_TCB_TOS(SO_TOS(so))); 716} 717 718 719/* 720 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 721 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 722 * set the PSH bit in the last segment, which would trigger delivery.] 723 * We work around the issue by setting a DDP buffer in a partial placed state, 724 * which guarantees that TP will schedule a timer. 
/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Turn DDP on or off for a connection.  Disabling also applies the TP timer
 * workaround described above.
 */
static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on)
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
		    V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_MASK,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_VAL);

}

/*
 * Set the tag/color for DDP buffer buf_idx (0 or 1).
 */
void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    tag_color);
}

/*
 * Program the offset and length of DDP buffer buf_idx (0 or 1).
 *
 * NOTE(review): in the buf_idx != 0 branch the mask uses
 * V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32) while the value uses
 * ...(((uint64_t)len) << 32); the shift inside vs. outside the V_ macro is
 * asymmetric with the buf0 branch — confirm against the TCB layout.
 */
void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Select a congestion control algorithm by name.  Stubbed out: always
 * succeeds until the t3_cong_ops table is brought over.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef notyet
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Request the connection's TCB from the HW via CPL_GET_TCB.  Deferred while
 * in SYN_SENT (no TID yet).  Returns 0 or ENOMEM.
 */
int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m); // defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return 0;
}

/*
 * Register the toepcb in the TID table under `tid'.  Takes an extra
 * reference on the toepcb on behalf of the table.
 */
static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);
	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}
834 */ 835static unsigned int 836find_best_mtu(const struct t3c_data *d, unsigned short mtu) 837{ 838 int i = 0; 839 840 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 841 ++i; 842 return (i); 843} 844 845static unsigned int 846select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 847{ 848 unsigned int idx; 849 850#ifdef notyet 851 struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt; 852#endif 853 if (tp) { 854 tp->t_maxseg = pmtu - 40; 855 if (tp->t_maxseg < td->mtus[0] - 40) 856 tp->t_maxseg = td->mtus[0] - 40; 857 idx = find_best_mtu(td, tp->t_maxseg + 40); 858 859 tp->t_maxseg = td->mtus[idx] - 40; 860 } else 861 idx = find_best_mtu(td, pmtu); 862 863 return (idx); 864} 865 866void 867t3_release_ddp_resources(struct toepcb *toep) 868{ 869 /* 870 * This is a no-op until we have DDP support 871 */ 872} 873 874static inline void 875free_atid(struct t3cdev *cdev, unsigned int tid) 876{ 877 struct toepcb *toep = cxgb_free_atid(cdev, tid); 878 879 if (toep) 880 toepcb_release(toep); 881} 882 883/* 884 * Release resources held by an offload connection (TID, L2T entry, etc.) 
/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Unacked WRs outstanding: drop them before resetting the list. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* XXX leftover debug output */
	printf("setting toep->tp_tp to NULL\n");

	/* Unhook toepcb and tcpcb from each other. */
	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		/* Connection never got past the ATID stage. */
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {		// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Hook the TOE usrreq vector and socket ops into a newly offloaded socket.
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	/* Scale only if RFC 1323 window scaling is enabled (max shift 14). */
	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
	return wscale;
}
/*
 * Determine the receive window size for a socket.
 */
static unsigned int
select_rcv_wnd(struct socket *so)
{
	struct toedev *dev = TOE_DEV(so);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = sbspace(&so->so_rcv);

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accomodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
	    (uint32_t)d->rx_page_size * 23 :
	    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	/* XXX leftover debug output */
	printf("initializing offload socket\n");
#ifdef notyet
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_rcv.sb_flags |= SB_TOE;
	so->so_snd.sb_flags |= SB_TOE;
#endif
	/* Cross-link tcpcb and toepcb. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(so);
	/* DDP only when enabled, not opted out, and the window is big enough. */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	/* RCV_BUFSIZ is in 1KB units, capped at the field's maximum. */
	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

/*
 * Calculate the option 2 value: congestion-control flavor selection.
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
#if 0
	(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Fill in a CPL_ACT_OPEN_REQ for an active open using the connection's
 * 4-tuple from the inpcb and the L2T entry's channel/index.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	/* Ports/addresses are already in network byte order in the inpcb. */
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	DPRINTF("connect smt_idx=%d\n", e->smt_idx);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}
 */
/*
 * Map a CPL_ACT_OPEN_RPL hardware status to an errno value.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down the offload state of a failed active open and drop the
 * connection with the given errno.  Caller must hold the inpcb lock
 * if tp is non-NULL (asserted below).
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		cxgb_tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	/* tcbinfo write lock covers the tcp_drop done by fail_act_open(). */
	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
		    jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.  On non-T3A hardware a TID may
 * have been allocated even though the open failed; queue its release.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif
/*
 * Send an active open request.
 */
/*
 * Initiate an offloaded active open: allocate a toepcb and an ATID,
 * resolve the L2 entry, install the offload socket ops and hand a
 * CPL_ACT_OPEN_REQ to the L2 layer.  Returns 0 on success, ENOMEM on
 * any allocation failure.
 *
 * NOTE(review): on the cxgb_alloc_atid()/t3_l2t_get() failure paths the
 * toepcb allocated above does not appear to be freed here — looks like a
 * leak; confirm ownership (init_offload_socket may be what takes it over
 * on the success path).
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */
		
	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;
	
	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;
	
	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	m = m_gethdr(MT_DATA, M_WAITOK);
	
#if 0	
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif		
	SOCK_LOCK(so);
	
	init_offload_socket(so, tdev, atid, e, rt, toep);
	
	install_offload_ops(so);
	
	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);
	
	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);
	
	printf("sending off request\n");
	
	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return 	(0);

free_tid:
	printf("failing connect - free atid\n");
	
	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
 * (NOTE(review): the "Returns 1 ... 0 otherwise" wording below is stale —
 * t3_send_reset is void; the comment predates the current signature.)
 * Returns 1 if an ABORT_REQ wasn't generated after all, 0 otherwise.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}
	
	/* Already shut down, or no device to send on: nothing to do. */
	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	/* While still in SYN_SENT the abort must wait until the open completes. */
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

/*
 * Handle the IP-level socket options we support (IP_TOS only; IP_OPTIONS
 * is explicitly rejected).  Pushes the new TOS to the adapter.
 */
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;
	
	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);
	
	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	/* High-precedence TOS values are privileged. */
	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;
	
	t3_set_tos(so);
	
	return (0);
}

/*
 * Handle the TCP-level socket options we support (TCP_CONGESTION and
 * TCP_NODELAY); everything else falls back to the stack via EOPNOTSUPP.
 */
static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);
	
	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;
		
		if (optlen < 1)
			return (EINVAL);
		
		err = copyinstr(sopt->sopt_val, name, 
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef notyet			
			tp->t_cong_control = strdup(name, M_CXGB);
#endif			
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;
		
		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);
		
		INP_LOCK(inp);

		/*
		 * Toggle TF_NODELAY and push the new Nagle setting to the
		 * adapter only if the flags actually changed.
		 * NOTE(review): tp->t_flags is re-read after INP_UNLOCK for
		 * the comparison — confirm this race is acceptable here.
		 */
		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);
		
		if (oldval != tp->t_flags)
			t3_set_nagle(so);

	}

	return (0);
}

/*
 * ctloutput entry point for offloaded sockets: dispatch to the IP- or
 * TCP-level handler and fall back to the host stack's tcp_ctloutput for
 * anything we don't handle.
 */
static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP) 
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return tcp_ctloutput(so, sopt);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	int len = be16toh(hdr->len);

	INP_LOCK(tp->t_inpcb);
	
#ifdef notyet	
	if (__predict_false(sk_no_receive(sk))) {
		handle_excess_rx(so, skb);
		return;
	}

	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
		handle_ddp_data(so, skb);

	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
	TCP_SKB_CB(skb)->flags = 0;
	skb_ulp_mode(skb) = 0;                    /* for iSCSI */
#endif
#if VALIDATE_SEQ
	if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
		printk(KERN_ERR
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		       TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
		       tp->rcv_nxt);
		__kfree_skb(skb);
		return;
	}
#endif
	/* Strip the CPL header; only payload goes to the socket buffer. */
	m_adj(m, sizeof(*hdr));

#ifdef notyet
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif	
	/* Track the delayed-ACK mode the hardware reports. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	
	DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);
	
	/* Trim to the payload length reported in the CPL header. */
	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
	T3_TRACE2(TIDTB(sk),
		  "new_rx_data: seq 0x%x len %u",
		  TCP_SKB_CB(skb)->seq, skb->len);
#endif
	SOCKBUF_LOCK(&so->so_rcv);
	if (sb_notify(&so->so_rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);

	sbappend_locked(&so->so_rcv, m);
	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
	
	INP_UNLOCK(tp->t_inpcb);
	DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
	
	/* sorwakeup_locked drops the sockbuf lock; otherwise drop it here. */
	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
	
	new_rx_data(toep, m);

	return (0);
}

/*
 * Process an RX_DATA_DDP message: data that the hardware placed directly
 * into a DDP buffer.  The mbuf carries no payload; its m_len and
 * m_pkthdr.csum_* fields are overloaded to describe the placement.
 */
static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet	
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	tp = sototcpcb(so);
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store old rcv_next
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet	
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif	
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0		
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif		
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP)) 
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/* All DDP error bits that indicate a failed placement. */
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	/* Log and drop placements the hardware flagged as erroneous. */
	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return CPL_RET_BUF_DONE;
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif	
	new_rx_data_ddp(so, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE message: the hardware finished filling a DDP
 * buffer.  As in new_rx_data_ddp, the mbuf's m_len/csum_* fields are
 * overloaded to describe the placement rather than carrying payload.
 */
static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet	
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state; 
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	when = bsp->cur_offset;
	/* Bytes newly placed since the last report. */
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));
#endif
#if 0
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif	
	/* Bit 0 set = buffer complete (see new_rx_data_ddp). */
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet	
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif	
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);
	
	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);
	
#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	/* On non-T3A a pending abort supersedes the peer close. */
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");
		
		goto out;
	}

#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
		socantrcvmore(so);
	/* Advance the TCP state machine for the received FIN. */
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
	    tp->t_starttime = ticks;
	/* FALLTHROUGH */ 
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	/* Wake anyone sleeping on the socket unless it has no fd reference. */
	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);

		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet		
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
	
	do_peer_fin(so, m);
	return (0);
}

/*
 * Process a CLOSE_CON_RPL: the hardware acknowledged our FIN.  Advances
 * the TCP state machine accordingly.
 */
static void
process_close_con_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct toepcb *toep = tp->t_toe;
	
	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
	    !!(so->so_state & SS_NOFDREF));
	/* On non-T3A a pending abort supersedes the close reply. */
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) 
		goto out;
	
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);

		} else
			enter_timewait(so);
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
		break;
	case TCPS_FIN_WAIT_1:
#ifdef notyet		
		dst_confirm(sk->sk_dst_cache);
#endif		
		soisdisconnecting(so);

		if ((so->so_state & SS_NOFDREF) == 0) {
			/*
			 * Wake up lingering close
			 */
			sowwakeup(so);
			sorwakeup(so);
			wakeup(&so->so_timeo);
		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			tp = cxgb_tcp_drop(tp, 0);
		}
		
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid,
		       tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
out:
	m_free(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	process_close_con_rpl(so, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
1999 */ 2000static void 2001process_abort_rpl(struct socket *so, struct mbuf *m) 2002{ 2003 struct tcpcb *tp = sototcpcb(so); 2004 struct toepcb *toep = tp->t_toe; 2005 2006#ifdef T3_TRACE 2007 T3_TRACE1(TIDTB(sk), 2008 "process_abort_rpl: GTS rpl pending %d", 2009 sock_flag(sk, ABORT_RPL_PENDING)); 2010#endif 2011 INP_LOCK(tp->t_inpcb); 2012 2013 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2014 /* 2015 * XXX panic on tcpdrop 2016 */ 2017 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so))) 2018 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2019 else { 2020 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2021 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2022 !is_t3a(TOE_DEV(so))) { 2023 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2024 panic("TP_ABORT_REQ_RCVD set"); 2025 INP_INFO_WLOCK(&tcbinfo); 2026 INP_LOCK(tp->t_inpcb); 2027 t3_release_offload_resources(toep); 2028 tp = tcp_close(tp); 2029 INP_INFO_WUNLOCK(&tcbinfo); 2030 } 2031 } 2032 } 2033 if (tp) 2034 INP_UNLOCK(tp->t_inpcb); 2035 2036 m_free(m); 2037} 2038 2039/* 2040 * Handle an ABORT_RPL_RSS CPL message. 2041 */ 2042static int 2043do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2044{ 2045 struct socket *so; 2046 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2047 struct toepcb *toep; 2048 2049 /* 2050 * Ignore replies to post-close aborts indicating that the abort was 2051 * requested too late. These connections are terminated when we get 2052 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2053 * arrives the TID is either no longer used or it has been recycled. 
2054 */ 2055 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2056discard: 2057 m_free(m); 2058 return (0); 2059 } 2060 2061 toep = (struct toepcb *)ctx; 2062 2063 /* 2064 * Sometimes we've already closed the socket, e.g., a post-close 2065 * abort races with ABORT_REQ_RSS, the latter frees the socket 2066 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2067 * but FW turns the ABORT_REQ into a regular one and so we get 2068 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2069 */ 2070 if (!toep) 2071 goto discard; 2072 2073 if (toep->tp_tp == NULL) { 2074 printf("removing tid for abort\n"); 2075 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2076 if (toep->tp_l2t) 2077 l2t_release(L2DATA(cdev), toep->tp_l2t); 2078 2079 toepcb_release(toep); 2080 goto discard; 2081 } 2082 2083 printf("toep=%p\n", toep); 2084 printf("tp=%p\n", toep->tp_tp); 2085 2086 so = toeptoso(toep); /* <- XXX panic */ 2087 toepcb_hold(toep); 2088 process_abort_rpl(so, m); 2089 toepcb_release(toep); 2090 return (0); 2091} 2092 2093/* 2094 * Convert the status code of an ABORT_REQ into a Linux error code. Also 2095 * indicate whether RST should be sent in response. 2096 */ 2097static int 2098abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2099{ 2100 struct tcpcb *tp = sototcpcb(so); 2101 2102 switch (abort_reason) { 2103 case CPL_ERR_BAD_SYN: 2104#if 0 2105 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2106#endif 2107 case CPL_ERR_CONN_RESET: 2108 // XXX need to handle SYN_RECV due to crossed SYNs 2109 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2110 case CPL_ERR_XMIT_TIMEDOUT: 2111 case CPL_ERR_PERSIST_TIMEDOUT: 2112 case CPL_ERR_FINWAIT2_TIMEDOUT: 2113 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2114#if 0 2115 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2116#endif 2117 return (ETIMEDOUT); 2118 default: 2119 return (EIO); 2120 } 2121} 2122 2123static inline void 2124set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2125{ 2126 struct cpl_abort_rpl *rpl = cplhdr(m); 2127 2128 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2129 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2130 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2131 2132 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2133 rpl->cmd = cmd; 2134} 2135 2136static void 2137send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2138{ 2139 struct mbuf *reply_mbuf; 2140 struct cpl_abort_req_rss *req = cplhdr(m); 2141 2142 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2143 m_set_priority(m, CPL_PRIORITY_DATA); 2144 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2145 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2146 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2147 m_free(m); 2148} 2149 2150/* 2151 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2152 */ 2153static inline int 2154is_neg_adv_abort(unsigned int status) 2155{ 2156 return status == CPL_ERR_RTX_NEG_ADVICE || 2157 status == CPL_ERR_PERSIST_NEG_ADVICE; 2158} 2159 2160static void 2161send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2162{ 2163 struct mbuf *reply_mbuf; 2164 struct cpl_abort_req_rss *req = cplhdr(m); 2165 2166 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2167 2168 if (!reply_mbuf) { 2169 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2170 req->status = rst_status; 2171 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2172 return; 2173 } 2174 2175 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2176 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2177 m_free(m); 2178 2179 /* 2180 * XXX need to sync with ARP as for SYN_RECV connections we can send 2181 * these messages while ARP is pending. For other connection states 2182 * it's not a problem. 2183 */ 2184 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2185} 2186 2187#ifdef notyet 2188static void 2189cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2190{ 2191 UNIMPLEMENTED(); 2192#ifdef notyet 2193 struct request_sock *req = child->sk_user_data; 2194 2195 inet_csk_reqsk_queue_removed(parent, req); 2196 synq_remove(tcp_sk(child)); 2197 __reqsk_free(req); 2198 child->sk_user_data = NULL; 2199#endif 2200} 2201 2202 2203/* 2204 * Performs the actual work to abort a SYN_RECV connection. 2205 */ 2206static void 2207do_abort_syn_rcv(struct socket *child, struct socket *parent) 2208{ 2209 struct tcpcb *parenttp = sototcpcb(parent); 2210 struct tcpcb *childtp = sototcpcb(child); 2211 2212 /* 2213 * If the server is still open we clean up the child connection, 2214 * otherwise the server already did the clean up as it was purging 2215 * its SYN queue and the skb was just sitting in its backlog. 2216 */ 2217 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2218 cleanup_syn_rcv_conn(child, parent); 2219 INP_INFO_WLOCK(&tcbinfo); 2220 INP_LOCK(childtp->t_inpcb); 2221 t3_release_offload_resources(childtp->t_toe); 2222 childtp = tcp_close(childtp); 2223 INP_INFO_WUNLOCK(&tcbinfo); 2224 if (childtp) 2225 INP_UNLOCK(childtp->t_inpcb); 2226 } 2227} 2228#endif 2229 2230/* 2231 * Handle abort requests for a SYN_RECV connection. These need extra work 2232 * because the socket is on its parent's SYN queue. 
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	/* Stubbed out: SYN_RECV abort handling is not functional yet. */
	UNIMPLEMENTED();
#ifdef notyet
	struct socket *parent;
	struct toedev *tdev = TOE_DEV(so);
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;	/* somehow we are not on the SYN queue */

	/* Look up the listening socket via the server TID. */
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	SOCK_UNLOCK(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 */
static void
process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK(tp->t_inpcb);
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		/*
		 * First ABORT_REQ for this connection: record it and drop
		 * the message.  NOTE(review): this relies on the request
		 * arriving a second time before any action is taken —
		 * presumably the T3 delivers ABORT_REQ twice; confirm
		 * against the CPL message semantics.
		 */
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		so->so_error = abort_status_to_errno(so, req->status,
		    &rst_status);
#if 0
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
#endif
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		/* tcp_close() may free the tcpcb; it returns NULL if so. */
		tp = tcp_close(tp);
	}
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
	send_abort_rpl(m, tdev, rst_status);
	return;

skip:
	INP_UNLOCK(tp->t_inpcb);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so;
	struct inpcb *inp;

	/* Negative-advice aborts carry no state change; just drop them. */
	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	printf("aborting tid=%d\n", toep->tp_tid);

	/*
	 * Connection still in SYN_RCVD (no established socket yet): tear
	 * down the embryonic connection directly — remove the TID, reply,
	 * drop the L2T reference, and unhook the toepcb from the listener's
	 * tcpcb.
	 */
	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		printf("sending abort rpl\n");

		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		printf("sent\n");
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 * Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		printf("abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		printf("disconnected toepcb\n");
		/* should be freed momentarily */
		return (0);
	}

	so = toeptoso(toep);
	inp = sotoinpcb(so);

	VALIDATE_SOCK(so);
	/* Hold the toepcb across processing; process_abort_req may close tp. */
	toepcb_hold(toep);
	INP_INFO_WLOCK(&tcbinfo);
	process_abort_req(so, m, TOE_DEV(so));
	INP_INFO_WUNLOCK(&tcbinfo);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	/* Stubbed out: ARP-failure handling for passive opens is TODO. */
	UNIMPLEMENTED();

#ifdef notyet
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	pass_open_abort(so, parent, m);
	SOCK_UNLOCK(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
2435 */ 2436static void 2437pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 2438{ 2439 2440#ifdef notyet 2441 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 2442 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 2443#endif 2444 handle_pass_open_arp_failure(m_get_socket(m), m); 2445} 2446 2447/* 2448 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 2449 */ 2450static void 2451mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 2452{ 2453 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 2454 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 2455 unsigned int tid = GET_TID(req); 2456 2457 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 2458 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 2459 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 2460 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 2461 rpl->opt0h = htonl(F_TCAM_BYPASS); 2462 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 2463 rpl->opt2 = 0; 2464 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 2465} 2466 2467/* 2468 * Send a deferred reject to an accept request. 
2469 */ 2470static void 2471reject_pass_request(struct toedev *tdev, struct mbuf *m) 2472{ 2473 struct mbuf *reply_mbuf; 2474 2475 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 2476 mk_pass_accept_rpl(reply_mbuf, m); 2477 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2478 m_free(m); 2479} 2480 2481static void 2482handle_syncache_event(int event, void *arg) 2483{ 2484 struct toepcb *toep = arg; 2485 2486 switch (event) { 2487 case TOE_SC_ENTRY_PRESENT: 2488 /* 2489 * entry already exists - free toepcb 2490 * and l2t 2491 */ 2492 printf("syncache entry present\n"); 2493 toepcb_release(toep); 2494 break; 2495 case TOE_SC_DROP: 2496 /* 2497 * The syncache has given up on this entry 2498 * either it timed out, or it was evicted 2499 * we need to explicitly release the tid 2500 */ 2501 printf("syncache entry dropped\n"); 2502 toepcb_release(toep); 2503 break; 2504 default: 2505 log(LOG_ERR, "unknown syncache event %d\n", event); 2506 break; 2507 } 2508} 2509 2510static void 2511syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 2512{ 2513 struct in_conninfo inc; 2514 struct tcpopt to; 2515 struct tcphdr th; 2516 struct inpcb *inp; 2517 int mss, wsf, sack, ts; 2518 2519 bzero(&to, sizeof(struct tcpopt)); 2520 inp = sotoinpcb(lso); 2521 2522 /* 2523 * Fill out information for entering us into the syncache 2524 */ 2525 inc.inc_fport = th.th_sport = req->peer_port; 2526 inc.inc_lport = th.th_dport = req->local_port; 2527 toep->tp_iss = th.th_seq = req->rcv_isn; 2528 th.th_flags = TH_SYN; 2529 2530 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn); 2531 2532 inc.inc_isipv6 = 0; 2533 inc.inc_len = 0; 2534 inc.inc_faddr.s_addr = req->peer_ip; 2535 inc.inc_laddr.s_addr = req->local_ip; 2536 2537 DPRINTF("syncache add of %d:%d %d:%d\n", 2538 ntohl(req->local_ip), ntohs(req->local_port), 2539 ntohl(req->peer_ip), ntohs(req->peer_port)); 2540 2541 mss = req->tcp_options.mss; 2542 
wsf = req->tcp_options.wsf; 2543 ts = req->tcp_options.tstamp; 2544 sack = req->tcp_options.sack; 2545 to.to_mss = mss; 2546 to.to_wscale = wsf; 2547 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 2548 2549 INP_INFO_WLOCK(&tcbinfo); 2550 INP_LOCK(inp); 2551 syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 2552} 2553 2554 2555/* 2556 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 2557 * lock held. Note that the sock here is a listening socket that is not owned 2558 * by the TOE. 2559 */ 2560static void 2561process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 2562 struct listen_ctx *lctx) 2563{ 2564 int rt_flags; 2565 struct l2t_entry *e; 2566 struct iff_mac tim; 2567 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 2568 struct cpl_pass_accept_rpl *rpl; 2569 struct cpl_pass_accept_req *req = cplhdr(m); 2570 unsigned int tid = GET_TID(req); 2571 struct tom_data *d = TOM_DATA(tdev); 2572 struct t3cdev *cdev = d->cdev; 2573 struct tcpcb *tp = sototcpcb(so); 2574 struct toepcb *newtoep; 2575 struct rtentry *dst; 2576 struct sockaddr_in nam; 2577 struct t3c_data *td = T3C_DATA(cdev); 2578 2579 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2580 if (__predict_false(reply_mbuf == NULL)) { 2581 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 2582 t3_defer_reply(m, tdev, reject_pass_request); 2583 else { 2584 cxgb_queue_tid_release(cdev, tid); 2585 m_free(m); 2586 } 2587 DPRINTF("failed to get reply_mbuf\n"); 2588 2589 goto out; 2590 } 2591 2592 if (tp->t_state != TCPS_LISTEN) { 2593 DPRINTF("socket not in listen state\n"); 2594 2595 goto reject; 2596 } 2597 2598 tim.mac_addr = req->dst_mac; 2599 tim.vlan_tag = ntohs(req->vlan_tag); 2600 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 2601 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 2602 goto reject; 2603 } 2604 2605#ifdef notyet 2606 /* 2607 * XXX do route lookup to confirm that 
we're still listening on this 2608 * address 2609 */ 2610 if (ip_route_input(skb, req->local_ip, req->peer_ip, 2611 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 2612 goto reject; 2613 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 2614 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 2615 dst_release(skb->dst); // done with the input route, release it 2616 skb->dst = NULL; 2617 2618 if ((rt_flags & RTF_LOCAL) == 0) 2619 goto reject; 2620#endif 2621 /* 2622 * XXX 2623 */ 2624 rt_flags = RTF_LOCAL; 2625 if ((rt_flags & RTF_LOCAL) == 0) 2626 goto reject; 2627 2628 /* 2629 * Calculate values and add to syncache 2630 */ 2631 2632 newtoep = toepcb_alloc(); 2633 if (newtoep == NULL) 2634 goto reject; 2635 2636 bzero(&nam, sizeof(struct sockaddr_in)); 2637 2638 nam.sin_len = sizeof(struct sockaddr_in); 2639 nam.sin_family = AF_INET; 2640 nam.sin_addr.s_addr =req->peer_ip; 2641 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 2642 2643 if (dst == NULL) { 2644 printf("failed to find route\n"); 2645 goto reject; 2646 } 2647 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 2648 (struct sockaddr *)&nam); 2649 if (e == NULL) { 2650 DPRINTF("failed to get l2t\n"); 2651 } 2652 /* 2653 * Point to our listen socket until accept 2654 */ 2655 newtoep->tp_tp = tp; 2656 newtoep->tp_flags = TP_SYN_RCVD; 2657 newtoep->tp_tid = tid; 2658 newtoep->tp_toedev = tdev; 2659 2660 printf("inserting tid=%d\n", tid); 2661 cxgb_insert_tid(cdev, d->client, newtoep, tid); 2662 SOCK_LOCK(so); 2663 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 2664 SOCK_UNLOCK(so); 2665 2666 2667 if (lctx->ulp_mode) { 2668 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2669 2670 if (!ddp_mbuf) 2671 newtoep->tp_ulp_mode = 0; 2672 else 2673 newtoep->tp_ulp_mode = lctx->ulp_mode; 2674 } 2675 2676 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 2677 2678 DPRINTF("adding request to syn cache\n"); 2679 2680 /* 2681 * XXX workaround for lack of syncache drop 2682 */ 2683 toepcb_hold(newtoep); 
2684 syncache_add_accept_req(req, so, newtoep); 2685 2686 2687 2688 rpl = cplhdr(reply_mbuf); 2689 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 2690 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 2691 rpl->wr.wr_lo = 0; 2692 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 2693 rpl->opt2 = htonl(calc_opt2(so, tdev)); 2694 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 2695 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 2696 2697 DPRINTF("accept smt_idx=%d\n", e->smt_idx); 2698 2699 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 2700 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 2701 rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) | 2702 CPL_PASS_OPEN_ACCEPT); 2703 2704 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 2705 2706 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so)); 2707 2708#ifdef DEBUG_PRINT 2709 { 2710 int i; 2711 2712 DPRINTF("rpl:\n"); 2713 uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *); 2714 2715 for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++) 2716 DPRINTF("[%d] %08x\n", i, rplbuf[i]); 2717 } 2718#endif 2719 2720 2721 l2t_send(cdev, reply_mbuf, e); 2722 m_free(m); 2723#ifdef notyet 2724 /* 2725 * XXX this call path has to be converted to not depend on sockets 2726 */ 2727 if (newtoep->tp_ulp_mode) 2728 __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 2729 V_TF_DDP_OFF(1) | 2730 TP_DDP_TIMER_WORKAROUND_MASK, 2731 V_TF_DDP_OFF(1) | 2732 TP_DDP_TIMER_WORKAROUND_VAL, 1); 2733 2734#endif 2735 return; 2736reject: 2737 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 2738 mk_pass_accept_rpl(reply_mbuf, m); 2739 else 2740 mk_tid_release(reply_mbuf, NULL, tid); 2741 cxgb_ofld_send(cdev, reply_mbuf); 2742 m_free(m); 2743out: 2744#if 0 2745 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 2746#else 2747 return; 2748#endif 2749} 2750 2751/* 2752 * Handle a CPL_PASS_ACCEPT_REQ message. 
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso;
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	/* Sanity checks inherited from the Linux driver; normally disabled. */
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to the host stack's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK_ASSERT(tp->t_inpcb);

	/* MSS index from the TCB maps into the device MTU table; -40 for
	 * the IP + TCP headers. */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	/*
	 * NOTE(review): rcv_scale is forced to 0 even when window scaling
	 * was negotiated — confirm this is intentional (the HW may be doing
	 * the scaling on our behalf).
	 */
	if (tp->t_flags & TF_RCVD_SCALE)
		tp->rcv_scale = 0;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);
	/* Route socket-option processing through the TOE. */
	so->so_proto->pr_ctloutput = t3_ctloutput;

#if 0
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif


	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_state = TCPS_ESTABLISHED;
}

/*
 * Build a synthetic ACK tcphdr/in_conninfo/tcpopt from a CPL_PASS_ESTABLISH
 * and use it to expand the matching syncache entry into a full socket.
 * Returns the syncache_expand() result; on success *so is the new socket.
 */
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;

#ifdef MAC
#error	"no MAC support"
#endif

	opt = ntohs(req->tcp_opt);

	bzero(&to, sizeof(struct tcpopt));

	/*
	 * Fill out information for entering us into the syncache.
	 * NOTE(review): inc and th are not zeroed; only the fields below are
	 * initialized — verify syncache_expand() reads nothing else.
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;

	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	/* Decode the negotiated options the HW stored in tcp_opt. */
	mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf = G_TCPOPT_WSCALE_OK(opt);
	ts = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);

	to.to_mss = mss;
	to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return syncache_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;

	/* so starts out as the listening socket the toepcb points at. */
	so = lso = toeptoso(toep);
	tdev = toep->tp_toedev;

	/* Take ourselves off the listener's SYN queue. */
	SOCK_LOCK(so);
	LIST_REMOVE(toep, synq_entry);
	SOCK_UNLOCK(so);

	INP_INFO_WLOCK(&tcbinfo);
	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		UNIMPLEMENTED();
	}

	/*
	 * XXX workaround for lack of syncache drop.
	 * NOTE(review): toep is used extensively below this release —
	 * this relies on the extra reference taken via toepcb_hold() in
	 * process_pass_accept_req() still being held; confirm the refcount
	 * cannot reach zero here.
	 */
	toepcb_release(toep);

	tp = sototcpcb(so);
	INP_LOCK(tp->t_inpcb);
#ifdef notyet
	so->so_snd.sb_flags |= SB_TOE;
	so->so_rcv.sb_flags |= SB_TOE;
#endif
	/* Re-target the toepcb from the listener to the new connection. */
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(so);
	DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	/* DDP only if enabled, not opted out, and the window is big enough. */
	toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
		    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
	INP_INFO_WUNLOCK(&tcbinfo);
	INP_UNLOCK(tp->t_inpcb);
	soisconnected(so);

#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct socket *so)
{
	struct mbuf *m;
	struct toedev *tdev = TOE_DEV(so);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	unsigned int tid = toep->tp_tid;

	printf("fixup_and_send_ofo\n");

	INP_LOCK_ASSERT(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	/*
	 * NOTE(review): rcv_wnd is assigned the ISN here, which does not
	 * look like a window size — confirm this is intentional (it may be
	 * re-derived later) rather than a copy/paste of the irs/rcv_nxt
	 * chain.
	 */
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(so);

	if (__predict_false(so->so_state & SS_NOFDREF)) {
#ifdef notyet
		/*
		 * XXX not clear what should be done here
		 * appears to correspond to sorwakeup_locked
		 */
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
#endif
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	soisconnected(so);
	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
	tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
3108 */ 3109static int 3110do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3111{ 3112 struct cpl_act_establish *req = cplhdr(m); 3113 unsigned int tid = GET_TID(req); 3114 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); 3115 struct toepcb *toep = (struct toepcb *)ctx; 3116 struct tcpcb *tp = toep->tp_tp; 3117 struct socket *so; 3118 struct toedev *tdev; 3119 struct tom_data *d; 3120 3121 if (tp == NULL) { 3122 free_atid(cdev, atid); 3123 return (0); 3124 } 3125 3126 so = toeptoso(toep); 3127 tdev = TOE_DEV(so); /* blow up here if link was down */ 3128 d = TOM_DATA(tdev); 3129 3130 INP_LOCK(tp->t_inpcb); 3131 3132 /* 3133 * It's OK if the TID is currently in use, the owning socket may have 3134 * backlogged its last CPL message(s). Just take it away. 3135 */ 3136 toep->tp_tid = tid; 3137 toep->tp_tp = tp; 3138 so_insert_tid(d, so, tid); 3139 free_atid(cdev, atid); 3140 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); 3141 3142 socket_act_establish(so, m); 3143 INP_UNLOCK(tp->t_inpcb); 3144 return (0); 3145} 3146 3147/* 3148 * Process an acknowledgment of WR completion. Advance snd_una and send the 3149 * next batch of work requests from the write queue. 
3150 */ 3151static void 3152wr_ack(struct toepcb *toep, struct mbuf *m) 3153{ 3154 struct tcpcb *tp = toep->tp_tp; 3155 struct cpl_wr_ack *hdr = cplhdr(m); 3156 struct socket *so = toeptoso(toep); 3157 unsigned int credits = ntohs(hdr->credits); 3158 u32 snd_una = ntohl(hdr->snd_una); 3159 int bytes = 0; 3160 3161 DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits); 3162 3163 INP_LOCK(tp->t_inpcb); 3164 3165 toep->tp_wr_avail += credits; 3166 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3167 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3168 3169 while (credits) { 3170 struct mbuf *p = peek_wr(toep); 3171 DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ; 3172 3173 if (__predict_false(!p)) { 3174 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3175 "nothing pending, state %u\n", 3176 credits, toep->tp_tid, tp->t_state); 3177 break; 3178 } 3179 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3180#if DEBUG_WR > 1 3181 struct tx_data_wr *w = cplhdr(p); 3182#ifdef notyet 3183 log(LOG_ERR, 3184 "TID %u got %u WR credits, need %u, len %u, " 3185 "main body %u, frags %u, seq # %u, ACK una %u," 3186 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3187 toep->tp_tid, credits, p->csum, p->len, 3188 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3189 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3190 WR_AVAIL(tp), count_pending_wrs(tp) - credits); 3191#endif 3192#endif 3193 p->m_pkthdr.csum_data -= credits; 3194 break; 3195 } else { 3196 dequeue_wr(toep); 3197 credits -= p->m_pkthdr.csum_data; 3198 bytes += p->m_pkthdr.len; 3199 DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len); 3200 3201 m_free(p); 3202 } 3203 } 3204 3205#if DEBUG_WR 3206 check_wr_invariants(tp); 3207#endif 3208 3209 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3210#if VALIDATE_SEQ 3211 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3212 3213 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3214 "for TID %u, snd_una 
%u\n", (&d->tdev)->name, snd_una, 3215 toep->tp_tid, tp->snd_una); 3216#endif 3217 goto out_free; 3218 } 3219 3220 if (tp->snd_una != snd_una) { 3221 tp->snd_una = snd_una; 3222 tp->ts_recent_age = ticks; 3223#ifdef notyet 3224 /* 3225 * Keep ARP entry "minty fresh" 3226 */ 3227 dst_confirm(sk->sk_dst_cache); 3228#endif 3229 if (tp->snd_una == tp->snd_nxt) 3230 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3231 } 3232 if (bytes) { 3233 DPRINTF("sbdrop(%d)\n", bytes); 3234 SOCKBUF_LOCK(&so->so_snd); 3235 sbdrop_locked(&so->so_snd, bytes); 3236 sowwakeup_locked(so); 3237 } 3238 3239 if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc) 3240 t3_push_frames(so, 0); 3241 3242out_free: 3243 INP_UNLOCK(tp->t_inpcb); 3244 m_free(m); 3245} 3246 3247/* 3248 * Handler for TX_DATA_ACK CPL messages. 3249 */ 3250static int 3251do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3252{ 3253 struct toepcb *toep = (struct toepcb *)ctx; 3254 3255 DPRINTF("do_wr_ack\n"); 3256 dump_toepcb(toep); 3257 3258 VALIDATE_SOCK(so); 3259 3260 wr_ack(toep, m); 3261 return 0; 3262} 3263 3264 3265/* 3266 * Reset a connection that is on a listener's SYN queue or accept queue, 3267 * i.e., one that has not had a struct socket associated with it. 3268 * Must be called from process context. 3269 * 3270 * Modeled after code in inet_csk_listen_stop(). 3271 */ 3272static void 3273t3_reset_listen_child(struct socket *child) 3274{ 3275 struct tcpcb *tp = sototcpcb(child); 3276 3277 t3_send_reset(tp->t_toe); 3278} 3279 3280/* 3281 * Disconnect offloaded established but not yet accepted connections sitting 3282 * on a server's accept_queue. We just send an ABORT_REQ at this point and 3283 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 
 */
void
t3_disconnect_acceptq(struct socket *listen_so)
{
	struct socket *so;
	struct tcpcb *tp;

	/* Walk the completed-connection queue and reset offloaded entries. */
	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
		tp = sototcpcb(so);

		if (tp->t_flags & TF_TOE) {
			INP_LOCK(tp->t_inpcb);
			t3_reset_listen_child(so);
			INP_UNLOCK(tp->t_inpcb);
		}

	}
}

/*
 * Reset offloaded connections sitting on a server's syn queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */

void
t3_reset_synq(struct listen_ctx *lctx)
{
	struct toepcb *toep;

	SOCK_LOCK(lctx->lso);
	while (!LIST_EMPTY(&lctx->synq_head)) {
		toep = LIST_FIRST(&lctx->synq_head);
		LIST_REMOVE(toep, synq_entry);
		/* Detach from the listener's tcpcb before resetting. */
		toep->tp_tp = NULL;
		t3_send_reset(toep);
		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
		toepcb_release(toep);
	}
	SOCK_UNLOCK(lctx->lso);
}

/*
 * Initialize the table mapping an mbuf fragment count to the number of work
 * requests needed to send it, given the device's WR length wr_len.
 * Idempotent: subsequent calls are no-ops.
 */
void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])	/* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		/* SGL entries needed for i fragments (3 flits per 2 pairs). */
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;	/* WR header overhead */
		mbuf_wrs[i] = sgl_len <= wr_len ?
			1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	/* Cache the WR size in bytes (wr_len is in 8-byte flits). */
	wrlen = wr_len * 8;
}

/*
 * Register this module's CPL message handlers with the TOM dispatcher.
 * Returns 0 on success.
 */
int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		    "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif


	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
#ifdef notyet
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
#endif
	return (0);
}
